From 1853c949646005b5959c483becde86608f548f24 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Sep 2015 20:05:46 +0200 Subject: netlink, mmap: transform mmap skb into full skb on taps Ken-ichirou reported that running netlink in mmap mode for receive in combination with nlmon will throw a NULL pointer dereference in __kfree_skb() on nlmon_xmit(), in my case I can also trigger an "unable to handle kernel paging request". The problem is the skb_clone() in __netlink_deliver_tap_skb() for skbs that are mmaped. I.e. the cloned skb doesn't have a destructor, whereas the mmap netlink skb has it pointed to netlink_skb_destructor(), set in the handler netlink_ring_setup_skb(). There, skb->head is being set to NULL, so that in such cases, __kfree_skb() doesn't perform a skb_release_data() via skb_release_all(), where skb->head is possibly being freed through kfree(head) into slab allocator, although netlink mmap skb->head points to the mmap buffer. Similarly, the same has to be done also for large netlink skbs where the data area is vmalloced. Therefore, as discussed, make a copy for these rather rare cases for now. This fixes the issue on my and Ken-ichirou's test-cases. Reference: http://thread.gmane.org/gmane.linux.network/371129 Fixes: bcbde0d449ed ("net: netlink: virtual tap device management") Reported-by: Ken-ichirou MATSUZAWA Signed-off-by: Daniel Borkmann Tested-by: Ken-ichirou MATSUZAWA Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 30 +++++++++++++++++++++++------- net/netlink/af_netlink.h | 9 +++++++++ 2 files changed, 32 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7f86d3b..4cad99d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -125,6 +125,24 @@ static inline u32 netlink_group_mask(u32 group) return group ? 1 << (group - 1) : 0; } +static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, + gfp_t gfp_mask) +{ + unsigned int len = skb_end_offset(skb); + struct sk_buff *new; + + new = alloc_skb(len, gfp_mask); + if (new == NULL) + return NULL; + + NETLINK_CB(new).portid = NETLINK_CB(skb).portid; + NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; + NETLINK_CB(new).creds = NETLINK_CB(skb).creds; + + memcpy(skb_put(new, len), skb->data, len); + return new; +} + int netlink_add_tap(struct netlink_tap *nt) { if (unlikely(nt->dev->type != ARPHRD_NETLINK)) @@ -206,7 +224,11 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, int ret = -ENOMEM; dev_hold(dev); - nskb = skb_clone(skb, GFP_ATOMIC); + + if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + nskb = netlink_to_full_skb(skb, GFP_ATOMIC); + else + nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); @@ -279,11 +301,6 @@ static void netlink_rcv_wake(struct sock *sk) } #ifdef CONFIG_NETLINK_MMAP -static bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -} - static bool netlink_rx_is_mmaped(struct sock *sk) { return nlk_sk(sk)->rx_ring.pg_vec != NULL; @@ -846,7 +863,6 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) } #else /* CONFIG_NETLINK_MMAP */ -#define netlink_skb_is_mmaped(skb) false #define netlink_rx_is_mmaped(sk) false #define netlink_tx_is_mmaped(sk) false #define netlink_mmap sock_no_mmap diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 8900840..df9a060 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -59,6 +59,15 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } +static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP + return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +#else + return false; +#endif /* CONFIG_NETLINK_MMAP */ +} + struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; -- cgit v1.1 From 19539ce783dd27768d4f7f3b753152bf983db65b Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Thu, 10 Sep 2015 18:25:07 -0600 Subject: ebpf: emit correct src_reg for conditional jumps Instead of always emitting BPF_REG_X, let's emit BPF_REG_X only when the source actually is BPF_X. This causes programs generated by the classic converter to not be importable via bpf(), as the eBPF verifier checks that the src_reg is correct or 0. While not a problem yet, this will be a problem when BPF_PROG_DUMP lands, and we can potentially dump and re-import programs generated by the converter. Signed-off-by: Tycho Andersen CC: Alexei Starovoitov CC: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 13079f0..05a04ea 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -478,9 +478,9 @@ do_pass: bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; - insn->src_reg = BPF_REG_X; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); + insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ -- cgit v1.1 From 8e2d61e0aed2b7c4ecb35844fe07e0b2b762dee4 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 10 Sep 2015 17:31:15 -0300 Subject: sctp: fix race on protocol/netns initialization Consider sctp module is unloaded and is being requested because an user is creating a sctp socket. During initialization, sctp will add the new protocol type and then initialize pernet subsys: status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; status = sctp_v6_protosw_init(); if (status) goto err_v6_protosw_init; status = register_pernet_subsys(&sctp_net_ops); The problem is that after those calls to sctp_v{4,6}_protosw_init(), it is possible for userspace to create SCTP sockets like if the module is already fully loaded. If that happens, one of the possible effects is that we will have readers for net->sctp.local_addr_list list earlier than expected and sctp_net_init() does not take precautions while dealing with that list, leading to a potential panic but not limited to that, as sctp_sock_init() will copy a bunch of blank/partially initialized values from net->sctp. The race happens like this: CPU 0 | CPU 1 socket() | __sock_create | socket() inet_create | __sock_create list_for_each_entry_rcu( | answer, &inetsw[sock->type], | list) { | inet_create /* no hits */ | if (unlikely(err)) { | ... | request_module() | /* socket creation is blocked | * the module is fully loaded | */ | sctp_init | sctp_v4_protosw_init | inet_register_protosw | list_add_rcu(&p->list, | last_perm); | | list_for_each_entry_rcu( | answer, &inetsw[sock->type], sctp_v6_protosw_init | list) { | /* hit, so assumes protocol | * is already loaded | */ | /* socket creation continues | * before netns is initialized | */ register_pernet_subsys | Simply inverting the initialization order between register_pernet_subsys() and sctp_v4_protosw_init() is not possible because register_pernet_subsys() will create a control sctp socket, so the protocol must be already visible by then. Deferring the socket creation to a work-queue is not good specially because we loose the ability to handle its errors. So, as suggested by Vlad, the fix is to split netns initialization in two moments: defaults and control socket, so that the defaults are already loaded by when we register the protocol, while control socket initialization is kept at the same moment it is today. Fixes: 4db67e808640 ("sctp: Make the address lists per network namespace") Signed-off-by: Vlad Yasevich Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/protocol.c | 64 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index b714333..3d9ea9a 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1186,7 +1186,7 @@ static void sctp_v4_del_protocol(void) unregister_inetaddr_notifier(&sctp_inetaddr_notifier); } -static int __net_init sctp_net_init(struct net *net) +static int __net_init sctp_defaults_init(struct net *net) { int status; @@ -1279,12 +1279,6 @@ static int __net_init sctp_net_init(struct net *net) sctp_dbg_objcnt_init(net); - /* Initialize the control inode/socket for handling OOTB packets. */ - if ((status = sctp_ctl_sock_init(net))) { - pr_err("Failed to initialize the SCTP control sock\n"); - goto err_ctl_sock_init; - } - /* Initialize the local address list. */ INIT_LIST_HEAD(&net->sctp.local_addr_list); spin_lock_init(&net->sctp.local_addr_lock); @@ -1300,9 +1294,6 @@ static int __net_init sctp_net_init(struct net *net) return 0; -err_ctl_sock_init: - sctp_dbg_objcnt_exit(net); - sctp_proc_exit(net); err_init_proc: cleanup_sctp_mibs(net); err_init_mibs: @@ -1311,15 +1302,12 @@ err_sysctl_register: return status; } -static void __net_exit sctp_net_exit(struct net *net) +static void __net_exit sctp_defaults_exit(struct net *net) { /* Free the local address list */ sctp_free_addr_wq(net); sctp_free_local_addr_list(net); - /* Free the control endpoint. */ - inet_ctl_sock_destroy(net->sctp.ctl_sock); - sctp_dbg_objcnt_exit(net); sctp_proc_exit(net); @@ -1327,9 +1315,32 @@ static void __net_exit sctp_net_exit(struct net *net) sctp_sysctl_net_unregister(net); } -static struct pernet_operations sctp_net_ops = { - .init = sctp_net_init, - .exit = sctp_net_exit, +static struct pernet_operations sctp_defaults_ops = { + .init = sctp_defaults_init, + .exit = sctp_defaults_exit, +}; + +static int __net_init sctp_ctrlsock_init(struct net *net) +{ + int status; + + /* Initialize the control inode/socket for handling OOTB packets. */ + status = sctp_ctl_sock_init(net); + if (status) + pr_err("Failed to initialize the SCTP control sock\n"); + + return status; +} + +static void __net_init sctp_ctrlsock_exit(struct net *net) +{ + /* Free the control endpoint. */ + inet_ctl_sock_destroy(net->sctp.ctl_sock); +} + +static struct pernet_operations sctp_ctrlsock_ops = { + .init = sctp_ctrlsock_init, + .exit = sctp_ctrlsock_exit, }; /* Initialize the universe into something sensible. */ @@ -1462,8 +1473,11 @@ static __init int sctp_init(void) sctp_v4_pf_init(); sctp_v6_pf_init(); - status = sctp_v4_protosw_init(); + status = register_pernet_subsys(&sctp_defaults_ops); + if (status) + goto err_register_defaults; + status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; @@ -1471,9 +1485,9 @@ static __init int sctp_init(void) if (status) goto err_v6_protosw_init; - status = register_pernet_subsys(&sctp_net_ops); + status = register_pernet_subsys(&sctp_ctrlsock_ops); if (status) - goto err_register_pernet_subsys; + goto err_register_ctrlsock; status = sctp_v4_add_protocol(); if (status) @@ -1489,12 +1503,14 @@ out: err_v6_add_protocol: sctp_v4_del_protocol(); err_add_protocol: - unregister_pernet_subsys(&sctp_net_ops); -err_register_pernet_subsys: + unregister_pernet_subsys(&sctp_ctrlsock_ops); +err_register_ctrlsock: sctp_v6_protosw_exit(); err_v6_protosw_init: sctp_v4_protosw_exit(); err_protosw_init: + unregister_pernet_subsys(&sctp_defaults_ops); +err_register_defaults: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); @@ -1527,12 +1543,14 @@ static __exit void sctp_exit(void) sctp_v6_del_protocol(); sctp_v4_del_protocol(); - unregister_pernet_subsys(&sctp_net_ops); + unregister_pernet_subsys(&sctp_ctrlsock_ops); /* Free protosw registrations */ sctp_v6_protosw_exit(); sctp_v4_protosw_exit(); + unregister_pernet_subsys(&sctp_defaults_ops); + /* Unregister with socket layer. */ sctp_v6_pf_exit(); sctp_v4_pf_exit(); -- cgit v1.1 From c2d4fbd2163e607915cc05798ce7fb7f31117cc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Fri, 11 Sep 2015 18:39:48 +0200 Subject: bridge: fix igmpv3 / mldv2 report parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the newly introduced helper functions the skb pulling is hidden in the checksumming function - and undone before returning to the caller. The IGMPv3 and MLDv2 report parsing functions in the bridge still assumed that the skb is pointing to the beginning of the IGMP/MLD message while it is now kept at the beginning of the IPv4/6 header, breaking the message parsing and creating packet loss. Fixing this by taking the offset between IP and IGMP/MLD header into account, too. Fixes: 9afd85c9e455 ("net: Export IGMP/MLD message validation code") Reported-by: Tobias Powalowski Tested-by: Tobias Powalowski Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 66efdc2..480b3de 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1006,7 +1006,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); - len = sizeof(*ih); + len = skb_transport_offset(skb) + sizeof(*ih); for (i = 0; i < num; i++) { len += sizeof(*grec); @@ -1067,7 +1067,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, icmp6h = icmp6_hdr(skb); num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); - len = sizeof(*icmp6h); + len = skb_transport_offset(skb) + sizeof(*icmp6h); for (i = 0; i < num; i++) { __be16 *nsrcs, _nsrcs; -- cgit v1.1 From 38c089d1d8d058f5dff018a811568aa8e8bc47fc Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 11 Sep 2015 15:01:16 -0700 Subject: openvswitch: Fix dependency on IPv6 defrag. When NF_CONNTRACK is built-in, NF_DEFRAG_IPV6 is a module, and OPENVSWITCH is built-in, the following build error would occur: net/built-in.o: In function `ovs_ct_execute': (.text+0x10f587): undefined reference to `nf_ct_frag6_gather' Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Reported-by: Jim Davis Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 2a071f4..d143aa9 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,7 +5,8 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET - depends on (!NF_CONNTRACK || NF_CONNTRACK) + depends on !NF_CONNTRACK || \ + (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) select LIBCRC32C select MPLS select NET_MPLS_GSO -- cgit v1.1 From 205ee117d4dc4a11ac3bd9638bb9b2e839f4de9a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Sep 2015 02:57:21 +0200 Subject: netfilter: nf_log: don't zap all loggers on unregister like nf_log_unset, nf_log_unregister must not reset the list of loggers. Otherwise, a call to nf_log_unregister() will render loggers of other nf protocols unusable: iptables -A INPUT -j LOG modprobe nf_log_arp ; rmmod nf_log_arp iptables -A INPUT -j LOG iptables: No chain/target/match by that name Fixes: 30e0c6a6be ("netfilter: nf_log: prepare net namespace support for loggers") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 675d12c..a5ebd7d 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -107,11 +107,15 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { + const struct nf_logger *log; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < NFPROTO_NUMPROTO; i++) - RCU_INIT_POINTER(loggers[i][logger->type], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = nft_log_dereference(loggers[i][logger->type]); + if (log == logger) + RCU_INIT_POINTER(loggers[i][logger->type], NULL); + } mutex_unlock(&nf_log_mutex); } EXPORT_SYMBOL(nf_log_unregister); -- cgit v1.1 From ba378ca9c04a5fc1b2cf0f0274a9d02eb3d1bad9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Sep 2015 18:04:09 +0200 Subject: netfilter: nft_compat: skip family comparison in case of NFPROTO_UNSPEC Fix lookup of existing match/target structures in the corresponding list by skipping the family check if NFPROTO_UNSPEC is used. This is resulting in the allocation and insertion of one match/target structure for each use of them. So this not only bloats memory consumption but also severely affects the time to reload the ruleset from the iptables-compat utility. After this patch, iptables-compat-restore and iptables-compat take almost the same time to reload large rulesets. Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 66def31..9c8fab0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -619,6 +619,13 @@ struct nft_xt { static struct nft_expr_type nft_match_type; +static bool nft_match_cmp(const struct xt_match *match, + const char *name, u32 rev, u32 family) +{ + return strcmp(match->name, name) == 0 && match->revision == rev && + (match->family == NFPROTO_UNSPEC || match->family == family); +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -626,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_match; struct xt_match *match; char *mt_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || @@ -641,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_match, &nft_match_list, head) { struct xt_match *match = nft_match->ops.data; - if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) { + if (nft_match_cmp(match, mt_name, rev, family)) { if (!try_module_get(match->me)) return ERR_PTR(-ENOENT); @@ -693,6 +699,13 @@ static LIST_HEAD(nft_target_list); static struct nft_expr_type nft_target_type; +static bool nft_target_cmp(const struct xt_target *tg, + const char *name, u32 rev, u32 family) +{ + return strcmp(tg->name, name) == 0 && tg->revision == rev && + (tg->family == NFPROTO_UNSPEC || tg->family == family); +} + static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -700,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_target; struct xt_target *target; char *tg_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || @@ -715,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_target, &nft_target_list, head) { struct xt_target *target = nft_target->ops.data; - if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) { + if (nft_target_cmp(target, tg_name, rev, family)) { if (!try_module_get(target->me)) return ERR_PTR(-ENOENT); -- cgit v1.1 From a3c119d392d7d7c68865fe76f5732ca9b8164d68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:05 -0700 Subject: ipv6: Refactor common ip6gre_tunnel_init codes It is a prep work to fix the dst_entry refcnt bugs in ip6_tunnel. This patch refactors some common init codes used by both ip6gre_tunnel_init and ip6gre_tap_init. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4038c69..af60d46 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1245,7 +1245,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) netif_keep_dst(dev); } -static int ip6gre_tunnel_init(struct net_device *dev) +static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1255,16 +1255,30 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static int ip6gre_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int ret; + + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; + + tunnel = netdev_priv(dev); + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } @@ -1460,19 +1474,16 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; - tunnel = netdev_priv(dev); + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + tunnel = netdev_priv(dev); ip6gre_tnl_link_config(tunnel, 1); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } -- cgit v1.1 From f230d1e891ba1da5953460516960894154f265db Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:06 -0700 Subject: ipv6: Rename the dst_cache helper functions in ip6_tunnel It is a prep work to fix the dst_entry refcnt bugs in ip6_tunnel. This patch rename: 1. ip6_tnl_dst_check() to ip6_tnl_dst_get() to better reflect that it will take a dst refcnt in the next patch. 2. ip6_tnl_dst_store() to ip6_tnl_dst_set() to have a more conventional name matching with ip6_tnl_dst_get(). Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 ++-- net/ipv6/ip6_tunnel.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index af60d46..24f5dd8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -634,7 +634,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(tunnel); + dst = ip6_tnl_dst_get(tunnel); if (!dst) { ndst = ip6_route_output(net, NULL, fl6); @@ -763,7 +763,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, ip6tunnel_xmit(NULL, skb, dev); if (ndst) - ip6_tnl_dst_store(tunnel, ndst); + ip6_tnl_dst_set(tunnel, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b0ab420..599b0b4 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,7 +126,7 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) { struct dst_entry *dst = t->dst_cache; @@ -139,7 +139,7 @@ struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) return dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { @@ -148,14 +148,14 @@ void ip6_tnl_dst_reset(struct ip6_tnl *t) } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; t->dst_cookie = rt6_get_cookie(rt); dst_release(t->dst_cache); t->dst_cache = dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -1010,7 +1010,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(t); + dst = ip6_tnl_dst_get(t); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; @@ -1102,7 +1102,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); if (ndst) - ip6_tnl_dst_store(t, ndst); + ip6_tnl_dst_set(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; -- cgit v1.1 From cdf3464e6c6bd764277cbbe992cd12da735b92fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:07 -0700 Subject: ipv6: Fix dst_entry refcnt bugs in ip6_tunnel Problems in the current dst_entry cache in the ip6_tunnel: 1. ip6_tnl_dst_set is racy. There is no lock to protect it: - One major problem is that the dst refcnt gets messed up. F.e. the same dst_cache can be released multiple times and then triggering the infamous dst refcnt < 0 warning message. - Another issue is the inconsistency between dst_cache and dst_cookie. It can be reproduced by adding and removing the ip6gre tunnel while running a super_netperf TCP_CRR test. 2. ip6_tnl_dst_get does not take the dst refcnt before returning the dst. This patch: 1. Create a percpu dst_entry cache in ip6_tnl 2. Use a spinlock to protect the dst_cache operations 3. ip6_tnl_dst_get always takes the dst refcnt before returning Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 38 +++++++++------- net/ipv6/ip6_tunnel.c | 122 +++++++++++++++++++++++++++++++++++++------------- 2 files changed, 114 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 24f5dd8..6465124 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -637,17 +637,17 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, dst = ip6_tnl_dst_get(tunnel); if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -702,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(tunnel, ndst); + skb_dst_set(skb, dst); proto = NEXTHDR_GRE; if (encap_limit >= 0) { @@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb_set_inner_protocol(skb, protocol); ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_set(tunnel, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1223,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = { static void ip6gre_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1248,6 +1246,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; tunnel = netdev_priv(dev); @@ -1259,6 +1258,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!dev->tstats) return -ENOMEM; + ret = ip6_tnl_dst_init(tunnel); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 599b0b4..851cf6d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,37 +126,90 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) +static void __ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) { - struct dst_entry *dst = t->dst_cache; - - if (dst && dst->obsolete && - !dst->ops->check(dst, t->dst_cookie)) { - t->dst_cache = NULL; - dst_release(dst); - return NULL; + dst_release(idst->dst); + if (dst) { + dst_hold(dst); + idst->cookie = rt6_get_cookie((struct rt6_info *)dst); + } else { + idst->cookie = 0; } + idst->dst = dst; +} + +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) +{ + spin_lock_bh(&idst->lock); + __ip6_tnl_per_cpu_dst_set(idst, dst); + spin_unlock_bh(&idst->lock); +} + +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) +{ + struct ip6_tnl_dst *idst; + struct dst_entry *dst; + + idst = raw_cpu_ptr(t->dst_cache); + spin_lock_bh(&idst->lock); + dst = idst->dst; + if (dst) { + if (!dst->obsolete || dst->ops->check(dst, idst->cookie)) { + dst_hold(idst->dst); + } else { + __ip6_tnl_per_cpu_dst_set(idst, NULL); + dst = NULL; + } + } + spin_unlock_bh(&idst->lock); return dst; } EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { - dst_release(t->dst_cache); - t->dst_cache = NULL; + int i; + + for_each_possible_cpu(i) + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL); } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) { - struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt6_get_cookie(rt); - dst_release(t->dst_cache); - t->dst_cache = dst; + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); + } EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); +void ip6_tnl_dst_destroy(struct ip6_tnl *t) +{ + if (!t->dst_cache) + return; + + ip6_tnl_dst_reset(t); + free_percpu(t->dst_cache); +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); + +int ip6_tnl_dst_init(struct ip6_tnl *t) +{ + int i; + + t->dst_cache = alloc_percpu(struct ip6_tnl_dst); + if (!t->dst_cache) + return -ENOMEM; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); + /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @remote: the address of the tunnel exit-point @@ -271,6 +324,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) static void ip6_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1016,17 +1072,17 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, goto tx_err_link_failure; if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -1072,12 +1128,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, consume_skb(skb); skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(t, ndst); + skb_dst_set(skb, dst); + skb->transport_header = skb->network_header; proto = fl6->flowi6_proto; @@ -1101,14 +1156,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_set(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1573,12 +1626,21 @@ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + int ret; t->dev = dev; t->net = dev_net(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; + + ret = ip6_tnl_dst_init(t); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } -- cgit v1.1 From 8e3d5be7368107f0c27a1f8126d79b01a47e9567 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:08 -0700 Subject: ipv6: Avoid double dst_free It is a prep work to get dst freeing from fib tree undergo a rcu grace period. The following is a common paradigm: if (ip6_del_rt(rt)) dst_free(rt) which means, if rt cannot be deleted from the fib tree, dst_free(rt) now. 1. We don't know the ip6_del_rt(rt) failure is because it was not managed by fib tree (e.g. DST_NOCACHE) or it had already been removed from the fib tree. 2. If rt had been managed by the fib tree, ip6_del_rt(rt) failure means dst_free(rt) has been called already. A second dst_free(rt) is not always obviously safe. The rt may have been destroyed already. 3. If rt is a DST_NOCACHE, dst_free(rt) should not be called. 4. It is a stopper to make dst freeing from fib tree undergo a rcu grace period. This patch is to use a DST_NOCACHE flag to indicate a rt is not managed by the fib tree. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 7 +++---- net/ipv6/ip6_fib.c | 11 +++++++++-- net/ipv6/route.c | 7 ++++--- 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 030fefd..9001133 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5127,13 +5127,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); - if (rt && ip6_del_rt(rt)) - dst_free(&rt->dst); + if (rt) + ip6_del_rt(rt); } dst_hold(&ifp->rt->dst); - if (ip6_del_rt(ifp->rt)) - dst_free(&ifp->rt->dst); + ip6_del_rt(ifp->rt); rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 418d982..e68350b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -933,6 +933,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); + if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && + !atomic_read(&rt->dst.__refcnt))) + return -EINVAL; + if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -1025,6 +1029,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fib6_start_gc(info->nl_net, rt); if (!(rt->rt6i_flags & RTF_CACHE)) fib6_prune_clones(info->nl_net, pn); + rt->dst.flags &= ~DST_NOCACHE; } out: @@ -1049,7 +1054,8 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); } return err; @@ -1060,7 +1066,8 @@ out: st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); return err; #endif } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 53617d7..3d3c1b2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1322,8 +1322,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); + ip6_del_rt(rt); } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; } @@ -2028,7 +2027,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.ip6_null_entry || + rt->dst.flags & DST_NOCACHE) { err = -ENOENT; goto out; } @@ -2515,6 +2515,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + rt->dst.flags |= DST_NOCACHE; atomic_set(&rt->dst.__refcnt, 1); -- cgit v1.1 From 70da5b5c532f0ec8aa76b4f46158da5f010f34b3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:09 -0700 Subject: ipv6: Replace spinlock with seqlock and rcu in ip6_tunnel This patch uses a seqlock to ensure consistency between idst->dst and idst->cookie. It also makes dst freeing from fib tree to undergo a rcu grace period. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 9 +++++++-- net/ipv6/ip6_tunnel.c | 51 +++++++++++++++++++++++++++------------------------ 2 files changed, 34 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e68350b..8a9ec01 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -155,6 +155,11 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_rcu_free(struct rt6_info *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -169,7 +174,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - dst_free(&pcpu_rt->dst); + rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; } } @@ -181,7 +186,7 @@ static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { rt6_free_pcpu(rt); - dst_free(&rt->dst); + rt6_rcu_free(rt); } } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 851cf6d..983f0d2 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,45 +126,48 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -static void __ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) { - dst_release(idst->dst); + write_seqlock_bh(&idst->lock); + dst_release(rcu_dereference_protected( + idst->dst, + lockdep_is_held(&idst->lock.lock))); if (dst) { dst_hold(dst); idst->cookie = rt6_get_cookie((struct rt6_info *)dst); } else { idst->cookie = 0; } - idst->dst = dst; -} - -static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, - struct dst_entry *dst) -{ - - spin_lock_bh(&idst->lock); - __ip6_tnl_per_cpu_dst_set(idst, dst); - spin_unlock_bh(&idst->lock); + rcu_assign_pointer(idst->dst, dst); + write_sequnlock_bh(&idst->lock); } struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) { struct ip6_tnl_dst *idst; struct dst_entry *dst; + unsigned int seq; + u32 cookie; idst = raw_cpu_ptr(t->dst_cache); - spin_lock_bh(&idst->lock); - dst = idst->dst; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, idst->cookie)) { - dst_hold(idst->dst); - } else { - __ip6_tnl_per_cpu_dst_set(idst, NULL); - dst = NULL; - } + + rcu_read_lock(); + do { + seq = read_seqbegin(&idst->lock); + dst = rcu_dereference(idst->dst); + cookie = idst->cookie; + } while (read_seqretry(&idst->lock, seq)); + + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + rcu_read_unlock(); + + if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { + ip6_tnl_per_cpu_dst_set(idst, NULL); + dst_release(dst); + dst = NULL; } - spin_unlock_bh(&idst->lock); return dst; } EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); @@ -204,7 +207,7 @@ int ip6_tnl_dst_init(struct ip6_tnl *t) return -ENOMEM; for_each_possible_cpu(i) - spin_lock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); return 0; } -- cgit v1.1 From d64f69b0373a7d0bcec8b5da7712977518a8f42b Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 15 Sep 2015 14:44:29 -0700 Subject: rtnetlink: catch -EOPNOTSUPP errors from ndo_bridge_getlink problem reported: kernel 4.1.3 ------------ # bridge vlan port vlan ids eth0 1 PVID Egress Untagged 90 91 92 93 94 95 96 97 98 99 100 vmbr0 1 PVID Egress Untagged 94 kernel 4.2 ----------- # bridge vlan port vlan ids ndo_bridge_getlink can return -EOPNOTSUPP when an interfaces ndo_bridge_getlink op is set to switchdev_port_bridge_getlink and CONFIG_SWITCHDEV is not defined. This today can happen to bond, rocker and team devices. This patch adds -EOPNOTSUPP checks after calls to ndo_bridge_getlink. Fixes: 85fdb956726ff2a ("switchdev: cut over to new switchdev_port_bridge_getlink") Reported-by: Alexandre DERUMIER Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a466821..0ec4840 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3047,6 +3047,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; u32 filter_mask = 0; + int err; if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { struct nlattr *extfilt; @@ -3067,20 +3068,25 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, + filter_mask, NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } if (ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = ops->ndo_bridge_getlink(skb, portid, + seq, dev, + filter_mask, + NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } } -- cgit v1.1 From 982b527004826b40de1e821b123c70f05b41496c Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 11 Sep 2015 18:38:28 -0700 Subject: openvswitch: Fix mask generation for nested attributes. Masks were added to OVS flows in a way that was backwards compatible with userspace programs that did not generate masks. As a result, it is possible that we may receive flows that do not have a mask and we need to synthesize one. Generating a mask requires iterating over attributes and descending into nested attributes. For each level we need to know the size to generate the correct mask. We do this with a linked table of attribute types. Although the logic to handle these nested attributes was there in concept, there are a number of bugs in practice. Examples include incomplete links between tables, variable length attributes being treated as nested and missing sanity checks. Signed-off-by: Jesse Gross Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 82 ++++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c92d6a2..5c030a4 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -57,6 +57,7 @@ struct ovs_len_tbl { }; #define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2 static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) @@ -304,6 +305,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, +}; + static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, @@ -315,8 +320,9 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, - [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED }, - [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, + .next = ovs_vxlan_ext_key_lens }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -349,6 +355,13 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_CT_LABEL] = { .len = sizeof(struct ovs_key_ct_label) }, }; +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ + return expected_len == attr_len || + expected_len == OVS_ATTR_NESTED || + expected_len == OVS_ATTR_VARIABLE; +} + static bool is_all_zero(const u8 *fp, size_t size) { int i; @@ -388,7 +401,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } expected_len = ovs_key_lens[type].len; - if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(nla), expected_len)) { OVS_NLERR(log, "Key %d has unexpected len %d expected %d", type, nla_len(nla), expected_len); return -EINVAL; @@ -473,29 +486,50 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 }, -}; - -static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) { - struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; + struct nlattr *a; + int rem; unsigned long opt_key_offset; struct vxlan_metadata opts; - int err; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); - if (err < 0) - return err; - memset(&opts, 0, sizeof(opts)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); - if (tb[OVS_VXLAN_EXT_GBP]) - opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); + if (type > OVS_VXLAN_EXT_MAX) { + OVS_NLERR(log, "VXLAN extension %d out of range max %d", + type, OVS_VXLAN_EXT_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_vxlan_ext_key_lens[type].len)) { + OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", + type, nla_len(a), + ovs_vxlan_ext_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_VXLAN_EXT_GBP: + opts.gbp = nla_get_u32(a); + break; + default: + OVS_NLERR(log, "Unknown VXLAN extension attribute %d", + type); + return -EINVAL; + } + } + if (rem) { + OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", + rem); + return -EINVAL; + } if (!is_mask) SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); @@ -528,8 +562,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type].len != nla_len(a) && - ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(a), + ovs_tunnel_key_lens[type].len)) { OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", type, nla_len(a), ovs_tunnel_key_lens[type].len); return -EINVAL; @@ -1052,10 +1086,13 @@ static void nlattr_set(struct nlattr *attr, u8 val, /* The nlattr stream should already have been validated */ nla_for_each_nested(nla, attr, rem) { - if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) - nlattr_set(nla, val, tbl[nla_type(nla)].next); - else + if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) { + if (tbl[nla_type(nla)].next) + tbl = tbl[nla_type(nla)].next; + nlattr_set(nla, val, tbl); + } else { memset(nla_data(nla), val, nla_len(nla)); + } } } @@ -1922,8 +1959,7 @@ static int validate_set(const struct nlattr *a, key_len /= 2; if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type].len != key_len && - ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) + !check_attr_len(key_len, ovs_key_lens[key_type].len)) return -EINVAL; if (masked && !validate_masked(nla_data(ovs_key), key_len)) -- cgit v1.1 From adf78edac09f9640cd9676571586c4be46fb527c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Sep 2015 14:15:18 +0200 Subject: net: core: drop null test before destroy functions Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) { \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); x = NULL; -} // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/core/sock.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index ca2984a..3307c02 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2740,10 +2740,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; - if (rsk_prot->slab) { - kmem_cache_destroy(rsk_prot->slab); - rsk_prot->slab = NULL; - } + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) @@ -2828,10 +2826,8 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); -- cgit v1.1 From 20471ed4d403a5f4de6aa0c10cd1e446f7f2b3c7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Sep 2015 14:15:27 +0200 Subject: dccp: drop null test before destroy functions Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); @@ expression x; @@ -if (x != NULL) { \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); x = NULL; -} // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/dccp/ackvec.c | 12 ++++-------- net/dccp/ccid.c | 3 +-- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index bd9e718..3de0d03 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -398,12 +398,8 @@ out_err: void dccp_ackvec_exit(void) { - if (dccp_ackvec_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - } - if (dccp_ackvec_record_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; - } + kmem_cache_destroy(dccp_ackvec_slab); + dccp_ackvec_slab = NULL; + kmem_cache_destroy(dccp_ackvec_record_slab); + dccp_ackvec_record_slab = NULL; } diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 8349897..90f77d0 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -95,8 +95,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_f static void ccid_kmem_cache_destroy(struct kmem_cache *slab) { - if (slab != NULL) - kmem_cache_destroy(slab); + kmem_cache_destroy(slab); } static int __init ccid_activate(struct ccid_operations *ccid_ops) -- cgit v1.1 From d8949aad3eab5d396f4fefcd581773bf07b9a79e Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 4 Sep 2015 12:22:46 +0300 Subject: Bluetooth: Delay check for conn->smp in smp_conn_security() There are several actions that smp_conn_security() might make that do not require a valid SMP context (conn->smp pointer). One of these actions is to encrypt the link with an existing LTK. If the SMP context wasn't initialized properly we should still allow the independent actions to be done, i.e. the check for the context should only be done at the last possible moment. Reported-by: Chuck Ebbert Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org # 4.0+ --- net/bluetooth/smp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ad82324..0510a57 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2311,12 +2311,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (!conn) return 1; - chan = conn->smp; - if (!chan) { - BT_ERR("SMP security requested but not available"); - return 1; - } - if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; @@ -2330,6 +2324,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; + chan = conn->smp; + if (!chan) { + BT_ERR("SMP security requested but not available"); + return 1; + } + l2cap_chan_lock(chan); /* If SMP is already in progress ignore this request */ -- cgit v1.1 From ad5001cc7cdf9aaee5eb213fdee657e4a3c94776 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 17 Sep 2015 13:37:00 +0200 Subject: netfilter: nf_log: wait for rcu grace after logger unregistration The nf_log_unregister() function needs to call synchronize_rcu() to make sure that the objects are not dereferenced anymore on module removal. Fixes: 5962815a6a56 ("netfilter: nf_log: use an array of loggers instead of list") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index a5ebd7d..a5d41df 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -117,6 +117,7 @@ void nf_log_unregister(struct nf_logger *logger) RCU_INIT_POINTER(loggers[i][logger->type], NULL); } mutex_unlock(&nf_log_mutex); + synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); -- cgit v1.1 From 37a1d3611c126fd9782ce5235791f898f053e763 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sun, 13 Sep 2015 10:18:33 -0700 Subject: ipv6: include NLM_F_REPLACE in route replace notifications This patch adds NLM_F_REPLACE flag to ipv6 route replace notifications. This makes nlm_flags in ipv6 replace notifications consistent with ipv4. Signed-off-by: Roopa Prabhu Acked-by: Nicolas Dichtel Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 6 +++--- net/ipv6/route.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8a9ec01..7d2e002 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -851,7 +851,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -877,7 +877,7 @@ add: rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; @@ -1422,7 +1422,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); - inet6_rt_notify(RTM_DELROUTE, rt, info); + inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3d3c1b2..d5fa502 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3304,7 +3304,8 @@ errout: return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int nlm_flags) { struct sk_buff *skb; struct net *net = info->nl_net; @@ -3319,7 +3320,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, 0, 0, 0); + event, info->portid, seq, 0, 0, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); -- cgit v1.1 From cc5706056baa3002b844ff240a1cc2199a978795 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Mon, 14 Sep 2015 11:14:50 -0700 Subject: openvswitch: Fix IPv6 exthdr handling with ct helpers. Static code analysis reveals the following bug: net/openvswitch/conntrack.c:281 ovs_ct_helper() warn: unsigned 'protoff' is never less than zero. This signedness bug breaks error handling for IPv6 extension headers when using conntrack helpers. Fix the error by using a local signed variable. Fixes: cae3a2627520: "openvswitch: Allow attaching helpers to ct action" Reported-by: Dan Carpenter Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e8e524a..002a755 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -275,13 +275,15 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) case NFPROTO_IPV6: { u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; + int ofs; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return NF_ACCEPT; } + protoff = ofs; break; } default: -- cgit v1.1 From 58189ca7b27411c3dc9a5cb9eeee0906da684c59 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 15 Sep 2015 15:10:50 -0700 Subject: net: Fix vti use case with oif in dst lookups Steffen reported that the recent change to add oif to dst lookups breaks the VTI use case. The problem is that with the oif set in the flow struct the comparison to the nh_oif is triggered. Fix by splitting the FLOWI_FLAG_VRFSRC into 2 flags -- one that triggers the vrf device cache bypass (FLOWI_FLAG_VRFSRC) and another telling the lookup to not compare nh oif (FLOWI_FLAG_SKIP_NH_OIF). Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Signed-off-by: David Ahern Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- net/ipv4/udp.c | 3 ++- net/ipv4/xfrm4_policy.c | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 26d6ffb..6c2af79 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1426,7 +1426,7 @@ found: nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c0a15e7..f7d1d5e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1024,7 +1024,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (netif_index_is_vrf(net, ipc.oif)) { flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC), + (flow_flags | FLOWI_FLAG_VRFSRC | + FLOWI_FLAG_SKIP_NH_OIF), faddr, saddr, dport, inet->inet_sport); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bb919b2..c10a9ee 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -33,6 +33,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) fl4->saddr = saddr->a4; + fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; -- cgit v1.1 From 1d325d217c7f190a42fb620ead20bb240fc16af0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 16 Sep 2015 17:26:14 +0200 Subject: ipv6: ip6_fragment: fix headroom tests and skb leak David Woodhouse reports skb_under_panic when we try to push ethernet header to fragmented ipv6 skbs: skbuff: skb_under_panic: text:c1277f1e len:1294 put:14 head:dec98000 data:dec97ffc tail:0xdec9850a end:0xdec98f40 dev:br-lan [..] ip6_finish_output2+0x196/0x4da David further debugged this: [..] offending fragments were arriving here with skb_headroom(skb)==10. Which is reasonable, being the Solos ADSL card's header of 8 bytes followed by 2 bytes of PPP frame type. The problem is that if netfilter ipv6 defragmentation is used, skb_cow() in ip6_forward will only see reassembled skb. Therefore, headroom is overestimated by 8 bytes (we pulled fragment header) and we don't check the skbs in the frag_list either. We can't do these checks in netfilter defrag since outdev isn't known yet. Furthermore, existing tests in ip6_fragment did not consider the fragment or ipv6 header size when checking headroom of the fraglist skbs. While at it, also fix a skb leak on memory allocation -- ip6_fragment must consume the skb. I tested this e1000 driver hacked to not allocate additional headroom (we end up in slowpath, since LL_RESERVED_SPACE is 16). If 2 bytes of headroom are allocated, fastpath is taken (14 byte ethernet header was pulled, so 16 byte headroom available in all fragments). Reported-by: David Woodhouse Diagnosed-by: David Woodhouse Signed-off-by: Florian Westphal Tested-by: David Woodhouse Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 26ea479..92b1aa3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -586,20 +586,22 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr); + hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path_clean; /* Partially cloned skb? */ @@ -616,8 +618,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, err = 0; offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; @@ -625,8 +625,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, if (!tmp_hdr) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); - return -ENOMEM; + err = -ENOMEM; + goto fail; } + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); __skb_pull(skb, hlen); fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); @@ -723,7 +726,6 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; - hroom = LL_RESERVED_SPACE(rt->dst.dev); troom = rt->dst.dev->needed_tailroom; /* -- cgit v1.1 From 34f5b0066435ffb793049b84fafd29fa195bcf90 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 16 Sep 2015 15:30:21 -0400 Subject: atm: deal with setting entry before mkip was called If we didn't call ATMARP_MKIP before ATMARP_ENCAP the VCC descriptor is non-existant and we'll end up dereferencing a NULL ptr: [1033173.491930] kasan: GPF could be caused by NULL-ptr deref or user memory accessirq event stamp: 123386 [1033173.493678] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC KASAN [1033173.493689] Modules linked in: [1033173.493697] CPU: 9 PID: 23815 Comm: trinity-c64 Not tainted 4.2.0-next-20150911-sasha-00043-g353d875-dirty #2545 [1033173.493706] task: ffff8800630c4000 ti: ffff880063110000 task.ti: ffff880063110000 [1033173.493823] RIP: clip_ioctl (net/atm/clip.c:320 net/atm/clip.c:689) [1033173.493826] RSP: 0018:ffff880063117a88 EFLAGS: 00010203 [1033173.493828] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 000000000000000c [1033173.493830] RDX: 0000000000000002 RSI: ffffffffb3f10720 RDI: 0000000000000014 [1033173.493832] RBP: ffff880063117b80 R08: ffff88047574d9a4 R09: 0000000000000000 [1033173.493834] R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff1000c622f53 [1033173.493836] R13: ffff8800cb905500 R14: ffff8808d6da2000 R15: 00000000fffffdfd [1033173.493840] FS: 00007fa56b92d700(0000) GS:ffff880478000000(0000) knlGS:0000000000000000 [1033173.493843] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [1033173.493845] CR2: 0000000000000000 CR3: 00000000630e8000 CR4: 00000000000006a0 [1033173.493855] Stack: [1033173.493862] ffffffffb0b60444 000000000000eaea 0000000041b58ab3 ffffffffb3c3ce32 [1033173.493867] ffffffffb0b6f3e0 ffffffffb0b60444 ffffffffb5ea2e50 1ffff1000c622f5e [1033173.493873] ffff8800630c4cd8 00000000000ee09a ffffffffb3ec4888 ffffffffb5ea2de8 [1033173.493874] Call Trace: [1033173.494108] do_vcc_ioctl (net/atm/ioctl.c:170) [1033173.494113] vcc_ioctl (net/atm/ioctl.c:189) [1033173.494116] svc_ioctl (net/atm/svc.c:605) [1033173.494200] sock_do_ioctl (net/socket.c:874) [1033173.494204] sock_ioctl (net/socket.c:958) [1033173.494244] do_vfs_ioctl (fs/ioctl.c:43 fs/ioctl.c:607) [1033173.494290] SyS_ioctl (fs/ioctl.c:622 fs/ioctl.c:613) [1033173.494295] entry_SYSCALL_64_fastpath (arch/x86/entry/entry_64.S:186) [1033173.494362] Code: fa 48 c1 ea 03 80 3c 02 00 0f 85 50 09 00 00 49 8b 9e 60 06 00 00 48 b8 00 00 00 00 00 fc ff df 48 8d 7b 14 48 89 fa 48 c1 ea 03 <0f> b6 04 02 48 89 fa 83 e2 07 38 d0 7f 08 84 c0 0f 85 14 09 00 All code ======== 0: fa cli 1: 48 c1 ea 03 shr $0x3,%rdx 5: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1) 9: 0f 85 50 09 00 00 jne 0x95f f: 49 8b 9e 60 06 00 00 mov 0x660(%r14),%rbx 16: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax 1d: fc ff df 20: 48 8d 7b 14 lea 0x14(%rbx),%rdi 24: 48 89 fa mov %rdi,%rdx 27: 48 c1 ea 03 shr $0x3,%rdx 2b:* 0f b6 04 02 movzbl (%rdx,%rax,1),%eax <-- trapping instruction 2f: 48 89 fa mov %rdi,%rdx 32: 83 e2 07 and $0x7,%edx 35: 38 d0 cmp %dl,%al 37: 7f 08 jg 0x41 39: 84 c0 test %al,%al 3b: 0f 85 14 09 00 00 jne 0x955 Code starting with the faulting instruction =========================================== 0: 0f b6 04 02 movzbl (%rdx,%rax,1),%eax 4: 48 89 fa mov %rdi,%rdx 7: 83 e2 07 and $0x7,%edx a: 38 d0 cmp %dl,%al c: 7f 08 jg 0x16 e: 84 c0 test %al,%al 10: 0f 85 14 09 00 00 jne 0x92a [1033173.494366] RIP clip_ioctl (net/atm/clip.c:320 net/atm/clip.c:689) [1033173.494368] RSP Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/atm/clip.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index 17e55df..e07f551 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh) static int clip_encap(struct atm_vcc *vcc, int mode) { + if (!CLIP_VCC(vcc)) + return -EBADFD; + CLIP_VCC(vcc)->encap = mode; return 0; } -- cgit v1.1 From c2e7204d180f8efc80f27959ca9cf16fa17f67db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Sep 2015 08:38:00 -0700 Subject: tcp_cubic: do not set epoch_start in the future Tracking idle time in bictcp_cwnd_event() is imprecise, as epoch_start is normally set at ACK processing time, not at send time. Doing a proper fix would need to add an additional state variable, and does not seem worth the trouble, given CUBIC bug has been there forever before Jana noticed it. Let's simply not set epoch_start in the future, otherwise bictcp_update() could overflow and CUBIC would again grow cwnd too fast. This was detected thanks to a packetdrill test Neal wrote that was flaky before applying this fix. Fixes: 30927520dbae ("tcp_cubic: better follow cubic curve after idle period") Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Cc: Jana Iyengar Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c6ded6b..448c261 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -154,14 +154,20 @@ static void bictcp_init(struct sock *sk) static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { - s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime; struct bictcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. * Shift epoch_start to keep cwnd growth to cubic curve. */ - if (ca->epoch_start && delta > 0) + if (ca->epoch_start && delta > 0) { ca->epoch_start += delta; + if (after(ca->epoch_start, now)) + ca->epoch_start = now; + } return; } } -- cgit v1.1 From 0315e382704817b279e5693dca8ab9d89aa20b3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nikola=20Forr=C3=B3?= Date: Thu, 17 Sep 2015 16:01:32 +0200 Subject: net: Fix behaviour of unreachable, blackhole and prohibit routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Man page of ip-route(8) says following about route types: unreachable - these destinations are unreachable. Packets are dis‐ carded and the ICMP message host unreachable is generated. The local senders get an EHOSTUNREACH error. blackhole - these destinations are unreachable. Packets are dis‐ carded silently. The local senders get an EINVAL error. prohibit - these destinations are unreachable. Packets are discarded and the ICMP message communication administratively prohibited is generated. The local senders get an EACCES error. In the inet6 address family, this was correct, except the local senders got ENETUNREACH error instead of EHOSTUNREACH in case of unreachable route. In the inet address family, all three route types generated ICMP message net unreachable, and the local senders got ENETUNREACH error. In both address families all three route types now behave consistently with documentation. Signed-off-by: Nikola Forró Signed-off-by: David S. Miller --- net/ipv4/route.c | 6 ++++-- net/ipv6/route.c | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5f4a556..c6ad99a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2045,6 +2045,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) struct fib_result res; struct rtable *rth; int orig_oif; + int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; @@ -2153,7 +2154,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res, 0)) { + err = fib_lookup(net, fl4, &res, 0); + if (err) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { @@ -2181,7 +2183,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) res.type = RTN_UNICAST; goto make_route; } - rth = ERR_PTR(-ENETUNREACH); + rth = ERR_PTR(err); goto out; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d5fa502..f204089 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1885,9 +1885,11 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: + case RTN_UNREACHABLE: default: rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN - : -ENETUNREACH; + : (cfg->fc_type == RTN_UNREACHABLE) + ? -EHOSTUNREACH : -ENETUNREACH; rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; -- cgit v1.1 From 4e3ae00100945d39e1f83b7c0179a114ccf55759 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Fri, 18 Sep 2015 10:46:31 +0200 Subject: tipc: reinitialize pointer after skb linearize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The msg pointer into header may change after skb linearization. We must reinitialize it after calling skb_linearize to prevent operating on a freed or invalid pointer. Signed-off-by: Erik Hugne Reported-by: Tamás Végh Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/msg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 562c926..c5ac436 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -539,6 +539,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) *err = -TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; + msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; dnode = addr_domain(net, msg_lookup_scope(msg)); -- cgit v1.1 From bc22a0e2ea03b75b51a1f722f93821744b5b5ff1 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 18 Sep 2015 11:47:40 +0200 Subject: iptunnel: make rx/tx bytes counters consistent This was already done a long time ago in commit 64194c31a0b6 ("inet: Make tunnel RX/TX byte counters more consistent") but tx path was broken (at least since 3.10). Before the patch the gre header was included on tx. After the patch: $ ping -c1 192.168.0.121 ; ip -s l ls dev gre1 PING 192.168.0.121 (192.168.0.121) 56(84) bytes of data. 64 bytes from 192.168.0.121: icmp_req=1 ttl=64 time=2.95 ms --- 192.168.0.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 2.955/2.955/2.955/0.000 ms 7: gre1@NONE: mtu 1468 qdisc noqueue state UNKNOWN mode DEFAULT group default link/gre 10.16.0.249 peer 10.16.0.121 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 Reported-by: Julien Meunier Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 29ed6c5..9b97204 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -51,7 +51,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { - int pkt_len = skb->len; + int pkt_len = skb->len - skb_inner_network_offset(skb); struct iphdr *iph; int err; -- cgit v1.1 From 1f770c0a09da855a2b51af6d19de97fb955eca85 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 18 Sep 2015 19:16:50 +0800 Subject: netlink: Fix autobind race condition that leads to zero port ID The commit c0bb07df7d981e4091432754e30c9c720e2c0c78 ("netlink: Reset portid after netlink_insert failure") introduced a race condition where if two threads try to autobind the same socket one of them may end up with a zero port ID. This led to kernel deadlocks that were observed by multiple people. This patch reverts that commit and instead fixes it by introducing a separte rhash_portid variable so that the real portid is only set after the socket has been successfully hashed. Fixes: c0bb07df7d98 ("netlink: Reset portid after netlink_insert failure") Reported-by: Tejun Heo Reported-by: Linus Torvalds Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 12 +++++++----- net/netlink/af_netlink.h | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4cad99d..9f51608 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1031,7 +1031,7 @@ static inline int netlink_compare(struct rhashtable_compare_arg *arg, const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; - return nlk->portid != x->portid || + return nlk->rhash_portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } @@ -1057,7 +1057,7 @@ static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; - netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); + netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->rhash_portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); @@ -1119,7 +1119,7 @@ static int netlink_insert(struct sock *sk, u32 portid) unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) goto err; - nlk_sk(sk)->portid = portid; + nlk_sk(sk)->rhash_portid = portid; sock_hold(sk); err = __netlink_insert(table, sk); @@ -1131,10 +1131,12 @@ static int netlink_insert(struct sock *sk, u32 portid) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; - nlk_sk(sk)->portid = 0; sock_put(sk); + goto err; } + nlk_sk(sk)->portid = portid; + err: release_sock(sk); return err; @@ -3271,7 +3273,7 @@ static inline u32 netlink_hash(const void *data, u32 len, u32 seed) const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; - netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); + netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->rhash_portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index df9a060..80b2b75 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -25,6 +25,7 @@ struct netlink_ring { struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; + u32 rhash_portid; u32 portid; u32 dst_portid; u32 dst_group; -- cgit v1.1 From ed2e923945892a8372ab70d2f61d364b0b6d9054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Sep 2015 09:08:34 -0700 Subject: tcp/dccp: fix timewait races in timer handling When creating a timewait socket, we need to arm the timer before allowing other cpus to find it. The signal allowing cpus to find the socket is setting tw_refcnt to non zero value. As we set tw_refcnt in __inet_twsk_hashdance(), we therefore need to call inet_twsk_schedule() first. This also means we need to remove tw_refcnt changes from inet_twsk_schedule() and let the caller handle it. Note that because we use mod_timer_pinned(), we have the guarantee the timer wont expire before we set tw_refcnt as we run in BH context. To make things more readable I introduced inet_twsk_reschedule() helper. When rearming the timer, we can use mod_timer_pending() to make sure we do not rearm a canceled timer. Note: This bug can possibly trigger if packets of a flow can hit multiple cpus. This does not normally happen, unless flow steering is broken somehow. This explains this bug was spotted ~5 months after its introduction. A similar fix is needed for SYN_RECV sockets in reqsk_queue_hash_req(), but will be provided in a separate patch for proper tracking. Fixes: 789f558cfb36 ("tcp/dccp: get rid of central timewait timer") Signed-off-by: Eric Dumazet Reported-by: Ying Cai Signed-off-by: David S. Miller --- net/dccp/minisocks.c | 4 ++-- net/ipv4/inet_timewait_sock.c | 16 ++++++++++------ net/ipv4/tcp_minisocks.c | 13 ++++++------- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 30addee..838f524 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6only = sk->sk_ipv6only; } #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) timeo = DCCP_TIMEWAIT_LEN; inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ae22cc2..c67f9bd 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -123,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* * Step 2: Hash TW into tcp ehash chain. * Notes : - * - tw_refcnt is set to 3 because : + * - tw_refcnt is set to 4 because : * - We have one reference from bhash chain. * - We have one reference from ehash chain. + * - We have one reference from timer. + * - One reference for ourself (our caller will release it). * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. */ - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); + atomic_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -217,7 +219,7 @@ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL(inet_twsk_deschedule_put); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * @@ -245,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) */ tw->tw_kill = timeo <= 4*HZ; - if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { - atomic_inc(&tw->tw_refcnt); + if (!rearm) { + BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); + } else { + mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(inet_twsk_schedule); +EXPORT_SYMBOL_GPL(__inet_twsk_schedule); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6d8795b..def7659 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -162,9 +162,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, tw->tw_timeout); + inet_twsk_reschedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -201,7 +201,7 @@ kill: return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -251,7 +251,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -322,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } while (0); #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; @@ -338,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this -- cgit v1.1 From 29c6852602e259d2c1882f320b29d5c3fec0de04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Sep 2015 09:48:04 -0700 Subject: inet: fix races in reqsk_queue_hash_req() Before allowing lockless LISTEN processing, we need to make sure to arm the SYN_RECV timer before the req socket is visible in hash tables. Also, req->rsk_hash should be written before we set rsk_refcnt to a non zero value. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Cc: Ying Cai Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1349571..7bb9c39 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -685,20 +685,20 @@ void reqsk_queue_hash_req(struct request_sock_queue *queue, req->num_timeout = 0; req->sk = NULL; + setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); + mod_timer_pinned(&req->rsk_timer, jiffies + timeout); + req->rsk_hash = hash; + /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); atomic_set(&req->rsk_refcnt, 2); - setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); - req->rsk_hash = hash; spin_lock(&queue->syn_wait_lock); req->dl_next = lopt->syn_table[hash]; lopt->syn_table[hash] = req; spin_unlock(&queue->syn_wait_lock); - - mod_timer_pinned(&req->rsk_timer, jiffies + timeout); } EXPORT_SYMBOL(reqsk_queue_hash_req); -- cgit v1.1 From 2df1b131b54f431877a6665139dac805ba5ce1a5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 21 Aug 2015 14:07:13 +0200 Subject: mac80211: fix VHT MCS mask array overrun The HT MCS mask has 9 bytes, the VHT one only has 8 streams. Split the loops to handle this correctly. Reported-by: Dan Carpenter Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 17b1fe9..a30ec3c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2518,15 +2518,17 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { - if (~sdata->rc_rateidx_mcs_mask[i][j]) + if (~sdata->rc_rateidx_mcs_mask[i][j]) { sdata->rc_has_mcs_mask[i] = true; + break; + } + } - if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) + for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { + if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) { sdata->rc_has_vht_mcs_mask[i] = true; - - if (sdata->rc_has_mcs_mask[i] && - sdata->rc_has_vht_mcs_mask[i]) break; + } } } -- cgit v1.1 From babc305e21ea3811d98b67437299360904ac1b6a Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 21 Sep 2015 15:47:40 +0300 Subject: mac80211: reset CQM history upon reconfiguration The current behavior of notifying CQM events is inconsistent: Upon first configuration there is a cqm event with the current status according to threshold configured, regardless of signal stability. When there is reconfiguration no event is sent unless there is a significant change to the signal level according to the new configuration. Since the current reconfiguration behavior might cause missing CQM events in case the current signal did not change but is on the other side of the new threshold, fix that by resetting the stored signal level upon reconfiguration. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a30ec3c..7a77a14 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2474,6 +2474,7 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; + sdata->u.mgd.last_cqm_event_signal = 0; /* tell the driver upon association, unless already associated */ if (sdata->u.mgd.associated && -- cgit v1.1 From ae5f2fb1d51fa128a460bcfbe3c56d7ab8bf6a43 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 21 Sep 2015 20:21:20 -0700 Subject: openvswitch: Zero flows on allocation. When support for megaflows was introduced, OVS needed to start installing flows with a mask applied to them. Since masking is an expensive operation, OVS also had an optimization that would only take the parts of the flow keys that were covered by a non-zero mask. The values stored in the remaining pieces should not matter because they are masked out. While this works fine for the purposes of matching (which must always look at the mask), serialization to netlink can be problematic. Since the flow and the mask are serialized separately, the uninitialized portions of the flow can be encoded with whatever values happen to be present. In terms of functionality, this has little effect since these fields will be masked out by definition. However, it leaks kernel memory to userspace, which is a potential security vulnerability. It is also possible that other code paths could look at the masked key and get uninitialized data, although this does not currently appear to be an issue in practice. This removes the mask optimization for flows that are being installed. This was always intended to be the case as the mask optimizations were really targetting per-packet flow operations. Fixes: 03f0d916 ("openvswitch: Mega flow implementation") Signed-off-by: Jesse Gross Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 4 ++-- net/openvswitch/flow_table.c | 23 ++++++++++++----------- net/openvswitch/flow_table.h | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6fbd2de..b816ff8 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -952,7 +952,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &key, &mask); + ovs_flow_mask_key(&new_flow->key, &key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], @@ -1080,7 +1080,7 @@ static struct sw_flow_actions *get_flow_actions(struct net *net, struct sw_flow_key masked_key; int error; - ovs_flow_mask_key(&masked_key, key, mask); + ovs_flow_mask_key(&masked_key, key, true, mask); error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d22d8e9..f2ea83b 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -57,20 +57,21 @@ static u16 range_n_bytes(const struct sw_flow_key_range *range) } void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask) + bool full, const struct sw_flow_mask *mask) { - const long *m = (const long *)((const u8 *)&mask->key + - mask->range.start); - const long *s = (const long *)((const u8 *)src + - mask->range.start); - long *d = (long *)((u8 *)dst + mask->range.start); + int start = full ? 0 : mask->range.start; + int len = full ? sizeof *dst : range_n_bytes(&mask->range); + const long *m = (const long *)((const u8 *)&mask->key + start); + const long *s = (const long *)((const u8 *)src + start); + long *d = (long *)((u8 *)dst + start); int i; - /* The memory outside of the 'mask->range' are not set since - * further operations on 'dst' only uses contents within - * 'mask->range'. + /* If 'full' is true then all of 'dst' is fully initialized. Otherwise, + * if 'full' is false the memory outside of the 'mask->range' is left + * uninitialized. This can be used as an optimization when further + * operations on 'dst' only use contents within 'mask->range'. */ - for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) + for (i = 0; i < len; i += sizeof(long)) *d++ = *s++ & *m++; } @@ -475,7 +476,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, u32 hash; struct sw_flow_key masked_key; - ovs_flow_mask_key(&masked_key, unmasked, mask); + ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 616eda1..2dd9900 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -86,5 +86,5 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask); + bool full, const struct sw_flow_mask *mask); #endif /* flow_table.h */ -- cgit v1.1 From fbd03513bf36c4e5c2942f436f05c8eb99a3cc9e Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 22 Sep 2015 11:28:14 +0200 Subject: net: dsa: Fix Marvell Egress Trailer check The Marvell Egress rx trailer check must be fixed to correctly detect bad bits in the third byte of the Eggress trailer as described in the Table 28 of the 88E6060 datasheet. The current code incorrectly omits to check the third byte and checks the fourth byte twice. Signed-off-by: Neil Armstrong Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- net/dsa/tag_trailer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d25efc9..b6ca089 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -78,7 +78,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, trailer = skb_tail_pointer(skb) - 4; if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 || - (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00) + (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00) goto out_drop; source_port = trailer[1] & 7; -- cgit v1.1 From 675ee231d960af2af3606b4480324e26797eb010 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Sep 2015 14:00:21 -0700 Subject: tcp: add proper TS val into RST packets RST packets sent on behalf of TCP connections with TS option (RFC 7323 TCP timestamps) have incorrect TS val (set to 0), but correct TS ecr. A > B: Flags [S], seq 0, win 65535, options [mss 1000,nop,nop,TS val 100 ecr 0], length 0 B > A: Flags [S.], seq 2444755794, ack 1, win 28960, options [mss 1460,nop,nop,TS val 7264344 ecr 100], length 0 A > B: Flags [.], ack 1, win 65535, options [nop,nop,TS val 110 ecr 7264344], length 0 B > A: Flags [R.], seq 1, ack 1, win 28960, options [nop,nop,TS val 0 ecr 110], length 0 We need to call skb_mstamp_get() to get proper TS val, derived from skb->skb_mstamp Note that RFC 1323 was advocating to not send TS option in RST segment, but RFC 7323 recommends the opposite : Once TSopt has been successfully negotiated, that is both and contain TSopt, the TSopt MUST be sent in every non- segment for the duration of the connection, and SHOULD be sent in an segment (see Section 5.2 for details) Note this RFC recommends to send TS val = 0, but we believe it is premature : We do not know if all TCP stacks are properly handling the receive side : When an segment is received, it MUST NOT be subjected to the PAWS check by verifying an acceptable value in SEG.TSval, and information from the Timestamps option MUST NOT be used to update connection state information. SEG.TSecr MAY be used to provide stricter acceptance checks. In 5 years, if/when all TCP stack are RFC 7323 ready, we might consider to decide to send TS val = 0, if it buys something. Fixes: 7faee5c0d514 ("tcp: remove TCP_SKB_CB(skb)->when") Signed-off-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9a8a12..1100ffe 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2897,6 +2897,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); + skb_mstamp_get(&skb->skb_mstamp); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); -- cgit v1.1 From 2d8bff12699abc3a9bf886bb0b79f44d94d81496 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 23 Sep 2015 14:57:58 -0400 Subject: netpoll: Close race condition between poll_one_napi and napi_disable Drivers might call napi_disable while not holding the napi instance poll_lock. In those instances, its possible for a race condition to exist between poll_one_napi and napi_disable. That is to say, poll_one_napi only tests the NAPI_STATE_SCHED bit to see if there is work to do during a poll, and as such the following may happen: CPU0 CPU1 ndo_tx_timeout napi_poll_dev napi_disable poll_one_napi test_and_set_bit (ret 0) test_bit (ret 1) reset adapter napi_poll_routine If the adapter gets a tx timeout without a napi instance scheduled, its possible for the adapter to think it has exclusive access to the hardware (as the napi instance is now scheduled via the napi_disable call), while the netpoll code thinks there is simply work to do. The result is parallel hardware access leading to corrupt data structures in the driver, and a crash. Additionaly, there is another, more critical race between netpoll and napi_disable. The disabled napi state is actually identical to the scheduled state for a given napi instance. The implication being that, if a napi instance is disabled, a netconsole instance would see the napi state of the device as having been scheduled, and poll it, likely while the driver was dong something requiring exclusive access. In the case above, its fairly clear that not having the rings in a state ready to be polled will cause any number of crashes. The fix should be pretty easy. netpoll uses its own bit to indicate that that the napi instance is in a state of being serviced by netpoll (NAPI_STATE_NPSVC). We can just gate disabling on that bit as well as the sched bit. That should prevent netpoll from conducting a napi poll if we convert its set bit to a test_and_set_bit operation to provide mutual exclusion Change notes: V2) Remove a trailing whtiespace Resubmit with proper subject prefix V3) Clean up spacing nits Signed-off-by: Neil Horman CC: "David S. Miller" CC: jmaxwell@redhat.com Tested-by: jmaxwell@redhat.com Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ net/core/netpoll.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 877c848..6bb6470 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4713,6 +4713,8 @@ void napi_disable(struct napi_struct *n) while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); + while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) + msleep(1); hrtimer_cancel(&n->timer); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 6aa3db8..8bdada2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -142,7 +142,7 @@ static void queue_process(struct work_struct *work) */ static int poll_one_napi(struct napi_struct *napi, int budget) { - int work; + int work = 0; /* net_rx_action's ->poll() invocations and our's are * synchronized by this test which is only made while @@ -151,7 +151,12 @@ static int poll_one_napi(struct napi_struct *napi, int budget) if (!test_bit(NAPI_STATE_SCHED, &napi->state)) return budget; - set_bit(NAPI_STATE_NPSVC, &napi->state); + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation + */ + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + goto out; work = napi->poll(napi, budget); WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); @@ -159,6 +164,7 @@ static int poll_one_napi(struct napi_struct *napi, int budget) clear_bit(NAPI_STATE_NPSVC, &napi->state); +out: return budget - work; } -- cgit v1.1 From d3869efe7a8a2298516d9af4f91487cf486ca945 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Sep 2015 19:45:08 +0100 Subject: Fix AF_PACKET ABI breakage in 4.2 Commit 7d82410950aa ("virtio: add explicit big-endian support to memory accessors") accidentally changed the virtio_net header used by AF_PACKET with PACKET_VNET_HDR from host-endian to big-endian. Since virtio_legacy_is_little_endian() is a very long identifier, define a vio_le macro and use that throughout the code instead of the hard-coded 'false' for little-endian. This restores the ABI to match 4.1 and earlier kernels, and makes my test program work again. Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- net/packet/af_packet.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7b8e39a..aa4b15c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -230,6 +230,8 @@ struct packet_skb_cb { } sa; }; +#define vio_le() virtio_legacy_is_little_endian() + #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) @@ -2680,15 +2682,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - (__virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 > - __virtio16_to_cpu(false, vnet_hdr.hdr_len))) - vnet_hdr.hdr_len = __cpu_to_virtio16(false, - __virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2); + (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) + vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2); err = -EINVAL; - if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len) + if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len) goto out_unlock; if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -2731,7 +2733,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, - __virtio16_to_cpu(false, vnet_hdr.hdr_len), + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len), msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; @@ -2778,8 +2780,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start); - u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset); + u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); + u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset); if (!skb_partial_csum_set(skb, s, o)) { err = -EINVAL; goto out_free; @@ -2787,7 +2789,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } skb_shinfo(skb)->gso_size = - __virtio16_to_cpu(false, vnet_hdr.gso_size); + __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size); skb_shinfo(skb)->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ @@ -3161,9 +3163,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* This is a hint as to how much should be linear. */ vnet_hdr.hdr_len = - __cpu_to_virtio16(false, skb_headlen(skb)); + __cpu_to_virtio16(vio_le(), skb_headlen(skb)); vnet_hdr.gso_size = - __cpu_to_virtio16(false, sinfo->gso_size); + __cpu_to_virtio16(vio_le(), sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) @@ -3181,9 +3183,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = __cpu_to_virtio16(false, + vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(), skb_checksum_start_offset(skb)); - vnet_hdr.csum_offset = __cpu_to_virtio16(false, + vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(), skb->csum_offset); } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; -- cgit v1.1 From da314c9923fed553a007785a901fd395b7eb6c19 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 22 Sep 2015 11:38:56 +0800 Subject: netlink: Replace rhash_portid with bound On Mon, Sep 21, 2015 at 02:20:22PM -0400, Tejun Heo wrote: > > store_release and load_acquire are different from the usual memory > barriers and can't be paired this way. You have to pair store_release > and load_acquire. Besides, it isn't a particularly good idea to OK I've decided to drop the acquire/release helpers as they don't help us at all and simply pessimises the code by using full memory barriers (on some architectures) where only a write or read barrier is needed. > depend on memory barriers embedded in other data structures like the > above. Here, especially, rhashtable_insert() would have write barrier > *before* the entry is hashed not necessarily *after*, which means that > in the above case, a socket which appears to have set bound to a > reader might not visible when the reader tries to look up the socket > on the hashtable. But you are right we do need an explicit write barrier here to ensure that the hashing is visible. > There's no reason to be overly smart here. This isn't a crazy hot > path, write barriers tend to be very cheap, store_release more so. > Please just do smp_store_release() and note what it's paired with. It's not about being overly smart. It's about actually understanding what's going on with the code. I've seen too many instances of people simply sprinkling synchronisation primitives around without any knowledge of what is happening underneath, which is just a recipe for creating hard-to-debug races. > > @@ -1539,7 +1546,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, > > } > > } > > > > - if (!nlk->portid) { > > + if (!nlk->bound) { > > I don't think you can skip load_acquire here just because this is the > second deref of the variable. That doesn't change anything. Race > condition could still happen between the first and second tests and > skipping the second would lead to the same kind of bug. The reason this one is OK is because we do not use nlk->portid or try to get nlk from the hash table before we return to user-space. However, there is a real bug here that none of these acquire/release helpers discovered. The two bound tests here used to be a single one. Now that they are separate it is entirely possible for another thread to come in the middle and bind the socket. So we need to repeat the portid check in order to maintain consistency. > > @@ -1587,7 +1594,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, > > !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) > > return -EPERM; > > > > - if (!nlk->portid) > > + if (!nlk->bound) > > Don't we need load_acquire here too? Is this path holding a lock > which makes that unnecessary? Ditto. ---8<--- The commit 1f770c0a09da855a2b51af6d19de97fb955eca85 ("netlink: Fix autobind race condition that leads to zero port ID") created some new races that can occur due to inconcsistencies between the two port IDs. Tejun is right that a barrier is unavoidable. Therefore I am reverting to the original patch that used a boolean to indicate that a user netlink socket has been bound. Barriers have been added where necessary to ensure that a valid portid and the hashed socket is visible. I have also changed netlink_insert to only return EBUSY if the socket is bound to a portid different to the requested one. This combined with only reading nlk->bound once in netlink_bind fixes a race where two threads that bind the socket at the same time with different port IDs may both succeed. Fixes: 1f770c0a09da ("netlink: Fix autobind race condition that leads to zero port ID") Reported-by: Tejun Heo Reported-by: Linus Torvalds Signed-off-by: Herbert Xu Nacked-by: Tejun Heo Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 39 ++++++++++++++++++++++++++++----------- net/netlink/af_netlink.h | 2 +- 2 files changed, 29 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9f51608..8f060d7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1031,7 +1031,7 @@ static inline int netlink_compare(struct rhashtable_compare_arg *arg, const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; - return nlk->rhash_portid != x->portid || + return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } @@ -1057,7 +1057,7 @@ static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; - netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->rhash_portid); + netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); @@ -1110,8 +1110,8 @@ static int netlink_insert(struct sock *sk, u32 portid) lock_sock(sk); - err = -EBUSY; - if (nlk_sk(sk)->portid) + err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; + if (nlk_sk(sk)->bound) goto err; err = -ENOMEM; @@ -1119,7 +1119,7 @@ static int netlink_insert(struct sock *sk, u32 portid) unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) goto err; - nlk_sk(sk)->rhash_portid = portid; + nlk_sk(sk)->portid = portid; sock_hold(sk); err = __netlink_insert(table, sk); @@ -1135,7 +1135,9 @@ static int netlink_insert(struct sock *sk, u32 portid) goto err; } - nlk_sk(sk)->portid = portid; + /* We need to ensure that the socket is hashed and visible. */ + smp_wmb(); + nlk_sk(sk)->bound = portid; err: release_sock(sk); @@ -1521,6 +1523,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; long unsigned int groups = nladdr->nl_groups; + bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -1537,9 +1540,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return err; } - if (nlk->portid) + bound = nlk->bound; + if (bound) { + /* Ensure nlk->portid is up-to-date. */ + smp_rmb(); + if (nladdr->nl_pid != nlk->portid) return -EINVAL; + } if (nlk->netlink_bind && groups) { int group; @@ -1555,7 +1563,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, } } - if (!nlk->portid) { + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); @@ -1603,7 +1614,10 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; - if (!nlk->portid) + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!nlk->bound) err = netlink_autobind(sock); if (err == 0) { @@ -2444,10 +2458,13 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->portid) { + if (!nlk->bound) { err = netlink_autobind(sock); if (err) goto out; + } else { + /* Ensure nlk is hashed and visible. */ + smp_rmb(); } /* It's a really convoluted way for userland to ask for mmaped @@ -3273,7 +3290,7 @@ static inline u32 netlink_hash(const void *data, u32 len, u32 seed) const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; - netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->rhash_portid); + netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 80b2b75..14437d9 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -25,7 +25,6 @@ struct netlink_ring { struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; - u32 rhash_portid; u32 portid; u32 dst_portid; u32 dst_group; @@ -36,6 +35,7 @@ struct netlink_sock { unsigned long state; size_t max_recvmsg_len; wait_queue_head_t wait; + bool bound; bool cb_running; struct netlink_callback cb; struct mutex *cb_mutex; -- cgit v1.1 From 63d008a4e9ee86614ca5671b7f3ba447df007190 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:11 +0200 Subject: ipv4: send arp replies to the correct tunnel When using ip lwtunnels, the additional data for xmit (basically, the actual tunnel to use) are carried in ip_tunnel_info either in dst->lwtstate or in metadata dst. When replying to ARP requests, we need to send the reply to the same tunnel the request came from. This means we need to construct proper metadata dst for ARP replies. We could perform another route lookup to get a dst entry with the correct lwtstate. However, this won't always ensure that the outgoing tunnel is the same as the incoming one, and it won't work anyway for IPv4 duplicate address detection. The only thing to do is to "reverse" the ip_tunnel_info. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/arp.c | 39 +++++++++++++++++++++++++-------------- net/ipv4/ip_tunnel_core.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 30409b7..f03db8b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -113,6 +113,8 @@ #include #include #include +#include +#include #include @@ -296,7 +298,8 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw, struct sk_buff *oskb) + const unsigned char *target_hw, + struct dst_entry *dst) { struct sk_buff *skb; @@ -309,9 +312,7 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, if (!skb) return; - if (oskb) - skb_dst_copy(skb, oskb); - + skb_dst_set(skb, dst); arp_xmit(skb); } @@ -333,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; + struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -381,9 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } + if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + dst = dst_clone(skb_dst(skb)); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL, - dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); + dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -649,6 +652,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device @@ -749,13 +753,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) * cache. */ + if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) + reply_dst = (struct dst_entry *) + iptunnel_metadata_reply(skb_metadata_dst(skb), + GFP_ATOMIC); + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, - dev->dev_addr, sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, + sha, dev->dev_addr, sha, reply_dst); goto out; } @@ -774,9 +783,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); neigh_release(n); } } @@ -794,9 +804,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 9b97204..ce3a1e7 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,6 +46,7 @@ #include #include #include +#include int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, @@ -119,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) } EXPORT_SYMBOL_GPL(iptunnel_pull_header); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags) +{ + struct metadata_dst *res; + struct ip_tunnel_info *dst, *src; + + if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + return NULL; + + res = metadata_dst_alloc(0, flags); + if (!res) + return NULL; + + dst = &res->u.tun_info; + src = &md->u.tun_info; + dst->key.tun_id = src->key.tun_id; + if (src->mode & IP_TUNNEL_INFO_IPV6) + memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, + sizeof(struct in6_addr)); + else + dst->key.u.ipv4.dst = src->key.u.ipv4.src; + dst->mode = src->mode | IP_TUNNEL_INFO_TX; + + return res; +} +EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); + struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool csum_help, int gso_type_mask) -- cgit v1.1 From b194f30c61efb0767a98f47a64530baa8b731670 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:12 +0200 Subject: lwtunnel: remove source and destination UDP port config option The UDP tunnel config is asymmetric wrt. to the ports used. The source and destination ports from one direction of the tunnel are not related to the ports of the other direction. We need to be able to respond to ARP requests using the correct ports without involving routing. As the consequence, UDP ports need to be fixed property of the tunnel interface and cannot be set per route. Remove the ability to set ports per route. This is still okay to do, as no kernel has been released with these attributes yet. Note that the ability to specify source and destination ports is preserved for other users of the lwtunnel API which don't use routes for tunnel key specification (like openvswitch). If in the future we rework ARP handling to allow port specification, the attributes can be added back. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index ce3a1e7..84dce6a 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -226,8 +226,6 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, - [LWTUNNEL_IP_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, }; @@ -267,12 +265,6 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[LWTUNNEL_IP_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); - - if (tb[LWTUNNEL_IP_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP_DPORT]); - if (tb[LWTUNNEL_IP_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); @@ -294,8 +286,6 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -309,8 +299,6 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2) /* LWTUNNEL_IP_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ } @@ -333,8 +321,6 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, - [LWTUNNEL_IP6_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP6_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, }; @@ -374,12 +360,6 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); - if (tb[LWTUNNEL_IP6_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); - - if (tb[LWTUNNEL_IP6_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); - if (tb[LWTUNNEL_IP6_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); @@ -401,8 +381,6 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -416,8 +394,6 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2) /* LWTUNNEL_IP6_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP6_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ } -- cgit v1.1 From d8aecb10115497f6cdf841df8c88ebb3ba25fa28 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 22 Sep 2015 17:01:11 -0700 Subject: net: revert "net_sched: move tp->root allocation into fw_init()" fw filter uses tp->root==NULL to check if it is the old method, so it doesn't need allocation at all in this case. This patch reverts the offending commit and adds some comments for old method to make it obvious. Fixes: 33f8b9ecdb15 ("net_sched: move tp->root allocation into fw_init()") Reported-by: Akshat Kakkar Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_fw.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 715e01e..f23a3b6 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -33,7 +33,6 @@ struct fw_head { u32 mask; - bool mask_set; struct fw_filter __rcu *ht[HTSIZE]; struct rcu_head rcu; }; @@ -84,7 +83,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, } } } else { - /* old method */ + /* Old method: classify the packet using its skb mark. */ if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id ^ tp->q->handle)))) { res->classid = id; @@ -114,14 +113,9 @@ static unsigned long fw_get(struct tcf_proto *tp, u32 handle) static int fw_init(struct tcf_proto *tp) { - struct fw_head *head; - - head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); - if (head == NULL) - return -ENOBUFS; - - head->mask_set = false; - rcu_assign_pointer(tp->root, head); + /* We don't allocate fw_head here, because in the old method + * we don't need it at all. + */ return 0; } @@ -252,7 +246,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, int err; if (!opt) - return handle ? -EINVAL : 0; + return handle ? -EINVAL : 0; /* Succeed if it is old method. */ err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); if (err < 0) @@ -302,11 +296,17 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (!handle) return -EINVAL; - if (!head->mask_set) { - head->mask = 0xFFFFFFFF; + if (!head) { + u32 mask = 0xFFFFFFFF; if (tb[TCA_FW_MASK]) - head->mask = nla_get_u32(tb[TCA_FW_MASK]); - head->mask_set = true; + mask = nla_get_u32(tb[TCA_FW_MASK]); + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + head->mask = mask; + + rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); -- cgit v1.1 From 41fc014332d91ee90c32840bf161f9685b7fbf2b Mon Sep 17 00:00:00 2001 From: Wilson Kok Date: Tue, 22 Sep 2015 21:40:22 -0700 Subject: fib_rules: fix fib rule dumps across multiple skbs dump_rules returns skb length and not error. But when family == AF_UNSPEC, the caller of dump_rules assumes that it returns an error. Hence, when family == AF_UNSPEC, we continue trying to dump on -EMSGSIZE errors resulting in incorrect dump idx carried between skbs belonging to the same dump. This results in fib rule dump always only dumping rules that fit into the first skb. This patch fixes dump_rules to return error so that we exit correctly and idx is correctly maintained between skbs that are part of the same dump. Signed-off-by: Wilson Kok Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/fib_rules.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bf77e36..365de66 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -631,15 +631,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, { int idx = 0; struct fib_rule *rule; + int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; - if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWRULE, - NLM_F_MULTI, ops) < 0) + err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops); + if (err) break; skip: idx++; @@ -648,7 +650,7 @@ skip: cb->args[1] = idx; rules_ops_put(ops); - return skb->len; + return err; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) @@ -664,7 +666,9 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - return dump_rules(skb, cb, ops); + dump_rules(skb, cb, ops); + + return skb->len; } rcu_read_lock(); -- cgit v1.1 From a46496ce38eeb401344d5623c1960dbf2f1769be Mon Sep 17 00:00:00 2001 From: Matt Bennett Date: Wed, 23 Sep 2015 16:58:31 +1200 Subject: ip6_gre: Reduce log level in ip6gre_err() to debug Currently error log messages in ip6gre_err are printed at 'warn' level. This is different to most other tunnel types which don't print any messages. These log messages don't provide any information that couldn't be deduced with networking tools. Also it can be annoying to have one end of the tunnel go down and have the logs fill with pointless messages such as "Path to destination invalid or inactive!". This patch reduces the log level of these messages to 'dbg' level to bring the visible behaviour into line with other tunnel types. Signed-off-by: Matt Bennett Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 6465124..3c7b931 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -404,13 +404,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); break; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); } break; case ICMPV6_PARAMPROB: @@ -421,12 +421,12 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (teli && teli == be32_to_cpu(info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: -- cgit v1.1 From 17a10c9215b39a5ea8faeb241759c1aee6553919 Mon Sep 17 00:00:00 2001 From: Matt Bennett Date: Fri, 25 Sep 2015 11:01:47 +1200 Subject: ip6_tunnel: Reduce log level in ip6_tnl_err() to debug Currently error log messages in ip6_tnl_err are printed at 'warn' level. This is different to other tunnel types which don't print any messages. These log messages don't provide any information that couldn't be deduced with networking tools. Also it can be annoying to have one end of the tunnel go down and have the logs fill with pointless messages such as "Path to destination invalid or inactive!". This patch reduces the log level of these messages to 'dbg' level to bring the visible behaviour into line with other tunnel types. Signed-off-by: Matt Bennett Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 983f0d2..eabffbb 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -569,14 +569,14 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } break; @@ -588,13 +588,13 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: -- cgit v1.1 From e496ae690b2faff751e1849fb97b060615e21f28 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:35:57 +0100 Subject: net: dsa: fix of_mdio_find_bus() device refcount leak Current users of of_mdio_find_bus() leak a struct device refcount, as they fail to clean up the reference obtained inside class_find_device(). Fix the DSA code to properly refcount the returned MDIO bus by: 1. taking a reference on the struct device whenever we assign it to pd->chip[x].host_dev. 2. dropping the reference when we overwrite the existing reference. 3. dropping the reference when we free the data structure. 4. dropping the initial reference we obtained after setting up the platform data structure, or on failure. In step 2 above, where we obtain a new MDIO bus, there is no need to take a reference on it as we would only have to drop it immediately after assignment again, iow: put_device(cd->host_dev); /* drop original assignment ref */ cd->host_dev = get_device(&mdio_bus_switch->dev); /* get our ref */ put_device(&mdio_bus_switch->dev); /* drop of_mdio_find_bus ref */ Signed-off-by: Russell King Signed-off-by: David S. Miller --- net/dsa/dsa.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 76e380076..bf4ba15 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -634,6 +634,10 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) port_index++; } kfree(pd->chip[i].rtable); + + /* Drop our reference to the MDIO bus device */ + if (pd->chip[i].host_dev) + put_device(pd->chip[i].host_dev); } kfree(pd->chip); } @@ -661,16 +665,22 @@ static int dsa_of_probe(struct device *dev) return -EPROBE_DEFER; ethernet = of_parse_phandle(np, "dsa,ethernet", 0); - if (!ethernet) - return -EINVAL; + if (!ethernet) { + ret = -EINVAL; + goto out_put_mdio; + } ethernet_dev = of_find_net_device_by_node(ethernet); - if (!ethernet_dev) - return -EPROBE_DEFER; + if (!ethernet_dev) { + ret = -EPROBE_DEFER; + goto out_put_mdio; + } pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return -ENOMEM; + if (!pd) { + ret = -ENOMEM; + goto out_put_mdio; + } dev->platform_data = pd; pd->of_netdev = ethernet_dev; @@ -691,7 +701,9 @@ static int dsa_of_probe(struct device *dev) cd = &pd->chip[chip_index]; cd->of_node = child; - cd->host_dev = &mdio_bus->dev; + + /* When assigning the host device, increment its refcount */ + cd->host_dev = get_device(&mdio_bus->dev); sw_addr = of_get_property(child, "reg", NULL); if (!sw_addr) @@ -711,6 +723,12 @@ static int dsa_of_probe(struct device *dev) ret = -EPROBE_DEFER; goto out_free_chip; } + + /* Drop the mdio_bus device ref, replacing the host + * device with the mdio_bus_switch device, keeping + * the refcount from of_mdio_find_bus() above. + */ + put_device(cd->host_dev); cd->host_dev = &mdio_bus_switch->dev; } @@ -744,6 +762,10 @@ static int dsa_of_probe(struct device *dev) } } + /* The individual chips hold their own refcount on the mdio bus, + * so drop ours */ + put_device(&mdio_bus->dev); + return 0; out_free_chip: @@ -751,6 +773,8 @@ out_free_chip: out_free: kfree(pd); dev->platform_data = NULL; +out_put_mdio: + put_device(&mdio_bus->dev); return ret; } -- cgit v1.1 From 9861f72074c77a8a065622c1be7e9c4277e600eb Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 24 Sep 2015 20:36:33 +0100 Subject: net: fix net_device refcounting of_find_net_device_by_node() uses class_find_device() internally to lookup the corresponding network device. class_find_device() returns a reference to the embedded struct device, with its refcount incremented. Add a comment to the definition in net/core/net-sysfs.c indicating the need to drop this refcount, and fix the DSA code to drop this refcount when the OF-generated platform data is cleaned up and freed. Also arrange for the ref to be dropped when handling errors. Signed-off-by: Russell King Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 9 +++++++++ net/dsa/dsa.c | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b279077..805a95a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1481,6 +1481,15 @@ static int of_dev_node_match(struct device *dev, const void *data) return ret == 0 ? dev->of_node == data : ret; } +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. + */ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index bf4ba15..c59fa5d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -679,7 +679,7 @@ static int dsa_of_probe(struct device *dev) pd = kzalloc(sizeof(*pd), GFP_KERNEL); if (!pd) { ret = -ENOMEM; - goto out_put_mdio; + goto out_put_ethernet; } dev->platform_data = pd; @@ -773,6 +773,8 @@ out_free_chip: out_free: kfree(pd); dev->platform_data = NULL; +out_put_ethernet: + put_device(ðernet_dev->dev); out_put_mdio: put_device(&mdio_bus->dev); return ret; @@ -786,6 +788,7 @@ static void dsa_of_remove(struct device *dev) return; dsa_of_free_platform_data(pd); + put_device(&pd->of_netdev->dev); kfree(pd); } #else -- cgit v1.1 From bdb06cbf77cb01911694cc9076ffa8196b7b9b61 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 24 Sep 2015 15:31:29 -0600 Subject: net: Fix panic in icmp_route_lookup Andrey reported a panic: [ 7249.865507] BUG: unable to handle kernel pointer dereference at 000000b4 [ 7249.865559] IP: [] icmp_route_lookup+0xaa/0x320 [ 7249.865598] *pdpt = 0000000030f7f001 *pde = 0000000000000000 [ 7249.865637] Oops: 0000 [#1] ... [ 7249.866811] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.3.0-999-generic #201509220155 [ 7249.866876] Hardware name: MSI MS-7250/MS-7250, BIOS 080014 08/02/2006 [ 7249.866916] task: c1a5ab00 ti: c1a52000 task.ti: c1a52000 [ 7249.866949] EIP: 0060:[] EFLAGS: 00210246 CPU: 0 [ 7249.866981] EIP is at icmp_route_lookup+0xaa/0x320 [ 7249.867012] EAX: 00000000 EBX: f483ba48 ECX: 00000000 EDX: f2e18a00 [ 7249.867045] ESI: 000000c0 EDI: f483ba70 EBP: f483b9ec ESP: f483b974 [ 7249.867077] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 [ 7249.867108] CR0: 8005003b CR2: 000000b4 CR3: 36ee07c0 CR4: 000006f0 [ 7249.867141] Stack: [ 7249.867165] 320310ee 00000000 00000042 320310ee 00000000 c1aeca00 f3920240 f0c69180 [ 7249.867268] f483ba04 f855058b a89b66cd f483ba44 f8962f4b 00000000 e659266c f483ba54 [ 7249.867361] 8004753c f483ba5c f8962f4b f2031140 000003c1 ffbd8fa0 c16b0e00 00000064 [ 7249.867448] Call Trace: [ 7249.867494] [] ? e1000_xmit_frame+0x87b/0xdc0 [e1000e] [ 7249.867534] [] ? tcp_in_window+0xeb/0xb10 [nf_conntrack] [ 7249.867576] [] ? tcp_in_window+0xeb/0xb10 [nf_conntrack] [ 7249.867615] [] ? icmp_send+0xa0/0x380 [ 7249.867648] [] icmp_send+0x2cf/0x380 [ 7249.867681] [] nf_send_unreach+0xa6/0xc0 [nf_reject_ipv4] [ 7249.867714] [] reject_tg+0x7a/0x9f [ipt_REJECT] [ 7249.867746] [] ipt_do_table+0x317/0x70c [ip_tables] [ 7249.867780] [] ? __nf_conntrack_find_get+0x166/0x3b0 [nf_conntrack] [ 7249.867838] [] ? nf_conntrack_in+0x398/0x600 [nf_conntrack] [ 7249.867889] [] iptable_filter_hook+0x35/0x80 [iptable_filter] [ 7249.867933] [] nf_iterate+0x71/0x80 [ 7249.867970] [] nf_hook_slow+0x65/0xc0 [ 7249.868002] [] __ip_local_out_sk+0xc1/0xd0 [ 7249.868034] [] ? ip_forward_options+0x1a0/0x1a0 [ 7249.868066] [] ip_local_out_sk+0x16/0x30 [ 7249.868097] [] ip_send_skb+0x14/0x80 [ 7249.868129] [] ip_push_pending_frames+0x34/0x40 [ 7249.868163] [] ip_send_unicast_reply+0x282/0x310 [ 7249.868196] [] tcp_v4_send_reset+0x1b3/0x380 [ 7249.868227] [] tcp_v4_rcv+0x323/0x990 [ 7249.868257] [] ? nf_iterate+0x71/0x80 [ 7249.868289] [] ip_local_deliver_finish+0x8b/0x230 [ 7249.868322] [] ip_local_deliver+0x4c/0xa0 [ 7249.868353] [] ? ip_rcv_finish+0x390/0x390 [ 7249.868384] [] ip_rcv_finish+0x7c/0x390 [ 7249.868415] [] ip_rcv+0x2e0/0x420 ... Prior to the VRF change the oif was not set in the flow struct, so the VRF support should really have only added the vrf_master_ifindex lookup. Fixes: 613d09b30f8b ("net: Use VRF device index for lookups on TX") Cc: Andrey Melnikov Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 79fe05b..e5eb8ac 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; - fl4.flowi4_oif = vrf_master_ifindex(skb->dev) ? : skb->dev->ifindex; + fl4.flowi4_oif = vrf_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -461,7 +461,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex; + fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4); -- cgit v1.1