From 0f97ede45e65ffb6eab856313e79b14b902bcfaa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 2 Apr 2014 20:52:56 +0200
Subject: packet: report tx_dropped in packet_direct_xmit

Since commit 015f0688f57c ("net: net: add a core netdev->tx_dropped
counter"), we can now account for TX drops from within the core stack
instead of drivers. Therefore, fix packet_direct_xmit() and increase the
drop count when we encounter a problem before the driver's xmit function
is called (we do not want to doubly account for it).

Suggested-by: Eric Dumazet
Signed-off-by: Daniel Borkmann
Signed-off-by: David S. Miller
---
 net/packet/af_packet.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 01039d2..c81a971 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -275,6 +275,7 @@ static int packet_direct_xmit(struct sk_buff *skb)
 	return ret;
 
 drop:
+	atomic_long_inc(&dev->tx_dropped);
 	kfree_skb(skb);
 	return NET_XMIT_DROP;
 }
--
cgit v1.1

From 8e2f1a63f2217365223026422a2f8ba5967051d6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 2 Apr 2014 20:52:57 +0200
Subject: packet: fix packet_direct_xmit for BQL enabled drivers

Currently, in packet_direct_xmit() we test the assigned netdevice queue
for netif_xmit_frozen_or_stopped() before doing an ndo_start_xmit().

This can have the side effect that, for BQL enabled drivers which make
use of netdev_tx_sent_queue() internally, __QUEUE_STATE_STACK_XOFF can
be set from within the stack, and we would then not fully fill the
device's TX ring from packet sockets with PACKET_QDISC_BYPASS enabled.

Instead, use a test without the BQL bit so that bursts can be absorbed
into the NIC's TX ring. Fix and code suggested by Eric Dumazet, thanks!

Signed-off-by: Eric Dumazet
Signed-off-by: Daniel Borkmann
Signed-off-by: David S. Miller
---
 net/packet/af_packet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c81a971..72e0c71 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -261,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb)
 
 	local_bh_disable();
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
-	if (!netif_xmit_frozen_or_stopped(txq)) {
+	if (!netif_xmit_frozen_or_drv_stopped(txq)) {
 		ret = ops->ndo_start_xmit(skb, dev);
 		if (ret == NETDEV_TX_OK)
 			txq_trans_update(txq);
--
cgit v1.1
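
Taken together, the two packet patches above leave packet_direct_xmit()
looking roughly like the sketch below. Only the identifiers visible in
the hunks are from the source; the pre-check and the queue selection are
assumed glue added purely for illustration:

    static int packet_direct_xmit(struct sk_buff *skb)
    {
            struct net_device *dev = skb->dev;
            const struct net_device_ops *ops = dev->netdev_ops;
            struct netdev_queue *txq;
            int ret = NETDEV_TX_BUSY;

            /* assumed pre-check, for illustration only */
            if (unlikely(skb->len > dev->mtu + dev->hard_header_len))
                    goto drop;

            txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

            local_bh_disable();
            HARD_TX_LOCK(dev, txq, smp_processor_id());
            /* Test only the driver/frozen bits, not __QUEUE_STATE_STACK_XOFF,
             * so BQL drivers keep absorbing bursts into the TX ring. */
            if (!netif_xmit_frozen_or_drv_stopped(txq)) {
                    ret = ops->ndo_start_xmit(skb, dev);
                    if (ret == NETDEV_TX_OK)
                            txq_trans_update(txq);
            }
            HARD_TX_UNLOCK(dev, txq);
            local_bh_enable();
            return ret;

    drop:
            /* Failed before the driver ever saw the skb, so count it in
             * the core tx_dropped counter added by 015f0688f57c. */
            atomic_long_inc(&dev->tx_dropped);
            kfree_skb(skb);
            return NET_XMIT_DROP;
    }
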
From d0290214de712150b118a532ded378a29255893b Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Wed, 2 Apr 2014 23:09:31 +0200
Subject: net: add busy_poll device feature

Currently there is no way to find out whether a device supports busy
polling, so add a feature and make it dependent on the existence of
ndo_busy_poll.

Signed-off-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 net/core/dev.c     | 7 +++++++
 net/core/ethtool.c | 1 +
 2 files changed, 8 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 7570634..75e88e0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5696,6 +5696,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		}
 	}
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (dev->netdev_ops->ndo_busy_poll)
+		features |= NETIF_F_BUSY_POLL;
+	else
+#endif
+		features &= ~NETIF_F_BUSY_POLL;
+
 	return features;
 }
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 30071de..640ba0e 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -97,6 +97,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_RXFCS_BIT] = "rx-fcs",
 	[NETIF_F_RXALL_BIT] = "rx-all",
 	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+	[NETIF_F_BUSY_POLL_BIT] = "busy-poll",
 };
 
 static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
--
cgit v1.1

From a5e7ac5ce134d8f72f59631011fafa7bbf7ca174 Mon Sep 17 00:00:00 2001
From: Erik Hugne
Date: Thu, 3 Apr 2014 08:28:01 +0200
Subject: tipc: fix regression bug where node events are not being generated

Commit 5902385a2440a55f005b266c93e0bb9398e5a62b ("tipc: obsolete the
remote management feature") introduces a regression where node topology
events are not being generated, because the publication that triggers
this, {0, <z.c.n>, <z.c.n>}, is no longer available. This will break
applications that rely on node events to discover when nodes join or
leave a cluster.

We fix this by advertising the node publication when TIPC enters
networking mode, and withdrawing it upon shutdown.

Signed-off-by: Erik Hugne
Reviewed-by: Jon Maloy
Reviewed-by: Ying Xue
Signed-off-by: David S. Miller
---
 net/tipc/net.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/tipc/net.c b/net/tipc/net.c
index 0374a81..4c564eb 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -182,6 +182,8 @@ void tipc_net_start(u32 addr)
 	tipc_bclink_init();
 	write_unlock_bh(&tipc_net_lock);
 
+	tipc_nametbl_publish(TIPC_CFG_SRV, tipc_own_addr, tipc_own_addr,
+			     TIPC_ZONE_SCOPE, 0, tipc_own_addr);
 	pr_info("Started in network mode\n");
 	pr_info("Own node address %s, network identity %u\n",
 		tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id);
@@ -192,6 +194,7 @@ void tipc_net_stop(void)
 	if (!tipc_own_addr)
 		return;
 
+	tipc_nametbl_withdraw(TIPC_CFG_SRV, tipc_own_addr, 0, tipc_own_addr);
 	write_lock_bh(&tipc_net_lock);
 	tipc_bearer_stop();
 	tipc_bclink_stop();
--
cgit v1.1
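
A note on the busy_poll feature patch above: a driver never sets
NETIF_F_BUSY_POLL itself, because netdev_fix_features() derives the bit
from the presence of the hook. A minimal, hypothetical driver sketch
(all foo_* names are made up for illustration; the hook's return
convention is per the busy-poll infrastructure of this kernel era):

    #ifdef CONFIG_NET_RX_BUSY_POLL
    /* Hypothetical low-latency hook: poll the RX ring directly from
     * syscall context, bypassing the softirq path; returns the number
     * of packets processed. */
    static int foo_busy_poll(struct napi_struct *napi)
    {
            int work = 0;
            /* ... clean the RX ring, counting packets into 'work' ... */
            return work;
    }
    #endif

    static const struct net_device_ops foo_netdev_ops = {
            /* .ndo_open, .ndo_start_xmit, ... (elided) */
    #ifdef CONFIG_NET_RX_BUSY_POLL
            .ndo_busy_poll = foo_busy_poll, /* this alone enables "busy-poll" */
    #endif
    };

After register_netdev(), "ethtool -k <iface>" should then list the new
"busy-poll" string added in the ethtool hunk.
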
From e33d0ba8047b049c9262fdb1fcafb93cb52ceceb Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 3 Apr 2014 09:28:10 -0700
Subject: net-gro: reset skb->truesize in napi_reuse_skb()

Recycling skbs has always been very tricky...

This time, it appears the GRO layer can accumulate skb->truesize
adjustments made by drivers when they attach a fragment to an skb;
skb_gro_receive() can only subtract the used part of a fragment from
skb->truesize.

I spotted this problem by seeing unexpected TcpExtPruneCalled and
TcpExtTCPRcvCollapsed counters on a recent kernel, where the TCP receive
window should be sized properly to accept traffic coming from a driver
that does not overshoot skb->truesize.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 75e88e0..5777018 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4043,6 +4043,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 	skb->vlan_tci = 0;
 	skb->dev = napi->dev;
 	skb->skb_iif = 0;
+	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 
 	napi->skb = skb;
 }
--
cgit v1.1

From e5ac6eafba887821044c65b6fe59d9eb8b7c7f61 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Mon, 17 Mar 2014 22:27:50 +0100
Subject: netfilter: connlimit: fix UP build

We cannot use ARRAY_SIZE() if spinlock_t is an empty struct.

Fixes: 1442e7507dd597 ("netfilter: connlimit: use keyed locks")
Reported-by: kbuild test robot
Signed-off-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/xt_connlimit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 458464e..a6e129e 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -377,7 +377,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < ARRAY_SIZE(info->data->locks); ++i)
+	for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
 		spin_lock_init(&info->data->locks[i]);
 
 	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
--
cgit v1.1
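
Regarding the napi_reuse_skb() patch above: SKB_TRUESIZE() recomputes a
baseline truesize from the linear buffer size alone. In
include/linux/skbuff.h of this era it is defined roughly as follows
(quoted from memory, not from the patch itself):

    #define SKB_TRUESIZE(X) ((X) +                                      \
                     SKB_DATA_ALIGN(sizeof(struct sk_buff)) +           \
                     SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

So the reset discards whatever fragment-driven inflation the driver had
added before the skb is recycled, restoring the invariant that a reused
napi skb accounts only for its own linear area.
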
From e00b437b3d6d4d26ecd95108b575ee1bcfcb478f Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 20 Mar 2014 11:53:39 +0100
Subject: netfilter: connlimit: move lock array out of struct connlimit_data

Eric points out that the locks can be global.

Moreover, both Jesper and Eric note that using only 32 locks increases
false sharing, as only two cache lines are used.

This increases the number of locks to 256 (16 cache lines, assuming a
64-byte cacheline and 4 bytes per spinlock).

Suggested-by: Jesper Dangaard Brouer
Suggested-by: Eric Dumazet
Signed-off-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/xt_connlimit.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index a6e129e..fbc66bb 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -32,8 +32,14 @@
 #include
 #include
 
-#define CONNLIMIT_SLOTS		32
-#define CONNLIMIT_LOCK_SLOTS	32
+#define CONNLIMIT_SLOTS		256U
+
+#ifdef CONFIG_LOCKDEP
+#define CONNLIMIT_LOCK_SLOTS	8U
+#else
+#define CONNLIMIT_LOCK_SLOTS	256U
+#endif
+
 #define CONNLIMIT_GC_MAX_NODES	8
 
 /* we will save the tuples of all connections we care about */
@@ -49,10 +55,11 @@ struct xt_connlimit_rb {
 	union nf_inet_addr addr; /* search key */
 };
 
+static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp;
+
 struct xt_connlimit_data {
 	struct rb_root climit_root4[CONNLIMIT_SLOTS];
 	struct rb_root climit_root6[CONNLIMIT_SLOTS];
-	spinlock_t locks[CONNLIMIT_LOCK_SLOTS];
 };
 
 static u_int32_t connlimit_rnd __read_mostly;
@@ -297,11 +304,11 @@ static int count_them(struct net *net,
 		root = &data->climit_root4[hash];
 	}
 
-	spin_lock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]);
+	spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
 	count = count_tree(net, root, tuple, addr, mask, family);
-	spin_unlock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]);
+	spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
 
 	return count;
 }
@@ -377,9 +384,6 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
-		spin_lock_init(&info->data->locks[i]);
-
 	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
 		info->data->climit_root4[i] = RB_ROOT;
 	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
@@ -435,11 +439,14 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
 
 static int __init connlimit_mt_init(void)
 {
-	int ret;
+	int ret, i;
 
 	BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
 	BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
 
+	for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
+		spin_lock_init(&xt_connlimit_locks[i]);
+
 	connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
 					   sizeof(struct xt_connlimit_conn),
 					   0, 0, NULL);
--
cgit v1.1
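
To make the cache-line arithmetic in the changelog concrete: 32
spinlocks * 4 bytes = 128 bytes, i.e. only 2 cache lines of 64 bytes
shared by all CPUs, while 256 * 4 = 1024 bytes spreads the slots over
16 lines. The keyed-lock pattern itself, reduced to a sketch (names
shortened, not the literal source):

    /* One global, cacheline-aligned lock array shared by all rule
     * instances; a tuple hash picks the slot. */
    static spinlock_t locks[LOCK_SLOTS] __cacheline_aligned_in_smp;

    static void count_locked(u32 hash)
    {
            /* LOCK_SLOTS must divide the number of hash slots (see the
             * BUILD_BUG_ONs in the patch), so the modulo maps tree
             * slots onto locks uniformly. */
            spin_lock_bh(&locks[hash % LOCK_SLOTS]);
            /* ... walk/prune the rbtree for this hash slot ... */
            spin_unlock_bh(&locks[hash % LOCK_SLOTS]);
    }
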
From a00e76349f3564bb8129fc0510dfd93248c3084d Mon Sep 17 00:00:00 2001
From: Alexey Perevalov
Date: Wed, 19 Mar 2014 10:58:42 +0400
Subject: netfilter: x_tables: allow to use cgroup match for LOCAL_IN nf hooks

This simple modification allows iptables to work with the INPUT chain
in combination with the cgroup module. It could be useful for counting
ingress traffic per cgroup with the nfacct netfilter module. Formerly
there was no problem counting egress traffic that way.

It's possible to get a classified sk_buff after PREROUTING because the
socket lookup is done in early_demux (tcp_v4_early_demux). It works for
UDP as well.

Trivial usage example, assuming we're in the same shell every step and
we have enough permissions:

1) Classic net_cls cgroup initialization:

	mkdir /sys/fs/cgroup/net_cls
	mount -t cgroup -o net_cls net_cls /sys/fs/cgroup/net_cls

2) Set up a cgroup for the interesting application:

	mkdir /sys/fs/cgroup/net_cls/wget
	echo 1 > /sys/fs/cgroup/net_cls/wget/net_cls.classid
	echo $BASHPID > /sys/fs/cgroup/net_cls/wget/cgroup.procs

3) Create kernel counters:

	nfacct add wget-cgroup-in
	iptables -A INPUT -m cgroup ! --cgroup 1 -m nfacct --nfacct-name wget-cgroup-in

	nfacct add wget-cgroup-out
	iptables -A OUTPUT -m cgroup ! --cgroup 1 -m nfacct --nfacct-name wget-cgroup-out

4) Network usage:

	wget https://www.kernel.org/pub/linux/kernel/v3.x/testing/linux-3.14-rc6.tar.xz

5) Check the results:

	nfacct list

The cgroup approach is being used for the DataUsage (counting & blocking
traffic) feature of Samsung's modification of the Tizen OS.

Signed-off-by: Alexey Perevalov
Acked-by: Daniel Borkmann
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/xt_cgroup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 9a8e77e7..f4e8330 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -54,7 +54,8 @@ static struct xt_match cgroup_mt_reg __read_mostly = {
 	.matchsize  = sizeof(struct xt_cgroup_info),
 	.me         = THIS_MODULE,
 	.hooks      = (1 << NF_INET_LOCAL_OUT) |
-		      (1 << NF_INET_POST_ROUTING),
+		      (1 << NF_INET_POST_ROUTING) |
+		      (1 << NF_INET_LOCAL_IN),
 };
 
 static int __init cgroup_mt_init(void)
--
cgit v1.1

From b8ddd9eac8788b0aa9a9d4e09d76dc9e1667bb2c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Wed, 26 Mar 2014 14:37:59 +0400
Subject: netfilter: Add {ipt,ip6t}_osf aliases for xt_osf

These aliases do not exist, so the kernel cannot request the appropriate
match table:

	$ iptables -I INPUT -p tcp -m osf --genre Windows --ttl 2 -j DROP
	iptables: No chain/target/match by that name.

setsockopt() requests the ipt_osf module, which is not present. Add the
aliases.

Signed-off-by: Kirill Tkhai
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/xt_osf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 7174611..c529161 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -422,4 +422,6 @@ module_exit(xt_osf_fini);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Evgeniy Polyakov ");
 MODULE_DESCRIPTION("Passive OS fingerprint matching.");
+MODULE_ALIAS("ipt_osf");
+MODULE_ALIAS("ip6t_osf");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
--
cgit v1.1

From a9bdd8365684810e3de804f8c51e52c26a5eccbb Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Mon, 24 Mar 2014 15:10:37 +0100
Subject: netfilter: nf_tables: set names cannot be larger than 15 bytes

Currently, nf_tables trims off the set name if it exceeds 15 bytes, so
explicitly reject set names that are too large.

Reported-by: Giuseppe Longo
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_tables_api.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 33045a5..43ae487 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1946,7 +1946,8 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const
 
 static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
 	[NFTA_SET_TABLE]	= { .type = NLA_STRING },
-	[NFTA_SET_NAME]		= { .type = NLA_STRING },
+	[NFTA_SET_NAME]		= { .type = NLA_STRING,
+				    .len = IFNAMSIZ - 1 },
 	[NFTA_SET_FLAGS]	= { .type = NLA_U32 },
 	[NFTA_SET_KEY_TYPE]	= { .type = NLA_U32 },
 	[NFTA_SET_KEY_LEN]	= { .type = NLA_U32 },
--
cgit v1.1
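
Why the aliases in the xt_osf patch above are enough: when iptables asks
for a match by name, the xtables core falls back to loading a module
named after the protocol-family prefix. Sketched from memory (the exact
format string lives in net/netfilter/x_tables.c of this period and is an
assumption here):

    /* in the xtables match lookup, roughly: */
    request_module("%st_%s", "ip", "osf");    /* IPv4 -> "ipt_osf"  */
    request_module("%st_%s", "ip6", "osf");   /* IPv6 -> "ip6t_osf" */
    /* MODULE_ALIAS("ipt_osf") / MODULE_ALIAS("ip6t_osf") let modprobe
     * map those request names onto xt_osf.ko. */

Similarly, in the set-name patch, giving the NLA_STRING policy a .len of
IFNAMSIZ - 1 makes netlink attribute parsing reject over-long names up
front instead of letting the 15-byte buffer silently truncate them.
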
From 2fec6bb6f484b1a88b4a325724234d6cfd08c918 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Mon, 31 Mar 2014 12:26:39 +0200
Subject: netfilter: nf_tables: fix wrong format in request_module()

The intended format in request_module() is %.*s instead of %*.s.

Reported-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_tables_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 43ae487..3fd159d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -152,8 +152,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi,
 #ifdef CONFIG_MODULES
 	if (autoload) {
 		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-chain-%u-%*.s", afi->family,
-			       nla_len(nla)-1, (const char *)nla_data(nla));
+		request_module("nft-chain-%u-%.*s", afi->family,
+			       nla_len(nla), (const char *)nla_data(nla));
 		nfnl_lock(NFNL_SUBSYS_NFTABLES);
 		type = __nf_tables_chain_type_lookup(afi->family, nla);
 		if (type != NULL)
--
cgit v1.1

From c58dd2dd443c26d856a168db108a0cd11c285bf3 Mon Sep 17 00:00:00 2001
From: Thomas Graf
Date: Fri, 4 Apr 2014 17:57:45 +0200
Subject: netfilter: Can't fail and free after table replacement

All xtables variants suffer from the defect that the copy_to_user()
that copies the counters to user memory may fail after the table has
already been exchanged and thus exposed. Returning an error at this
point would result in freeing the already exposed table. Any subsequent
packet processing would then result in a kernel panic.

We can't copy the counters before exposing the new table, as we want to
provide the counter state after the old table has been unhooked.
Therefore, convert this into a silent error.

Cc: Florian Westphal
Signed-off-by: Thomas Graf
Signed-off-by: Pablo Neira Ayuso
---
 net/bridge/netfilter/ebtables.c | 5 ++---
 net/ipv4/netfilter/arp_tables.c | 6 ++++--
 net/ipv4/netfilter/ip_tables.c  | 6 ++++--
 net/ipv6/netfilter/ip6_tables.c | 6 ++++--
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0e474b1..1059ed3 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1044,10 +1044,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
 	if (repl->num_counters &&
 	    copy_to_user(repl->counters, counterstmp,
 			 repl->num_counters * sizeof(struct ebt_counter))) {
-		ret = -EFAULT;
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n");
 	}
-	else
-		ret = 0;
 
 	/* decrease module count and free resources */
 	EBT_ENTRY_ITERATE(table->entries, table->entries_size,
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 59da7cd..f95b6f9 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 718dfbd..99e810f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
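diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 710238f..e080fbb 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1241,8 +1241,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
-			 sizeof(struct xt_counters) * num_counters) != 0)
-		ret = -EFAULT;
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Silent error, can't fail, new table is already in place */
+		net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n");
+	}
 	vfree(counters);
 	xt_table_unlock(t);
 	return ret;
--
cgit v1.1

On the request_module() format fix above: the difference between the
two format strings is easy to miss, and a userspace demo (hypothetical,
not from the patch) makes it visible:

    #include <stdio.h>

    int main(void)
    {
            /* "%.*s": the '*' is the precision - print at most 4 bytes */
            printf("[%.*s]\n", 4, "filter");   /* prints "[filt]" */
            /* "%*.s": the '*' is the field width; the precision after
             * '.' is empty, i.e. zero - print 0 bytes, padded to 4 */
            printf("[%*.s]\n", 4, "filter");   /* prints "[    ]" */
            return 0;
    }

So the old format string never passed the chain type name to modprobe
at all.
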
From 5f9fde5f799df7156eeb3fa58282e9fd2f38a5f8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sat, 5 Apr 2014 01:04:03 +0200
Subject: net: filter: be more defensive on div/mod by X==0

The old interpreter behaviour was that we returned 0 whenever we found
that a division by 0 would take place. In the new interpreter we would
currently just skip that instruction and continue execution.

It's true that a return value of 0 might not be appropriate in all
cases, but the current users (socket filters -> drop packet, seccomp ->
SECCOMP_RET_KILL, cls_bpf -> unclassified, etc) seem fine with that
behaviour. Better this than undefined BPF program behaviour, as it's
expected that A contains the result of the division. In the future, as
more use cases open up, we could further adapt this return value to our
needs, if necessary.

So reintroduce the return of 0 for division by 0, as in the old
interpreter. Also, in the case of K, which is guaranteed to be 32 bits
wide, sk_chk_filter() already takes care of preventing division by 0
invoked through K, so we can generally spare ourselves these tests.

Signed-off-by: Daniel Borkmann
Reviewed-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 net/core/filter.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 765556b..e08b382 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -295,43 +295,43 @@ select_insn:
 		(*(s64 *) &A) >>= K;
 		CONT;
 	BPF_ALU64_BPF_MOD_BPF_X:
+		if (unlikely(X == 0))
+			return 0;
 		tmp = A;
-		if (X)
-			A = do_div(tmp, X);
+		A = do_div(tmp, X);
 		CONT;
 	BPF_ALU_BPF_MOD_BPF_X:
+		if (unlikely(X == 0))
+			return 0;
 		tmp = (u32) A;
-		if (X)
-			A = do_div(tmp, (u32) X);
+		A = do_div(tmp, (u32) X);
 		CONT;
 	BPF_ALU64_BPF_MOD_BPF_K:
 		tmp = A;
-		if (K)
-			A = do_div(tmp, K);
+		A = do_div(tmp, K);
 		CONT;
 	BPF_ALU_BPF_MOD_BPF_K:
 		tmp = (u32) A;
-		if (K)
-			A = do_div(tmp, (u32) K);
+		A = do_div(tmp, (u32) K);
 		CONT;
 	BPF_ALU64_BPF_DIV_BPF_X:
-		if (X)
-			do_div(A, X);
+		if (unlikely(X == 0))
+			return 0;
+		do_div(A, X);
 		CONT;
 	BPF_ALU_BPF_DIV_BPF_X:
+		if (unlikely(X == 0))
+			return 0;
 		tmp = (u32) A;
-		if (X)
-			do_div(tmp, (u32) X);
+		do_div(tmp, (u32) X);
 		A = (u32) tmp;
 		CONT;
 	BPF_ALU64_BPF_DIV_BPF_K:
-		if (K)
-			do_div(A, K);
+		do_div(A, K);
 		CONT;
 	BPF_ALU_BPF_DIV_BPF_K:
 		tmp = (u32) A;
-		if (K)
-			do_div(tmp, (u32) K);
+		do_div(tmp, (u32) K);
 		A = (u32) tmp;
 		CONT;
 	BPF_ALU_BPF_END_BPF_TO_BE:
--
cgit v1.1
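
To see the restored behaviour from the filter author's side, here is a
hypothetical classic BPF program whose X divisor is zero; with this
patch, the interpreter returns 0 at the division (drop, for a socket
filter) instead of continuing with an undefined A:

    #include <linux/filter.h>

    struct sock_filter prog[] = {
            BPF_STMT(BPF_LD  | BPF_W | BPF_LEN, 0), /* A = skb->len  */
            BPF_STMT(BPF_LDX | BPF_W | BPF_IMM, 0), /* X = 0         */
            BPF_STMT(BPF_ALU | BPF_DIV | BPF_X, 0), /* A /= X: the
                                                     * interpreter now
                                                     * returns 0 here */
            BPF_STMT(BPF_RET | BPF_A, 0),           /* not reached   */
    };

The K variants need no runtime test because, as the changelog notes,
sk_chk_filter() rejects a constant zero divisor at load time.
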
From 6c6a9855560d3cfa93120f2ab07b8ea0110f953d Mon Sep 17 00:00:00 2001
From: Jean Sacren
Date: Sat, 5 Apr 2014 00:29:01 -0600
Subject: mac802154: fix duplicate #include headers

The commit e6278d92005e ("mac802154: use header operations to
create/parse headers") included the header net/ieee802154_netdev.h,
which had already been included by the commit b70ab2e87f17
("ieee802154: enforce consistent endianness in the 802.15.4 stack").
Fix this duplicate #include by deleting the latter one, as the required
header is already in place.

Signed-off-by: Jean Sacren
Cc: Alexander Smirnov
Cc: Dmitry Eremin-Solenikov
Cc: Phoebe Buckheister
Cc: linux-zigbee-devel@lists.sourceforge.net
Signed-off-by: David S. Miller
---
 net/mac802154/mib.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
index 153bd1d..f0991f2 100644
--- a/net/mac802154/mib.c
+++ b/net/mac802154/mib.c
@@ -26,7 +26,6 @@
 #include
 #include
 #include
-#include <net/ieee802154_netdev.h>
 #include
 
 #include "mac802154.h"
--
cgit v1.1

From 065d7e39563b092dbb429373bd8f0f2295768cea Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Sun, 6 Apr 2014 15:56:14 +0200
Subject: tipc: Let tipc_release() return 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

net/tipc/socket.c: In function ‘tipc_release’:
net/tipc/socket.c:352: warning: ‘res’ is used uninitialized in this function

Introduced by commit 24be34b5a0c9114541891d29dff1152bb1a8df34 ("tipc:
eliminate upcall function pointers between port and socket"), which
removed the sole initializer of "res". Just return 0 to fix it.

Signed-off-by: Geert Uytterhoeven
Signed-off-by: David S. Miller
---
 net/tipc/socket.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 29b7f26..adc12e2 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -301,7 +301,6 @@ static int tipc_release(struct socket *sock)
 	struct tipc_sock *tsk;
 	struct tipc_port *port;
 	struct sk_buff *buf;
-	int res;
 
 	/*
 	 * Exit if socket isn't fully initialized (occurs when a failed accept()
@@ -349,7 +348,7 @@ static int tipc_release(struct socket *sock)
 	sock_put(sk);
 	sock->sk = NULL;
 
-	return res;
+	return 0;
 }
 
 /**
--
cgit v1.1
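
The tipc warning above is the usual shape of this bug class; a minimal
reproduction (hypothetical function, not from the tree):

    static int release_thing(void)
    {
            int res;        /* no longer assigned anywhere */

            /* ... teardown code that used to set 'res' through the
             * removed upcall path ... */
            return res;     /* gcc: 'res' is used uninitialized */
    }

Since every remaining path can only succeed, returning the constant 0
both silences the warning and documents the function's contract.
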
From 6f25cd47dcd2b9912c6e52aa833ba1614f7b5086 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Mon, 7 Apr 2014 17:18:30 +0200
Subject: pktgen: fix xmit test for BQL enabled devices

As in commit 8e2f1a63f221 ("packet: fix packet_direct_xmit for BQL
enabled drivers"), we test for the __QUEUE_STATE_STACK_XOFF bit in
pktgen's xmit, which would keep the device's TX ring from being fully
filled for BQL drivers that use netdev_tx_sent_queue(). The fix is to
use, as we do in packet sockets, the netif_xmit_frozen_or_drv_stopped()
test.

Signed-off-by: Daniel Borkmann
Cc: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/pktgen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index d0dac57..d068ec2 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3340,7 +3340,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 
 	__netif_tx_lock_bh(txq);
 
-	if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
+	if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
 		ret = NETDEV_TX_BUSY;
 		pkt_dev->last_ok = 0;
 		goto unlock;
--
cgit v1.1

From 6859e7df6d9045a461412777e63bd8cef12f9705 Mon Sep 17 00:00:00 2001
From: Veaceslav Falico
Date: Mon, 7 Apr 2014 11:25:12 +0200
Subject: netdev: remove potentially harmful checks

Currently we're checking a variable for != NULL after actually
dereferencing it, in netdev_lower_get_next_private*(). It's
counter-intuitive at best, and can lead to faulty usage (as it implies
that the variable can be NULL), so fix it by removing the useless
checks.

Reported-by: Daniel Borkmann
CC: "David S. Miller"
CC: Eric Dumazet
CC: Nicolas Dichtel
CC: Jiri Pirko
CC: stephen hemminger
CC: Jerry Chu
Signed-off-by: Veaceslav Falico
Signed-off-by: David S. Miller
---
 net/core/dev.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 5777018..14dac06 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4589,8 +4589,7 @@ void *netdev_lower_get_next_private(struct net_device *dev,
 	if (&lower->list == &dev->adj_list.lower)
 		return NULL;
 
-	if (iter)
-		*iter = lower->list.next;
+	*iter = lower->list.next;
 
 	return lower->private;
 }
@@ -4618,8 +4617,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 	if (&lower->list == &dev->adj_list.lower)
 		return NULL;
 
-	if (iter)
-		*iter = &lower->list;
+	*iter = &lower->list;
 
 	return lower->private;
 }
--
cgit v1.1
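
The removed checks were misleading because the intended callers of
netdev_lower_get_next_private*() are the iteration helpers, which own
the cursor. A caller-side sketch (assuming the
netdev_for_each_lower_private() macro of this kernel version):

    struct list_head *iter;
    void *priv;

    /* 'iter' is seeded by the macro itself before the first call and
     * is therefore never NULL inside the callee; checking it *after*
     * it had already been dereferenced could never have helped. */
    netdev_for_each_lower_private(dev, priv, iter) {
            /* use priv, e.g. per-slave state hanging off 'dev' */
    }
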
From 52c35befb69b005c3fc5afdaae3a5717ad013411 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 8 Apr 2014 17:26:13 +0200
Subject: net: sctp: wake up all assocs if sndbuf policy is per socket

SCTP charges chunks for wmem accounting via skb->truesize in
sctp_set_owner_w(), and sctp_wfree() respectively as the reverse
operation. If a sender runs out of wmem, it needs to wait via
sctp_wait_for_sndbuf(), and gets woken up by a call to
__sctp_write_space(), mostly via sctp_wfree().

__sctp_write_space() is being called per association. Although we
assign sk->sk_write_space() to sctp_write_space(), which is then being
done per socket, it is only used if send space is increased per socket
option (SO_SNDBUF), as SOCK_USE_WRITE_QUEUE is set and it is therefore
not invoked in sock_wfree().

Commit 4c3a5bdae293 ("sctp: Don't charge for data in sndbuf again when
transmitting packet") fixed an issue where, in case
sctp_packet_transmit() manages to queue up more than sndbuf bytes,
sctp_wait_for_sndbuf() will never be woken up again unless it is
interrupted by a signal.

However, a still remaining issue is that if net.sctp.sndbuf_policy=0,
that is, accounting per socket, and one-to-many sockets are in use, the
reclaimed write space from sctp_wfree() is 'unfairly' handed back on
the server to the association that is the lucky one to be woken up
again via __sctp_write_space(), while the remaining associations are
never woken up again (unless by a signal).

The effect disappears with net.sctp.sndbuf_policy=1, that is, wmem
accounting per association, as it guarantees a fair share of wmem among
associations.

Therefore, if we have reclaimed memory in the case of per socket
accounting, wake all related associations of a socket in a fair manner;
that is, traverse the socket's association list starting from the
current neighbour of the association, and issue a __sctp_write_space()
to everyone until we end up waking ourselves. This guarantees that no
association is preferred over another, and that even if more
associations are taken into the one-to-many session, all receivers will
get messages from the server and are not stalled forever on high load.
This still leaves the advantage of per socket accounting intact, as an
association can still use up global limits if unused by others.

Fixes: 4eb701dfc618 ("[SCTP] Fix SCTP sendbuffer accouting.")
Signed-off-by: Daniel Borkmann
Cc: Thomas Graf
Cc: Neil Horman
Cc: Vlad Yasevich
Acked-by: Vlad Yasevich
Acked-by: Neil Horman
Signed-off-by: David S. Miller
---
 net/sctp/socket.c | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 981aaf8..5f83a6a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6593,6 +6593,40 @@ static void __sctp_write_space(struct sctp_association *asoc)
 	}
 }
 
+static void sctp_wake_up_waiters(struct sock *sk,
+				 struct sctp_association *asoc)
+{
+	struct sctp_association *tmp = asoc;
+
+	/* We do accounting for the sndbuf space per association,
+	 * so we only need to wake our own association.
+	 */
+	if (asoc->ep->sndbuf_policy)
+		return __sctp_write_space(asoc);
+
+	/* Accounting for the sndbuf space is per socket, so we
+	 * need to wake up others, try to be fair and in case of
+	 * other associations, let them have a go first instead
+	 * of just doing a sctp_write_space() call.
+	 *
+	 * Note that we reach sctp_wake_up_waiters() only when
+	 * associations free up queued chunks, thus we are under
+	 * lock and the list of associations on a socket is
+	 * guaranteed not to change.
+	 */
+	for (tmp = list_next_entry(tmp, asocs); 1;
+	     tmp = list_next_entry(tmp, asocs)) {
+		/* Manually skip the head element. */
+		if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs))
+			continue;
+		/* Wake up association. */
+		__sctp_write_space(tmp);
+		/* We've reached the end. */
+		if (tmp == asoc)
+			break;
+	}
+}
+
 /* Do accounting for the sndbuf space.
  * Decrement the used sndbuf space of the corresponding association by the
  * data size which was just transmitted(freed).
@@ -6620,7 +6654,7 @@ static void sctp_wfree(struct sk_buff *skb)
 	sk_mem_uncharge(sk, skb->truesize);
 	sock_wfree(skb);
-	__sctp_write_space(asoc);
+	sctp_wake_up_waiters(sk, asoc);
 	sctp_association_put(asoc);
 }
--
cgit v1.1
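
The loop added in sctp_wake_up_waiters() is a reusable pattern: a fair,
full-circle walk over a circular kernel list that starts at the current
node's successor, skips the bare list head, and finishes by processing
the current node last. Extracted as a standalone sketch with
hypothetical types:

    #include <linux/list.h>

    struct item {
            struct list_head node;
    };

    static void process(struct item *it)
    {
            /* stand-in for __sctp_write_space() in the patch */
    }

    static void process_all_fairly(struct list_head *head, struct item *self)
    {
            struct item *tmp = self;

            for (tmp = list_next_entry(tmp, node); 1;
                 tmp = list_next_entry(tmp, node)) {
                    if (&tmp->node == head)
                            continue;       /* skip the bare head element */
                    process(tmp);           /* peers first ... */
                    if (tmp == self)
                            break;          /* ... ourselves last */
            }
    }
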