From 750e9fad8c7f44e0005ffb7195f72dd76978c2cf Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 31 Aug 2010 05:50:43 +0000 Subject: ipv4: minor fix about RPF in help of Kconfig Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7c3a7d1..571f895 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER rp_filter on use: echo 1 > /proc/sys/net/ipv4/conf//rp_filter - and + or echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter Note that some distributions enable it in startup scripts. -- cgit v1.1 From 6f86b325189e0a53c97bf86cff0c8b02ff624934 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Sep 2010 22:36:19 -0700 Subject: ipv4: Fix reverse path filtering with multipath routing. Actually iterate over the next-hops to make sure we have a device match. Otherwise RP filtering is always elided when the route matched has multiple next-hops. Reported-by: Igor M Podlesny Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a439689..7d02a9f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -246,6 +246,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct fib_result res; int no_addr, rpf, accept_local; + bool dev_match; int ret; struct net *net; @@ -273,12 +274,22 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, } *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); + dev_match = false; + #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) + for (ret = 0; ret < res.fi->fib_nhs; ret++) { + struct fib_nh *nh = &res.fi->fib_nh[ret]; + + if (nh->nh_dev == dev) { + dev_match = true; + break; + } + } #else if (FIB_RES_DEV(res) == dev) + dev_match = true; #endif - { + if (dev_match) { ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; fib_res_put(&res); return ret; -- cgit v1.1 From f6b085b69d1cbbd62f49f34e71a3d58cb6d34b7e Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Tue, 7 Sep 2010 07:51:17 +0000 Subject: ipv4: Suppress lockdep-RCU false positive in FIB trie (3) Hi, Here is one more of these warnings and a patch below: Sep 5 23:52:33 del kernel: [46044.244833] =================================================== Sep 5 23:52:33 del kernel: [46044.269681] [ INFO: suspicious rcu_dereference_check() usage. ] Sep 5 23:52:33 del kernel: [46044.277000] --------------------------------------------------- Sep 5 23:52:33 del kernel: [46044.285185] net/ipv4/fib_trie.c:1756 invoked rcu_dereference_check() without protection! Sep 5 23:52:33 del kernel: [46044.293627] Sep 5 23:52:33 del kernel: [46044.293632] other info that might help us debug this: Sep 5 23:52:33 del kernel: [46044.293634] Sep 5 23:52:33 del kernel: [46044.325333] Sep 5 23:52:33 del kernel: [46044.325335] rcu_scheduler_active = 1, debug_locks = 0 Sep 5 23:52:33 del kernel: [46044.348013] 1 lock held by pppd/1717: Sep 5 23:52:33 del kernel: [46044.357548] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0xf/0x20 Sep 5 23:52:33 del kernel: [46044.367647] Sep 5 23:52:33 del kernel: [46044.367652] stack backtrace: Sep 5 23:52:33 del kernel: [46044.387429] Pid: 1717, comm: pppd Not tainted 2.6.35.4.4a #3 Sep 5 23:52:33 del kernel: [46044.398764] Call Trace: Sep 5 23:52:33 del kernel: [46044.409596] [] ? printk+0x18/0x1e Sep 5 23:52:33 del kernel: [46044.420761] [] lockdep_rcu_dereference+0xa9/0xb0 Sep 5 23:52:33 del kernel: [46044.432229] [] trie_firstleaf+0x65/0x70 Sep 5 23:52:33 del kernel: [46044.443941] [] fib_table_flush+0x14/0x170 Sep 5 23:52:33 del kernel: [46044.455823] [] ? local_bh_enable_ip+0x62/0xd0 Sep 5 23:52:33 del kernel: [46044.467995] [] ? _raw_spin_unlock_bh+0x2f/0x40 Sep 5 23:52:33 del kernel: [46044.480404] [] ? fib_sync_down_dev+0x120/0x180 Sep 5 23:52:33 del kernel: [46044.493025] [] fib_flush+0x2d/0x60 Sep 5 23:52:33 del kernel: [46044.505796] [] fib_disable_ip+0x25/0x50 Sep 5 23:52:33 del kernel: [46044.518772] [] fib_netdev_event+0x73/0xd0 Sep 5 23:52:33 del kernel: [46044.531918] [] notifier_call_chain+0x2d/0x70 Sep 5 23:52:33 del kernel: [46044.545358] [] raw_notifier_call_chain+0x1a/0x20 Sep 5 23:52:33 del kernel: [46044.559092] [] call_netdevice_notifiers+0x27/0x60 Sep 5 23:52:33 del kernel: [46044.573037] [] __dev_notify_flags+0x5c/0x80 Sep 5 23:52:33 del kernel: [46044.586489] [] dev_change_flags+0x37/0x60 Sep 5 23:52:33 del kernel: [46044.599394] [] devinet_ioctl+0x54d/0x630 Sep 5 23:52:33 del kernel: [46044.612277] [] inet_ioctl+0x97/0xc0 Sep 5 23:52:34 del kernel: [46044.625208] [] sock_ioctl+0x6f/0x270 Sep 5 23:52:34 del kernel: [46044.638046] [] ? handle_mm_fault+0x420/0x6c0 Sep 5 23:52:34 del kernel: [46044.650968] [] ? sock_ioctl+0x0/0x270 Sep 5 23:52:34 del kernel: [46044.663865] [] vfs_ioctl+0x28/0xa0 Sep 5 23:52:34 del kernel: [46044.676556] [] do_vfs_ioctl+0x6a/0x5c0 Sep 5 23:52:34 del kernel: [46044.688989] [] ? up_read+0x16/0x30 Sep 5 23:52:34 del kernel: [46044.701411] [] ? do_page_fault+0x1d6/0x3a0 Sep 5 23:52:34 del kernel: [46044.714223] [] ? fget_light+0xf8/0x2f0 Sep 5 23:52:34 del kernel: [46044.726601] [] ? sys_socketcall+0x208/0x2c0 Sep 5 23:52:34 del kernel: [46044.739140] [] sys_ioctl+0x63/0x70 Sep 5 23:52:34 del kernel: [46044.751967] [] syscall_call+0x7/0xb Sep 5 23:52:34 del kernel: [46044.764734] [] ? cookie_v6_check+0x3d0/0x630 --------------> This patch fixes the warning: =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- net/ipv4/fib_trie.c:1756 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 1 lock held by pppd/1717: #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0xf/0x20 stack backtrace: Pid: 1717, comm: pppd Not tainted 2.6.35.4a #3 Call Trace: [] ? printk+0x18/0x1e [] lockdep_rcu_dereference+0xa9/0xb0 [] trie_firstleaf+0x65/0x70 [] fib_table_flush+0x14/0x170 ... Allow trie_firstleaf() to be called either under rcu_read_lock() protection or with RTNL held. The same annotation is added to node_parent_rcu() to prevent a similar warning a bit later. Followup of commits 634a4b20 and 4eaa0e3c. Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 79d057a..4a8e370 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -186,7 +186,9 @@ static inline struct tnode *node_parent_rcu(struct node *node) { struct tnode *ret = node_parent(node); - return rcu_dereference(ret); + return rcu_dereference_check(ret, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); } /* Same as rcu_assign_pointer @@ -1753,7 +1755,9 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) static struct leaf *trie_firstleaf(struct trie *t) { - struct tnode *n = (struct tnode *) rcu_dereference(t->trie); + struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); if (!n) return NULL; -- cgit v1.1 From ae2688d59b5f861dc70a091d003773975d2ae7fb Mon Sep 17 00:00:00 2001 From: Jianzhao Wang Date: Wed, 8 Sep 2010 14:35:43 -0700 Subject: net: blackhole route should always be recalculated Blackhole routes are used when xfrm_lookup() returns -EREMOTE (error triggered by IKE for example), hence this kind of route is always temporary and so we should check if a better route exists for next packets. Bug has been introduced by commit d11a4dc18bf41719c9f0d7ed494d295dd2973b92. Signed-off-by: Jianzhao Wang Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/route.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3f56b6e..6298f75 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2738,6 +2738,11 @@ slow_output: } EXPORT_SYMBOL_GPL(__ip_route_output_key); +static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; +} + static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } @@ -2746,7 +2751,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, - .check = ipv4_dst_check, + .check = ipv4_blackhole_dst_check, .update_pmtu = ipv4_rt_blackhole_update_pmtu, .entries = ATOMIC_INIT(0), }; -- cgit v1.1 From 719f835853a92f6090258114a72ffe41f09155cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Sep 2010 05:08:44 +0000 Subject: udp: add rehash on connect() commit 30fff923 introduced in linux-2.6.33 (udp: bind() optimisation) added a secondary hash on UDP, hashed on (local addr, local port). Problem is that following sequence : fd = socket(...) connect(fd, &remote, ...) not only selects remote end point (address and port), but also sets local address, while UDP stack stored in secondary hash table the socket while its local address was INADDR_ANY (or ipv6 equivalent) Sequence is : - autobind() : choose a random local port, insert socket in hash tables [while local address is INADDR_ANY] - connect() : set remote address and port, change local address to IP given by a route lookup. When an incoming UDP frame comes, if more than 10 sockets are found in primary hash table, we switch to secondary table, and fail to find socket because its local address changed. One solution to this problem is to rehash datagram socket if needed. We add a new rehash(struct socket *) method in "struct proto", and implement this method for UDP v4 & v6, using a common helper. This rehashing only takes care of secondary hash table, since primary hash (based on local port only) is not changed. Reported-by: Krzysztof Piotr Oledzki Signed-off-by: Eric Dumazet Tested-by: Krzysztof Piotr Oledzki Signed-off-by: David S. Miller --- net/ipv4/datagram.c | 5 ++++- net/ipv4/udp.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f055094..721a8a3 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -62,8 +62,11 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } if (!inet->inet_saddr) inet->inet_saddr = rt->rt_src; /* Update source address */ - if (!inet->inet_rcv_saddr) + if (!inet->inet_rcv_saddr) { inet->inet_rcv_saddr = rt->rt_src; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } inet->inet_daddr = rt->rt_dst; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32e0bef..fb23c2e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk) } EXPORT_SYMBOL(udp_lib_unhash); +/* + * inet_rcv_saddr was changed, we must rehash secondary hash + */ +void udp_lib_rehash(struct sock *sk, u16 newhash) +{ + if (sk_hashed(sk)) { + struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_hslot *hslot, *hslot2, *nhslot2; + + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + nhslot2 = udp_hashslot2(udptable, newhash); + udp_sk(sk)->udp_portaddr_hash = newhash; + if (hslot2 != nhslot2) { + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); + /* we must lock primary chain too */ + spin_lock_bh(&hslot->lock); + + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &nhslot2->head); + nhslot2->count++; + spin_unlock(&nhslot2->lock); + + spin_unlock_bh(&hslot->lock); + } + } +} +EXPORT_SYMBOL(udp_lib_rehash); + +static void udp_v4_rehash(struct sock *sk) +{ + u16 new_hash = udp4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + udp_lib_rehash(sk, new_hash); +} + static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1843,6 +1886,7 @@ struct proto udp_prot = { .backlog_rcv = __udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, -- cgit v1.1 From 7998156344b0d93de61ff8e5d75e96500e43a571 Mon Sep 17 00:00:00 2001 From: Bob Arendt Date: Mon, 13 Sep 2010 12:56:03 -0700 Subject: ipv4: force_igmp_version ignored when a IGMPv3 query received After all these years, it turns out that the /proc/sys/net/ipv4/conf/*/force_igmp_version parameter isn't fully implemented. *Symptom*: When set force_igmp_version to a value of 2, the kernel should only perform multicast IGMPv2 operations (IETF rfc2236). An host-initiated Join message will be sent as a IGMPv2 Join message. But if a IGMPv3 query message is received, the host responds with a IGMPv3 join message. Per rfc3376 and rfc2236, a IGMPv2 host should treat a IGMPv3 query as a IGMPv2 query and respond with an IGMPv2 Join message. *Consequences*: This is an issue when a IGMPv3 capable switch is the querier and will only issue IGMPv3 queries (which double as IGMPv2 querys) and there's an intermediate switch that is only IGMPv2 capable. The intermediate switch processes the initial v2 Join, but fails to recognize the IGMPv3 Join responses to the Query, resulting in a dropped connection when the intermediate v2-only switch times it out. *Identifying issue in the kernel source*: The issue is in this section of code (in net/ipv4/igmp.c), which is called when an IGMP query is received (from mainline 2.6.36-rc3 gitweb): ... A IGMPv3 query has a length >= 12 and no sources. This routine will exit after line 880, setting the general query timer (random timeout between 0 and query response time). This calls igmp_gq_timer_expire(): ... .. which only sends a v3 response. So if a v3 query is received, the kernel always sends a v3 response. IGMP queries happen once every 60 sec (per vlan), so the traffic is low. A IGMPv3 query *is* a strict superset of a IGMPv2 query, so this patch properly short circuit's the v3 behaviour. One issue is that this does not address force_igmp_version=1. Then again, I've never seen any IGMPv1 multicast equipment in the wild. However there is a lot of v2-only equipment. If it's necessary to support the IGMPv1 case as well: 837 if (len == 8 || IGMP_V2_SEEN(in_dev) || IGMP_V1_SEEN(in_dev)) { Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a1ad0e7..1fdcacd 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -834,7 +834,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int mark = 0; - if (len == 8) { + if (len == 8 || IGMP_V2_SEEN(in_dev)) { if (ih->code == 0) { /* Alas, old v1 router presents here. */ -- cgit v1.1 From a89b47639f3e11dd9a8eb78a5d3382e109c876f2 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 10 Sep 2010 20:26:56 +0000 Subject: ipv4: enable getsockopt() for IP_NODEFRAG While integrating your man-pages patch for IP_NODEFRAG, I noticed that this option is settable by setsockopt(), but not gettable by getsockopt(). I suppose this is not intended. The (untested, trivial) patch below adds getsockopt() support. Signed-off-by: Michael kerrisk Acked-by: Jiri Olsa Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 6c40a8c..64b70ad 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1129,6 +1129,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_HDRINCL: val = inet->hdrincl; break; + case IP_NODEFRAG: + val = inet->nodefrag; + break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; -- cgit v1.1