From b531ed61a2a2a77eeb2f7c88b49aa5ec7d9880d8 Mon Sep 17 00:00:00 2001 From: Li Wei Date: Thu, 21 Feb 2013 00:09:54 +0000 Subject: ipv4: fix a bug in ping_err(). We should get 'type' and 'code' from the outer ICMP header. Signed-off-by: Li Wei Signed-off-by: David S. Miller --- net/ipv4/ping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 55c4ee1..2e91006 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -322,8 +322,8 @@ void ping_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr *)skb->data; struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); struct inet_sock *inet_sock; - int type = icmph->type; - int code = icmph->code; + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; -- cgit v1.1 From 08dcdbf6a7b9d14c2302c5bd0c5390ddf122f664 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Feb 2013 12:18:52 +0000 Subject: ipv6: use a stronger hash for tcp It looks like its possible to open thousands of TCP IPv6 sessions on a server, all landing in a single slot of TCP hash table. Incoming packets have to lookup sockets in a very long list. We should hash all bits from foreign IPv6 addresses, using a salt and hash mix, not a simple XOR. inet6_ehashfn() can also separately use the ports, instead of xoring them. Reported-by: Neal Cardwell Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e225a4e..9cbcb94 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -248,8 +248,12 @@ EXPORT_SYMBOL(inet_listen); u32 inet_ehash_secret __read_mostly; EXPORT_SYMBOL(inet_ehash_secret); +u32 ipv6_hash_secret __read_mostly; +EXPORT_SYMBOL(ipv6_hash_secret); + /* - * inet_ehash_secret must be set exactly once + * inet_ehash_secret must be set exactly once, and to a non nul value + * ipv6_hash_secret must be set exactly once. */ void build_ehash_secret(void) { @@ -259,7 +263,8 @@ void build_ehash_secret(void) get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - cmpxchg(&inet_ehash_secret, 0, rnd); + if (cmpxchg(&inet_ehash_secret, 0, rnd) == 0) + get_random_bytes(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); } EXPORT_SYMBOL(build_ehash_secret); -- cgit v1.1 From 5b0520425e5ea81ba95ec486dd6bbb59a09fff0e Mon Sep 17 00:00:00 2001 From: Li Wei Date: Thu, 21 Feb 2013 22:18:44 +0000 Subject: ipv4: fix error handling in icmp_protocol. Now we handle icmp errors in each transport protocol's err_handler, for icmp protocols, that is ping_err. Since this handler only care of those icmp errors triggered by echo request, errors triggered by echo reply(which sent by kernel) are sliently ignored. So wrap ping_err() with icmp_err() to deal with those icmp errors. Signed-off-by: Li Wei Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/icmp.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9cbcb94..15847e1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1577,7 +1577,7 @@ static const struct net_offload udp_offload = { static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, - .err_handler = ping_err, + .err_handler = icmp_err, .no_policy = 1, .netns_ok = 1, }; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 17ff9fd..3ac5dff 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -934,6 +934,29 @@ error: goto drop; } +void icmp_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + int type = icmp_hdr(skb)->type; + int code = icmp_hdr(skb)->code; + struct net *net = dev_net(skb->dev); + + /* + * Use ping_err to handle all icmp errors except those + * triggered by ICMP_ECHOREPLY which sent from kernel. + */ + if (icmph->type != ICMP_ECHOREPLY) { + ping_err(skb, info); + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); + else if (type == ICMP_REDIRECT) + ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); +} + /* * This table is the definition of how we handle ICMP. */ -- cgit v1.1 From 2bb60cb9b7b997bdbc7fd6c8001dcca02a4ea2e1 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 22 Feb 2013 06:38:44 +0000 Subject: net: Fix locking bug in netif_set_xps_queue Smatch found a locking bug in netif_set_xps_queue in which we were not releasing the lock in the case of an allocation failure. This change corrects that so that we release the xps_map_mutex before returning -ENOMEM in the case of an allocation failure. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 17bc535..18d8b5a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1882,8 +1882,10 @@ int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index) if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); - if (!new_dev_maps) + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); return -ENOMEM; + } map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; -- cgit v1.1 From cbda4eaffa2da9cefb89dd748803da689596d28b Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 22 Feb 2013 07:59:10 +0000 Subject: sock: only define socket limit if mem cgroup configured The mem cgroup socket limit is only used if the config option is enabled. Found with sparse Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index fe96c5d..b261a79 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -186,8 +186,10 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; +#if defined(CONFIG_MEMCG_KMEM) struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); +#endif /* * Make lock validator output more readable. (we pre-construct these -- cgit v1.1 From 1b63edd6ecc55c3a61b40297b49e2323783bddfd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 22 Feb 2013 08:59:06 +0000 Subject: tcp: fix SYN-data space mis-accounting In fast open the sender unncessarily reduces the space available for data in SYN by 12 bytes. This is because in the sender incorrectly reserves space for TS option twice in tcp_send_syn_data(): tcp_mtu_to_mss() already accounts for TS option space. But it further reserves MAX_TCP_OPTION_SPACE when computing the payload space. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fd0cea1..e2b4461 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1351,8 +1351,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } -/* Calculate MSS. Not accounting for SACKs here. */ -int tcp_mtu_to_mss(struct sock *sk, int pmtu) +/* Calculate MSS not accounting any TCP options. */ +static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1381,13 +1381,17 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) mss_now = 48; - - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - return mss_now; } +/* Calculate MSS. Not accounting for SACKs here. */ +int tcp_mtu_to_mss(struct sock *sk, int pmtu) +{ + /* Subtract TCP options size, not including SACKs */ + return __tcp_mtu_to_mss(sk, pmtu) - + (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} + /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { @@ -2930,7 +2934,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) */ if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; - space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - + space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; syn_data = skb_copy_expand(syn, skb_headroom(syn), space, -- cgit v1.1 From 6e601a53566d84e1ffd25e7b6fe0b6894ffd79c0 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 23 Feb 2013 01:13:47 +0000 Subject: sock_diag: Fix out-of-bounds access to sock_diag_handlers[] Userland can send a netlink message requesting SOCK_DIAG_BY_FAMILY with a family greater or equal then AF_MAX -- the array size of sock_diag_handlers[]. The current code does not test for this condition therefore is vulnerable to an out-of-bound access opening doors for a privilege escalation. Signed-off-by: Mathias Krause Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock_diag.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 602cd63..750f44f 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -121,6 +121,9 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(*req)) return -EINVAL; + if (req->sdiag_family >= AF_MAX) + return -EINVAL; + hndl = sock_diag_lock_handler(req->sdiag_family); if (hndl == NULL) err = -ENOENT; -- cgit v1.1 From 8e904550d0fffcda2b18d7ab12750b0c75757e89 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 23 Feb 2013 01:13:48 +0000 Subject: sock_diag: Simplify sock_diag_handlers[] handling in __sock_diag_rcv_msg The sock_diag_lock_handler() and sock_diag_unlock_handler() actually make the code less readable. Get rid of them and make the lock usage and access to sock_diag_handlers[] clear on the first sight. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- net/core/sock_diag.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 750f44f..a29e90c 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -97,21 +97,6 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld) } EXPORT_SYMBOL_GPL(sock_diag_unregister); -static const inline struct sock_diag_handler *sock_diag_lock_handler(int family) -{ - if (sock_diag_handlers[family] == NULL) - request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, family); - - mutex_lock(&sock_diag_table_mutex); - return sock_diag_handlers[family]; -} - -static inline void sock_diag_unlock_handler(const struct sock_diag_handler *h) -{ - mutex_unlock(&sock_diag_table_mutex); -} - static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; @@ -124,12 +109,17 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (req->sdiag_family >= AF_MAX) return -EINVAL; - hndl = sock_diag_lock_handler(req->sdiag_family); + if (sock_diag_handlers[req->sdiag_family] == NULL) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, req->sdiag_family); + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[req->sdiag_family]; if (hndl == NULL) err = -ENOENT; else err = hndl->dump(skb, nlh); - sock_diag_unlock_handler(hndl); + mutex_unlock(&sock_diag_table_mutex); return err; } -- cgit v1.1 From 490ab08127cebc25e3a260a74556b56ce5f47c0f Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 22 Feb 2013 07:30:30 +0000 Subject: IP_GRE: Fix IP-Identification. GRE-GSO generates ip fragments with id 0,2,3,4... for every GSO packet, which is not correct. Following patch fixes it by setting ip-header id unique id of fragments are allowed. As Eric Dumazet suggested it is optimized by using inner ip-header whenever inner packet is ipv4. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 6 ++++-- net/ipv4/ip_gre.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15847e1..68f6a94 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1332,8 +1332,10 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (skb->next != NULL) iph->frag_off |= htons(IP_MF); offset += (skb->len - skb->mac_len - iph->ihl * 4); - } else - iph->id = htons(id++); + } else { + if (!(iph->frag_off & htons(IP_DF))) + iph->id = htons(id++); + } iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5ef4da7..b8bada0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -970,7 +970,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; iph->ttl = ttl; - iph->id = 0; + + tunnel_ip_select_ident(skb, old_iph, &rt->dst); if (ttl == 0) { if (skb->protocol == htons(ETH_P_IP)) -- cgit v1.1 From 8f10098fb978c3cdd8ee7d72f7aa4330873f4544 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sun, 24 Feb 2013 20:05:05 +0000 Subject: IP_GRE: Fix GRE_CSUM case. commit "ip_gre: allow CSUM capable devices to handle packets" aa0e51cdda005cd37e2, broke GRE_CSUM case. GRE_CSUM needs checksum computed for inner packet. Therefore csum-calculation can not be offloaded if tunnel device requires GRE_CSUM. Following patch fixes it by computing inner packet checksum for GRE_CSUM type, for all other type of GRE devices csum is offloaded. CC: Dmitry Kravkov Signed-off-by: Pravin B Shelar Acked-by: Dmitry Kravkov Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b8bada0..4065129 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -735,7 +735,7 @@ drop: return 0; } -static struct sk_buff *handle_offloads(struct sk_buff *skb) +static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb) { int err; @@ -745,8 +745,12 @@ static struct sk_buff *handle_offloads(struct sk_buff *skb) goto error; skb_shinfo(skb)->gso_type |= SKB_GSO_GRE; return skb; - } - if (skb->ip_summed != CHECKSUM_PARTIAL) + } else if (skb->ip_summed == CHECKSUM_PARTIAL && + tunnel->parms.o_flags&GRE_CSUM) { + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } else if (skb->ip_summed != CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_NONE; return skb; @@ -776,7 +780,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev int err; int pkt_len; - skb = handle_offloads(skb); + skb = handle_offloads(tunnel, skb); if (IS_ERR(skb)) { dev->stats.tx_dropped++; return NETDEV_TX_OK; -- cgit v1.1 From 7992ae6df9733677fcc2e63c40b97854c605c399 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sun, 24 Feb 2013 20:05:12 +0000 Subject: Revert "ip_gre: propogate target device GSO capability to the tunnel device" This reverts commit eb6b9a8cad65e820b145547844b108117cece3a0. Above commit limits GSO capability of gre device to just TSO, but software GRE-GSO is capable of handling all GSO capabilities. This patch also fixes following panic which reverted commit introduced:- BUG: unable to handle kernel NULL pointer dereference at 00000000000000a2 IP: [] ipgre_tunnel_bind_dev+0x161/0x1f0 [ip_gre] PGD 42bc19067 PUD 42bca9067 PMD 0 Oops: 0000 [#1] SMP Pid: 2636, comm: ip Tainted: GF 3.8.0+ #83 Dell Inc. PowerEdge R620/0KCKR5 RIP: 0010:[] [] ipgre_tunnel_bind_dev+0x161/0x1f0 [ip_gre] RSP: 0018:ffff88042bfcb708 EFLAGS: 00010246 RAX: 00000000000005b6 RBX: ffff88042d2fa000 RCX: 0000000000000044 RDX: 0000000000000018 RSI: 0000000000000078 RDI: 0000000000000060 RBP: ffff88042bfcb748 R08: 0000000000000018 R09: 000000000000000c R10: 0000000000000020 R11: 000000000101010a R12: ffff88042d2fa800 R13: 0000000000000000 R14: ffff88042d2fa800 R15: ffff88042cd7f650 FS: 00007fa784f55700(0000) GS:ffff88043fd20000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000a2 CR3: 000000042d8b9000 CR4: 00000000000407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process ip (pid: 2636, threadinfo ffff88042bfca000, task ffff88042d142a80) Stack: 0000000100000000 002f000000000000 0a01010100000000 000000000b010101 ffff88042d2fa800 ffff88042d2fa000 ffff88042bfcb858 ffff88042f418c00 ffff88042bfcb798 ffffffffa068199a ffff88042bfcb798 ffff88042d2fa830 Call Trace: [] ipgre_newlink+0xca/0x160 [ip_gre] [] rtnl_newlink+0x532/0x5f0 [] ? rtnl_newlink+0x19c/0x5f0 [] rtnetlink_rcv_msg+0x2c8/0x340 [] ? rtnetlink_rcv+0x40/0x40 [] netlink_rcv_skb+0xa9/0xd0 [] rtnetlink_rcv+0x25/0x40 [] netlink_unicast+0x1ac/0x230 [] netlink_sendmsg+0x265/0x380 [] sock_sendmsg+0xb0/0xe0 [] ? move_addr_to_kernel+0x4e/0x90 [] ? verify_iovec+0x85/0xf0 [] __sys_sendmsg+0x3fd/0x420 [] ? handle_mm_fault+0x251/0x3b0 [] ? vma_link+0xcf/0xe0 [] sys_sendmsg+0x49/0x90 [] system_call_fastpath+0x16/0x1b CC: Dmitry Kravkov Signed-off-by: Pravin B Shelar Acked-by: Dmitry Kravkov Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4065129..d0ef0e6 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1106,14 +1106,8 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) tunnel->hlen = addend; /* TCP offload with GRE SEQ is not supported. */ if (!(tunnel->parms.o_flags & GRE_SEQ)) { - /* device supports enc gso offload*/ - if (tdev->hw_enc_features & NETIF_F_GRE_GSO) { - dev->features |= NETIF_F_TSO; - dev->hw_features |= NETIF_F_TSO; - } else { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; } return mtu; -- cgit v1.1