From 5d0ba55b6486f58cc890918d7167063d83f7fbb4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Jun 2012 01:17:19 +0000 Subject: net: use consume_skb() in place of kfree_skb() Remove some dropwatch/drop_monitor false positives. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/ip_output.c') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 451f97c..b99ca4e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -200,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) } if (skb->sk) skb_set_owner_w(skb2, skb->sk); - kfree_skb(skb); + consume_skb(skb); skb = skb2; } @@ -709,7 +709,7 @@ slow_path: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } - kfree_skb(skb); + consume_skb(skb); IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; -- cgit v1.1 From 95603e2293de556de7e82221649bfd7fd98b64a3 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 12 Jun 2012 10:16:35 +0000 Subject: net-next: add dev_loopback_xmit() to avoid duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dev_loopback_xmit() in order to deduplicate functions ip_dev_loopback_xmit() (in net/ipv4/ip_output.c) and ip6_dev_loopback_xmit() (in net/ipv6/ip6_output.c). I was about to reinvent the wheel when I noticed that ip_dev_loopback_xmit() and ip6_dev_loopback_xmit() do exactly what I need and are not IP-only functions, but they were not available to reuse elsewhere. ip6_dev_loopback_xmit() does not have line "skb_dst_force(skb);", but I understand that this is harmless, and should be in dev_loopback_xmit(). Signed-off-by: Michel Machado CC: "David S. Miller" CC: Alexey Kuznetsov CC: James Morris CC: Hideaki YOSHIFUJI CC: Patrick McHardy CC: Eric Dumazet CC: Jiri Pirko CC: "Michał Mirosław" CC: Ben Hutchings Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'net/ipv4/ip_output.c') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b99ca4e..0f3185a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -113,19 +113,6 @@ int ip_local_out(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ip_local_out); -/* dev_loopback_xmit for use with netfilter. */ -static int ip_dev_loopback_xmit(struct sk_buff *newskb) -{ - skb_reset_mac_header(newskb); - __skb_pull(newskb, skb_network_offset(newskb)); - newskb->pkt_type = PACKET_LOOPBACK; - newskb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(newskb)); - skb_dst_force(newskb); - netif_rx_ni(newskb); - return 0; -} - static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { int ttl = inet->uc_ttl; @@ -281,7 +268,7 @@ int ip_mc_output(struct sk_buff *skb) if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, - ip_dev_loopback_xmit); + dev_loopback_xmit); } /* Multicasts with ttl 0 must not go beyond the host */ @@ -296,7 +283,7 @@ int ip_mc_output(struct sk_buff *skb) struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, - NULL, newskb->dev, ip_dev_loopback_xmit); + NULL, newskb->dev, dev_loopback_xmit); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, -- cgit v1.1 From 70e7341673a47fb1525cfc7d6651cc98b5348928 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 28 Jun 2012 03:21:41 -0700 Subject: ipv4: Show that ip_send_reply() is purely unicast routine. Rename it to ip_send_unicast_reply() and add explicit 'saddr' argument. This removed one of the few users of rt->rt_spec_dst. Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4/ip_output.c') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0f3185a..2630900 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1459,13 +1459,14 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. ICMP should use this function too. + * Used to send TCP resets so far. * * Should run single threaded per socket because it uses the sock * structure to pass arguments. */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, - const struct ip_reply_arg *arg, unsigned int len) +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, + __be32 saddr, const struct ip_reply_arg *arg, + unsigned int len) { struct inet_sock *inet = inet_sk(sk); struct ip_options_data replyopts; @@ -1491,7 +1492,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, sk->sk_protocol, ip_reply_arg_flowi_flags(arg), - daddr, rt->rt_spec_dst, + daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); -- cgit v1.1 From a263b3093641fb1ec377582c90986a7fd0625184 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 02:02:15 -0700 Subject: ipv4: Make neigh lookups directly in output packet path. Do not use the dst cached neigh, we'll be getting rid of that. Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4/ip_output.c') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2630900..6e9a266 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -170,6 +170,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; + u32 nexthop; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); @@ -191,15 +192,18 @@ static inline int ip_finish_output2(struct sk_buff *skb) skb = skb2; } - rcu_read_lock(); - neigh = dst_get_neighbour_noref(dst); + rcu_read_lock_bh(); + nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr; + neigh = __ipv4_neigh_lookup_noref(dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (neigh) { int res = neigh_output(neigh, skb); - rcu_read_unlock(); + rcu_read_unlock_bh(); return res; } - rcu_read_unlock(); + rcu_read_unlock_bh(); net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); -- cgit v1.1 From 5110effee8fde2edfacac9cd12a9960ab2dc39ea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 02:21:03 -0700 Subject: net: Do delayed neigh confirmation. When a dst_confirm() happens, mark the confirmation as pending in the dst. Then on the next packet out, when we have the neigh in-hand, do the update. This removes the dependency in dst_confirm() of dst's having an attached neigh. While we're here, remove the explicit 'dst' NULL check, all except 2 or 3 call sites ensure it's not NULL. So just fix those cases up. Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/ip_output.c') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e9a266..cc52679 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,7 +198,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (neigh) { - int res = neigh_output(neigh, skb); + int res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return res; -- cgit v1.1 From be9f4a44e7d41cee50ddb5f038fc2391cbbb4046 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Jul 2012 07:34:03 +0000 Subject: ipv4: tcp: remove per net tcp_sock tcp_v4_send_reset() and tcp_v4_send_ack() use a single socket per network namespace. This leads to bad behavior on multiqueue NICS, because many cpus contend for the socket lock and once socket lock is acquired, extra false sharing on various socket fields slow down the operations. To better resist to attacks, we use a percpu socket. Each cpu can run without contention, using appropriate memory (local node) Additional features : 1) We also mirror the queue_mapping of the incoming skb, so that answers use the same queue if possible. 2) Setting SOCK_USE_WRITE_QUEUE socket flag speedup sock_wfree() 3) We now limit the number of in-flight RST/ACK [1] packets per cpu, instead of per namespace, and we honor the sysctl_wmem_default limit dynamically. (Prior to this patch, sysctl_wmem_default value was copied at boot time, so any further change would not affect tcp_sock limit) [1] These packets are only generated when no socket was matched for the incoming packet. Reported-by: Bill Sommerfeld Signed-off-by: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'net/ipv4/ip_output.c') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cc52679..c528f84 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1463,20 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. - * Used to send TCP resets so far. + * Used to send some TCP resets/acks so far. * - * Should run single threaded per socket because it uses the sock - * structure to pass arguments. + * Use a fake percpu inet socket to avoid false sharing and contention. */ -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, +static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { + .sk = { + .__sk_common = { + .skc_refcnt = ATOMIC_INIT(1), + }, + .sk_wmem_alloc = ATOMIC_INIT(1), + .sk_allocation = GFP_ATOMIC, + .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), + }, + .pmtudisc = IP_PMTUDISC_WANT, +}; + +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len) { - struct inet_sock *inet = inet_sk(sk); struct ip_options_data replyopts; struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct sk_buff *nskb; + struct sock *sk; + struct inet_sock *inet; if (ip_options_echo(&replyopts.opt.opt, skb)) return; @@ -1494,38 +1507,39 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, flowi4_init_output(&fl4, arg->bound_dev_if, 0, RT_TOS(arg->tos), - RT_SCOPE_UNIVERSE, sk->sk_protocol, + RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); + rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return; - /* And let IP do all the hard work. + inet = &get_cpu_var(unicast_sock); - This chunk is not reenterable, hence spinlock. - Note that it uses the fact, that this function is called - with locally disabled BH and that sk cannot be already spinlocked. - */ - bh_lock_sock(sk); inet->tos = arg->tos; + sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; + sock_net_set(sk, net); + __skb_queue_head_init(&sk->sk_write_queue); + sk->sk_sndbuf = sysctl_wmem_default; ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + nskb = skb_peek(&sk->sk_write_queue); + if (nskb) { if (arg->csumoffset >= 0) - *((__sum16 *)skb_transport_header(skb) + - arg->csumoffset) = csum_fold(csum_add(skb->csum, + *((__sum16 *)skb_transport_header(nskb) + + arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); - skb->ip_summed = CHECKSUM_NONE; + nskb->ip_summed = CHECKSUM_NONE; + skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } - bh_unlock_sock(sk); + put_cpu_var(unicast_sock); ip_rt_put(rt); } -- cgit v1.1 From f8126f1d5136be1ca1a3536d43ad7a710b5620f8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 13 Jul 2012 05:03:45 -0700 Subject: ipv4: Adjust semantics of rt->rt_gateway. In order to allow prefixed routes, we have to adjust how rt_gateway is set and interpreted. The new interpretation is: 1) rt_gateway == 0, destination is on-link, nexthop is iph->daddr 2) rt_gateway != 0, destination requires a nexthop gateway Abstract the fetching of the proper nexthop value using a new inline helper, rt_nexthop(), as suggested by Joe Perches. Signed-off-by: David S. Miller Tested-by: Vijay Subramanian --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/ip_output.c') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c528f84..4494015 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ -- cgit v1.1 From 0980e56e506b1e45022fad00bca8c8a974fda4e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jul 2012 22:28:51 +0000 Subject: ipv4: tcp: set unicast_sock uc_ttl to -1 Set unicast_sock uc_ttl to -1 so that we select the right ttl, instead of sending packets with a 0 ttl. Bug added in commit be9f4a44e7d4 (ipv4: tcp: remove per net tcp_sock) Signed-off-by: Hiroaki SHIMODA Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/ip_output.c') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c528f84..665abbb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1476,7 +1476,8 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { .sk_allocation = GFP_ATOMIC, .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), }, - .pmtudisc = IP_PMTUDISC_WANT, + .pmtudisc = IP_PMTUDISC_WANT, + .uc_ttl = -1, }; void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, -- cgit v1.1