From 0e0d44ab4275549998567cd4700b43f7496eb62b Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 28 Aug 2013 08:04:14 +0200 Subject: net: Remove FLOWI_FLAG_CAN_SLEEP FLOWI_FLAG_CAN_SLEEP was used to notify xfrm about the posibility to sleep until the needed states are resolved. This code is gone, so FLOWI_FLAG_CAN_SLEEP is not needed anymore. Signed-off-by: Steffen Klassert --- net/ipv4/af_inet.c | 2 +- net/ipv4/datagram.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70011e0..4f6d50b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1133,7 +1133,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, sk->sk_protocol, - inet->inet_sport, inet->inet_dport, sk, false); + inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 19e3637..8b5134c 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -53,7 +53,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, RT_CONN_FLAGS(sk), oif, sk->sk_protocol, - inet->inet_sport, usin->sin_port, sk, true); + inet->inet_sport, usin->sin_port, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 23c3e5b..81e6cfd 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -575,7 +575,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP | + inet_sk_flowi_flags(sk) | (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0672139..bbaf8cb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -173,7 +173,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, - orig_sport, orig_dport, sk, true); + orig_sport, orig_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 44f6a20..480fb95df 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -990,7 +990,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + inet_sk_flowi_flags(sk), faddr, saddr, dport, inet->inet_sport); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); -- cgit v1.1 From a99421d9b1bf332d8eaebfe0ff17096a9f84d50b Mon Sep 17 00:00:00 2001 From: Jeff Kirsher Date: Fri, 6 Dec 2013 09:13:39 -0800 Subject: ipv4/ipv6: Fix FSF address in file headers Several files refer to an old address for the Free Software Foundation in the file header comment. Resolve by replacing the address with the URL so that we do not have to keep updating the header comments anytime the address changes. CC: Alexey Kuznetsov CC: James Morris CC: Hideaki YOSHIFUJI CC: Patrick McHardy Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 3 +-- net/ipv4/netfilter/nf_nat_snmp_basic.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 667c1d4..4b59b6e 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -31,8 +31,7 @@ * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see . * */ diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 5f011cc..61a9422 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -34,8 +34,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program; if not, see . * * Author: James Morris * -- cgit v1.1 From 7b7fc97aa390dfe4770e1d19e215fa289d94b477 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Dec 2013 22:31:30 -0800 Subject: tcp: optimize some skb_shinfo(skb) uses Compiler doesn't know skb_shinfo(skb) pointer is usually constant. By using a temporary variable, we help generating smaller code. For example, tcp_init_nondata_skb() is inlined after this patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7820f3a..993da00 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -363,15 +363,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; TCP_SKB_CB(skb)->tcp_flags = flags; TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + shinfo->gso_segs = 1; + shinfo->gso_size = 0; + shinfo->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -986,6 +988,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + /* Make sure we own this skb before messing gso_size/gso_segs */ WARN_ON_ONCE(skb_cloned(skb)); @@ -993,13 +997,13 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, /* Avoid the costly divide in the normal * non-TSO case. */ - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + shinfo->gso_segs = 1; + shinfo->gso_size = 0; + shinfo->gso_type = 0; } else { - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); - skb_shinfo(skb)->gso_size = mss_now; - skb_shinfo(skb)->gso_type = sk->sk_gso_type; + shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now); + shinfo->gso_size = mss_now; + shinfo->gso_type = sk->sk_gso_type; } } @@ -1146,6 +1150,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, */ static void __pskb_trim_head(struct sk_buff *skb, int len) { + struct skb_shared_info *shinfo; int i, k, eat; eat = min_t(int, len, skb_headlen(skb)); @@ -1157,23 +1162,24 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) } eat = len; k = 0; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + shinfo = skb_shinfo(skb); + for (i = 0; i < shinfo->nr_frags; i++) { + int size = skb_frag_size(&shinfo->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + shinfo->frags[k] = shinfo->frags[i]; if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); + shinfo->frags[k].page_offset += eat; + skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } k++; } } - skb_shinfo(skb)->nr_frags = k; + shinfo->nr_frags = k; skb_reset_tail_pointer(skb); skb->data_len -= len; -- cgit v1.1 From f54b311142a92ea2e42598e347b84e1655caf8e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Dec 2013 22:36:05 -0800 Subject: tcp: auto corking With the introduction of TCP Small Queues, TSO auto sizing, and TCP pacing, we can implement Automatic Corking in the kernel, to help applications doing small write()/sendmsg() to TCP sockets. Idea is to change tcp_push() to check if the current skb payload is under skb optimal size (a multiple of MSS bytes) If under 'size_goal', and at least one packet is still in Qdisc or NIC TX queues, set the TCP Small Queue Throttled bit, so that the push will be delayed up to TX completion time. This delay might allow the application to coalesce more bytes in the skb in following write()/sendmsg()/sendfile() system calls. The exact duration of the delay is depending on the dynamics of the system, and might be zero if no packet for this flow is actually held in Qdisc or NIC TX ring. Using FQ/pacing is a way to increase the probability of autocorking being triggered. Add a new sysctl (/proc/sys/net/ipv4/tcp_autocorking) to control this feature and default it to 1 (enabled) Add a new SNMP counter : nstat -a | grep TcpExtTCPAutoCorking This counter is incremented every time we detected skb was under used and its flush was deferred. Tested: Interesting effects when using line buffered commands under ssh. Excellent performance results in term of cpu usage and total throughput. lpq83:~# echo 1 >/proc/sys/net/ipv4/tcp_autocorking lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128 9410.39 Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128': 35209.439626 task-clock # 2.901 CPUs utilized 2,294 context-switches # 0.065 K/sec 101 CPU-migrations # 0.003 K/sec 4,079 page-faults # 0.116 K/sec 97,923,241,298 cycles # 2.781 GHz [83.31%] 51,832,908,236 stalled-cycles-frontend # 52.93% frontend cycles idle [83.30%] 25,697,986,603 stalled-cycles-backend # 26.24% backend cycles idle [66.70%] 102,225,978,536 instructions # 1.04 insns per cycle # 0.51 stalled cycles per insn [83.38%] 18,657,696,819 branches # 529.906 M/sec [83.29%] 91,679,646 branch-misses # 0.49% of all branches [83.40%] 12.136204899 seconds time elapsed lpq83:~# echo 0 >/proc/sys/net/ipv4/tcp_autocorking lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128 6624.89 Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128': 40045.864494 task-clock # 3.301 CPUs utilized 171 context-switches # 0.004 K/sec 53 CPU-migrations # 0.001 K/sec 4,080 page-faults # 0.102 K/sec 111,340,458,645 cycles # 2.780 GHz [83.34%] 61,778,039,277 stalled-cycles-frontend # 55.49% frontend cycles idle [83.31%] 29,295,522,759 stalled-cycles-backend # 26.31% backend cycles idle [66.67%] 108,654,349,355 instructions # 0.98 insns per cycle # 0.57 stalled cycles per insn [83.34%] 19,552,170,748 branches # 488.244 M/sec [83.34%] 157,875,417 branch-misses # 0.81% of all branches [83.34%] 12.130267788 seconds time elapsed Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/proc.c | 1 + net/ipv4/sysctl_net_ipv4.c | 9 +++++++ net/ipv4/tcp.c | 63 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 60 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4a03358..8ecd7ad 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -279,6 +279,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), + SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3d69ec8..38c8ec9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -733,6 +733,15 @@ static struct ctl_table ipv4_table[] = { .extra2 = &gso_max_segs, }, { + .procname = "tcp_autocorking", + .data = &sysctl_tcp_autocorking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4638e6..0ca8754 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -285,6 +285,8 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; int sysctl_tcp_min_tso_segs __read_mostly = 2; +int sysctl_tcp_autocorking __read_mostly = 1; + struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -619,19 +621,52 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) tp->snd_up = tp->write_seq; } -static inline void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle) +/* If a not yet filled skb is pushed, do not send it if + * we have packets in Qdisc or NIC queues : + * Because TX completion will happen shortly, it gives a chance + * to coalesce future sendmsg() payload into this skb, without + * need for a timer, and with no latency trade off. + * As packets containing data payload have a bigger truesize + * than pure acks (dataless) packets, the last check prevents + * autocorking if we only have an ACK in Qdisc/NIC queues. + */ +static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, + int size_goal) { - if (tcp_send_head(sk)) { - struct tcp_sock *tp = tcp_sk(sk); + return skb->len < size_goal && + sysctl_tcp_autocorking && + atomic_read(&sk->sk_wmem_alloc) > skb->truesize; +} + +static void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; - if (!(flags & MSG_MORE) || forced_push(tp)) - tcp_mark_push(tp, tcp_write_queue_tail(sk)); + if (!tcp_send_head(sk)) + return; + + skb = tcp_write_queue_tail(sk); + if (!(flags & MSG_MORE) || forced_push(tp)) + tcp_mark_push(tp, skb); + + tcp_mark_urg(tp, flags); + + if (tcp_should_autocork(sk, skb, size_goal)) { - tcp_mark_urg(tp, flags); - __tcp_push_pending_frames(sk, mss_now, - (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); + /* avoid atomic op if TSQ_THROTTLED bit is already set */ + if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); + set_bit(TSQ_THROTTLED, &tp->tsq_flags); + } + return; } + + if (flags & MSG_MORE) + nonagle = TCP_NAGLE_CORK; + + __tcp_push_pending_frames(sk, mss_now, nonagle); } static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, @@ -934,7 +969,8 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -944,7 +980,7 @@ wait_for_memory: out: if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) - tcp_push(sk, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); return copied; do_error: @@ -1225,7 +1261,8 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: if (copied) - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -1236,7 +1273,7 @@ wait_for_memory: out: if (copied) - tcp_push(sk, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); release_sock(sk); return copied + copied_syn; -- cgit v1.1 From 1f9248e5606afc6485255e38ad57bdac08fa7711 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 7 Dec 2013 19:26:53 +0100 Subject: neigh: convert parms to an array This patch converts the neigh param members to an array. This allows easier manipulation which will be needed later on to provide better management of default values. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/arp.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7808093..3a37c7d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -166,18 +166,20 @@ struct neigh_table arp_tbl = { .id = "arp_cache", .parms = { .tbl = &arp_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len_bytes = 64*1024, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, - .locktime = 1 * HZ, + .data = { + [NEIGH_VAR_MCAST_PROBES] = 3, + [NEIGH_VAR_UCAST_PROBES] = 3, + [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, + [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, + [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, + [NEIGH_VAR_GC_STALETIME] = 60 * HZ, + [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_PROXY_QLEN] = 64, + [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, + [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, + [NEIGH_VAR_LOCKTIME] = 1 * HZ, + }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, @@ -359,14 +361,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); - probes -= neigh->parms->ucast_probes; + probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(neigh->nud_state & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { - probes -= neigh->parms->app_probes; + probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; @@ -871,7 +873,7 @@ static int arp_process(struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || - in_dev->arp_parms->proxy_delay == 0) { + NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha); @@ -910,7 +912,8 @@ static int arp_process(struct sk_buff *skb) agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ - override = time_after(jiffies, n->updated + n->parms->locktime); + override = time_after(jiffies, n->updated + + NEIGH_VAR(n->parms, LOCKTIME)); /* Broadcast replies and request packets do not assert neighbour reachability. -- cgit v1.1 From 73af614aedd221df8495fc8c9993c50e87f899f2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 7 Dec 2013 19:26:55 +0100 Subject: neigh: use tbl->family to distinguish ipv4 from ipv6 Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/arp.c | 2 +- net/ipv4/devinet.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 3a37c7d..f041858 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1287,7 +1287,7 @@ void __init arp_init(void) dev_add_pack(&arp_packet_type); arp_proc_init(); #ifdef CONFIG_SYSCTL - neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL); + neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a1b5bcb..e1c1953 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2160,7 +2160,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) static void devinet_sysctl_register(struct in_device *idev) { - neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL); + neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, &idev->cnf); } -- cgit v1.1 From 1d4c8c29841b9991cdf3c7cc4ba7f96a94f104ca Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 7 Dec 2013 19:26:56 +0100 Subject: neigh: restore old behaviour of default parms values Previously inet devices were only constructed when addresses are added. Therefore the default neigh parms values they get are the ones at the time of these operations. Now that we're creating inet devices earlier, this changes the behaviour of default neigh parms values in an incompatible way (see bug #8519). This patch creates a compromise by setting the default values at the same point as before but only for those that have not been explicitly set by the user since the inet device's creation. Introduced by: commit 8030f54499925d073a88c09f30d5d844fb1b3190 Author: Herbert Xu Date: Thu Feb 22 01:53:47 2007 +0900 [IPV4] devinet: Register inetdev earlier. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 ++ net/ipv4/ipmr.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e1c1953..43065be 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -500,6 +500,7 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) return -ENOBUFS; } ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); if (ifa->ifa_dev != in_dev) { WARN_ON(ifa->ifa_dev); in_dev_hold(in_dev); @@ -747,6 +748,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, goto errout; ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); in_dev_hold(in_dev); if (tb[IFA_ADDRESS] == NULL) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 62212c7..421a249 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -425,6 +425,7 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) goto failure; ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; if (dev_open(dev)) @@ -517,6 +518,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0; rcu_read_unlock(); -- cgit v1.1 From ad6c81359fc3e6086d1d6f91acda9d5d0e64b2c3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 8 Dec 2013 12:16:10 +0100 Subject: ipv4: add support for IFA_FLAGS nl attribute Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 43065be..6f49efc9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, + [IFA_FLAGS] = { .type = NLA_U32 }, }; #define IN4_ADDR_HSIZE_SHIFT 8 @@ -757,7 +758,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); - ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : + ifm->ifa_flags; ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_dev = in_dev; @@ -1437,7 +1439,8 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ - + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ + + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + + nla_total_size(4); /* IFA_FLAGS */ } static inline u32 cstamp_delta(unsigned long cstamp) @@ -1505,6 +1508,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp, preferred, valid)) goto nla_put_failure; -- cgit v1.1 From 8e3bff96afa67369008153f3326fa5ce985cabab Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sun, 8 Dec 2013 12:15:44 -0800 Subject: net: more spelling fixes Various spelling fixes in networking stack Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/tcp_output.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ddf32a6..a9fc435 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1051,7 +1051,7 @@ e_inval: * * To support IP_CMSG_PKTINFO option, we store rt_iif and specific * destination in skb->cb[] before dst drop. - * This way, receiver doesnt make cache line misses to read rtable. + * This way, receiver doesn't make cache line misses to read rtable. */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 993da00..2a69f42 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -408,7 +408,7 @@ struct tcp_out_options { * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from - * inter-operatibility perspective it seems that we're somewhat stuck with + * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). @@ -681,7 +681,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. - * The invariant being skb->truesize substracted from sk->sk_wmem_alloc + * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. @@ -701,9 +701,9 @@ static void tcp_tsq_handler(struct sock *sk) tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); } /* - * One tasklest per cpu tries to send more skbs. + * One tasklet per cpu tries to send more skbs. * We run in tasklet context but need to disable irqs when - * transfering tsq->head because tcp_wfree() might + * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tasklet_func(unsigned long data) @@ -797,7 +797,7 @@ void __init tcp_tasklet_init(void) /* * Write buffer destructor automatically called from kfree_skb. - * We cant xmit new skbs from this context, as we might already + * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ void tcp_wfree(struct sk_buff *skb) -- cgit v1.1 From b601fa197fff265bf60eaf6950d4c194da080f4a Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 10 Dec 2013 15:02:40 +0100 Subject: ipv4: fix wildcard search with inet_confirm_addr() Help of this function says: "in_dev: only on this interface, 0=any interface", but since commit 39a6d0630012 ("[NETNS]: Process inet_confirm_addr in the correct namespace."), the code supposes that it will never be NULL. This function is never called with in_dev == NULL, but it's exported and may be used by an external module. Because this patch restore the ability to call inet_confirm_addr() with in_dev == NULL, I partially revert the above commit, as suggested by Julian. CC: Julian Anastasov Signed-off-by: Nicolas Dichtel Reviewed-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/arp.c | 4 +++- net/ipv4/devinet.c | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f041858..b67cf80 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -381,6 +381,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { + struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { @@ -399,6 +400,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; + in_dev = NULL; break; case 4: /* Reserved */ case 5: @@ -410,7 +412,7 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) default: return 0; } - return !inet_confirm_addr(in_dev, sip, tip, scope); + return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6f49efc9..84956f5 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1240,22 +1240,21 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, /* * Confirm that local IP address exists using wildcards: - * - in_dev: only on this interface, 0=any interface + * - net: netns to check, cannot be NULL + * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ -__be32 inet_confirm_addr(struct in_device *in_dev, +__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; - struct net *net; - if (scope != RT_SCOPE_LINK) + if (in_dev != NULL) return confirm_addr_indev(in_dev, dst, local, scope); - net = dev_net(in_dev->dev); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); -- cgit v1.1 From 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Wed, 11 Dec 2013 20:53:45 -0800 Subject: net-gro: Prepare GRO stack for the upcoming tunneling support This patch modifies the GRO stack to avoid the use of "network_header" and associated macros like ip_hdr() and ipv6_hdr() in order to allow an arbitary number of IP hdrs (v4 or v6) to be used in the encapsulation chain. This lays the foundation for various IP tunneling support (IP-in-IP, GRE, VXLAN, SIT,...) to be added later. With this patch, the GRO stack traversing now is mostly based on skb_gro_offset rather than special hdr offsets saved in skb (e.g., skb->network_header). As a result all but the top layer (i.e., the the transport layer) must have hdrs of the same length in order for a pkt to be considered for aggregation. Therefore when adding a new encap layer (e.g., for tunneling), one must check and skip flows (e.g., by setting NAPI_GRO_CB(p)->same_flow to 0) that have a different hdr length. Note that unlike the network header, the transport header can and will continue to be set by the GRO code since there will be at most one "transport layer" in the encap chain. Signed-off-by: H.K. Jerry Chu Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 25 +++++++++++++++++++------ net/ipv4/tcp_offload.c | 9 +++++---- 2 files changed, 24 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70011e0..ef4f9df 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1377,8 +1377,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (!NAPI_GRO_CB(p)->same_flow) continue; - iph2 = ip_hdr(p); - + iph2 = (struct iphdr *)(p->data + off); + /* The above works because, with the exception of the top + * (inner most) layer, we only aggregate pkts with the same + * hdr length so all the hdrs we'll need to verify will start + * at the same offset. + */ if ((iph->protocol ^ iph2->protocol) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { @@ -1397,6 +1401,11 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, } NAPI_GRO_CB(skb)->flush |= flush; + skb_set_network_header(skb, off); + /* The above will be needed by the transport layer if there is one + * immediately following this IP hdr. + */ + skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); @@ -1411,10 +1420,10 @@ out: return pp; } -static int inet_gro_complete(struct sk_buff *skb) +static int inet_gro_complete(struct sk_buff *skb, int nhoff) { - __be16 newlen = htons(skb->len - skb_network_offset(skb)); - struct iphdr *iph = ip_hdr(skb); + __be16 newlen = htons(skb->len - nhoff); + struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; int proto = iph->protocol; int err = -ENOSYS; @@ -1427,7 +1436,11 @@ static int inet_gro_complete(struct sk_buff *skb) if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->callbacks.gro_complete(skb); + /* Only need to add sizeof(*iph) to get to the next hdr below + * because any hdr with option will have been flushed in + * inet_gro_receive(). + */ + err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 0560635..2658a27 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -240,7 +240,7 @@ int tcp_gro_complete(struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); - skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_start = (unsigned char *)th - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; @@ -272,6 +272,7 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb) static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { + /* Use the IP hdr immediately proceeding for this transport */ const struct iphdr *iph = skb_gro_network_header(skb); __wsum wsum; @@ -303,13 +304,13 @@ skip_csum: return tcp_gro_receive(head, skb); } -static int tcp4_gro_complete(struct sk_buff *skb) +static int tcp4_gro_complete(struct sk_buff *skb, int thoff) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), - iph->saddr, iph->daddr, 0); + th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, + iph->daddr, 0); skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; return tcp_gro_complete(skb); -- cgit v1.1 From f085ff1c131c08fb6b34802f63c22921c4d8c506 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 12 Dec 2013 13:06:50 -0800 Subject: netconf: add proxy-arp support Add support to netconf to show changes to proxy-arp status on a per interface basis via netlink in a manner similar to forwarding and reverse path state. Signed-off-by: Stephen Hemminger Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 84956f5..de03fe7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1696,6 +1696,8 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (type == -1 || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_PROXY_ARP) + size += nla_total_size(4); return size; } @@ -1732,6 +1734,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_PROXY_ARP) && + nla_put_s32(skb, NETCONFA_PROXY_ARP, + IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) + goto nla_put_failure; return nlmsg_end(skb, nlh); @@ -1769,6 +1775,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, + [NETCONFA_PROXY_ARP] = { .len = sizeof(int) }, }; static int inet_netconf_get_devconf(struct sk_buff *in_skb, @@ -1950,6 +1957,19 @@ static void inet_forward_change(struct net *net) } } +static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) +{ + if (cnf == net->ipv4.devconf_dflt) + return NETCONFA_IFINDEX_DEFAULT; + else if (cnf == net->ipv4.devconf_all) + return NETCONFA_IFINDEX_ALL; + else { + struct in_device *idev + = container_of(cnf, struct in_device, cnf); + return idev->dev->ifindex; + } +} + static int devinet_conf_proc(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1962,6 +1982,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; + int ifindex; set_bit(i, cnf->state); @@ -1971,23 +1992,19 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); + if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { - int ifindex; - - if (cnf == net->ipv4.devconf_dflt) - ifindex = NETCONFA_IFINDEX_DEFAULT; - else if (cnf == net->ipv4.devconf_all) - ifindex = NETCONFA_IFINDEX_ALL; - else { - struct in_device *idev = - container_of(cnf, struct in_device, - cnf); - ifindex = idev->dev->ifindex; - } + ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, ifindex, cnf); } + if (i == IPV4_DEVCONF_PROXY_ARP - 1 && + new_value != old_value) { + ifindex = devinet_conf_ifindex(net, cnf); + inet_netconf_notify_devconf(net, NETCONFA_PROXY_ARP, + ifindex, cnf); + } } return ret; -- cgit v1.1 From d4589926d7a9a4b88e650bb9154bab71e8d2a7dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Dec 2013 13:51:23 -0800 Subject: tcp: refine TSO splits While investigating performance problems on small RPC workloads, I noticed linux TCP stack was always splitting the last TSO skb into two parts (skbs). One being a multiple of MSS, and a small one with the Push flag. This split is done even if TCP_NODELAY is set, or if no small packet is in flight. Example with request/response of 4K/4K IP A > B: . ack 68432 win 2783 IP A > B: . 65537:68433(2896) ack 69632 win 2783 IP A > B: P 68433:69633(1200) ack 69632 win 2783 IP B > A: . ack 68433 win 2768 IP B > A: . 69632:72528(2896) ack 69633 win 2768 IP B > A: P 72528:73728(1200) ack 69633 win 2768 IP A > B: . ack 72528 win 2783 IP A > B: . 69633:72529(2896) ack 73728 win 2783 IP A > B: P 72529:73729(1200) ack 73728 win 2783 We can avoid this split by including the Nagle tests at the right place. Note : If some NIC had trouble sending TSO packets with a partial last segment, we would have hit the problem in GRO/forwarding workload already. tcp_minshall_update() is moved to tcp_output.c and is updated as we might feed a TSO packet with a partial last segment. This patch tremendously improves performance, as the traffic now looks like : IP A > B: . ack 98304 win 2783 IP A > B: P 94209:98305(4096) ack 98304 win 2783 IP B > A: . ack 98305 win 2768 IP B > A: P 98304:102400(4096) ack 98305 win 2768 IP A > B: . ack 102400 win 2783 IP A > B: P 98305:102401(4096) ack 102400 win 2783 IP B > A: . ack 102401 win 2768 IP B > A: P 102400:106496(4096) ack 102401 win 2768 IP A > B: . ack 106496 win 2783 IP A > B: P 102401:106497(4096) ack 106496 win 2783 IP B > A: . ack 106497 win 2768 IP B > A: P 106496:110592(4096) ack 106497 win 2768 Before : lpq83:~# nstat >/dev/null;perf stat ./super_netperf 200 -t TCP_RR -H lpq84 -l 20 -- -r 4K,4K 280774 Performance counter stats for './super_netperf 200 -t TCP_RR -H lpq84 -l 20 -- -r 4K,4K': 205719.049006 task-clock # 9.278 CPUs utilized 8,449,968 context-switches # 0.041 M/sec 1,935,997 CPU-migrations # 0.009 M/sec 160,541 page-faults # 0.780 K/sec 548,478,722,290 cycles # 2.666 GHz [83.20%] 455,240,670,857 stalled-cycles-frontend # 83.00% frontend cycles idle [83.48%] 272,881,454,275 stalled-cycles-backend # 49.75% backend cycles idle [66.73%] 166,091,460,030 instructions # 0.30 insns per cycle # 2.74 stalled cycles per insn [83.39%] 29,150,229,399 branches # 141.699 M/sec [83.30%] 1,943,814,026 branch-misses # 6.67% of all branches [83.32%] 22.173517844 seconds time elapsed lpq83:~# nstat | egrep "IpOutRequests|IpExtOutOctets" IpOutRequests 16851063 0.0 IpExtOutOctets 23878580777 0.0 After patch : lpq83:~# nstat >/dev/null;perf stat ./super_netperf 200 -t TCP_RR -H lpq84 -l 20 -- -r 4K,4K 280877 Performance counter stats for './super_netperf 200 -t TCP_RR -H lpq84 -l 20 -- -r 4K,4K': 107496.071918 task-clock # 4.847 CPUs utilized 5,635,458 context-switches # 0.052 M/sec 1,374,707 CPU-migrations # 0.013 M/sec 160,920 page-faults # 0.001 M/sec 281,500,010,924 cycles # 2.619 GHz [83.28%] 228,865,069,307 stalled-cycles-frontend # 81.30% frontend cycles idle [83.38%] 142,462,742,658 stalled-cycles-backend # 50.61% backend cycles idle [66.81%] 95,227,712,566 instructions # 0.34 insns per cycle # 2.40 stalled cycles per insn [83.43%] 16,209,868,171 branches # 150.795 M/sec [83.20%] 874,252,952 branch-misses # 5.39% of all branches [83.37%] 22.175821286 seconds time elapsed lpq83:~# nstat | egrep "IpOutRequests|IpExtOutOctets" IpOutRequests 11239428 0.0 IpExtOutOctets 23595191035 0.0 Indeed, the occupancy of tx skbs (IpExtOutOctets/IpOutRequests) is higher : 2099 instead of 1417, thus helping GRO to be more efficient when using FQ packet scheduler. Many thanks to Neal for review and ideas. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Nandita Dukkipati Cc: Van Jacobson Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 93 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 39 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2a69f42..9e7aec7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1384,23 +1384,51 @@ static void tcp_cwnd_validate(struct sock *sk) } } -/* Returns the portion of skb which can be sent right away without - * introducing MSS oddities to segment boundaries. In rare cases where - * mss_now != mss_cache, we will request caller to create a small skb - * per input skb which could be mostly avoided here (if desired). - * - * We explicitly want to create a request for splitting write queue tail - * to a small skb for Nagle purposes while avoiding unnecessary modulos, - * thus all the complexity (cwnd_len is always MSS multiple which we - * return whenever allowed by the other factors). Basically we need the - * modulo only when the receiver window alone is the limiting factor or - * when we would be allowed to send the split-due-to-Nagle skb fully. +/* Minshall's variant of the Nagle send check. */ +static bool tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml, tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Update snd_sml if this skb is under mss + * Note that a TSO packet might end with a sub-mss segment + * The test is really : + * if ((skb->len % mss) != 0) + * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; + * But we can avoid doing the divide again given we already have + * skb_pcount = skb->len / mss_now */ -static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, - unsigned int mss_now, unsigned int max_segs) +static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, + const struct sk_buff *skb) +{ + if (skb->len < tcp_skb_pcount(skb) * mss_now) + tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +} + +/* Return false, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. (provided by caller in %partial bool) + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ +static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, + unsigned int mss_now, int nonagle) +{ + return partial && + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); +} +/* Returns the portion of skb which can be sent right away */ +static unsigned int tcp_mss_split_point(const struct sock *sk, + const struct sk_buff *skb, + unsigned int mss_now, + unsigned int max_segs, + int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); - u32 needed, window, max_len; + u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; @@ -1413,7 +1441,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b if (max_len <= needed) return max_len; - return needed - needed % mss_now; + partial = needed % mss_now; + /* If last segment is not a full MSS, check if Nagle rules allow us + * to include this last segment in this skb. + * Otherwise, we'll split the skb at last MSS boundary + */ + if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle)) + return needed - partial; + + return needed; } /* Can at least one segment of SKB be sent right now, according to the @@ -1453,28 +1489,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, return tso_segs; } -/* Minshall's variant of the Nagle send check. */ -static inline bool tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml, tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return false, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ -static inline bool tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned int mss_now, int nonagle) -{ - return skb->len < mss_now && - ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp))); -} /* Return true if the Nagle test allows this packet to be * sent now. @@ -1495,7 +1509,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle)) return true; return false; @@ -1898,7 +1912,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, - sk->sk_gso_max_segs)); + sk->sk_gso_max_segs), + nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) -- cgit v1.1 From 7539fadcb8146a5f0db51e80d99c9e724efec7b0 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 15 Dec 2013 22:12:18 -0800 Subject: net: Add utility functions to clear rxhash In several places 'skb->rxhash = 0' is being done to clear the rxhash value in an skb. This does not clear l4_rxhash which could still be set so that the rxhash wouldn't be recalculated on subsequent call to skb_get_rxhash. This patch adds an explict function to clear all the rxhash related information in the skb properly. skb_clear_hash_if_not_l4 clears the rxhash only if it is not marked as l4_rxhash. Fixed up places where 'skb->rxhash = 0' was being called. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_tunnel_core.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2481993..c10a3ce 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -704,7 +704,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(skb, user)) return NULL; - skb->rxhash = 0; + skb_clear_hash(skb); } } return skb; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 42ffbc8..6156f4e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -56,7 +56,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, skb_scrub_packet(skb, xnet); - skb->rxhash = 0; + skb_clear_hash(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); @@ -107,8 +107,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) nf_reset(skb); secpath_reset(skb); - if (!skb->l4_rxhash) - skb->rxhash = 0; + skb_clear_hash_if_not_l4(skb); skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); -- cgit v1.1 From 974eda11c54290a1be8f8b155edae7d791e5ce57 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 14 Dec 2013 05:13:38 +0100 Subject: inet: make no_pmtu_disc per namespace and kill ipv4_config The other field in ipv4_config, log_martians, was converted to a per-interface setting, so we can just remove the whole structure. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 5 +---- net/ipv4/icmp.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/xfrm4_state.c | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ef4f9df..6b1193e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -126,9 +126,6 @@ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); -struct ipv4_config ipv4_config; -EXPORT_SYMBOL(ipv4_config); - /* New destruction routine */ void inet_sock_destruct(struct sock *sk) @@ -342,7 +339,7 @@ lookup_protocol: inet->hdrincl = 1; } - if (ipv4_config.no_pmtu_disc) + if (net->ipv4.sysctl_ip_no_pmtu_disc) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5c0e8bc..1a4ee84 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -705,7 +705,7 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (ipv4_config.no_pmtu_disc) { + if (net->ipv4.sysctl_ip_no_pmtu_disc) { LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), &iph->daddr); } else { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 38c8ec9..d7b63a6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -286,13 +286,6 @@ static struct ctl_table ipv4_table[] = { .extra2 = &ip_ttl_max, }, { - .procname = "ip_no_pmtu_disc", - .data = &ipv4_config.no_pmtu_disc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "ip_nonlocal_bind", .data = &sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), @@ -831,6 +824,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_no_pmtu_disc", + .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 0b2a064..542074c 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -16,7 +16,7 @@ static int xfrm4_init_flags(struct xfrm_state *x) { - if (ipv4_config.no_pmtu_disc) + if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc) x->props.flags |= XFRM_STATE_NOPMTUDISC; return 0; } -- cgit v1.1 From cd174e67a6b312fce9bab502ba2b0583e11f537f Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 14 Dec 2013 05:13:45 +0100 Subject: ipv4: new ip_no_pmtu_disc mode to always discard incoming frag needed msgs This new mode discards all incoming fragmentation-needed notifications as I guess was originally intended with this knob. To not break backward compatibility too much, I only added a special case for mode 2 in the receiving path. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1a4ee84..fb3c563 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -705,7 +705,9 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (net->ipv4.sysctl_ip_no_pmtu_disc) { + if (net->ipv4.sysctl_ip_no_pmtu_disc == 2) { + goto out; + } else if (net->ipv4.sysctl_ip_no_pmtu_disc) { LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), &iph->daddr); } else { -- cgit v1.1 From a181ceb501b31b4bf8812a5c84c716cc31d82c2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Dec 2013 09:58:30 -0800 Subject: tcp: autocork should not hold first packet in write queue Willem noticed a TCP_RR regression caused by TCP autocorking on a Mellanox test bed. MLX4_EN_TX_COAL_TIME is 16 us, which can be right above RTT between hosts. We can receive a ACK for a packet still in NIC TX ring buffer or in a softnet completion queue. Fix this by always pushing the skb if it is at the head of write queue. Also, as TX completion is lockless, it's safer to perform sk_wmem_alloc test after setting TSQ_THROTTLED. erd:~# MIB="MIN_LATENCY,MEAN_LATENCY,MAX_LATENCY,P99_LATENCY,STDDEV_LATENCY" erd:~# ./netperf -H remote -t TCP_RR -- -o $MIB | tail -n 1 (repeat 3 times) Before patch : 18,1049.87,41004,39631,6295.47 17,239.52,40804,48,2912.79 18,348.40,40877,54,3573.39 After patch : 18,22.84,4606,38,16.39 17,21.56,2871,36,13.51 17,22.46,2705,37,11.83 Reported-by: Willem de Bruijn Signed-off-by: Eric Dumazet Fixes: f54b311142a9 ("tcp: auto corking") Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0ca8754..d099f9a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -622,19 +622,21 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) } /* If a not yet filled skb is pushed, do not send it if - * we have packets in Qdisc or NIC queues : + * we have data packets in Qdisc or NIC queues : * Because TX completion will happen shortly, it gives a chance * to coalesce future sendmsg() payload into this skb, without * need for a timer, and with no latency trade off. * As packets containing data payload have a bigger truesize - * than pure acks (dataless) packets, the last check prevents - * autocorking if we only have an ACK in Qdisc/NIC queues. + * than pure acks (dataless) packets, the last checks prevent + * autocorking if we only have an ACK in Qdisc/NIC queues, + * or if TX completion was delayed after we processed ACK packet. */ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && sysctl_tcp_autocorking && + skb != tcp_write_queue_head(sk) && atomic_read(&sk->sk_wmem_alloc) > skb->truesize; } @@ -660,7 +662,11 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &tp->tsq_flags); } - return; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. + */ + if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize) + return; } if (flags & MSG_MORE) -- cgit v1.1 From 09aea5df7fbf0b987623f7007a11e64008284a0e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 17 Dec 2013 22:35:52 -0800 Subject: netconf: rename PROXY_ARP to NEIGH_PROXY Use same field for both IPv4 (proxy_arp) and IPv6 (proxy_ndp) so fix it before API is set to be a common name Signed-off-by: Stephen Hemminger Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index de03fe7..0feebd5 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1696,7 +1696,7 @@ static int inet_netconf_msgsize_devconf(int type) size += nla_total_size(4); if (type == -1 || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_PROXY_ARP) + if (type == -1 || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); return size; @@ -1734,8 +1734,8 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_PROXY_ARP) && - nla_put_s32(skb, NETCONFA_PROXY_ARP, + if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; @@ -1775,7 +1775,7 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, - [NETCONFA_PROXY_ARP] = { .len = sizeof(int) }, + [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, }; static int inet_netconf_get_devconf(struct sk_buff *in_skb, @@ -2002,7 +2002,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_PROXY_ARP, + inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, ifindex, cnf); } } -- cgit v1.1 From a22318e83bc4726252262f91b08f17d2d66bd88c Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Mon, 23 Dec 2013 14:37:26 +0800 Subject: ipv4: do clean up with spaces Fix checkpatch errors like: ERROR: spaces required around that XXX Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 12 ++++++------ net/ipv4/tcp_input.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ec72645..dd9d90b 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -167,7 +167,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); - for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. @@ -227,7 +227,7 @@ void ip_options_fragment(struct sk_buff *skb) continue; } optlen = optptr[1]; - if (optlen<2 || optlen>l) + if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); @@ -276,7 +276,7 @@ int ip_options_compile(struct net *net, for (l = opt->optlen; l > 0; ) { switch (*optptr) { case IPOPT_END: - for (optptr++, l--; l>0; optptr++, l--) { + for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; @@ -289,7 +289,7 @@ int ip_options_compile(struct net *net, continue; } optlen = optptr[1]; - if (optlen<2 || optlen>l) { + if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } @@ -572,7 +572,7 @@ void ip_forward_options(struct sk_buff *skb) optptr = raw + opt->srr; - for ( srrptr=optptr[2], srrspace = optptr[1]; + for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { @@ -628,7 +628,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (rt->rt_type != RTN_LOCAL) return -EINVAL; - for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { + for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c53b7f3..f75cf15 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3686,7 +3686,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) int opcode = *ptr++; int opsize; - switch(opcode) { + switch (opcode) { case TCPOPT_EOL: return NULL; case TCPOPT_NOP: @@ -4046,7 +4046,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. */ - for (i=this_sack+1; i < num_sacks; i++) + for (i = this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; -- cgit v1.1 From 0c9a67d2ed028e0edd3260abafef4f1efd91aa5a Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Mon, 23 Dec 2013 14:37:27 +0800 Subject: ipv4: fix checkpatch error "space prohibited" Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 4 ++-- net/ipv4/netfilter/nf_nat_snmp_basic.c | 12 ++++++------ net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_yeah.c | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 33d5537..d047c88 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -227,7 +227,7 @@ static int addr_compare(const struct inetpeer_addr *a, stackptr = _stack; \ *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ - u != peer_avl_empty; ) { \ + u != peer_avl_empty;) { \ int cmp = addr_compare(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ @@ -282,7 +282,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ for (u = rcu_deref_locked(*v, base); \ - u->avl_right != peer_avl_empty_rcu; ) { \ + u->avl_right != peer_avl_empty_rcu;) { \ v = &u->avl_right; \ *stackptr++ = v; \ u = rcu_deref_locked(*v, base); \ diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 61a9422..d551e31 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -461,14 +461,14 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, } if (subid < 40) { - optr [0] = 0; - optr [1] = subid; + optr[0] = 0; + optr[1] = subid; } else if (subid < 80) { - optr [0] = 1; - optr [1] = subid - 40; + optr[0] = 1; + optr[1] = subid - 40; } else { - optr [0] = 2; - optr [1] = subid - 80; + optr[0] = 2; + optr[1] = subid - 80; } *len = 2; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b95331e..f2ed13c 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -121,7 +121,7 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ - diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a347a07..20cfaab 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -214,9 +214,9 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { if (yeah->doing_reno_now < TCP_YEAH_RHO) { reduction = yeah->lastQ; - reduction = min( reduction, max(tp->snd_cwnd>>1, 2U) ); + reduction = min(reduction, max(tp->snd_cwnd>>1, 2U)); - reduction = max( reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); } else reduction = max(tp->snd_cwnd>>1, 2U); -- cgit v1.1 From d41db5af26dba01235af9a9d514e043a44b85d60 Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Mon, 23 Dec 2013 14:37:28 +0800 Subject: ipv4: fix checkpatch error with foo * bar Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bbaf8cb..7297b56 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -827,7 +827,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; - struct sk_buff * skb; + struct sk_buff *skb; /* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) -- cgit v1.1 From c71151f05bf6da37a94861c9f181b0b6ff866e6a Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Mon, 23 Dec 2013 14:37:29 +0800 Subject: ipv4: fix all space errors in file igmp.c Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 70 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7defdc9..84c4329 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -310,7 +310,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) struct ip_sf_list *psf; int scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -463,7 +463,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, } first = 1; psf_prev = NULL; - for (psf=*psf_list; psf; psf=psf_next) { + for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; @@ -520,7 +520,7 @@ empty_source: return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ - if (skb && AVAILABLE(skb)sf_next; if (psf->sf_crcount == 0) { if (psf_prev) @@ -600,7 +600,7 @@ static void igmpv3_send_cr(struct in_device *in_dev) /* deleted MCA's */ pmc_prev = NULL; - for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { + for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; @@ -764,7 +764,7 @@ static void igmp_ifc_event(struct in_device *in_dev) static void igmp_timer_expire(unsigned long data) { - struct ip_mc_list *im=(struct ip_mc_list *)data; + struct ip_mc_list *im = (struct ip_mc_list *)data; struct in_device *in_dev = im->interface; spin_lock(&im->lock); @@ -794,10 +794,10 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) int i, scount; scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; isf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != @@ -825,10 +825,10 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) /* mark INCLUDE-mode sources */ scount = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; - for (i=0; isf_inaddr) { psf->sf_gsresp = 1; scount++; @@ -1103,7 +1103,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); @@ -1121,7 +1121,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; - for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { + for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; @@ -1134,7 +1134,7 @@ static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr) } spin_unlock_bh(&in_dev->mc_tomb_lock); if (pmc) { - for (psf=pmc->tomb; psf; psf=psf_next) { + for (psf = pmc->tomb; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -1167,7 +1167,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev) psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); - for (; psf; psf=psf_next) { + for (; psf; psf = psf_next) { psf_next = psf->sf_next; kfree(psf); } @@ -1557,7 +1557,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; @@ -1630,7 +1630,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[sfmode]--; } err = 0; - for (i=0; i 0; @@ -1650,7 +1650,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : IGMP_Unsolicited_Report_Count; in_dev->mr_ifc_count = pmc->crcount; - for (psf=pmc->sources; psf; psf = psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { @@ -1671,7 +1671,7 @@ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; @@ -1699,7 +1699,7 @@ static void sf_markstate(struct ip_mc_list *pmc) struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; - for (psf=pmc->sources; psf; psf=psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && @@ -1716,7 +1716,7 @@ static int sf_setstate(struct ip_mc_list *pmc) int new_in, rv; rv = 0; - for (psf=pmc->sources; psf; psf=psf->sf_next) { + for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -1726,7 +1726,7 @@ static int sf_setstate(struct ip_mc_list *pmc) if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; - for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) { + for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; @@ -1748,7 +1748,7 @@ static int sf_setstate(struct ip_mc_list *pmc) * add or update "delete" records if an active filter * is now inactive */ - for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) + for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { @@ -1800,7 +1800,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, if (!delta) pmc->sfcount[sfmode]++; err = 0; - for (i=0; isfcount[sfmode]--; - for (j=0; jsfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST @@ -1829,7 +1829,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : IGMP_Unsolicited_Report_Count; in_dev->mr_ifc_count = pmc->crcount; - for (psf=pmc->sources; psf; psf = psf->sf_next) + for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { @@ -1844,12 +1844,12 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *nextpsf; - for (psf=pmc->tomb; psf; psf=nextpsf) { + for (psf = pmc->tomb; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } pmc->tomb = NULL; - for (psf=pmc->sources; psf; psf=nextpsf) { + for (psf = pmc->sources; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } @@ -2043,7 +2043,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; - for (i=0; isl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) @@ -2062,7 +2062,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); - for (j=i+1; jsl_count; j++) + for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; @@ -2088,7 +2088,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { - for (i=0; isl_count; i++) + for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); @@ -2098,7 +2098,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ - for (i=0; isl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) @@ -2106,7 +2106,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } if (rv == 0) /* address already there is an error */ goto done; - for (j=psl->sl_count-1; j>=i; j--) + for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; @@ -2305,7 +2305,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { return -EFAULT; } - for (i=0; isl_count; i++) { + for (i = 0; i < psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } @@ -2423,7 +2423,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u rv = 1; } else if (im) { if (src_addr) { - for (psf=im->sources; psf; psf=psf->sf_next) { + for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } -- cgit v1.1 From 47d18a9be14e6f5d84e5b241f5ba1a2cd2a6ebdc Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Mon, 23 Dec 2013 14:37:30 +0800 Subject: ipv4: ERROR: do not initialise globals to 0 or NULL Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/tcp_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 8b97d71e..1f2d376 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -38,7 +38,7 @@ MODULE_DESCRIPTION("TCP cwnd snooper"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.1"); -static int port __read_mostly = 0; +static int port __read_mostly; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); @@ -46,7 +46,7 @@ static unsigned int bufsize __read_mostly = 4096; MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); module_param(bufsize, uint, 0); -static unsigned int fwmark __read_mostly = 0; +static unsigned int fwmark __read_mostly; MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); module_param(fwmark, uint, 0); -- cgit v1.1 From 5797deb657b9bec41214b50eaa3e2c4feb4b7d8d Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Mon, 23 Dec 2013 14:37:31 +0800 Subject: ipv4: ERROR: code indent should use tabs where possible Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 9 +++------ net/ipv4/sysctl_net_ipv4.c | 2 +- net/ipv4/xfrm4_mode_beet.c | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 4b59b6e..69e77c8 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1335,8 +1335,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1431,8 +1430,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; @@ -1526,8 +1524,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); + secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); if (secattr->attr.mls.cat == NULL) return -ENOMEM; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d7b63a6..1d2480a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -700,7 +700,7 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { + { .procname = "tcp_thin_dupack", .data = &sysctl_tcp_thin_dupack, .maxlen = sizeof(int), diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index e3db3f9..71acd00 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -48,7 +48,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); skb_set_network_header(skb, -x->props.header_len - - hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); + hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph))); if (x->sel.family != AF_INET6) skb->network_header += IPV4_BEET_PHMAXLEN; skb->mac_header = skb->network_header + -- cgit v1.1 From 49564e5516796bdda31ac15d8a48d06757fd41ec Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Mon, 23 Dec 2013 14:37:32 +0800 Subject: ipv4: ipv4: Cleanup the comments in tcp_yeah.c This cleanup the comments in tcp_yeah.c. 1.The old link is dead,use a new one to instead. 2.'lin' add nothing useful,remove it. 3.do not use C99 // comments. Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/tcp_yeah.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 20cfaab..1a8d271 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -3,7 +3,7 @@ * YeAH TCP * * For further details look at: - * http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf + * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf * */ #include @@ -15,13 +15,13 @@ #include "tcp_vegas.h" -#define TCP_YEAH_ALPHA 80 //lin number of packets queued at the bottleneck -#define TCP_YEAH_GAMMA 1 //lin fraction of queue to be removed per rtt -#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss -#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion -#define TCP_YEAH_PHY 8 //lin maximum delta from base -#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss -#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count +#define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */ +#define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */ +#define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */ +#define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */ +#define TCP_YEAH_PHY 8 /* maximum delta from base */ +#define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */ +#define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */ #define TCP_SCALABLE_AI_CNT 100U -- cgit v1.1 From c9cb6b6ec18ca91d090178d3c6d1dde7428c791b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 28 Dec 2013 11:05:36 -0800 Subject: ipv4: make fib_detect_death static Make fib_detect_death function static only used in one file. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/fib_lookup.h | 2 -- net/ipv4/fib_semantics.c | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 388d113..1e4f660 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -33,8 +33,6 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); -int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e63f47a..b53f0bf 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -426,8 +426,9 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) return NULL; } -int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt) +static int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, + int dflt) { struct neighbour *n; int state = NUD_NONE; -- cgit v1.1 From 7195cf72211d77108de8271a247ec2876ae60502 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 28 Dec 2013 11:07:18 -0800 Subject: arp: make arp_invalidate static Don't export arp_invalidate, only used in arp.c Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b67cf80..5bf408b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1112,7 +1112,7 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -int arp_invalidate(struct net_device *dev, __be32 ip) +static int arp_invalidate(struct net_device *dev, __be32 ip) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; @@ -1127,7 +1127,6 @@ int arp_invalidate(struct net_device *dev, __be32 ip) return err; } -EXPORT_SYMBOL(arp_invalidate); static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) -- cgit v1.1 From 068a6e183484b1c619025da502a6fbfa53c50991 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 28 Dec 2013 11:08:22 -0800 Subject: ipv4: remove unused function inetpeer_invalidate_family defined but never used Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d047c88..48f4244 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -109,13 +109,6 @@ static inline void flush_check(struct inet_peer_base *base, int family) } } -void inetpeer_invalidate_family(int family) -{ - atomic_t *fp = inetpeer_seq_ptr(family); - - atomic_inc(fp); -} - #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ /* Exported for sysctl_net_ipv4. */ -- cgit v1.1 From ea074b3495a023c2cf969605d51991b2b9df9514 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 28 Dec 2013 11:11:42 -0800 Subject: ipv4: ping make local stuff static Don't export ping_table or ping_v4_sendmsg. Both are only used inside ping code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ping.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 242e7f4..cae5262 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -53,8 +53,12 @@ #include #endif +struct ping_table { + struct hlist_nulls_head hash[PING_HTABLE_SIZE]; + rwlock_t lock; +}; -struct ping_table ping_table; +static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_SYMBOL_GPL(pingv6_ops); @@ -668,8 +672,8 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, } EXPORT_SYMBOL_GPL(ping_common_sendmsg); -int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) { struct net *net = sock_net(sk); struct flowi4 fl4; -- cgit v1.1 From f7e56a76acf642d21a8e7bd587e270ae7addc4db Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sun, 29 Dec 2013 11:39:51 -0800 Subject: tcp: make local functions static The following are only used in one file: tcp_connect_init tcp_set_rto Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f75cf15..65cf90e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -766,7 +766,7 @@ static void tcp_update_pacing_rate(struct sock *sk) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -void tcp_set_rto(struct sock *sk) +static void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9e7aec7..03d26b8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2777,7 +2777,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. */ -void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.1 From 24245a1b055df246dc94517c1a8b1fdfe7668da0 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sun, 29 Dec 2013 11:43:35 -0800 Subject: lro: remove dead code Remove leftover code that is not used anywhere in current tree. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/inet_lro.c | 173 ---------------------------------------------------- 1 file changed, 173 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 1975f52..f17ea49 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -230,29 +230,6 @@ static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, lro_desc->last_skb = skb; } -static void lro_add_frags(struct net_lro_desc *lro_desc, - int len, int hlen, int truesize, - struct skb_frag_struct *skb_frags, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *skb = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, tcph, tcp_data_len); - - skb->truesize += truesize; - - skb_frags[0].page_offset += hlen; - skb_frag_size_sub(&skb_frags[0], hlen); - - while (tcp_data_len > 0) { - *(lro_desc->next_frag) = *skb_frags; - tcp_data_len -= skb_frag_size(skb_frags); - lro_desc->next_frag++; - skb_frags++; - skb_shinfo(skb)->nr_frags++; - } -} static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, struct iphdr *iph, @@ -371,128 +348,6 @@ out: return 1; } - -static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - void *mac_hdr, - int hlen, __wsum sum, - u32 ip_summed) -{ - struct sk_buff *skb; - struct skb_frag_struct *skb_frags; - int data_len = len; - int hdr_len = min(len, hlen); - - skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad); - if (!skb) - return NULL; - - skb_reserve(skb, lro_mgr->frag_align_pad); - skb->len = len; - skb->data_len = len - hdr_len; - skb->truesize += true_size; - skb->tail += hdr_len; - - memcpy(skb->data, mac_hdr, hdr_len); - - skb_frags = skb_shinfo(skb)->frags; - while (data_len > 0) { - *skb_frags = *frags; - data_len -= skb_frag_size(frags); - skb_frags++; - frags++; - skb_shinfo(skb)->nr_frags++; - } - - skb_shinfo(skb)->frags[0].page_offset += hdr_len; - skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); - - skb->ip_summed = ip_summed; - skb->csum = sum; - skb->protocol = eth_type_trans(skb, lro_mgr->dev); - return skb; -} - -static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, - void *priv, __wsum sum) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - struct sk_buff *skb; - u64 flags; - void *mac_hdr; - int mac_hdr_len; - int hdr_len = LRO_MAX_PG_HLEN; - int vlan_hdr_len = 0; - - if (!lro_mgr->get_frag_header || - lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, - (void *)&tcph, &flags, priv)) { - mac_hdr = skb_frag_address(frags); - goto out1; - } - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out1; - - hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); - mac_hdr_len = (int)((void *)(iph) - mac_hdr); - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out1; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) - goto out1; - - skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, - hdr_len, 0, lro_mgr->ip_summed_aggr); - if (!skb) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - iph = (void *)(skb->data + vlan_hdr_len); - tcph = (void *)((u8 *)skb->data + vlan_hdr_len - + IP_HDR_LEN(iph)); - - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return NULL; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) - goto out2; - - lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return NULL; - -out2: /* send aggregated packets to the stack */ - lro_flush(lro_mgr, lro_desc); - -out1: /* Original packet has to be posted to the stack */ - skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, - hdr_len, sum, lro_mgr->ip_summed); -out: - return skb; -} - void lro_receive_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, void *priv) @@ -506,23 +361,6 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr, } EXPORT_SYMBOL(lro_receive_skb); -void lro_receive_frags(struct net_lro_mgr *lro_mgr, - struct skb_frag_struct *frags, - int len, int true_size, void *priv, __wsum sum) -{ - struct sk_buff *skb; - - skb = __lro_proc_segment(lro_mgr, frags, len, true_size, priv, sum); - if (!skb) - return; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); -} -EXPORT_SYMBOL(lro_receive_frags); - void lro_flush_all(struct net_lro_mgr *lro_mgr) { int i; @@ -534,14 +372,3 @@ void lro_flush_all(struct net_lro_mgr *lro_mgr) } } EXPORT_SYMBOL(lro_flush_all); - -void lro_flush_pkt(struct net_lro_mgr *lro_mgr, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc; - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (lro_desc->active) - lro_flush(lro_mgr, lro_desc); -} -EXPORT_SYMBOL(lro_flush_pkt); -- cgit v1.1 From cc70d069e2b9cece683206c0f6a1d1484414e577 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Sun, 29 Dec 2013 12:28:13 +0100 Subject: netfilter: REJECT: separate reusable code This patch prepares the addition of TCP reset support in the nft_reject module by moving reusable code into a header file. Signed-off-by: Eric Leblond Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_REJECT.c | 140 +++------------------------------------- 1 file changed, 10 insertions(+), 130 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index b969131..5b6e0df 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -17,10 +17,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include @@ -28,128 +24,12 @@ #include #endif +#include + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); -/* Send RST reply */ -static void send_reset(struct sk_buff *oldskb, int hook) -{ - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; - const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; - - /* IP header checks: fragment. */ - if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; - - oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); - if (oth == NULL) - return; - - /* No RST for RST. */ - if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; - - /* Check checksum */ - if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); - - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; - - skb_reserve(nskb, LL_MAX_HEADER); - - skb_reset_network_header(nskb); - niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); - niph->version = 4; - niph->ihl = sizeof(struct iphdr) / 4; - niph->tos = 0; - niph->id = 0; - niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; - niph->check = 0; - niph->saddr = oiph->daddr; - niph->daddr = oiph->saddr; - - skb_reset_transport_header(nskb); - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - memset(tcph, 0, sizeof(*tcph)); - tcph->source = oth->dest; - tcph->dest = oth->source; - tcph->doff = sizeof(struct tcphdr) / 4; - - if (oth->ack) - tcph->seq = oth->ack_seq; - else { - tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + - oldskb->len - ip_hdrlen(oldskb) - - (oth->doff << 2)); - tcph->ack = 1; - } - - tcph->rst = 1; - tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, - niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)tcph - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - /* ip_route_me_harder expects skb->dst to be set */ - skb_dst_set_noref(nskb, skb_dst(oldskb)); - - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, RTN_UNSPEC)) - goto free_nskb; - - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - - /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) - goto free_nskb; - - nf_ct_attach(nskb, oldskb); - -#ifdef CONFIG_BRIDGE_NETFILTER - /* If we use ip_local_out for bridged traffic, the MAC source on - * the RST will be ours, instead of the destination's. This confuses - * some routers/firewalls, and they drop the packet. So we need to - * build the eth header using the original destination's MAC as the - * source, and send the RST packet directly. - */ - if (oldskb->nf_bridge) { - struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = oldskb->nf_bridge->physindev; - niph->tot_len = htons(nskb->len); - ip_send_check(niph); - if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), - oeth->h_source, oeth->h_dest, nskb->len) < 0) - goto free_nskb; - dev_queue_xmit(nskb); - } else -#endif - ip_local_out(nskb); - - return; - - free_nskb: - kfree_skb(nskb); -} - -static inline void send_unreach(struct sk_buff *skb_in, int code) -{ - icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); -} - static unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -157,28 +37,28 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: - send_unreach(skb, ICMP_NET_UNREACH); + nf_send_unreach(skb, ICMP_NET_UNREACH); break; case IPT_ICMP_HOST_UNREACHABLE: - send_unreach(skb, ICMP_HOST_UNREACH); + nf_send_unreach(skb, ICMP_HOST_UNREACH); break; case IPT_ICMP_PROT_UNREACHABLE: - send_unreach(skb, ICMP_PROT_UNREACH); + nf_send_unreach(skb, ICMP_PROT_UNREACH); break; case IPT_ICMP_PORT_UNREACHABLE: - send_unreach(skb, ICMP_PORT_UNREACH); + nf_send_unreach(skb, ICMP_PORT_UNREACH); break; case IPT_ICMP_NET_PROHIBITED: - send_unreach(skb, ICMP_NET_ANO); + nf_send_unreach(skb, ICMP_NET_ANO); break; case IPT_ICMP_HOST_PROHIBITED: - send_unreach(skb, ICMP_HOST_ANO); + nf_send_unreach(skb, ICMP_HOST_ANO); break; case IPT_ICMP_ADMIN_PROHIBITED: - send_unreach(skb, ICMP_PKT_FILTERED); + nf_send_unreach(skb, ICMP_PKT_FILTERED); break; case IPT_TCP_RESET: - send_reset(skb, par->hooknum); + nf_send_reset(skb, par->hooknum); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; -- cgit v1.1 From bee11dc78fc8a41299be5ce04b1c76b0057af450 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Sun, 29 Dec 2013 12:28:14 +0100 Subject: netfilter: nft_reject: support for IPv6 and TCP reset This patch moves nft_reject_ipv4 to nft_reject and adds support for IPv6 protocol. This patch uses functions included in nf_reject.h to implement reject by TCP reset. The code has to be build as a module if NF_TABLES_IPV6 is also a module to avoid compilation error due to usage of IPv6 functions. This has been done in Kconfig by using the construct: depends on NF_TABLES_IPV6 || !NF_TABLES_IPV6 This seems a bit weird in terms of syntax but works perfectly. Signed-off-by: Eric Leblond Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 4 -- net/ipv4/netfilter/Makefile | 1 - net/ipv4/netfilter/nft_reject_ipv4.c | 123 ----------------------------------- 3 files changed, 128 deletions(-) delete mode 100644 net/ipv4/netfilter/nft_reject_ipv4.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 40d5607..9d3d69a 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -40,10 +40,6 @@ config NF_TABLES_IPV4 depends on NF_TABLES tristate "IPv4 nf_tables support" -config NFT_REJECT_IPV4 - depends on NF_TABLES_IPV4 - tristate "nf_tables IPv4 reject support" - config NFT_CHAIN_ROUTE_IPV4 depends on NF_TABLES_IPV4 tristate "IPv4 nf_tables route chain support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19df72b..c16be9d 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -28,7 +28,6 @@ obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o -obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c deleted file mode 100644 index fff5ba1..0000000 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2008-2009 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct nft_reject { - enum nft_reject_types type:8; - u8 icmp_code; -}; - -static void nft_reject_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) -{ - struct nft_reject *priv = nft_expr_priv(expr); - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - icmp_send(pkt->skb, ICMP_DEST_UNREACH, priv->icmp_code, 0); - break; - case NFT_REJECT_TCP_RST: - break; - } - - data[NFT_REG_VERDICT].verdict = NF_DROP; -} - -static const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { - [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, - [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, -}; - -static int nft_reject_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_reject *priv = nft_expr_priv(expr); - - if (tb[NFTA_REJECT_TYPE] == NULL) - return -EINVAL; - - priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (tb[NFTA_REJECT_ICMP_CODE] == NULL) - return -EINVAL; - priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); - case NFT_REJECT_TCP_RST: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) -{ - const struct nft_reject *priv = nft_expr_priv(expr); - - if (nla_put_be32(skb, NFTA_REJECT_TYPE, priv->type)) - goto nla_put_failure; - - switch (priv->type) { - case NFT_REJECT_ICMP_UNREACH: - if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) - goto nla_put_failure; - break; - } - - return 0; - -nla_put_failure: - return -1; -} - -static struct nft_expr_type nft_reject_type; -static const struct nft_expr_ops nft_reject_ops = { - .type = &nft_reject_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), - .eval = nft_reject_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, -}; - -static struct nft_expr_type nft_reject_type __read_mostly = { - .name = "reject", - .ops = &nft_reject_ops, - .policy = nft_reject_policy, - .maxattr = NFTA_REJECT_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_reject_module_init(void) -{ - return nft_register_expr(&nft_reject_type); -} - -static void __exit nft_reject_module_exit(void) -{ - nft_unregister_expr(&nft_reject_type); -} - -module_init(nft_reject_module_init); -module_exit(nft_reject_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_EXPR("reject"); -- cgit v1.1 From d497c63527369844ea705149d5a26c9ac0f78282 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 30 Dec 2013 15:09:18 +0100 Subject: netfilter: add help information to new nf_tables Kconfig options Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9d3d69a..81c6910 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -39,19 +39,33 @@ config NF_CONNTRACK_PROC_COMPAT config NF_TABLES_IPV4 depends on NF_TABLES tristate "IPv4 nf_tables support" + help + This option enables the IPv4 support for nf_tables. config NFT_CHAIN_ROUTE_IPV4 depends on NF_TABLES_IPV4 tristate "IPv4 nf_tables route chain support" + help + This option enables the "route" chain for IPv4 in nf_tables. This + chain type is used to force packet re-routing after mangling header + fields such as the source, destination, type of service and + the packet mark. config NFT_CHAIN_NAT_IPV4 depends on NF_TABLES_IPV4 depends on NF_NAT_IPV4 && NFT_NAT tristate "IPv4 nf_tables nat chain support" + help + This option enables the "nat" chain for IPv4 in nf_tables. This + chain type is used to perform Network Address Translation (NAT) + packet transformations such as the source, destination address and + source and destination ports. config NF_TABLES_ARP depends on NF_TABLES tristate "ARP nf_tables support" + help + This option enables the ARP support for nf_tables. config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" -- cgit v1.1 From 56022a8fdd874c56bb61d8c82559e43044d1aa06 Mon Sep 17 00:00:00 2001 From: Salam Noureddine Date: Tue, 24 Dec 2013 14:17:02 -0800 Subject: ipv4: arp: update neighbour address when a gratuitous arp is received and arp_accept is set Gratuitous arp packets are useful in switchover scenarios to update client arp tables as quickly as possible. Currently, the mac address of a neighbour is only updated after a locktime period has elapsed since the last update. In most use cases such delays are unacceptable for network admins. Moreover, the "updated" field of the neighbour stucture doesn't record the last time the address of a neighbour changed but records any change that happens to the neighbour. This is clearly a bug since locktime uses that field as meaning "addr_updated". With this observation, I was able to perpetuate a stale address by sending a stream of gratuitous arp packets spaced less than locktime apart. With this change the address is updated when a gratuitous arp is received and the arp_accept sysctl is set. Signed-off-by: Salam Noureddine Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/arp.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 5bf408b..1a9b99e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -732,6 +732,7 @@ static int arp_process(struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. @@ -898,10 +899,12 @@ static int arp_process(struct sk_buff *skb) It is possible, that this option should be enabled for some devices (strip is candidate) */ + is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && + inet_addr_type(net, sip) == RTN_UNICAST; + if (n == NULL && - (arp->ar_op == htons(ARPOP_REPLY) || - (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && - inet_addr_type(net, sip) == RTN_UNICAST) + ((arp->ar_op == htons(ARPOP_REPLY) && + inet_addr_type(net, sip) == RTN_UNICAST) || is_garp)) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -914,8 +917,10 @@ static int arp_process(struct sk_buff *skb) agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ - override = time_after(jiffies, n->updated + - NEIGH_VAR(n->parms, LOCKTIME)); + override = time_after(jiffies, + n->updated + + NEIGH_VAR(n->parms, LOCKTIME)) || + is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. -- cgit v1.1 From 442c67f8448321c1919ddb33524cc4ecaa9a94f3 Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Tue, 31 Dec 2013 15:11:27 +0800 Subject: ipv4: spaces required around that '=' Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/proc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8ecd7ad..a6c8a80 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -333,22 +333,22 @@ static void icmp_put(struct seq_file *seq) atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_printf(seq, " OutMsgs OutErrors"); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS), snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); - for (i=0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } -- cgit v1.1 From dd9b45598a7198f8b12965f2ec453bcb5cb90aec Mon Sep 17 00:00:00 2001 From: Weilong Chen Date: Tue, 31 Dec 2013 15:11:28 +0800 Subject: ipv4: switch and case should be at the same indent Signed-off-by: Weilong Chen Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index dd9d90b..f4ab72e 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -275,7 +275,7 @@ int ip_options_compile(struct net *net, for (l = opt->optlen; l > 0; ) { switch (*optptr) { - case IPOPT_END: + case IPOPT_END: for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; @@ -283,7 +283,7 @@ int ip_options_compile(struct net *net, } } goto eol; - case IPOPT_NOOP: + case IPOPT_NOOP: l--; optptr++; continue; @@ -294,8 +294,8 @@ int ip_options_compile(struct net *net, goto error; } switch (*optptr) { - case IPOPT_SSRR: - case IPOPT_LSRR: + case IPOPT_SSRR: + case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; @@ -321,7 +321,7 @@ int ip_options_compile(struct net *net, opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; - case IPOPT_RR: + case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; @@ -349,7 +349,7 @@ int ip_options_compile(struct net *net, } opt->rr = optptr - iph; break; - case IPOPT_TIMESTAMP: + case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; @@ -369,13 +369,13 @@ int ip_options_compile(struct net *net, goto error; } switch (optptr[3]&0xF) { - case IPOPT_TS_TSONLY: + case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; - case IPOPT_TS_TSANDADDR: + case IPOPT_TS_TSANDADDR: if (optptr[2]+7 > optptr[1]) { pp_ptr = optptr + 2; goto error; @@ -389,7 +389,7 @@ int ip_options_compile(struct net *net, opt->ts_needtime = 1; optptr[2] += 8; break; - case IPOPT_TS_PRESPEC: + case IPOPT_TS_PRESPEC: if (optptr[2]+7 > optptr[1]) { pp_ptr = optptr + 2; goto error; @@ -405,7 +405,7 @@ int ip_options_compile(struct net *net, opt->ts_needtime = 1; optptr[2] += 8; break; - default: + default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; @@ -433,7 +433,7 @@ int ip_options_compile(struct net *net, } opt->ts = optptr - iph; break; - case IPOPT_RA: + case IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; @@ -441,7 +441,7 @@ int ip_options_compile(struct net *net, if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; - case IPOPT_CIPSO: + case IPOPT_CIPSO: if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; @@ -452,9 +452,9 @@ int ip_options_compile(struct net *net, goto error; } break; - case IPOPT_SEC: - case IPOPT_SID: - default: + case IPOPT_SEC: + case IPOPT_SID: + default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; -- cgit v1.1 From dcd93ed4cd1669b2c1510e801fe5f1132390761c Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 30 Dec 2013 17:16:08 -0800 Subject: netfilter: nf_conntrack: remove dead code The following code is not used in current upstream code. Some of this seems to be old hooks, other might be used by some out of tree module (which I don't care about breaking), and the need_ipv4_conntrack was used by old NAT code but no longer called. Signed-off-by: Stephen Hemminger Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index ecd8bec..8127dc8 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -548,9 +548,3 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void) module_init(nf_conntrack_l3proto_ipv4_init); module_exit(nf_conntrack_l3proto_ipv4_fini); - -void need_ipv4_conntrack(void) -{ - return; -} -EXPORT_SYMBOL_GPL(need_ipv4_conntrack); -- cgit v1.1 From 7d442fab0a6777fd7612cfcada32ea859553d370 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 2 Jan 2014 11:48:26 -0800 Subject: ipv4: Cache dst in tunnels Avoid doing a route lookup on every packet being tunneled. In ip_tunnel.c cache the route returned from ip_route_output if the tunnel is "connected" so that all the rouitng parameters are taken from tunnel parms for a packet. Specifically, not NBMA tunnel and tos is from tunnel parms (not inner packet). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 113 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 89 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 90ff957..27d756f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,6 +68,54 @@ static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, IP_TNL_HASH_BITS); } +static inline void __tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + if (dst && (dst->flags & DST_NOCACHE)) + dst = NULL; + + spin_lock_bh(&t->dst_lock); + old_dst = rcu_dereference_raw(t->dst_cache); + rcu_assign_pointer(t->dst_cache, dst); + dst_release(old_dst); + spin_unlock_bh(&t->dst_lock); +} + +static inline void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +{ + __tunnel_dst_set(t, dst); +} + +static inline void tunnel_dst_reset(struct ip_tunnel *t) +{ + tunnel_dst_set(t, NULL); +} + +static inline struct dst_entry *tunnel_dst_get(struct ip_tunnel *t) +{ + struct dst_entry *dst; + + rcu_read_lock(); + dst = rcu_dereference(t->dst_cache); + if (dst) + dst_hold(dst); + rcu_read_unlock(); + return dst; +} + +struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie) +{ + struct dst_entry *dst = tunnel_dst_get(t); + + if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + tunnel_dst_reset(t); + return NULL; + } + + return dst; +} + /* Often modified stats are per cpu, other are shared (netdev->stats) */ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *tot) @@ -318,11 +366,10 @@ failed: return ERR_PTR(err); } -static inline struct rtable *ip_route_output_tunnel(struct net *net, - struct flowi4 *fl4, - int proto, - __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif) +static inline void init_tunnel_flow(struct flowi4 *fl4, + int proto, + __be32 daddr, __be32 saddr, + __be32 key, __u8 tos, int oif) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = oif; @@ -331,7 +378,6 @@ static inline struct rtable *ip_route_output_tunnel(struct net *net, fl4->flowi4_tos = tos; fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; - return ip_route_output_key(net, fl4); } static int ip_tunnel_bind_dev(struct net_device *dev) @@ -350,14 +396,14 @@ static int ip_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - rt = ip_route_output_tunnel(tunnel->net, &fl4, - tunnel->parms.iph.protocol, - iph->daddr, iph->saddr, - tunnel->parms.o_key, - RT_TOS(iph->tos), - tunnel->parms.link); + init_tunnel_flow(&fl4, iph->protocol, iph->daddr, + iph->saddr, tunnel->parms.o_key, + RT_TOS(iph->tos), tunnel->parms.link); + rt = ip_route_output_key(tunnel->net, &fl4); + if (!IS_ERR(rt)) { tdev = rt->dst.dev; + tunnel_dst_set(tunnel, dst_clone(&rt->dst)); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -528,10 +574,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4; u8 tos, ttl; __be16 df; - struct rtable *rt; /* Route to the other host */ + struct rtable *rt = NULL; /* Route to the other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int err; + bool connected = true; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); @@ -581,27 +628,39 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, #endif else goto tx_error; + + connected = false; } tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP)) { tos = inner_iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + connected = false; + } else if (skb->protocol == htons(ETH_P_IPV6)) { tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); + connected = false; + } } - rt = ip_route_output_tunnel(tunnel->net, &fl4, - protocol, - dst, tnl_params->saddr, - tunnel->parms.o_key, - RT_TOS(tos), - tunnel->parms.link); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + + if (connected) + rt = (struct rtable *)tunnel_dst_check(tunnel, 0); + + if (!rt) { + rt = ip_route_output_key(tunnel->net, &fl4); + + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; + } + if (connected) + tunnel_dst_set(tunnel, dst_clone(&rt->dst)); } + if (rt->dst.dev == dev) { ip_rt_put(rt); dev->stats.collisions++; @@ -696,6 +755,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } + tunnel_dst_reset(t); netdev_state_change(dev); } @@ -1001,6 +1061,9 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; + tunnel->dst_cache = NULL; + spin_lock_init(&tunnel->dst_lock); + return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); @@ -1015,6 +1078,8 @@ void ip_tunnel_uninit(struct net_device *dev) /* fb_tunnel_dev will be unregisted in net-exit call. */ if (itn->fb_tunnel_dev != dev) ip_tunnel_del(netdev_priv(dev)); + + tunnel_dst_reset(tunnel); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); -- cgit v1.1 From 9a4aa9af447f784f0a47313c8dcb79ac63442cf7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 2 Jan 2014 11:48:33 -0800 Subject: ipv4: Use percpu Cache route in IP tunnels percpu route cache eliminates share of dst refcnt between CPUs. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 27d756f..e2c9cff 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,23 +68,24 @@ static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, IP_TNL_HASH_BITS); } -static inline void __tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +static inline void __tunnel_dst_set(struct ip_tunnel_dst *idst, + struct dst_entry *dst) { struct dst_entry *old_dst; if (dst && (dst->flags & DST_NOCACHE)) dst = NULL; - spin_lock_bh(&t->dst_lock); - old_dst = rcu_dereference_raw(t->dst_cache); - rcu_assign_pointer(t->dst_cache, dst); + spin_lock_bh(&idst->lock); + old_dst = rcu_dereference(idst->dst); + rcu_assign_pointer(idst->dst, dst); dst_release(old_dst); - spin_unlock_bh(&t->dst_lock); + spin_unlock_bh(&idst->lock); } static inline void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) { - __tunnel_dst_set(t, dst); + __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); } static inline void tunnel_dst_reset(struct ip_tunnel *t) @@ -92,12 +93,20 @@ static inline void tunnel_dst_reset(struct ip_tunnel *t) tunnel_dst_set(t, NULL); } +static void tunnel_dst_reset_all(struct ip_tunnel *t) +{ + int i; + + for_each_possible_cpu(i) + __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); +} + static inline struct dst_entry *tunnel_dst_get(struct ip_tunnel *t) { struct dst_entry *dst; rcu_read_lock(); - dst = rcu_dereference(t->dst_cache); + dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); if (dst) dst_hold(dst); rcu_read_unlock(); @@ -755,7 +764,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } - tunnel_dst_reset(t); + tunnel_dst_reset_all(t); netdev_state_change(dev); } @@ -871,6 +880,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); + free_percpu(tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1049,8 +1059,21 @@ int ip_tunnel_init(struct net_device *dev) u64_stats_init(&ipt_stats->syncp); } + tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); + if (!tunnel->dst_cache) { + free_percpu(dev->tstats); + return -ENOMEM; + } + + for_each_possible_cpu(i) { + struct ip_tunnel_dst *idst = per_cpu_ptr(tunnel->dst_cache, i); + idst-> dst = NULL; + spin_lock_init(&idst->lock); + } + err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { + free_percpu(tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1061,9 +1084,6 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; - tunnel->dst_cache = NULL; - spin_lock_init(&tunnel->dst_lock); - return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); @@ -1079,7 +1099,7 @@ void ip_tunnel_uninit(struct net_device *dev) if (itn->fb_tunnel_dev != dev) ip_tunnel_del(netdev_priv(dev)); - tunnel_dst_reset(tunnel); + tunnel_dst_reset_all(tunnel); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); -- cgit v1.1 From 8f84985fec10de64a6b4cdfea45f2b0ab8f07c78 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sat, 4 Jan 2014 13:57:59 +0800 Subject: net: unify the pcpu_tstats and br_cpu_netstats as one They are same, so unify them as one, pcpu_sw_netstats. Define pcpu_sw_netstat in netdevice.h, remove pcpu_tstats from if_tunnel and remove br_cpu_netstats from br_private.h Cc: Cong Wang Cc: Stephen Hemminger Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 9 +++++---- net/ipv4/ip_vti.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e2c9cff..07a5ed3 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -132,7 +132,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, int i; for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); u64 rx_packets, rx_bytes, tx_packets, tx_bytes; unsigned int start; @@ -460,7 +461,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, bool log_ecn_error) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); int err; @@ -1049,12 +1050,12 @@ int ip_tunnel_init(struct net_device *dev) int i, err; dev->destructor = ip_tunnel_dev_free; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *ipt_stats; + struct pcpu_sw_netstats *ipt_stats; ipt_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&ipt_stats->syncp); } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 52b802a..0783200 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -60,7 +60,7 @@ static int vti_rcv(struct sk_buff *skb) tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel != NULL) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; u32 oldmark = skb->mark; int ret; -- cgit v1.1 From 996b175e39ed42ec2aa0c63b4a03cc500aa6269f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jan 2014 09:36:12 -0800 Subject: tcp: out_of_order_queue do not use its lock TCP out_of_order_queue lock is not used, as queue manipulation happens with socket lock held and we therefore use the lockless skb queue routines (as __skb_queue_head()) We can use __skb_queue_head_init() instead of skb_queue_head_init() to make this more consistent. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d099f9a..e2a4051 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -381,7 +381,7 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - skb_queue_head_init(&tp->out_of_order_queue); + __skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 97b6841..3aa9e32 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -425,7 +425,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); - skb_queue_head_init(&newtp->out_of_order_queue); + __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; -- cgit v1.1 From 438e38fadca2f6e57eeecc08326c8a95758594d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jan 2014 14:03:07 -0800 Subject: gre_offload: statically build GRE offloading support GRO/GSO layers can be enabled on a node, even if said node is only forwarding packets. This patch permits GSO (and upcoming GRO) support for GRE encapsulated packets, even if the host has no GRE tunnel setup. Signed-off-by: Eric Dumazet Cc: H.K. Jerry Chu Signed-off-by: David S. Miller --- net/ipv4/Makefile | 4 ++-- net/ipv4/gre_demux.c | 9 --------- net/ipv4/gre_offload.c | 7 +++++-- 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4b81e91..f8c49ce 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ - inet_fragment.o ping.o ip_tunnel_core.o + inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o @@ -19,7 +19,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o -gre-y := gre_demux.o gre_offload.o +gre-y := gre_demux.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 5893e99..1863422f 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -355,14 +355,7 @@ static int __init gre_init(void) goto err_gre; } - if (gre_offload_init()) { - pr_err("can't add protocol offload\n"); - goto err_gso; - } - return 0; -err_gso: - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); err_gre: inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); err: @@ -371,8 +364,6 @@ err: static void __exit gre_exit(void) { - gre_offload_exit(); - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 2cd02f3..9138cfb 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -123,12 +123,15 @@ static const struct net_offload gre_offload = { }, }; -int __init gre_offload_init(void) +static int __init gre_offload_init(void) { return inet_add_offload(&gre_offload, IPPROTO_GRE); } -void __exit gre_offload_exit(void) +static void __exit gre_offload_exit(void) { inet_del_offload(&gre_offload, IPPROTO_GRE); } + +module_init(gre_offload_init); +module_exit(gre_offload_exit); -- cgit v1.1 From dfd1582d1e4d117f46df720679d595f984ef902a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 7 Jan 2014 15:55:45 +0100 Subject: ipv4: loopback device: ignore value changes after device is upped When lo is brought up, new ifa is created. Then, devconf and neigh values bitfield should be set so later changes of default values would not affect lo values. Note that the same behaviour is in ipv6. Also note that this is likely not an issue in many distros (for example Fedora 19) because userspace sets address to lo manually before bringing it up. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 0feebd5..9809f7b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1385,6 +1385,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + ipv4_devconf_setall(in_dev); + neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } -- cgit v1.1 From bf5a755f5e9186406bbf50f4087100af5bd68e40 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Tue, 7 Jan 2014 10:23:19 -0800 Subject: net-gre-gro: Add GRE support to the GRO stack This patch built on top of Commit 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 ("net-gro: Prepare GRO stack for the upcoming tunneling support") to add the support of the standard GRE (RFC1701/RFC2784/RFC2890) to the GRO stack. It also serves as an example for supporting other encapsulation protocols in the GRO stack in the future. The patch supports version 0 and all the flags (key, csum, seq#) but will flush any pkt with the S (seq#) flag. This is because the S flag is not support by GSO, and a GRO pkt may end up in the forwarding path, thus requiring GSO support to break it up correctly. Currently the "packet_offload" structure only contains L3 (ETH_P_IP/ ETH_P_IPV6) GRO offload support so the encapped pkts are limited to IP pkts (i.e., w/o L2 hdr). But support for other protocol type can be easily added, so is the support for GRE variations like NVGRE. The patch also support csum offload. Specifically if the csum flag is on and the h/w is capable of checksumming the payload (CHECKSUM_COMPLETE), the code will take advantage of the csum computed by the h/w when validating the GRE csum. Note that commit 60769a5dcd8755715c7143b4571d5c44f01796f1 "ipv4: gre: add GRO capability" already introduces GRO capability to IPv4 GRE tunnels, using the gro_cells infrastructure. But GRO is done after GRE hdr has been removed (i.e., decapped). The following patch applies GRO when pkts first come in (before hitting the GRE tunnel code). There is some performance advantage for applying GRO as early as possible. Also this approach is transparent to other subsystem like Open vSwitch where GRE decap is handled outside of the IP stack hence making it harder for the gro_cells stuff to apply. On the other hand, some NICs are still not capable of hashing on the inner hdr of a GRE pkt (RSS). In that case the GRO processing of pkts from the same remote host will all happen on the same CPU and the performance may be suboptimal. I'm including some rough preliminary performance numbers below. Note that the performance will be highly dependent on traffic load, mix as usual. Moreover it also depends on NIC offload features hence the following is by no means a comprehesive study. Local testing and tuning will be needed to decide the best setting. All tests spawned 50 copies of netperf TCP_STREAM and ran for 30 secs. (super_netperf 50 -H 192.168.1.18 -l 30) An IP GRE tunnel with only the key flag on (e.g., ip tunnel add gre1 mode gre local 10.246.17.18 remote 10.246.17.17 ttl 255 key 123) is configured. The GRO support for pkts AFTER decap are controlled through the device feature of the GRE device (e.g., ethtool -K gre1 gro on/off). 1.1 ethtool -K gre1 gro off; ethtool -K eth0 gro off thruput: 9.16Gbps CPU utilization: 19% 1.2 ethtool -K gre1 gro on; ethtool -K eth0 gro off thruput: 5.9Gbps CPU utilization: 15% 1.3 ethtool -K gre1 gro off; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 12-13% 1.4 ethtool -K gre1 gro on; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 10% The following tests were performed on a different NIC that is capable of csum offload. I.e., the h/w is capable of computing IP payload csum (CHECKSUM_COMPLETE). 2.1 ethtool -K gre1 gro on (hence will use gro_cells) 2.1.1 ethtool -K eth0 gro off; csum offload disabled thruput: 8.53Gbps CPU utilization: 9% 2.1.2 ethtool -K eth0 gro off; csum offload enabled thruput: 8.97Gbps CPU utilization: 7-8% 2.1.3 ethtool -K eth0 gro on; csum offload disabled thruput: 8.83Gbps CPU utilization: 5-6% 2.1.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.98Gbps CPU utilization: 5% 2.2 ethtool -K gre1 gro off 2.2.1 ethtool -K eth0 gro off; csum offload disabled thruput: 5.93Gbps CPU utilization: 9% 2.2.2 ethtool -K eth0 gro off; csum offload enabled thruput: 5.62Gbps CPU utilization: 8% 2.2.3 ethtool -K eth0 gro on; csum offload disabled thruput: 7.69Gbps CPU utilization: 8% 2.2.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.96Gbps CPU utilization: 5-6% Signed-off-by: H.K. Jerry Chu Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 10 +++- net/ipv4/gre_offload.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_offload.c | 7 ++- 3 files changed, 172 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b8bc1a3..6268a47 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1391,9 +1391,15 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, NAPI_GRO_CB(p)->flush |= (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | - (__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) | - ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); + ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); + /* Save the IP ID check to be included later when we get to + * the transport layer so only the inner most IP ID is checked. + * This is because some GSO/TSO implementations do not + * correctly increment the IP ID for the outer hdrs. + */ + NAPI_GRO_CB(p)->flush_id = + ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 9138cfb..746a7b1 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -116,10 +116,170 @@ out: return segs; } +/* Compute the whole skb csum in s/w and store it, then verify GRO csum + * starting from gro_offset. + */ +static __sum16 gro_skb_checksum(struct sk_buff *skb) +{ + __sum16 sum; + + skb->csum = skb_checksum(skb, 0, skb->len, 0); + NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, + csum_partial(skb->data, skb_gro_offset(skb), 0)); + sum = csum_fold(NAPI_GRO_CB(skb)->csum); + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { + if (unlikely(!sum)) + netdev_rx_csum_fault(skb->dev); + } else + skb->ip_summed = CHECKSUM_COMPLETE; + + return sum; +} + +static struct sk_buff **gre_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + const struct gre_base_hdr *greh; + unsigned int hlen, grehlen; + unsigned int off; + int flush = 1; + struct packet_offload *ptype; + __be16 type; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*greh); + greh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out; + } + + /* Only support version 0 and K (key), C (csum) flags. Note that + * although the support for the S (seq#) flag can be added easily + * for GRO, this is problematic for GSO hence can not be enabled + * here because a GRO pkt may end up in the forwarding path, thus + * requiring GSO support to break it up correctly. + */ + if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) + goto out; + + type = greh->protocol; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (ptype == NULL) + goto out_unlock; + + grehlen = GRE_HEADER_SECTION; + + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + hlen = off + grehlen; + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out_unlock; + } + if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ + __sum16 csum = 0; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum = csum_fold(NAPI_GRO_CB(skb)->csum); + /* Don't trust csum error calculated/reported by h/w */ + if (skb->ip_summed == CHECKSUM_NONE || csum != 0) + csum = gro_skb_checksum(skb); + + /* GRE CSUM is the 1's complement of the 1's complement sum + * of the GRE hdr plus payload so it should add up to 0xffff + * (and 0 after csum_fold()) just like the IPv4 hdr csum. + */ + if (csum) + goto out_unlock; + } + flush = 0; + + for (p = *head; p; p = p->next) { + const struct gre_base_hdr *greh2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + /* The following checks are needed to ensure only pkts + * from the same tunnel are considered for aggregation. + * The criteria for "the same tunnel" includes: + * 1) same version (we only support version 0 here) + * 2) same protocol (we only support ETH_P_IP for now) + * 3) same set of flags + * 4) same key if the key field is present. + */ + greh2 = (struct gre_base_hdr *)(p->data + off); + + if (greh2->flags != greh->flags || + greh2->protocol != greh->protocol) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + if (greh->flags & GRE_KEY) { + /* compare keys */ + if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + } + + skb_gro_pull(skb, grehlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, greh, grehlen); + + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +int gre_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); + struct packet_offload *ptype; + unsigned int grehlen = sizeof(*greh); + int err = -ENOENT; + __be16 type; + + type = greh->protocol; + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype != NULL) + err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); + + rcu_read_unlock(); + return err; +} + static const struct net_offload gre_offload = { .callbacks = { .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, + .gro_receive = gre_gro_receive, + .gro_complete = gre_gro_complete, }, }; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2658a27..771a395 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -197,7 +197,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto out_check_final; found: - flush = NAPI_GRO_CB(p)->flush; + /* Include the IP ID check below from the inner most IP hdr */ + flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id; flush |= (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); @@ -230,7 +231,7 @@ out_check_final: pp = head; out: - NAPI_GRO_CB(skb)->flush |= flush; + NAPI_GRO_CB(skb)->flush |= (flush != 0); return pp; } @@ -280,7 +281,7 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff * if (NAPI_GRO_CB(skb)->flush) goto skip_csum; - wsum = skb->csum; + wsum = NAPI_GRO_CB(skb)->csum; switch (skb->ip_summed) { case CHECKSUM_NONE: -- cgit v1.1 From 3b088c4bc0035da662faa81818ba217e34c4bba4 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 3 Jan 2014 12:16:13 +0000 Subject: netfilter: nf_tables: make chain types override the default AF functions Currently the AF-specific hook functions override the chain-type specific hook functions. That doesn't make too much sense since the chain types are a special case of the AF-specific hooks. Make the AF-specific hook functions the default and make the optional chain type hooks override them. As a side effect, the necessary code restructuring reduces the code size, f.i. in case of nf_tables_ipv4.o: nf_tables_ipv4_init_net | -24 nft_do_chain_ipv4 | -113 2 functions changed, 137 bytes removed, diff: -137 Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_arp.c | 38 ++++++++++++++++---------------- net/ipv4/netfilter/nf_tables_ipv4.c | 43 ++++++++++++++++--------------------- 2 files changed, 37 insertions(+), 44 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 3e67ef1..31bb778 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -14,10 +14,29 @@ #include #include +static unsigned int +nft_do_chain_arp(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, ops, skb, in, out); + + return nft_do_chain_pktinfo(&pkt, ops); +} + static struct nft_af_info nft_af_arp __read_mostly = { .family = NFPROTO_ARP, .nhooks = NF_ARP_NUMHOOKS, .owner = THIS_MODULE, + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + [NF_ARP_FORWARD] = nft_do_chain_arp, + }, }; static int nf_tables_arp_init_net(struct net *net) @@ -48,20 +67,6 @@ static struct pernet_operations nf_tables_arp_net_ops = { .exit = nf_tables_arp_exit_net, }; -static unsigned int -nft_do_chain_arp(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - static struct nf_chain_type filter_arp = { .family = NFPROTO_ARP, .name = "filter", @@ -69,11 +74,6 @@ static struct nf_chain_type filter_arp = { .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), - .fn = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - [NF_ARP_FORWARD] = nft_do_chain_arp, - }, }; static int __init nf_tables_arp_init(void) diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 0f4cbfe..ed7e15a 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -18,14 +18,25 @@ #include #include +static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); + + return nft_do_chain_pktinfo(&pkt, ops); +} + static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - struct nft_pktinfo pkt; - if (unlikely(skb->len < sizeof(struct iphdr) || ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { if (net_ratelimit()) @@ -33,9 +44,8 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, "packet\n"); return NF_ACCEPT; } - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - return nft_do_chain_pktinfo(&pkt, ops); + return nft_do_chain_ipv4(ops, skb, in, out, okfn); } static struct nft_af_info nft_af_ipv4 __read_mostly = { @@ -43,7 +53,11 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = { .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, [NF_INET_LOCAL_OUT] = nft_ipv4_output, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, }, }; @@ -75,20 +89,6 @@ static struct pernet_operations nf_tables_ipv4_net_ops = { .exit = nf_tables_ipv4_exit_net, }; -static unsigned int -nft_do_chain_ipv4(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - - return nft_do_chain_pktinfo(&pkt, ops); -} - static struct nf_chain_type filter_ipv4 = { .family = NFPROTO_IPV4, .name = "filter", @@ -98,13 +98,6 @@ static struct nf_chain_type filter_ipv4 = { (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), - .fn = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_ipv4_output, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, }; static int __init nf_tables_ipv4_init(void) -- cgit v1.1 From 115a60b173af0170e0db26b9a3fd6a911fba70a3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 3 Jan 2014 12:16:15 +0000 Subject: netfilter: nf_tables: add support for multi family tables Add support to register chains to multiple hooks for different address families for mixed IPv4/IPv6 tables. Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_tables_arp.c | 1 + net/ipv4/netfilter/nf_tables_ipv4.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 31bb778..36d27fc 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -32,6 +32,7 @@ static struct nft_af_info nft_af_arp __read_mostly = { .family = NFPROTO_ARP, .nhooks = NF_ARP_NUMHOOKS, .owner = THIS_MODULE, + .nops = 1, .hooks = { [NF_ARP_IN] = nft_do_chain_arp, [NF_ARP_OUT] = nft_do_chain_arp, diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index ed7e15a..177c3bc 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -52,6 +52,7 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = { .family = NFPROTO_IPV4, .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, + .nops = 1, .hooks = { [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, [NF_INET_LOCAL_OUT] = nft_ipv4_output, -- cgit v1.1 From 1d49144c0aaa61be4e3ccbef9cc5c40b0ec5f2fe Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 3 Jan 2014 12:16:16 +0000 Subject: netfilter: nf_tables: add "inet" table for IPv4/IPv6 This patch adds a new table family and a new filter chain that you can use to attach IPv4 and IPv6 rules. This should help to simplify rule-set maintainance in dual-stack setups. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 177c3bc..da927dc 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -48,7 +48,7 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, return nft_do_chain_ipv4(ops, skb, in, out, okfn); } -static struct nft_af_info nft_af_ipv4 __read_mostly = { +struct nft_af_info nft_af_ipv4 __read_mostly = { .family = NFPROTO_IPV4, .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, @@ -61,6 +61,7 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = { [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, }, }; +EXPORT_SYMBOL_GPL(nft_af_ipv4); static int nf_tables_ipv4_init_net(struct net *net) { -- cgit v1.1 From 88ce65a71c39901494eb2f1393856bff8ba0158d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Jan 2014 18:42:35 +0000 Subject: netfilter: nf_tables: add missing module references to chain types In some cases we neither take a reference to the AF info nor to the chain type, allowing the module to be unloaded while in use. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_arp.c | 1 + net/ipv4/netfilter/nf_tables_ipv4.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 36d27fc..228df00 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -72,6 +72,7 @@ static struct nf_chain_type filter_arp = { .family = NFPROTO_ARP, .name = "filter", .type = NFT_CHAIN_T_DEFAULT, + .me = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index da927dc..d6fc1b4 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -95,6 +95,7 @@ static struct nf_chain_type filter_ipv4 = { .family = NFPROTO_IPV4, .name = "filter", .type = NFT_CHAIN_T_DEFAULT, + .me = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | -- cgit v1.1 From 2a37d755b885995443f11cdcaf1f9d4b5f246eab Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Jan 2014 18:42:37 +0000 Subject: netfilter: nf_tables: constify chain type definitions and pointers Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_arp.c | 2 +- net/ipv4/netfilter/nf_tables_ipv4.c | 2 +- net/ipv4/netfilter/nft_chain_nat_ipv4.c | 2 +- net/ipv4/netfilter/nft_chain_route_ipv4.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 228df00..8af01a5 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -68,7 +68,7 @@ static struct pernet_operations nf_tables_arp_net_ops = { .exit = nf_tables_arp_exit_net, }; -static struct nf_chain_type filter_arp = { +static const struct nf_chain_type filter_arp = { .family = NFPROTO_ARP, .name = "filter", .type = NFT_CHAIN_T_DEFAULT, diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index d6fc1b4..cec7805 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -91,7 +91,7 @@ static struct pernet_operations nf_tables_ipv4_net_ops = { .exit = nf_tables_ipv4_exit_net, }; -static struct nf_chain_type filter_ipv4 = { +static const struct nf_chain_type filter_ipv4 = { .family = NFPROTO_IPV4, .name = "filter", .type = NFT_CHAIN_T_DEFAULT, diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index cf2c792..9e535c2 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -164,7 +164,7 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops, return ret; } -static struct nf_chain_type nft_chain_nat_ipv4 = { +static const struct nf_chain_type nft_chain_nat_ipv4 = { .family = NFPROTO_IPV4, .name = "nat", .type = NFT_CHAIN_T_NAT, diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 4e6bf9a..2dd2eea 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -61,7 +61,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, return ret; } -static struct nf_chain_type nft_chain_route_ipv4 = { +static const struct nf_chain_type nft_chain_route_ipv4 = { .family = NFPROTO_IPV4, .name = "route", .type = NFT_CHAIN_T_ROUTE, -- cgit v1.1 From fa2c1de0bbd98985f7f930205de97ae0d3e86c16 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Jan 2014 18:42:38 +0000 Subject: netfilter: nf_tables: minor nf_chain_type cleanups Minor nf_chain_type cleanups: - reorder struct to plug a hoe - rename struct module member to "owner" for consistency - rename nf_hookfn array to "hooks" for consistency - reorder initializers for better readability Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_arp.c | 4 ++-- net/ipv4/netfilter/nf_tables_ipv4.c | 4 ++-- net/ipv4/netfilter/nft_chain_nat_ipv4.c | 6 +++--- net/ipv4/netfilter/nft_chain_route_ipv4.c | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 8af01a5..b90d16c 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -69,10 +69,10 @@ static struct pernet_operations nf_tables_arp_net_ops = { }; static const struct nf_chain_type filter_arp = { - .family = NFPROTO_ARP, .name = "filter", .type = NFT_CHAIN_T_DEFAULT, - .me = THIS_MODULE, + .family = NFPROTO_ARP, + .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index cec7805..66679fd 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -92,10 +92,10 @@ static struct pernet_operations nf_tables_ipv4_net_ops = { }; static const struct nf_chain_type filter_ipv4 = { - .family = NFPROTO_IPV4, .name = "filter", .type = NFT_CHAIN_T_DEFAULT, - .me = THIS_MODULE, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index 9e535c2..208d60a 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -165,20 +165,20 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops, } static const struct nf_chain_type nft_chain_nat_ipv4 = { - .family = NFPROTO_IPV4, .name = "nat", .type = NFT_CHAIN_T_NAT, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), - .fn = { + .hooks = { [NF_INET_PRE_ROUTING] = nf_nat_prerouting, [NF_INET_POST_ROUTING] = nf_nat_postrouting, [NF_INET_LOCAL_OUT] = nf_nat_output, [NF_INET_LOCAL_IN] = nf_nat_fn, }, - .me = THIS_MODULE, }; static int __init nft_chain_nat_init(void) diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 2dd2eea..67db1bb 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -62,14 +62,14 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, } static const struct nf_chain_type nft_chain_route_ipv4 = { - .family = NFPROTO_IPV4, .name = "route", .type = NFT_CHAIN_T_ROUTE, + .family = NFPROTO_IPV4, + .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_OUT), - .fn = { + .hooks = { [NF_INET_LOCAL_OUT] = nf_route_table_hook, }, - .me = THIS_MODULE, }; static int __init nft_chain_route_init(void) -- cgit v1.1 From 3876d22dba62ebf6582f33e1ef2160eeb95e1129 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Jan 2014 18:42:43 +0000 Subject: netfilter: nf_tables: rename nft_do_chain_pktinfo() to nft_do_chain() We don't encode argument types into function names and since besides nft_do_chain() there are only AF-specific versions, there is no risk of confusion. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_arp.c | 2 +- net/ipv4/netfilter/nf_tables_ipv4.c | 2 +- net/ipv4/netfilter/nft_chain_nat_ipv4.c | 2 +- net/ipv4/netfilter/nft_chain_route_ipv4.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index b90d16c..19412a4 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -25,7 +25,7 @@ nft_do_chain_arp(const struct nf_hook_ops *ops, nft_set_pktinfo(&pkt, ops, skb, in, out); - return nft_do_chain_pktinfo(&pkt, ops); + return nft_do_chain(&pkt, ops); } static struct nft_af_info nft_af_arp __read_mostly = { diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 66679fd..fec163a 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -28,7 +28,7 @@ static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops, nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - return nft_do_chain_pktinfo(&pkt, ops); + return nft_do_chain(&pkt, ops); } static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops, diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index 208d60a..b5b256d 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -75,7 +75,7 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops, nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out); - ret = nft_do_chain_pktinfo(&pkt, ops); + ret = nft_do_chain(&pkt, ops); if (ret != NF_ACCEPT) return ret; if (!nf_nat_initialized(ct, maniptype)) { diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 67db1bb..125b667 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -47,7 +47,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops, daddr = iph->daddr; tos = iph->tos; - ret = nft_do_chain_pktinfo(&pkt, ops); + ret = nft_do_chain(&pkt, ops); if (ret != NF_DROP && ret != NF_QUEUE) { iph = ip_hdr(skb); -- cgit v1.1 From d0eb1f7e66dd53355746cd6a8e7e56c465dc6cde Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 8 Jan 2014 21:59:30 +0800 Subject: ip_tunnel: fix sparse non static symbol warning Fixes the following sparse warning: net/ipv4/ip_tunnel.c:116:18: warning: symbol 'tunnel_dst_check' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 07a5ed3..d3929a6 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -113,7 +113,7 @@ static inline struct dst_entry *tunnel_dst_get(struct ip_tunnel *t) return dst; } -struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie) +static struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie) { struct dst_entry *dst = tunnel_dst_get(t); -- cgit v1.1 From cf4dfa85395ebe2769267a072b39e48301669842 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 9 Jan 2014 20:32:19 +0100 Subject: netfilter: nf_tables: fix error path in the init functions We have to unregister chain type if this fails to register netns. Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_ipv4.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index fec163a..6820c8c 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -105,8 +105,14 @@ static const struct nf_chain_type filter_ipv4 = { static int __init nf_tables_ipv4_init(void) { + int ret; + nft_register_chain_type(&filter_ipv4); - return register_pernet_subsys(&nf_tables_ipv4_net_ops); + ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); + if (ret < 0) + nft_unregister_chain_type(&filter_ipv4); + + return ret; } static void __exit nf_tables_ipv4_exit(void) -- cgit v1.1 From 324fd55a19827b7191cc6ab73865e30c0e6e6423 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Wed, 8 Jan 2014 16:05:55 +0100 Subject: tcp: metrics: rename tcpm_addr to tcpm_daddr As we will add also the source-address, we rename all accesses to the tcp-metrics address to use "daddr". Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 72 +++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0649373..f9b5f51 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -31,7 +31,7 @@ struct tcp_fastopen_metrics { struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; - struct inetpeer_addr tcpm_addr; + struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; u32 tcpm_ts; u32 tcpm_ts_stamp; @@ -131,7 +131,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, } static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, - struct inetpeer_addr *addr, + struct inetpeer_addr *daddr, unsigned int hash, bool reclaim) { @@ -155,7 +155,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (!tm) goto out_unlock; } - tm->tcpm_addr = *addr; + tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, true); @@ -189,7 +189,7 @@ static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, in return NULL; } -static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *daddr, struct net *net, unsigned int hash) { struct tcp_metrics_block *tm; @@ -197,7 +197,7 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *a for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, addr)) + if (addr_same(&tm->tcpm_daddr, daddr)) break; depth++; } @@ -208,19 +208,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr daddr; unsigned int hash; struct net *net; - addr.family = req->rsk_ops->family; - switch (addr.family) { + daddr.family = req->rsk_ops->family; + switch (daddr.family) { case AF_INET: - addr.addr.a4 = inet_rsk(req)->ir_rmt_addr; - hash = (__force unsigned int) addr.addr.a4; + daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; + hash = (__force unsigned int) daddr.addr.a4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; + *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -233,7 +233,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) + if (addr_same(&tm->tcpm_daddr, &daddr)) break; } tcpm_check_stamp(tm, dst); @@ -243,19 +243,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr daddr; unsigned int hash; struct net *net; - addr.family = tw->tw_family; - switch (addr.family) { + daddr.family = tw->tw_family; + switch (daddr.family) { case AF_INET: - addr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) addr.addr.a4; + daddr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) daddr.addr.a4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr; + *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; hash = ipv6_addr_hash(&tw->tw_v6_daddr); break; #endif @@ -268,7 +268,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) + if (addr_same(&tm->tcpm_daddr, &daddr)) break; } return tm; @@ -279,20 +279,20 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, bool create) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr daddr; unsigned int hash; struct net *net; bool reclaim; - addr.family = sk->sk_family; - switch (addr.family) { + daddr.family = sk->sk_family; + switch (daddr.family) { case AF_INET: - addr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) addr.addr.a4; + daddr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) daddr.addr.a4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr; + *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; hash = ipv6_addr_hash(&sk->sk_v6_daddr); break; #endif @@ -303,14 +303,14 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, net = dev_net(dst->dev); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); - tm = __tcp_get_metrics(&addr, net, hash); + tm = __tcp_get_metrics(&daddr, net, hash); reclaim = false; if (tm == TCP_METRICS_RECLAIM_PTR) { reclaim = true; tm = NULL; } if (!tm && create) - tm = tcpm_new(dst, &addr, hash, reclaim); + tm = tcpm_new(dst, &daddr, hash, reclaim); else tcpm_check_stamp(tm, dst); @@ -724,15 +724,15 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, struct nlattr *nest; int i; - switch (tm->tcpm_addr.family) { + switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_addr.addr.a4) < 0) + tm->tcpm_daddr.addr.a4) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, - tm->tcpm_addr.addr.a6) < 0) + tm->tcpm_daddr.addr.a6) < 0) goto nla_put_failure; break; default: @@ -882,14 +882,14 @@ static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct tcp_metrics_block *tm; - struct inetpeer_addr addr; + struct inetpeer_addr daddr; unsigned int hash; struct sk_buff *msg; struct net *net = genl_info_net(info); void *reply; int ret; - ret = parse_nl_addr(info, &addr, &hash, 0); + ret = parse_nl_addr(info, &daddr, &hash, 0); if (ret < 0) return ret; @@ -907,7 +907,7 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) rcu_read_lock(); for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_addr, &addr)) { + if (addr_same(&tm->tcpm_daddr, &daddr)) { ret = tcp_metrics_fill_info(msg, tm); break; } @@ -962,12 +962,12 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) struct tcpm_hash_bucket *hb; struct tcp_metrics_block *tm; struct tcp_metrics_block __rcu **pp; - struct inetpeer_addr addr; + struct inetpeer_addr daddr; unsigned int hash; struct net *net = genl_info_net(info); int ret; - ret = parse_nl_addr(info, &addr, &hash, 1); + ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; if (ret > 0) @@ -979,7 +979,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked_genl(*pp); tm; pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) { - if (addr_same(&tm->tcpm_addr, &addr)) { + if (addr_same(&tm->tcpm_daddr, &daddr)) { *pp = tm->tcpm_next; break; } -- cgit v1.1 From a544302820db12660b15de185b9e67c781a6b74e Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Wed, 8 Jan 2014 16:05:56 +0100 Subject: tcp: metrics: Add source-address to tcp-metrics We add the source-address to the tcp-metrics, so that different metrics will be used per source/destination-pair. We use the destination-hash to store the metric inside the hash-table. That way, deleting and dumping via "ip tcp_metrics" is easy. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f9b5f51..de32aa4 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -31,6 +31,7 @@ struct tcp_fastopen_metrics { struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; + struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; u32 tcpm_ts; @@ -131,6 +132,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, } static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, + struct inetpeer_addr *saddr, struct inetpeer_addr *daddr, unsigned int hash, bool reclaim) @@ -155,6 +157,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (!tm) goto out_unlock; } + tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, true); @@ -189,7 +192,8 @@ static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, in return NULL; } -static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *daddr, +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, + const struct inetpeer_addr *daddr, struct net *net, unsigned int hash) { struct tcp_metrics_block *tm; @@ -197,7 +201,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *d for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_daddr, daddr)) + if (addr_same(&tm->tcpm_saddr, saddr) && + addr_same(&tm->tcpm_daddr, daddr)) break; depth++; } @@ -208,18 +213,21 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; - struct inetpeer_addr daddr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; + saddr.family = req->rsk_ops->family; daddr.family = req->rsk_ops->family; switch (daddr.family) { case AF_INET: + saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; hash = (__force unsigned int) daddr.addr.a4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; @@ -233,7 +241,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_daddr, &daddr)) + if (addr_same(&tm->tcpm_saddr, &saddr) && + addr_same(&tm->tcpm_daddr, &daddr)) break; } tcpm_check_stamp(tm, dst); @@ -243,18 +252,21 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) { struct tcp_metrics_block *tm; - struct inetpeer_addr daddr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; + saddr.family = tw->tw_family; daddr.family = tw->tw_family; switch (daddr.family) { case AF_INET: + saddr.addr.a4 = tw->tw_rcv_saddr; daddr.addr.a4 = tw->tw_daddr; hash = (__force unsigned int) daddr.addr.a4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; hash = ipv6_addr_hash(&tw->tw_v6_daddr); break; @@ -268,7 +280,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_daddr, &daddr)) + if (addr_same(&tm->tcpm_saddr, &saddr) && + addr_same(&tm->tcpm_daddr, &daddr)) break; } return tm; @@ -279,19 +292,22 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, bool create) { struct tcp_metrics_block *tm; - struct inetpeer_addr daddr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; bool reclaim; + saddr.family = sk->sk_family; daddr.family = sk->sk_family; switch (daddr.family) { case AF_INET: + saddr.addr.a4 = inet_sk(sk)->inet_saddr; daddr.addr.a4 = inet_sk(sk)->inet_daddr; hash = (__force unsigned int) daddr.addr.a4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; hash = ipv6_addr_hash(&sk->sk_v6_daddr); break; @@ -303,14 +319,14 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, net = dev_net(dst->dev); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); - tm = __tcp_get_metrics(&daddr, net, hash); + tm = __tcp_get_metrics(&saddr, &daddr, net, hash); reclaim = false; if (tm == TCP_METRICS_RECLAIM_PTR) { reclaim = true; tm = NULL; } if (!tm && create) - tm = tcpm_new(dst, &daddr, hash, reclaim); + tm = tcpm_new(dst, &saddr, &daddr, hash, reclaim); else tcpm_check_stamp(tm, dst); -- cgit v1.1 From 8a59359cb80f448923a7bc9f555d477e74547d7a Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Wed, 8 Jan 2014 16:05:57 +0100 Subject: tcp: metrics: New netlink attribute for src IP and dumped in netlink reply This patch adds a new netlink attribute for the source-IP and appends it to the netlink reply. Now, iproute2 can have access to the source-IP. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index de32aa4..199659f 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -745,11 +745,17 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, tm->tcpm_daddr.addr.a4) < 0) goto nla_put_failure; + if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4, + tm->tcpm_saddr.addr.a4) < 0) + goto nla_put_failure; break; case AF_INET6: if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, tm->tcpm_daddr.addr.a6) < 0) goto nla_put_failure; + if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, + tm->tcpm_saddr.addr.a6) < 0) + goto nla_put_failure; break; default: return -EAFNOSUPPORT; -- cgit v1.1 From bbf852b96ebdc6d1be7a67143824523280bbcf44 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Wed, 8 Jan 2014 16:05:58 +0100 Subject: tcp: metrics: Delete all entries matching a certain destination As we now can have multiple entries per destination-IP, the "ip tcp_metrics delete address ADDRESS" command deletes all of them. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 199659f..e150f26 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -982,7 +982,7 @@ static int tcp_metrics_flush_all(struct net *net) static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct tcpm_hash_bucket *hb; - struct tcp_metrics_block *tm; + struct tcp_metrics_block *tm, *tmlist = NULL; struct tcp_metrics_block __rcu **pp; struct inetpeer_addr daddr; unsigned int hash; @@ -999,17 +999,22 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) hb = net->ipv4.tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); - for (tm = deref_locked_genl(*pp); tm; - pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) { + for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { if (addr_same(&tm->tcpm_daddr, &daddr)) { *pp = tm->tcpm_next; - break; + tm->tcpm_next = tmlist; + tmlist = tm; + } else { + pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); - if (!tm) + if (!tmlist) return -ESRCH; - kfree_rcu(tm, rcu_head); + for (tm = tmlist; tm; tm = tmlist) { + tmlist = tm->tcpm_next; + kfree_rcu(tm, rcu_head); + } return 0; } -- cgit v1.1 From 3e7013ddf55af7bc191792b8aea0c2b94fb0fef5 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Wed, 8 Jan 2014 16:05:59 +0100 Subject: tcp: metrics: Allow selective get/del of tcp-metrics based on src IP We want to be able to get/del tcp-metrics based on the src IP. This patch adds the necessary parsing of the netlink attribute and if the source address is set, it will match on this one too. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index e150f26..699a42f 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -877,44 +877,66 @@ done: return skb->len; } -static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, - unsigned int *hash, int optional) +static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional, int v4, int v6) { struct nlattr *a; - a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4]; + a = info->attrs[v4]; if (a) { addr->family = AF_INET; addr->addr.a4 = nla_get_be32(a); - *hash = (__force unsigned int) addr->addr.a4; + if (hash) + *hash = (__force unsigned int) addr->addr.a4; return 0; } - a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6]; + a = info->attrs[v6]; if (a) { if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; addr->family = AF_INET6; memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); - *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); + if (hash) + *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); return 0; } return optional ? 1 : -EAFNOSUPPORT; } +static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional) +{ + return __parse_nl_addr(info, addr, hash, optional, + TCP_METRICS_ATTR_ADDR_IPV4, + TCP_METRICS_ATTR_ADDR_IPV6); +} + +static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) +{ + return __parse_nl_addr(info, addr, NULL, 0, + TCP_METRICS_ATTR_SADDR_IPV4, + TCP_METRICS_ATTR_SADDR_IPV6); +} + static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct tcp_metrics_block *tm; - struct inetpeer_addr daddr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct sk_buff *msg; struct net *net = genl_info_net(info); void *reply; int ret; + bool src = true; ret = parse_nl_addr(info, &daddr, &hash, 0); if (ret < 0) return ret; + ret = parse_nl_saddr(info, &saddr); + if (ret < 0) + src = false; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -929,7 +951,8 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) rcu_read_lock(); for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_daddr, &daddr)) { + if (addr_same(&tm->tcpm_daddr, &daddr) && + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { ret = tcp_metrics_fill_info(msg, tm); break; } @@ -984,23 +1007,28 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) struct tcpm_hash_bucket *hb; struct tcp_metrics_block *tm, *tmlist = NULL; struct tcp_metrics_block __rcu **pp; - struct inetpeer_addr daddr; + struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net = genl_info_net(info); int ret; + bool src = true; ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; if (ret > 0) return tcp_metrics_flush_all(net); + ret = parse_nl_saddr(info, &saddr); + if (ret < 0) + src = false; hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hb = net->ipv4.tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { - if (addr_same(&tm->tcpm_daddr, &daddr)) { + if (addr_same(&tm->tcpm_daddr, &daddr) && + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { *pp = tm->tcpm_next; tm->tcpm_next = tmlist; tmlist = tm; -- cgit v1.1 From f87c10a8aa1e82498c42d0335524d6ae7cf5a52b Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 9 Jan 2014 10:01:15 +0100 Subject: ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing While forwarding we should not use the protocol path mtu to calculate the mtu for a forwarded packet but instead use the interface mtu. We mark forwarded skbs in ip_forward with IPSKB_FORWARDED, which was introduced for multicast forwarding. But as it does not conflict with our usage in unicast code path it is perfect for reuse. I moved the functions ip_sk_accept_pmtu, ip_sk_use_pmtu and ip_skb_dst_mtu along with the new ip_dst_mtu_maybe_forward to net/ip.h to fix circular dependencies because of IPSKB_FORWARDED. Because someone might have written a software which does probe destinations manually and expects the kernel to honour those path mtus I introduced a new per-namespace "ip_forward_use_pmtu" knob so someone can disable this new behaviour. We also still use mtus which are locked on a route for forwarding. The reason for this change is, that path mtus information can be injected into the kernel via e.g. icmp_err protocol handler without verification of local sockets. As such, this could cause the IPv4 forwarding path to wrongfully emit fragmentation needed notifications or start to fragment packets along a path. Tunnel and ipsec output paths clear IPCB again, thus IPSKB_FORWARDED won't be set and further fragmentation logic will use the path mtu to determine the fragmentation size. They also recheck packet size with help of path mtu discovery and report appropriate errors. Cc: Eric Dumazet Cc: David Miller Cc: John Heffner Cc: Steffen Klassert Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 7 +++++-- net/ipv4/ip_output.c | 8 +++++--- net/ipv4/route.c | 3 --- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 4 files changed, 17 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 694de3b..e9f1217 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -54,6 +54,7 @@ static int ip_forward_finish(struct sk_buff *skb) int ip_forward(struct sk_buff *skb) { + u32 mtu; struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options *opt = &(IPCB(skb)->opt); @@ -88,11 +89,13 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && + IPCB(skb)->flags |= IPSKB_FORWARDED; + mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); + if (unlikely(skb->len > mtu && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->dst))); + htonl(mtu)); goto drop; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index df18461..9a78804 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -449,6 +449,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); int err = 0; + bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; dev = rt->dst.dev; @@ -458,12 +459,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) iph = ip_hdr(skb); + mtu = ip_dst_mtu_maybe_forward(&rt->dst, forwarding); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || (IPCB(skb)->frag_max_size && - IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) { + IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(ip_skb_dst_mtu(skb))); + htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } @@ -473,7 +475,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ + mtu = mtu - hlen; /* Size of data space */ #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f8da282..25071b4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,9 +112,6 @@ #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) -/* IPv4 datagram length is stored into 16bit field (tot_len) */ -#define IP_MAX_MTU 0xFFFF - #define RT_GC_TIMEOUT (300*HZ) static int ip_rt_max_size; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1d2480a..44eba05 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -831,6 +831,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ip_forward_use_pmtu", + .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; -- cgit v1.1 From 8ed1dc44d3e9e8387a104b1ae8f92e9a3fbf1b1e Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 9 Jan 2014 10:01:17 +0100 Subject: ipv4: introduce hardened ip_no_pmtu_disc mode This new ip_no_pmtu_disc mode only allowes fragmentation-needed errors to be honored by protocols which do more stringent validation on the ICMP's packet payload. This knob is useful for people who e.g. want to run an unmodified DNS server in a namespace where they need to use pmtu for TCP connections (as they are used for zone transfers or fallback for requests) but don't want to use possibly spoofed UDP pmtu information. Currently the whitelisted protocols are TCP, SCTP and DCCP as they check if the returned packet is in the window or if the association is valid. Cc: Eric Dumazet Cc: David Miller Cc: John Heffner Suggested-by: Florian Weimer Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 1 + net/ipv4/icmp.c | 28 ++++++++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6268a47..ecd2c3f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1545,6 +1545,7 @@ static const struct net_protocol tcp_protocol = { .err_handler = tcp_v4_err, .no_policy = 1, .netns_ok = 1, + .icmp_strict_tag_validation = 1, }; static const struct net_protocol udp_protocol = { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fb3c563..0134663 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -668,6 +668,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) rcu_read_unlock(); } +static bool icmp_tag_validation(int proto) +{ + bool ok; + + rcu_read_lock(); + ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; + rcu_read_unlock(); + return ok; +} + /* * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. @@ -705,12 +715,22 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (net->ipv4.sysctl_ip_no_pmtu_disc == 2) { - goto out; - } else if (net->ipv4.sysctl_ip_no_pmtu_disc) { + /* for documentation of the ip_no_pmtu_disc + * values please see + * Documentation/networking/ip-sysctl.txt + */ + switch (net->ipv4.sysctl_ip_no_pmtu_disc) { + default: LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), &iph->daddr); - } else { + break; + case 2: + goto out; + case 3: + if (!icmp_tag_validation(iph->protocol)) + goto out; + /* fall through */ + case 0: info = ntohs(icmph->un.frag.mtu); if (!info) goto out; -- cgit v1.1 From d10dbad2aafacc7e1391595596e7c5138892dd93 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 9 Jan 2014 22:22:05 +0800 Subject: gre_offload: fix sparse non static symbol warning Fixes the following sparse warning: net/ipv4/gre_offload.c:253:5: warning: symbol 'gre_gro_complete' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 746a7b1..31da9f5 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -250,7 +250,7 @@ out: return pp; } -int gre_gro_complete(struct sk_buff *skb, int nhoff) +static int gre_gro_complete(struct sk_buff *skb, int nhoff) { struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); struct packet_offload *ptype; -- cgit v1.1 From b884b1a46f205d56e33c0391c1d04644d846f41e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 9 Jan 2014 20:47:17 -0500 Subject: gre_offload: simplify GRE header length calculation in gre_gso_segment() Simplify the GRE header length calculation in gre_gso_segment(). Switch to an approach that is simpler, faster, and more general. The new approach will continue to be correct even if we add support for the optional variable-length routing info that may be present in a GRE header. Signed-off-by: Neal Cardwell Cc: Eric Dumazet Cc: H.K. Jerry Chu Cc: Pravin B Shelar Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 31da9f5..29512e3 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -26,7 +26,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, { struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t enc_features; - int ghl = GRE_HEADER_SECTION; + int ghl; struct gre_base_hdr *greh; u16 mac_offset = skb->mac_header; int mac_len = skb->mac_len; @@ -49,15 +49,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); - if (greh->flags & GRE_KEY) - ghl += GRE_HEADER_SECTION; - if (greh->flags & GRE_SEQ) - ghl += GRE_HEADER_SECTION; - if (greh->flags & GRE_CSUM) { - ghl += GRE_HEADER_SECTION; - csum = true; - } else - csum = false; + ghl = skb_inner_network_header(skb) - skb_transport_header(skb); + if (unlikely(ghl < sizeof(*greh))) + goto out; + + csum = !!(greh->flags & GRE_CSUM); if (unlikely(!pskb_may_pull(skb, ghl))) goto out; -- cgit v1.1 From 72c1d3bdd5bf10a789608336ba0d61f1e44e4350 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 10 Jan 2014 16:09:45 -0800 Subject: ipv4: register igmp_notifier even when !CONFIG_PROC_FS We still need this notifier even when we don't config PROC_FS. It should be rare to have a kernel without PROC_FS, so just for completeness. Cc: Stephen Hemminger Cc: David S. Miller Cc: Patrick McHardy Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 8 ++++++-- net/ipv4/ip_output.c | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 84c4329..bcbf33e 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2762,6 +2762,7 @@ static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; +#endif static int igmp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -2785,8 +2786,9 @@ static struct notifier_block igmp_notifier = { .notifier_call = igmp_netdev_event, }; -int __init igmp_mc_proc_init(void) +int __init igmp_mc_init(void) { +#if defined(CONFIG_PROC_FS) int err; err = register_pernet_subsys(&igmp_net_ops); @@ -2800,5 +2802,7 @@ int __init igmp_mc_proc_init(void) reg_notif_fail: unregister_pernet_subsys(&igmp_net_ops); return err; -} +#else + return register_netdevice_notifier(&igmp_notifier); #endif +} diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9a78804..8971780 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1553,7 +1553,7 @@ void __init ip_init(void) ip_rt_init(); inet_initpeers(); -#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) - igmp_mc_proc_init(); +#if defined(CONFIG_IP_MULTICAST) + igmp_mc_init(); #endif } -- cgit v1.1 From 63862b5bef7349dd1137e4c70702c67d77565785 Mon Sep 17 00:00:00 2001 From: Aruna-Hewapathirane Date: Sat, 11 Jan 2014 07:15:59 -0500 Subject: net: replace macros net_random and net_srandom with direct calls to prandom This patch removes the net_random and net_srandom macros and replaces them with direct calls to the prandom ones. As new commits only seem to use prandom_u32 there is no use to keep them around. This change makes it easier to grep for users of prandom_u32. Signed-off-by: Aruna-Hewapathirane Suggested-by: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- net/ipv4/igmp.c | 6 +++--- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/udp.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 9809f7b..646023b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -464,7 +464,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { - net_srandom(ifa->ifa_local); + prandom_seed((__force u32) ifa->ifa_local); ifap = last_primary; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index bcbf33e..97e4d16 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -211,7 +211,7 @@ static void igmp_stop_timer(struct ip_mc_list *im) /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { - int tv = net_random() % max_delay; + int tv = prandom_u32() % max_delay; im->tm_running = 1; if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -220,7 +220,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = net_random() % in_dev->mr_maxdelay; + int tv = prandom_u32() % in_dev->mr_maxdelay; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) @@ -229,7 +229,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev) static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { - int tv = net_random() % delay; + int tv = prandom_u32() % delay; if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fc0e649..0d1e2cb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -109,7 +109,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) again: inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - smallest_rover = rover = net_random() % remaining + low; + smallest_rover = rover = prandom_u32() % remaining + low; smallest_size = -1; do { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 80f649f..3d3141f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -223,7 +223,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rand = net_random(); + rand = prandom_u32(); first = (((u64)rand * remaining) >> 32) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE -- cgit v1.1 From b53c7336007784e435fee830063d3262ae98cd31 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2014 16:31:46 -0800 Subject: tcp: do not export tcp_gso_segment() and tcp_gro_receive() tcp_gso_segment() and tcp_gro_receive() no longer need to be exported. IPv4 and IPv6 offloads are statically linked. Note that tcp_gro_complete() is still used by bnx2x, unfortunately. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_offload.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 771a395..b92b817 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -138,7 +138,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, out: return segs; } -EXPORT_SYMBOL(tcp_gso_segment); struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -235,7 +234,6 @@ out: return pp; } -EXPORT_SYMBOL(tcp_gro_receive); int tcp_gro_complete(struct sk_buff *skb) { -- cgit v1.1 From cf1722837246cdca199f7abfcd658bd4632c337b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 15 Jan 2014 11:19:55 -0500 Subject: net/ipv4: don't use module_init in non-modular gre_offload Recent commit 438e38fadca2f6e57eeecc08326c8a95758594d4 ("gre_offload: statically build GRE offloading support") added new module_init/module_exit calls to the gre_offload.c file. The file is obj-y and can't be anything other than built-in. Currently it can never be built modular, so using module_init as an alias for __initcall can be somewhat misleading. Fix this up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. We also make the inclusion explicit. Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of device_initcall directly in this change means that the runtime impact is zero -- it will remain at level 6 in initcall ordering. As for the module_exit, rather than replace it with __exitcall, we simply remove it, since it appears only UML does anything with those, and even for UML, there is no relevant cleanup to be done here. Cc: Eric Dumazet Signed-off-by: Paul Gortmaker Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 29512e3..f1d3228 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -11,6 +11,7 @@ */ #include +#include #include #include @@ -283,11 +284,4 @@ static int __init gre_offload_init(void) { return inet_add_offload(&gre_offload, IPPROTO_GRE); } - -static void __exit gre_offload_exit(void) -{ - inet_del_offload(&gre_offload, IPPROTO_GRE); -} - -module_init(gre_offload_init); -module_exit(gre_offload_exit); +device_initcall(gre_offload_init); -- cgit v1.1 From 1d13a96c74fc4802a775189ddb58bc6469ffdaa3 Mon Sep 17 00:00:00 2001 From: Florent Fourcot Date: Thu, 16 Jan 2014 17:21:22 +0100 Subject: ipv6: tcp: fix flowlabel value in ACK messages send from TIME_WAIT This patch is following the commit b903d324bee262 (ipv6: tcp: fix TCLASS value in ACK messages sent from TIME_WAIT). For the same reason than tclass, we have to store the flow label in the inet_timewait_sock to provide consistency of flow label on the last ACK. Signed-off-by: Florent Fourcot Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3aa9e32..7a436c5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -297,6 +297,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; + tw->tw_flowlabel = np->flow_label >> 12; tw->tw_ipv6only = np->ipv6only; } #endif -- cgit v1.1 From 6c7e7610ff6888ea15a901fbcb30c5d461816b34 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Jan 2014 16:41:19 -0800 Subject: ipv4: fix a dst leak in tunnels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch : 1) Remove a dst leak if DST_NOCACHE was set on dst Fix this by holding a reference only if dst really cached. 2) Remove a lockdep warning in __tunnel_dst_set() This was reported by Cong Wang. 3) Remove usage of a spinlock where xchg() is enough 4) Remove some spurious inline keywords. Let compiler decide for us. Fixes: 7d442fab0a67 ("ipv4: Cache dst in tunnels") Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Tom Herbert Cc: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d3929a6..432c28a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,27 +68,27 @@ static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, IP_TNL_HASH_BITS); } -static inline void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct dst_entry *dst) +static void __tunnel_dst_set(struct ip_tunnel_dst *idst, + struct dst_entry *dst) { struct dst_entry *old_dst; - if (dst && (dst->flags & DST_NOCACHE)) - dst = NULL; - - spin_lock_bh(&idst->lock); - old_dst = rcu_dereference(idst->dst); - rcu_assign_pointer(idst->dst, dst); + if (dst) { + if (dst->flags & DST_NOCACHE) + dst = NULL; + else + dst_clone(dst); + } + old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); dst_release(old_dst); - spin_unlock_bh(&idst->lock); } -static inline void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) +static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst) { __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst); } -static inline void tunnel_dst_reset(struct ip_tunnel *t) +static void tunnel_dst_reset(struct ip_tunnel *t) { tunnel_dst_set(t, NULL); } @@ -101,7 +101,7 @@ static void tunnel_dst_reset_all(struct ip_tunnel *t) __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); } -static inline struct dst_entry *tunnel_dst_get(struct ip_tunnel *t) +static struct dst_entry *tunnel_dst_get(struct ip_tunnel *t) { struct dst_entry *dst; @@ -413,7 +413,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, dst_clone(&rt->dst)); + tunnel_dst_set(tunnel, &rt->dst); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) @@ -668,7 +668,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, dst_clone(&rt->dst)); + tunnel_dst_set(tunnel, &rt->dst); } if (rt->dst.dev == dev) { @@ -1066,12 +1066,6 @@ int ip_tunnel_init(struct net_device *dev) return -ENOMEM; } - for_each_possible_cpu(i) { - struct ip_tunnel_dst *idst = per_cpu_ptr(tunnel->dst_cache, i); - idst-> dst = NULL; - spin_lock_init(&idst->lock); - } - err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { free_percpu(tunnel->dst_cache); -- cgit v1.1 From 342dfc306fb32155314dad277f3c3686b83fb9f1 Mon Sep 17 00:00:00 2001 From: Steffen Hurrle Date: Fri, 17 Jan 2014 22:53:15 +0100 Subject: net: add build-time checks for msg->msg_name size This is a follow-up patch to f3d3342602f8bc ("net: rework recvmsg handler msg_name and msg_namelen logic"). DECLARE_SOCKADDR validates that the structure we use for writing the name information to is not larger than the buffer which is reserved for msg->msg_name (which is 128 bytes). Also use DECLARE_SOCKADDR consistently in sendmsg code paths. Signed-off-by: Steffen Hurrle Suggested-by: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 3 +-- net/ipv4/ping.c | 7 +++---- net/ipv4/raw.c | 4 ++-- net/ipv4/udp.c | 4 ++-- 4 files changed, 8 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a9fc435..22f15eb 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -390,7 +390,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; struct sk_buff *skb, *skb2; - struct sockaddr_in *sin; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in offender; @@ -416,7 +416,6 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - sin = (struct sockaddr_in *)msg->msg_name; if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index cae5262..e09e883 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -700,7 +700,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m */ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) @@ -873,7 +873,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Copy the address and add cmsg data. */ if (family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); if (sin) { sin->sin_family = AF_INET; @@ -890,8 +890,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else if (family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *ip6 = ipv6_hdr(skb); - struct sockaddr_in6 *sin6 = - (struct sockaddr_in6 *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); if (sin6) { sin6->sin6_family = AF_INET6; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 81e6cfd..c04518f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -493,7 +493,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, */ if (msg->msg_namelen) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); err = -EINVAL; if (msg->msg_namelen < sizeof(*usin)) goto out; @@ -690,7 +690,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3d3141f..77bd16f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -902,7 +902,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * Get and verify the address. */ if (msg->msg_name) { - struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -1226,7 +1226,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; int peeked, off = 0; -- cgit v1.1 From 3acfa1e73c2a2cbf1fda7aef0c6c2c9281ce9db2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 18 Jan 2014 18:27:49 -0800 Subject: ipv4: be friend with drop monitor Replace some dev_kfree_skb() with kfree_skb() calls when we drop one skb, this might help bug tracking. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_tunnel.c | 4 ++-- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e560ef3..e7a92fd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -278,7 +278,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, return NETDEV_TX_OK; free_skb: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_dropped++; return NETDEV_TX_OK; @@ -301,7 +301,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, return NETDEV_TX_OK; free_skb: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_dropped++; return NETDEV_TX_OK; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 432c28a..83ede2e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -716,7 +716,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (skb_cow_head(skb, dev->needed_headroom)) { dev->stats.tx_dropped++; - dev_kfree_skb(skb); + kfree_skb(skb); return; } @@ -732,7 +732,7 @@ tx_error_icmp: #endif tx_error: dev->stats.tx_errors++; - dev_kfree_skb(skb); + kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_tunnel_xmit); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 0783200..48eafae 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -162,7 +162,7 @@ tx_error_icmp: dst_link_failure(skb); tx_error: dev->stats.tx_errors++; - dev_kfree_skb(skb); + kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fe3e9f7..812b183 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -228,7 +228,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_error: - dev_kfree_skb(skb); + kfree_skb(skb); out: dev->stats.tx_errors++; return NETDEV_TX_OK; -- cgit v1.1 From 4b261c75a99f29c93a0b6babfc180cdf566bd654 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 20 Jan 2014 03:43:08 +0100 Subject: ipv6: make IPV6_RECVPKTINFO work for ipv4 datagrams We currently don't report IPV6_RECVPKTINFO in cmsg access ancillary data for IPv4 datagrams on IPv6 sockets. This patch splits the ip6_datagram_recv_ctl into two functions, one which handles both protocol families, AF_INET and AF_INET6, while the ip6_datagram_recv_specific_ctl only handles IPv6 cmsg data. ip6_datagram_recv_*_ctl never reported back any errors, so we can make them return void. Also provide a helper for protocols which don't offer dual personality to further use ip6_datagram_recv_ctl, which is exported to modules. I needed to shuffle the code for ping around a bit to make it easier to implement dual personality for ping ipv6 sockets in future. Reported-by: Gert Doering Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 6 +++--- net/ipv4/ping.c | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 22f15eb..580dd96 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -56,7 +56,6 @@ /* * SOL_IP control messages. */ -#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb)) static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { @@ -1055,9 +1054,10 @@ e_inval: void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); + bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || + ipv6_sk_rxinfo(sk); - if ((inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) && - skb_rtable(skb)) { + if (prepare && skb_rtable(skb)) { pktinfo->ipi_ifindex = inet_iif(skb); pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e09e883..4a9e426 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -906,7 +906,12 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } if (inet6_sk(sk)->rxopt.all) - pingv6_ops.ip6_datagram_recv_ctl(sk, msg, skb); + pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); + if (skb->protocol == htons(ETH_P_IPV6) && + inet6_sk(sk)->rxopt.all) + pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); + else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags) + ip_cmsg_recv(msg, skb); #endif } else { BUG(); -- cgit v1.1 From 967680e02724afe9bb4bf0b0c20d0a55de315fcb Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Sun, 19 Jan 2014 16:43:42 +0800 Subject: ipv4: remove the useless argument from ip_tunnel_hash() Since commit c544193214("GRE: Refactor GRE tunneling code") introduced function ip_tunnel_hash(), the argument itn is no longer in use, so remove it. Signed-off-by: Duan Jiong Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 83ede2e..1886cd4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -61,8 +61,7 @@ #include #endif -static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn, - __be32 key, __be32 remote) +static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) { return hash_32((__force u32)key ^ (__force u32)remote, IP_TNL_HASH_BITS); @@ -204,7 +203,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; - hash = ip_tunnel_hash(itn, key, remote); + hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { @@ -236,7 +235,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, cand = t; } - hash = ip_tunnel_hash(itn, key, 0); + hash = ip_tunnel_hash(key, 0); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { @@ -292,7 +291,7 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, else remote = 0; - h = ip_tunnel_hash(itn, parms->i_key, remote); + h = ip_tunnel_hash(parms->i_key, remote); return &itn->tunnels[h]; } -- cgit v1.1 From 4d83e1773031c2b7bbcfeb3736ad20980785dd93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Pan=28=E6=BD=98=E5=8D=AB=E5=B9=B3=29?= Date: Sun, 19 Jan 2014 20:44:46 +0800 Subject: tcp: delete redundant calls of tcp_mtup_init() As tcp_rcv_state_process() has already calls tcp_mtup_init() for non-fastopen sock, we can delete the redundant calls of tcp_mtup_init() in tcp_{v4,v6}_syn_recv_sock(). Signed-off-by: Weiping Pan Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7297b56..3cf9765 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1668,7 +1668,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } sk_setup_caps(newsk, dst); - tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); if (tcp_sk(sk)->rx_opt.user_mss && -- cgit v1.1 From 82b276cd2b0bacd58e7c307bf8856925a68c4d14 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 20 Jan 2014 05:16:39 +0100 Subject: ipv6: protect protocols not handling ipv4 from v4 connection/bind attempts Some ipv6 protocols cannot handle ipv4 addresses, so we must not allow connecting and binding to them. sendmsg logic does already check msg->name for this but must trust already connected sockets which could be set up for connection to ipv4 address family. Per-socket flag ipv6only is of no use here, as it is under users control by setsockopt. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ping.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 4a9e426..2d11c09 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -320,6 +320,9 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr_len < sizeof(*addr)) return -EINVAL; + if (addr->sin6_family != AF_INET6) + return -EINVAL; + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); -- cgit v1.1 From b582ef0990d457f7ce8ccf827af51a575ca0b4a6 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 20 Jan 2014 13:59:19 +0200 Subject: net: Add GRO support for UDP encapsulating protocols Add GRO handlers for protocols that do UDP encapsulation, with the intent of being able to coalesce packets which encapsulate packets belonging to the same TCP session. For GRO purposes, the destination UDP port takes the role of the ether type field in the ethernet header or the next protocol in the IP header. The UDP GRO handler will only attempt to coalesce packets whose destination port is registered to have gro handler. Use a mark on the skb GRO CB data to disallow (flush) running the udp gro receive code twice on a packet. This solves the problem of udp encapsulated packets whose inner VM packet is udp and happen to carry a port which has registered offloads. Signed-off-by: Shlomo Pongratz Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 79c62bd..ee853c5 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -14,6 +14,15 @@ #include #include +static DEFINE_SPINLOCK(udp_offload_lock); +static struct udp_offload_priv *udp_offload_base __read_mostly; + +struct udp_offload_priv { + struct udp_offload *offload; + struct rcu_head rcu; + struct udp_offload_priv __rcu *next; +}; + static int udp4_ufo_send_check(struct sk_buff *skb) { if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -89,10 +98,144 @@ out: return segs; } +int udp_add_offload(struct udp_offload *uo) +{ + struct udp_offload_priv **head = &udp_offload_base; + struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_KERNEL); + + if (!new_offload) + return -ENOMEM; + + new_offload->offload = uo; + + spin_lock(&udp_offload_lock); + rcu_assign_pointer(new_offload->next, rcu_dereference(*head)); + rcu_assign_pointer(*head, rcu_dereference(new_offload)); + spin_unlock(&udp_offload_lock); + + return 0; +} +EXPORT_SYMBOL(udp_add_offload); + +static void udp_offload_free_routine(struct rcu_head *head) +{ + struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); + kfree(ou_priv); +} + +void udp_del_offload(struct udp_offload *uo) +{ + struct udp_offload_priv __rcu **head = &udp_offload_base; + struct udp_offload_priv *uo_priv; + + spin_lock(&udp_offload_lock); + + uo_priv = rcu_dereference(*head); + for (; uo_priv != NULL; + uo_priv = rcu_dereference(*head)) { + + if (uo_priv->offload == uo) { + rcu_assign_pointer(*head, rcu_dereference(uo_priv->next)); + goto unlock; + } + head = &uo_priv->next; + } + pr_warn("udp_del_offload: didn't find offload for port %d\n", htons(uo->port)); +unlock: + spin_unlock(&udp_offload_lock); + if (uo_priv != NULL) + call_rcu(&uo_priv->rcu, udp_offload_free_routine); +} +EXPORT_SYMBOL(udp_del_offload); + +static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct udp_offload_priv *uo_priv; + struct sk_buff *p, **pp = NULL; + struct udphdr *uh, *uh2; + unsigned int hlen, off; + int flush = 1; + + if (NAPI_GRO_CB(skb)->udp_mark || + (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) + goto out; + + /* mark that this skb passed once through the udp gro layer */ + NAPI_GRO_CB(skb)->udp_mark = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*uh); + uh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + uh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!uh)) + goto out; + } + + rcu_read_lock(); + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_receive) + goto unflush; + } + goto out_unlock; + +unflush: + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = (struct udphdr *)(p->data + off); + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ + pp = uo_priv->offload->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + return pp; +} + +static int udp_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct udp_offload_priv *uo_priv; + __be16 newlen = htons(skb->len - nhoff); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + int err = -ENOSYS; + + uh->len = newlen; + + rcu_read_lock(); + + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_complete) + break; + } + + if (uo_priv != NULL) + err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + + rcu_read_unlock(); + return err; +} + static const struct net_offload udpv4_offload = { .callbacks = { .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, + .gro_receive = udp_gro_receive, + .gro_complete = udp_gro_complete, }, }; -- cgit v1.1 From 00ca9c5b2b11d44eaf20a4b647efc999734323ec Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 21 Jan 2014 13:30:26 +0100 Subject: tcp: metrics: Fix rcu-race when deleting multiple entries In bbf852b96ebdc6d1 I introduced the tmlist, which allows to delete multiple entries from the cache that match a specified destination if no source-IP is specified. However, as the cache is an RCU-list, we should not create this tmlist, as it will change the tcpm_next pointer of the element that will be deleted and so a thread iterating over the cache's entries while holding the RCU-lock might get "redirected" to this tmlist. This patch fixes this, by reverting back to the old behavior prior to bbf852b96ebdc6d1, which means that we simply change the tcpm_next pointer of the previous element (pp) to jump over the one we are deleting. The difference is that we call kfree_rcu() directly on the cache entry, which allows us to delete multiple entries from the list. Fixes: bbf852b96ebdc6d1 (tcp: metrics: Delete all entries matching a certain destination) Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index fa95094..9ae48b4 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1019,13 +1019,13 @@ static int tcp_metrics_flush_all(struct net *net) static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct tcpm_hash_bucket *hb; - struct tcp_metrics_block *tm, *tmlist = NULL; + struct tcp_metrics_block *tm; struct tcp_metrics_block __rcu **pp; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net = genl_info_net(info); int ret; - bool src = true; + bool src = true, found = false; ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) @@ -1044,19 +1044,15 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr))) { *pp = tm->tcpm_next; - tm->tcpm_next = tmlist; - tmlist = tm; + kfree_rcu(tm, rcu_head); + found = true; } else { pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); - if (!tmlist) + if (!found) return -ESRCH; - for (tm = tmlist; tm; tm = tmlist) { - tmlist = tm->tcpm_next; - kfree_rcu(tm, rcu_head); - } return 0; } -- cgit v1.1 From 906e073f3e842877b59d669b25aa76f65ba775b3 Mon Sep 17 00:00:00 2001 From: viresh kumar Date: Wed, 22 Jan 2014 12:23:32 +0530 Subject: net/ipv4: queue work on power efficient wq Workqueue used in ipv4 layer have no real dependency of scheduling these on the cpu which scheduled them. On a idle system, it is observed that an idle cpu wakes up many times just to service this work. It would be better if we can schedule it on a cpu which the scheduler believes to be the most appropriate one. This patch replaces normal workqueues with power efficient versions. This doesn't change existing behavior of code unless CONFIG_WQ_POWER_EFFICIENT is enabled. Signed-off-by: Viresh Kumar Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 646023b..ac2dff3 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -474,7 +474,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&check_lifetime_work); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that @@ -684,7 +684,8 @@ static void check_lifetime(struct work_struct *work) if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; - schedule_delayed_work(&check_lifetime_work, next_sched - now); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, + next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, @@ -842,7 +843,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) ifa = ifa_existing; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&check_lifetime_work); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, + &check_lifetime_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); } @@ -2322,7 +2324,7 @@ void __init devinet_init(void) register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); - schedule_delayed_work(&check_lifetime_work, 0); + queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); rtnl_af_register(&inet_af_ops); -- cgit v1.1 From 3ad88cf70af79e6f19c4f89dd85453ba4fdf425e Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Wed, 22 Jan 2014 13:58:44 +0100 Subject: tcp: metrics: Handle v6/v4-mapped sockets in tcp-metrics A socket may be v6/v4-mapped. In that case sk->sk_family is AF_INET6, but the IP being used is actually an IPv4-address. Current's tcp-metrics will thus represent it as an IPv6-address: root@server:~# ip tcp_metrics ::ffff:10.1.1.2 age 22.920sec rtt 18750us rttvar 15000us cwnd 10 10.1.1.2 age 47.970sec rtt 16250us rttvar 10000us cwnd 10 This patch modifies the tcp-metrics so that they are able to handle the v6/v4-mapped sockets correctly. Signed-off-by: Christoph Paasch Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 64 +++++++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9ae48b4..d547075 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -274,24 +274,32 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock unsigned int hash; struct net *net; - saddr.family = tw->tw_family; - daddr.family = tw->tw_family; - switch (daddr.family) { - case AF_INET: + if (tw->tw_family == AF_INET) { + saddr.family = AF_INET; saddr.addr.a4 = tw->tw_rcv_saddr; + daddr.family = AF_INET; daddr.addr.a4 = tw->tw_daddr; hash = (__force unsigned int) daddr.addr.a4; - break; + } #if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; - *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; - hash = ipv6_addr_hash(&tw->tw_v6_daddr); - break; + else if (tw->tw_family == AF_INET6) { + if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { + saddr.family = AF_INET; + saddr.addr.a4 = tw->tw_rcv_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = tw->tw_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } else { + saddr.family = AF_INET6; + *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; + daddr.family = AF_INET6; + *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; + hash = ipv6_addr_hash(&tw->tw_v6_daddr); + } + } #endif - default: + else return NULL; - } net = twsk_net(tw); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); @@ -314,24 +322,32 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, unsigned int hash; struct net *net; - saddr.family = sk->sk_family; - daddr.family = sk->sk_family; - switch (daddr.family) { - case AF_INET: + if (sk->sk_family == AF_INET) { + saddr.family = AF_INET; saddr.addr.a4 = inet_sk(sk)->inet_saddr; + daddr.family = AF_INET; daddr.addr.a4 = inet_sk(sk)->inet_daddr; hash = (__force unsigned int) daddr.addr.a4; - break; + } #if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; - *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; - hash = ipv6_addr_hash(&sk->sk_v6_daddr); - break; + else if (sk->sk_family == AF_INET6) { + if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { + saddr.family = AF_INET; + saddr.addr.a4 = inet_sk(sk)->inet_saddr; + daddr.family = AF_INET; + daddr.addr.a4 = inet_sk(sk)->inet_daddr; + hash = (__force unsigned int) daddr.addr.a4; + } else { + saddr.family = AF_INET6; + *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; + daddr.family = AF_INET6; + *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; + hash = ipv6_addr_hash(&sk->sk_v6_daddr); + } + } #endif - default: + else return NULL; - } net = dev_net(dst->dev); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); -- cgit v1.1 From a1d0cd8ed5d315a1f2e9dec54920b73b9b30147b Mon Sep 17 00:00:00 2001 From: Shlomo Pongratz Date: Wed, 22 Jan 2014 15:23:29 +0200 Subject: net/udp_offload: Handle static checker complaints Fixed few issues around using __rcu prefix and rcu_assign_pointer, also fixed a warning print to use ntohs(port) and not htons(port). net/ipv4/udp_offload.c:112:9: error: incompatible types in comparison expression (different address spaces) net/ipv4/udp_offload.c:113:9: error: incompatible types in comparison expression (different address spaces) net/ipv4/udp_offload.c:176:19: error: incompatible types in comparison expression (different address spaces) Signed-off-by: Shlomo Pongratz Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index ee853c5..25f5cee 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -15,7 +15,7 @@ #include static DEFINE_SPINLOCK(udp_offload_lock); -static struct udp_offload_priv *udp_offload_base __read_mostly; +static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; struct udp_offload_priv { struct udp_offload *offload; @@ -100,7 +100,7 @@ out: int udp_add_offload(struct udp_offload *uo) { - struct udp_offload_priv **head = &udp_offload_base; + struct udp_offload_priv __rcu **head = &udp_offload_base; struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_KERNEL); if (!new_offload) @@ -110,7 +110,7 @@ int udp_add_offload(struct udp_offload *uo) spin_lock(&udp_offload_lock); rcu_assign_pointer(new_offload->next, rcu_dereference(*head)); - rcu_assign_pointer(*head, rcu_dereference(new_offload)); + rcu_assign_pointer(*head, new_offload); spin_unlock(&udp_offload_lock); return 0; @@ -140,7 +140,7 @@ void udp_del_offload(struct udp_offload *uo) } head = &uo_priv->next; } - pr_warn("udp_del_offload: didn't find offload for port %d\n", htons(uo->port)); + pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); unlock: spin_unlock(&udp_offload_lock); if (uo_priv != NULL) -- cgit v1.1 From 11c21a307d79ea5f6b6fc0d3dfdeda271e5e65f6 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Thu, 23 Jan 2014 14:00:25 +0800 Subject: ip_tunnel: clear IPCB in ip_tunnel_xmit() in case dst_link_failure() is called commit a622260254ee48("ip_tunnel: fix kernel panic with icmp_dest_unreach") clear IPCB in ip_tunnel_xmit() , or else skb->cb[] may contain garbage from GSO segmentation layer. But commit 0e6fbc5b6c621("ip_tunnels: extend iptunnel_xmit()") refactor codes, and it clear IPCB behind the dst_link_failure(). So clear IPCB in ip_tunnel_xmit() just like commti a622260254ee48("ip_tunnel: fix kernel panic with icmp_dest_unreach"). Signed-off-by: Duan Jiong Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 1886cd4..c0e3cb7 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -686,6 +686,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst_link_failure(skb); } else tunnel->err_count = 0; -- cgit v1.1 From a0065f266a9b5d51575535a25c15ccbeed9a9966 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 23 Jan 2014 10:19:34 +0100 Subject: fib_frontend: fix possible NULL pointer dereference The two commits 0115e8e30d (net: remove delay at device dismantle) and 748e2d9396a (net: reinstate rtnl in call_netdevice_notifiers()) silently removed a NULL pointer check for in_dev since Linux 3.7. This patch re-introduces this check as it causes crashing the kernel when setting small mtu values on non-ip capable netdevices. Signed-off-by: Oliver Hartkopp Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d846304..c7539e2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1047,6 +1047,8 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo } in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) + return NOTIFY_DONE; switch (event) { case NETDEV_UP: -- cgit v1.1