diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-11 14:27:06 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-12-11 14:27:06 -0800 |
commit | 70e71ca0af244f48a5dcf56dc435243792e3a495 (patch) | |
tree | f7d9c4c4d9a857a00043e9bf6aa2d6f533a34778 /net/ipv4 | |
parent | bae41e45b7400496b9bf0c70c6004419d9987819 (diff) | |
parent | 00c83b01d58068dfeb2e1351cca6fccf2a83fa8f (diff) | |
download | op-kernel-dev-70e71ca0af244f48a5dcf56dc435243792e3a495.zip op-kernel-dev-70e71ca0af244f48a5dcf56dc435243792e3a495.tar.gz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) New offloading infrastructure and example 'rocker' driver for
offloading of switching and routing to hardware.
This work was done by a large group of dedicated individuals, not
limited to: Scott Feldman, Jiri Pirko, Thomas Graf, John Fastabend,
Jamal Hadi Salim, Andy Gospodarek, Florian Fainelli, Roopa Prabhu
2) Start making the networking operate on IOV iterators instead of
modifying iov objects in-situ during transfers. Thanks to Al Viro
and Herbert Xu.
3) A set of new netlink interfaces for the TIPC stack, from Richard
Alpe.
4) Remove unnecessary looping during ipv6 routing lookups, from Martin
KaFai Lau.
5) Add PAUSE frame generation support to gianfar driver, from Matei
Pavaluca.
6) Allow for larger reordering levels in TCP, which are easily
achievable in the real world right now, from Eric Dumazet.
7) Add a variable of napi_schedule that doesn't need to disable cpu
interrupts, from Eric Dumazet.
8) Use a doubly linked list to optimize neigh_parms_release(), from
Nicolas Dichtel.
9) Various enhancements to the kernel BPF verifier, and allow eBPF
programs to actually be attached to sockets. From Alexei
Starovoitov.
10) Support TSO/LSO in sunvnet driver, from David L Stevens.
11) Allow controlling ECN usage via routing metrics, from Florian
Westphal.
12) Remote checksum offload, from Tom Herbert.
13) Add split-header receive, BQL, and xmit_more support to amd-xgbe
driver, from Thomas Lendacky.
14) Add MPLS support to openvswitch, from Simon Horman.
15) Support wildcard tunnel endpoints in ipv6 tunnels, from Steffen
Klassert.
16) Do gro flushes on a per-device basis using a timer, from Eric
Dumazet. This tries to resolve the conflicting goals between the
desired handling of bulk vs. RPC-like traffic.
17) Allow userspace to ask for the CPU upon what a packet was
received/steered, via SO_INCOMING_CPU. From Eric Dumazet.
18) Limit GSO packets to half the current congestion window, from Eric
Dumazet.
19) Add a generic helper so that all drivers set their RSS keys in a
consistent way, from Eric Dumazet.
20) Add xmit_more support to enic driver, from Govindarajulu
Varadarajan.
21) Add VLAN packet scheduler action, from Jiri Pirko.
22) Support configurable RSS hash functions via ethtool, from Eyal
Perry.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1820 commits)
Fix race condition between vxlan_sock_add and vxlan_sock_release
net/macb: fix compilation warning for print_hex_dump() called with skb->mac_header
net/mlx4: Add support for A0 steering
net/mlx4: Refactor QUERY_PORT
net/mlx4_core: Add explicit error message when rule doesn't meet configuration
net/mlx4: Add A0 hybrid steering
net/mlx4: Add mlx4_bitmap zone allocator
net/mlx4: Add a check if there are too many reserved QPs
net/mlx4: Change QP allocation scheme
net/mlx4_core: Use tasklet for user-space CQ completion events
net/mlx4_core: Mask out host side virtualization features for guests
net/mlx4_en: Set csum level for encapsulated packets
be2net: Export tunnel offloads only when a VxLAN tunnel is created
gianfar: Fix dma check map error when DMA_API_DEBUG is enabled
cxgb4/csiostor: Don't use MASTER_MUST for fw_hello call
net: fec: only enable mdio interrupt before phy device link up
net: fec: clear all interrupt events to support i.MX6SX
net: fec: reset fep link status in suspend function
net: sock: fix access via invalid file descriptor
net: introduce helper macro for_each_cmsghdr
...
Diffstat (limited to 'net/ipv4')
40 files changed, 1175 insertions, 561 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e682b48..bd29016 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -322,6 +322,15 @@ config NET_FOU network mechanisms and optimizations for UDP (such as ECMP and RSS) can be leveraged to provide better service. +config NET_FOU_IP_TUNNELS + bool "IP: FOU encapsulation of IP tunnels" + depends on NET_IPIP || NET_IPGRE || IPV6_SIT + select NET_FOU + ---help--- + Allow configuration of FOU or GUE encapsulation for IP tunnels. + When this option is enabled IP tunnels can be configured to use + FOU or GUE encapsulation. + config GENEVE tristate "Generic Network Virtualization Encapsulation (Geneve)" depends on INET diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e67da4e..a44773c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1222,7 +1222,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_MPLS | + SKB_GSO_TUNNEL_REMCSUM | 0))) goto out; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 16acb59..205e147 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1292,7 +1292,7 @@ static int arp_proc_init(void); void __init arp_init(void) { - neigh_table_init(&arp_tbl); + neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); arp_proc_init(); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 4715f25..5160c71 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -50,7 +50,7 @@ #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> -#include <asm/bug.h> +#include <linux/bug.h> #include <asm/unaligned.h> /* List of available DOI definitions */ @@ -72,6 +72,7 @@ struct cipso_v4_map_cache_bkt { u32 size; struct list_head list; }; + struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; @@ -82,7 +83,8 @@ struct cipso_v4_map_cache_entry { u32 activity; struct list_head list; }; -static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL; + +static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt = 0; @@ -539,7 +541,7 @@ doi_add_return: /** * cipso_v4_doi_free - Frees a DOI definition - * @entry: the entry's RCU field + * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 360b565..60173d4 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -392,8 +392,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (elen <= 0) goto out; - if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + err = skb_cow_data(skb, 0, &trailer); + if (err < 0) goto out; + nfrags = err; assoclen = sizeof(*esph); @@ -601,12 +603,12 @@ static int esp_init_authenc(struct xfrm_state *x) BUG_ON(!aalg_desc); err = -EINVAL; - if (aalg_desc->uinfo.auth.icv_fullbits/8 != + if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_aead_authsize(aead), - aalg_desc->uinfo.auth.icv_fullbits/8); + pr_info("ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits / 8); goto free_key; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 606c520..b986298 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -38,21 +38,17 @@ static inline struct fou *fou_from_sock(struct sock *sk) return sk->sk_user_data; } -static int fou_udp_encap_recv_deliver(struct sk_buff *skb, - u8 protocol, size_t len) +static void fou_recv_pull(struct sk_buff *skb, size_t len) { struct iphdr *iph = ip_hdr(skb); /* Remove 'len' bytes from the packet (UDP header and - * FOU header if present), modify the protocol to the one - * we found, and then call rcv_encap. + * FOU header if present). */ iph->tot_len = htons(ntohs(iph->tot_len) - len); __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); - - return -protocol; } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) @@ -62,16 +58,56 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) if (!fou) return 1; - return fou_udp_encap_recv_deliver(skb, fou->protocol, - sizeof(struct udphdr)); + fou_recv_pull(skb, sizeof(struct udphdr)); + + return -fou->protocol; +} + +static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, + void *data, size_t hdrlen, u8 ipproto) +{ + __be16 *pd = data; + size_t start = ntohs(pd[0]); + size_t offset = ntohs(pd[1]); + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + __wsum delta; + + if (skb->remcsum_offload) { + /* Already processed in GRO path */ + skb->remcsum_offload = 0; + return guehdr; + } + + if (!pskb_may_pull(skb, plen)) + return NULL; + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) + __skb_checksum_complete(skb); + + delta = remcsum_adjust((void *)guehdr + hdrlen, + skb->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + + return guehdr; +} + +static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr) +{ + /* No support yet */ + kfree_skb(skb); + return 0; } static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); - size_t len; + size_t len, optlen, hdrlen; struct guehdr *guehdr; - struct udphdr *uh; + void *data; + u16 doffset = 0; if (!fou) return 1; @@ -80,25 +116,58 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (!pskb_may_pull(skb, len)) goto drop; - uh = udp_hdr(skb); - guehdr = (struct guehdr *)&uh[1]; + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + optlen = guehdr->hlen << 2; + len += optlen; - len += guehdr->hlen << 2; if (!pskb_may_pull(skb, len)) goto drop; - uh = udp_hdr(skb); - guehdr = (struct guehdr *)&uh[1]; + /* guehdr may change after pull */ + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - if (guehdr->version != 0) - goto drop; + hdrlen = sizeof(struct guehdr) + optlen; - if (guehdr->flags) { - /* No support yet */ + if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) goto drop; + + hdrlen = sizeof(struct guehdr) + optlen; + + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + + /* Pull csum through the guehdr now . This can be used if + * there is a remote checksum offload. + */ + skb_postpull_rcsum(skb, udp_hdr(skb), len); + + data = &guehdr[1]; + + if (guehdr->flags & GUE_FLAG_PRIV) { + __be32 flags = *(__be32 *)(data + doffset); + + doffset += GUE_LEN_PRIV; + + if (flags & GUE_PFLAG_REMCSUM) { + guehdr = gue_remcsum(skb, guehdr, data + doffset, + hdrlen, guehdr->proto_ctype); + if (!guehdr) + goto drop; + + data = &guehdr[1]; + + doffset += GUE_PLEN_REMCSUM; + } } - return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); + if (unlikely(guehdr->control)) + return gue_control_message(skb, guehdr); + + __skb_pull(skb, sizeof(struct udphdr) + hdrlen); + skb_reset_transport_header(skb); + + return -guehdr->proto_ctype; + drop: kfree_skb(skb); return 0; @@ -149,6 +218,41 @@ out_unlock: return err; } +static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, + struct guehdr *guehdr, void *data, + size_t hdrlen, u8 ipproto) +{ + __be16 *pd = data; + size_t start = ntohs(pd[0]); + size_t offset = ntohs(pd[1]); + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + __wsum delta; + + if (skb->remcsum_offload) + return guehdr; + + if (!NAPI_GRO_CB(skb)->csum_valid) + return NULL; + + /* Pull checksum that will be written */ + if (skb_gro_header_hard(skb, off + plen)) { + guehdr = skb_gro_header_slow(skb, off + plen, off); + if (!guehdr) + return NULL; + } + + delta = remcsum_adjust((void *)guehdr + hdrlen, + NAPI_GRO_CB(skb)->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + + skb->remcsum_offload = 1; + + return guehdr; +} + static struct sk_buff **gue_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -156,38 +260,64 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, const struct net_offload *ops; struct sk_buff **pp = NULL; struct sk_buff *p; - u8 proto; struct guehdr *guehdr; - unsigned int hlen, guehlen; - unsigned int off; + size_t len, optlen, hdrlen, off; + void *data; + u16 doffset = 0; int flush = 1; off = skb_gro_offset(skb); - hlen = off + sizeof(*guehdr); + len = off + sizeof(*guehdr); + guehdr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - guehdr = skb_gro_header_slow(skb, hlen, off); + if (skb_gro_header_hard(skb, len)) { + guehdr = skb_gro_header_slow(skb, len, off); if (unlikely(!guehdr)) goto out; } - proto = guehdr->next_hdr; + optlen = guehdr->hlen << 2; + len += optlen; - rcu_read_lock(); - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); - if (WARN_ON(!ops || !ops->callbacks.gro_receive)) - goto out_unlock; + if (skb_gro_header_hard(skb, len)) { + guehdr = skb_gro_header_slow(skb, len, off); + if (unlikely(!guehdr)) + goto out; + } - guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + if (unlikely(guehdr->control) || guehdr->version != 0 || + validate_gue_flags(guehdr, optlen)) + goto out; - hlen = off + guehlen; - if (skb_gro_header_hard(skb, hlen)) { - guehdr = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!guehdr)) - goto out_unlock; + hdrlen = sizeof(*guehdr) + optlen; + + /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr, + * this is needed if there is a remote checkcsum offload. + */ + skb_gro_postpull_rcsum(skb, guehdr, hdrlen); + + data = &guehdr[1]; + + if (guehdr->flags & GUE_FLAG_PRIV) { + __be32 flags = *(__be32 *)(data + doffset); + + doffset += GUE_LEN_PRIV; + + if (flags & GUE_PFLAG_REMCSUM) { + guehdr = gue_gro_remcsum(skb, off, guehdr, + data + doffset, hdrlen, + guehdr->proto_ctype); + if (!guehdr) + goto out; + + data = &guehdr[1]; + + doffset += GUE_PLEN_REMCSUM; + } } + skb_gro_pull(skb, hdrlen); + flush = 0; for (p = *head; p; p = p->next) { @@ -199,7 +329,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, guehdr2 = (struct guehdr *)(p->data + off); /* Compare base GUE header to be equal (covers - * hlen, version, next_hdr, and flags. + * hlen, version, proto_ctype, and flags. */ if (guehdr->word != guehdr2->word) { NAPI_GRO_CB(p)->same_flow = 0; @@ -214,10 +344,11 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, } } - skb_gro_pull(skb, guehlen); - - /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ - skb_gro_postpull_rcsum(skb, guehdr, guehlen); + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[guehdr->proto_ctype]); + if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); @@ -238,7 +369,7 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff) u8 proto; int err = -ENOENT; - proto = guehdr->next_hdr; + proto = guehdr->proto_ctype; guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); @@ -489,6 +620,200 @@ static const struct genl_ops fou_nl_ops[] = { }, }; +size_t fou_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct udphdr); +} +EXPORT_SYMBOL(fou_encap_hlen); + +size_t gue_encap_hlen(struct ip_tunnel_encap *e) +{ + size_t len; + bool need_priv = false; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + + if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { + len += GUE_PLEN_REMCSUM; + need_priv = true; + } + + len += need_priv ? GUE_LEN_PRIV : 0; + + return len; +} +EXPORT_SYMBOL(gue_encap_hlen); + +static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi4 *fl4, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + uh->check = 0; + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + __be16 sport; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(fou_build_header); + +int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + struct guehdr *guehdr; + size_t hdrlen, optlen = 0; + __be16 sport; + void *data; + bool need_priv = false; + + if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + csum = false; + optlen += GUE_PLEN_REMCSUM; + type |= SKB_GSO_TUNNEL_REMCSUM; + need_priv = true; + } + + optlen += need_priv ? GUE_LEN_PRIV : 0; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Get source port (based on flow hash) before skb_push */ + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); + + guehdr = (struct guehdr *)skb->data; + + guehdr->control = 0; + guehdr->version = 0; + guehdr->hlen = optlen >> 2; + guehdr->flags = 0; + guehdr->proto_ctype = *protocol; + + data = &guehdr[1]; + + if (need_priv) { + __be32 *flags = data; + + guehdr->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd = data; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + + } + + fou_build_udp(skb, e, fl4, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(gue_build_header); + +#ifdef CONFIG_NET_FOU_IP_TUNNELS + +static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = { + .encap_hlen = fou_encap_hlen, + .build_header = fou_build_header, +}; + +static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = { + .encap_hlen = gue_encap_hlen, + .build_header = gue_build_header, +}; + +static int ip_tunnel_encap_add_fou_ops(void) +{ + int ret; + + ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + if (ret < 0) { + pr_err("can't add fou ops\n"); + return ret; + } + + ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); + if (ret < 0) { + pr_err("can't add gue ops\n"); + ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + return ret; + } + + return 0; +} + +static void ip_tunnel_encap_del_fou_ops(void) +{ + ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); + ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); +} + +#else + +static int ip_tunnel_encap_add_fou_ops(void) +{ + return 0; +} + +static void ip_tunnel_encap_del_fou_ops(void) +{ +} + +#endif + static int __init fou_init(void) { int ret; @@ -496,6 +821,14 @@ static int __init fou_init(void) ret = genl_register_family_with_ops(&fou_nl_family, fou_nl_ops); + if (ret < 0) + goto exit; + + ret = ip_tunnel_encap_add_fou_ops(); + if (ret < 0) + genl_unregister_family(&fou_nl_family); + +exit: return ret; } @@ -503,6 +836,8 @@ static void __exit fou_fini(void) { struct fou *fou, *next; + ip_tunnel_encap_del_fou_ops(); + genl_unregister_family(&fou_nl_family); /* Close all the FOU sockets */ diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index dedb21e..a457232 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -104,7 +104,7 @@ static void geneve_build_header(struct genevehdr *geneveh, memcpy(geneveh->options, options, options_len); } -/* Transmit a fully formated Geneve frame. +/* Transmit a fully formatted Geneve frame. * * When calling this function. The skb->data should point * to the geneve header which is fully formed. @@ -131,15 +131,9 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, if (unlikely(err)) return err; - if (vlan_tx_tag_present(skb)) { - if (unlikely(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) { - err = -ENOMEM; - return err; - } - skb->vlan_tci = 0; - } + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) + return -ENOMEM; gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index bb5947b..51973dd 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -247,6 +247,9 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff) err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); rcu_read_unlock(); + + skb_set_inner_mac_header(skb, nhoff + grehlen); + return err; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5882f58..36f5584 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert); */ struct icmp_control { - void (*handler)(struct sk_buff *skb); + bool (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; @@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto) * ICMP_PARAMETERPROB. */ -static void icmp_unreach(struct sk_buff *skb) +static bool icmp_unreach(struct sk_buff *skb) { const struct iphdr *iph; struct icmphdr *icmph; @@ -784,8 +784,8 @@ static void icmp_unreach(struct sk_buff *skb) */ switch (net->ipv4.sysctl_ip_no_pmtu_disc) { default: - LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), - &iph->daddr); + net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n", + &iph->daddr); break; case 2: goto out; @@ -798,8 +798,8 @@ static void icmp_unreach(struct sk_buff *skb) } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"), - &iph->daddr); + net_dbg_ratelimited("%pI4: Source Route Failed\n", + &iph->daddr); break; default: break; @@ -839,10 +839,10 @@ static void icmp_unreach(struct sk_buff *skb) icmp_socket_deliver(skb, info); out: - return; + return true; out_err: ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); - goto out; + return false; } @@ -850,17 +850,20 @@ out_err: * Handle ICMP_REDIRECT. */ -static void icmp_redirect(struct sk_buff *skb) +static bool icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); - return; + return false; } - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) { + /* there aught to be a stat */ + return false; + } icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); + return true; } /* @@ -875,7 +878,7 @@ static void icmp_redirect(struct sk_buff *skb) * See also WRT handling of options once they are done and working. */ -static void icmp_echo(struct sk_buff *skb) +static bool icmp_echo(struct sk_buff *skb) { struct net *net; @@ -891,6 +894,8 @@ static void icmp_echo(struct sk_buff *skb) icmp_param.head_len = sizeof(struct icmphdr); icmp_reply(&icmp_param, skb); } + /* should there be an ICMP stat for ignored echos? */ + return true; } /* @@ -900,7 +905,7 @@ static void icmp_echo(struct sk_buff *skb) * MUST be accurate to a few minutes. * MUST be updated at least at 15Hz. */ -static void icmp_timestamp(struct sk_buff *skb) +static bool icmp_timestamp(struct sk_buff *skb) { struct timespec tv; struct icmp_bxm icmp_param; @@ -927,15 +932,17 @@ static void icmp_timestamp(struct sk_buff *skb) icmp_param.data_len = 0; icmp_param.head_len = sizeof(struct icmphdr) + 12; icmp_reply(&icmp_param, skb); -out: - return; + return true; + out_err: ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); - goto out; + return false; } -static void icmp_discard(struct sk_buff *skb) +static bool icmp_discard(struct sk_buff *skb) { + /* pretend it was a success */ + return true; } /* @@ -946,6 +953,7 @@ int icmp_rcv(struct sk_buff *skb) struct icmphdr *icmph; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + bool success; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); @@ -1012,7 +1020,12 @@ int icmp_rcv(struct sk_buff *skb) } } - icmp_pointers[icmph->type].handler(skb); + success = icmp_pointers[icmph->type].handler(skb); + + if (success) { + consume_skb(skb); + return 0; + } drop: kfree_skb(skb); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index bb15d0e..666cf36 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -112,17 +112,17 @@ #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ -#define IGMP_V1_Router_Present_Timeout (400*HZ) -#define IGMP_V2_Router_Present_Timeout (400*HZ) -#define IGMP_V2_Unsolicited_Report_Interval (10*HZ) -#define IGMP_V3_Unsolicited_Report_Interval (1*HZ) -#define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Query_Robustness_Variable 2 +#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ) +#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ) +#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ) +#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ) +#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) +#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2 -#define IGMP_Initial_Report_Delay (1) +#define IGMP_INITIAL_REPORT_DELAY (1) -/* IGMP_Initial_Report_Delay is not from IGMP specs! +/* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not @@ -878,15 +878,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (ih->code == 0) { /* Alas, old v1 router presents here. */ - max_delay = IGMP_Query_Response_Interval; + max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + - IGMP_V1_Router_Present_Timeout; + IGMP_V1_ROUTER_PRESENT_TIMEOUT; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + - IGMP_V2_Router_Present_Timeout; + IGMP_V2_ROUTER_PRESENT_TIMEOUT; } /* cancel the interface change timer */ in_dev->mr_ifc_count = 0; @@ -898,7 +898,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ - max_delay = IGMP_Query_Response_Interval; + max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; @@ -1217,7 +1217,7 @@ static void igmp_group_added(struct ip_mc_list *im) return; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); - igmp_start_timer(im, IGMP_Initial_Report_Delay); + igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } @@ -1540,7 +1540,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; #ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_Query_Robustness_Variable; +int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; #endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, @@ -2686,11 +2686,7 @@ static int igmp_mcf_seq_show(struct seq_file *seq, void *v) struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "%3s %6s " - "%10s %10s %6s %6s\n", "Idx", - "Device", "MCA", - "SRC", "INC", "EXC"); + seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 19419b6..e792035 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -458,6 +458,6 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, ". Dropping fragment.\n"; if (PTR_ERR(q) == -ENOBUFS) - LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); + net_dbg_ratelimited("%s%s", prefix, msg); } EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2811cc1..e5b6d0d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -80,7 +80,7 @@ struct ipq { struct inet_peer *peer; }; -static inline u8 ip4_frag_ecn(u8 tos) +static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } @@ -148,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; } -static __inline__ void ip4_frag_free(struct inet_frag_queue *q) +static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; @@ -160,7 +160,7 @@ static __inline__ void ip4_frag_free(struct inet_frag_queue *q) /* Destruction primitives. */ -static __inline__ void ipq_put(struct ipq *ipq) +static void ipq_put(struct ipq *ipq) { inet_frag_put(&ipq->q, &ip4_frags); } @@ -236,7 +236,7 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -256,7 +256,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) } /* Is the fragment too far ahead to be part of ipq? */ -static inline int ip_frag_too_far(struct ipq *qp) +static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = sysctl_ipfrag_max_dist; @@ -618,8 +618,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, return 0; out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), - qp); + net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: @@ -795,16 +794,16 @@ static void __init ip4_frags_ctl_register(void) register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else -static inline int ip4_frags_ns_ctl_register(struct net *net) +static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } -static inline void ip4_frags_ns_ctl_unregister(struct net *net) +static void ip4_frags_ns_ctl_unregister(struct net *net) { } -static inline void __init ip4_frags_ctl_register(void) +static void __init ip4_frags_ctl_register(void) { } #endif diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 12055fd..ac84912 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -789,7 +789,7 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, t->encap.dport) || nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, - t->encap.dport)) + t->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bc6471d..b50861b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -662,12 +662,10 @@ slow_path: if (len < left) { len &= ~7; } - /* - * Allocate buffer. - */ - if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); + /* Allocate buffer */ + skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); + if (!skb2) { err = -ENOMEM; goto fail; } @@ -754,14 +752,16 @@ EXPORT_SYMBOL(ip_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - struct iovec *iov = from; + struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (memcpy_fromiovecend(to, iov, offset, len) < 0) + /* XXX: stripping const */ + if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0) return -EFAULT; } else { __wsum csum = 0; - if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) + /* XXX: stripping const */ + if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 9daf2177..8a89c73 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -192,7 +192,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, int err, val; struct cmsghdr *cmsg; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; #if IS_ENABLED(CONFIG_IPV6) @@ -399,6 +399,22 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, + const struct sk_buff *skb, + int ee_origin) +{ + struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + + if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || + (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + (!skb->dev)) + return false; + + info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; + info->ipi_ifindex = skb->dev->ifindex; + return true; +} + /* * Handle MSG_ERRQUEUE */ @@ -414,6 +430,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int err; int copied; + WARN_ON_ONCE(sk->sk_family == AF_INET6); + err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (skb == NULL) @@ -424,7 +442,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; @@ -444,7 +462,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; sin->sin_family = AF_UNSPEC; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; @@ -1049,7 +1069,7 @@ e_inval: } /** - * ipv4_pktinfo_prepare - transfert some info from rtable to skb + * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 0bb8e14..63e745a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,7 +56,6 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/udp.h> -#include <net/gue.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -491,18 +490,51 @@ EXPORT_SYMBOL_GPL(ip_tunnel_rcv); static int ip_encap_hlen(struct ip_tunnel_encap *e) { - switch (e->type) { - case TUNNEL_ENCAP_NONE: + const struct ip_tunnel_encap_ops *ops; + int hlen = -EINVAL; + + if (e->type == TUNNEL_ENCAP_NONE) return 0; - case TUNNEL_ENCAP_FOU: - return sizeof(struct udphdr); - case TUNNEL_ENCAP_GUE: - return sizeof(struct udphdr) + sizeof(struct guehdr); - default: + + if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; - } + + rcu_read_lock(); + ops = rcu_dereference(iptun_encaps[e->type]); + if (likely(ops && ops->encap_hlen)) + hlen = ops->encap_hlen(e); + rcu_read_unlock(); + + return hlen; } +const struct ip_tunnel_encap_ops __rcu * + iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; + +int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, + unsigned int num) +{ + return !cmpxchg((const struct ip_tunnel_encap_ops **) + &iptun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(ip_tunnel_encap_add_ops); + +int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, + unsigned int num) +{ + int ret; + + ret = (cmpxchg((const struct ip_tunnel_encap_ops **) + &iptun_encaps[num], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(ip_tunnel_encap_del_ops); + int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap) { @@ -526,63 +558,22 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t, } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); -static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - size_t hdr_len, u8 *protocol, struct flowi4 *fl4) -{ - struct udphdr *uh; - __be16 sport; - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - - skb = iptunnel_handle_offloads(skb, csum, type); - - if (IS_ERR(skb)) - return PTR_ERR(skb); - - /* Get length and hash before making space in skb */ - - sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), - skb, 0, 0, false); - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - if (e->type == TUNNEL_ENCAP_GUE) { - struct guehdr *guehdr = (struct guehdr *)&uh[1]; - - guehdr->version = 0; - guehdr->hlen = 0; - guehdr->flags = 0; - guehdr->next_hdr = *protocol; - } - - uh->dest = e->dport; - uh->source = sport; - uh->len = htons(skb->len); - uh->check = 0; - udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, - fl4->saddr, fl4->daddr, skb->len); - - *protocol = IPPROTO_UDP; - - return 0; -} - int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, u8 *protocol, struct flowi4 *fl4) { - switch (t->encap.type) { - case TUNNEL_ENCAP_NONE: + const struct ip_tunnel_encap_ops *ops; + int ret = -EINVAL; + + if (t->encap.type == TUNNEL_ENCAP_NONE) return 0; - case TUNNEL_ENCAP_FOU: - case TUNNEL_ENCAP_GUE: - return fou_build_header(skb, &t->encap, t->encap_hlen, - protocol, fl4); - default: - return -EINVAL; - } + + rcu_read_lock(); + ops = rcu_dereference(iptun_encaps[t->encap.type]); + if (likely(ops && ops->build_header)) + ret = ops->build_header(skb, &t->encap, protocol, fl4); + rcu_read_unlock(); + + return ret; } EXPORT_SYMBOL(ip_tunnel_encap); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 648fa14..7fa18bc 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -115,7 +115,7 @@ */ int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ -static int ic_enable __initdata = 0; /* IP config enabled? */ +static int ic_enable __initdata; /* IP config enabled? */ /* Protocol choice */ int ic_proto_enabled __initdata = 0 @@ -130,7 +130,7 @@ int ic_proto_enabled __initdata = 0 #endif ; -static int ic_host_name_set __initdata = 0; /* Host name set by us? */ +static int ic_host_name_set __initdata; /* Host name set by us? */ __be32 ic_myaddr = NONE; /* My IP address */ static __be32 ic_netmask = NONE; /* Netmask for local subnet */ @@ -160,17 +160,17 @@ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; /* Protocols supported by available interfaces */ -static int ic_proto_have_if __initdata = 0; +static int ic_proto_have_if __initdata; /* MTU for boot device */ -static int ic_dev_mtu __initdata = 0; +static int ic_dev_mtu __initdata; #ifdef IPCONFIG_DYNAMIC static DEFINE_SPINLOCK(ic_recv_lock); -static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ +static volatile int ic_got_reply __initdata; /* Proto(s) that replied */ #endif #ifdef IPCONFIG_DHCP -static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */ +static int ic_dhcp_msgtype __initdata; /* DHCP msg type received */ #endif @@ -186,8 +186,8 @@ struct ic_device { __be32 xid; }; -static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ -static struct net_device *ic_dev __initdata = NULL; /* Selected device */ +static struct ic_device *ic_first_dev __initdata; /* List of open device */ +static struct net_device *ic_dev __initdata; /* Selected device */ static bool __init ic_is_init_dev(struct net_device *dev) { @@ -498,7 +498,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt struct arphdr *rarp; unsigned char *rarp_ptr; __be32 sip, tip; - unsigned char *sha, *tha; /* s for "source", t for "target" */ + unsigned char *tha; /* t for "target" */ struct ic_device *d; if (!net_eq(dev_net(dev), &init_net)) @@ -549,7 +549,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt goto drop_unlock; /* should never happen */ /* Extract variable-width fields */ - sha = rarp_ptr; rarp_ptr += dev->addr_len; memcpy(&sip, rarp_ptr, 4); rarp_ptr += 4; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 37096d6..40403114 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -465,7 +465,7 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.dport)) + tunnel->encap.flags)) goto nla_put_failure; return 0; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4c019d5..59f883d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -113,6 +113,15 @@ config NFT_MASQ_IPV4 This is the expression that provides IPv4 masquerading support for nf_tables. +config NFT_REDIR_IPV4 + tristate "IPv4 redirect support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_REDIR + select NF_NAT_REDIRECT + help + This is the expression that provides IPv4 redirect support for + nf_tables. + config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index f4cef5a..7fe6c70 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o +obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index ccfc78d..d059182 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -10,6 +10,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> @@ -74,12 +75,12 @@ static void dump_arp_packet(struct nf_log_buf *m, ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); } -void nf_log_arp_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) +static void nf_log_arp_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) { struct nf_log_buf *m; @@ -130,8 +131,17 @@ static int __init nf_log_arp_init(void) if (ret < 0) return ret; - nf_log_register(NFPROTO_ARP, &nf_arp_logger); + ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); + if (ret < 0) { + pr_err("failed to register logger\n"); + goto err1; + } + return 0; + +err1: + unregister_pernet_subsys(&nf_log_arp_net_ops); + return ret; } static void __exit nf_log_arp_exit(void) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 078bdca..7510198 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -5,6 +5,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> @@ -366,8 +367,17 @@ static int __init nf_log_ipv4_init(void) if (ret < 0) return ret; - nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + if (ret < 0) { + pr_err("failed to register logger\n"); + goto err1; + } + return 0; + +err1: + unregister_pernet_subsys(&nf_log_ipv4_net_ops); + return ret; } static void __exit nf_log_ipv4_exit(void) diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 1baaa83..536da7b 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -11,6 +11,7 @@ #include <net/tcp.h> #include <net/route.h> #include <net/dst.h> +#include <net/netfilter/ipv4/nf_reject.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_reject.h> diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c new file mode 100644 index 0000000..ff2d23d --- /dev/null +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_redirect.h> +#include <net/netfilter/nft_redir.h> + +static void nft_redir_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_ipv4_multi_range_compat mr; + unsigned int verdict; + + memset(&mr, 0, sizeof(mr)); + if (priv->sreg_proto_min) { + mr.range[0].min.all = (__force __be16) + data[priv->sreg_proto_min].data[0]; + mr.range[0].max.all = (__force __be16) + data[priv->sreg_proto_max].data[0]; + mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + mr.range[0].flags |= priv->flags; + + verdict = nf_nat_redirect_ipv4(pkt->skb, &mr, pkt->ops->hooknum); + data[NFT_REG_VERDICT].verdict = verdict; +} + +static struct nft_expr_type nft_redir_ipv4_type; +static const struct nft_expr_ops nft_redir_ipv4_ops = { + .type = &nft_redir_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), + .eval = nft_redir_ipv4_eval, + .init = nft_redir_init, + .dump = nft_redir_dump, + .validate = nft_redir_validate, +}; + +static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "redir", + .ops = &nft_redir_ipv4_ops, + .policy = nft_redir_policy, + .maxattr = NFTA_REDIR_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_redir_ipv4_module_init(void) +{ + return nft_register_expr(&nft_redir_ipv4_type); +} + +static void __exit nft_redir_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_redir_ipv4_type); +} + +module_init(nft_redir_ipv4_module_init); +module_exit(nft_redir_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index ed33299..d729542 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -19,9 +19,9 @@ #include <net/netfilter/ipv4/nf_reject.h> #include <net/netfilter/nft_reject.h> -void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt) +static void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) { struct nft_reject *priv = nft_expr_priv(expr); @@ -36,7 +36,6 @@ void nft_reject_ipv4_eval(const struct nft_expr *expr, data[NFT_REG_VERDICT].verdict = NF_DROP; } -EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); static struct nft_expr_type nft_reject_ipv4_type; static const struct nft_expr_ops nft_reject_ipv4_ops = { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 5d740cc..c0d82f7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -662,7 +662,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, * Fetch the ICMP header provided by the userland. * iovec is modified! The ICMP header is consumed. */ - if (memcpy_fromiovec(user_icmph, msg->msg_iov, icmph_len)) + if (memcpy_from_msg(user_icmph, msg, icmph_len)) return -EFAULT; if (family == AF_INET) { @@ -811,7 +811,8 @@ back_from_confirm: pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; - pfh.iov = msg->msg_iov; + /* XXX: stripping const */ + pfh.iov = (struct iovec *)msg->msg_iter.iov; pfh.wcheck = 0; pfh.family = AF_INET; @@ -869,7 +870,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } /* Don't bother checking the checksum */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; @@ -949,7 +950,7 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); * All we need to do is get the socket. */ -void ping_rcv(struct sk_buff *skb) +bool ping_rcv(struct sk_buff *skb) { struct sock *sk; struct net *net = dev_net(skb->dev); @@ -968,11 +969,11 @@ void ping_rcv(struct sk_buff *skb) pr_debug("rcv on socket %p\n", sk); ping_queue_rcv_skb(sk, skb_get(skb)); sock_put(sk); - return; + return true; } pr_debug("no socket, dropping\n"); - /* We're called from icmp_rcv(). kfree_skb() is done there. */ + return false; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8e3eb39..8f9cd20 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -181,6 +181,7 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), + SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_SENTINEL }; @@ -287,6 +288,10 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), + SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), + SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), + SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), + SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_SENTINEL }; @@ -296,12 +301,12 @@ static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, int j; if (count) { - seq_printf(seq, "\nIcmpMsg:"); + seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %sType%u", type[j] & 0x100 ? "Out" : "In", type[j] & 0xff); - seq_printf(seq, "\nIcmpMsg:"); + seq_puts(seq, "\nIcmpMsg:"); for (j = 0; j < count; ++j) seq_printf(seq, " %lu", vals[j]); } @@ -342,7 +347,7 @@ static void icmp_put(struct seq_file *seq) seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); - seq_printf(seq, " OutMsgs OutErrors"); + seq_puts(seq, " OutMsgs OutErrors"); for (i = 0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 739db31..0bb68df 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -79,6 +79,16 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> +#include <linux/uio.h> + +struct raw_frag_vec { + struct msghdr *msg; + union { + struct icmphdr icmph; + char c[1]; + } hdr; + int hlen; +}; static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), @@ -420,53 +430,57 @@ error: return err; } -static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) +static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { - struct iovec *iov; - u8 __user *type = NULL; - u8 __user *code = NULL; - int probed = 0; - unsigned int i; + int err; - if (!msg->msg_iov) + if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; - for (i = 0; i < msg->msg_iovlen; i++) { - iov = &msg->msg_iov[i]; - if (!iov) - continue; - - switch (fl4->flowi4_proto) { - case IPPROTO_ICMP: - /* check if one-byte field is readable or not. */ - if (iov->iov_base && iov->iov_len < 1) - break; - - if (!type) { - type = iov->iov_base; - /* check if code field is readable or not. */ - if (iov->iov_len > 1) - code = type + 1; - } else if (!code) - code = iov->iov_base; - - if (type && code) { - if (get_user(fl4->fl4_icmp_type, type) || - get_user(fl4->fl4_icmp_code, code)) - return -EFAULT; - probed = 1; - } - break; - default: - probed = 1; - break; - } - if (probed) - break; - } + /* We only need the first two bytes. */ + rfv->hlen = 2; + + err = memcpy_from_msg(rfv->hdr.c, rfv->msg, rfv->hlen); + if (err) + return err; + + fl4->fl4_icmp_type = rfv->hdr.icmph.type; + fl4->fl4_icmp_code = rfv->hdr.icmph.code; + return 0; } +static int raw_getfrag(void *from, char *to, int offset, int len, int odd, + struct sk_buff *skb) +{ + struct raw_frag_vec *rfv = from; + + if (offset < rfv->hlen) { + int copy = min(rfv->hlen - offset, len); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + memcpy(to, rfv->hdr.c + offset, copy); + else + skb->csum = csum_block_add( + skb->csum, + csum_partial_copy_nocheck(rfv->hdr.c + offset, + to, copy, 0), + odd); + + odd = 0; + offset += copy; + to += copy; + len -= copy; + + if (!len) + return 0; + } + + offset -= rfv->hlen; + + return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); +} + static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -480,6 +494,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, u8 tos; int err; struct ip_options_data opt_copy; + struct raw_frag_vec rfv; err = -EMSGSIZE; if (len > 0xFFFF) @@ -585,7 +600,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, daddr, saddr, 0, 0); if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl4, msg); + rfv.msg = msg; + rfv.hlen = 0; + + err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } @@ -607,7 +625,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, back_from_confirm: if (inet->hdrincl) - err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, + /* XXX: stripping const */ + err = raw_send_hdrinc(sk, &fl4, (struct iovec *)msg->msg_iter.iov, len, &rt, msg->msg_flags); else { @@ -616,8 +635,8 @@ back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); - err = ip_append_data(sk, &fl4, ip_generic_getfrag, - msg->msg_iov, len, 0, + err = ip_append_data(sk, &fl4, raw_getfrag, + &rfv, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); @@ -718,7 +737,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 32b98d0..45fe60c 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,10 +19,6 @@ #include <net/tcp.h> #include <net/route.h> -/* Timestamps: lowest bits store TCP options */ -#define TSBITS 6 -#define TSMASK (((__u32)1 << TSBITS) - 1) - extern int sysctl_tcp_syncookies; static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; @@ -30,6 +26,30 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) +/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK + * stores TCP options: + * + * MSB LSB + * | 31 ... 6 | 5 | 4 | 3 2 1 0 | + * | Timestamp | ECN | SACK | WScale | + * + * When we receive a valid cookie-ACK, we look at the echoed tsval (if + * any) to figure out which TCP options we should use for the rebuilt + * connection. + * + * A WScale setting of '0xf' (which is an invalid scaling value) + * means that original syn did not include the TCP window scaling option. + */ +#define TS_OPT_WSCALE_MASK 0xf +#define TS_OPT_SACK BIT(4) +#define TS_OPT_ECN BIT(5) +/* There is no TS_OPT_TIMESTAMP: + * if ACK contains timestamp option, we already know it was + * requested/supported by the syn/synack exchange. + */ +#define TSBITS 6 +#define TSMASK (((__u32)1 << TSBITS) - 1) + static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); @@ -67,9 +87,11 @@ __u32 cookie_init_timestamp(struct request_sock *req) ireq = inet_rsk(req); - options = ireq->wscale_ok ? ireq->snd_wscale : 0xf; - options |= ireq->sack_ok << 4; - options |= ireq->ecn_ok << 5; + options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; + if (ireq->sack_ok) + options |= TS_OPT_SACK; + if (ireq->ecn_ok) + options |= TS_OPT_ECN; ts = ts_now & ~TSMASK; ts |= options; @@ -219,16 +241,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * - * The lowest 4 bits store snd_wscale. - * next 2 bits indicate SACK and ECN support. - * - * return false if we decode an option that should not be. + * return false if we decode a tcp option that is disabled + * on the host. */ -bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, - struct net *net, bool *ecn_ok) +bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ - u32 options = tcp_opt->rcv_tsecr & TSMASK; + u32 options = tcp_opt->rcv_tsecr; if (!tcp_opt->saw_tstamp) { tcp_clear_options(tcp_opt); @@ -238,22 +257,35 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, if (!sysctl_tcp_timestamps) return false; - tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; - *ecn_ok = (options >> 5) & 1; - if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn) - return false; + tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; if (tcp_opt->sack_ok && !sysctl_tcp_sack) return false; - if ((options & 0xf) == 0xf) + if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) return true; /* no window scaling */ tcp_opt->wscale_ok = 1; - tcp_opt->snd_wscale = options & 0xf; + tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; + return sysctl_tcp_window_scaling != 0; } -EXPORT_SYMBOL(cookie_check_timestamp); +EXPORT_SYMBOL(cookie_timestamp_decode); + +bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, + const struct net *net, const struct dst_entry *dst) +{ + bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN; + + if (!ecn_ok) + return false; + + if (net->ipv4.sysctl_tcp_ecn) + return true; + + return dst_feature(dst, RTAX_FEATURE_ECN); +} +EXPORT_SYMBOL(cookie_ecn_ok); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) { @@ -269,14 +301,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) int mss; struct rtable *rt; __u8 rcv_wscale; - bool ecn_ok = false; struct flowi4 fl4; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; - if (tcp_synq_no_recent_overflow(sk) || - (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) { + if (tcp_synq_no_recent_overflow(sk)) + goto out; + + mss = __cookie_v4_check(ip_hdr(skb), th, cookie); + if (mss == 0) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } @@ -287,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, 0, NULL); - if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) + if (!cookie_timestamp_decode(&tcp_opt)) goto out; ret = NULL; @@ -305,7 +339,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->ir_loc_addr = ip_hdr(skb)->daddr; ireq->ir_rmt_addr = ip_hdr(skb)->saddr; ireq->ir_mark = inet_request_mark(sk, skb); - ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; @@ -354,6 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; + ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); ret = get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b3c53c8..e0ee384 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { + .procname = "tcp_max_reordering", + .data = &sysctl_tcp_max_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 38c2bcb..3075723 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 xmit_size_goal, old_size_goal; - - xmit_size_goal = mss_now; - - if (large_allowed && sk_can_gso(sk)) { - u32 gso_size, hlen; - - /* Maybe we should/could use sk->sk_prot->max_header here ? */ - hlen = inet_csk(sk)->icsk_af_ops->net_header_len + - inet_csk(sk)->icsk_ext_hdr_len + - tp->tcp_header_len; - - /* Goal is to send at least one packet per ms, - * not one big TSO packet every 100 ms. - * This preserves ACK clocking and is consistent - * with tcp_tso_should_defer() heuristic. - */ - gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); - gso_size = max_t(u32, gso_size, - sysctl_tcp_min_tso_segs * mss_now); - - xmit_size_goal = min_t(u32, gso_size, - sk->sk_gso_max_size - 1 - hlen); - - xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); - - /* We try hard to avoid divides here */ - old_size_goal = tp->xmit_size_goal_segs * mss_now; - - if (likely(old_size_goal <= xmit_size_goal && - old_size_goal + mss_now > xmit_size_goal)) { - xmit_size_goal = old_size_goal; - } else { - tp->xmit_size_goal_segs = - min_t(u16, xmit_size_goal / mss_now, - sk->sk_gso_max_segs); - xmit_size_goal = tp->xmit_size_goal_segs * mss_now; - } + u32 new_size_goal, size_goal, hlen; + + if (!large_allowed || !sk_can_gso(sk)) + return mss_now; + + /* Maybe we should/could use sk->sk_prot->max_header here ? */ + hlen = inet_csk(sk)->icsk_af_ops->net_header_len + + inet_csk(sk)->icsk_ext_hdr_len + + tp->tcp_header_len; + + new_size_goal = sk->sk_gso_max_size - 1 - hlen; + new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); + + /* We try hard to avoid divides here */ + size_goal = tp->gso_segs * mss_now; + if (unlikely(new_size_goal < size_goal || + new_size_goal >= size_goal + mss_now)) { + tp->gso_segs = min_t(u16, new_size_goal / mss_now, + sk->sk_gso_max_segs); + size_goal = tp->gso_segs * mss_now; } - return max(xmit_size_goal, mss_now); + return max(size_goal, mss_now); } static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) @@ -1085,7 +1067,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { - struct iovec *iov; + const struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags, err, copied = 0; @@ -1136,8 +1118,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. */ - iovlen = msg->msg_iovlen; - iov = msg->msg_iov; + iovlen = msg->msg_iter.nr_segs; + iov = msg->msg_iter.iov; copied = 0; err = -EPIPE; @@ -1349,7 +1331,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) if (len > 0) { if (!(flags & MSG_TRUNC)) - err = memcpy_toiovec(msg->msg_iov, &c, 1); + err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; @@ -1377,7 +1359,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; @@ -1729,7 +1711,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { user_recv = current; tp->ucopy.task = user_recv; - tp->ucopy.iov = msg->msg_iov; + tp->ucopy.msg = msg; } tp->ucopy.len = len; @@ -1833,8 +1815,7 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! */ if (!copied) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index b1c5970..27ead0d 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -1,5 +1,5 @@ /* - * Plugable TCP congestion control support and newReno + * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 20de011..6b60024 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -363,16 +363,28 @@ static void hystart_update(struct sock *sk, u32 delay) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!(ca->found & hystart_detect)) { + if (ca->found & hystart_detect) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { ca->found |= HYSTART_ACK_TRAIN; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } + } + if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { if (ca->curr_rtt == 0 || ca->curr_rtt > delay) @@ -381,15 +393,16 @@ static void hystart_update(struct sock *sk, u32 delay) ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + - HYSTART_DELAY_THRESH(ca->delay_min>>4)) + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { ca->found |= HYSTART_DELAY; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } - /* - * Either one of two conditions are met, - * we exit from slow start immediately. - */ - if (ca->found & hystart_detect) - tp->snd_ssthresh = tp->snd_cwnd; } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d107ee2..075ab4d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; +int sysctl_tcp_max_reordering __read_mostly = 300; EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, if (metric > tp->reordering) { int mib_idx; - tp->reordering = min(TCP_MAX_REORDERING, metric); + tp->reordering = min(sysctl_tcp_max_reordering, metric); /* This exciting event is worth to be remembered. 8) */ if (ts) @@ -4367,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) + if (memcpy_from_msg(skb_put(skb, size), msg, size)) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; @@ -4420,7 +4421,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __set_current_state(TASK_RUNNING); local_bh_enable(); - if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { + if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { tp->ucopy.len -= chunk; tp->copied_seq += chunk; eaten = (chunk == skb->len); @@ -4940,10 +4941,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) local_bh_enable(); if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); + err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); else - err = skb_copy_and_csum_datagram_iovec(skb, hlen, - tp->ucopy.iov); + err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); if (!err) { tp->ucopy.len -= chunk; @@ -5030,7 +5030,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, /* step 3: check security and precedence [ignored] */ /* step 4: Check for a SYN - * RFC 5691 4.2 : Send a challenge ack + * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { syn_challenge: @@ -5853,12 +5853,12 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) struct inet_request_sock *ireq = inet_rsk(req); if (family == AF_INET) - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), - &ireq->ir_rmt_addr, port); + net_dbg_ratelimited("drop open request from %pI4/%u\n", + &ireq->ir_rmt_addr, port); #if IS_ENABLED(CONFIG_IPV6) else if (family == AF_INET6) - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), - &ireq->ir_v6_rmt_addr, port); + net_dbg_ratelimited("drop open request from %pI6/%u\n", + &ireq->ir_v6_rmt_addr, port); #endif } @@ -5867,7 +5867,7 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) * If we receive a SYN packet with these bits set, it means a * network is playing bad games with TOS bits. In order to * avoid possible false congestion notifications, we disable - * TCP ECN negociation. + * TCP ECN negotiation. * * Exception: tcp_ca wants ECN. This is required for DCTCP * congestion control; it requires setting ECT on all packets, @@ -5877,20 +5877,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) */ static void tcp_ecn_create_request(struct request_sock *req, const struct sk_buff *skb, - const struct sock *listen_sk) + const struct sock *listen_sk, + const struct dst_entry *dst) { const struct tcphdr *th = tcp_hdr(skb); const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; - bool ect, need_ecn; + bool ect, need_ecn, ecn_ok; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); need_ecn = tcp_ca_needs_ecn(listen_sk); + ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); - if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + if (!ect && !need_ecn && ecn_ok) inet_rsk(req)->ecn_ok = 1; else if (ect && need_ecn) inet_rsk(req)->ecn_ok = 1; @@ -5955,13 +5957,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; - if (!want_cookie || tmp_opt.tstamp_ok) - tcp_ecn_create_request(req, skb, sk); - - if (want_cookie) { - isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - req->cookie_ts = tmp_opt.tstamp_ok; - } else if (!isn) { + if (!want_cookie && !isn) { /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before @@ -6009,6 +6005,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; } + tcp_ecn_create_request(req, skb, sk, dst); + + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + if (!tmp_opt.tstamp_ok) + inet_rsk(req)->ecn_ok = 0; + } + tcp_rsk(req)->snt_isn = isn; tcp_openreq_init_rwin(req, sk, dst); fastopen = !want_cookie && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 147be20..a3f72d7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -623,6 +623,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG hash_location = tcp_parse_md5sig_option(th); if (!sk && hash_location) { @@ -633,7 +634,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); @@ -681,7 +682,6 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; - net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -1432,6 +1432,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = sk->sk_rx_dst; sock_rps_save_rxhash(sk, skb); + sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, 0) == NULL) { @@ -1453,6 +1454,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (nsk != sk) { sock_rps_save_rxhash(nsk, skb); + sk_mark_napi_id(sk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1664,7 +1666,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5b90f2f..9d7930b 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -94,9 +94,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, SKB_GSO_GRE_CSUM | SKB_GSO_IPIP | SKB_GSO_SIT | - SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3d453b..7f18262 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || + tcp_ca_needs_ecn(sk); + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } tp->ecn_flags = 0; - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || - tcp_ca_needs_ecn(sk)) { + + if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk)) @@ -1515,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } + +/* Return how many segs we'd like on a TSO packet, + * to send one TSO packet per ms + */ +static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +{ + u32 bytes, segs; + + bytes = min(sk->sk_pacing_rate >> 10, + sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); + + /* Goal is to send at least one packet per ms, + * not one big TSO packet every 100 ms. + * This preserves ACK clocking and is consistent + * with tcp_tso_should_defer() heuristic. + */ + segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + + return min_t(u32, segs, sk->sk_gso_max_segs); +} + /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, @@ -1553,7 +1583,7 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb) { - u32 in_flight, cwnd; + u32 in_flight, cwnd, halfcwnd; /* Don't be strict about the congestion window for the final FIN. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && @@ -1562,10 +1592,14 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, in_flight = tcp_packets_in_flight(tp); cwnd = tp->snd_cwnd; - if (in_flight < cwnd) - return (cwnd - in_flight); + if (in_flight >= cwnd) + return 0; - return 0; + /* For better scheduling, ensure we have at least + * 2 GSO packets in flight. + */ + halfcwnd = max(cwnd >> 1, 1U); + return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. @@ -1718,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, - bool *is_cwnd_limited) + bool *is_cwnd_limited, u32 max_segs) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -1748,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= min_t(unsigned int, sk->sk_gso_max_size, - tp->xmit_size_goal_segs * tp->mss_cache)) + if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ @@ -1946,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int cwnd_quota; int result; bool is_cwnd_limited = false; + u32 max_segs; sent_pkts = 0; @@ -1959,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } + max_segs = tcp_tso_autosize(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -1991,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } else { if (!push_one && - tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) + tcp_tso_should_defer(sk, skb, &is_cwnd_limited, + max_segs)) break; } + limit = mss_now; + if (tso_segs > 1 && !tcp_urg_mode(tp)) + limit = tcp_mss_split_point(sk, skb, mss_now, + min_t(unsigned int, + cwnd_quota, + max_segs), + nonagle); + + if (skb->len > limit && + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + break; + /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * This allows for : @@ -2005,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, * of queued bytes to ensure line rate. * One example is wifi aggregation (802.11 AMPDU) */ - limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, - sk->sk_pacing_rate >> 10); + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); @@ -2019,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, break; } - limit = mss_now; - if (tso_segs > 1 && !tcp_urg_mode(tp)) - limit = tcp_mss_split_point(sk, skb, mss_now, - min_t(unsigned int, - cwnd_quota, - sk->sk_gso_max_segs), - nonagle); - - if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) - break; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2998,9 +3034,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; - struct sk_buff *syn_data = NULL, *data; + int syn_loss = 0, space, err = 0; unsigned long last_syn_loss = 0; + struct sk_buff *syn_data; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, @@ -3031,48 +3067,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, - sk->sk_allocation); - if (syn_data == NULL) + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + if (!syn_data) + goto fallback; + syn_data->ip_summed = CHECKSUM_PARTIAL; + memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); + if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), + fo->data->msg_iter.iov, 0, space))) { + kfree_skb(syn_data); goto fallback; + } - for (i = 0; i < iovlen && syn_data->len < space; ++i) { - struct iovec *iov = &fo->data->msg_iov[i]; - unsigned char __user *from = iov->iov_base; - int len = iov->iov_len; + /* No more data pending in inet_wait_for_connect() */ + if (space == fo->size) + fo->data = NULL; + fo->copied = space; - if (syn_data->len + len > space) - len = space - syn_data->len; - else if (i + 1 == iovlen) - /* No more data pending in inet_wait_for_connect() */ - fo->data = NULL; + tcp_connect_queue_skb(sk, syn_data); - if (skb_add_data(syn_data, from, len)) - goto fallback; - } + err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - /* Queue a data-only packet after the regular SYN for retransmission */ - data = pskb_copy(syn_data, sk->sk_allocation); - if (data == NULL) - goto fallback; - TCP_SKB_CB(data)->seq++; - TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; - TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); - tcp_connect_queue_skb(sk, data); - fo->copied = data->len; - - /* syn_data is about to be sent, we need to take current time stamps - * for the packets that are in write queue : SYN packet and DATA - */ - skb_mstamp_get(&syn->skb_mstamp); - data->skb_mstamp = syn->skb_mstamp; + syn->skb_mstamp = syn_data->skb_mstamp; - if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + /* Now full SYN+DATA was cloned and sent (or not), + * remove the SYN from the original skb (syn_data) + * we keep in write queue in case of a retransmit, as we + * also have the SYN packet (with no data) in the same queue. + */ + TCP_SKB_CB(syn_data)->seq++; + TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; + if (!err) { tp->syn_data = (fo->copied > 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - syn_data = NULL; fallback: /* Send a regular SYN with Fast Open cookie request option */ @@ -3081,7 +3109,6 @@ fallback: err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; - kfree_skb(syn_data); done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; @@ -3101,13 +3128,10 @@ int tcp_connect(struct sock *sk) return 0; } - buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); - if (unlikely(buff == NULL)) + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + if (unlikely(!buff)) return -ENOBUFS; - /* Reserve space for headers. */ - skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 9b21ae8..1829c7f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -374,17 +374,19 @@ void tcp_retransmit_timer(struct sock *sk) */ struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), - &inet->inet_daddr, - ntohs(inet->inet_dport), inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &inet->inet_daddr, + ntohs(inet->inet_dport), + inet->inet_num, + tp->snd_una, tp->snd_nxt); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), - &sk->sk_v6_daddr, - ntohs(inet->inet_dport), inet->inet_num, - tp->snd_una, tp->snd_nxt); + net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", + &sk->sk_v6_daddr, + ntohs(inet->inet_dport), + inet->inet_num, + tp->snd_una, tp->snd_nxt); } #endif if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd0db54..13b4dcf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -144,7 +144,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) + sk_nulls_for_each(sk2, node, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -152,14 +152,13 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || - !uid_eq(uid, sock_i_uid(sk2))) && - (*saddr_comp)(sk, sk2)) { - if (bitmap) - __set_bit(udp_sk(sk2)->udp_port_hash >> log, - bitmap); - else + !uid_eq(uid, sock_i_uid(sk2))) && + saddr_comp(sk, sk2)) { + if (!bitmap) return 1; + __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); } + } return 0; } @@ -168,10 +167,10 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, * can insert/delete a socket with local_port == num */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, - struct udp_hslot *hslot2, - struct sock *sk, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + struct udp_hslot *hslot2, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) { struct sock *sk2; struct hlist_nulls_node *node; @@ -179,7 +178,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) + udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -187,11 +186,12 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (!sk2->sk_reuseport || !sk->sk_reuseport || - !uid_eq(uid, sock_i_uid(sk2))) && - (*saddr_comp)(sk, sk2)) { + !uid_eq(uid, sock_i_uid(sk2))) && + saddr_comp(sk, sk2)) { res = 1; break; } + } spin_unlock(&hslot2->lock); return res; } @@ -206,8 +206,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2), + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2), unsigned int hash2_nulladdr) { struct udp_hslot *hslot, *hslot2; @@ -336,38 +336,45 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); } -static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, - unsigned short hnum, - __be16 sport, __be32 daddr, __be16 dport, int dif) +static inline int compute_score(struct sock *sk, struct net *net, + __be32 saddr, unsigned short hnum, __be16 sport, + __be32 daddr, __be16 dport, int dif) { - int score = -1; + int score; + struct inet_sock *inet; - if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && - !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + ipv6_only_sock(sk)) + return -1; - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_rcv_saddr) { - if (inet->inet_rcv_saddr != daddr) - return -1; - score += 4; - } - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + score = (sk->sk_family == PF_INET) ? 2 : 1; + inet = inet_sk(sk); + + if (inet->inet_rcv_saddr) { + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 4; + } + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 4; + } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 4; + } + + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; } + return score; } @@ -378,33 +385,39 @@ static inline int compute_score2(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif) { - int score = -1; + int score; + struct inet_sock *inet; + + if (!net_eq(sock_net(sk), net) || + ipv6_only_sock(sk)) + return -1; - if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { - struct inet_sock *inet = inet_sk(sk); + inet = inet_sk(sk); - if (inet->inet_rcv_saddr != daddr) + if (inet->inet_rcv_saddr != daddr || + inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET) ? 2 : 1; + + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) return -1; - if (inet->inet_num != hnum) + score += 4; + } + + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; + score += 4; + } - score = (sk->sk_family == PF_INET ? 2 : 1); - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 4; } + return score; } @@ -1036,7 +1049,7 @@ back_from_confirm: /* Lockless fast path for the non-corking case. */ if (!corkreq) { - skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, + skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); @@ -1051,7 +1064,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n")); + net_dbg_ratelimited("cork app bug 2\n"); err = -EINVAL; goto out; } @@ -1067,7 +1080,7 @@ back_from_confirm: do_append_data: up->len += ulen; - err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, + err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) @@ -1133,7 +1146,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n")); + net_dbg_ratelimited("udp cork app bug 3\n"); return -EINVAL; } @@ -1281,12 +1294,11 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { - err = skb_copy_and_csum_datagram_iovec(skb, - sizeof(struct udphdr), - msg->msg_iov); + err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), + msg); if (err == -EINVAL) goto csum_copy_err; @@ -1445,6 +1457,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); } rc = sock_queue_rcv_skb(sk, skb); @@ -1546,8 +1559,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * provided by the application." */ if (up->pcrlen == 0) { /* full coverage was set */ - LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n", - UDP_SKB_CB(skb)->cscov, skb->len); + net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n", + UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } /* The next case involves violating the min. coverage requested @@ -1557,8 +1570,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * Therefore the above ...()->partial_cov statement is essential. */ if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { - LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n", - UDP_SKB_CB(skb)->cscov, up->pcrlen); + net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n", + UDP_SKB_CB(skb)->cscov, up->pcrlen); goto drop; } } @@ -1647,7 +1660,8 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, - struct udp_table *udptable) + struct udp_table *udptable, + int proto) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; struct hlist_nulls_node *node; @@ -1656,6 +1670,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif = skb->dev->ifindex; unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + bool inner_flushed = false; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -1674,6 +1689,7 @@ start_lookup: dif, hnum)) { if (unlikely(count == ARRAY_SIZE(stack))) { flush_stack(stack, count, skb, ~0); + inner_flushed = true; count = 0; } stack[count++] = sk; @@ -1695,7 +1711,10 @@ start_lookup: if (count) { flush_stack(stack, count, skb, count - 1); } else { - kfree_skb(skb); + if (!inner_flushed) + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); + consume_skb(skb); } return 0; } @@ -1777,14 +1796,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (ret > 0) return -ret; return 0; - } else { - if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); - - sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); } + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable, proto); + + sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { int ret; @@ -1822,11 +1840,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, return 0; short_packet: - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", - proto == IPPROTO_UDPLITE ? "Lite" : "", - &saddr, ntohs(uh->source), - ulen, skb->len, - &daddr, ntohs(uh->dest)); + net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", + proto == IPPROTO_UDPLITE ? "Lite" : "", + &saddr, ntohs(uh->source), + ulen, skb->len, + &daddr, ntohs(uh->dest)); goto drop; csum_error: @@ -1834,10 +1852,10 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", - proto == IPPROTO_UDPLITE ? "Lite" : "", - &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), - ulen); + net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", + proto == IPPROTO_UDPLITE ? "Lite" : "", + &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), + ulen); UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); @@ -2027,7 +2045,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, } else { up->corkflag = 0; lock_sock(sk); - (*push_pending_frames)(sk); + push_pending_frames(sk); release_sock(sk); } break; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6480cea..d3e537e 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -29,7 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, netdev_features_t features), - __be16 new_protocol) + __be16 new_protocol, bool is_ipv6) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; @@ -39,7 +39,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t enc_features; int udp_offset, outer_hlen; unsigned int oldlen; - bool need_csum; + bool need_csum = !!(skb_shinfo(skb)->gso_type & + SKB_GSO_UDP_TUNNEL_CSUM); + bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); + bool offload_csum = false, dont_encap = (need_csum || remcsum); oldlen = (u16)~skb->len; @@ -52,10 +55,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; + skb->encap_hdr_csum = need_csum; + skb->remcsum_offload = remcsum; - need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); - if (need_csum) - skb->encap_hdr_csum = 1; + /* Try to offload checksum if possible */ + offload_csum = !!(need_csum && + (skb->dev->features & + (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM))); /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & features; @@ -72,11 +78,21 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, do { struct udphdr *uh; int len; - - skb_reset_inner_headers(skb); - skb->encapsulation = 1; + __be32 delta; + + if (dont_encap) { + skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; + } else { + /* Only set up inner headers if we might be offloading + * inner checksum. + */ + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } skb->mac_len = mac_len; + skb->protocol = protocol; skb_push(skb, outer_hlen); skb_reset_mac_header(skb); @@ -86,19 +102,36 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, uh = udp_hdr(skb); uh->len = htons(len); - if (need_csum) { - __be32 delta = htonl(oldlen + len); + if (!need_csum) + continue; - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); + delta = htonl(oldlen + len); + + uh->check = ~csum_fold((__force __wsum) + ((__force u32)uh->check + + (__force u32)delta)); + if (offload_csum) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + } else if (remcsum) { + /* Need to calculate checksum from scratch, + * inner checksums are never when doing + * remote_checksum_offload. + */ + + skb->csum = skb_checksum(skb, udp_offset, + skb->len - udp_offset, + 0); + uh->check = csum_fold(skb->csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { uh->check = gso_make_checksum(skb, ~uh->check); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } - - skb->protocol = protocol; } while ((skb = skb->next)); out: return segs; @@ -134,7 +167,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, } segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, - protocol); + protocol, is_ipv6); out_unlock: rcu_read_unlock(); @@ -172,9 +205,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM | + SKB_GSO_TUNNEL_REMCSUM | SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_MPLS) || + SKB_GSO_GRE | SKB_GSO_GRE_CSUM) || !(type & (SKB_GSO_UDP)))) goto out; |