diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 14:31:10 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 14:31:10 -0800 |
commit | b2fe5fa68642860e7de76167c3111623aa0d5de1 (patch) | |
tree | b7f9b89b7039ecefbc35fe3c8e73a6ff972641dd /net/ipv4 | |
parent | a103950e0dd2058df5e8a8d4a915707bdcf205f0 (diff) | |
parent | a54667f6728c2714a400f3c884727da74b6d1717 (diff) | |
download | op-kernel-dev-b2fe5fa68642860e7de76167c3111623aa0d5de1.zip op-kernel-dev-b2fe5fa68642860e7de76167c3111623aa0d5de1.tar.gz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Significantly shrink the core networking routing structures. Result
of http://vger.kernel.org/~davem/seoul2017_netdev_keynote.pdf
2) Add netdevsim driver for testing various offloads, from Jakub
Kicinski.
3) Support cross-chip FDB operations in DSA, from Vivien Didelot.
4) Add a 2nd listener hash table for TCP, similar to what was done for
UDP. From Martin KaFai Lau.
5) Add eBPF based queue selection to tun, from Jason Wang.
6) Lockless qdisc support, from John Fastabend.
7) SCTP stream interleave support, from Xin Long.
8) Smoother TCP receive autotuning, from Eric Dumazet.
9) Lots of erspan tunneling enhancements, from William Tu.
10) Add true function call support to BPF, from Alexei Starovoitov.
11) Add explicit support for GRO HW offloading, from Michael Chan.
12) Support extack generation in more netlink subsystems. From Alexander
Aring, Quentin Monnet, and Jakub Kicinski.
13) Add 1000BaseX, flow control, and EEE support to mvneta driver. From
Russell King.
14) Add flow table abstraction to netfilter, from Pablo Neira Ayuso.
15) Many improvements and simplifications to the NFP driver bpf JIT,
from Jakub Kicinski.
16) Support for ipv6 non-equal cost multipath routing, from Ido
Schimmel.
17) Add resource abstration to devlink, from Arkadi Sharshevsky.
18) Packet scheduler classifier shared filter block support, from Jiri
Pirko.
19) Avoid locking in act_csum, from Davide Caratti.
20) devinet_ioctl() simplifications from Al viro.
21) More TCP bpf improvements from Lawrence Brakmo.
22) Add support for onlink ipv6 route flag, similar to ipv4, from David
Ahern.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1925 commits)
tls: Add support for encryption using async offload accelerator
ip6mr: fix stale iterator
net/sched: kconfig: Remove blank help texts
openvswitch: meter: Use 64-bit arithmetic instead of 32-bit
tcp_nv: fix potential integer overflow in tcpnv_acked
r8169: fix RTL8168EP take too long to complete driver initialization.
qmi_wwan: Add support for Quectel EP06
rtnetlink: enable IFLA_IF_NETNSID for RTM_NEWLINK
ipmr: Fix ptrdiff_t print formatting
ibmvnic: Wait for device response when changing MAC
qlcnic: fix deadlock bug
tcp: release sk_frag.page in tcp_disconnect
ipv4: Get the address of interface correctly.
net_sched: gen_estimator: fix lockdep splat
net: macb: Handle HRESP error
net/mlx5e: IPoIB, Fix copy-paste bug in flow steering refactoring
ipv6: addrconf: break critical section in addrconf_verify_rtnl()
ipv6: change route cache aging logic
i40e/i40evf: Update DESC_NEEDED value to reflect larger value
bnxt_en: cleanup DIM work on device shutdown
...
Diffstat (limited to 'net/ipv4')
59 files changed, 1427 insertions, 2262 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c6c8ad1..47a0a66 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,7 +43,6 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o -obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f00499a..c24008d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -121,6 +121,7 @@ #endif #include <net/l3mdev.h> +#include <trace/events/sock.h> /* The inetsw table contains everything that inet_create needs to * build a new socket. @@ -789,7 +790,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int addr_len = 0; int err; - sock_rps_record_flow(sk); + if (likely(!(flags & MSG_ERRQUEUE))) + sock_rps_record_flow(sk); err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); @@ -870,6 +872,9 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct sock *sk = sock->sk; int err = 0; struct net *net = sock_net(sk); + void __user *p = (void __user *)arg; + struct ifreq ifr; + struct rtentry rt; switch (cmd) { case SIOCGSTAMP: @@ -880,8 +885,12 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; case SIOCADDRT: case SIOCDELRT: + if (copy_from_user(&rt, p, sizeof(struct rtentry))) + return -EFAULT; + err = ip_rt_ioctl(net, cmd, &rt); + break; case SIOCRTMSG: - err = ip_rt_ioctl(net, cmd, (void __user *)arg); + err = -EINVAL; break; case SIOCDARP: case SIOCGARP: @@ -889,17 +898,26 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = arp_ioctl(net, cmd, (void __user *)arg); break; case SIOCGIFADDR: - case SIOCSIFADDR: case SIOCGIFBRDADDR: - case SIOCSIFBRDADDR: case SIOCGIFNETMASK: - case SIOCSIFNETMASK: case SIOCGIFDSTADDR: + case SIOCGIFPFLAGS: + if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + return -EFAULT; + err = devinet_ioctl(net, cmd, &ifr); + if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq))) + err = -EFAULT; + break; + + case SIOCSIFADDR: + case SIOCSIFBRDADDR: + case SIOCSIFNETMASK: case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: - case SIOCGIFPFLAGS: case SIOCSIFFLAGS: - err = devinet_ioctl(net, cmd, (void __user *)arg); + if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + return -EFAULT; + err = devinet_ioctl(net, cmd, &ifr); break; default: if (sk->sk_prot->ioctl) @@ -1220,6 +1238,19 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); +void inet_sk_set_state(struct sock *sk, int state) +{ + trace_inet_sock_set_state(sk, sk->sk_state, state); + sk->sk_state = state; +} +EXPORT_SYMBOL(inet_sk_set_state); + +void inet_sk_state_store(struct sock *sk, int newstate) +{ + trace_inet_sock_set_state(sk, sk->sk_state, newstate); + smp_store_release(&sk->sk_state, newstate); +} + struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6c231b4..f28f06c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1425,7 +1425,6 @@ static int arp_seq_open(struct inode *inode, struct file *file) } static const struct file_operations arp_seq_fops = { - .owner = THIS_MODULE, .open = arp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7a93359..40f0017 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -946,11 +946,10 @@ static int inet_abc_len(__be32 addr) } -int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { - struct ifreq ifr; struct sockaddr_in sin_orig; - struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_device *in_dev; struct in_ifaddr **ifap = NULL; struct in_ifaddr *ifa = NULL; @@ -959,22 +958,16 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) int ret = -EFAULT; int tryaddrmatch = 0; - /* - * Fetch the caller's info block into kernel space - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - goto out; - ifr.ifr_name[IFNAMSIZ - 1] = 0; + ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); - colon = strchr(ifr.ifr_name, ':'); + colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; - dev_load(net, ifr.ifr_name); + dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ @@ -1014,7 +1007,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); ret = -ENODEV; - dev = __dev_get_by_name(net, ifr.ifr_name); + dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; @@ -1031,7 +1024,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) This is checked above. */ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) { - if (!strcmp(ifr.ifr_name, ifa->ifa_label) && + if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ @@ -1044,7 +1037,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) - if (!strcmp(ifr.ifr_name, ifa->ifa_label)) + if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } @@ -1055,20 +1048,24 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCGIFADDR: /* Get interface address */ + ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; - goto rarok; + break; case SIOCGIFBRDADDR: /* Get the broadcast address */ + ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; - goto rarok; + break; case SIOCGIFDSTADDR: /* Get the destination address */ + ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; - goto rarok; + break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ + ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; - goto rarok; + break; case SIOCSIFFLAGS: if (colon) { @@ -1076,11 +1073,11 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) break; ret = 0; - if (!(ifr.ifr_flags & IFF_UP)) + if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } - ret = dev_change_flags(dev, ifr.ifr_flags); + ret = dev_change_flags(dev, ifr->ifr_flags); break; case SIOCSIFADDR: /* Set interface address (and family) */ @@ -1095,7 +1092,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; INIT_HLIST_NODE(&ifa->hash); if (colon) - memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); + memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { @@ -1182,28 +1179,27 @@ done: rtnl_unlock(); out: return ret; -rarok: - rtnl_unlock(); - ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0; - goto out; } -static int inet_gifconf(struct net_device *dev, char __user *buf, int len) +static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; + if (WARN_ON(size > sizeof(struct ifreq))) + goto out; + if (!in_dev) goto out; for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { if (!buf) { - done += sizeof(ifr); + done += size; continue; } - if (len < (int) sizeof(ifr)) + if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); @@ -1212,13 +1208,12 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; - if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) { + if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } - buf += sizeof(struct ifreq); - len -= sizeof(struct ifreq); - done += sizeof(struct ifreq); + len -= size; + done += size; } out: return done; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 61fe6e4..296d0b9 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -121,14 +121,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x; + + if (xo && (xo->flags & XFRM_DEV_RESUME)) + x = skb->sp->xvec[skb->sp->len - 1]; + else + x = skb_dst(skb)->xfrm; tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); kfree(tmp); - xfrm_output_resume(skb, err); + + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + if (err) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); + kfree_skb(skb); + return; + } + + skb_push(skb, skb->data - skb_mac_header(skb)); + secpath_reset(skb); + xfrm_dev_resume(skb); + } else { + xfrm_output_resume(skb, err); + } } /* Move ESP header back into place. */ @@ -825,17 +843,13 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(aead_name, 0, mask); + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -865,7 +879,6 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; - u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -891,10 +904,7 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(authenc_name, 0, mask); + aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 29b333a..da5635f 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -109,78 +109,39 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { - __u32 seq; - int err = 0; - struct sk_buff *skb2; struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) - goto out; + return ERR_PTR(-EINVAL); if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) - goto out; - - seq = xo->seq.low; + return ERR_PTR(-EINVAL); x = skb->sp->xvec[skb->sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) - goto out; + return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) - goto out; + return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP)) + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - segs = x->outer_mode->gso_segment(x, skb, esp_features); - if (IS_ERR_OR_NULL(segs)) - goto out; - - __skb_pull(skb, skb->data - skb_mac_header(skb)); - - skb2 = segs; - do { - struct sk_buff *nskb = skb2->next; - - xo = xfrm_offload(skb2); - xo->flags |= XFRM_GSO_SEGMENT; - xo->seq.low = seq; - xo->seq.hi = xfrm_replay_seqhi(x, seq); - - if(!(features & NETIF_F_HW_ESP)) - xo->flags |= CRYPTO_FALLBACK; - - x->outer_mode->xmit(x, skb2); + xo->flags |= XFRM_GSO_SEGMENT; - err = x->type_offload->xmit(x, skb2, esp_features); - if (err) { - kfree_skb_list(segs); - return ERR_PTR(err); - } - - if (!skb_is_gso(skb2)) - seq++; - else - seq += skb_shinfo(skb2)->gso_segs; - - skb_push(skb2, skb2->mac_len); - skb2 = nskb; - } while (skb2); - -out: - return segs; + return x->outer_mode->gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -207,6 +168,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; + __u32 seq; esp.inplace = true; @@ -245,23 +207,30 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ return esp.nfrags; } + seq = xo->seq.low; + esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(xo->seq.low); - } else { - ip_hdr(skb)->tot_len = htons(skb->len); - ip_send_check(ip_hdr(skb)); + esph->seq_no = htonl(seq); + + if (!skb_is_gso(skb)) + xo->seq.low++; + else + xo->seq.low += skb_shinfo(skb)->gso_segs; } + esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); + + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); + if (hw_offload) return 0; - esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); - err = esp_output_tail(x, skb, &esp); if (err) return err; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 08259d0..f05afaf 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -587,10 +587,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, * Handle IP routing ioctl calls. * These are used to manipulate the routing tables */ -int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; - struct rtentry rt; int err; switch (cmd) { @@ -599,11 +598,8 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - if (copy_from_user(&rt, arg, sizeof(rt))) - return -EFAULT; - rtnl_lock(); - err = rtentry_to_fib_config(net, cmd, &rt, &cfg); + err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5ddc4aef..5530cd6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2334,7 +2334,6 @@ static int fib_triestat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_triestat_fops = { - .owner = THIS_MODULE, .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2521,7 +2520,6 @@ static int fib_trie_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_trie_fops = { - .owner = THIS_MODULE, .open = fib_trie_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2715,7 +2713,6 @@ static int fib_route_seq_open(struct inode *inode, struct file *file) } static const struct file_operations fib_route_fops = { - .owner = THIS_MODULE, .open = fib_route_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2d49717..10f7f74 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2832,7 +2832,6 @@ static int igmp_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp_mc_seq_fops = { - .owner = THIS_MODULE, .open = igmp_mc_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -2979,7 +2978,6 @@ static int igmp_mcf_seq_open(struct inode *inode, struct file *file) } static const struct file_operations igmp_mcf_seq_fops = { - .owner = THIS_MODULE, .open = igmp_mcf_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4ca46dc..12410ec 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -685,7 +685,7 @@ static void reqsk_timer_handler(struct timer_list *t) int max_retries, thresh; u8 defer_accept; - if (sk_state_load(sk_listener) != TCP_LISTEN) + if (inet_sk_state_load(sk_listener) != TCP_LISTEN) goto drop; max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; @@ -783,7 +783,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); - newsk->sk_state = TCP_SYN_RECV; + inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; @@ -877,7 +877,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) * It is OK, because this socket enters to hash table only * after validation is complete. */ - sk_state_store(sk, TCP_LISTEN); + inet_sk_state_store(sk, TCP_LISTEN); if (!sk->sk_prot->get_port(sk, inet->inet_num)) { inet->inet_sport = htons(inet->inet_num); @@ -888,7 +888,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) return 0; } - sk->sk_state = TCP_CLOSE; + inet_sk_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c9c35b6..a383f29 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -564,12 +564,18 @@ static int inet_diag_bc_run(const struct nlattr *_bc, case INET_DIAG_BC_JMP: yes = 0; break; + case INET_DIAG_BC_S_EQ: + yes = entry->sport == op[1].no; + break; case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: yes = entry->sport <= op[1].no; break; + case INET_DIAG_BC_D_EQ: + yes = entry->dport == op[1].no; + break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; @@ -802,8 +808,10 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_devcond(bc, len, &min_len)) return -EINVAL; break; + case INET_DIAG_BC_S_EQ: case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_EQ: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: if (!valid_port_comparison(bc, len, &min_len)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e7d15fb0..37b7da0 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> +#include <linux/bootmem.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> @@ -168,6 +169,60 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) } EXPORT_SYMBOL_GPL(__inet_inherit_port); +static struct inet_listen_hashbucket * +inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) +{ + u32 hash; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(sock_net(sk), + &sk->sk_v6_rcv_saddr, + inet_sk(sk)->inet_num); + else +#endif + hash = ipv4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + return inet_lhash2_bucket(h, hash); +} + +static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + if (sk->sk_reuseport && sk->sk_family == AF_INET6) + hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + else + hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); + ilb2->count++; + spin_unlock(&ilb2->lock); +} + +static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) +{ + struct inet_listen_hashbucket *ilb2; + + if (!h->lhash2 || + WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) + return; + + ilb2 = inet_lhash2_bucket_sk(h, sk); + + spin_lock(&ilb2->lock); + hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); + ilb2->count--; + spin_unlock(&ilb2->lock); +} + static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif, bool exact_dif) @@ -207,6 +262,40 @@ static inline int compute_score(struct sock *sk, struct net *net, */ /* called with rcu_read_lock() : No refcount taken on the socket */ +static struct sock *inet_lhash2_lookup(struct net *net, + struct inet_listen_hashbucket *ilb2, + struct sk_buff *skb, int doff, + const __be32 saddr, __be16 sport, + const __be32 daddr, const unsigned short hnum, + const int dif, const int sdif) +{ + bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_connection_sock *icsk; + struct sock *sk, *result = NULL; + int score, hiscore = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); + if (score > hiscore) { + if (sk->sk_reuseport) { + phash = inet_ehashfn(net, daddr, hnum, + saddr, sport); + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; + } + result = sk; + hiscore = score; + } + } + + return result; +} + struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -216,32 +305,57 @@ struct sock *__inet_lookup_listener(struct net *net, { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore = 0, matches = 0, reuseport = 0; bool exact_dif = inet_exact_dif_match(net, skb); + struct inet_listen_hashbucket *ilb2; struct sock *sk, *result = NULL; + int score, hiscore = 0; + unsigned int hash2; u32 phash = 0; + if (ilb->count <= 10 || !hashinfo->lhash2) + goto port_lookup; + + /* Too many sk in the ilb bucket (which is hashed by port alone). + * Try lhash2 (which is hashed by port and addr) instead. + */ + + hash2 = ipv4_portaddr_hash(net, daddr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + return result; + + /* Lookup lhash2 with INADDR_ANY */ + + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + if (ilb2->count > ilb->count) + goto port_lookup; + + return inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + +port_lookup: sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, phash, skb, doff); if (result) return result; - matches = 1; } result = sk; hiscore = score; - } else if (score == hiscore && reuseport) { - matches++; - if (reciprocal_scale(phash, matches) == 0) - result = sk; - phash = next_pseudo_random32(phash); } } return result; @@ -430,7 +544,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { percpu_counter_inc(sk->sk_prot->orphan_count); - sk->sk_state = TCP_CLOSE; + inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } @@ -483,6 +597,8 @@ int __inet_hash(struct sock *sk, struct sock *osk) hlist_add_tail_rcu(&sk->sk_node, &ilb->head); else hlist_add_head_rcu(&sk->sk_node, &ilb->head); + inet_hash2(hashinfo, sk); + ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: @@ -509,28 +625,35 @@ EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb; spinlock_t *lock; bool listener = false; - int done; if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { - lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &ilb->lock; listener = true; } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); } spin_lock_bh(lock); + if (sk_unhashed(sk)) + goto unlock; + if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (listener) - done = __sk_del_node_init(sk); - else - done = __sk_nulls_del_node_init_rcu(sk); - if (done) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + if (listener) { + inet_unhash2(hashinfo, sk); + __sk_del_node_init(sk); + ilb->count--; + } else { + __sk_nulls_del_node_init_rcu(sk); + } + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); +unlock: spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(inet_unhash); @@ -665,10 +788,37 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_HEAD(&h->listening_hash[i].head); + h->listening_hash[i].count = 0; } + + h->lhash2 = NULL; } EXPORT_SYMBOL_GPL(inet_hashinfo_init); +void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, + unsigned long numentries, int scale, + unsigned long low_limit, + unsigned long high_limit) +{ + unsigned int i; + + h->lhash2 = alloc_large_system_hash(name, + sizeof(*h->lhash2), + numentries, + scale, + 0, + NULL, + &h->lhash2_mask, + low_limit, + high_limit); + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_HEAD(&h->lhash2[i].head); + h->lhash2[i].count = 0; + } +} + int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index b563e0c..c3ea490 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -97,7 +97,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ -void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, +void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo) { const struct inet_sock *inet = inet_sk(sk); @@ -119,18 +119,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); - /* - * Step 2: Hash TW into tcp ehash chain. - * Notes : - * - tw_refcnt is set to 4 because : - * - We have one reference from bhash chain. - * - We have one reference from ehash chain. - * - We have one reference from timer. - * - One reference for ourself (our caller will release it). - * We can use atomic_set() because prior spin_lock()/spin_unlock() - * committed into memory all tw fields. - */ - refcount_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -138,8 +126,19 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(lock); + + /* tw_refcnt is set to 3 because we have : + * - one reference for bhash chain. + * - one reference for ehash chain. + * - one reference for timer. + * We can use atomic_set() because prior spin_lock()/spin_unlock() + * committed into memory all tw fields. + * Also note that after this point, we lost our implicit reference + * so we are not allowed to use tw anymore. + */ + refcount_set(&tw->tw_refcnt, 3); } -EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); +EXPORT_SYMBOL_GPL(inet_twsk_hashdance); static void tw_timer_handler(struct timer_list *t) { @@ -271,14 +270,14 @@ restart: continue; tw = inet_twsk(sk); if ((tw->tw_family != family) || - atomic_read(&twsk_net(tw)->count)) + refcount_read(&twsk_net(tw)->count)) continue; if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) continue; if (unlikely((tw->tw_family != family) || - atomic_read(&twsk_net(tw)->count))) { + refcount_read(&twsk_net(tw)->count))) { inet_twsk_put(tw); goto restart; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 45ffd3d..6ec670f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -114,7 +114,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate); + u32 id, u32 index, + bool truncate, bool is_ipv4); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; @@ -255,34 +256,43 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, { struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; + struct erspan_base_hdr *ershdr; + struct erspan_metadata *pkt_md; struct ip_tunnel_net *itn; struct ip_tunnel *tunnel; - struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 index; + int ver; int len; itn = net_generic(net, erspan_net_id); len = gre_hdr_len + sizeof(*ershdr); + /* Check based hdr len */ if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; iph = ip_hdr(skb); - ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = ershdr->ver; /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); - index = ershdr->md.index; + tpi->key = cpu_to_be32(get_session_id(ershdr)); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) + return PACKET_REJECT; + + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + pkt_md = (struct erspan_metadata *)(ershdr + 1); + if (__iptunnel_pull_header(skb, - gre_hdr_len + sizeof(*ershdr), + len, htons(ETH_P_TEB), false, false) < 0) goto drop; @@ -303,15 +313,21 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, return PACKET_REJECT; md = ip_tunnel_info_opts(&tun_dst->u.tun_info); - if (!md) - return PACKET_REJECT; + memcpy(md, pkt_md, sizeof(*md)); + md->version = ver; - md->index = index; info = &tun_dst->u.tun_info; info->key.tun_flags |= TUNNEL_ERSPAN_OPT; info->options_len = sizeof(*md); } else { - tunnel->index = ntohl(index); + tunnel->erspan_ver = ver; + if (ver == 1) { + tunnel->index = ntohl(pkt_md->u.index); + } else { + tunnel->dir = pkt_md->u.md2.dir; + tunnel->hwid = get_hwid(&pkt_md->u.md2); + } + } skb_reset_mac_header(skb); @@ -405,14 +421,17 @@ static int gre_rcv(struct sk_buff *skb) if (hdr_len < 0) goto drop; - if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || + tpi.proto == htons(ETH_P_ERSPAN2))) { if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; + goto out; } if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; +out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); @@ -560,6 +579,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, bool truncate = false; struct flowi4 fl; int tunnel_hlen; + int version; __be16 df; tun_info = skb_tunnel_info(skb); @@ -568,9 +588,13 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt; /* ERSPAN has fixed 8 byte GRE header */ - tunnel_hlen = 8 + sizeof(struct erspanhdr); + version = md->version; + tunnel_hlen = 8 + erspan_hdr_len(version); rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); if (!rt) @@ -584,12 +608,18 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, truncate = true; } - md = ip_tunnel_info_opts(tun_info); - if (!md) + if (version == 1) { + erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), + ntohl(md->u.index), truncate, true); + } else if (version == 2) { + erspan_build_header_v2(skb, + ntohl(tunnel_id_to_key32(key->tun_id)), + md->u.md2.dir, + get_hwid(&md->u.md2), + truncate, true); + } else { goto err_free_rt; - - erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), - ntohl(md->index), truncate); + } gre_build_header(skb, 8, TUNNEL_SEQ, htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); @@ -668,52 +698,6 @@ free_skb: return NETDEV_TX_OK; } -static inline u8 tos_to_cos(u8 tos) -{ - u8 dscp, cos; - - dscp = tos >> 2; - cos = dscp >> 3; - return cos; -} - -static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, bool truncate) -{ - struct iphdr *iphdr = ip_hdr(skb); - struct ethhdr *eth = eth_hdr(skb); - enum erspan_encap_type enc_type; - struct erspanhdr *ershdr; - struct qtag_prefix { - __be16 eth_type; - __be16 tci; - } *qp; - u16 vlan_tci = 0; - - enc_type = ERSPAN_ENCAP_NOVLAN; - - /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. - */ - if (eth->h_proto == htons(ETH_P_8021Q)) { - qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); - vlan_tci = ntohs(qp->tci); - enc_type = ERSPAN_ENCAP_INFRAME; - } - - skb_push(skb, sizeof(*ershdr)); - ershdr = (struct erspanhdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr)); - - ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | - (ERSPAN_VERSION << VER_OFFSET)); - ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | - ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | - (enc_type << EN_OFFSET & EN_MASK) | - ((truncate << T_OFFSET) & T_MASK)); - ershdr->md.index = htonl(index & INDEX_MASK); -} - static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -737,7 +721,15 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + if (tunnel->erspan_ver == 1) + erspan_build_header(skb, ntohl(tunnel->parms.o_key), + tunnel->index, + truncate, true); + else + erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), + tunnel->dir, tunnel->hwid, + truncate, true); + tunnel->parms.o_flags &= ~TUNNEL_KEY; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); return NETDEV_TX_OK; @@ -1209,13 +1201,32 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); - if (data[IFLA_GRE_ERSPAN_INDEX]) { - t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (data[IFLA_GRE_ERSPAN_VER]) { + t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->index & ~INDEX_MASK) + if (t->erspan_ver != 1 && t->erspan_ver != 2) return -EINVAL; } + if (t->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + } else if (t->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) { + t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (t->dir & ~(DIR_MASK >> DIR_OFFSET)) + return -EINVAL; + } + if (data[IFLA_GRE_ERSPAN_HWID]) { + t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + if (t->hwid & ~(HWID_MASK >> HWID_OFFSET)) + return -EINVAL; + } + } + return 0; } @@ -1282,7 +1293,7 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - sizeof(struct erspanhdr); + erspan_hdr_len(tunnel->erspan_ver); t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; @@ -1413,6 +1424,12 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_VER */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_DIR */ + nla_total_size(1) + + /* IFLA_GRE_ERSPAN_HWID */ + nla_total_size(2) + 0; } @@ -1455,9 +1472,18 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (t->index) + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; + } else if (t->erspan_ver == 2) { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } return 0; @@ -1493,6 +1519,9 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, + [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 60fb1eb..6cc70fa 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -808,6 +808,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, { struct net_device *dev = NULL; int ifindex; + int midx; if (optlen != sizeof(int)) goto e_inval; @@ -823,10 +824,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EADDRNOTAVAIL; if (!dev) break; + + midx = l3mdev_master_ifindex(dev); dev_put(dev); err = -EINVAL; - if (sk->sk_bound_dev_if) + if (sk->sk_bound_dev_if && + (!midx || midx != sk->sk_bound_dev_if)) break; inet->uc_index = ifindex; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 6d21068..d786a84 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -710,9 +710,16 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } } - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, - tunnel->fwmark); + if (tunnel->fwmark) { + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + tunnel->fwmark); + } + else { + init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + skb->mark); + } if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index abdebca..f75802a 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -329,39 +329,6 @@ set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port) sin->sin_port = port; } -static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg) -{ - int res; - - mm_segment_t oldfs = get_fs(); - set_fs(get_ds()); - res = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg); - set_fs(oldfs); - return res; -} - -static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) -{ - int res; - - mm_segment_t oldfs = get_fs(); - set_fs(get_ds()); - res = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg); - set_fs(oldfs); - return res; -} - -static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) -{ - int res; - - mm_segment_t oldfs = get_fs(); - set_fs(get_ds()); - res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg); - set_fs(oldfs); - return res; -} - /* * Set up interface addresses and routes. */ @@ -375,19 +342,19 @@ static int __init ic_setup_if(void) memset(&ir, 0, sizeof(ir)); strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name); set_sockaddr(sin, ic_myaddr, 0); - if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { + if ((err = devinet_ioctl(&init_net, SIOCSIFADDR, &ir)) < 0) { pr_err("IP-Config: Unable to set interface address (%d)\n", err); return -1; } set_sockaddr(sin, ic_netmask, 0); - if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { + if ((err = devinet_ioctl(&init_net, SIOCSIFNETMASK, &ir)) < 0) { pr_err("IP-Config: Unable to set interface netmask (%d)\n", err); return -1; } set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); - if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { + if ((err = devinet_ioctl(&init_net, SIOCSIFBRDADDR, &ir)) < 0) { pr_err("IP-Config: Unable to set interface broadcast address (%d)\n", err); return -1; @@ -397,11 +364,11 @@ static int __init ic_setup_if(void) * out, we'll try to muddle along. */ if (ic_dev_mtu != 0) { - strcpy(ir.ifr_name, ic_dev->dev->name); - ir.ifr_mtu = ic_dev_mtu; - if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) + rtnl_lock(); + if ((err = dev_set_mtu(ic_dev->dev, ic_dev_mtu)) < 0) pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n", ic_dev_mtu, err); + rtnl_unlock(); } return 0; } @@ -423,7 +390,7 @@ static int __init ic_setup_routes(void) set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0); set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); rm.rt_flags = RTF_UP | RTF_GATEWAY; - if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { + if ((err = ip_rt_ioctl(&init_net, SIOCADDRT, &rm)) < 0) { pr_err("IP-Config: Cannot add default route (%d)\n", err); return -1; @@ -1322,7 +1289,6 @@ static int pnp_seq_open(struct inode *indoe, struct file *file) } static const struct file_operations pnp_seq_fops = { - .owner = THIS_MODULE, .open = pnp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index fd5f19c..b05689b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3022,7 +3022,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) const char *name = vif->dev ? vif->dev->name : "none"; seq_printf(seq, - "%2zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, @@ -3045,7 +3045,6 @@ static int ipmr_vif_open(struct inode *inode, struct file *file) } static const struct file_operations ipmr_vif_fops = { - .owner = THIS_MODULE, .open = ipmr_vif_open, .read = seq_read, .llseek = seq_lseek, @@ -3198,7 +3197,6 @@ static int ipmr_mfc_open(struct inode *inode, struct file *file) } static const struct file_operations ipmr_mfc_fops = { - .owner = THIS_MODULE, .open = ipmr_mfc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c0cc6aa..e6774cc 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -80,35 +80,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t } EXPORT_SYMBOL(ip_route_me_harder); -/* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - -struct ip_rt_info { - __be32 daddr; - __be32 saddr; - u_int8_t tos; - u_int32_t mark; -}; - -static void nf_ip_saveroute(const struct sk_buff *skb, - struct nf_queue_entry *entry) -{ - struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct iphdr *iph = ip_hdr(skb); - - rt_info->tos = iph->tos; - rt_info->daddr = iph->daddr; - rt_info->saddr = iph->saddr; - rt_info->mark = skb->mark; - } -} - -static int nf_ip_reroute(struct net *net, struct sk_buff *skb, - const struct nf_queue_entry *entry) +int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -119,10 +91,12 @@ static int nf_ip_reroute(struct net *net, struct sk_buff *skb, skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) - return ip_route_me_harder(net, skb, RTN_UNSPEC); + return ip_route_me_harder(entry->state.net, skb, + RTN_UNSPEC); } return 0; } +EXPORT_SYMBOL_GPL(nf_ip_reroute); __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol) @@ -155,9 +129,9 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, } EXPORT_SYMBOL(nf_ip_checksum); -static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol) +__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; @@ -175,9 +149,10 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, } return csum; } +EXPORT_SYMBOL_GPL(nf_ip_checksum_partial); -static int nf_ip_route(struct net *net, struct dst_entry **dst, - struct flowi *fl, bool strict __always_unused) +int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, + bool strict __always_unused) { struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); if (IS_ERR(rt)) @@ -185,19 +160,4 @@ static int nf_ip_route(struct net *net, struct dst_entry **dst, *dst = &rt->dst; return 0; } - -static const struct nf_afinfo nf_ip_afinfo = { - .family = AF_INET, - .checksum = nf_ip_checksum, - .checksum_partial = nf_ip_checksum_partial, - .route = nf_ip_route, - .saveroute = nf_ip_saveroute, - .reroute = nf_ip_reroute, - .route_key_size = sizeof(struct ip_rt_info), -}; - -static int __init ipv4_netfilter_init(void) -{ - return nf_register_afinfo(&nf_ip_afinfo); -} -subsys_initcall(ipv4_netfilter_init); +EXPORT_SYMBOL_GPL(nf_ip_route); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c11eb17..5f52236 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -72,11 +72,21 @@ endif # NF_TABLES_IPV4 config NF_TABLES_ARP tristate "ARP nf_tables support" + select NETFILTER_FAMILY_ARP help This option enables the ARP support for nf_tables. endif # NF_TABLES +config NF_FLOW_TABLE_IPV4 + tristate "Netfilter flow table IPv4 module" + depends on NF_CONNTRACK && NF_TABLES + select NF_FLOW_TABLE + help + This option adds the flow table IPv4 support. + + To compile it as a module, choose M here. + config NF_DUP_IPV4 tristate "Netfilter IPv4 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK @@ -148,6 +158,7 @@ config NF_NAT_SNMP_BASIC depends on NF_CONNTRACK_SNMP depends on NETFILTER_ADVANCED default NF_NAT && NF_CONNTRACK_SNMP + select ASN1 ---help--- This module implements an Application Layer Gateway (ALG) for @@ -333,6 +344,7 @@ config IP_NF_TARGET_CLUSTERIP depends on NF_CONNTRACK_IPV4 depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK + select NETFILTER_FAMILY_ARP help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing @@ -392,6 +404,7 @@ endif # IP_NF_IPTABLES config IP_NF_ARPTABLES tristate "ARP tables support" select NETFILTER_XTABLES + select NETFILTER_FAMILY_ARP depends on NETFILTER_ADVANCED help arptables is a general, extensible packet identification framework. diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index adcdae3..2dad20e 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -27,9 +27,15 @@ obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o # NAT helpers (nf_conntrack) obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o + +nf_nat_snmp_basic-y := nf_nat_snmp_basic-asn1.o nf_nat_snmp_basic_main.o +nf_nat_snmp_basic-y : nf_nat_snmp_basic-asn1.h nf_nat_snmp_basic-asn1.c obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o +clean-files := nf_nat_snmp_basic-asn1.c nf_nat_snmp_basic-asn1.h + obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o + # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o @@ -43,6 +49,9 @@ obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o +# flow table support +obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o + # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index eb8246c..4ffe302 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -805,9 +805,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(NFPROTO_ARP); #endif - t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), - "arptable_%s", name); - if (t) { + t = xt_request_find_table_lock(net, NFPROTO_ARP, name); + if (!IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -836,7 +835,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(NFPROTO_ARP); @@ -861,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) @@ -873,7 +872,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -898,10 +897,9 @@ static int __do_replace(struct net *net, const char *name, goto out; } - t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), - "arptable_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, NFPROTO_ARP, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1015,8 +1013,8 @@ static int do_add_counters(struct net *net, const void __user *user, return PTR_ERR(paddc); t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1403,7 +1401,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; @@ -1418,7 +1416,7 @@ static int compat_get_entries(struct net *net, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(NFPROTO_ARP); return ret; @@ -1653,7 +1651,6 @@ static int __init arp_tables_init(void) if (ret < 0) goto err4; - pr_info("arp_tables: (C) 2002 David S. Miller\n"); return 0; err4: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index cc984d0..9a71f31 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -968,9 +968,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(AF_INET); #endif - t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), - "iptable_%s", name); - if (t) { + t = xt_request_find_table_lock(net, AF_INET, name); + if (!IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1000,7 +999,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(AF_INET); @@ -1025,7 +1024,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1036,7 +1035,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -1059,10 +1058,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto out; } - t = try_then_request_module(xt_find_table_lock(net, AF_INET, name), - "iptable_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, AF_INET, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1176,8 +1174,8 @@ do_add_counters(struct net *net, const void __user *user, return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1620,7 +1618,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); @@ -1634,7 +1632,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(AF_INET); return ret; @@ -1936,7 +1934,6 @@ static int __init ip_tables_init(void) if (ret < 0) goto err5; - pr_info("(C) 2000-2006 Netfilter Core Team\n"); return 0; err5: diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 69060e3..c29a6ca 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -776,7 +776,6 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, } static const struct file_operations clusterip_proc_fops = { - .owner = THIS_MODULE, .open = clusterip_proc_open, .read = seq_read, .write = clusterip_proc_write, diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 7667f22..9ac92ea 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -38,12 +38,6 @@ static unsigned int iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* root is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index aebdb33..dea138ca 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -49,11 +49,6 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) u_int32_t mark; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - /* Save things which could affect route */ mark = skb->mark; iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index a1a07b3..0f7255c 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -72,6 +72,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_in, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, @@ -79,6 +80,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_out, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, @@ -86,6 +88,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, @@ -93,6 +96,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { .hook = iptable_nat_ipv4_fn, .pf = NFPROTO_IPV4, + .nat_hook = true, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 2642ecd..960625a 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -3,6 +3,7 @@ * * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/slab.h> @@ -12,6 +13,10 @@ static int __net_init iptable_raw_table_init(struct net *net); +static bool raw_before_defrag __read_mostly; +MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); +module_param(raw_before_defrag, bool, 0000); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, @@ -21,17 +26,20 @@ static const struct xt_table packet_raw = { .table_init = iptable_raw_table_init, }; +static const struct xt_table packet_raw_before_defrag = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_RAW_BEFORE_DEFRAG, + .table_init = iptable_raw_table_init, +}; + /* The work comes in here from netfilter.c. */ static unsigned int iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* root is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); } @@ -40,15 +48,19 @@ static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; + const struct xt_table *table = &packet_raw; int ret; + if (raw_before_defrag) + table = &packet_raw_before_defrag; + if (net->ipv4.iptable_raw) return 0; - repl = ipt_alloc_initial_table(&packet_raw); + repl = ipt_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops, + ret = ipt_register_table(net, table, repl, rawtable_ops, &net->ipv4.iptable_raw); kfree(repl); return ret; @@ -69,8 +81,15 @@ static struct pernet_operations iptable_raw_net_ops = { static int __init iptable_raw_init(void) { int ret; + const struct xt_table *table = &packet_raw; + + if (raw_before_defrag) { + table = &packet_raw_before_defrag; + + pr_info("Enabling raw table before defrag\n"); + } - rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); + rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook); if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index ff22659..e5379fe 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -43,12 +43,6 @@ static unsigned int iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (state->hook == NF_INET_LOCAL_OUT && - (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr))) - /* Somebody is playing with raw sockets. */ - return NF_ACCEPT; - return ipt_do_table(skb, state, state->net->ipv4.iptable_security); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 89af9d8..de213a3 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -154,11 +154,6 @@ static unsigned int ipv4_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - if (ip_is_fragment(ip_hdr(skb))) /* IP_NODEFRAG setsockopt set */ return NF_ACCEPT; @@ -368,7 +363,7 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); MODULE_ALIAS("ip_conntrack"); MODULE_LICENSE("GPL"); -static struct nf_conntrack_l4proto *builtin_l4proto4[] = { +static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = { &nf_conntrack_l4proto_tcp4, &nf_conntrack_l4proto_udp4, &nf_conntrack_l4proto_icmp, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 1849fed..5c15bea 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -22,7 +22,7 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> -static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; +static const unsigned int nf_ct_icmp_timeout = 30*HZ; static inline struct nf_icmp_net *icmp_pernet(struct net *net) { @@ -351,7 +351,7 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.icmp.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 37fe1616..a0d3ad6 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -78,6 +78,8 @@ static unsigned int ipv4_conntrack_defrag(void *priv, if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif + if (skb->_nfct == IP_CT_UNTRACKED) + return NF_ACCEPT; #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c new file mode 100644 index 0000000..b2d01eb --- /dev/null +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -0,0 +1,284 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/ip.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/neighbour.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> +/* For layer 4 checksum field offset. */ +#include <linux/tcp.h> +#include <linux/udp.h> + +static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); + + return 0; +} + +static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, __be32 addr, + __be32 new_addr) +{ + switch (iph->protocol) { + case IPPROTO_TCP: + if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + default: + return -1; + } + csum_replace4(&iph->check, addr, new_addr); + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, + enum flow_offload_tuple_dir dir) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int thoff = iph->ihl * 4; + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + + return 0; +} + +static bool ip_has_options(unsigned int thoff) +{ + return thoff != sizeof(struct iphdr); +} + +static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + unsigned int thoff; + struct iphdr *iph; + + if (!pskb_may_pull(skb, sizeof(*iph))) + return -1; + + iph = ip_hdr(skb); + thoff = iph->ihl * 4; + + if (ip_is_fragment(iph) || + unlikely(ip_has_options(thoff))) + return -1; + + if (iph->protocol != IPPROTO_TCP && + iph->protocol != IPPROTO_UDP) + return -1; + + thoff = iph->ihl * 4; + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v4.s_addr = iph->saddr; + tuple->dst_v4.s_addr = iph->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET; + tuple->l4proto = iph->protocol; + tuple->iifidx = dev->ifindex; + + return 0; +} + +/* Based on ip_exceeds_mtu(). */ +static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + return false; + + return true; +} + +static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt) +{ + u32 mtu; + + mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); + if (__nf_flow_exceeds_mtu(skb, mtu)) + return true; + + return false; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + const struct rtable *rt; + struct iphdr *iph; + __be32 nexthop; + + if (skb->protocol != htons(ETH_P_IP)) + return NF_ACCEPT; + + if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + + rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*iph))) + return NF_DROP; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ip(flow, skb, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + iph = ip_hdr(skb); + ip_decrease_ttl(iph); + + skb->dev = outdev; + nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); + +static struct nf_flowtable_type flowtable_ipv4 = { + .family = NFPROTO_IPV4, + .params = &nf_flow_offload_rhash_params, + .gc = nf_flow_offload_work_gc, + .hook = nf_flow_offload_ip_hook, + .owner = THIS_MODULE, +}; + +static int __init nf_flow_ipv4_module_init(void) +{ + nft_register_flowtable_type(&flowtable_ipv4); + + return 0; +} + +static void __exit nf_flow_ipv4_module_exit(void) +{ + nft_unregister_flowtable_type(&flowtable_ipv4); +} + +module_init(nf_flow_ipv4_module_init); +module_exit(nf_flow_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 0443ca4..f7ff6a36 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -356,11 +356,6 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb, #endif unsigned int ret; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && @@ -396,11 +391,6 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, unsigned int ret; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 new file mode 100644 index 0000000..24b7326 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 @@ -0,0 +1,177 @@ +Message ::= + SEQUENCE { + version + INTEGER ({snmp_version}), + + community + OCTET STRING, + + pdu + PDUs + } + + +ObjectName ::= + OBJECT IDENTIFIER + +ObjectSyntax ::= + CHOICE { + simple + SimpleSyntax, + + application-wide + ApplicationSyntax + } + +SimpleSyntax ::= + CHOICE { + integer-value + INTEGER, + + string-value + OCTET STRING, + + objectID-value + OBJECT IDENTIFIER + } + +ApplicationSyntax ::= + CHOICE { + ipAddress-value + IpAddress, + + counter-value + Counter32, + + timeticks-value + TimeTicks, + + arbitrary-value + Opaque, + + big-counter-value + Counter64, + + unsigned-integer-value + Unsigned32 + } + +IpAddress ::= + [APPLICATION 0] + IMPLICIT OCTET STRING OPTIONAL ({snmp_helper}) + +Counter32 ::= + [APPLICATION 1] + IMPLICIT INTEGER OPTIONAL + +Unsigned32 ::= + [APPLICATION 2] + IMPLICIT INTEGER OPTIONAL + +Gauge32 ::= Unsigned32 OPTIONAL + +TimeTicks ::= + [APPLICATION 3] + IMPLICIT INTEGER OPTIONAL + +Opaque ::= + [APPLICATION 4] + IMPLICIT OCTET STRING OPTIONAL + +Counter64 ::= + [APPLICATION 6] + IMPLICIT INTEGER OPTIONAL + +PDUs ::= + CHOICE { + get-request + GetRequest-PDU, + + get-next-request + GetNextRequest-PDU, + + get-bulk-request + GetBulkRequest-PDU, + + response + Response-PDU, + + set-request + SetRequest-PDU, + + inform-request + InformRequest-PDU, + + snmpV2-trap + SNMPv2-Trap-PDU, + + report + Report-PDU + } + +GetRequest-PDU ::= + [0] IMPLICIT PDU OPTIONAL + +GetNextRequest-PDU ::= + [1] IMPLICIT PDU OPTIONAL + +Response-PDU ::= + [2] IMPLICIT PDU OPTIONAL + +SetRequest-PDU ::= + [3] IMPLICIT PDU OPTIONAL + +-- [4] is obsolete + +GetBulkRequest-PDU ::= + [5] IMPLICIT PDU OPTIONAL + +InformRequest-PDU ::= + [6] IMPLICIT PDU OPTIONAL + +SNMPv2-Trap-PDU ::= + [7] IMPLICIT PDU OPTIONAL + +Report-PDU ::= + [8] IMPLICIT PDU OPTIONAL + +PDU ::= + SEQUENCE { + request-id + INTEGER, + + error-status + INTEGER, + + error-index + INTEGER, + + variable-bindings + VarBindList + } + + +VarBind ::= + SEQUENCE { + name + ObjectName, + + CHOICE { + value + ObjectSyntax, + + unSpecified + NULL, + + noSuchObject + [0] IMPLICIT NULL, + + noSuchInstance + [1] IMPLICIT NULL, + + endOfMibView + [2] IMPLICIT NULL + } +} + +VarBindList ::= SEQUENCE OF VarBind diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c deleted file mode 100644 index d5b1e0b..0000000 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ /dev/null @@ -1,1286 +0,0 @@ -/* - * nf_nat_snmp_basic.c - * - * Basic SNMP Application Layer Gateway - * - * This IP NAT module is intended for use with SNMP network - * discovery and monitoring applications where target networks use - * conflicting private address realms. - * - * Static NAT is used to remap the networks from the view of the network - * management system at the IP layer, and this module remaps some application - * layer addresses to match. - * - * The simplest form of ALG is performed, where only tagged IP addresses - * are modified. The module does not need to be MIB aware and only scans - * messages at the ASN.1/BER level. - * - * Currently, only SNMPv1 and SNMPv2 are supported. - * - * More information on ALG and associated issues can be found in - * RFC 2962 - * - * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory - * McLean & Jochen Friedrich, stripped down for use in the kernel. - * - * Copyright (c) 2000 RP Internet (www.rpi.net.au). - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. - * - * Author: James Morris <jmorris@intercode.com.au> - * - * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> - */ -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <net/checksum.h> -#include <net/udp.h> - -#include <net/netfilter/nf_nat.h> -#include <net/netfilter/nf_conntrack_expect.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_nat_helper.h> -#include <linux/netfilter/nf_conntrack_snmp.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); -MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); -MODULE_ALIAS("ip_nat_snmp_basic"); - -#define SNMP_PORT 161 -#define SNMP_TRAP_PORT 162 -#define NOCT1(n) (*(u8 *)(n)) - -static int debug; -static DEFINE_SPINLOCK(snmp_lock); - -/* - * Application layer address mapping mimics the NAT mapping, but - * only for the first octet in this case (a more flexible system - * can be implemented if needed). - */ -struct oct1_map -{ - u_int8_t from; - u_int8_t to; -}; - - -/***************************************************************************** - * - * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* Class */ -#define ASN1_UNI 0 /* Universal */ -#define ASN1_APL 1 /* Application */ -#define ASN1_CTX 2 /* Context */ -#define ASN1_PRV 3 /* Private */ - -/* Tag */ -#define ASN1_EOC 0 /* End Of Contents */ -#define ASN1_BOL 1 /* Boolean */ -#define ASN1_INT 2 /* Integer */ -#define ASN1_BTS 3 /* Bit String */ -#define ASN1_OTS 4 /* Octet String */ -#define ASN1_NUL 5 /* Null */ -#define ASN1_OJI 6 /* Object Identifier */ -#define ASN1_OJD 7 /* Object Description */ -#define ASN1_EXT 8 /* External */ -#define ASN1_SEQ 16 /* Sequence */ -#define ASN1_SET 17 /* Set */ -#define ASN1_NUMSTR 18 /* Numerical String */ -#define ASN1_PRNSTR 19 /* Printable String */ -#define ASN1_TEXSTR 20 /* Teletext String */ -#define ASN1_VIDSTR 21 /* Video String */ -#define ASN1_IA5STR 22 /* IA5 String */ -#define ASN1_UNITIM 23 /* Universal Time */ -#define ASN1_GENTIM 24 /* General Time */ -#define ASN1_GRASTR 25 /* Graphical String */ -#define ASN1_VISSTR 26 /* Visible String */ -#define ASN1_GENSTR 27 /* General String */ - -/* Primitive / Constructed methods*/ -#define ASN1_PRI 0 /* Primitive */ -#define ASN1_CON 1 /* Constructed */ - -/* - * Error codes. - */ -#define ASN1_ERR_NOERROR 0 -#define ASN1_ERR_DEC_EMPTY 2 -#define ASN1_ERR_DEC_EOC_MISMATCH 3 -#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 -#define ASN1_ERR_DEC_BADVALUE 5 - -/* - * ASN.1 context. - */ -struct asn1_ctx -{ - int error; /* Error condition */ - unsigned char *pointer; /* Octet just to be decoded */ - unsigned char *begin; /* First octet */ - unsigned char *end; /* Octet after last octet */ -}; - -/* - * Octet string (not null terminated) - */ -struct asn1_octstr -{ - unsigned char *data; - unsigned int len; -}; - -static void asn1_open(struct asn1_ctx *ctx, - unsigned char *buf, - unsigned int len) -{ - ctx->begin = buf; - ctx->end = buf + len; - ctx->pointer = buf; - ctx->error = ASN1_ERR_NOERROR; -} - -static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) -{ - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - *ch = *(ctx->pointer)++; - return 1; -} - -static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) -{ - unsigned char ch; - - *tag = 0; - - do - { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *tag <<= 7; - *tag |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_id_decode(struct asn1_ctx *ctx, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned char ch; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *cls = (ch & 0xC0) >> 6; - *con = (ch & 0x20) >> 5; - *tag = (ch & 0x1F); - - if (*tag == 0x1F) { - if (!asn1_tag_decode(ctx, tag)) - return 0; - } - return 1; -} - -static unsigned char asn1_length_decode(struct asn1_ctx *ctx, - unsigned int *def, - unsigned int *len) -{ - unsigned char ch, cnt; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch == 0x80) - *def = 0; - else { - *def = 1; - - if (ch < 0x80) - *len = ch; - else { - cnt = ch & 0x7F; - *len = 0; - - while (cnt > 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *len <<= 8; - *len |= ch; - cnt--; - } - } - } - - /* don't trust len bigger than ctx buffer */ - if (*len > ctx->end - ctx->pointer) - return 0; - - return 1; -} - -static unsigned char asn1_header_decode(struct asn1_ctx *ctx, - unsigned char **eoc, - unsigned int *cls, - unsigned int *con, - unsigned int *tag) -{ - unsigned int def, len; - - if (!asn1_id_decode(ctx, cls, con, tag)) - return 0; - - def = len = 0; - if (!asn1_length_decode(ctx, &def, &len)) - return 0; - - /* primitive shall be definite, indefinite shall be constructed */ - if (*con == ASN1_PRI && !def) - return 0; - - if (def) - *eoc = ctx->pointer + len; - else - *eoc = NULL; - return 1; -} - -static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - unsigned char ch; - - if (eoc == NULL) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - return 1; - } else { - if (ctx->pointer != eoc) { - ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; - return 0; - } - return 1; - } -} - -static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) -{ - ctx->pointer = eoc; - return 1; -} - -static unsigned char asn1_long_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = (signed char) ch; - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned int *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned int)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) len = 0; - else len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof (unsigned long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned char **octets, - unsigned int *len) -{ - unsigned char *ptr; - - *len = 0; - - *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); - if (*octets == NULL) - return 0; - - ptr = *octets; - while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, ptr++)) { - kfree(*octets); - *octets = NULL; - return 0; - } - (*len)++; - } - return 1; -} - -static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, - unsigned long *subid) -{ - unsigned char ch; - - *subid = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long **oid, - unsigned int *len) -{ - unsigned long subid; - unsigned long *optr; - size_t size; - - size = eoc - ctx->pointer + 1; - - /* first subid actually encodes first two subids */ - if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) - return 0; - - *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); - if (*oid == NULL) - return 0; - - optr = *oid; - - if (!asn1_subid_decode(ctx, &subid)) { - kfree(*oid); - *oid = NULL; - return 0; - } - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *len = 2; - optr += 2; - - while (ctx->pointer < eoc) { - if (++(*len) > size) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - kfree(*oid); - *oid = NULL; - return 0; - } - - if (!asn1_subid_decode(ctx, optr++)) { - kfree(*oid); - *oid = NULL; - return 0; - } - } - return 1; -} - -/***************************************************************************** - * - * SNMP decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* SNMP Versions */ -#define SNMP_V1 0 -#define SNMP_V2C 1 -#define SNMP_V2 2 -#define SNMP_V3 3 - -/* Default Sizes */ -#define SNMP_SIZE_COMM 256 -#define SNMP_SIZE_OBJECTID 128 -#define SNMP_SIZE_BUFCHR 256 -#define SNMP_SIZE_BUFINT 128 -#define SNMP_SIZE_SMALLOBJECTID 16 - -/* Requests */ -#define SNMP_PDU_GET 0 -#define SNMP_PDU_NEXT 1 -#define SNMP_PDU_RESPONSE 2 -#define SNMP_PDU_SET 3 -#define SNMP_PDU_TRAP1 4 -#define SNMP_PDU_BULK 5 -#define SNMP_PDU_INFORM 6 -#define SNMP_PDU_TRAP2 7 - -/* Errors */ -#define SNMP_NOERROR 0 -#define SNMP_TOOBIG 1 -#define SNMP_NOSUCHNAME 2 -#define SNMP_BADVALUE 3 -#define SNMP_READONLY 4 -#define SNMP_GENERROR 5 -#define SNMP_NOACCESS 6 -#define SNMP_WRONGTYPE 7 -#define SNMP_WRONGLENGTH 8 -#define SNMP_WRONGENCODING 9 -#define SNMP_WRONGVALUE 10 -#define SNMP_NOCREATION 11 -#define SNMP_INCONSISTENTVALUE 12 -#define SNMP_RESOURCEUNAVAILABLE 13 -#define SNMP_COMMITFAILED 14 -#define SNMP_UNDOFAILED 15 -#define SNMP_AUTHORIZATIONERROR 16 -#define SNMP_NOTWRITABLE 17 -#define SNMP_INCONSISTENTNAME 18 - -/* General SNMP V1 Traps */ -#define SNMP_TRAP_COLDSTART 0 -#define SNMP_TRAP_WARMSTART 1 -#define SNMP_TRAP_LINKDOWN 2 -#define SNMP_TRAP_LINKUP 3 -#define SNMP_TRAP_AUTFAILURE 4 -#define SNMP_TRAP_EQPNEIGHBORLOSS 5 -#define SNMP_TRAP_ENTSPECIFIC 6 - -/* SNMPv1 Types */ -#define SNMP_NULL 0 -#define SNMP_INTEGER 1 /* l */ -#define SNMP_OCTETSTR 2 /* c */ -#define SNMP_DISPLAYSTR 2 /* c */ -#define SNMP_OBJECTID 3 /* ul */ -#define SNMP_IPADDR 4 /* uc */ -#define SNMP_COUNTER 5 /* ul */ -#define SNMP_GAUGE 6 /* ul */ -#define SNMP_TIMETICKS 7 /* ul */ -#define SNMP_OPAQUE 8 /* c */ - -/* Additional SNMPv2 Types */ -#define SNMP_UINTEGER 5 /* ul */ -#define SNMP_BITSTR 9 /* uc */ -#define SNMP_NSAP 10 /* uc */ -#define SNMP_COUNTER64 11 /* ul */ -#define SNMP_NOSUCHOBJECT 12 -#define SNMP_NOSUCHINSTANCE 13 -#define SNMP_ENDOFMIBVIEW 14 - -union snmp_syntax -{ - unsigned char uc[0]; /* 8 bit unsigned */ - char c[0]; /* 8 bit signed */ - unsigned long ul[0]; /* 32 bit unsigned */ - long l[0]; /* 32 bit signed */ -}; - -struct snmp_object -{ - unsigned long *id; - unsigned int id_len; - unsigned short type; - unsigned int syntax_len; - union snmp_syntax syntax; -}; - -struct snmp_request -{ - unsigned long id; - unsigned int error_status; - unsigned int error_index; -}; - -struct snmp_v1_trap -{ - unsigned long *id; - unsigned int id_len; - unsigned long ip_address; /* pointer */ - unsigned int general; - unsigned int specific; - unsigned long time; -}; - -/* SNMP types */ -#define SNMP_IPA 0 -#define SNMP_CNT 1 -#define SNMP_GGE 2 -#define SNMP_TIT 3 -#define SNMP_OPQ 4 -#define SNMP_C64 6 - -/* SNMP errors */ -#define SERR_NSO 0 -#define SERR_NSI 1 -#define SERR_EOM 2 - -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check); -struct snmp_cnv -{ - unsigned int class; - unsigned int tag; - int syntax; -}; - -static const struct snmp_cnv snmp_conv[] = { - {ASN1_UNI, ASN1_NUL, SNMP_NULL}, - {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, - {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, - {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, - {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, - {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, - {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ - {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ - {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, - {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, - - /* SNMPv2 data types and errors */ - {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, - {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, - {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, - {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, - {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, - {0, 0, -1} -}; - -static unsigned char snmp_tag_cls2syntax(unsigned int tag, - unsigned int cls, - unsigned short *syntax) -{ - const struct snmp_cnv *cnv; - - cnv = snmp_conv; - - while (cnv->syntax != -1) { - if (cnv->tag == tag && cnv->class == cls) { - *syntax = cnv->syntax; - return 1; - } - cnv++; - } - return 0; -} - -static unsigned char snmp_object_decode(struct asn1_ctx *ctx, - struct snmp_object **obj) -{ - unsigned int cls, con, tag, len, idlen; - unsigned short type; - unsigned char *eoc, *end, *p; - unsigned long *lp, *id; - unsigned long ul; - long l; - - *obj = NULL; - id = NULL; - - if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &id, &idlen)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { - kfree(id); - return 0; - } - - if (con != ASN1_PRI) { - kfree(id); - return 0; - } - - type = 0; - if (!snmp_tag_cls2syntax(tag, cls, &type)) { - kfree(id); - return 0; - } - - l = 0; - switch (type) { - case SNMP_INTEGER: - len = sizeof(long); - if (!asn1_long_decode(ctx, end, &l)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - (*obj)->syntax.l[0] = l; - break; - case SNMP_OCTETSTR: - case SNMP_OPAQUE: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.c, p, len); - kfree(p); - break; - case SNMP_NULL: - case SNMP_NOSUCHOBJECT: - case SNMP_NOSUCHINSTANCE: - case SNMP_ENDOFMIBVIEW: - len = 0; - *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - if (!asn1_null_decode(ctx, end)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - break; - case SNMP_OBJECTID: - if (!asn1_oid_decode(ctx, end, &lp, &len)) { - kfree(id); - return 0; - } - len *= sizeof(unsigned long); - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(lp); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.ul, lp, len); - kfree(lp); - break; - case SNMP_IPADDR: - if (!asn1_octets_decode(ctx, end, &p, &len)) { - kfree(id); - return 0; - } - if (len != 4) { - kfree(p); - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(p); - kfree(id); - return 0; - } - memcpy((*obj)->syntax.uc, p, len); - kfree(p); - break; - case SNMP_COUNTER: - case SNMP_GAUGE: - case SNMP_TIMETICKS: - len = sizeof(unsigned long); - if (!asn1_ulong_decode(ctx, end, &ul)) { - kfree(id); - return 0; - } - *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); - if (*obj == NULL) { - kfree(id); - return 0; - } - (*obj)->syntax.ul[0] = ul; - break; - default: - kfree(id); - return 0; - } - - (*obj)->syntax_len = len; - (*obj)->type = type; - (*obj)->id = id; - (*obj)->id_len = idlen; - - if (!asn1_eoc_decode(ctx, eoc)) { - kfree(id); - kfree(*obj); - *obj = NULL; - return 0; - } - return 1; -} - -static unsigned char noinline_for_stack -snmp_request_decode(struct asn1_ctx *ctx, struct snmp_request *request) -{ - unsigned int cls, con, tag; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_ulong_decode(ctx, end, &request->id)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_status)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - - if (!asn1_uint_decode(ctx, end, &request->error_index)) - return 0; - - return 1; -} - -/* - * Fast checksum update for possibly oddly-aligned UDP byte, from the - * code example in the draft. - */ -static void fast_csum(__sum16 *csum, - const unsigned char *optr, - const unsigned char *nptr, - int offset) -{ - unsigned char s[4]; - - if (offset & 1) { - s[0] = ~0; - s[1] = ~*optr; - s[2] = 0; - s[3] = *nptr; - } else { - s[0] = ~*optr; - s[1] = ~0; - s[2] = *nptr; - s[3] = 0; - } - - *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); -} - -/* - * Mangle IP address. - * - begin points to the start of the snmp messgae - * - addr points to the start of the address - */ -static inline void mangle_address(unsigned char *begin, - unsigned char *addr, - const struct oct1_map *map, - __sum16 *check) -{ - if (map->from == NOCT1(addr)) { - u_int32_t old; - - if (debug) - memcpy(&old, addr, sizeof(old)); - - *addr = map->to; - - /* Update UDP checksum if being used */ - if (*check) { - fast_csum(check, - &map->from, &map->to, addr - begin); - - } - - if (debug) - printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n", - &old, addr); - } -} - -static unsigned char noinline_for_stack -snmp_trap_decode(struct asn1_ctx *ctx, struct snmp_v1_trap *trap, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned int cls, con, tag, len; - unsigned char *end; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) - return 0; - - if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) - return 0; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_id_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) - goto err_id_free; - - if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) - goto err_id_free; - - /* IPv4 only */ - if (len != 4) - goto err_addr_free; - - mangle_address(ctx->begin, ctx->pointer - 4, map, check); - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->general)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - goto err_addr_free; - - if (!asn1_uint_decode(ctx, end, &trap->specific)) - goto err_addr_free; - - if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) - goto err_addr_free; - - if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || - (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) - goto err_addr_free; - - if (!asn1_ulong_decode(ctx, end, &trap->time)) - goto err_addr_free; - - return 1; - -err_addr_free: - kfree((unsigned long *)trap->ip_address); - -err_id_free: - kfree(trap->id); - - return 0; -} - -/***************************************************************************** - * - * Misc. routines - * - *****************************************************************************/ - -/* - * Parse and mangle SNMP message according to mapping. - * (And this is the fucking 'basic' method). - */ -static int snmp_parse_mangle(unsigned char *msg, - u_int16_t len, - const struct oct1_map *map, - __sum16 *check) -{ - unsigned char *eoc, *end; - unsigned int cls, con, tag, vers, pdutype; - struct asn1_ctx ctx; - struct asn1_octstr comm; - struct snmp_object *obj; - - if (debug > 1) - print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 1, - msg, len, 0); - - asn1_open(&ctx, msg, len); - - /* - * Start of SNMP message. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - /* - * Version 1 or 2 handled. - */ - if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) - return 0; - if (!asn1_uint_decode (&ctx, end, &vers)) - return 0; - if (debug > 1) - pr_debug("bsalg: snmp version: %u\n", vers + 1); - if (vers > 1) - return 1; - - /* - * Community. - */ - if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) - return 0; - if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) - return 0; - if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) - return 0; - if (debug > 1) { - unsigned int i; - - pr_debug("bsalg: community: "); - for (i = 0; i < comm.len; i++) - pr_cont("%c", comm.data[i]); - pr_cont("\n"); - } - kfree(comm.data); - - /* - * PDU type - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) - return 0; - if (cls != ASN1_CTX || con != ASN1_CON) - return 0; - if (debug > 1) { - static const unsigned char *const pdus[] = { - [SNMP_PDU_GET] = "get", - [SNMP_PDU_NEXT] = "get-next", - [SNMP_PDU_RESPONSE] = "response", - [SNMP_PDU_SET] = "set", - [SNMP_PDU_TRAP1] = "trapv1", - [SNMP_PDU_BULK] = "bulk", - [SNMP_PDU_INFORM] = "inform", - [SNMP_PDU_TRAP2] = "trapv2" - }; - - if (pdutype > SNMP_PDU_TRAP2) - pr_debug("bsalg: bad pdu type %u\n", pdutype); - else - pr_debug("bsalg: pdu: %s\n", pdus[pdutype]); - } - if (pdutype != SNMP_PDU_RESPONSE && - pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) - return 1; - - /* - * Request header or v1 trap - */ - if (pdutype == SNMP_PDU_TRAP1) { - struct snmp_v1_trap trap; - unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); - - if (ret) { - kfree(trap.id); - kfree((unsigned long *)trap.ip_address); - } else - return ret; - - } else { - struct snmp_request req; - - if (!snmp_request_decode(&ctx, &req)) - return 0; - - if (debug > 1) - pr_debug("bsalg: request: id=0x%lx error_status=%u " - "error_index=%u\n", req.id, req.error_status, - req.error_index); - } - - /* - * Loop through objects, look for IP addresses to mangle. - */ - if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) - return 0; - - if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) - return 0; - - while (!asn1_eoc_decode(&ctx, eoc)) { - unsigned int i; - - if (!snmp_object_decode(&ctx, &obj)) { - if (obj) { - kfree(obj->id); - kfree(obj); - } - return 0; - } - - if (debug > 1) { - pr_debug("bsalg: object: "); - for (i = 0; i < obj->id_len; i++) { - if (i > 0) - pr_cont("."); - pr_cont("%lu", obj->id[i]); - } - pr_cont(": type=%u\n", obj->type); - - } - - if (obj->type == SNMP_IPADDR) - mangle_address(ctx.begin, ctx.pointer - 4, map, check); - - kfree(obj->id); - kfree(obj); - } - - if (!asn1_eoc_decode(&ctx, eoc)) - return 0; - - return 1; -} - -/***************************************************************************** - * - * NAT routines. - * - *****************************************************************************/ - -/* - * SNMP translation routine. - */ -static int snmp_translate(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); - u_int16_t udplen = ntohs(udph->len); - u_int16_t paylen = udplen - sizeof(struct udphdr); - int dir = CTINFO2DIR(ctinfo); - struct oct1_map map; - - /* - * Determine mappping for application layer addresses based - * on NAT manipulations for the packet. - */ - if (dir == IP_CT_DIR_ORIGINAL) { - /* SNAT traps */ - map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip); - } else { - /* DNAT replies */ - map.from = NOCT1(&ct->tuplehash[!dir].tuple.src.u3.ip); - map.to = NOCT1(&ct->tuplehash[dir].tuple.dst.u3.ip); - } - - if (map.from == map.to) - return NF_ACCEPT; - - if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), - paylen, &map, &udph->check)) { - net_warn_ratelimited("bsalg: parser failed\n"); - return NF_DROP; - } - return NF_ACCEPT; -} - -/* We don't actually set up expectations, just adjust internal IP - * addresses if this is being NATted */ -static int help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - int dir = CTINFO2DIR(ctinfo); - unsigned int ret; - const struct iphdr *iph = ip_hdr(skb); - const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); - - /* SNMP replies and originating SNMP traps get mangled */ - if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) - return NF_ACCEPT; - if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) - return NF_ACCEPT; - - /* No NAT? */ - if (!(ct->status & IPS_NAT_MASK)) - return NF_ACCEPT; - - /* - * Make sure the packet length is ok. So far, we were only guaranteed - * to have a valid length IP header plus 8 bytes, which means we have - * enough room for a UDP header. Just verify the UDP length field so we - * can mess around with the payload. - */ - if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { - net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", - &iph->saddr, &iph->daddr); - return NF_DROP; - } - - if (!skb_make_writable(skb, skb->len)) - return NF_DROP; - - spin_lock_bh(&snmp_lock); - ret = snmp_translate(ct, ctinfo, skb); - spin_unlock_bh(&snmp_lock); - return ret; -} - -static const struct nf_conntrack_expect_policy snmp_exp_policy = { - .max_expected = 0, - .timeout = 180, -}; - -static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { - .me = THIS_MODULE, - .help = help, - .expect_policy = &snmp_exp_policy, - .name = "snmp_trap", - .tuple.src.l3num = AF_INET, - .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT), - .tuple.dst.protonum = IPPROTO_UDP, -}; - -/***************************************************************************** - * - * Module stuff. - * - *****************************************************************************/ - -static int __init nf_nat_snmp_basic_init(void) -{ - BUG_ON(nf_nat_snmp_hook != NULL); - RCU_INIT_POINTER(nf_nat_snmp_hook, help); - - return nf_conntrack_helper_register(&snmp_trap_helper); -} - -static void __exit nf_nat_snmp_basic_fini(void) -{ - RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); - synchronize_rcu(); - nf_conntrack_helper_unregister(&snmp_trap_helper); -} - -module_init(nf_nat_snmp_basic_init); -module_exit(nf_nat_snmp_basic_fini); - -module_param(debug, int, 0600); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c new file mode 100644 index 0000000..b6e2770 --- /dev/null +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -0,0 +1,235 @@ +/* + * nf_nat_snmp_basic.c + * + * Basic SNMP Application Layer Gateway + * + * This IP NAT module is intended for use with SNMP network + * discovery and monitoring applications where target networks use + * conflicting private address realms. + * + * Static NAT is used to remap the networks from the view of the network + * management system at the IP layer, and this module remaps some application + * layer addresses to match. + * + * The simplest form of ALG is performed, where only tagged IP addresses + * are modified. The module does not need to be MIB aware and only scans + * messages at the ASN.1/BER level. + * + * Currently, only SNMPv1 and SNMPv2 are supported. + * + * More information on ALG and associated issues can be found in + * RFC 2962 + * + * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory + * McLean & Jochen Friedrich, stripped down for use in the kernel. + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + * Author: James Morris <jmorris@intercode.com.au> + * + * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> + */ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <net/checksum.h> +#include <net/udp.h> + +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h> +#include "nf_nat_snmp_basic-asn1.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); +MODULE_ALIAS("ip_nat_snmp_basic"); + +#define SNMP_PORT 161 +#define SNMP_TRAP_PORT 162 + +static DEFINE_SPINLOCK(snmp_lock); + +struct snmp_ctx { + unsigned char *begin; + __sum16 *check; + __be32 from; + __be32 to; +}; + +static void fast_csum(struct snmp_ctx *ctx, unsigned char offset) +{ + unsigned char s[12] = {0,}; + int size; + + if (offset & 1) { + memcpy(&s[1], &ctx->from, 4); + memcpy(&s[7], &ctx->to, 4); + s[0] = ~0; + s[1] = ~s[1]; + s[2] = ~s[2]; + s[3] = ~s[3]; + s[4] = ~s[4]; + s[5] = ~0; + size = 12; + } else { + memcpy(&s[0], &ctx->from, 4); + memcpy(&s[4], &ctx->to, 4); + s[0] = ~s[0]; + s[1] = ~s[1]; + s[2] = ~s[2]; + s[3] = ~s[3]; + size = 8; + } + *ctx->check = csum_fold(csum_partial(s, size, + ~csum_unfold(*ctx->check))); +} + +int snmp_version(void *context, size_t hdrlen, unsigned char tag, + const void *data, size_t datalen) +{ + if (*(unsigned char *)data > 1) + return -ENOTSUPP; + return 1; +} + +int snmp_helper(void *context, size_t hdrlen, unsigned char tag, + const void *data, size_t datalen) +{ + struct snmp_ctx *ctx = (struct snmp_ctx *)context; + __be32 *pdata = (__be32 *)data; + + if (*pdata == ctx->from) { + pr_debug("%s: %pI4 to %pI4\n", __func__, + (void *)&ctx->from, (void *)&ctx->to); + + if (*ctx->check) + fast_csum(ctx, (unsigned char *)data - ctx->begin); + *pdata = ctx->to; + } + + return 1; +} + +static int snmp_translate(struct nf_conn *ct, int dir, struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); + u16 datalen = ntohs(udph->len) - sizeof(struct udphdr); + char *data = (unsigned char *)udph + sizeof(struct udphdr); + struct snmp_ctx ctx; + int ret; + + if (dir == IP_CT_DIR_ORIGINAL) { + ctx.from = ct->tuplehash[dir].tuple.src.u3.ip; + ctx.to = ct->tuplehash[!dir].tuple.dst.u3.ip; + } else { + ctx.from = ct->tuplehash[!dir].tuple.src.u3.ip; + ctx.to = ct->tuplehash[dir].tuple.dst.u3.ip; + } + + if (ctx.from == ctx.to) + return NF_ACCEPT; + + ctx.begin = (unsigned char *)udph + sizeof(struct udphdr); + ctx.check = &udph->check; + ret = asn1_ber_decoder(&nf_nat_snmp_basic_decoder, &ctx, data, datalen); + if (ret < 0) { + nf_ct_helper_log(skb, ct, "parser failed\n"); + return NF_DROP; + } + + return NF_ACCEPT; +} + +/* We don't actually set up expectations, just adjust internal IP + * addresses if this is being NATted + */ +static int help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + int dir = CTINFO2DIR(ctinfo); + unsigned int ret; + const struct iphdr *iph = ip_hdr(skb); + const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl); + + /* SNMP replies and originating SNMP traps get mangled */ + if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + return NF_ACCEPT; + if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* No NAT? */ + if (!(ct->status & IPS_NAT_MASK)) + return NF_ACCEPT; + + /* Make sure the packet length is ok. So far, we were only guaranteed + * to have a valid length IP header plus 8 bytes, which means we have + * enough room for a UDP header. Just verify the UDP length field so we + * can mess around with the payload. + */ + if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { + nf_ct_helper_log(skb, ct, "dropping malformed packet\n"); + return NF_DROP; + } + + if (!skb_make_writable(skb, skb->len)) { + nf_ct_helper_log(skb, ct, "cannot mangle packet"); + return NF_DROP; + } + + spin_lock_bh(&snmp_lock); + ret = snmp_translate(ct, dir, skb); + spin_unlock_bh(&snmp_lock); + return ret; +} + +static const struct nf_conntrack_expect_policy snmp_exp_policy = { + .max_expected = 0, + .timeout = 180, +}; + +static struct nf_conntrack_helper snmp_trap_helper __read_mostly = { + .me = THIS_MODULE, + .help = help, + .expect_policy = &snmp_exp_policy, + .name = "snmp_trap", + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_TRAP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, +}; + +static int __init nf_nat_snmp_basic_init(void) +{ + BUG_ON(nf_nat_snmp_hook != NULL); + RCU_INIT_POINTER(nf_nat_snmp_hook, help); + + return nf_conntrack_helper_register(&snmp_trap_helper); +} + +static void __exit nf_nat_snmp_basic_fini(void) +{ + RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); + synchronize_rcu(); + nf_conntrack_helper_unregister(&snmp_trap_helper); +} + +module_init(nf_nat_snmp_basic_init); +module_exit(nf_nat_snmp_basic_fini); diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 4bbc273..036c074 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -21,51 +21,12 @@ nft_do_chain_arp(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_unspec(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb); return nft_do_chain(&pkt, priv); } -static struct nft_af_info nft_af_arp __read_mostly = { - .family = NFPROTO_ARP, - .nhooks = NF_ARP_NUMHOOKS, - .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - [NF_ARP_FORWARD] = nft_do_chain_arp, - }, -}; - -static int nf_tables_arp_init_net(struct net *net) -{ - net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.arp== NULL) - return -ENOMEM; - - memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp)); - - if (nft_register_afinfo(net, net->nft.arp) < 0) - goto err; - - return 0; -err: - kfree(net->nft.arp); - return -ENOMEM; -} - -static void nf_tables_arp_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.arp); - kfree(net->nft.arp); -} - -static struct pernet_operations nf_tables_arp_net_ops = { - .init = nf_tables_arp_init_net, - .exit = nf_tables_arp_exit_net, -}; - static const struct nf_chain_type filter_arp = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -73,26 +34,19 @@ static const struct nf_chain_type filter_arp = { .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT), + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + }, }; static int __init nf_tables_arp_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_arp); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_arp_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_arp); - - return ret; + return nft_register_chain_type(&filter_arp); } static void __exit nf_tables_arp_exit(void) { - unregister_pernet_subsys(&nf_tables_arp_net_ops); nft_unregister_chain_type(&filter_arp); } @@ -101,4 +55,4 @@ module_exit(nf_tables_arp_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */ +MODULE_ALIAS_NFT_CHAIN(3, "filter"); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 2840a29..96f9554 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -24,69 +24,12 @@ static unsigned int nft_do_chain_ipv4(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv4_output(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (unlikely(skb->len < sizeof(struct iphdr) || - ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { - if (net_ratelimit()) - pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " - "packet\n"); - return NF_ACCEPT; - } - - return nft_do_chain_ipv4(priv, skb, state); -} - -struct nft_af_info nft_af_ipv4 __read_mostly = { - .family = NFPROTO_IPV4, - .nhooks = NF_INET_NUMHOOKS, - .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_ipv4_output, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, -}; -EXPORT_SYMBOL_GPL(nft_af_ipv4); - -static int nf_tables_ipv4_init_net(struct net *net) -{ - net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); - if (net->nft.ipv4 == NULL) - return -ENOMEM; - - memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); - - if (nft_register_afinfo(net, net->nft.ipv4) < 0) - goto err; - - return 0; -err: - kfree(net->nft.ipv4); - return -ENOMEM; -} - -static void nf_tables_ipv4_exit_net(struct net *net) -{ - nft_unregister_afinfo(net, net->nft.ipv4); - kfree(net->nft.ipv4); -} - -static struct pernet_operations nf_tables_ipv4_net_ops = { - .init = nf_tables_ipv4_init_net, - .exit = nf_tables_ipv4_exit_net, -}; - static const struct nf_chain_type filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, @@ -97,26 +40,22 @@ static const struct nf_chain_type filter_ipv4 = { (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, + }, }; static int __init nf_tables_ipv4_init(void) { - int ret; - - ret = nft_register_chain_type(&filter_ipv4); - if (ret < 0) - return ret; - - ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); - if (ret < 0) - nft_unregister_chain_type(&filter_ipv4); - - return ret; + return nft_register_chain_type(&filter_ipv4); } static void __exit nf_tables_ipv4_exit(void) { - unregister_pernet_subsys(&nf_tables_ipv4_net_ops); nft_unregister_chain_type(&filter_ipv4); } @@ -125,4 +64,4 @@ module_exit(nf_tables_ipv4_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_FAMILY(AF_INET); +MODULE_ALIAS_NFT_CHAIN(AF_INET, "filter"); diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index f5c66a7..f2a4909 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -33,7 +33,8 @@ static unsigned int nft_nat_do_chain(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); return nft_do_chain(&pkt, priv); } diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 30493be..d965c22 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -33,12 +33,8 @@ static unsigned int nf_route_table_hook(void *priv, const struct iphdr *iph; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) - return NF_ACCEPT; - - nft_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); mark = skb->mark; iph = ip_hdr(skb); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9f37c47..dc5edc8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -83,7 +83,6 @@ static int sockstat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations sockstat_seq_fops = { - .owner = THIS_MODULE, .open = sockstat_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -467,7 +466,6 @@ static int snmp_seq_open(struct inode *inode, struct file *file) } static const struct file_operations snmp_seq_fops = { - .owner = THIS_MODULE, .open = snmp_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -515,7 +513,6 @@ static int netstat_seq_open(struct inode *inode, struct file *file) } static const struct file_operations netstat_seq_fops = { - .owner = THIS_MODULE, .open = netstat_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5e570aa..7c50969 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -617,8 +617,21 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; - } else if (!ipc.oif) + } else if (!ipc.oif) { ipc.oif = inet->uc_index; + } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { + /* oif is set, packet is to local broadcast and + * and uc_index is set. oif is most likely set + * by sk_bound_dev_if. If uc_index != oif check if the + * oif is an L3 master and uc_index is an L3 slave. + * If so, we want to allow the send using the uc_index. + */ + if (ipc.oif != inet->uc_index && + ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), + inet->uc_index)) { + ipc.oif = inet->uc_index; + } + } flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, @@ -1119,7 +1132,6 @@ static int raw_v4_seq_open(struct inode *inode, struct file *file) } static const struct file_operations raw_seq_fops = { - .owner = THIS_MODULE, .open = raw_v4_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4e153b2..49cc1c1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -240,7 +240,6 @@ static int rt_cache_seq_open(struct inode *inode, struct file *file) } static const struct file_operations rt_cache_seq_fops = { - .owner = THIS_MODULE, .open = rt_cache_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -331,7 +330,6 @@ static int rt_cpu_seq_open(struct inode *inode, struct file *file) } static const struct file_operations rt_cpu_seq_fops = { - .owner = THIS_MODULE, .open = rt_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -369,7 +367,6 @@ static int rt_acct_proc_open(struct inode *inode, struct file *file) } static const struct file_operations rt_acct_proc_fops = { - .owner = THIS_MODULE, .open = rt_acct_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -1106,7 +1103,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1b38b42..c059aa7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -283,8 +283,6 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> -#include <trace/events/tcp.h> - struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); @@ -465,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); - tcp_call_bpf(sk, bpf_op); + tcp_call_bpf(sk, bpf_op, 0, NULL); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } @@ -500,11 +498,9 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) const struct tcp_sock *tp = tcp_sk(sk); int state; - sock_rps_record_flow(sk); - sock_poll_wait(file, sk_sleep(sk), wait); - state = sk_state_load(sk); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); @@ -1106,12 +1102,15 @@ static int linear_payload_sz(bool first_skb) return 0; } -static int select_size(const struct sock *sk, bool sg, bool first_skb) +static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc) { const struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sg) { + if (zc) + return 0; + if (sk_can_gso(sk)) { tmp = linear_payload_sz(first_skb); } else { @@ -1188,7 +1187,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; bool process_backlog = false; - bool sg; + bool sg, zc = false; long timeo; flags = msg->msg_flags; @@ -1206,7 +1205,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG)) + zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG; + if (!zc) uarg->zerocopy = 0; } @@ -1283,6 +1283,7 @@ restart: if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; + int linear; new_segment: /* Allocate new segment. If the interface is SG, @@ -1296,9 +1297,8 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - skb = sk_stream_alloc_skb(sk, - select_size(sk, sg, first_skb), - sk->sk_allocation, + linear = select_size(sk, sg, first_skb, zc); + skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, first_skb); if (!skb) goto wait_for_memory; @@ -1327,13 +1327,13 @@ new_segment: copy = msg_data_left(msg); /* Where to copy to? */ - if (skb_availroom(skb) > 0) { + if (skb_availroom(skb) > 0 && !zc) { /* We have some space in skb head. Superb! */ copy = min_t(int, copy, skb_availroom(skb)); err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); if (err) goto do_fault; - } else if (!uarg || !uarg->zerocopy) { + } else if (!zc) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1373,8 +1373,10 @@ new_segment: pfrag->offset += copy; } else { err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); - if (err == -EMSGSIZE || err == -EEXIST) + if (err == -EMSGSIZE || err == -EEXIST) { + tcp_mark_push(tp, skb); goto new_segment; + } if (err < 0) goto do_error; copy = err; @@ -1731,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb, } /* Similar to __sock_recv_timestamp, but does not require an skb */ -void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping *tss) +static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping *tss) { struct timeval tv; bool has_timestamping = false; @@ -2040,7 +2042,29 @@ void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; - trace_tcp_set_state(sk, oldstate, state); + /* We defined a new enum for TCP states that are exported in BPF + * so as not force the internal TCP states to be frozen. The + * following checks will detect if an internal state value ever + * differs from the BPF value. If this ever happens, then we will + * need to remap the internal value to the BPF value before calling + * tcp_call_bpf_2arg. + */ + BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); + BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); + BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); + BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); + BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); + BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); + BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); + BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); + BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); + BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); + BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); + BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) + tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); switch (state) { case TCP_ESTABLISHED: @@ -2065,7 +2089,7 @@ void tcp_set_state(struct sock *sk, int state) /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ - sk_state_store(sk, state); + inet_sk_state_store(sk, state); #ifdef STATE_TRACE SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); @@ -2434,6 +2458,12 @@ int tcp_disconnect(struct sock *sk, int flags) WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + } + sk->sk_error_report(sk); return err; } @@ -2923,7 +2953,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (sk->sk_type != SOCK_STREAM) return; - info->tcpi_state = sk_state_load(sk); + info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); @@ -3581,6 +3611,9 @@ void __init tcp_init(void) percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); inet_hashinfo_init(&tcp_hashinfo); + inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", + thash_entries, 21, /* one slot per 2 MB*/ + 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 8322f26..785712b 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -766,7 +766,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) filter_expired = after(tcp_jiffies32, bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); if (rs->rtt_us >= 0 && - (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + (rs->rtt_us <= bbr->min_rtt_us || + (filter_expired && !rs->is_ack_delayed))) { bbr->min_rtt_us = rs->rtt_us; bbr->min_rtt_stamp = tcp_jiffies32; } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index abbf0ed..81148f7 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -24,7 +24,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct tcp_info *info = _info; - if (sk_state_load(sk) == TCP_LISTEN) { + if (inet_sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else if (sk->sk_type == SOCK_STREAM) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 78c192e..018a484 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -379,18 +379,9 @@ fastopen: bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { - unsigned long last_syn_loss = 0; const struct dst_entry *dst; - int syn_loss = 0; - tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss); - - /* Recurring FO SYN losses: no cookie or data in SYN */ - if (syn_loss > 1 && - time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) { - cookie->len = -1; - return false; - } + tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { @@ -448,6 +439,8 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST + * 3. client side TFO socket has timed out three times consecutively during + * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active @@ -524,3 +517,20 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) dst_release(dst); } } + +void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) +{ + u32 timeouts = inet_csk(sk)->icsk_retransmits; + struct tcp_sock *tp = tcp_sk(sk); + + /* Broken middle-boxes may black-hole Fast Open connection during or + * even after the handshake. Be extremely conservative and pause + * Fast Open globally after hitting the third consecutive timeout or + * exceeding the configured timeout limit. + */ + if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && + (timeouts == 2 || (timeouts < 2 && expired))) { + tcp_fastopen_active_disable(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + } +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 45f750e..cfa51cf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -97,6 +97,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ +#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -578,8 +579,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 copied; int time; - int copied; tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); @@ -602,38 +603,31 @@ void tcp_rcv_space_adjust(struct sock *sk) if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvwin, rcvmem, rcvbuf; + int rcvmem, rcvbuf; + u64 rcvwin, grow; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. */ - rcvwin = (copied << 1) + 16 * tp->advmss; + rcvwin = ((u64)copied << 1) + 16 * tp->advmss; - /* If rate increased by 25%, - * assume slow start, rcvwin = 3 * copied - * If rate increased by 50%, - * assume sender can use 2x growth, rcvwin = 4 * copied - */ - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { - if (copied >= - tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) - rcvwin <<= 1; - else - rcvwin += (rcvwin >> 1); - } + /* Accommodate for sender rate increase (eg. slow start) */ + grow = rcvwin * (copied - tp->rcvq_space.space); + do_div(grow, tp->rcvq_space.space); + rcvwin += (grow << 1); rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); while (tcp_win_from_space(sk, rcvmem) < tp->advmss) rcvmem += 128; - rcvbuf = min(rcvwin / tp->advmss * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + do_div(rcvwin, tp->advmss); + rcvbuf = min_t(u64, rcvwin * rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (rcvbuf > sk->sk_rcvbuf) { sk->sk_rcvbuf = rcvbuf; /* Make the window clamp follow along. */ - tp->window_clamp = rcvwin; + tp->window_clamp = tcp_win_from_space(sk, rcvbuf); } } tp->rcvq_space.space = copied; @@ -2864,11 +2858,18 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, *rexmit = REXMIT_LOST; } -static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) +static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) { u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; struct tcp_sock *tp = tcp_sk(sk); + if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { + /* If the remote keeps returning delayed ACKs, eventually + * the min filter would pick it up and overestimate the + * prop. delay when it expires. Skip suspected delayed ACKs. + */ + return; + } minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, rtt_us ? : jiffies_to_usecs(1)); } @@ -2908,7 +2909,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, * always taken together with ACK, SACK, or TS-opts. Any negative * values will be skipped with the seq_rtt_us < 0 check above. */ - tcp_update_rtt_min(sk, ca_rtt_us); + tcp_update_rtt_min(sk, ca_rtt_us, flag); tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); @@ -3132,6 +3133,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); + + if (pkts_acked == 1 && last_in_flight < tp->mss_cache && + last_in_flight && !prior_sacked && fully_acked && + sack->rate->prior_delivered + 1 == tp->delivered && + !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { + /* Conservatively mark a delayed ACK. It's typically + * from a lone runt packet over the round trip to + * a receiver w/o out-of-order or CE events. + */ + flag |= FLAG_ACK_MAYBE_DELAYED; + } } if (sack->first_sackt) { sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); @@ -3621,6 +3633,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); @@ -5306,6 +5319,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); + /* TCP congestion window tracking */ + trace_tcp_probe(sk, skb); + tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 94e2835..95738aa 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1911,7 +1911,7 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Clean up the MD5 key list, if any */ if (tp->md5sig_info) { tcp_clear_md5_list(sk); - kfree_rcu(tp->md5sig_info, rcu); + kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); tp->md5sig_info = NULL; } #endif @@ -2281,7 +2281,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_expires = jiffies; } - state = sk_state_load(sk); + state = inet_sk_state_load(sk); if (state == TCP_LISTEN) rx_queue = sk->sk_ack_backlog; else @@ -2358,7 +2358,6 @@ out: } static const struct file_operations tcp_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = tcp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 7097f92..03b51cd 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -546,8 +546,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) static DEFINE_SEQLOCK(fastopen_seqlock); void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie, - int *syn_loss, unsigned long *last_syn_loss) + struct tcp_fastopen_cookie *cookie) { struct tcp_metrics_block *tm; @@ -564,8 +563,6 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, *cookie = tfom->cookie; if (cookie->len <= 0 && tfom->try_exp == 1) cookie->exp = true; - *syn_loss = tfom->syn_loss; - *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); @@ -895,7 +892,7 @@ static void tcp_metrics_flush_all(struct net *net) pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : - !atomic_read(&tm_net(tm)->count); + !refcount_read(&tm_net(tm)->count); if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b079b61..a8384b0c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -316,9 +316,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) */ local_bh_disable(); inet_twsk_schedule(tw, timeo); - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - inet_twsk_put(tw); + /* Linkage updates. + * Note that access to tw after this point is illegal. + */ + inet_twsk_hashdance(tw, sk, &tcp_hashinfo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 0b5a05b..764298e 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk) * within a datacenter, where we have reasonable estimates of * RTTs */ - base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); + base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL); if (base_rtt > 0) { ca->nv_base_rtt = base_rtt; ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ @@ -364,7 +364,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) */ cwnd_by_slope = (u32) div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, - (u64)(80000 * tp->mss_cache)); + 80000ULL * tp->mss_cache); max_win = cwnd_by_slope + nv_pad; /* If cwnd > max_win, decrease cwnd diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a4d214c..e9f985e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1944,7 +1944,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, in_flight = tcp_packets_in_flight(tp); - BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); + BUG_ON(tcp_skb_pcount(skb) <= 1); + BUG_ON(tp->snd_cwnd <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; @@ -2414,15 +2415,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans; /* Schedule a loss probe in 2*RTT for SACK capable connections - * in Open state, that are either limited by cwnd or application. + * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || - icsk->icsk_ca_state != TCP_CA_Open) - return false; - - if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - !tcp_write_queue_empty(sk)) + (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2907,6 +2905,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, + TCP_SKB_CB(skb)->seq, segs, err); + if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); @@ -3471,7 +3473,7 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; - tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c deleted file mode 100644 index 697f4c6..0000000 --- a/net/ipv4/tcp_probe.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * tcpprobe - Observe the TCP flow with kprobes. - * - * The idea for this came from Werner Almesberger's umlsim - * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/kprobes.h> -#include <linux/socket.h> -#include <linux/tcp.h> -#include <linux/slab.h> -#include <linux/proc_fs.h> -#include <linux/module.h> -#include <linux/ktime.h> -#include <linux/time.h> -#include <net/net_namespace.h> - -#include <net/tcp.h> - -MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>"); -MODULE_DESCRIPTION("TCP cwnd snooper"); -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.1"); - -static int port __read_mostly; -MODULE_PARM_DESC(port, "Port to match (0=all)"); -module_param(port, int, 0); - -static unsigned int bufsize __read_mostly = 4096; -MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); -module_param(bufsize, uint, 0); - -static unsigned int fwmark __read_mostly; -MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)"); -module_param(fwmark, uint, 0); - -static int full __read_mostly; -MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); -module_param(full, int, 0); - -static const char procname[] = "tcpprobe"; - -struct tcp_log { - ktime_t tstamp; - union { - struct sockaddr raw; - struct sockaddr_in v4; - struct sockaddr_in6 v6; - } src, dst; - u16 length; - u32 snd_nxt; - u32 snd_una; - u32 snd_wnd; - u32 rcv_wnd; - u32 snd_cwnd; - u32 ssthresh; - u32 srtt; -}; - -static struct { - spinlock_t lock; - wait_queue_head_t wait; - ktime_t start; - u32 lastcwnd; - - unsigned long head, tail; - struct tcp_log *log; -} tcp_probe; - -static inline int tcp_probe_used(void) -{ - return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); -} - -static inline int tcp_probe_avail(void) -{ - return bufsize - tcp_probe_used() - 1; -} - -#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \ - do { \ - si4.sin_family = AF_INET; \ - si4.sin_port = inet->inet_##mem##port; \ - si4.sin_addr.s_addr = inet->inet_##mem##addr; \ - } while (0) \ - -/* - * Hook inserted to be called before each receive packet. - * Note: arguments must match tcp_rcv_established()! - */ -static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th) -{ - unsigned int len = skb->len; - const struct tcp_sock *tp = tcp_sk(sk); - const struct inet_sock *inet = inet_sk(sk); - - /* Only update if port or skb mark matches */ - if (((port == 0 && fwmark == 0) || - ntohs(inet->inet_dport) == port || - ntohs(inet->inet_sport) == port || - (fwmark > 0 && skb->mark == fwmark)) && - (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { - - spin_lock(&tcp_probe.lock); - /* If log fills, just silently drop */ - if (tcp_probe_avail() > 1) { - struct tcp_log *p = tcp_probe.log + tcp_probe.head; - - p->tstamp = ktime_get(); - switch (sk->sk_family) { - case AF_INET: - tcp_probe_copy_fl_to_si4(inet, p->src.v4, s); - tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); - break; - case AF_INET6: - memset(&p->src.v6, 0, sizeof(p->src.v6)); - memset(&p->dst.v6, 0, sizeof(p->dst.v6)); -#if IS_ENABLED(CONFIG_IPV6) - p->src.v6.sin6_family = AF_INET6; - p->src.v6.sin6_port = inet->inet_sport; - p->src.v6.sin6_addr = inet6_sk(sk)->saddr; - - p->dst.v6.sin6_family = AF_INET6; - p->dst.v6.sin6_port = inet->inet_dport; - p->dst.v6.sin6_addr = sk->sk_v6_daddr; -#endif - break; - default: - BUG(); - } - - p->length = len; - p->snd_nxt = tp->snd_nxt; - p->snd_una = tp->snd_una; - p->snd_cwnd = tp->snd_cwnd; - p->snd_wnd = tp->snd_wnd; - p->rcv_wnd = tp->rcv_wnd; - p->ssthresh = tcp_current_ssthresh(sk); - p->srtt = tp->srtt_us >> 3; - - tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); - } - tcp_probe.lastcwnd = tp->snd_cwnd; - spin_unlock(&tcp_probe.lock); - - wake_up(&tcp_probe.wait); - } - - jprobe_return(); -} - -static struct jprobe tcp_jprobe = { - .kp = { - .symbol_name = "tcp_rcv_established", - }, - .entry = jtcp_rcv_established, -}; - -static int tcpprobe_open(struct inode *inode, struct file *file) -{ - /* Reset (empty) log */ - spin_lock_bh(&tcp_probe.lock); - tcp_probe.head = tcp_probe.tail = 0; - tcp_probe.start = ktime_get(); - spin_unlock_bh(&tcp_probe.lock); - - return 0; -} - -static int tcpprobe_sprint(char *tbuf, int n) -{ - const struct tcp_log *p - = tcp_probe.log + tcp_probe.tail; - struct timespec64 ts - = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start)); - - return scnprintf(tbuf, n, - "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", - (unsigned long)ts.tv_sec, - (unsigned long)ts.tv_nsec, - &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, - p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); -} - -static ssize_t tcpprobe_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - int error = 0; - size_t cnt = 0; - - if (!buf) - return -EINVAL; - - while (cnt < len) { - char tbuf[256]; - int width; - - /* Wait for data in buffer */ - error = wait_event_interruptible(tcp_probe.wait, - tcp_probe_used() > 0); - if (error) - break; - - spin_lock_bh(&tcp_probe.lock); - if (tcp_probe.head == tcp_probe.tail) { - /* multiple readers race? */ - spin_unlock_bh(&tcp_probe.lock); - continue; - } - - width = tcpprobe_sprint(tbuf, sizeof(tbuf)); - - if (cnt + width < len) - tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); - - spin_unlock_bh(&tcp_probe.lock); - - /* if record greater than space available - return partial buffer (so far) */ - if (cnt + width >= len) - break; - - if (copy_to_user(buf + cnt, tbuf, width)) - return -EFAULT; - cnt += width; - } - - return cnt == 0 ? error : cnt; -} - -static const struct file_operations tcpprobe_fops = { - .owner = THIS_MODULE, - .open = tcpprobe_open, - .read = tcpprobe_read, - .llseek = noop_llseek, -}; - -static __init int tcpprobe_init(void) -{ - int ret = -ENOMEM; - - /* Warning: if the function signature of tcp_rcv_established, - * has been changed, you also have to change the signature of - * jtcp_rcv_established, otherwise you end up right here! - */ - BUILD_BUG_ON(__same_type(tcp_rcv_established, - jtcp_rcv_established) == 0); - - init_waitqueue_head(&tcp_probe.wait); - spin_lock_init(&tcp_probe.lock); - - if (bufsize == 0) - return -EINVAL; - - bufsize = roundup_pow_of_two(bufsize); - tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); - if (!tcp_probe.log) - goto err0; - - if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops)) - goto err0; - - ret = register_jprobe(&tcp_jprobe); - if (ret) - goto err1; - - pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n", - port, fwmark, bufsize); - return 0; - err1: - remove_proc_entry(procname, init_net.proc_net); - err0: - kfree(tcp_probe.log); - return ret; -} -module_init(tcpprobe_init); - -static __exit void tcpprobe_exit(void) -{ - remove_proc_entry(procname, init_net.proc_net); - unregister_jprobe(&tcp_jprobe); - kfree(tcp_probe.log); -} -module_exit(tcpprobe_exit); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 388158c..71fc60f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -198,11 +198,6 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { dst_negative_advice(sk); - if (tp->syn_fastopen || tp->syn_data) - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (tp->syn_data && icsk->icsk_retransmits == 1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); } else if (!tp->syn_data && !tp->syn_fastopen) { sk_rethink_txhash(sk); } @@ -210,17 +205,6 @@ static int tcp_write_timeout(struct sock *sk) expired = icsk->icsk_retransmits >= retry_until; } else { if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { - /* Some middle-boxes may black-hole Fast Open _after_ - * the handshake. Therefore we conservatively disable - * Fast Open on this path on recurring timeouts after - * successful Fast Open. - */ - if (tp->syn_data_acked) { - tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); - } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -243,11 +227,19 @@ static int tcp_write_timeout(struct sock *sk) expired = retransmits_timed_out(sk, retry_until, icsk->icsk_user_timeout); } + tcp_fastopen_active_detect_blackhole(sk, expired); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) + tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, + icsk->icsk_retransmits, + icsk->icsk_rto, (int)expired); + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } + return 0; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef45adf..f81f969 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -357,18 +357,12 @@ fail: } EXPORT_SYMBOL(udp_lib_get_port); -static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, - unsigned int port) -{ - return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; -} - int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); + ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = - udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; @@ -445,7 +439,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; result = NULL; @@ -454,23 +448,16 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } badness = score; result = sk; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -488,11 +475,11 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; bool exact_dif = udp_lib_exact_dif_match(net, skb); - int score, badness, matches = 0, reuseport = 0; + int score, badness; u32 hash = 0; if (hslot->count > 10) { - hash2 = udp4_portaddr_hash(net, daddr, hnum); + hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) @@ -503,7 +490,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; - hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); + hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; /* avoid searching the same slot again. */ if (unlikely(slot2 == old_slot2)) @@ -526,23 +513,16 @@ begin: score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { - reuseport = sk->sk_reuseport; - if (reuseport) { + if (sk->sk_reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); if (result) return result; - matches = 1; } result = sk; badness = score; - } else if (score == badness && reuseport) { - matches++; - if (reciprocal_scale(hash, matches) == 0) - result = sk; - hash = next_pseudo_random32(hash); } } return result; @@ -997,8 +977,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!saddr) saddr = inet->mc_addr; connected = 0; - } else if (!ipc.oif) + } else if (!ipc.oif) { ipc.oif = inet->uc_index; + } else if (ipv4_is_lbcast(daddr) && inet->uc_index) { + /* oif is set, packet is to local broadcast and + * and uc_index is set. oif is most likely set + * by sk_bound_dev_if. If uc_index != oif check if the + * oif is an L3 master and uc_index is an L3 slave. + * If so, we want to allow the send using the uc_index. + */ + if (ipc.oif != inet->uc_index && + ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), + inet->uc_index)) { + ipc.oif = inet->uc_index; + } + } if (connected) rt = (struct rtable *)sk_dst_check(sk, 0); @@ -1775,7 +1768,7 @@ EXPORT_SYMBOL(udp_lib_rehash); static void udp_v4_rehash(struct sock *sk) { - u16 new_hash = udp4_portaddr_hash(sock_net(sk), + u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); udp_lib_rehash(sk, new_hash); @@ -1966,9 +1959,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct sk_buff *nskb; if (use_hash2) { - hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & + hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; - hash2 = udp4_portaddr_hash(net, daddr, hnum) & udptable->mask; + hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); @@ -2200,7 +2193,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); + unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); @@ -2510,8 +2503,6 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) mask |= POLLIN | POLLRDNORM; - sock_rps_record_flow(sk); - /* Check for false positives due to checksum errors */ if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) @@ -2736,7 +2727,6 @@ int udp4_seq_show(struct seq_file *seq, void *v) } static const struct file_operations udp_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 59f10fe..f96614e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -75,7 +75,6 @@ static struct inet_protosw udplite4_protosw = { #ifdef CONFIG_PROC_FS static const struct file_operations udplite_afinfo_seq_fops = { - .owner = THIS_MODULE, .open = udp_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 20ca486..63faeee 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -62,7 +62,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); - top_iph->ttl = ip4_dst_hoplimit(dst->child); + top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; @@ -106,18 +106,15 @@ static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x, { __skb_push(skb, skb->mac_len); return skb_mac_gso_segment(skb, features); - } static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); - if (xo->flags & XFRM_GSO_SEGMENT) { - skb->network_header = skb->network_header - x->props.header_len; + if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + sizeof(struct iphdr); - } skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); |