diff options
Diffstat (limited to 'net/ipv4')
74 files changed, 3915 insertions, 3724 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index a5a1050..cbb505b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER If unsure, say N here. -choice - prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" - depends on IP_ADVANCED_ROUTER - default ASK_IP_FIB_HASH - -config ASK_IP_FIB_HASH - bool "FIB_HASH" - ---help--- - Current FIB is very proven and good enough for most users. - -config IP_FIB_TRIE - bool "FIB_TRIE" - ---help--- - Use new experimental LC-trie as FIB lookup algorithm. - This improves lookup performance if you have a large - number of routes. - - LC-trie is a longest matching prefix lookup algorithm which - performs better than FIB_HASH for large routing tables. - But, it consumes more memory and is more complex. - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, - June 1999 - - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - <http://www.csc.kth.se/~snilsson/software/dyntrie2/> - -endchoice - -config IP_FIB_HASH - def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER - config IP_FIB_TRIE_STATS bool "FIB TRIE statistics" - depends on IP_FIB_TRIE + depends on IP_ADVANCED_ROUTER ---help--- Keep track of statistics on structure of FIB TRIE table. Useful for testing and measuring TRIE performance. @@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE handled by the klogd daemon which is responsible for kernel messages ("man klogd"). +config IP_ROUTE_CLASSID + bool + config IP_PNP bool "IP: kernel level autoconfiguration" help @@ -657,4 +624,3 @@ config TCP_MD5SIG on the Internet. If unsure, say N. - diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4978d22..f2dc69c 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ - fib_frontend.o fib_semantics.o \ - inet_fragment.o + fib_frontend.o fib_semantics.o fib_trie.o \ + inet_fragment.o ping.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o -obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o -obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 45b89d7..cc14631 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -105,6 +105,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> +#include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/raw.h> @@ -153,7 +154,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON(sk->sk_wmem_queued); WARN_ON(sk->sk_forward_alloc); - kfree(inet->opt); + kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); sk_refcnt_debug_dec(sk); } @@ -1008,6 +1009,14 @@ static struct inet_protosw inetsw_array[] = .flags = INET_PROTOSW_PERMANENT, }, + { + .type = SOCK_DGRAM, + .protocol = IPPROTO_ICMP, + .prot = &ping_prot, + .ops = &inet_dgram_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, + }, { .type = SOCK_RAW, @@ -1101,27 +1110,29 @@ int sysctl_ip_dynaddr __read_mostly; static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); - int err; - struct rtable *rt; __be32 old_saddr = inet->inet_saddr; - __be32 new_saddr; __be32 daddr = inet->inet_daddr; + struct flowi4 *fl4; + struct rtable *rt; + __be32 new_saddr; + struct ip_options_rcu *inet_opt; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; /* Query new route. */ - err = ip_route_connect(&rt, daddr, 0, - RT_CONN_FLAGS(sk), - sk->sk_bound_dev_if, - sk->sk_protocol, - inet->inet_sport, inet->inet_dport, sk, 0); - if (err) - return err; + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, sk->sk_protocol, + inet->inet_sport, inet->inet_dport, sk, false); + if (IS_ERR(rt)) + return PTR_ERR(rt); sk_setup_caps(sk, &rt->dst); - new_saddr = rt->rt_src; + new_saddr = fl4->saddr; if (new_saddr == old_saddr) return 0; @@ -1150,6 +1161,8 @@ int inet_sk_rebuild_header(struct sock *sk) struct inet_sock *inet = inet_sk(sk); struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __be32 daddr; + struct ip_options_rcu *inet_opt; + struct flowi4 *fl4; int err; /* Route is OK, nothing to do. */ @@ -1157,28 +1170,23 @@ int inet_sk_rebuild_header(struct sock *sk) return 0; /* Reroute. */ + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); daddr = inet->inet_daddr; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; -{ - struct flowi fl = { - .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = inet->inet_saddr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = inet->inet_dport, - }; - - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); -} - if (!err) + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + rcu_read_unlock(); + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, + inet->inet_dport, inet->inet_sport, + sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (!IS_ERR(rt)) { + err = 0; sk_setup_caps(sk, &rt->dst); - else { + } else { + err = PTR_ERR(rt); + /* Routing failed... */ sk->sk_route_caps = 0; /* @@ -1198,7 +1206,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); static int inet_gso_send_check(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; const struct net_protocol *ops; int proto; int ihl; @@ -1231,7 +1239,7 @@ out: return err; } -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct iphdr *iph; @@ -1305,7 +1313,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, const struct net_protocol *ops; struct sk_buff **pp = NULL; struct sk_buff *p; - struct iphdr *iph; + const struct iphdr *iph; unsigned int hlen; unsigned int off; unsigned int id; @@ -1528,6 +1536,7 @@ static const struct net_protocol udp_protocol = { static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, + .err_handler = ping_err, .no_policy = 1, .netns_ok = 1, }; @@ -1643,6 +1652,10 @@ static int __init inet_init(void) if (rc) goto out_unregister_udp_proto; + rc = proto_register(&ping_prot, 1); + if (rc) + goto out_unregister_raw_proto; + /* * Tell SOCKET that we are alive... */ @@ -1698,6 +1711,8 @@ static int __init inet_init(void) /* Add UDP-Lite (RFC 3828) */ udplite4_register(); + ping_init(); + /* * Set the ICMP layer up */ @@ -1728,6 +1743,8 @@ static int __init inet_init(void) rc = 0; out: return rc; +out_unregister_raw_proto: + proto_unregister(&raw_prot); out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: @@ -1752,11 +1769,15 @@ static int __init ipv4_proc_init(void) goto out_tcp; if (udp4_proc_init()) goto out_udp; + if (ping_proc_init()) + goto out_ping; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: + ping_proc_exit(); +out_ping: udp4_proc_exit(); out_udp: tcp4_proc_exit(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 86961be..c1f4154 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -73,7 +73,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, * into IP header for icv calculation. Options are already checked * for validity, so paranoia is not required. */ -static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr) +static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) { unsigned char * optptr = (unsigned char*)(iph+1); int l = iph->ihl*4 - sizeof(struct iphdr); @@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ttl = 0; top_iph->check = 0; - ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + if (x->props.flags & XFRM_STATE_ALIGN4) + ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + else + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; - ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, 0, skb->len); @@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; - if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && - ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) - goto out; + if (x->props.flags & XFRM_STATE_ALIGN4) { + if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } else { + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } if (!pskb_may_pull(skb, ah_hlen)) goto out; @@ -387,7 +396,7 @@ out: static void ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; @@ -395,7 +404,8 @@ static void ah4_err(struct sk_buff *skb, u32 info) icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; - x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + ah->spi, IPPROTO_AH, AF_INET); if (!x) return; printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", @@ -450,8 +460,12 @@ static int ah_init_state(struct xfrm_state *x) BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + - ahp->icv_trunc_len); + if (x->props.flags & XFRM_STATE_ALIGN4) + x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); + else + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7927589..1b74d3b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -215,6 +215,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; + case ARPHRD_IPGRE: + ip_ipgre_mc_map(addr, dev->broadcast, haddr); + return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); @@ -433,14 +436,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { - struct flowi fl = { .fl4_dst = sip, - .fl4_src = tip }; struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); - if (ip_route_output_key(net, &rt, &fl) < 0) + rt = ip_route_output(net, sip, tip, 0, 0); + if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); @@ -1061,12 +1063,10 @@ static int arp_req_set(struct net *net, struct arpreq *r, if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { - struct flowi fl = { .fl4_dst = ip, - .fl4_tos = RTO_ONLINK }; - struct rtable *rt; - err = ip_route_output_key(net, &rt, &fl); - if (err != 0) - return err; + struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + + if (IS_ERR(rt)) + return PTR_ERR(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) @@ -1177,7 +1177,6 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r, static int arp_req_delete(struct net *net, struct arpreq *r, struct net_device *dev) { - int err; __be32 ip; if (r->arp_flags & ATF_PUBL) @@ -1185,12 +1184,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { - struct flowi fl = { .fl4_dst = ip, - .fl4_tos = RTO_ONLINK }; - struct rtable *rt; - err = ip_route_output_key(net, &rt, &fl); - if (err != 0) - return err; + struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + if (IS_ERR(rt)) + return PTR_ERR(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 094e150..2b3c23c 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -112,7 +112,7 @@ int cipso_v4_rbm_strictvalid = 1; /* The maximum number of category ranges permitted in the ranged category tag * (tag #5). You may note that the IETF draft states that the maximum number * of category ranges is 7, but if the low end of the last category range is - * zero then it is possibile to fit 8 category ranges because the zero should + * zero then it is possible to fit 8 category ranges because the zero should * be omitted. */ #define CIPSO_V4_TAG_RNG_CAT_MAX 8 @@ -438,7 +438,7 @@ cache_add_failure: * * Description: * Search the DOI definition list for a DOI definition with a DOI value that - * matches @doi. The caller is responsibile for calling rcu_read_[un]lock(). + * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. */ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) @@ -1293,7 +1293,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, return ret_val; /* This will send packets using the "optimized" format when - * possibile as specified in section 3.4.2.6 of the + * possible as specified in section 3.4.2.6 of the * CIPSO draft. */ if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10) tag_len = 14; @@ -1752,7 +1752,7 @@ validate_return: } /** - * cipso_v4_error - Send the correct reponse for a bad packet + * cipso_v4_error - Send the correct response for a bad packet * @skb: the packet * @error: the error code * @gateway: CIPSO gateway flag @@ -1857,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, return CIPSO_V4_HDR_LEN + ret_val; } +static void opt_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct ip_options_rcu, rcu)); +} + /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket @@ -1879,7 +1884,7 @@ int cipso_v4_sock_setattr(struct sock *sk, unsigned char *buf = NULL; u32 buf_len; u32 opt_len; - struct ip_options *opt = NULL; + struct ip_options_rcu *old, *opt = NULL; struct inet_sock *sk_inet; struct inet_connection_sock *sk_conn; @@ -1915,22 +1920,25 @@ int cipso_v4_sock_setattr(struct sock *sk, ret_val = -ENOMEM; goto socket_setattr_failure; } - memcpy(opt->__data, buf, buf_len); - opt->optlen = opt_len; - opt->cipso = sizeof(struct iphdr); + memcpy(opt->opt.__data, buf, buf_len); + opt->opt.optlen = opt_len; + opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; sk_inet = inet_sk(sk); + + old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk)); if (sk_inet->is_icsk) { sk_conn = inet_csk(sk); - if (sk_inet->opt) - sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen; - sk_conn->icsk_ext_hdr_len += opt->optlen; + if (old) + sk_conn->icsk_ext_hdr_len -= old->opt.optlen; + sk_conn->icsk_ext_hdr_len += opt->opt.optlen; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } - opt = xchg(&sk_inet->opt, opt); - kfree(opt); + rcu_assign_pointer(sk_inet->inet_opt, opt); + if (old) + call_rcu(&old->rcu, opt_kfree_rcu); return 0; @@ -1960,7 +1968,7 @@ int cipso_v4_req_setattr(struct request_sock *req, unsigned char *buf = NULL; u32 buf_len; u32 opt_len; - struct ip_options *opt = NULL; + struct ip_options_rcu *opt = NULL; struct inet_request_sock *req_inet; /* We allocate the maximum CIPSO option size here so we are probably @@ -1988,15 +1996,16 @@ int cipso_v4_req_setattr(struct request_sock *req, ret_val = -ENOMEM; goto req_setattr_failure; } - memcpy(opt->__data, buf, buf_len); - opt->optlen = opt_len; - opt->cipso = sizeof(struct iphdr); + memcpy(opt->opt.__data, buf, buf_len); + opt->opt.optlen = opt_len; + opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; req_inet = inet_rsk(req); opt = xchg(&req_inet->opt, opt); - kfree(opt); + if (opt) + call_rcu(&opt->rcu, opt_kfree_rcu); return 0; @@ -2016,34 +2025,34 @@ req_setattr_failure: * values on failure. * */ -static int cipso_v4_delopt(struct ip_options **opt_ptr) +static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr) { int hdr_delta = 0; - struct ip_options *opt = *opt_ptr; + struct ip_options_rcu *opt = *opt_ptr; - if (opt->srr || opt->rr || opt->ts || opt->router_alert) { + if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; int iter; int optlen_new; - cipso_off = opt->cipso - sizeof(struct iphdr); - cipso_ptr = &opt->__data[cipso_off]; + cipso_off = opt->opt.cipso - sizeof(struct iphdr); + cipso_ptr = &opt->opt.__data[cipso_off]; cipso_len = cipso_ptr[1]; - if (opt->srr > opt->cipso) - opt->srr -= cipso_len; - if (opt->rr > opt->cipso) - opt->rr -= cipso_len; - if (opt->ts > opt->cipso) - opt->ts -= cipso_len; - if (opt->router_alert > opt->cipso) - opt->router_alert -= cipso_len; - opt->cipso = 0; + if (opt->opt.srr > opt->opt.cipso) + opt->opt.srr -= cipso_len; + if (opt->opt.rr > opt->opt.cipso) + opt->opt.rr -= cipso_len; + if (opt->opt.ts > opt->opt.cipso) + opt->opt.ts -= cipso_len; + if (opt->opt.router_alert > opt->opt.cipso) + opt->opt.router_alert -= cipso_len; + opt->opt.cipso = 0; memmove(cipso_ptr, cipso_ptr + cipso_len, - opt->optlen - cipso_off - cipso_len); + opt->opt.optlen - cipso_off - cipso_len); /* determining the new total option length is tricky because of * the padding necessary, the only thing i can think to do at @@ -2052,21 +2061,21 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr) * from there we can determine the new total option length */ iter = 0; optlen_new = 0; - while (iter < opt->optlen) - if (opt->__data[iter] != IPOPT_NOP) { - iter += opt->__data[iter + 1]; + while (iter < opt->opt.optlen) + if (opt->opt.__data[iter] != IPOPT_NOP) { + iter += opt->opt.__data[iter + 1]; optlen_new = iter; } else iter++; - hdr_delta = opt->optlen; - opt->optlen = (optlen_new + 3) & ~3; - hdr_delta -= opt->optlen; + hdr_delta = opt->opt.optlen; + opt->opt.optlen = (optlen_new + 3) & ~3; + hdr_delta -= opt->opt.optlen; } else { /* only the cipso option was present on the socket so we can * remove the entire option struct */ *opt_ptr = NULL; - hdr_delta = opt->optlen; - kfree(opt); + hdr_delta = opt->opt.optlen; + call_rcu(&opt->rcu, opt_kfree_rcu); } return hdr_delta; @@ -2083,15 +2092,15 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr) void cipso_v4_sock_delattr(struct sock *sk) { int hdr_delta; - struct ip_options *opt; + struct ip_options_rcu *opt; struct inet_sock *sk_inet; sk_inet = inet_sk(sk); - opt = sk_inet->opt; - if (opt == NULL || opt->cipso == 0) + opt = rcu_dereference_protected(sk_inet->inet_opt, 1); + if (opt == NULL || opt->opt.cipso == 0) return; - hdr_delta = cipso_v4_delopt(&sk_inet->opt); + hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (sk_inet->is_icsk && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; @@ -2109,12 +2118,12 @@ void cipso_v4_sock_delattr(struct sock *sk) */ void cipso_v4_req_delattr(struct request_sock *req) { - struct ip_options *opt; + struct ip_options_rcu *opt; struct inet_request_sock *req_inet; req_inet = inet_rsk(req); opt = req_inet->opt; - if (opt == NULL || opt->cipso == 0) + if (opt == NULL || opt->opt.cipso == 0) return; cipso_v4_delopt(&req_inet->opt); @@ -2184,14 +2193,18 @@ getattr_return: */ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { - struct ip_options *opt; + struct ip_options_rcu *opt; + int res = -ENOMSG; - opt = inet_sk(sk)->opt; - if (opt == NULL || opt->cipso == 0) - return -ENOMSG; - - return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr), - secattr); + rcu_read_lock(); + opt = rcu_dereference(inet_sk(sk)->inet_opt); + if (opt && opt->opt.cipso) + res = cipso_v4_getattr(opt->opt.__data + + opt->opt.cipso - + sizeof(struct iphdr), + secattr); + rcu_read_unlock(); + return res; } /** diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 174be6c..424fafb 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct flowi4 *fl4; struct rtable *rt; __be32 saddr; int oif; @@ -38,6 +39,8 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); + lock_sock(sk); + oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { @@ -46,33 +49,39 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!saddr) saddr = inet->mc_addr; } - err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, - RT_CONN_FLAGS(sk), oif, - sk->sk_protocol, - inet->inet_sport, usin->sin_port, sk, 1); - if (err) { + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, + RT_CONN_FLAGS(sk), oif, + sk->sk_protocol, + inet->inet_sport, usin->sin_port, sk, true); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); - return err; + goto out; } if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { ip_rt_put(rt); - return -EACCES; + err = -EACCES; + goto out; } if (!inet->inet_saddr) - inet->inet_saddr = rt->rt_src; /* Update source address */ + inet->inet_saddr = fl4->saddr; /* Update source address */ if (!inet->inet_rcv_saddr) { - inet->inet_rcv_saddr = rt->rt_src; + inet->inet_rcv_saddr = fl4->saddr; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } - inet->inet_daddr = rt->rt_dst; + inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); - return 0; + err = 0; +out: + release_sock(sk); + return err; } EXPORT_SYMBOL(ip4_datagram_connect); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 748cb5b..0d4a184 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -51,6 +51,7 @@ #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/slab.h> +#include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -63,6 +64,8 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include "fib_lookup.h" + static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, @@ -92,6 +95,85 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; +/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE + * value. So if you change this define, make appropriate changes to + * inet_addr_hash as well. + */ +#define IN4_ADDR_HSIZE 256 +static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; +static DEFINE_SPINLOCK(inet_addr_hash_lock); + +static inline unsigned int inet_addr_hash(struct net *net, __be32 addr) +{ + u32 val = (__force u32) addr ^ hash_ptr(net, 8); + + return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & + (IN4_ADDR_HSIZE - 1)); +} + +static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) +{ + unsigned int hash = inet_addr_hash(net, ifa->ifa_local); + + spin_lock(&inet_addr_hash_lock); + hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); + spin_unlock(&inet_addr_hash_lock); +} + +static void inet_hash_remove(struct in_ifaddr *ifa) +{ + spin_lock(&inet_addr_hash_lock); + hlist_del_init_rcu(&ifa->hash); + spin_unlock(&inet_addr_hash_lock); +} + +/** + * __ip_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU, or RTNL + */ +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) +{ + unsigned int hash = inet_addr_hash(net, addr); + struct net_device *result = NULL; + struct in_ifaddr *ifa; + struct hlist_node *node; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { + struct net_device *dev = ifa->ifa_dev->dev; + + if (!net_eq(dev_net(dev), net)) + continue; + if (ifa->ifa_local == addr) { + result = dev; + break; + } + } + if (!result) { + struct flowi4 fl4 = { .daddr = addr }; + struct fib_result res = { 0 }; + struct fib_table *local; + + /* Fallback to FIB local table so that communication + * over loopback subnets work. + */ + local = fib_get_table(net, RT_TABLE_LOCAL); + if (local && + !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && + res.type == RTN_LOCAL) + result = FIB_RES_DEV(res); + } + if (result && devref) + dev_hold(result); + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL(__ip_dev_find); + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -265,6 +347,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } if (!do_promote) { + inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); @@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } } + /* On promotion all secondaries from subnet are changing + * the primary IP, we must remove all their routes silently + * and later to add them back with new prefsrc. Do this + * while all addresses are on the device list. + */ + for (ifa = promote; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa)) + fib_del_ifaddr(ifa, ifa1); + } + /* 2. Unlink it */ *ifap = ifa1->ifa_next; + inet_hash_remove(ifa1); /* 3. Announce address deletion */ @@ -296,6 +391,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { + struct in_ifaddr *next_sec = promote->ifa_next; if (prev_prom) { prev_prom->ifa_next = promote->ifa_next; @@ -307,7 +403,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; @@ -368,6 +464,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_next = *ifap; *ifap = ifa; + inet_hash_insert(dev_net(in_dev->dev), ifa); + /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ @@ -521,6 +619,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) if (tb[IFA_ADDRESS] == NULL) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; + INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = ifm->ifa_flags; @@ -670,7 +769,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) ifap = &ifa->ifa_next) { if (!strcmp(ifr.ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == - ifa->ifa_address) { + ifa->ifa_local) { break; /* found */ } } @@ -728,6 +827,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; ifa = inet_alloc_ifa(); + INIT_HLIST_NODE(&ifa->hash); if (!ifa) break; if (colon) @@ -1030,6 +1130,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu) return mtu >= 68; } +static void inetdev_send_gratuitous_arp(struct net_device *dev, + struct in_device *in_dev) + +{ + struct in_ifaddr *ifa = in_dev->ifa_list; + + if (!ifa) + return; + + arp_send(ARPOP_REQUEST, ETH_P_ARP, + ifa->ifa_local, dev, + ifa->ifa_local, NULL, + dev->dev_addr, NULL); +} + /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, @@ -1069,6 +1184,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, struct in_ifaddr *ifa = inet_alloc_ifa(); if (ifa) { + INIT_HLIST_NODE(&ifa->hash); ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; @@ -1082,18 +1198,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } ip_mc_up(in_dev); /* fall through */ - case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEADDR: + if (!IN_DEV_ARP_NOTIFY(in_dev)) + break; + /* fall through */ + case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ - if (IN_DEV_ARP_NOTIFY(in_dev)) { - struct in_ifaddr *ifa = in_dev->ifa_list; - - if (ifa) - arp_send(ARPOP_REQUEST, ETH_P_ARP, - ifa->ifa_address, dev, - ifa->ifa_address, NULL, - dev->dev_addr, NULL); - } + inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); @@ -1258,7 +1369,7 @@ errout: static size_t inet_get_link_af_size(const struct net_device *dev) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; @@ -1268,7 +1379,7 @@ static size_t inet_get_link_af_size(const struct net_device *dev) static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; @@ -1569,7 +1680,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) return; cnf->sysctl = NULL; - unregister_sysctl_table(t->sysctl_header); + unregister_net_sysctl_table(t->sysctl_header); kfree(t->dev_name); kfree(t); } @@ -1710,6 +1821,11 @@ static struct rtnl_af_ops inet_af_ops = { void __init devinet_init(void) { + int i; + + for (i = 0; i < IN4_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet_addr_lst[i]); + register_pernet_subsys(&devinet_ops); register_gifconf(PF_INET, inet_gifconf); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index e42a905..a5b4134 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -33,11 +33,14 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu); * * TODO: Use spare space in skb for this where possible. */ -static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen) { unsigned int len; - len = crypto_aead_ivsize(aead); + len = seqhilen; + + len += crypto_aead_ivsize(aead); + if (len) { len += crypto_aead_alignmask(aead) & ~(crypto_tfm_ctx_alignment() - 1); @@ -52,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) return kmalloc(len, GFP_ATOMIC); } -static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ + return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) { return crypto_aead_ivsize(aead) ? - PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; + PTR_ALIGN((u8 *)tmp + seqhilen, + crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } static inline struct aead_givcrypt_request *esp_tmp_givreq( @@ -122,6 +130,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int plen; int tfclen; int nfrags; + int assoclen; + int sglists; + int seqhilen; + __be32 *seqhi; /* skb is pure payload to encrypt */ @@ -151,14 +163,25 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) goto error; nfrags = err; - tmp = esp_alloc_tmp(aead, nfrags + 1); + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); if (!tmp) goto error; - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_givreq(aead, iv); asg = esp_givreq_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; /* Fill padding... */ tail = skb_tail_pointer(trailer); @@ -215,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } esph->spi = x->id.spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, esph->enc_data + crypto_aead_ivsize(aead) - skb->data, clen + alen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_givcrypt_set_callback(req, 0, esp_output_done, skb); aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); + aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output); + XFRM_SKB_CB(skb)->seq.output.low); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); @@ -245,7 +276,7 @@ error: static int esp_input_done2(struct sk_buff *skb, int err) { - struct iphdr *iph; + const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); struct esp_data *esp = x->data; struct crypto_aead *aead = esp->aead; @@ -346,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct sk_buff *trailer; int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); int nfrags; + int assoclen; + int sglists; + int seqhilen; + __be32 *seqhi; void *tmp; u8 *iv; struct scatterlist *sg; @@ -362,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) goto out; nfrags = err; + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + err = -ENOMEM; - tmp = esp_alloc_tmp(aead, nfrags + 1); + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); if (!tmp) goto out; ESP_SKB_CB(skb)->tmp = tmp; - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); asg = esp_req_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; skb->ip_summed = CHECKSUM_NONE; @@ -382,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_request_set_callback(req, 0, esp_input_done, skb); aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, sizeof(*esph)); + aead_request_set_assoc(req, asg, assoclen); err = crypto_aead_decrypt(req); if (err == -EINPROGRESS) @@ -430,7 +484,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) static void esp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; @@ -438,7 +492,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; - x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", @@ -500,10 +555,20 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; err = -ENAMETOOLONG; - if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", - x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) - goto error; + + if ((x->props.flags & XFRM_STATE_ESN)) { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authencesn(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } else { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1d2cdd4..2252471 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -44,6 +44,7 @@ #include <net/arp.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> +#include <net/xfrm.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -51,11 +52,11 @@ static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; - local_table = fib_hash_table(RT_TABLE_LOCAL); + local_table = fib_trie_table(RT_TABLE_LOCAL); if (local_table == NULL) return -ENOMEM; - main_table = fib_hash_table(RT_TABLE_MAIN); + main_table = fib_trie_table(RT_TABLE_MAIN); if (main_table == NULL) goto fail; @@ -82,7 +83,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) if (tb) return tb; - tb = fib_hash_table(id); + tb = fib_trie_table(id); if (!tb) return NULL; h = id & (FIB_TABLE_HASHSZ - 1); @@ -114,21 +115,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } #endif /* CONFIG_IP_MULTIPLE_TABLES */ -void fib_select_default(struct net *net, - const struct flowi *flp, struct fib_result *res) -{ - struct fib_table *tb; - int table = RT_TABLE_MAIN; -#ifdef CONFIG_IP_MULTIPLE_TABLES - if (res->r == NULL || res->r->action != FR_ACT_TO_TBL) - return; - table = res->r->table; -#endif - tb = fib_get_table(net, table); - if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - fib_table_select_default(tb, flp, res); -} - static void fib_flush(struct net *net) { int flushed = 0; @@ -147,46 +133,6 @@ static void fib_flush(struct net *net) rt_cache_flush(net, -1); } -/** - * __ip_dev_find - find the first device with a given source address. - * @net: the net namespace - * @addr: the source address - * @devref: if true, take a reference on the found device - * - * If a caller uses devref=false, it should be protected by RCU, or RTNL - */ -struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) -{ - struct flowi fl = { - .fl4_dst = addr, - }; - struct fib_result res = { 0 }; - struct net_device *dev = NULL; - struct fib_table *local_table; - -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - - rcu_read_lock(); - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || - fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { - rcu_read_unlock(); - return NULL; - } - if (res.type != RTN_LOCAL) - goto out; - dev = FIB_RES_DEV(res); - - if (dev && devref) - dev_hold(dev); -out: - rcu_read_unlock(); - return dev; -} -EXPORT_SYMBOL(__ip_dev_find); - /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. @@ -195,7 +141,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - struct flowi fl = { .fl4_dst = addr }; + struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned ret = RTN_BROADCAST; struct fib_table *local_table; @@ -213,7 +159,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, if (local_table) { ret = RTN_UNICAST; rcu_read_lock(); - if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { + if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; } @@ -243,45 +189,48 @@ EXPORT_SYMBOL(inet_dev_addr_type); * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ -int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, - struct net_device *dev, __be32 *spec_dst, - u32 *itag, u32 mark) +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, + int oif, struct net_device *dev, __be32 *spec_dst, + u32 *itag) { struct in_device *in_dev; - struct flowi fl = { - .fl4_dst = src, - .fl4_src = dst, - .fl4_tos = tos, - .mark = mark, - .iif = oif - }; + struct flowi4 fl4; struct fib_result res; int no_addr, rpf, accept_local; bool dev_match; int ret; struct net *net; + fl4.flowi4_oif = 0; + fl4.flowi4_iif = oif; + fl4.daddr = src; + fl4.saddr = dst; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + no_addr = rpf = accept_local = 0; in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; - rpf = IN_DEV_RPFILTER(in_dev); + + /* Ignore rp_filter for packets protected by IPsec. */ + rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); + accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); - if (mark && !IN_DEV_SRC_VMARK(in_dev)) - fl.mark = 0; + fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; } if (in_dev == NULL) goto e_inval; net = dev_net(dev); - if (fib_lookup(net, &fl, &res)) + if (fib_lookup(net, &fl4, &res)) goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL || !accept_local) goto e_inval; } - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); fib_combine_itag(itag, &res); dev_match = false; @@ -306,12 +255,12 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, goto last_resort; if (rpf == 1) goto e_rpf; - fl.oif = dev->ifindex; + fl4.flowi4_oif = dev->ifindex; ret = 0; - if (fib_lookup(net, &fl, &res) == 0) { + if (fib_lookup(net, &fl4, &res) == 0) { if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } } @@ -775,12 +724,17 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) } } -static void fib_del_ifaddr(struct in_ifaddr *ifa) +/* Delete primary or secondary address. + * Optionally, on secondary address promotion consider the addresses + * from subnet iprim as deleted, even if they are in device list. + * In this case the secondary ifa can be in device list. + */ +void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; - struct in_ifaddr *prim = ifa; + struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 @@ -788,17 +742,26 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) #define BRD0_OK 4 #define BRD1_OK 8 unsigned ok = 0; + int subnet = 0; /* Primary network */ + int gone = 1; /* Address is missing */ + int same_prefsrc = 0; /* Another primary with same IP */ - if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - fib_magic(RTM_DELROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); - else { + if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); return; } + if (iprim && iprim != prim) { + printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n"); + return; + } + } else if (!ipv4_is_zeronet(any) && + (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); + subnet = 1; } /* Deletion is more complicated than add. @@ -808,6 +771,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) */ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa1 == ifa) { + /* promotion, keep the IP */ + gone = 0; + continue; + } + /* Ignore IFAs from our subnet */ + if (iprim && ifa1->ifa_mask == iprim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, iprim)) + continue; + + /* Ignore ifa1 if it uses different primary IP (prefsrc) */ + if (ifa1->ifa_flags & IFA_F_SECONDARY) { + /* Another address from our subnet? */ + if (ifa1->ifa_mask == prim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, prim)) + prim1 = prim; + else { + /* We reached the secondaries, so + * same_prefsrc should be determined. + */ + if (!same_prefsrc) + continue; + /* Search new prim1 if ifa1 is not + * using the current prim1 + */ + if (!prim1 || + ifa1->ifa_mask != prim1->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, prim1)) + prim1 = inet_ifa_byprefix(in_dev, + ifa1->ifa_address, + ifa1->ifa_mask); + if (!prim1) + continue; + if (prim1->ifa_local != prim->ifa_local) + continue; + } + } else { + if (prim->ifa_local != ifa1->ifa_local) + continue; + prim1 = ifa1; + if (prim != prim1) + same_prefsrc = 1; + } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) @@ -816,19 +822,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; + /* primary has network specific broadcasts */ + if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { + __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; + __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; + + if (!ipv4_is_zeronet(any1)) { + if (ifa->ifa_broadcast == brd1 || + ifa->ifa_broadcast == any1) + ok |= BRD_OK; + if (brd == brd1 || brd == any1) + ok |= BRD1_OK; + if (any == brd1 || any == any1) + ok |= BRD0_OK; + } + } } if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!(ok & BRD1_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); - if (!(ok & BRD0_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (subnet && ifa->ifa_prefixlen < 31) { + if (!(ok & BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok & BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + } if (!(ok & LOCAL_OK)) { fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + if (gone && + inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * @@ -849,11 +873,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) { struct fib_result res; - struct flowi fl = { - .mark = frn->fl_mark, - .fl4_dst = frn->fl_addr, - .fl4_tos = frn->fl_tos, - .fl4_scope = frn->fl_scope, + struct flowi4 fl4 = { + .flowi4_mark = frn->fl_mark, + .daddr = frn->fl_addr, + .flowi4_tos = frn->fl_tos, + .flowi4_scope = frn->fl_scope, }; #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -866,7 +890,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) frn->tb_id = tb->tb_id; rcu_read_lock(); - frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF); + frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; @@ -938,6 +962,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: @@ -945,10 +970,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: - fib_del_ifaddr(ifa); + fib_del_ifaddr(ifa, NULL); + atomic_inc(&net->ipv4.dev_addr_genid); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. * Disable IP. @@ -966,6 +993,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo { struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2, -1); @@ -983,6 +1011,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: @@ -1041,6 +1070,7 @@ static void ip_fib_net_exit(struct net *net) fib4_rules_exit(net); #endif + rtnl_lock(); for (i = 0; i < FIB_TABLE_HASHSZ; i++) { struct fib_table *tb; struct hlist_head *head; @@ -1053,6 +1083,7 @@ static void ip_fib_net_exit(struct net *net) fib_free_table(tb); } } + rtnl_unlock(); kfree(net->ipv4.fib_table_hash); } @@ -1101,5 +1132,5 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_hash_init(); + fib_trie_init(); } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c deleted file mode 100644 index b3acb04..0000000 --- a/net/ipv4/fib_hash.c +++ /dev/null @@ -1,1133 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IPv4 FIB: lookup engine and maintenance routines. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/errno.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/inetdevice.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <linux/proc_fs.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/init.h> -#include <linux/slab.h> - -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/tcp.h> -#include <net/sock.h> -#include <net/ip_fib.h> - -#include "fib_lookup.h" - -static struct kmem_cache *fn_hash_kmem __read_mostly; -static struct kmem_cache *fn_alias_kmem __read_mostly; - -struct fib_node { - struct hlist_node fn_hash; - struct list_head fn_alias; - __be32 fn_key; - struct fib_alias fn_embedded_alias; -}; - -#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) - -struct fn_zone { - struct fn_zone __rcu *fz_next; /* Next not empty zone */ - struct hlist_head __rcu *fz_hash; /* Hash table pointer */ - seqlock_t fz_lock; - u32 fz_hashmask; /* (fz_divisor - 1) */ - - u8 fz_order; /* Zone order (0..32) */ - u8 fz_revorder; /* 32 - fz_order */ - __be32 fz_mask; /* inet_make_mask(order) */ -#define FZ_MASK(fz) ((fz)->fz_mask) - - struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE]; - - int fz_nent; /* Number of entries */ - int fz_divisor; /* Hash size (mask+1) */ -}; - -struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone __rcu *fn_zone_list; -}; - -static inline u32 fn_hash(__be32 key, struct fn_zone *fz) -{ - u32 h = ntohl(key) >> fz->fz_revorder; - h ^= (h>>20); - h ^= (h>>10); - h ^= (h>>5); - h &= fz->fz_hashmask; - return h; -} - -static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) -{ - return dst & FZ_MASK(fz); -} - -static unsigned int fib_hash_genid; - -#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) - -static struct hlist_head *fz_hash_alloc(int divisor) -{ - unsigned long size = divisor * sizeof(struct hlist_head); - - if (size <= PAGE_SIZE) - return kzalloc(size, GFP_KERNEL); - - return (struct hlist_head *) - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); -} - -/* The fib hash lock must be held when this is called. */ -static inline void fn_rebuild_zone(struct fn_zone *fz, - struct hlist_head *old_ht, - int old_divisor) -{ - int i; - - for (i = 0; i < old_divisor; i++) { - struct hlist_node *node, *n; - struct fib_node *f; - - hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { - struct hlist_head *new_head; - - hlist_del_rcu(&f->fn_hash); - - new_head = rcu_dereference_protected(fz->fz_hash, 1) + - fn_hash(f->fn_key, fz); - hlist_add_head_rcu(&f->fn_hash, new_head); - } - } -} - -static void fz_hash_free(struct hlist_head *hash, int divisor) -{ - unsigned long size = divisor * sizeof(struct hlist_head); - - if (size <= PAGE_SIZE) - kfree(hash); - else - free_pages((unsigned long)hash, get_order(size)); -} - -static void fn_rehash_zone(struct fn_zone *fz) -{ - struct hlist_head *ht, *old_ht; - int old_divisor, new_divisor; - u32 new_hashmask; - - new_divisor = old_divisor = fz->fz_divisor; - - switch (old_divisor) { - case EMBEDDED_HASH_SIZE: - new_divisor *= EMBEDDED_HASH_SIZE; - break; - case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE: - new_divisor *= (EMBEDDED_HASH_SIZE/2); - break; - default: - if ((old_divisor << 1) > FZ_MAX_DIVISOR) { - printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); - return; - } - new_divisor = (old_divisor << 1); - break; - } - - new_hashmask = (new_divisor - 1); - -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n", - fz->fz_order, old_divisor); -#endif - - ht = fz_hash_alloc(new_divisor); - - if (ht) { - struct fn_zone nfz; - - memcpy(&nfz, fz, sizeof(nfz)); - - write_seqlock_bh(&fz->fz_lock); - old_ht = rcu_dereference_protected(fz->fz_hash, 1); - RCU_INIT_POINTER(nfz.fz_hash, ht); - nfz.fz_hashmask = new_hashmask; - nfz.fz_divisor = new_divisor; - fn_rebuild_zone(&nfz, old_ht, old_divisor); - fib_hash_genid++; - rcu_assign_pointer(fz->fz_hash, ht); - fz->fz_hashmask = new_hashmask; - fz->fz_divisor = new_divisor; - write_sequnlock_bh(&fz->fz_lock); - - if (old_ht != fz->fz_embedded_hash) { - synchronize_rcu(); - fz_hash_free(old_ht, old_divisor); - } - } -} - -static void fn_free_node_rcu(struct rcu_head *head) -{ - struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu); - - kmem_cache_free(fn_hash_kmem, f); -} - -static inline void fn_free_node(struct fib_node *f) -{ - call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu); -} - -static void fn_free_alias_rcu(struct rcu_head *head) -{ - struct fib_alias *fa = container_of(head, struct fib_alias, rcu); - - kmem_cache_free(fn_alias_kmem, fa); -} - -static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) -{ - fib_release_info(fa->fa_info); - if (fa == &f->fn_embedded_alias) - fa->fa_info = NULL; - else - call_rcu(&fa->rcu, fn_free_alias_rcu); -} - -static struct fn_zone * -fn_new_zone(struct fn_hash *table, int z) -{ - int i; - struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL); - if (!fz) - return NULL; - - seqlock_init(&fz->fz_lock); - fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; - fz->fz_hashmask = fz->fz_divisor - 1; - RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); - fz->fz_order = z; - fz->fz_revorder = 32 - z; - fz->fz_mask = inet_make_mask(z); - - /* Find the first not empty zone with more specific mask */ - for (i = z + 1; i <= 32; i++) - if (table->fn_zones[i]) - break; - if (i > 32) { - /* No more specific masks, we are the first. */ - rcu_assign_pointer(fz->fz_next, - rtnl_dereference(table->fn_zone_list)); - rcu_assign_pointer(table->fn_zone_list, fz); - } else { - rcu_assign_pointer(fz->fz_next, - rtnl_dereference(table->fn_zones[i]->fz_next)); - rcu_assign_pointer(table->fn_zones[i]->fz_next, fz); - } - table->fn_zones[z] = fz; - fib_hash_genid++; - return fz; -} - -int fib_table_lookup(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res, - int fib_flags) -{ - int err; - struct fn_zone *fz; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; - - rcu_read_lock(); - for (fz = rcu_dereference(t->fn_zone_list); - fz != NULL; - fz = rcu_dereference(fz->fz_next)) { - struct hlist_head *head; - struct hlist_node *node; - struct fib_node *f; - __be32 k; - unsigned int seq; - - do { - seq = read_seqbegin(&fz->fz_lock); - k = fz_key(flp->fl4_dst, fz); - - head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - if (f->fn_key != k) - continue; - - err = fib_semantic_match(&f->fn_alias, - flp, res, - fz->fz_order, fib_flags); - if (err <= 0) - goto out; - } - } while (read_seqretry(&fz->fz_lock, seq)); - } - err = 1; -out: - rcu_read_unlock(); - return err; -} - -void fib_table_select_default(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res) -{ - int order, last_idx; - struct hlist_node *node; - struct fib_node *f; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; - struct fn_zone *fz = t->fn_zones[0]; - struct hlist_head *head; - - if (fz == NULL) - return; - - last_idx = -1; - last_resort = NULL; - order = -1; - - rcu_read_lock(); - head = rcu_dereference(fz->fz_hash); - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { - struct fib_info *next_fi = fa->fa_info; - - if (fa->fa_scope != res->scope || - fa->fa_type != RTN_UNICAST) - continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; - - fib_alias_accessed(fa); - - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - fi = next_fi; - order++; - } - } - - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - - if (last_idx >= 0) - fib_result_assign(res, last_resort); - tb->tb_default = last_idx; -out: - rcu_read_unlock(); -} - -/* Insert node F to FZ. */ -static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); - - hlist_add_head_rcu(&f->fn_hash, head); -} - -/* Return the node in FZ matching KEY. */ -static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); - struct hlist_node *node; - struct fib_node *f; - - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - if (f->fn_key == key) - return f; - } - - return NULL; -} - - -static struct fib_alias *fib_fast_alloc(struct fib_node *f) -{ - struct fib_alias *fa = &f->fn_embedded_alias; - - if (fa->fa_info != NULL) - fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - return fa; -} - -/* Caller must hold RTNL. */ -int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fib_node *new_f = NULL; - struct fib_node *f; - struct fib_alias *fa, *new_fa; - struct fn_zone *fz; - struct fib_info *fi; - u8 tos = cfg->fc_tos; - __be32 key; - int err; - - if (cfg->fc_dst_len > 32) - return -EINVAL; - - fz = table->fn_zones[cfg->fc_dst_len]; - if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len))) - return -ENOBUFS; - - key = 0; - if (cfg->fc_dst) { - if (cfg->fc_dst & ~FZ_MASK(fz)) - return -EINVAL; - key = fz_key(cfg->fc_dst, fz); - } - - fi = fib_create_info(cfg); - if (IS_ERR(fi)) - return PTR_ERR(fi); - - if (fz->fz_nent > (fz->fz_divisor<<1) && - fz->fz_divisor < FZ_MAX_DIVISOR && - (cfg->fc_dst_len == 32 || - (1 << cfg->fc_dst_len) > fz->fz_divisor)) - fn_rehash_zone(fz); - - f = fib_find_node(fz, key); - - if (!f) - fa = NULL; - else - fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority); - - /* Now fa, if non-NULL, points to the first fib alias - * with the same keys [prefix,tos,priority], if such key already - * exists or to the node before which we will insert new one. - * - * If fa is NULL, we will need to allocate a new one and - * insert to the head of f. - * - * If f is NULL, no fib node matched the destination key - * and we need to allocate a new one of those as well. - */ - - if (fa && fa->fa_tos == tos && - fa->fa_info->fib_priority == fi->fib_priority) { - struct fib_alias *fa_first, *fa_match; - - err = -EEXIST; - if (cfg->fc_nlflags & NLM_F_EXCL) - goto out; - - /* We have 2 goals: - * 1. Find exact match for type, scope, fib_info to avoid - * duplicate routes - * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it - */ - fa_match = NULL; - fa_first = fa; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { - if (fa->fa_tos != tos) - break; - if (fa->fa_info->fib_priority != fi->fib_priority) - break; - if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && - fa->fa_info == fi) { - fa_match = fa; - break; - } - } - - if (cfg->fc_nlflags & NLM_F_REPLACE) { - u8 state; - - fa = fa_first; - if (fa_match) { - if (fa == fa_match) - err = 0; - goto out; - } - err = -ENOBUFS; - new_fa = fib_fast_alloc(f); - if (new_fa == NULL) - goto out; - - new_fa->fa_tos = fa->fa_tos; - new_fa->fa_info = fi; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - state = fa->fa_state; - new_fa->fa_state = state & ~FA_S_ACCESSED; - fib_hash_genid++; - list_replace_rcu(&fa->fa_list, &new_fa->fa_list); - - fn_free_alias(fa, f); - if (state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); - return 0; - } - - /* Error if we find a perfect match which - * uses the same scope, type, and nexthop - * information. - */ - if (fa_match) - goto out; - - if (!(cfg->fc_nlflags & NLM_F_APPEND)) - fa = fa_first; - } - - err = -ENOENT; - if (!(cfg->fc_nlflags & NLM_F_CREATE)) - goto out; - - err = -ENOBUFS; - - if (!f) { - new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL); - if (new_f == NULL) - goto out; - - INIT_HLIST_NODE(&new_f->fn_hash); - INIT_LIST_HEAD(&new_f->fn_alias); - new_f->fn_key = key; - f = new_f; - } - - new_fa = fib_fast_alloc(f); - if (new_fa == NULL) - goto out; - - new_fa->fa_info = fi; - new_fa->fa_tos = tos; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - new_fa->fa_state = 0; - - /* - * Insert new entry to the list. - */ - - if (new_f) - fib_insert_node(fz, new_f); - list_add_tail_rcu(&new_fa->fa_list, - (fa ? &fa->fa_list : &f->fn_alias)); - fib_hash_genid++; - - if (new_f) - fz->fz_nent++; - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - - rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, - &cfg->fc_nlinfo, 0); - return 0; - -out: - if (new_f) - kmem_cache_free(fn_hash_kmem, new_f); - fib_release_info(fi); - return err; -} - -int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) -{ - struct fn_hash *table = (struct fn_hash *)tb->tb_data; - struct fib_node *f; - struct fib_alias *fa, *fa_to_delete; - struct fn_zone *fz; - __be32 key; - - if (cfg->fc_dst_len > 32) - return -EINVAL; - - if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL) - return -ESRCH; - - key = 0; - if (cfg->fc_dst) { - if (cfg->fc_dst & ~FZ_MASK(fz)) - return -EINVAL; - key = fz_key(cfg->fc_dst, fz); - } - - f = fib_find_node(fz, key); - - if (!f) - fa = NULL; - else - fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0); - if (!fa) - return -ESRCH; - - fa_to_delete = NULL; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fa->fa_tos != cfg->fc_tos) - break; - - if ((!cfg->fc_type || - fa->fa_type == cfg->fc_type) && - (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && - (!cfg->fc_protocol || - fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi) == 0) { - fa_to_delete = fa; - break; - } - } - - if (fa_to_delete) { - int kill_fn; - - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo, 0); - - kill_fn = 0; - list_del_rcu(&fa->fa_list); - if (list_empty(&f->fn_alias)) { - hlist_del_rcu(&f->fn_hash); - kill_fn = 1; - } - fib_hash_genid++; - - if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - fn_free_alias(fa, f); - if (kill_fn) { - fn_free_node(f); - fz->fz_nent--; - } - - return 0; - } - return -ESRCH; -} - -static int fn_flush_list(struct fn_zone *fz, int idx) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; - struct hlist_node *node, *n; - struct fib_node *f; - int found = 0; - - hlist_for_each_entry_safe(f, node, n, head, fn_hash) { - struct fib_alias *fa, *fa_node; - int kill_f; - - kill_f = 0; - list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - list_del_rcu(&fa->fa_list); - if (list_empty(&f->fn_alias)) { - hlist_del_rcu(&f->fn_hash); - kill_f = 1; - } - fib_hash_genid++; - - fn_free_alias(fa, f); - found++; - } - } - if (kill_f) { - fn_free_node(f); - fz->fz_nent--; - } - } - return found; -} - -/* caller must hold RTNL. */ -int fib_table_flush(struct fib_table *tb) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz; - int found = 0; - - for (fz = rtnl_dereference(table->fn_zone_list); - fz != NULL; - fz = rtnl_dereference(fz->fz_next)) { - int i; - - for (i = fz->fz_divisor - 1; i >= 0; i--) - found += fn_flush_list(fz, i); - } - return found; -} - -void fib_free_table(struct fib_table *tb) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz, *next; - - next = table->fn_zone_list; - while (next != NULL) { - fz = next; - next = fz->fz_next; - - if (fz->fz_hash != fz->fz_embedded_hash) - fz_hash_free(fz->fz_hash, fz->fz_divisor); - - kfree(fz); - } - - kfree(tb); -} - -static inline int -fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, - struct fib_table *tb, - struct fn_zone *fz, - struct hlist_head *head) -{ - struct hlist_node *node; - struct fib_node *f; - int i, s_i; - - s_i = cb->args[4]; - i = 0; - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { - if (i < s_i) - goto next; - - if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - fa->fa_scope, - f->fn_key, - fz->fz_order, - fa->fa_tos, - fa->fa_info, - NLM_F_MULTI) < 0) { - cb->args[4] = i; - return -1; - } -next: - i++; - } - } - cb->args[4] = i; - return skb->len; -} - -static inline int -fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, - struct fib_table *tb, - struct fn_zone *fz) -{ - int h, s_h; - struct hlist_head *head = rcu_dereference(fz->fz_hash); - - if (head == NULL) - return skb->len; - s_h = cb->args[3]; - for (h = s_h; h < fz->fz_divisor; h++) { - if (hlist_empty(head + h)) - continue; - if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { - cb->args[3] = h; - return -1; - } - memset(&cb->args[4], 0, - sizeof(cb->args) - 4*sizeof(cb->args[0])); - } - cb->args[3] = h; - return skb->len; -} - -int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) -{ - int m = 0, s_m; - struct fn_zone *fz; - struct fn_hash *table = (struct fn_hash *)tb->tb_data; - - s_m = cb->args[2]; - rcu_read_lock(); - for (fz = rcu_dereference(table->fn_zone_list); - fz != NULL; - fz = rcu_dereference(fz->fz_next), m++) { - if (m < s_m) - continue; - if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { - cb->args[2] = m; - rcu_read_unlock(); - return -1; - } - memset(&cb->args[3], 0, - sizeof(cb->args) - 3*sizeof(cb->args[0])); - } - rcu_read_unlock(); - cb->args[2] = m; - return skb->len; -} - -void __init fib_hash_init(void) -{ - fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), - 0, SLAB_PANIC, NULL); - - fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); - -} - -struct fib_table *fib_hash_table(u32 id) -{ - struct fib_table *tb; - - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), - GFP_KERNEL); - if (tb == NULL) - return NULL; - - tb->tb_id = id; - tb->tb_default = -1; - - memset(tb->tb_data, 0, sizeof(struct fn_hash)); - return tb; -} - -/* ------------------------------------------------------------------------ */ -#ifdef CONFIG_PROC_FS - -struct fib_iter_state { - struct seq_net_private p; - struct fn_zone *zone; - int bucket; - struct hlist_head *hash_head; - struct fib_node *fn; - struct fib_alias *fa; - loff_t pos; - unsigned int genid; - int valid; -}; - -static struct fib_alias *fib_get_first(struct seq_file *seq) -{ - struct fib_iter_state *iter = seq->private; - struct fib_table *main_table; - struct fn_hash *table; - - main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); - table = (struct fn_hash *)main_table->tb_data; - - iter->bucket = 0; - iter->hash_head = NULL; - iter->fn = NULL; - iter->fa = NULL; - iter->pos = 0; - iter->genid = fib_hash_genid; - iter->valid = 1; - - for (iter->zone = rcu_dereference(table->fn_zone_list); - iter->zone != NULL; - iter->zone = rcu_dereference(iter->zone->fz_next)) { - int maxslot; - - if (!iter->zone->fz_nent) - continue; - - iter->hash_head = rcu_dereference(iter->zone->fz_hash); - maxslot = iter->zone->fz_divisor; - - for (iter->bucket = 0; iter->bucket < maxslot; - ++iter->bucket, ++iter->hash_head) { - struct hlist_node *node; - struct fib_node *fn; - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } - } -out: - return iter->fa; -} - -static struct fib_alias *fib_get_next(struct seq_file *seq) -{ - struct fib_iter_state *iter = seq->private; - struct fib_node *fn; - struct fib_alias *fa; - - /* Advance FA, if any. */ - fn = iter->fn; - fa = iter->fa; - if (fa) { - BUG_ON(!fn); - list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) { - iter->fa = fa; - goto out; - } - } - - fa = iter->fa = NULL; - - /* Advance FN. */ - if (fn) { - struct hlist_node *node = &fn->fn_hash; - hlist_for_each_entry_continue(fn, node, fn_hash) { - iter->fn = fn; - - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fa = fa; - goto out; - } - } - } - - fn = iter->fn = NULL; - - /* Advance hash chain. */ - if (!iter->zone) - goto out; - - for (;;) { - struct hlist_node *node; - int maxslot; - - maxslot = iter->zone->fz_divisor; - - while (++iter->bucket < maxslot) { - iter->hash_head++; - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } - - iter->zone = rcu_dereference(iter->zone->fz_next); - - if (!iter->zone) - goto out; - - iter->bucket = 0; - iter->hash_head = rcu_dereference(iter->zone->fz_hash); - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } -out: - iter->pos++; - return fa; -} - -static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) -{ - struct fib_iter_state *iter = seq->private; - struct fib_alias *fa; - - if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) { - fa = iter->fa; - pos -= iter->pos; - } else - fa = fib_get_first(seq); - - if (fa) - while (pos && (fa = fib_get_next(seq))) - --pos; - return pos ? NULL : fa; -} - -static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - void *v = NULL; - - rcu_read_lock(); - if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) - v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - return v; -} - -static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq); -} - -static void fib_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) -{ - static const unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, - [8] = RTF_REJECT, - }; - unsigned flags = type2flags[type]; - - if (fi && fi->fib_nh->nh_gw) - flags |= RTF_GATEWAY; - if (mask == htonl(0xFFFFFFFF)) - flags |= RTF_HOST; - flags |= RTF_UP; - return flags; -} - -/* - * This outputs /proc/net/route. - * - * It always works in backward compatibility mode. - * The format of the file is not supposed to be changed. - */ -static int fib_seq_show(struct seq_file *seq, void *v) -{ - struct fib_iter_state *iter; - int len; - __be32 prefix, mask; - unsigned flags; - struct fib_node *f; - struct fib_alias *fa; - struct fib_info *fi; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " - "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" - "\tWindow\tIRTT"); - goto out; - } - - iter = seq->private; - f = iter->fn; - fa = iter->fa; - fi = fa->fa_info; - prefix = f->fn_key; - mask = FZ_MASK(iter->zone); - flags = fib_flag_trans(fa->fa_type, mask, fi); - if (fi) - seq_printf(seq, - "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", - fi->fib_dev ? fi->fib_dev->name : "*", prefix, - fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, - mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), - fi->fib_window, - fi->fib_rtt >> 3, &len); - else - seq_printf(seq, - "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", - prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len); - - seq_printf(seq, "%*s\n", 127 - len, ""); -out: - return 0; -} - -static const struct seq_operations fib_seq_ops = { - .start = fib_seq_start, - .next = fib_seq_next, - .stop = fib_seq_stop, - .show = fib_seq_show, -}; - -static int fib_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &fib_seq_ops, - sizeof(struct fib_iter_state)); -} - -static const struct file_operations fib_seq_fops = { - .owner = THIS_MODULE, - .open = fib_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -int __net_init fib_proc_init(struct net *net) -{ - if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) - return -ENOMEM; - return 0; -} - -void __net_exit fib_proc_exit(struct net *net) -{ - proc_net_remove(net, "route"); -} -#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index c079cc0..af0f14a 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -10,7 +10,6 @@ struct fib_alias { struct fib_info *fa_info; u8 fa_tos; u8 fa_type; - u8 fa_scope; u8 fa_state; struct rcu_head rcu; }; @@ -25,14 +24,11 @@ static inline void fib_alias_accessed(struct fib_alias *fa) } /* Exported by fib_semantics.c */ -extern int fib_semantic_match(struct list_head *head, - const struct flowi *flp, - struct fib_result *res, int prefixlen, int fib_flags); extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, @@ -51,4 +47,11 @@ static inline void fib_result_assign(struct fib_result *res, res->fi = fi; } +struct fib_prop { + int error; + u8 scope; +}; + +extern const struct fib_prop fib_props[RTN_MAX + 1]; + #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7981a24..a53bb1b 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -41,19 +41,19 @@ struct fib4_rule { __be32 srcmask; __be32 dst; __be32 dstmask; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; -#ifdef CONFIG_NET_CLS_ROUTE -u32 fib_rules_tclass(struct fib_result *res) +#ifdef CONFIG_IP_ROUTE_CLASSID +u32 fib_rules_tclass(const struct fib_result *res) { return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; } #endif -int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) +int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, @@ -61,7 +61,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) }; int err; - err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); + err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); res->r = arg.rule; return err; @@ -95,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, if (!tbl) goto errout; - err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); + err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags); if (err > 0) err = -EAGAIN; errout: @@ -106,14 +106,15 @@ errout: static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; - __be32 daddr = fl->fl4_dst; - __be32 saddr = fl->fl4_src; + struct flowi4 *fl4 = &fl->u.ip4; + __be32 daddr = fl4->daddr; + __be32 saddr = fl4->saddr; if (((saddr ^ r->src) & r->srcmask) || ((daddr ^ r->dst) & r->dstmask)) return 0; - if (r->tos && (r->tos != fl->fl4_tos)) + if (r->tos && (r->tos != fl4->flowi4_tos)) return 0; return 1; @@ -165,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (frh->dst_len) rule4->dst = nla_get_be32(tb[FRA_DST]); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); #endif @@ -195,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->tos && (rule4->tos != frh->tos)) return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif @@ -224,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->src_len) NLA_PUT_BE32(skb, FRA_SRC, rule4->src); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid) NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 12d3dc3..33e2c35 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -49,7 +49,7 @@ static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; -static unsigned int fib_hash_size; +static unsigned int fib_info_hash_size; static unsigned int fib_info_cnt; #define DEVINDEX_HASHBITS 8 @@ -90,11 +90,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock); #define endfor_nexthops(fi) } -static const struct -{ - int error; - u8 scope; -} fib_props[RTN_MAX + 1] = { +const struct fib_prop fib_props[RTN_MAX + 1] = { [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, @@ -145,16 +141,8 @@ static const struct }, }; - /* Release a nexthop info record */ -static void free_fib_info_rcu(struct rcu_head *head) -{ - struct fib_info *fi = container_of(head, struct fib_info, rcu); - - kfree(fi); -} - void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { @@ -168,7 +156,7 @@ void free_fib_info(struct fib_info *fi) } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); - call_rcu(&fi->rcu, free_fib_info_rcu); + kfree_rcu(fi, rcu); } void fib_release_info(struct fib_info *fi) @@ -200,7 +188,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) @@ -221,10 +209,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) static inline unsigned int fib_info_hashfn(const struct fib_info *fi) { - unsigned int mask = (fib_hash_size - 1); + unsigned int mask = (fib_info_hash_size - 1); unsigned int val = fi->fib_nhs; - val ^= fi->fib_protocol; + val ^= (fi->fib_protocol << 8) | fi->fib_scope; val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; for_nexthops(fi) { @@ -250,10 +238,11 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, - sizeof(fi->fib_metrics)) == 0 && + sizeof(u32) * RTAX_MAX) == 0 && ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; @@ -330,7 +319,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, goto errout; err = fib_dump_info(skb, info->pid, seq, event, tb_id, - fa->fa_type, fa->fa_scope, key, dst_len, + fa->fa_type, key, dst_len, fa->fa_tos, fa->fa_info, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -422,7 +411,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_GATEWAY); nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif @@ -476,7 +465,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla && nla_get_be32(nla) != nh->nh_gw) return 1; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) return 1; @@ -562,16 +551,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } rcu_read_lock(); { - struct flowi fl = { - .fl4_dst = nh->nh_gw, - .fl4_scope = cfg->fc_scope + 1, - .oif = nh->nh_oif, + struct flowi4 fl4 = { + .daddr = nh->nh_gw, + .flowi4_scope = cfg->fc_scope + 1, + .flowi4_oif = nh->nh_oif, }; /* It is not necessary, but requires a bit of thinking */ - if (fl.fl4_scope < RT_SCOPE_LINK) - fl.fl4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl, &res); + if (fl4.flowi4_scope < RT_SCOPE_LINK) + fl4.flowi4_scope = RT_SCOPE_LINK; + err = fib_lookup(net, &fl4, &res); if (err) { rcu_read_unlock(); return err; @@ -613,14 +602,14 @@ out: static inline unsigned int fib_laddr_hashfn(__be32 val) { - unsigned int mask = (fib_hash_size - 1); + unsigned int mask = (fib_info_hash_size - 1); return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; } -static struct hlist_head *fib_hash_alloc(int bytes) +static struct hlist_head *fib_info_hash_alloc(int bytes) { if (bytes <= PAGE_SIZE) return kzalloc(bytes, GFP_KERNEL); @@ -630,7 +619,7 @@ static struct hlist_head *fib_hash_alloc(int bytes) get_order(bytes)); } -static void fib_hash_free(struct hlist_head *hash, int bytes) +static void fib_info_hash_free(struct hlist_head *hash, int bytes) { if (!hash) return; @@ -641,18 +630,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes) free_pages((unsigned long) hash, get_order(bytes)); } -static void fib_hash_move(struct hlist_head *new_info_hash, - struct hlist_head *new_laddrhash, - unsigned int new_size) +static void fib_info_hash_move(struct hlist_head *new_info_hash, + struct hlist_head *new_laddrhash, + unsigned int new_size) { struct hlist_head *old_info_hash, *old_laddrhash; - unsigned int old_size = fib_hash_size; + unsigned int old_size = fib_info_hash_size; unsigned int i, bytes; spin_lock_bh(&fib_info_lock); old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; - fib_hash_size = new_size; + fib_info_hash_size = new_size; for (i = 0; i < old_size; i++) { struct hlist_head *head = &fib_info_hash[i]; @@ -693,8 +682,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash, spin_unlock_bh(&fib_info_lock); bytes = old_size * sizeof(struct hlist_head *); - fib_hash_free(old_info_hash, bytes); - fib_hash_free(old_laddrhash, bytes); + fib_info_hash_free(old_info_hash, bytes); + fib_info_hash_free(old_laddrhash, bytes); +} + +__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +{ + nh->nh_saddr = inet_select_addr(nh->nh_dev, + nh->nh_gw, + nh->nh_parent->fib_scope); + nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); + + return nh->nh_saddr; } struct fib_info *fib_create_info(struct fib_config *cfg) @@ -705,6 +704,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; + if (cfg->fc_type > RTN_MAX) + goto err_inval; + /* Fast check to catch the most weird cases */ if (fib_props[cfg->fc_type].scope > cfg->fc_scope) goto err_inval; @@ -718,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) #endif err = -ENOBUFS; - if (fib_info_cnt >= fib_hash_size) { - unsigned int new_size = fib_hash_size << 1; + if (fib_info_cnt >= fib_info_hash_size) { + unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; struct hlist_head *new_laddrhash; unsigned int bytes; @@ -727,25 +729,32 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!new_size) new_size = 1; bytes = new_size * sizeof(struct hlist_head *); - new_info_hash = fib_hash_alloc(bytes); - new_laddrhash = fib_hash_alloc(bytes); + new_info_hash = fib_info_hash_alloc(bytes); + new_laddrhash = fib_info_hash_alloc(bytes); if (!new_info_hash || !new_laddrhash) { - fib_hash_free(new_info_hash, bytes); - fib_hash_free(new_laddrhash, bytes); + fib_info_hash_free(new_info_hash, bytes); + fib_info_hash_free(new_laddrhash, bytes); } else - fib_hash_move(new_info_hash, new_laddrhash, new_size); + fib_info_hash_move(new_info_hash, new_laddrhash, new_size); - if (!fib_hash_size) + if (!fib_info_hash_size) goto failure; } fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + if (cfg->fc_mx) { + fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!fi->fib_metrics) + goto failure; + } else + fi->fib_metrics = (u32 *) dst_default_metrics; fib_info_cnt++; fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; + fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; @@ -779,7 +788,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) goto err_inval; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) goto err_inval; #endif @@ -792,7 +801,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -804,6 +813,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) goto err_inval; goto link_it; + } else { + switch (cfg->fc_type) { + case RTN_UNICAST: + case RTN_LOCAL: + case RTN_BROADCAST: + case RTN_ANYCAST: + case RTN_MULTICAST: + break; + default: + goto err_inval; + } } if (cfg->fc_scope > RT_SCOPE_HOST) @@ -835,6 +855,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; } + change_nexthops(fi) { + fib_info_update_nh_saddr(net, nexthop_nh); + } endfor_nexthops(fi) + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -880,86 +904,8 @@ failure: return ERR_PTR(err); } -/* Note! fib_semantic_match intentionally uses RCU list functions. */ -int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, int prefixlen, int fib_flags) -{ - struct fib_alias *fa; - int nh_sel = 0; - - list_for_each_entry_rcu(fa, head, fa_list) { - int err; - - if (fa->fa_tos && - fa->fa_tos != flp->fl4_tos) - continue; - - if (fa->fa_scope < flp->fl4_scope) - continue; - - fib_alias_accessed(fa); - - err = fib_props[fa->fa_type].error; - if (err == 0) { - struct fib_info *fi = fa->fa_info; - - if (fi->fib_flags & RTNH_F_DEAD) - continue; - - switch (fa->fa_type) { - case RTN_UNICAST: - case RTN_LOCAL: - case RTN_BROADCAST: - case RTN_ANYCAST: - case RTN_MULTICAST: - for_nexthops(fi) { - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (!flp->oif || flp->oif == nh->nh_oif) - break; - } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (nhsel < fi->fib_nhs) { - nh_sel = nhsel; - goto out_fill_res; - } -#else - if (nhsel < 1) - goto out_fill_res; -#endif - endfor_nexthops(fi); - continue; - - default: - pr_warning("fib_semantic_match bad type %#x\n", - fa->fa_type); - return -EINVAL; - } - } - return err; - } - return 1; - -out_fill_res: - res->prefixlen = prefixlen; - res->nh_sel = nh_sel; - res->type = fa->fa_type; - res->scope = fa->fa_scope; - res->fi = fa->fa_info; - if (!(fib_flags & FIB_LOOKUP_NOREF)) - atomic_inc(&res->fi->fib_clntref); - return 0; -} - -/* Find appropriate source address to this destination */ - -__be32 __fib_res_prefsrc(struct fib_result *res) -{ - return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); -} - int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { struct nlmsghdr *nlh; @@ -981,7 +927,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, NLA_PUT_U32(skb, RTA_TABLE, tb_id); rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; - rtm->rtm_scope = scope; + rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len) @@ -1002,7 +948,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (fi->fib_nh->nh_oif) NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); #endif @@ -1027,7 +973,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (nh->nh_gw) NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); #endif @@ -1125,6 +1071,62 @@ int fib_sync_down_dev(struct net_device *dev, int force) return ret; } +/* Must be invoked inside of an RCU protected region. */ +void fib_select_default(struct fib_result *res) +{ + struct fib_info *fi = NULL, *last_resort = NULL; + struct list_head *fa_head = res->fa_head; + struct fib_table *tb = res->table; + int order = -1, last_idx = -1; + struct fib_alias *fa; + + list_for_each_entry_rcu(fa, fa_head, fa_list) { + struct fib_info *next_fi = fa->fa_info; + + if (next_fi->fib_scope != res->scope || + fa->fa_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || + next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + + fib_alias_accessed(fa); + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + fi = next_fi; + order++; + } + + if (order <= 0 || fi == NULL) { + tb->tb_default = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; +out: + return; +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH /* @@ -1189,7 +1191,7 @@ int fib_sync_up(struct net_device *dev) * The algorithm is suboptimal, but it provides really * fair weighted route distribution. */ -void fib_select_multipath(const struct flowi *flp, struct fib_result *res) +void fib_select_multipath(struct fib_result *res) { struct fib_info *fi = res->fi; int w; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0f28034..58c25ea 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -12,7 +12,7 @@ * * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet * - * This work is based on the LPC-trie which is originally descibed in: + * This work is based on the LPC-trie which is originally described in: * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. @@ -72,6 +72,7 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/prefetch.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -95,7 +96,7 @@ typedef unsigned int t_key; #define IS_TNODE(n) (!(n->parent & T_LEAF)) #define IS_LEAF(n) (n->parent & T_LEAF) -struct node { +struct rt_trie_node { unsigned long parent; t_key key; }; @@ -126,7 +127,7 @@ struct tnode { struct work_struct work; struct tnode *tnode_free; }; - struct node *child[0]; + struct rt_trie_node __rcu *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -151,16 +152,16 @@ struct trie_stat { }; struct trie { - struct node *trie; + struct rt_trie_node __rcu *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull); -static struct node *resize(struct trie *t, struct tnode *tn); +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ @@ -177,39 +178,58 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; -static inline struct tnode *node_parent(struct node *node) +/* + * caller must hold RTNL + */ +static inline struct tnode *node_parent(const struct rt_trie_node *node) { - return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); + + return (struct tnode *)(parent & ~NODE_TYPE_MASK); } -static inline struct tnode *node_parent_rcu(struct node *node) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) { - struct tnode *ret = node_parent(node); + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || + lockdep_rtnl_is_held()); - return rcu_dereference_rtnl(ret); + return (struct tnode *)(parent & ~NODE_TYPE_MASK); } /* Same as rcu_assign_pointer * but that macro() assumes that value is a pointer. */ -static inline void node_set_parent(struct node *node, struct tnode *ptr) +static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) { smp_wmb(); node->parent = (unsigned long)ptr | NODE_TYPE(node); } -static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) +/* + * caller must hold RTNL + */ +static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) { BUG_ON(i >= 1U << tn->bits); - return tn->child[i]; + return rtnl_dereference(tn->child[i]); } -static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) { - struct node *ret = tnode_get_child(tn, i); + BUG_ON(i >= 1U << tn->bits); - return rcu_dereference_rtnl(ret); + return rcu_dereference_rtnl(tn->child[i]); } static inline int tnode_child_length(const struct tnode *tn) @@ -217,12 +237,12 @@ static inline int tnode_child_length(const struct tnode *tn) return 1 << tn->bits; } -static inline t_key mask_pfx(t_key k, unsigned short l) +static inline t_key mask_pfx(t_key k, unsigned int l) { return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); } -static inline t_key tkey_extract_bits(t_key a, int offset, int bits) +static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) { if (offset < KEYLENGTH) return ((t_key)(a << offset)) >> (KEYLENGTH - bits); @@ -350,14 +370,9 @@ static inline void free_leaf(struct leaf *l) call_rcu_bh(&l->rcu, __leaf_free_rcu); } -static void __leaf_info_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct leaf_info, rcu)); -} - static inline void free_leaf_info(struct leaf_info *leaf) { - call_rcu(&leaf->rcu, __leaf_info_free_rcu); + kfree_rcu(leaf, rcu); } static struct tnode *tnode_alloc(size_t size) @@ -378,7 +393,7 @@ static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); size_t size = sizeof(struct tnode) + - (sizeof(struct node *) << tn->bits); + (sizeof(struct rt_trie_node *) << tn->bits); if (size <= PAGE_SIZE) kfree(tn); @@ -402,7 +417,7 @@ static void tnode_free_safe(struct tnode *tn) tn->tnode_free = tnode_free_head; tnode_free_head = tn; tnode_free_size += sizeof(struct tnode) + - (sizeof(struct node *) << tn->bits); + (sizeof(struct rt_trie_node *) << tn->bits); } static void tnode_free_flush(void) @@ -443,7 +458,7 @@ static struct leaf_info *leaf_info_new(int plen) static struct tnode *tnode_new(t_key key, int pos, int bits) { - size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); + size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); struct tnode *tn = tnode_alloc(sz); if (tn) { @@ -456,7 +471,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct node) << bits); + sizeof(struct rt_trie_node) << bits); return tn; } @@ -465,7 +480,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(const struct tnode *tn, const struct node *n) +static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n) { if (n == NULL || IS_LEAF(n)) return 0; @@ -474,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n) } static inline void put_child(struct trie *t, struct tnode *tn, int i, - struct node *n) + struct rt_trie_node *n) { tnode_put_child_reorg(tn, i, n, -1); } @@ -484,10 +499,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull) { - struct node *chi = tn->child[i]; + struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); int isfull; BUG_ON(i >= 1<<tn->bits); @@ -515,7 +530,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, } #define MAX_WORK 10 -static struct node *resize(struct trie *t, struct tnode *tn) +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) { int i; struct tnode *old_tn; @@ -605,7 +620,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if (!node_parent((struct node *)tn)) { + if (!node_parent((struct rt_trie_node *)tn)) { inflate_threshold_use = inflate_threshold_root; halve_threshold_use = halve_threshold_root; } else { @@ -635,7 +650,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return (struct node *) tn; + return (struct rt_trie_node *) tn; /* * Halve as long as the number of empty children in this @@ -663,9 +678,9 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (tn->empty_children == tnode_child_length(tn) - 1) { one_child: for (i = 0; i < tnode_child_length(tn); i++) { - struct node *n; + struct rt_trie_node *n; - n = tn->child[i]; + n = rtnl_dereference(tn->child[i]); if (!n) continue; @@ -676,7 +691,21 @@ one_child: return n; } } - return (struct node *) tn; + return (struct rt_trie_node *) tn; +} + + +static void tnode_clean_free(struct tnode *tn) +{ + int i; + struct tnode *tofree; + + for (i = 0; i < tnode_child_length(tn); i++) { + tofree = (struct tnode *)rtnl_dereference(tn->child[i]); + if (tofree) + tnode_free(tofree); + } + tnode_free(tn); } static struct tnode *inflate(struct trie *t, struct tnode *tn) @@ -723,14 +752,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(t, tn, 2*i, (struct node *) left); - put_child(t, tn, 2*i+1, (struct node *) right); + put_child(t, tn, 2*i, (struct rt_trie_node *) left); + put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); } } for (i = 0; i < olen; i++) { struct tnode *inode; - struct node *node = tnode_get_child(oldtnode, i); + struct rt_trie_node *node = tnode_get_child(oldtnode, i); struct tnode *left, *right; int size, j; @@ -755,8 +784,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode = (struct tnode *) node; if (inode->bits == 1) { - put_child(t, tn, 2*i, inode->child[0]); - put_child(t, tn, 2*i+1, inode->child[1]); + put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); tnode_free_safe(inode); continue; @@ -797,8 +826,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) size = tnode_child_length(left); for (j = 0; j < size; j++) { - put_child(t, left, j, inode->child[j]); - put_child(t, right, j, inode->child[j + size]); + put_child(t, left, j, rtnl_dereference(inode->child[j])); + put_child(t, right, j, rtnl_dereference(inode->child[j + size])); } put_child(t, tn, 2*i, resize(t, left)); put_child(t, tn, 2*i+1, resize(t, right)); @@ -808,24 +837,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) tnode_free_safe(oldtnode); return tn; nomem: - { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - return ERR_PTR(-ENOMEM); - } + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); } static struct tnode *halve(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; - struct node *left, *right; + struct rt_trie_node *left, *right; int i; int olen = tnode_child_length(tn); @@ -856,7 +875,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(t, tn, i/2, (struct node *)newn); + put_child(t, tn, i/2, (struct rt_trie_node *)newn); } } @@ -890,18 +909,8 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) tnode_free_safe(oldtnode); return tn; nomem: - { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - return ERR_PTR(-ENOMEM); - } + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); } /* readside must use rcu_read_lock currently dump routines @@ -958,7 +967,7 @@ fib_find_node(struct trie *t, u32 key) { int pos; struct tnode *tn; - struct node *n; + struct rt_trie_node *n; pos = 0; n = rcu_dereference_rtnl(t->trie); @@ -993,17 +1002,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) key = tn->key; - while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { + while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize(t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex, - (struct node *)tn, wasfull); + (struct rt_trie_node *)tn, wasfull); - tp = node_parent((struct node *) tn); + tp = node_parent((struct rt_trie_node *) tn); if (!tp) - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); if (!tp) @@ -1015,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) if (IS_TNODE(tn)) tn = (struct tnode *)resize(t, (struct tnode *)tn); - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); } @@ -1025,7 +1034,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) { int pos, newpos; struct tnode *tp = NULL, *tn = NULL; - struct node *n; + struct rt_trie_node *n; struct leaf *l; int missbit; struct list_head *fa_head = NULL; @@ -1033,7 +1042,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) t_key cindex; pos = 0; - n = t->trie; + n = rtnl_dereference(t->trie); /* If we point to NULL, stop. Either the tree is empty and we should * just put a new leaf in if, or we have reached an empty child slot, @@ -1111,10 +1120,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) if (t->trie && n == NULL) { /* Case 2: n is NULL, and will just insert a new leaf */ - node_set_parent((struct node *)l, tp); + node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1141,18 +1150,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) return NULL; } - node_set_parent((struct node *)tn, tp); + node_set_parent((struct rt_trie_node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(t, tn, missbit, (struct node *)l); + put_child(t, tn, missbit, (struct rt_trie_node *)l); put_child(t, tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, - (struct node *)tn); + (struct rt_trie_node *)tn); } else { - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; } } @@ -1245,7 +1254,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && fa->fa_info == fi) { fa_match = fa; break; @@ -1271,7 +1279,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_tos = fa->fa_tos; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; @@ -1308,7 +1315,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; new_fa->fa_state = 0; /* * Insert new entry to the list. @@ -1322,6 +1328,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } } + if (!plen) + tb->tb_num_default++; + list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); @@ -1340,8 +1349,8 @@ err: } /* should be called with rcu_read_lock */ -static int check_leaf(struct trie *t, struct leaf *l, - t_key key, const struct flowi *flp, +static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, + t_key key, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct leaf_info *li; @@ -1349,40 +1358,75 @@ static int check_leaf(struct trie *t, struct leaf *l, struct hlist_node *node; hlist_for_each_entry_rcu(li, node, hhead, hlist) { - int err; + struct fib_alias *fa; int plen = li->plen; __be32 mask = inet_make_mask(plen); if (l->key != (key & ntohl(mask))) continue; - err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + struct fib_info *fi = fa->fa_info; + int nhsel, err; + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fa->fa_info->fib_scope < flp->flowi4_scope) + continue; + fib_alias_accessed(fa); + err = fib_props[fa->fa_type].error; + if (err) { #ifdef CONFIG_IP_FIB_TRIE_STATS - if (err <= 0) - t->stats.semantic_match_passed++; - else - t->stats.semantic_match_miss++; + t->stats.semantic_match_passed++; +#endif + return err; + } + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (nh->nh_flags & RTNH_F_DEAD) + continue; + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_passed++; +#endif + res->prefixlen = plen; + res->nh_sel = nhsel; + res->type = fa->fa_type; + res->scope = fa->fa_info->fib_scope; + res->fi = fi; + res->table = tb; + res->fa_head = &li->falh; + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&res->fi->fib_clntref); + return 0; + } + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_miss++; #endif - if (err <= 0) - return err; } return 1; } -int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, +int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; int ret; - struct node *n; + struct rt_trie_node *n; struct tnode *pn; - int pos, bits; - t_key key = ntohl(flp->fl4_dst); - int chopped_off; + unsigned int pos, bits; + t_key key = ntohl(flp->daddr); + unsigned int chopped_off; t_key cindex = 0; - int current_prefix_length = KEYLENGTH; + unsigned int current_prefix_length = KEYLENGTH; struct tnode *cn; t_key pref_mismatch; @@ -1398,7 +1442,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, /* Just a leaf? */ if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); goto found; } @@ -1423,7 +1467,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, } if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); if (ret > 0) goto backtrace; goto found; @@ -1541,7 +1585,7 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - struct tnode *parent = node_parent_rcu((struct node *) pn); + struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn); if (!parent) goto failed; @@ -1568,7 +1612,7 @@ found: */ static void trie_leaf_remove(struct trie *t, struct leaf *l) { - struct tnode *tp = node_parent((struct node *) l); + struct tnode *tp = node_parent((struct rt_trie_node *) l); pr_debug("entering trie_leaf_remove(%p)\n", l); @@ -1629,7 +1673,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && + fa->fa_info->fib_scope == cfg->fc_scope) && + (!cfg->fc_prefsrc || + fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(cfg, fi) == 0) { @@ -1650,6 +1696,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) list_del_rcu(&fa->fa_list); + if (!plen) + tb->tb_num_default--; + if (list_empty(fa_head)) { hlist_del_rcu(&li->hlist); free_leaf_info(li); @@ -1706,7 +1755,7 @@ static int trie_flush_leaf(struct leaf *l) * Scan for the next right leaf starting at node p->child[idx] * Since we have back pointer, no recursion necessary. */ -static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) +static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) { do { t_key idx; @@ -1722,7 +1771,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) continue; if (IS_LEAF(c)) { - prefetch(p->child[idx]); + prefetch(rcu_dereference_rtnl(p->child[idx])); return (struct leaf *) c; } @@ -1732,7 +1781,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) } /* Node empty, walk back up to parent */ - c = (struct node *) p; + c = (struct rt_trie_node *) p; } while ((p = node_parent_rcu(c)) != NULL); return NULL; /* Root of trie */ @@ -1753,7 +1802,7 @@ static struct leaf *trie_firstleaf(struct trie *t) static struct leaf *trie_nextleaf(struct leaf *l) { - struct node *c = (struct node *) l; + struct rt_trie_node *c = (struct rt_trie_node *) l; struct tnode *p = node_parent_rcu(c); if (!p) @@ -1802,80 +1851,6 @@ void fib_free_table(struct fib_table *tb) kfree(tb); } -void fib_table_select_default(struct fib_table *tb, - const struct flowi *flp, - struct fib_result *res) -{ - struct trie *t = (struct trie *) tb->tb_data; - int order, last_idx; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fib_alias *fa = NULL; - struct list_head *fa_head; - struct leaf *l; - - last_idx = -1; - last_resort = NULL; - order = -1; - - rcu_read_lock(); - - l = fib_find_node(t, 0); - if (!l) - goto out; - - fa_head = get_fa_head(l, 0); - if (!fa_head) - goto out; - - if (list_empty(fa_head)) - goto out; - - list_for_each_entry_rcu(fa, fa_head, fa_list) { - struct fib_info *next_fi = fa->fa_info; - - if (fa->fa_scope != res->scope || - fa->fa_type != RTN_UNICAST) - continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; - - fib_alias_accessed(fa); - - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - fi = next_fi; - order++; - } - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - if (last_idx >= 0) - fib_result_assign(res, last_resort); - tb->tb_default = last_idx; -out: - rcu_read_unlock(); -} - static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) @@ -1900,7 +1875,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, RTM_NEWROUTE, tb->tb_id, fa->fa_type, - fa->fa_scope, xkey, plen, fa->fa_tos, @@ -1990,7 +1964,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, return skb->len; } -void __init fib_hash_init(void) +void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), @@ -2003,8 +1977,7 @@ void __init fib_hash_init(void) } -/* Fix more generic FIB names for init later */ -struct fib_table *fib_hash_table(u32 id) +struct fib_table *fib_trie_table(u32 id) { struct fib_table *tb; struct trie *t; @@ -2016,13 +1989,11 @@ struct fib_table *fib_hash_table(u32 id) tb->tb_id = id; tb->tb_default = -1; + tb->tb_num_default = 0; t = (struct trie *) tb->tb_data; memset(t, 0, sizeof(*t)); - if (id == RT_TABLE_LOCAL) - pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION); - return tb; } @@ -2036,7 +2007,7 @@ struct fib_trie_iter { unsigned int depth; }; -static struct node *fib_trie_get_next(struct fib_trie_iter *iter) +static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) { struct tnode *tn = iter->tnode; unsigned int cindex = iter->index; @@ -2050,7 +2021,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter) iter->tnode, iter->index, iter->depth); rescan: while (cindex < (1<<tn->bits)) { - struct node *n = tnode_get_child_rcu(tn, cindex); + struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); if (n) { if (IS_LEAF(n)) { @@ -2069,7 +2040,7 @@ rescan: } /* Current node exhausted, pop back up */ - p = node_parent_rcu((struct node *)tn); + p = node_parent_rcu((struct rt_trie_node *)tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2081,10 +2052,10 @@ rescan: return NULL; } -static struct node *fib_trie_get_first(struct fib_trie_iter *iter, +static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct node *n; + struct rt_trie_node *n; if (!t) return NULL; @@ -2108,7 +2079,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter, static void trie_collect_stats(struct trie *t, struct trie_stat *s) { - struct node *n; + struct rt_trie_node *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); @@ -2181,7 +2152,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); - bytes += sizeof(struct node *) * pointers; + bytes += sizeof(struct rt_trie_node *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } @@ -2262,7 +2233,7 @@ static const struct file_operations fib_triestat_fops = { .release = single_release_net, }; -static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2275,7 +2246,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) struct fib_table *tb; hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { - struct node *n; + struct rt_trie_node *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); @@ -2304,7 +2275,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; - struct node *n; + struct rt_trie_node *n; ++*pos; /* next node in same table */ @@ -2314,7 +2285,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); - while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { + while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) @@ -2390,7 +2361,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; - struct node *n = v; + struct rt_trie_node *n = v; if (!node_parent_rcu(n)) fib_table_print(seq, iter->tb); @@ -2422,7 +2393,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) seq_indent(seq, iter->depth+1); seq_printf(seq, " /%d %s %s", li->plen, rtn_scope(buf1, sizeof(buf1), - fa->fa_scope), + fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_tos) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4aa1b7f..5395e45 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -83,6 +83,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/raw.h> +#include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> @@ -108,8 +109,7 @@ struct icmp_bxm { __be32 times[3]; } data; int head_len; - struct ip_options replyopts; - unsigned char optbuf[40]; + struct ip_options_data replyopts; }; /* An array of errno for error messages from dest unreach. */ @@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk) * Send an ICMP frame. */ -/* - * Check transmit rate limitation for given message. - * The rate information is held in the destination cache now. - * This function is generic and could be used for other purposes - * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. - * - * Note that the same dst_entry fields are modified by functions in - * route.c too, but these work for packet destinations while xrlim_allow - * works for icmp destinations. This means the rate limiting information - * for one "ip object" is shared - and these ICMPs are twice limited: - * by source and by destination. - * - * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate - * SHOULD allow setting of rate limits - * - * Shared between ICMPv4 and ICMPv6. - */ -#define XRLIM_BURST_FACTOR 6 -int xrlim_allow(struct dst_entry *dst, int timeout) -{ - unsigned long now, token = dst->rate_tokens; - int rc = 0; - - now = jiffies; - token += now - dst->rate_last; - dst->rate_last = now; - if (token > XRLIM_BURST_FACTOR * timeout) - token = XRLIM_BURST_FACTOR * timeout; - if (token >= timeout) { - token -= timeout; - rc = 1; - } - dst->rate_tokens = token; - return rc; -} -EXPORT_SYMBOL(xrlim_allow); - -static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - int type, int code) +static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; - int rc = 1; + bool rc = true; if (type > NR_ICMP_TYPES) goto out; @@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) - rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); + if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { + if (!rt->peer) + rt_bind_peer(rt, fl4->daddr, 1); + rc = inet_peer_xrlim_allow(rt->peer, + net->ipv4.sysctl_icmp_ratelimit); + } out: return rc; } @@ -324,13 +291,14 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, } static void icmp_push_reply(struct icmp_bxm *icmp_param, + struct flowi4 *fl4, struct ipcm_cookie *ipc, struct rtable **rt) { struct sock *sk; struct sk_buff *skb; sk = icmp_sk(dev_net((*rt)->dst.dev)); - if (ip_append_data(sk, icmp_glue_bits, icmp_param, + if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) { @@ -349,7 +317,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, icmp_param->head_len, csum); icmph->checksum = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; - ip_push_pending_frames(sk); + ip_push_pending_frames(sk, fl4); } } @@ -362,11 +330,12 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; __be32 daddr; - if (ip_options_echo(&icmp_param->replyopts, skb)) + if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; sk = icmp_xmit_lock(net); @@ -377,31 +346,120 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; inet->tos = ip_hdr(skb)->tos; - daddr = ipc.addr = rt->rt_src; + daddr = ipc.addr = ip_hdr(skb)->saddr; ipc.opt = NULL; ipc.tx_flags = 0; - if (icmp_param->replyopts.optlen) { - ipc.opt = &icmp_param->replyopts; - if (ipc.opt->srr) - daddr = icmp_param->replyopts.faddr; + if (icmp_param->replyopts.opt.opt.optlen) { + ipc.opt = &icmp_param->replyopts.opt; + if (ipc.opt->opt.srr) + daddr = icmp_param->replyopts.opt.opt.faddr; } - { - struct flowi fl = { .fl4_dst= daddr, - .fl4_src = rt->rt_spec_dst, - .fl4_tos = RT_TOS(ip_hdr(skb)->tos), - .proto = IPPROTO_ICMP }; - security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(net, &rt, &fl)) - goto out_unlock; - } - if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.saddr = rt->rt_spec_dst; + fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_proto = IPPROTO_ICMP; + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + goto out_unlock; + if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, icmp_param->data.icmph.code)) - icmp_push_reply(icmp_param, &ipc, &rt); + icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); } +static struct rtable *icmp_route_lookup(struct net *net, + struct flowi4 *fl4, + struct sk_buff *skb_in, + const struct iphdr *iph, + __be32 saddr, u8 tos, + int type, int code, + struct icmp_bxm *param) +{ + struct rtable *rt, *rt2; + int err; + + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = (param->replyopts.opt.opt.srr ? + param->replyopts.opt.opt.faddr : iph->saddr); + fl4->saddr = saddr; + fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_proto = IPPROTO_ICMP; + fl4->fl4_icmp_type = type; + fl4->fl4_icmp_code = code; + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); + rt = __ip_route_output_key(net, fl4); + if (IS_ERR(rt)) + return rt; + + /* No need to clone since we're just using its address. */ + rt2 = rt; + + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(fl4), NULL, 0); + if (!IS_ERR(rt)) { + if (rt != rt2) + return rt; + } else if (PTR_ERR(rt) == -EPERM) { + rt = NULL; + } else + return rt; + + err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(fl4), AF_INET); + if (err) + goto relookup_failed; + + if (inet_addr_type(net, fl4->saddr) == RTN_LOCAL) { + rt2 = __ip_route_output_key(net, fl4); + if (IS_ERR(rt2)) + err = PTR_ERR(rt2); + } else { + struct flowi4 fl4_2 = {}; + unsigned long orefdst; + + fl4_2.daddr = fl4->saddr; + rt2 = ip_route_output_key(net, &fl4_2); + if (IS_ERR(rt2)) { + err = PTR_ERR(rt2); + goto relookup_failed; + } + /* Ugh! */ + orefdst = skb_in->_skb_refdst; /* save old refdst */ + err = ip_route_input(skb_in, fl4->daddr, fl4->saddr, + RT_TOS(tos), rt2->dst.dev); + + dst_release(&rt2->dst); + rt2 = skb_rtable(skb_in); + skb_in->_skb_refdst = orefdst; /* restore old refdst */ + } + + if (err) + goto relookup_failed; + + rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, + flowi4_to_flowi(fl4), NULL, + XFRM_LOOKUP_ICMP); + if (!IS_ERR(rt2)) { + dst_release(&rt->dst); + rt = rt2; + } else if (PTR_ERR(rt2) == -EPERM) { + if (rt) + dst_release(&rt->dst); + return rt2; + } else { + err = PTR_ERR(rt2); + goto relookup_failed; + } + return rt; + +relookup_failed: + if (rt) + return rt; + return ERR_PTR(err); +} /* * Send an ICMP message in response to a situation @@ -421,6 +479,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; + struct flowi4 fl4; __be32 saddr; u8 tos; struct net *net; @@ -507,7 +566,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) rcu_read_lock(); if (rt_is_input_route(rt) && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index_rcu(net, rt->fl.iif); + dev = dev_get_by_index_rcu(net, rt->rt_iif); if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); @@ -520,7 +579,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) IPTOS_PREC_INTERNETCONTROL) : iph->tos; - if (ip_options_echo(&icmp_param.replyopts, skb_in)) + if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) goto out_unlock; @@ -536,90 +595,15 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; - ipc.opt = &icmp_param.replyopts; + ipc.opt = &icmp_param.replyopts.opt; ipc.tx_flags = 0; - { - struct flowi fl = { - .fl4_dst = icmp_param.replyopts.srr ? - icmp_param.replyopts.faddr : iph->saddr, - .fl4_src = saddr, - .fl4_tos = RT_TOS(tos), - .proto = IPPROTO_ICMP, - .fl_icmp_type = type, - .fl_icmp_code = code, - }; - int err; - struct rtable *rt2; - - security_skb_classify_flow(skb_in, &fl); - if (__ip_route_output_key(net, &rt, &fl)) - goto out_unlock; - - /* No need to clone since we're just using its address. */ - rt2 = rt; - - if (!fl.nl_u.ip4_u.saddr) - fl.nl_u.ip4_u.saddr = rt->rt_src; - - err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); - switch (err) { - case 0: - if (rt != rt2) - goto route_done; - break; - case -EPERM: - rt = NULL; - break; - default: - goto out_unlock; - } - - if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) - goto relookup_failed; - - if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) - err = __ip_route_output_key(net, &rt2, &fl); - else { - struct flowi fl2 = {}; - unsigned long orefdst; - - fl2.fl4_dst = fl.fl4_src; - if (ip_route_output_key(net, &rt2, &fl2)) - goto relookup_failed; - - /* Ugh! */ - orefdst = skb_in->_skb_refdst; /* save old refdst */ - err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, - RT_TOS(tos), rt2->dst.dev); - - dst_release(&rt2->dst); - rt2 = skb_rtable(skb_in); - skb_in->_skb_refdst = orefdst; /* restore old refdst */ - } - - if (err) - goto relookup_failed; - - err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL, - XFRM_LOOKUP_ICMP); - switch (err) { - case 0: - dst_release(&rt->dst); - rt = rt2; - break; - case -EPERM: - goto ende; - default: -relookup_failed: - if (!rt) - goto out_unlock; - break; - } - } + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, + type, code, &icmp_param); + if (IS_ERR(rt)) + goto out_unlock; -route_done: - if (!icmpv4_xrlim_allow(net, rt, type, code)) + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. */ @@ -627,7 +611,7 @@ route_done: room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; + room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); icmp_param.data_len = skb_in->len - icmp_param.offset; @@ -635,7 +619,7 @@ route_done: icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); - icmp_push_reply(&icmp_param, &ipc, &rt); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: @@ -651,7 +635,7 @@ EXPORT_SYMBOL(icmp_send); static void icmp_unreach(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; struct icmphdr *icmph; int hash, protocol; const struct net_protocol *ipprot; @@ -670,7 +654,7 @@ static void icmp_unreach(struct sk_buff *skb) goto out_err; icmph = icmp_hdr(skb); - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; @@ -718,7 +702,7 @@ static void icmp_unreach(struct sk_buff *skb) */ /* - * Check the other end isnt violating RFC 1122. Some routers send + * Check the other end isn't violating RFC 1122. Some routers send * bogus responses to broadcast frames. If you see this message * first check your netmask matches at both ends, if it does then * get the other vendor to fix their kit. @@ -743,7 +727,7 @@ static void icmp_unreach(struct sk_buff *skb) if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) goto out; - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; protocol = iph->protocol; /* @@ -772,7 +756,7 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; if (skb->len < sizeof(struct iphdr)) goto out_err; @@ -783,7 +767,7 @@ static void icmp_redirect(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: @@ -798,6 +782,15 @@ static void icmp_redirect(struct sk_buff *skb) iph->saddr, skb->dev); break; } + + /* Ping wants to see redirects. + * Let's pretend they are errors of sorts... */ + if (iph->protocol == IPPROTO_ICMP && + iph->ihl >= 5 && + pskb_may_pull(skb, (iph->ihl<<2)+8)) { + ping_err(skb, icmp_hdr(skb)->un.gateway); + } + out: return; out_err: @@ -947,12 +940,12 @@ static void icmp_address_reply(struct sk_buff *skb) BUG_ON(mp == NULL); for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { if (*mp == ifa->ifa_mask && - inet_ifa_match(rt->rt_src, ifa)) + inet_ifa_match(ip_hdr(skb)->saddr, ifa)) break; } if (!ifa && net_ratelimit()) { printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", - mp, dev->name, &rt->rt_src); + mp, dev->name, &ip_hdr(skb)->saddr); } } } @@ -1058,7 +1051,7 @@ error: */ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { - .handler = icmp_discard, + .handler = ping_rcv, }, [1] = { .handler = icmp_discard, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e0e77e2..f1d27f6 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -149,17 +149,11 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc); static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta); - -static void ip_mc_list_reclaim(struct rcu_head *head) -{ - kfree(container_of(head, struct ip_mc_list, rcu)); -} - static void ip_ma_put(struct ip_mc_list *im) { if (atomic_dec_and_test(&im->refcnt)) { in_dev_put(im->interface); - call_rcu(&im->rcu, ip_mc_list_reclaim); + kfree_rcu(im, rcu); } } @@ -309,6 +303,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); + struct flowi4 fl4; while (1) { skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), @@ -321,18 +316,11 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) } igmp_skb_size(skb) = size; - { - struct flowi fl = { .oif = dev->ifindex, - .fl4_dst = IGMPV3_ALL_MCR, - .proto = IPPROTO_IGMP }; - if (ip_route_output_key(net, &rt, &fl)) { - kfree_skb(skb); - return NULL; - } - } - if (rt->rt_src == 0) { + rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, + 0, 0, + IPPROTO_IGMP, 0, dev->ifindex); + if (IS_ERR(rt)) { kfree_skb(skb); - ip_rt_put(rt); return NULL; } @@ -350,8 +338,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->tos = 0xc0; pip->frag_off = htons(IP_DF); pip->ttl = 1; - pip->daddr = rt->rt_dst; - pip->saddr = rt->rt_src; + pip->daddr = fl4.daddr; + pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(pip, &rt->dst, NULL); @@ -657,6 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, struct net_device *dev = in_dev->dev; struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; + struct flowi4 fl4; __be32 dst; if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) @@ -666,17 +655,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, else dst = group; - { - struct flowi fl = { .oif = dev->ifindex, - .fl4_dst = dst, - .proto = IPPROTO_IGMP }; - if (ip_route_output_key(net, &rt, &fl)) - return -1; - } - if (rt->rt_src == 0) { - ip_rt_put(rt); + rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, + 0, 0, + IPPROTO_IGMP, 0, dev->ifindex); + if (IS_ERR(rt)) return -1; - } skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); if (skb == NULL) { @@ -698,7 +681,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->daddr = dst; - iph->saddr = rt->rt_src; + iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(iph, &rt->dst, NULL); ((u8*)&iph[1])[0] = IPOPT_RA; @@ -1172,20 +1155,18 @@ static void igmp_group_dropped(struct ip_mc_list *im) if (!in_dev->dead) { if (IGMP_V1_SEEN(in_dev)) - goto done; + return; if (IGMP_V2_SEEN(in_dev)) { if (reporter) igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); - goto done; + return; } /* IGMPv3 */ igmpv3_add_delrec(in_dev, im); igmp_ifc_event(in_dev); } -done: #endif - ip_mc_clear_src(im); } static void igmp_group_added(struct ip_mc_list *im) @@ -1322,6 +1303,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) *ip = i->next_rcu; in_dev->mc_count--; igmp_group_dropped(i); + ip_mc_clear_src(i); if (!in_dev->dead) ip_rt_multicast_event(in_dev); @@ -1431,7 +1413,8 @@ void ip_mc_destroy_dev(struct in_device *in_dev) in_dev->mc_list = i->next_rcu; in_dev->mc_count--; - igmp_group_dropped(i); + /* We've dropped the groups in ip_mc_down already */ + ip_mc_clear_src(i); ip_ma_put(i); } } @@ -1439,8 +1422,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev) /* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { - struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr }; - struct rtable *rt; struct net_device *dev = NULL; struct in_device *idev = NULL; @@ -1454,9 +1435,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) return NULL; } - if (!dev && !ip_route_output_key(net, &rt, &fl)) { - dev = rt->dst.dev; - ip_rt_put(rt); + if (!dev) { + struct rtable *rt = ip_route_output(net, + imr->imr_multiaddr.s_addr, + 0, 0, 0); + if (!IS_ERR(rt)) { + dev = rt->dst.dev; + ip_rt_put(rt); + } } if (dev) { imr->imr_ifindex = dev->ifindex; @@ -1836,12 +1822,6 @@ done: } EXPORT_SYMBOL(ip_mc_join_group); -static void ip_sf_socklist_reclaim(struct rcu_head *rp) -{ - kfree(container_of(rp, struct ip_sf_socklist, rcu)); - /* sk_omem_alloc should have been decreased by the caller*/ -} - static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { @@ -1858,18 +1838,10 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, rcu_assign_pointer(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); - call_rcu(&psf->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psf, rcu); return err; } - -static void ip_mc_socklist_reclaim(struct rcu_head *rp) -{ - kfree(container_of(rp, struct ip_mc_socklist, rcu)); - /* sk_omem_alloc should have been decreased by the caller*/ -} - - /* * Ask a socket to leave a group. */ @@ -1909,7 +1881,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) rtnl_unlock(); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); - call_rcu(&iml->rcu, ip_mc_socklist_reclaim); + kfree_rcu(iml, rcu); return 0; } if (!in_dev) @@ -2026,7 +1998,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); - call_rcu(&psl->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psl, rcu); } rcu_assign_pointer(pmc->sflist, newpsl); psl = newpsl; @@ -2127,7 +2099,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); - call_rcu(&psl->rcu, ip_sf_socklist_reclaim); + kfree_rcu(psl, rcu); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); @@ -2324,18 +2296,18 @@ void ip_mc_drop_socket(struct sock *sk) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); - call_rcu(&iml->rcu, ip_mc_socklist_reclaim); + kfree_rcu(iml, rcu); } rtnl_unlock(); } -int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +/* called with rcu_read_lock() */ +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) { struct ip_mc_list *im; struct ip_sf_list *psf; int rv = 0; - rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == mc_addr) break; @@ -2357,7 +2329,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p } else rv = 1; /* unspecified source; tentatively allow */ } - rcu_read_unlock(); return rv; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 97e5fb7..c14d88a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * This struct holds the first and last local port number. */ struct local_ports sysctl_local_ports __read_mostly = { - .lock = SEQLOCK_UNLOCKED, + .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), .range = { 32768, 61000 }, }; @@ -73,7 +73,7 @@ int inet_csk_bind_conflict(const struct sock *sk, !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { if (!reuse || !sk2->sk_reuse || - ((1 << sk2->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))) { + sk2->sk_state == TCP_LISTEN) { const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || sk2_rcv_saddr == sk_rcv_saddr(sk)) @@ -122,8 +122,7 @@ again: (tb->num_owners < smallest_size || smallest_size == -1)) { smallest_size = tb->num_owners; smallest_rover = rover; - if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && - !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { spin_unlock(&head->lock); snum = smallest_rover; goto have_snum; @@ -351,27 +350,24 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(struct sock *sk, + struct flowi4 *fl4, const struct request_sock *req) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options *opt = inet_rsk(req)->opt; - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .fl4_src = ireq->loc_addr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet_sk(sk)->inet_sport, - .fl_ip_dport = ireq->rmt_port }; + struct ip_options_rcu *opt = inet_rsk(req)->opt; struct net *net = sock_net(sk); - security_req_classify_flow(req, &fl); - if (ip_route_output_flow(net, &rt, &fl, sk, 0)) + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); + security_req_classify_flow(req, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) goto no_route; - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) goto route_err; return &rt->dst; @@ -383,6 +379,39 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_req); +struct dst_entry *inet_csk_route_child_sock(struct sock *sk, + struct sock *newsk, + const struct request_sock *req) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_sock *newinet = inet_sk(newsk); + struct ip_options_rcu *opt = ireq->opt; + struct net *net = sock_net(sk); + struct flowi4 *fl4; + struct rtable *rt; + + fl4 = &newinet->cork.fl.u.ip4; + flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), + (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); + security_req_classify_flow(req, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) + goto no_route; + if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) + goto route_err; + return &rt->dst; + +route_err: + ip_rt_put(rt); +no_route: + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; +} +EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); + static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd, const u32 synq_hsize) { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 2ada171..6ffe94c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -124,7 +124,7 @@ static int inet_csk_diag_fill(struct sock *sk, #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, &np->rcv_saddr); diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 47038cb..85a0f75 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -51,8 +51,8 @@ MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); * Basic tcp checks whether packet is suitable for LRO */ -static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, - int len, struct net_lro_desc *lro_desc) +static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, + int len, const struct net_lro_desc *lro_desc) { /* check ip header: don't aggregate padded frames */ if (ntohs(iph->tot_len) != len) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c5af909..3c8dfa1 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -505,7 +505,9 @@ restart: } rcu_read_unlock(); + local_bh_disable(); inet_twsk_deschedule(tw, twdr); + local_bh_enable(); inet_twsk_put(tw); goto restart_rcu; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index a96e656..9df4e63 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = { struct inet_peer_base { struct inet_peer __rcu *root; - spinlock_t lock; + seqlock_t lock; int total; }; static struct inet_peer_base v4_peers = { .root = peer_avl_empty_rcu, - .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock), + .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), .total = 0, }; static struct inet_peer_base v6_peers = { .root = peer_avl_empty_rcu, - .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock), + .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), .total = 0, }; @@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a, int i, n = (a->family == AF_INET ? 1 : 4); for (i = 0; i < n; i++) { - if (a->a6[i] == b->a6[i]) + if (a->addr.a6[i] == b->addr.a6[i]) continue; - if (a->a6[i] < b->a6[i]) + if (a->addr.a6[i] < b->addr.a6[i]) return -1; return 1; } @@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a, return 0; } +#define rcu_deref_locked(X, BASE) \ + rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) + /* * Called with local BH disabled and the pool lock held. */ @@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a, \ stackptr = _stack; \ *stackptr++ = &_base->root; \ - for (u = rcu_dereference_protected(_base->root, \ - lockdep_is_held(&_base->lock)); \ + for (u = rcu_deref_locked(_base->root, _base); \ u != peer_avl_empty; ) { \ int cmp = addr_compare(_daddr, &u->daddr); \ if (cmp == 0) \ @@ -198,23 +200,22 @@ static int addr_compare(const struct inetpeer_addr *a, else \ v = &u->avl_right; \ *stackptr++ = v; \ - u = rcu_dereference_protected(*v, \ - lockdep_is_held(&_base->lock)); \ + u = rcu_deref_locked(*v, _base); \ } \ u; \ }) /* - * Called with rcu_read_lock_bh() + * Called with rcu_read_lock() * Because we hold no lock against a writer, its quite possible we fall * in an endless loop. * But every pointer we follow is guaranteed to be valid thanks to RCU. * We exit from this function if number of links exceeds PEER_MAXDEPTH */ -static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, - struct inet_peer_base *base) +static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, + struct inet_peer_base *base) { - struct inet_peer *u = rcu_dereference_bh(base->root); + struct inet_peer *u = rcu_dereference(base->root); int count = 0; while (u != peer_avl_empty) { @@ -230,9 +231,9 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, return u; } if (cmp == -1) - u = rcu_dereference_bh(u->avl_left); + u = rcu_dereference(u->avl_left); else - u = rcu_dereference_bh(u->avl_right); + u = rcu_dereference(u->avl_right); if (unlikely(++count == PEER_MAXDEPTH)) break; } @@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, struct inet_peer __rcu **v; \ *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ - for (u = rcu_dereference_protected(*v, \ - lockdep_is_held(&base->lock)); \ + for (u = rcu_deref_locked(*v, base); \ u->avl_right != peer_avl_empty_rcu; ) { \ v = &u->avl_right; \ *stackptr++ = v; \ - u = rcu_dereference_protected(*v, \ - lockdep_is_held(&base->lock)); \ + u = rcu_deref_locked(*v, base); \ } \ u; \ }) @@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], while (stackend > stack) { nodep = *--stackend; - node = rcu_dereference_protected(*nodep, - lockdep_is_held(&base->lock)); - l = rcu_dereference_protected(node->avl_left, - lockdep_is_held(&base->lock)); - r = rcu_dereference_protected(node->avl_right, - lockdep_is_held(&base->lock)); + node = rcu_deref_locked(*nodep, base); + l = rcu_deref_locked(node->avl_left, base); + r = rcu_deref_locked(node->avl_right, base); lh = node_height(l); rh = node_height(r); if (lh > rh + 1) { /* l: RH+2 */ struct inet_peer *ll, *lr, *lrl, *lrr; int lrh; - ll = rcu_dereference_protected(l->avl_left, - lockdep_is_held(&base->lock)); - lr = rcu_dereference_protected(l->avl_right, - lockdep_is_held(&base->lock)); + ll = rcu_deref_locked(l->avl_left, base); + lr = rcu_deref_locked(l->avl_right, base); lrh = node_height(lr); if (lrh <= node_height(ll)) { /* ll: RH+1 */ RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ @@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], l->avl_height = node->avl_height + 1; RCU_INIT_POINTER(*nodep, l); } else { /* ll: RH, lr: RH+1 */ - lrl = rcu_dereference_protected(lr->avl_left, - lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */ - lrr = rcu_dereference_protected(lr->avl_right, - lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */ + lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ + lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = rh + 1; /* node: RH+1 */ @@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], } else if (rh > lh + 1) { /* r: LH+2 */ struct inet_peer *rr, *rl, *rlr, *rll; int rlh; - rr = rcu_dereference_protected(r->avl_right, - lockdep_is_held(&base->lock)); - rl = rcu_dereference_protected(r->avl_left, - lockdep_is_held(&base->lock)); + rr = rcu_deref_locked(r->avl_right, base); + rl = rcu_deref_locked(r->avl_left, base); rlh = node_height(rl); if (rlh <= node_height(rr)) { /* rr: LH+1 */ RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ @@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], r->avl_height = node->avl_height + 1; RCU_INIT_POINTER(*nodep, r); } else { /* rr: RH, rl: RH+1 */ - rlr = rcu_dereference_protected(rl->avl_right, - lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */ - rll = rcu_dereference_protected(rl->avl_left, - lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */ + rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ + rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = lh + 1; /* node: LH+1 */ @@ -366,13 +354,14 @@ static void inetpeer_free_rcu(struct rcu_head *head) } /* May be called with local BH enabled. */ -static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) +static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, + struct inet_peer __rcu **stack[PEER_MAXDEPTH]) { int do_free; do_free = 0; - spin_lock_bh(&base->lock); + write_seqlock_bh(&base->lock); /* Check the reference counter. It was artificially incremented by 1 * in cleanup() function to prevent sudden disappearing. If we can * atomically (because of lockless readers) take this last reference, @@ -380,7 +369,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) * We use refcnt=-1 to alert lockless readers this entry is deleted. */ if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { - struct inet_peer __rcu **stack[PEER_MAXDEPTH]; struct inet_peer __rcu ***stackptr, ***delp; if (lookup(&p->daddr, stack, base) != p) BUG(); @@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) /* look for a node to insert instead of p */ struct inet_peer *t; t = lookup_rightempty(p, base); - BUG_ON(rcu_dereference_protected(*stackptr[-1], - lockdep_is_held(&base->lock)) != t); + BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); **--stackptr = t->avl_left; /* t is removed, t->daddr > x->daddr for any * x in p->avl_left subtree. @@ -409,10 +396,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) base->total--; do_free = 1; } - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); if (do_free) - call_rcu_bh(&p->rcu, inetpeer_free_rcu); + call_rcu(&p->rcu, inetpeer_free_rcu); else /* The node is used again. Decrease the reference counter * back. The loop "cleanup -> unlink_from_unused @@ -435,7 +422,7 @@ static struct inet_peer_base *peer_to_base(struct inet_peer *p) } /* May be called with local BH enabled. */ -static int cleanup_once(unsigned long ttl) +static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH]) { struct inet_peer *p = NULL; @@ -467,7 +454,7 @@ static int cleanup_once(unsigned long ttl) * happen because of entry limits in route cache. */ return -1; - unlink_from_pool(p, peer_to_base(p)); + unlink_from_pool(p, peer_to_base(p), stack); return 0; } @@ -477,13 +464,17 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; + unsigned int sequence; + int invalidated; /* Look up for the address quickly, lockless. * Because of a concurrent writer, we might not find an existing entry. */ - rcu_read_lock_bh(); - p = lookup_rcu_bh(daddr, base); - rcu_read_unlock_bh(); + rcu_read_lock(); + sequence = read_seqbegin(&base->lock); + p = lookup_rcu(daddr, base); + invalidated = read_seqretry(&base->lock, sequence); + rcu_read_unlock(); if (p) { /* The existing node has been found. @@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) return p; } + /* If no writer did a change during our lookup, we can return early. */ + if (!create && !invalidated) + return NULL; + /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ - spin_lock_bh(&base->lock); + write_seqlock_bh(&base->lock); p = lookup(daddr, stack, base); if (p != peer_avl_empty) { atomic_inc(&p->refcnt); - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); /* Remove the entry from unused list if it was there. */ unlink_from_unused(p); return p; @@ -510,8 +505,14 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); + atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4)); p->tcp_ts_stamp = 0; + p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; + p->rate_tokens = 0; + p->rate_last = 0; + p->pmtu_expires = 0; + p->pmtu_orig = 0; + memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); INIT_LIST_HEAD(&p->unused); @@ -519,11 +520,11 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) link_to_pool(p, base); base->total++; } - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); if (base->total >= inet_peer_threshold) /* Remove one less-recently-used entry. */ - cleanup_once(0); + cleanup_once(0, stack); return p; } @@ -539,6 +540,7 @@ static void peer_check_expire(unsigned long dummy) { unsigned long now = jiffies; int ttl, total; + struct inet_peer __rcu **stack[PEER_MAXDEPTH]; total = compute_total(); if (total >= inet_peer_threshold) @@ -547,7 +549,7 @@ static void peer_check_expire(unsigned long dummy) ttl = inet_peer_maxttl - (inet_peer_maxttl - inet_peer_minttl) / HZ * total / inet_peer_threshold * HZ; - while (!cleanup_once(ttl)) { + while (!cleanup_once(ttl, stack)) { if (jiffies != now) break; } @@ -579,3 +581,44 @@ void inet_putpeer(struct inet_peer *p) local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_putpeer); + +/* + * Check transmit rate limitation for given message. + * The rate information is held in the inet_peer entries now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + * Note that the same inet_peer fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared - and these ICMPs are twice limited: + * by source and by destination. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) +{ + unsigned long now, token; + bool rc = false; + + if (!peer) + return true; + + token = peer->rate_tokens; + now = jiffies; + token += now - peer->rate_last; + peer->rate_last = now; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + if (token >= timeout) { + token -= timeout; + rc = true; + } + peer->rate_tokens = token; + return rc; +} +EXPORT_SYMBOL(inet_peer_xrlim_allow); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 99461f0..3b34d1c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -84,7 +84,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a1151b8..0ad6035 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -77,22 +77,40 @@ struct ipq { struct inet_peer *peer; }; -#define IPFRAG_ECN_CLEAR 0x01 /* one frag had INET_ECN_NOT_ECT */ -#define IPFRAG_ECN_SET_CE 0x04 /* one frag had INET_ECN_CE */ +/* RFC 3168 support : + * We want to check ECN values of all fragments, do detect invalid combinations. + * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. + */ +#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ +#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ +#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ +#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ static inline u8 ip4_frag_ecn(u8 tos) { - tos = (tos & INET_ECN_MASK) + 1; - /* - * After the last operation we have (in binary): - * INET_ECN_NOT_ECT => 001 - * INET_ECN_ECT_1 => 010 - * INET_ECN_ECT_0 => 011 - * INET_ECN_CE => 100 - */ - return (tos & 2) ? 0 : tos; + return 1 << (tos & INET_ECN_MASK); } +/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements + * Value : 0xff if frame should be dropped. + * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field + */ +static const u8 ip4_frag_ecn_table[16] = { + /* at least one fragment had CE, and others ECT_0 or ECT_1 */ + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, + + /* invalid combinations : drop frame */ + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, + [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, +}; + static struct inet_frags ip4_frags; int ip_frag_nqueues(struct net *net) @@ -223,31 +241,30 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; + const struct iphdr *iph; + int err; rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out_rcu_unlock; + /* skb dst is stale, drop it, and perform route lookup again */ + skb_dst_drop(head); + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (err) + goto out_rcu_unlock; + /* - * Only search router table for the head fragment, - * when defraging timeout at PRE_ROUTING HOOK. + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { - const struct iphdr *iph = ip_hdr(head); - int err = ip_route_input(head, iph->daddr, iph->saddr, - iph->tos, head->dev); - if (unlikely(err)) - goto out_rcu_unlock; - - /* - * Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (skb_rtable(head)->rt_type != RTN_LOCAL) - goto out_rcu_unlock; + if (qp->user == IP_DEFRAG_CONNTRACK_IN && + skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; - } /* Send an ICMP "Fragment Reassembly Timeout" message. */ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); @@ -525,9 +542,15 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; + u8 ecn; ipq_kill(qp); + ecn = ip4_frag_ecn_table[qp->ecn]; + if (unlikely(ecn == 0xff)) { + err = -EINVAL; + goto out_fail; + } /* Make the one we just received the head. */ if (prev) { head = prev->next; @@ -606,17 +629,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); - /* RFC3168 5.3 Fragmentation support - * If one fragment had INET_ECN_NOT_ECT, - * reassembled frame also has INET_ECN_NOT_ECT - * Elif one fragment had INET_ECN_CE - * reassembled frame also has INET_ECN_CE - */ - if (qp->ecn & IPFRAG_ECN_CLEAR) - iph->tos &= ~INET_ECN_MASK; - else if (qp->ecn & IPFRAG_ECN_SET_CE) - iph->tos |= INET_ECN_CE; - + iph->tos |= ecn; IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index eb68a0e..8871067 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -413,11 +413,6 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, dev_net_set(dev, net); - if (strchr(name, '%')) { - if (dev_alloc_name(dev, name) < 0) - goto failed_free; - } - nt = netdev_priv(dev); nt->parms = *parms; dev->rtnl_link_ops = &ipgre_link_ops; @@ -462,7 +457,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) by themself??? */ - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); int grehlen = (iph->ihl<<2) + 4; const int type = icmp_hdr(skb)->type; @@ -534,7 +529,7 @@ out: rcu_read_unlock(); } -static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) { if (INET_ECN_is_ce(iph->tos)) { if (skb->protocol == htons(ETH_P_IP)) { @@ -546,19 +541,19 @@ static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) } static inline u8 -ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) +ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb) { u8 inner = 0; if (skb->protocol == htons(ETH_P_IP)) inner = old_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) - inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); + inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); return INET_ECN_encapsulate(tos, inner); } static int ipgre_rcv(struct sk_buff *skb) { - struct iphdr *iph; + const struct iphdr *iph; u8 *h; __be16 flags; __sum16 csum = 0; @@ -697,8 +692,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev { struct ip_tunnel *tunnel = netdev_priv(dev); struct pcpu_tstats *tstats; - struct iphdr *old_iph = ip_hdr(skb); - struct iphdr *tiph; + const struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *tiph; + struct flowi4 fl4; u8 tos; __be16 df; struct rtable *rt; /* Route to the other host */ @@ -714,7 +710,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (dev->header_ops && dev->type == ARPHRD_IPGRE) { gre_hlen = 0; - tiph = (struct iphdr *)skb->data; + tiph = (const struct iphdr *)skb->data; } else { gre_hlen = tunnel->hlen; tiph = &tunnel->parms.iph; @@ -735,14 +731,14 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) { - struct in6_addr *addr6; + const struct in6_addr *addr6; int addr_type; struct neighbour *neigh = skb_dst(skb)->neighbour; if (neigh == NULL) goto tx_error; - addr6 = (struct in6_addr *)&neigh->primary_key; + addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { @@ -766,21 +762,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (skb->protocol == htons(ETH_P_IP)) tos = old_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) - tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); + tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph); } - { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = dst, - .fl4_src = tiph->saddr, - .fl4_tos = RT_TOS(tos), - .fl_gre_key = tunnel->parms.o_key - }; - if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - dev->stats.tx_carrier_errors++; - goto tx_error; - } + rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr, + tunnel->parms.o_key, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; } tdev = rt->dst.dev; @@ -879,15 +869,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev iph->frag_off = df; iph->protocol = IPPROTO_GRE; iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; if ((iph->ttl = tiph->ttl) == 0) { if (skb->protocol == htons(ETH_P_IP)) iph->ttl = old_iph->ttl; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) - iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; + iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; #endif else iph->ttl = ip4_dst_hoplimit(&rt->dst); @@ -933,7 +923,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; + const struct iphdr *iph; int hlen = LL_MAX_HEADER; int mtu = ETH_DATA_LEN; int addend = sizeof(struct iphdr) + 4; @@ -944,17 +934,15 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = iph->daddr, - .fl4_src = iph->saddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_GRE, - .fl_gre_key = tunnel->parms.o_key - }; + struct flowi4 fl4; struct rtable *rt; - if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { + rt = ip_route_output_gre(dev_net(dev), &fl4, + iph->daddr, iph->saddr, + tunnel->parms.o_key, + RT_TOS(iph->tos), + tunnel->parms.link); + if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } @@ -1190,7 +1178,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) { - struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); + const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb); memcpy(haddr, &iph->saddr, 4); return 4; } @@ -1206,17 +1194,16 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct flowi fl = { - .oif = t->parms.link, - .fl4_dst = t->parms.iph.daddr, - .fl4_src = t->parms.iph.saddr, - .fl4_tos = RT_TOS(t->parms.iph.tos), - .proto = IPPROTO_GRE, - .fl_gre_key = t->parms.o_key - }; + struct flowi4 fl4; struct rtable *rt; - if (ip_route_output_key(dev_net(dev), &rt, &fl)) + rt = ip_route_output_gre(dev_net(dev), &fl4, + t->parms.iph.daddr, + t->parms.iph.saddr, + t->parms.o_key, + RT_TOS(t->parms.iph.tos), + t->parms.link); + if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; ip_rt_put(rt); @@ -1764,4 +1751,4 @@ module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); -MODULE_ALIAS("gre0"); +MODULE_ALIAS_NETDEV("gre0"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d859bcc..c8f48ef 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -268,7 +268,7 @@ int ip_local_deliver(struct sk_buff *skb) static inline int ip_rcv_options(struct sk_buff *skb) { struct ip_options *opt; - struct iphdr *iph; + const struct iphdr *iph; struct net_device *dev = skb->dev; /* It looks as overkill, because not all @@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb) } } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; @@ -374,7 +374,7 @@ drop: */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct iphdr *iph; + const struct iphdr *iph; u32 len; /* When the interface is in promisc. mode, drop all the crap diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1906fa3..c3118e1 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -36,8 +36,8 @@ * saddr is address of outgoing interface. */ -void ip_options_build(struct sk_buff * skb, struct ip_options * opt, - __be32 daddr, struct rtable *rt, int is_frag) +void ip_options_build(struct sk_buff *skb, struct ip_options *opt, + __be32 daddr, struct rtable *rt, int is_frag) { unsigned char *iph = skb_network_header(skb); @@ -50,9 +50,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, if (!is_frag) { if (opt->rr_needaddr) - ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); + ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt); if (opt->ts_needaddr) - ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); + ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); if (opt->ts_needtime) { struct timespec tv; __be32 midtime; @@ -83,9 +83,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, * NOTE: dopt cannot point to skb. */ -int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) +int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) { - struct ip_options *sopt; + const struct ip_options *sopt; unsigned char *sptr, *dptr; int soffset, doffset; int optlen; @@ -95,10 +95,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) sopt = &(IPCB(skb)->opt); - if (sopt->optlen == 0) { - dopt->optlen = 0; + if (sopt->optlen == 0) return 0; - } sptr = skb_network_header(skb); dptr = dopt->__data; @@ -140,11 +138,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) } else { dopt->ts_needtime = 0; - if (soffset + 8 <= optlen) { + if (soffset + 7 <= optlen) { __be32 addr; - memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { + memcpy(&addr, dptr+soffset-1, 4); + if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } @@ -157,7 +155,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) dopt->optlen += optlen; } if (sopt->srr) { - unsigned char * start = sptr+sopt->srr; + unsigned char *start = sptr+sopt->srr; __be32 faddr; optlen = start[1]; @@ -329,7 +327,7 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } - if (skb) { + if (rt) { memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); opt->is_changed = 1; } @@ -371,7 +369,7 @@ int ip_options_compile(struct net *net, goto error; } opt->ts = optptr - iph; - if (skb) { + if (rt) { memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); timeptr = (__be32*)&optptr[optptr[2]+3]; } @@ -499,19 +497,19 @@ void ip_options_undo(struct ip_options * opt) } } -static struct ip_options *ip_options_get_alloc(const int optlen) +static struct ip_options_rcu *ip_options_get_alloc(const int optlen) { - return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3), + return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); } -static int ip_options_get_finish(struct net *net, struct ip_options **optp, - struct ip_options *opt, int optlen) +static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp, + struct ip_options_rcu *opt, int optlen) { while (optlen & 3) - opt->__data[optlen++] = IPOPT_END; - opt->optlen = optlen; - if (optlen && ip_options_compile(net, opt, NULL)) { + opt->opt.__data[optlen++] = IPOPT_END; + opt->opt.optlen = optlen; + if (optlen && ip_options_compile(net, &opt->opt, NULL)) { kfree(opt); return -EINVAL; } @@ -520,29 +518,29 @@ static int ip_options_get_finish(struct net *net, struct ip_options **optp, return 0; } -int ip_options_get_from_user(struct net *net, struct ip_options **optp, +int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp, unsigned char __user *data, int optlen) { - struct ip_options *opt = ip_options_get_alloc(optlen); + struct ip_options_rcu *opt = ip_options_get_alloc(optlen); if (!opt) return -ENOMEM; - if (optlen && copy_from_user(opt->__data, data, optlen)) { + if (optlen && copy_from_user(opt->opt.__data, data, optlen)) { kfree(opt); return -EFAULT; } return ip_options_get_finish(net, optp, opt, optlen); } -int ip_options_get(struct net *net, struct ip_options **optp, +int ip_options_get(struct net *net, struct ip_options_rcu **optp, unsigned char *data, int optlen) { - struct ip_options *opt = ip_options_get_alloc(optlen); + struct ip_options_rcu *opt = ip_options_get_alloc(optlen); if (!opt) return -ENOMEM; if (optlen) - memcpy(opt->__data, data, optlen); + memcpy(opt->opt.__data, data, optlen); return ip_options_get_finish(net, optp, opt, optlen); } @@ -555,7 +553,7 @@ void ip_forward_options(struct sk_buff *skb) if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; - ip_rt_get_source(&optptr[optptr[2]-5], rt); + ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); opt->is_changed = 1; } if (opt->srr_is_hit) { @@ -569,19 +567,18 @@ void ip_forward_options(struct sk_buff *skb) ) { if (srrptr + 3 > srrspace) break; - if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) + if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - ip_rt_get_source(&optptr[srrptr-1], rt); - ip_hdr(skb)->daddr = rt->rt_dst; + ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); if (opt->ts_needaddr) { optptr = raw + opt->ts; - ip_rt_get_source(&optptr[optptr[2]-9], rt); + ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); opt->is_changed = 1; } } @@ -603,7 +600,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) unsigned long orefdst; int err; - if (!opt->srr) + if (!rt) return 0; if (skb->pkt_type != PACKET_HOST) @@ -637,7 +634,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ - memcpy(&iph->daddr, &optptr[srrptr-1], 4); + iph->daddr = nexthop; opt->is_changed = 1; } if (srrptr <= srrspace) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 04c7b3b..98af369 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -140,14 +140,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) * */ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - __be32 saddr, __be32 daddr, struct ip_options *opt) + __be32 saddr, __be32 daddr, struct ip_options_rcu *opt) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); struct iphdr *iph; /* Build the IP header. */ - skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; @@ -158,14 +158,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); + iph->saddr = saddr; iph->protocol = sk->sk_protocol; ip_select_ident(iph, &rt->dst, sk); - if (opt && opt->optlen) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, daddr, rt, 0); + if (opt && opt->opt.optlen) { + iph->ihl += opt->opt.optlen>>2; + ip_options_build(skb, &opt->opt, daddr, rt, 0); } skb->priority = sk->sk_priority; @@ -312,11 +312,12 @@ int ip_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_queue_xmit(struct sk_buff *skb) +int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); - struct ip_options *opt = inet->opt; + struct ip_options_rcu *inet_opt; + struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res; @@ -325,6 +326,8 @@ int ip_queue_xmit(struct sk_buff *skb) * f.e. by something like SCTP. */ rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + fl4 = &fl->u.ip4; rt = skb_rtable(skb); if (rt != NULL) goto packet_routed; @@ -336,38 +339,32 @@ int ip_queue_xmit(struct sk_buff *skb) /* Use correct destination address if we have options. */ daddr = inet->inet_daddr; - if(opt && opt->srr) - daddr = opt->faddr; - - { - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = inet->inet_saddr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = inet->inet_dport }; - - /* If this fails, retransmit mechanism of transport layer will - * keep trying until route appears or the connection times - * itself out. - */ - security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) - goto no_route; - } + if (inet_opt && inet_opt->opt.srr) + daddr = inet_opt->opt.faddr; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. + */ + rt = ip_route_output_ports(sock_net(sk), fl4, sk, + daddr, inet->inet_saddr, + inet->inet_dport, + inet->inet_sport, + sk->sk_protocol, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (IS_ERR(rt)) + goto no_route; sk_setup_caps(sk, &rt->dst); } skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ - skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); @@ -377,13 +374,13 @@ packet_routed: iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; /* Transport layer set skb->h.foo itself. */ - if (opt && opt->optlen) { - iph->ihl += opt->optlen >> 2; - ip_options_build(skb, opt, inet->inet_daddr, rt, 0); + if (inet_opt && inet_opt->opt.optlen) { + iph->ihl += inet_opt->opt.optlen >> 2; + ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } ip_select_ident_more(iph, &rt->dst, sk, @@ -609,7 +606,7 @@ slow_path: /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) len = mtu; - /* IF: we are not sending upto and including the packet end + /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < left) { len &= ~7; @@ -733,6 +730,7 @@ csum_page(struct page *page, int offset, int copy) } static inline int ip_ufo_append_data(struct sock *sk, + struct sk_buff_head *queue, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, @@ -745,7 +743,7 @@ static inline int ip_ufo_append_data(struct sock *sk, * device, so create one single skb packet containing complete * udp datagram */ - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + if ((skb = skb_peek_tail(queue)) == NULL) { skb = sock_alloc_send_skb(sk, hh_len + fragheaderlen + transhdrlen + 20, (flags & MSG_DONTWAIT), &err); @@ -767,40 +765,30 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; - sk->sk_sndmsg_off = 0; /* specify the length of each IP datagram fragment */ skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - __skb_queue_tail(&sk->sk_write_queue, skb); + __skb_queue_tail(queue, skb); } return skb_append_datato_frags(sk, skb, getfrag, from, (length - transhdrlen)); } -/* - * ip_append_data() and ip_append_page() can make one large IP datagram - * from many pieces of data. Each pieces will be holded on the socket - * until ip_push_pending_frames() is called. Each piece can be a page - * or non-page data. - * - * Not only UDP, other transport protocols - e.g. raw sockets - can use - * this interface potentially. - * - * LATER: length must be adjusted by pad at tail, when it is required. - */ -int ip_append_data(struct sock *sk, - int getfrag(void *from, char *to, int offset, int len, - int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, - struct ipcm_cookie *ipc, struct rtable **rtp, - unsigned int flags) +static int __ip_append_data(struct sock *sk, + struct flowi4 *fl4, + struct sk_buff_head *queue, + struct inet_cork *cork, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - struct ip_options *opt = NULL; + struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; int mtu; @@ -809,59 +797,20 @@ int ip_append_data(struct sock *sk, int offset = 0; unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; - struct rtable *rt; - - if (flags&MSG_PROBE) - return 0; + struct rtable *rt = (struct rtable *)cork->dst; - if (skb_queue_empty(&sk->sk_write_queue)) { - /* - * setup for corking. - */ - opt = ipc->opt; - if (opt) { - if (inet->cork.opt == NULL) { - inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); - if (unlikely(inet->cork.opt == NULL)) - return -ENOBUFS; - } - memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); - inet->cork.flags |= IPCORK_OPT; - inet->cork.addr = ipc->addr; - } - rt = *rtp; - if (unlikely(!rt)) - return -EFAULT; - /* - * We steal reference to this route, caller should not release it - */ - *rtp = NULL; - inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->dst.dev->mtu : - dst_mtu(rt->dst.path); - inet->cork.dst = &rt->dst; - inet->cork.length = 0; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; - exthdrlen = rt->dst.header_len; - length += exthdrlen; - transhdrlen += exthdrlen; - } else { - rt = (struct rtable *)inet->cork.dst; - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + exthdrlen = transhdrlen ? rt->dst.header_len : 0; + length += exthdrlen; + transhdrlen += exthdrlen; + mtu = cork->fragsize; - transhdrlen = 0; - exthdrlen = 0; - mtu = inet->cork.fragsize; - } hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - if (inet->cork.length + length > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, + if (cork->length + length > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu-exthdrlen); return -EMSGSIZE; } @@ -876,15 +825,15 @@ int ip_append_data(struct sock *sk, !exthdrlen) csummode = CHECKSUM_PARTIAL; - skb = skb_peek_tail(&sk->sk_write_queue); + skb = skb_peek_tail(queue); - inet->cork.length += length; + cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { - err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, - fragheaderlen, transhdrlen, mtu, - flags); + err = ip_ufo_append_data(sk, queue, getfrag, from, length, + hh_len, fragheaderlen, transhdrlen, + mtu, flags); if (err) goto error; return 0; @@ -961,7 +910,7 @@ alloc_new_skb: else /* only the initial fragment is time stamped */ - ipc->tx_flags = 0; + cork->tx_flags = 0; } if (skb == NULL) goto error; @@ -972,7 +921,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); - skb_shinfo(skb)->tx_flags = ipc->tx_flags; + skb_shinfo(skb)->tx_flags = cork->tx_flags; /* * Find where to start putting bytes. @@ -1009,7 +958,7 @@ alloc_new_skb: /* * Put the packet on the pending queue. */ - __skb_queue_tail(&sk->sk_write_queue, skb); + __skb_queue_tail(queue, skb); continue; } @@ -1029,8 +978,8 @@ alloc_new_skb: } else { int i = skb_shinfo(skb)->nr_frags; skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; - struct page *page = sk->sk_sndmsg_page; - int off = sk->sk_sndmsg_off; + struct page *page = cork->page; + int off = cork->off; unsigned int left; if (page && (left = PAGE_SIZE - off) > 0) { @@ -1042,7 +991,7 @@ alloc_new_skb: goto error; } get_page(page); - skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + skb_fill_page_desc(skb, i, page, off, 0); frag = &skb_shinfo(skb)->frags[i]; } } else if (i < MAX_SKB_FRAGS) { @@ -1053,8 +1002,8 @@ alloc_new_skb: err = -ENOMEM; goto error; } - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; + cork->page = page; + cork->off = 0; skb_fill_page_desc(skb, i, page, 0, 0); frag = &skb_shinfo(skb)->frags[i]; @@ -1066,7 +1015,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - sk->sk_sndmsg_off += copy; + cork->off += copy; frag->size += copy; skb->len += copy; skb->data_len += copy; @@ -1080,18 +1029,95 @@ alloc_new_skb: return 0; error: - inet->cork.length -= length; + cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } -ssize_t ip_append_page(struct sock *sk, struct page *page, +static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, + struct ipcm_cookie *ipc, struct rtable **rtp) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_options_rcu *opt; + struct rtable *rt; + + /* + * setup for corking. + */ + opt = ipc->opt; + if (opt) { + if (cork->opt == NULL) { + cork->opt = kmalloc(sizeof(struct ip_options) + 40, + sk->sk_allocation); + if (unlikely(cork->opt == NULL)) + return -ENOBUFS; + } + memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); + cork->flags |= IPCORK_OPT; + cork->addr = ipc->addr; + } + rt = *rtp; + if (unlikely(!rt)) + return -EFAULT; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; + cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(rt->dst.path); + cork->dst = &rt->dst; + cork->length = 0; + cork->tx_flags = ipc->tx_flags; + cork->page = NULL; + cork->off = 0; + + return 0; +} + +/* + * ip_append_data() and ip_append_page() can make one large IP datagram + * from many pieces of data. Each pieces will be holded on the socket + * until ip_push_pending_frames() is called. Each piece can be a page + * or non-page data. + * + * Not only UDP, other transport protocols - e.g. raw sockets - can use + * this interface potentially. + * + * LATER: length must be adjusted by pad at tail, when it is required. + */ +int ip_append_data(struct sock *sk, struct flowi4 *fl4, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) { + err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); + if (err) + return err; + } else { + transhdrlen = 0; + } + + return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, + from, length, transhdrlen, flags); +} + +ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, int offset, size_t size, int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; struct rtable *rt; struct ip_options *opt = NULL; + struct inet_cork *cork; int hh_len; int mtu; int len; @@ -1107,28 +1133,29 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (skb_queue_empty(&sk->sk_write_queue)) return -EINVAL; - rt = (struct rtable *)inet->cork.dst; - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + cork = &inet->cork.base; + rt = (struct rtable *)cork->dst; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; if (!(rt->dst.dev->features&NETIF_F_SG)) return -EOPNOTSUPP; hh_len = LL_RESERVED_SPACE(rt->dst.dev); - mtu = inet->cork.fragsize; + mtu = cork->fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - if (inet->cork.length + size > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); + if (cork->length + size > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu); return -EMSGSIZE; } if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) return -EINVAL; - inet->cork.length += size; + cork->length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { @@ -1223,45 +1250,47 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, return 0; error: - inet->cork.length -= size; + cork->length -= size; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } -static void ip_cork_release(struct inet_sock *inet) +static void ip_cork_release(struct inet_cork *cork) { - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - dst_release(inet->cork.dst); - inet->cork.dst = NULL; + cork->flags &= ~IPCORK_OPT; + kfree(cork->opt); + cork->opt = NULL; + dst_release(cork->dst); + cork->dst = NULL; } /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. */ -int ip_push_pending_frames(struct sock *sk) +struct sk_buff *__ip_make_skb(struct sock *sk, + struct flowi4 *fl4, + struct sk_buff_head *queue, + struct inet_cork *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = (struct rtable *)inet->cork.dst; + struct rtable *rt = (struct rtable *)cork->dst; struct iphdr *iph; __be16 df = 0; __u8 ttl; - int err = 0; - if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + if ((skb = __skb_dequeue(queue)) == NULL) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); - while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + while ((tmp_skb = __skb_dequeue(queue)) != NULL) { __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); @@ -1287,8 +1316,8 @@ int ip_push_pending_frames(struct sock *sk) ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; @@ -1298,17 +1327,18 @@ int ip_push_pending_frames(struct sock *sk) iph = (struct iphdr *)skb->data; iph->version = 4; iph->ihl = 5; - if (opt) { - iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, inet->cork.addr, rt, 0); - } iph->tos = inet->tos; iph->frag_off = df; ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; + + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, cork->addr, rt, 0); + } skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; @@ -1316,44 +1346,99 @@ int ip_push_pending_frames(struct sock *sk) * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount */ - inet->cork.dst = NULL; + cork->dst = NULL; skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); - /* Netfilter gets whole the not fragmented skb. */ + ip_cork_release(cork); +out: + return skb; +} + +int ip_send_skb(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + int err; + err = ip_local_out(skb); if (err) { if (err > 0) err = net_xmit_errno(err); if (err) - goto error; + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); } -out: - ip_cork_release(inet); return err; +} -error: - IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); - goto out; +int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) +{ + struct sk_buff *skb; + + skb = ip_finish_skb(sk, fl4); + if (!skb) + return 0; + + /* Netfilter gets whole the not fragmented skb. */ + return ip_send_skb(skb); } /* * Throw away all pending data on the socket. */ -void ip_flush_pending_frames(struct sock *sk) +static void __ip_flush_pending_frames(struct sock *sk, + struct sk_buff_head *queue, + struct inet_cork *cork) { struct sk_buff *skb; - while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) + while ((skb = __skb_dequeue_tail(queue)) != NULL) kfree_skb(skb); - ip_cork_release(inet_sk(sk)); + ip_cork_release(cork); } +void ip_flush_pending_frames(struct sock *sk) +{ + __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); +} + +struct sk_buff *ip_make_skb(struct sock *sk, + struct flowi4 *fl4, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_cork cork; + struct sk_buff_head queue; + int err; + + if (flags & MSG_PROBE) + return NULL; + + __skb_queue_head_init(&queue); + + cork.flags = 0; + cork.addr = 0; + cork.opt = NULL; + err = ip_setup_cork(sk, &cork, ipc, rtp); + if (err) + return ERR_PTR(err); + + err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, + from, length, transhdrlen, flags); + if (err) { + __ip_flush_pending_frames(sk, &queue, &cork); + return ERR_PTR(err); + } + + return __ip_make_skb(sk, fl4, &queue, &cork); +} /* * Fetch data from kernel space and fill in checksum if needed. @@ -1375,45 +1460,39 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * Should run single threaded per socket because it uses the sock * structure to pass arguments. */ -void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, - unsigned int len) +void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, + struct ip_reply_arg *arg, unsigned int len) { struct inet_sock *inet = inet_sk(sk); - struct { - struct ip_options opt; - char data[40]; - } replyopts; + struct ip_options_data replyopts; struct ipcm_cookie ipc; - __be32 daddr; + struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); - if (ip_options_echo(&replyopts.opt, skb)) + if (ip_options_echo(&replyopts.opt.opt, skb)) return; - daddr = ipc.addr = rt->rt_src; + ipc.addr = daddr; ipc.opt = NULL; ipc.tx_flags = 0; - if (replyopts.opt.optlen) { + if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; - if (ipc.opt->srr) - daddr = replyopts.opt.faddr; + if (replyopts.opt.opt.srr) + daddr = replyopts.opt.opt.faddr; } - { - struct flowi fl = { .oif = arg->bound_dev_if, - .fl4_dst = daddr, - .fl4_src = rt->rt_spec_dst, - .fl4_tos = RT_TOS(ip_hdr(skb)->tos), - .fl_ip_sport = tcp_hdr(skb)->dest, - .fl_ip_dport = tcp_hdr(skb)->source, - .proto = sk->sk_protocol, - .flags = ip_reply_arg_flowi_flags(arg) }; - security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(sock_net(sk), &rt, &fl)) - return; - } + flowi4_init_output(&fl4, arg->bound_dev_if, 0, + RT_TOS(ip_hdr(skb)->tos), + RT_SCOPE_UNIVERSE, sk->sk_protocol, + ip_reply_arg_flowi_flags(arg), + daddr, rt->rt_spec_dst, + tcp_hdr(skb)->source, tcp_hdr(skb)->dest); + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) + return; /* And let IP do all the hard work. @@ -1426,7 +1505,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, + ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { if (arg->csumoffset >= 0) @@ -1434,7 +1513,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); skb->ip_summed = CHECKSUM_NONE; - ip_push_pending_frames(sk); + ip_push_pending_frames(sk, &fl4); } bh_unlock_sock(sk); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3948c86..ab0c9ef 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -131,7 +131,7 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); __be16 *ports = (__be16 *)skb_transport_header(skb); if (skb_transport_offset(skb) + 4 > skb->len) @@ -451,6 +451,11 @@ out: } +static void opt_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct ip_options_rcu, rcu)); +} + /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. @@ -497,13 +502,16 @@ static int do_ip_setsockopt(struct sock *sk, int level, switch (optname) { case IP_OPTIONS: { - struct ip_options *opt = NULL; + struct ip_options_rcu *old, *opt = NULL; + if (optlen > 40) goto e_inval; err = ip_options_get_from_user(sock_net(sk), &opt, optval, optlen); if (err) break; + old = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); if (inet->is_icsk) { struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) @@ -512,17 +520,18 @@ static int do_ip_setsockopt(struct sock *sk, int level, (TCPF_LISTEN | TCPF_CLOSE)) && inet->inet_daddr != LOOPBACK4_IPV6)) { #endif - if (inet->opt) - icsk->icsk_ext_hdr_len -= inet->opt->optlen; + if (old) + icsk->icsk_ext_hdr_len -= old->opt.optlen; if (opt) - icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_ext_hdr_len += opt->opt.optlen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } #endif } - opt = xchg(&inet->opt, opt); - kfree(opt); + rcu_assign_pointer(inet->inet_opt, opt); + if (old) + call_rcu(&old->rcu, opt_kfree_rcu); break; } case IP_PKTINFO: @@ -1081,12 +1090,16 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_OPTIONS: { unsigned char optbuf[sizeof(struct ip_options)+40]; - struct ip_options * opt = (struct ip_options *)optbuf; + struct ip_options *opt = (struct ip_options *)optbuf; + struct ip_options_rcu *inet_opt; + + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); opt->optlen = 0; - if (inet->opt) - memcpy(optbuf, inet->opt, - sizeof(struct ip_options)+ - inet->opt->optlen); + if (inet_opt) + memcpy(optbuf, &inet_opt->opt, + sizeof(struct ip_options) + + inet_opt->opt.optlen); release_sock(sk); if (opt->optlen == 0) diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 6290675..c857f6f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -27,7 +27,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); __be32 spi; - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; @@ -36,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) return; spi = htonl(ntohs(ipch->cpi)); - x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 2b09775..ab7e554 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -87,8 +87,8 @@ #endif /* Define the friendly delay before and after opening net devices */ -#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */ -#define CONF_POST_OPEN 1 /* After opening: 1 second */ +#define CONF_POST_OPEN 10 /* After opening: 10 msecs */ +#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ @@ -188,14 +188,14 @@ struct ic_device { static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ static struct net_device *ic_dev __initdata = NULL; /* Selected device */ -static bool __init ic_device_match(struct net_device *dev) +static bool __init ic_is_init_dev(struct net_device *dev) { - if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : + if (dev->flags & IFF_LOOPBACK) + return false; + return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && - strncmp(dev->name, "dummy", 5))) - return true; - return false; + strncmp(dev->name, "dummy", 5)); } static int __init ic_open_devs(void) @@ -203,6 +203,7 @@ static int __init ic_open_devs(void) struct ic_device *d, **last; struct net_device *dev; unsigned short oflags; + unsigned long start; last = &ic_first_dev; rtnl_lock(); @@ -216,9 +217,7 @@ static int __init ic_open_devs(void) } for_each_netdev(&init_net, dev) { - if (dev->flags & IFF_LOOPBACK) - continue; - if (ic_device_match(dev)) { + if (ic_is_init_dev(dev)) { int able = 0; if (dev->mtu >= 364) able |= IC_BOOTP; @@ -252,6 +251,17 @@ static int __init ic_open_devs(void) dev->name, able, d->xid)); } } + + /* wait for a carrier on at least one device */ + start = jiffies; + while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) { + for_each_netdev(&init_net, dev) + if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) + goto have_carrier; + + msleep(1); + } +have_carrier: rtnl_unlock(); *last = NULL; @@ -1324,14 +1334,13 @@ static int __init wait_for_devices(void) { int i; - msleep(CONF_PRE_OPEN); for (i = 0; i < DEVICE_WAIT_MAX; i++) { struct net_device *dev; int found = 0; rtnl_lock(); for_each_netdev(&init_net, dev) { - if (ic_device_match(dev)) { + if (ic_is_init_dev(dev)) { found = 1; break; } @@ -1378,7 +1387,7 @@ static int __init ip_auto_config(void) return err; /* Give drivers a chance to settle */ - ssleep(CONF_POST_OPEN); + msleep(CONF_POST_OPEN); /* * If the config information is insufficient (e.g., our IP address or @@ -1444,7 +1453,7 @@ static int __init ip_auto_config(void) root_server_addr = addr; /* - * Use defaults whereever applicable. + * Use defaults wherever applicable. */ if (ic_defaults() < 0) return -1; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 988f52f..378b20b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -276,11 +276,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, dev_net_set(dev, net); - if (strchr(name, '%')) { - if (dev_alloc_name(dev, name) < 0) - goto failed_free; - } - nt = netdev_priv(dev); nt->parms = *parms; @@ -319,7 +314,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. */ - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; @@ -433,15 +428,16 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct pcpu_tstats *tstats; - struct iphdr *tiph = &tunnel->parms.iph; + const struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; + struct flowi4 fl4; int mtu; if (skb->protocol != htons(ETH_P_IP)) @@ -460,19 +456,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error_icmp; } - { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = dst, - .fl4_src= tiph->saddr, - .fl4_tos = RT_TOS(tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; - } + rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + dst, tiph->saddr, + 0, 0, + IPPROTO_IPIP, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; } tdev = rt->dst.dev; @@ -554,8 +545,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->frag_off = df; iph->protocol = IPPROTO_IPIP; iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; if ((iph->ttl = tiph->ttl) == 0) iph->ttl = old_iph->ttl; @@ -577,22 +568,22 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; + const struct iphdr *iph; tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = iph->daddr, - .fl4_src = iph->saddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; struct rtable *rt; - - if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { + struct flowi4 fl4; + + rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), + tunnel->parms.link); + if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } @@ -913,4 +904,4 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); -MODULE_ALIAS("tunl0"); +MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8b65a12..30a7763 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -148,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) return NULL; } -static int ipmr_fib_lookup(struct net *net, struct flowi *flp, +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { struct ipmr_result res; struct fib_lookup_arg arg = { .result = &res, }; int err; - err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); + err = fib_rules_lookup(net->ipv4.mr_rules_ops, + flowi4_to_flowi(flp4), 0, &arg); if (err < 0) return err; *mrt = res.mrt; @@ -283,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) return net->ipv4.mrt; } -static int ipmr_fib_lookup(struct net *net, struct flowi *flp, +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { *mrt = net->ipv4.mrt; @@ -435,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; - struct flowi fl = { - .oif = dev->ifindex, - .iif = skb->skb_iif, - .mark = skb->mark, + struct flowi4 fl4 = { + .flowi4_oif = dev->ifindex, + .flowi4_iif = skb->skb_iif, + .flowi4_mark = skb->mark, }; int err; - err = ipmr_fib_lookup(net, &fl, &mrt); + err = ipmr_fib_lookup(net, &fl4, &mrt); if (err < 0) { kfree_skb(skb); return err; @@ -1548,7 +1549,7 @@ static struct notifier_block ip_mr_notifier = { static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; - struct iphdr *old_iph = ip_hdr(skb); + const struct iphdr *old_iph = ip_hdr(skb); skb_push(skb, sizeof(struct iphdr)); skb->transport_header = skb->network_header; @@ -1594,6 +1595,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *dev; struct rtable *rt; + struct flowi4 fl4; int encap = 0; if (vif->dev == NULL) @@ -1611,26 +1613,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, #endif if (vif->flags & VIFF_TUNNEL) { - struct flowi fl = { - .oif = vif->link, - .fl4_dst = vif->remote, - .fl4_src = vif->local, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(net, &rt, &fl)) + rt = ip_route_output_ports(net, &fl4, NULL, + vif->remote, vif->local, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), vif->link); + if (IS_ERR(rt)) goto out_free; encap = sizeof(struct iphdr); } else { - struct flowi fl = { - .oif = vif->link, - .fl4_dst = iph->daddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(net, &rt, &fl)) + rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), vif->link); + if (IS_ERR(rt)) goto out_free; } @@ -1793,6 +1789,26 @@ dont_forward: return 0; } +static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = iph->tos, + .flowi4_oif = rt->rt_oif, + .flowi4_iif = rt->rt_iif, + .flowi4_mark = rt->rt_mark, + }; + struct mr_table *mrt; + int err; + + err = ipmr_fib_lookup(net, &fl4, &mrt); + if (err) + return ERR_PTR(err); + return mrt; +} /* * Multicast packets for forwarding arrive here @@ -1805,7 +1821,6 @@ int ip_mr_input(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; - int err; /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. @@ -1813,12 +1828,11 @@ int ip_mr_input(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; - err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); - if (err < 0) { + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) { kfree_skb(skb); - return err; + return PTR_ERR(mrt); } - if (!local) { if (IPCB(skb)->opt.router_alert) { if (ip_call_ra_chain(skb)) @@ -1946,9 +1960,9 @@ int pim_rcv_v1(struct sk_buff *skb) pim = igmp_hdr(skb); - if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) goto drop; - if (!mrt->mroute_do_pim || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; @@ -1978,9 +1992,9 @@ static int pim_rcv(struct sk_buff *skb) csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; - if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) goto drop; - if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); @@ -2027,20 +2041,20 @@ rtattr_failure: return -EMSGSIZE; } -int ipmr_get_route(struct net *net, - struct sk_buff *skb, struct rtmsg *rtm, int nowait) +int ipmr_get_route(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr, + struct rtmsg *rtm, int nowait) { - int err; - struct mr_table *mrt; struct mfc_cache *cache; - struct rtable *rt = skb_rtable(skb); + struct mr_table *mrt; + int err; mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); if (mrt == NULL) return -ENOENT; rcu_read_lock(); - cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); + cache = ipmr_cache_find(mrt, saddr, daddr); if (cache == NULL) { struct sk_buff *skb2; @@ -2073,8 +2087,8 @@ int ipmr_get_route(struct net *net, skb_reset_network_header(skb2); iph = ip_hdr(skb2); iph->ihl = sizeof(struct iphdr) >> 2; - iph->saddr = rt->rt_src; - iph->daddr = rt->rt_dst; + iph->saddr = saddr; + iph->daddr = daddr; iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2); read_unlock(&mrt_lock); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 994a1f2..4614bab 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -16,7 +16,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - struct flowi fl = {}; + struct flowi4 fl4 = {}; unsigned long orefdst; unsigned int hh_len; unsigned int type; @@ -31,14 +31,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ if (addr_type == RTN_LOCAL) { - fl.fl4_dst = iph->daddr; + fl4.daddr = iph->daddr; if (type == RTN_LOCAL) - fl.fl4_src = iph->saddr; - fl.fl4_tos = RT_TOS(iph->tos); - fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; - fl.mark = skb->mark; - fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; - if (ip_route_output_key(net, &rt, &fl) != 0) + fl4.saddr = iph->saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) return -1; /* Drop old route. */ @@ -47,8 +48,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ - fl.fl4_dst = iph->saddr; - if (ip_route_output_key(net, &rt, &fl) != 0) + fl4.daddr = iph->saddr; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) return -1; orefdst = skb->_skb_refdst; @@ -66,10 +68,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - xfrm_decode_session(skb, &fl, AF_INET) == 0) { + xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); skb_dst_set(skb, NULL); - if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) + dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); + if (IS_ERR(dst)) return -1; skb_dst_set(skb, dst); } @@ -102,7 +105,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) + dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + if (IS_ERR(dst)) return -1; skb_dst_drop(skb); @@ -217,9 +221,14 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, return csum; } -static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) +static int nf_ip_route(struct net *net, struct dst_entry **dst, + struct flowi *fl, bool strict __always_unused) { - return ip_route_output_key(&init_net, (struct rtable **)dst, fl); + struct rtable *rt = ip_route_output_key(net, &fl->u.ip4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + *dst = &rt->dst; + return 0; } static const struct nf_afinfo nf_ip_afinfo = { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index babd1a2..1dfc18a 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -64,16 +64,6 @@ config IP_NF_IPTABLES if IP_NF_IPTABLES # The matches. -config IP_NF_MATCH_ADDRTYPE - tristate '"addrtype" address type match support' - depends on NETFILTER_ADVANCED - help - This option allows you to match what routing thinks of an address, - eg. UNICAST, LOCAL, BROADCAST, ... - - If you want to compile it as a module, say M here and read - <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. - config IP_NF_MATCH_AH tristate '"ah" match support' depends on NETFILTER_ADVANCED @@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" - depends on NF_NAT + depends on NF_CONNTRACK_SNMP && NF_NAT depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP ---help--- This module implements an Application Layer Gateway (ALG) for diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19eb59d..dca2082 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o # matches -obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e855fff..fd7a3f6 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -76,7 +76,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, } /* - * Unfortunatly, _b and _mask are not aligned to an int (or long int) + * Unfortunately, _b and _mask are not aligned to an int (or long int) * Some arches dont care, unrolling the loop is a win on them. * For other arches, we only have a 16bit alignement. */ @@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, void *table_base; const struct xt_table_info *private; struct xt_action_param acpar; + unsigned int addend; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; @@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - xt_info_rdlock_bh(); + local_bh_disable(); + addend = xt_write_recseq_begin(); private = table->private; table_base = private->entries[smp_processor_id()]; @@ -338,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); + xt_write_recseq_end(addend); + local_bh_enable(); if (acpar.hotdrop) return NF_DROP; @@ -712,7 +715,7 @@ static void get_counters(const struct xt_table_info *t, unsigned int i; for_each_possible_cpu(cpu) { - seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; + seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { @@ -720,10 +723,10 @@ static void get_counters(const struct xt_table_info *t, unsigned int start; do { - start = read_seqbegin(lock); + start = read_seqcount_begin(s); bcnt = iter->counters.bcnt; pcnt = iter->counters.pcnt; - } while (read_seqretry(lock, start)); + } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; @@ -866,6 +869,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1065,6 +1069,7 @@ static int do_replace(struct net *net, const void __user *user, /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1113,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user, int ret = 0; void *loc_cpu_entry; struct arpt_entry *iter; + unsigned int addend; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1169,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user, /* Choose the copy that is on our node */ curcpu = smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; - xt_info_wrlock(curcpu); + addend = xt_write_recseq_begin(); xt_entry_foreach(iter, loc_cpu_entry, private->size) { ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); ++i; } - xt_info_wrunlock(curcpu); + xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); @@ -1333,6 +1339,7 @@ static int translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); + xt_compat_init_offsets(NFPROTO_ARP, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, @@ -1486,6 +1493,7 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1738,6 +1746,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, rev.revision, 1, &ret), @@ -1869,7 +1878,7 @@ static int __init arp_tables_init(void) if (ret < 0) goto err1; - /* Noone else will be downing sem now, so we won't sleep */ + /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); if (ret < 0) goto err2; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 652efea..7647438 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info) } EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); -/* - We keep a set of rules for each CPU, so we can avoid write-locking - them in the softirq when updating the counters and therefore - only need to read-lock in the softirq; doing a write_lock_bh() in user - context stops packets coming through and allows user context to read - the counters or update the rules. - - Hence the start of any table is given by get_table() below. */ - /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool @@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb, unsigned int *stackptr, origptr, cpu; const struct xt_table_info *private; struct xt_action_param acpar; + unsigned int addend; /* Initialization */ ip = ip_hdr(skb); @@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb, acpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - xt_info_rdlock_bh(); + local_bh_disable(); + addend = xt_write_recseq_begin(); private = table->private; cpu = smp_processor_id(); table_base = private->entries[cpu]; @@ -387,7 +380,7 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned)(-v) - 1; break; } - if (*stackptr == 0) { + if (*stackptr <= origptr) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " @@ -427,10 +420,12 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); pr_debug("Exiting %s; resetting sp from %u to %u\n", __func__, *stackptr, origptr); *stackptr = origptr; + xt_write_recseq_end(addend); + local_bh_enable(); + #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else @@ -886,7 +881,7 @@ get_counters(const struct xt_table_info *t, unsigned int i; for_each_possible_cpu(cpu) { - seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; + seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries[cpu], t->size) { @@ -894,10 +889,10 @@ get_counters(const struct xt_table_info *t, unsigned int start; do { - start = read_seqbegin(lock); + start = read_seqcount_begin(s); bcnt = iter->counters.bcnt; pcnt = iter->counters.pcnt; - } while (read_seqretry(lock, start)); + } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ @@ -1063,6 +1058,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1261,6 +1257,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1310,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user, int ret = 0; void *loc_cpu_entry; struct ipt_entry *iter; + unsigned int addend; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1366,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user, /* Choose the copy that is on our node */ curcpu = smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; - xt_info_wrlock(curcpu); + addend = xt_write_recseq_begin(); xt_entry_foreach(iter, loc_cpu_entry, private->size) { ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); ++i; } - xt_info_wrunlock(curcpu); + xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); @@ -1664,6 +1662,7 @@ translate_compat_table(struct net *net, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); + xt_compat_init_offsets(AF_INET, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, @@ -1805,6 +1804,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -2034,6 +2034,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; if (cmd == IPT_SO_GET_REVISION_TARGET) target = 1; @@ -2228,7 +2229,7 @@ static int __init ip_tables_init(void) if (ret < 0) goto err1; - /* Noone else will be downing sem now, so we won't sleep */ + /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); if (ret < 0) goto err2; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 1e26a48..d609ac3 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) * that the ->target() function isn't called after ->destroy() */ ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) { - pr_info("no conntrack!\n"); - /* FIXME: need to drop invalid ones, since replies - * to outgoing connections of other nodes will be - * marked as INVALID */ + if (ct == NULL) return NF_DROP; - } /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ @@ -669,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, char buffer[PROC_WRITELEN+1]; unsigned long nodenum; - if (copy_from_user(buffer, input, PROC_WRITELEN)) + if (size > PROC_WRITELEN) + return -EIO; + if (copy_from_user(buffer, input, size)) return -EFAULT; + buffer[size] = 0; if (*buffer == '+') { nodenum = simple_strtoul(buffer+1, NULL, 10); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 72ffc8f..d76d6c9 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf, } #endif - /* MAC logging for input path only. */ - if (in && !out) + if (in != NULL) dump_mac_header(m, loginfo, skb); dump_packet(m, loginfo, skb, 0); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c deleted file mode 100644 index db8bff0..0000000 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * iptables module to match inet_addr_type() of an ip. - * - * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> - * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter_ipv4/ipt_addrtype.h> -#include <linux/netfilter/x_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables: address type match for IPv4"); - -static inline bool match_type(struct net *net, const struct net_device *dev, - __be32 addr, u_int16_t mask) -{ - return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); -} - -static bool -addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - const struct ipt_addrtype_info *info = par->matchinfo; - const struct iphdr *iph = ip_hdr(skb); - bool ret = true; - - if (info->source) - ret &= match_type(net, NULL, iph->saddr, info->source) ^ - info->invert_source; - if (info->dest) - ret &= match_type(net, NULL, iph->daddr, info->dest) ^ - info->invert_dest; - - return ret; -} - -static bool -addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - const struct ipt_addrtype_info_v1 *info = par->matchinfo; - const struct iphdr *iph = ip_hdr(skb); - const struct net_device *dev = NULL; - bool ret = true; - - if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) - dev = par->in; - else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) - dev = par->out; - - if (info->source) - ret &= match_type(net, dev, iph->saddr, info->source) ^ - (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); - if (ret && info->dest) - ret &= match_type(net, dev, iph->daddr, info->dest) ^ - !!(info->flags & IPT_ADDRTYPE_INVERT_DEST); - return ret; -} - -static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) -{ - struct ipt_addrtype_info_v1 *info = par->matchinfo; - - if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { - pr_info("both incoming and outgoing " - "interface limitation cannot be selected\n"); - return -EINVAL; - } - - if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_LOCAL_IN)) && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { - pr_info("output interface limitation " - "not valid in PREROUTING and INPUT\n"); - return -EINVAL; - } - - if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_OUT)) && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { - pr_info("input interface limitation " - "not valid in POSTROUTING and OUTPUT\n"); - return -EINVAL; - } - - return 0; -} - -static struct xt_match addrtype_mt_reg[] __read_mostly = { - { - .name = "addrtype", - .family = NFPROTO_IPV4, - .match = addrtype_mt_v0, - .matchsize = sizeof(struct ipt_addrtype_info), - .me = THIS_MODULE - }, - { - .name = "addrtype", - .family = NFPROTO_IPV4, - .revision = 1, - .match = addrtype_mt_v1, - .checkentry = addrtype_mt_checkentry_v1, - .matchsize = sizeof(struct ipt_addrtype_info_v1), - .me = THIS_MODULE - } -}; - -static int __init addrtype_mt_init(void) -{ - return xt_register_matches(addrtype_mt_reg, - ARRAY_SIZE(addrtype_mt_reg)); -} - -static void __exit addrtype_mt_exit(void) -{ - xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); -} - -module_init(addrtype_mt_init); -module_exit(addrtype_mt_exit); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 294a2a3..aef5d1f 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ - if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 63f60fc..5585980 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -20,6 +20,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_acct.h> +#include <linux/rculist_nulls.h> struct ct_iter_state { struct seq_net_private p; @@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < net->ct.htable_size; st->bucket++) { - n = rcu_dereference(net->ct.hash[st->bucket].first); + n = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= net->ct.htable_size) return NULL; } - head = rcu_dereference(net->ct.hash[st->bucket].first); + head = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); } return head; } @@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(net->ct.expect_hash[st->bucket].first); + n = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); if (n) return n; } @@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(net->ct.expect_hash[st->bucket].first); + head = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } return head; } diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 0f23b3f..703f366f 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; + int res; exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) + res = nf_ct_expect_related(exp); + if (res == 0) break; - else if (ret != -EBUSY) { + else if (res != -EBUSY) { port = 0; break; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index c04787c..9c71b27 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, manips not an issue. */ if (maniptype == IP_NAT_MANIP_SRC && !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { - if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { + /* try the original tuple first */ + if (in_range(orig_tuple, range)) { + if (!nf_nat_used_tuple(orig_tuple, ct)) { + *tuple = *orig_tuple; + return; + } + } else if (find_appropriate_src(net, zone, orig_tuple, tuple, + range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; @@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct, struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; - int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); /* nat helper or nfctnetlink also setup binding */ nat = nfct_nat(ct); @@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; } - /* Place in source hash if this is the first time. */ - if (have_to_hash) { + if (maniptype == IP_NAT_MANIP_SRC) { unsigned int srchash; srchash = hash_by_src(net, nf_ct_zone(ct), @@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct, /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) - set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_DST_NAT_DONE; else - set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } @@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) int ret = 0; spin_lock_bh(&nf_nat_lock); - if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + if (rcu_dereference_protected( + nf_nat_protos[proto->protonum], + lockdep_is_held(&nf_nat_lock) + ) != &nf_nat_unknown_protocol) { ret = -EBUSY; goto out; } @@ -513,7 +521,7 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) } EXPORT_SYMBOL(nf_nat_protocol_register); -/* Noone stores the protocol anywhere; simply delete it. */ +/* No one stores the protocol anywhere; simply delete it. */ void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) { spin_lock_bh(&nf_nat_lock); @@ -524,7 +532,7 @@ void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) } EXPORT_SYMBOL(nf_nat_protocol_unregister); -/* Noone using conntrack by the time this called. */ +/* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); @@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (nat == NULL || nat->ct == NULL) return; - NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); + NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); spin_lock_bh(&nf_nat_lock); hlist_del_rcu(&nat->bysource); @@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old) struct nf_conn_nat *old_nat = old; struct nf_conn *ct = old_nat->ct; - if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) + if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) return; spin_lock_bh(&nf_nat_lock); - new_nat->ct = ct; hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); spin_unlock_bh(&nf_nat_lock); } @@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net) { /* Leave them the same for the moment. */ net->ipv4.nat_htable_size = net->ct.htable_size; - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); - nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - net->ipv4.nat_htable_size); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 31427fb..99cfa28 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -153,7 +153,7 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); -static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data, +static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data, int datalen, __sum16 *check, int oldlen) { struct rtable *rt = skb_rtable(skb); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ee5f419..8812a02 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -54,6 +54,7 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); @@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void) { int ret = 0; - ret = nf_conntrack_helper_register(&snmp_helper); - if (ret < 0) - return ret; + BUG_ON(nf_nat_snmp_hook != NULL); + rcu_assign_pointer(nf_nat_snmp_hook, help); + ret = nf_conntrack_helper_register(&snmp_trap_helper); if (ret < 0) { nf_conntrack_helper_unregister(&snmp_helper); @@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { - nf_conntrack_helper_unregister(&snmp_helper); + rcu_assign_pointer(nf_nat_snmp_hook, NULL); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 95481fe..7317bdf 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -31,6 +31,7 @@ #ifdef CONFIG_XFRM static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) { + struct flowi4 *fl4 = &fl->u.ip4; const struct nf_conn *ct; const struct nf_conntrack_tuple *t; enum ip_conntrack_info ctinfo; @@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) statusbit = IPS_SRC_NAT; if (ct->status & statusbit) { - fl->fl4_dst = t->dst.u3.ip; + fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) - fl->fl_ip_dport = t->dst.u.tcp.port; + fl4->fl4_dport = t->dst.u.tcp.port; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { - fl->fl4_src = t->src.u3.ip; + fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) - fl->fl_ip_sport = t->src.u.tcp.port; + fl4->fl4_sport = t->src.u.tcp.port; } } #endif diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c new file mode 100644 index 0000000..9aaa671 --- /dev/null +++ b/net/ipv4/ping.c @@ -0,0 +1,932 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * "Ping" sockets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Based on ipv4/udp.c code. + * + * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), + * Pavel Kankovsky (for Linux 2.4.32) + * + * Pavel gave all rights to bugs to Vasiliy, + * none of the bugs are Pavel's now. + * + */ + +#include <asm/system.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/snmp.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/ping.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/route.h> +#include <net/inet_common.h> +#include <net/checksum.h> + + +static struct ping_table ping_table; + +static u16 ping_port_rover; + +static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask) +{ + int res = (num + net_hash_mix(net)) & mask; + pr_debug("hash(%d) = %d\n", num, res); + return res; +} + +static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, + struct net *net, unsigned num) +{ + return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; +} + +static int ping_v4_get_port(struct sock *sk, unsigned short ident) +{ + struct hlist_nulls_node *node; + struct hlist_nulls_head *hlist; + struct inet_sock *isk, *isk2; + struct sock *sk2 = NULL; + + isk = inet_sk(sk); + write_lock_bh(&ping_table.lock); + if (ident == 0) { + u32 i; + u16 result = ping_port_rover + 1; + + for (i = 0; i < (1L << 16); i++, result++) { + if (!result) + result++; /* avoid zero */ + hlist = ping_hashslot(&ping_table, sock_net(sk), + result); + ping_portaddr_for_each_entry(sk2, node, hlist) { + isk2 = inet_sk(sk2); + + if (isk2->inet_num == result) + goto next_port; + } + + /* found */ + ping_port_rover = ident = result; + break; +next_port: + ; + } + if (i >= (1L << 16)) + goto fail; + } else { + hlist = ping_hashslot(&ping_table, sock_net(sk), ident); + ping_portaddr_for_each_entry(sk2, node, hlist) { + isk2 = inet_sk(sk2); + + if ((isk2->inet_num == ident) && + (sk2 != sk) && + (!sk2->sk_reuse || !sk->sk_reuse)) + goto fail; + } + } + + pr_debug("found port/ident = %d\n", ident); + isk->inet_num = ident; + if (sk_unhashed(sk)) { + pr_debug("was not hashed\n"); + sock_hold(sk); + hlist_nulls_add_head(&sk->sk_nulls_node, hlist); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + } + write_unlock_bh(&ping_table.lock); + return 0; + +fail: + write_unlock_bh(&ping_table.lock); + return 1; +} + +static void ping_v4_hash(struct sock *sk) +{ + pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); + BUG(); /* "Please do not press this button again." */ +} + +static void ping_v4_unhash(struct sock *sk) +{ + struct inet_sock *isk = inet_sk(sk); + pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); + if (sk_hashed(sk)) { + write_lock_bh(&ping_table.lock); + hlist_nulls_del(&sk->sk_nulls_node); + sock_put(sk); + isk->inet_num = isk->inet_sport = 0; + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&ping_table.lock); + } +} + +static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr, + u16 ident, int dif) +{ + struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); + struct sock *sk = NULL; + struct inet_sock *isk; + struct hlist_nulls_node *hnode; + + pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n", + (int)ident, (unsigned long)daddr, dif); + read_lock_bh(&ping_table.lock); + + ping_portaddr_for_each_entry(sk, hnode, hslot) { + isk = inet_sk(sk); + + pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk, + (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr, + sk->sk_bound_dev_if); + + pr_debug("iterate\n"); + if (isk->inet_num != ident) + continue; + if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr) + continue; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + continue; + + sock_hold(sk); + goto exit; + } + + sk = NULL; +exit: + read_unlock_bh(&ping_table.lock); + + return sk; +} + +static void inet_get_ping_group_range_net(struct net *net, gid_t *low, + gid_t *high) +{ + gid_t *data = net->ipv4.sysctl_ping_group_range; + unsigned seq; + do { + seq = read_seqbegin(&sysctl_local_ports.lock); + + *low = data[0]; + *high = data[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); +} + + +static int ping_init_sock(struct sock *sk) +{ + struct net *net = sock_net(sk); + gid_t group = current_egid(); + gid_t range[2]; + struct group_info *group_info = get_current_groups(); + int i, j, count = group_info->ngroups; + + inet_get_ping_group_range_net(net, range, range+1); + if (range[0] <= group && group <= range[1]) + return 0; + + for (i = 0; i < group_info->nblocks; i++) { + int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); + + for (j = 0; j < cp_count; j++) { + group = group_info->blocks[i][j]; + if (range[0] <= group && group <= range[1]) + return 0; + } + + count -= cp_count; + } + + return -EACCES; +} + +static void ping_close(struct sock *sk, long timeout) +{ + pr_debug("ping_close(sk=%p,sk->num=%u)\n", + inet_sk(sk), inet_sk(sk)->inet_num); + pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); + + sk_common_release(sk); +} + +/* + * We need our own bind because there are no privileged id's == local ports. + * Moreover, we don't allow binding to multi- and broadcast addresses. + */ + +static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct inet_sock *isk = inet_sk(sk); + unsigned short snum; + int chk_addr_ret; + int err; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", + sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); + + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + if (addr->sin_addr.s_addr == INADDR_ANY) + chk_addr_ret = RTN_LOCAL; + + if ((sysctl_ip_nonlocal_bind == 0 && + isk->freebind == 0 && isk->transparent == 0 && + chk_addr_ret != RTN_LOCAL) || + chk_addr_ret == RTN_MULTICAST || + chk_addr_ret == RTN_BROADCAST) + return -EADDRNOTAVAIL; + + lock_sock(sk); + + err = -EINVAL; + if (isk->inet_num != 0) + goto out; + + err = -EADDRINUSE; + isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; + snum = ntohs(addr->sin_port); + if (ping_v4_get_port(sk, snum) != 0) { + isk->inet_saddr = isk->inet_rcv_saddr = 0; + goto out; + } + + pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n", + (int)isk->inet_num, + (unsigned long) isk->inet_rcv_saddr, + (int)sk->sk_bound_dev_if); + + err = 0; + if (isk->inet_rcv_saddr) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + isk->inet_sport = htons(isk->inet_num); + isk->inet_daddr = 0; + isk->inet_dport = 0; + sk_dst_reset(sk); +out: + release_sock(sk); + pr_debug("ping_v4_bind -> %d\n", err); + return err; +} + +/* + * Is this a supported type of ICMP message? + */ + +static inline int ping_supported(int type, int code) +{ + if (type == ICMP_ECHO && code == 0) + return 1; + return 0; +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. + */ + +static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); + +void ping_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2)); + struct inet_sock *inet_sock; + int type = icmph->type; + int code = icmph->code; + struct net *net = dev_net(skb->dev); + struct sock *sk; + int harderr; + int err; + + /* We assume the packet has already been checked by icmp_unreach */ + + if (!ping_supported(icmph->type, icmph->code)) + return; + + pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, + code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + + sk = ping_v4_lookup(net, iph->daddr, iph->saddr, + ntohs(icmph->un.echo.id), skb->dev->ifindex); + if (sk == NULL) { + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + pr_debug("no socket, dropping\n"); + return; /* No socket for error */ + } + pr_debug("err on socket %p\n", sk); + + err = 0; + harderr = 0; + inet_sock = inet_sk(sk); + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + /* This is not a real error but ping wants to see it. + * Report it with some fake errno. */ + err = EREMOTEIO; + break; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + harderr = 1; + break; + } + goto out; + } + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + case ICMP_REDIRECT: + /* See ICMP_SOURCE_QUENCH */ + err = EREMOTEIO; + break; + } + + /* + * RFC1122: OK. Passes ICMP errors back to application, as per + * 4.1.3.3. + */ + if (!inet_sock->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { + ip_icmp_error(sk, skb, err, 0 /* no remote port */, + info, (u8 *)icmph); + } + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +/* + * Copy and checksum an ICMP Echo packet from user space into a buffer. + */ + +struct pingfakehdr { + struct icmphdr icmph; + struct iovec *iov; + u32 wcheck; +}; + +static int ping_getfrag(void *from, char * to, + int offset, int fraglen, int odd, struct sk_buff *skb) +{ + struct pingfakehdr *pfh = (struct pingfakehdr *)from; + + if (offset == 0) { + if (fraglen < sizeof(struct icmphdr)) + BUG(); + if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), + pfh->iov, 0, fraglen - sizeof(struct icmphdr), + &pfh->wcheck)) + return -EFAULT; + + return 0; + } + if (offset < sizeof(struct icmphdr)) + BUG(); + if (csum_partial_copy_fromiovecend + (to, pfh->iov, offset - sizeof(struct icmphdr), + fraglen, &pfh->wcheck)) + return -EFAULT; + return 0; +} + +static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, + struct flowi4 *fl4) +{ + struct sk_buff *skb = skb_peek(&sk->sk_write_queue); + + pfh->wcheck = csum_partial((char *)&pfh->icmph, + sizeof(struct icmphdr), pfh->wcheck); + pfh->icmph.checksum = csum_fold(pfh->wcheck); + memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); + skb->ip_summed = CHECKSUM_NONE; + return ip_push_pending_frames(sk, fl4); +} + +static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct net *net = sock_net(sk); + struct flowi4 fl4; + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct icmphdr user_icmph; + struct pingfakehdr pfh; + struct rtable *rt = NULL; + struct ip_options_data opt_copy; + int free = 0; + u32 saddr, daddr, faddr; + u8 tos; + int err; + + pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); + + + if (len > 0xFFFF) + return -EMSGSIZE; + + /* + * Check the flags. + */ + + /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* + * Fetch the ICMP header provided by the userland. + * iovec is modified! + */ + + if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov, + sizeof(struct icmphdr))) + return -EFAULT; + if (!ping_supported(user_icmph.type, user_icmph.code)) + return -EINVAL; + + /* + * Get and verify the address. + */ + + if (msg->msg_name) { + struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; + if (msg->msg_namelen < sizeof(*usin)) + return -EINVAL; + if (usin->sin_family != AF_INET) + return -EINVAL; + daddr = usin->sin_addr.s_addr; + /* no remote port */ + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = inet->inet_daddr; + /* no remote port */ + } + + ipc.addr = inet->inet_saddr; + ipc.opt = NULL; + ipc.oif = sk->sk_bound_dev_if; + ipc.tx_flags = 0; + err = sock_tx_timestamp(sk, &ipc.tx_flags); + if (err) + return err; + + if (msg->msg_controllen) { + err = ip_cmsg_send(sock_net(sk), msg, &ipc); + if (err) + return err; + if (ipc.opt) + free = 1; + } + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } + + saddr = ipc.addr; + ipc.addr = faddr = daddr; + + if (ipc.opt && ipc.opt->opt.srr) { + if (!daddr) + return -EINVAL; + faddr = ipc.opt->opt.faddr; + } + tos = RT_TOS(inet->tos); + if (sock_flag(sk, SOCK_LOCALROUTE) || + (msg->msg_flags & MSG_DONTROUTE) || + (ipc.opt && ipc.opt->opt.is_strictroute)) { + tos |= RTO_ONLINK; + } + + if (ipv4_is_multicast(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + inet_sk_flowi_flags(sk), faddr, saddr, 0, 0); + + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + if (err == -ENETUNREACH) + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + goto out; + } + + err = -EACCES; + if ((rt->rt_flags & RTCF_BROADCAST) && + !sock_flag(sk, SOCK_BROADCAST)) + goto out; + + if (msg->msg_flags & MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + if (!ipc.addr) + ipc.addr = fl4.daddr; + + lock_sock(sk); + + pfh.icmph.type = user_icmph.type; /* already checked */ + pfh.icmph.code = user_icmph.code; /* ditto */ + pfh.icmph.checksum = 0; + pfh.icmph.un.echo.id = inet->inet_sport; + pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; + pfh.iov = msg->msg_iov; + pfh.wcheck = 0; + + err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, + 0, &ipc, &rt, msg->msg_flags); + if (err) + ip_flush_pending_frames(sk); + else + err = ping_push_pending_frames(sk, &pfh, &fl4); + release_sock(sk); + +out: + ip_rt_put(rt); + if (free) + kfree(ipc.opt); + if (!err) { + icmp_out_count(sock_net(sk), user_icmph.type); + return len; + } + return err; + +do_confirm: + dst_confirm(&rt->dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *isk = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + int copied, err; + + pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + + if (flags & MSG_OOB) + goto out; + + if (addr_len) + *addr_len = sizeof(*sin); + + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + /* Don't bother checking the checksum */ + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_port = 0 /* skb->h.uh->source */; + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (isk->cmsg_flags) + ip_cmsg_recv(msg, skb); + err = copied; + +done: + skb_free_datagram(sk, skb); +out: + pr_debug("ping_recvmsg -> %d\n", err); + return err; +} + +static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", + inet_sk(sk), inet_sk(sk)->inet_num, skb); + if (sock_queue_rcv_skb(sk, skb) < 0) { + ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS); + kfree_skb(skb); + pr_debug("ping_queue_rcv_skb -> failed\n"); + return -1; + } + return 0; +} + + +/* + * All we need to do is get the socket. + */ + +void ping_rcv(struct sk_buff *skb) +{ + struct sock *sk; + struct net *net = dev_net(skb->dev); + struct iphdr *iph = ip_hdr(skb); + struct icmphdr *icmph = icmp_hdr(skb); + u32 saddr = iph->saddr; + u32 daddr = iph->daddr; + + /* We assume the packet has already been checked by icmp_rcv */ + + pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", + skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); + + /* Push ICMP header back */ + skb_push(skb, skb->data - (u8 *)icmph); + + sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id), + skb->dev->ifindex); + if (sk != NULL) { + pr_debug("rcv on socket %p\n", sk); + ping_queue_rcv_skb(sk, skb_get(skb)); + sock_put(sk); + return; + } + pr_debug("no socket, dropping\n"); + + /* We're called from icmp_rcv(). kfree_skb() is done there. */ +} + +struct proto ping_prot = { + .name = "PING", + .owner = THIS_MODULE, + .init = ping_init_sock, + .close = ping_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .sendmsg = ping_sendmsg, + .recvmsg = ping_recvmsg, + .bind = ping_bind, + .backlog_rcv = ping_queue_rcv_skb, + .hash = ping_v4_hash, + .unhash = ping_v4_unhash, + .get_port = ping_v4_get_port, + .obj_size = sizeof(struct inet_sock), +}; +EXPORT_SYMBOL(ping_prot); + +#ifdef CONFIG_PROC_FS + +static struct sock *ping_get_first(struct seq_file *seq, int start) +{ + struct sock *sk; + struct ping_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; + ++state->bucket) { + struct hlist_nulls_node *node; + struct hlist_nulls_head *hslot; + + hslot = &ping_table.hash[state->bucket]; + + if (hlist_nulls_empty(hslot)) + continue; + + sk_nulls_for_each(sk, node, hslot) { + if (net_eq(sock_net(sk), net)) + goto found; + } + } + sk = NULL; +found: + return sk; +} + +static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) +{ + struct ping_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + + do { + sk = sk_nulls_next(sk); + } while (sk && (!net_eq(sock_net(sk), net))); + + if (!sk) + return ping_get_first(seq, state->bucket + 1); + return sk; +} + +static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = ping_get_first(seq, 0); + + if (sk) + while (pos && (sk = ping_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *ping_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ping_iter_state *state = seq->private; + state->bucket = 0; + + read_lock_bh(&ping_table.lock); + + return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; +} + +static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = ping_get_idx(seq, 0); + else + sk = ping_get_next(seq, v); + + ++*pos; + return sk; +} + +static void ping_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&ping_table.lock); +} + +static void ping_format_sock(struct sock *sp, struct seq_file *f, + int bucket, int *len) +{ + struct inet_sock *inet = inet_sk(sp); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + + seq_printf(f, "%5d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", + bucket, src, srcp, dest, destp, sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops), len); +} + +static int ping_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode ref pointer drops"); + else { + struct ping_iter_state *state = seq->private; + int len; + + ping_format_sock(v, seq, state->bucket, &len); + seq_printf(seq, "%*s\n", 127 - len, ""); + } + return 0; +} + +static const struct seq_operations ping_seq_ops = { + .show = ping_seq_show, + .start = ping_seq_start, + .next = ping_seq_next, + .stop = ping_seq_stop, +}; + +static int ping_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ping_seq_ops, + sizeof(struct ping_iter_state)); +} + +static const struct file_operations ping_seq_fops = { + .open = ping_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int ping_proc_register(struct net *net) +{ + struct proc_dir_entry *p; + int rc = 0; + + p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops); + if (!p) + rc = -ENOMEM; + return rc; +} + +static void ping_proc_unregister(struct net *net) +{ + proc_net_remove(net, "icmp"); +} + + +static int __net_init ping_proc_init_net(struct net *net) +{ + return ping_proc_register(net); +} + +static void __net_exit ping_proc_exit_net(struct net *net) +{ + ping_proc_unregister(net); +} + +static struct pernet_operations ping_net_ops = { + .init = ping_proc_init_net, + .exit = ping_proc_exit_net, +}; + +int __init ping_proc_init(void) +{ + return register_pernet_subsys(&ping_net_ops); +} + +void ping_proc_exit(void) +{ + unregister_pernet_subsys(&ping_net_ops); +} + +#endif + +void __init ping_init(void) +{ + int i; + + for (i = 0; i < PING_HTABLE_SIZE; i++) + INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i); + rwlock_init(&ping_table.lock); +} diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6390ba2..c9893d4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -154,7 +154,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; @@ -247,7 +247,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) } if (inet->recverr) { - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; u8 *payload = skb->data + (iph->ihl << 2); if (inet->hdrincl) @@ -265,7 +265,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { int hash; struct sock *raw_sk; - struct iphdr *iph; + const struct iphdr *iph; struct net *net; hash = protocol & (RAW_HTABLE_SIZE - 1); @@ -273,7 +273,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) read_lock(&raw_v4_hashinfo.lock); raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); if (raw_sk != NULL) { - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; net = dev_net(skb->dev); while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, @@ -281,7 +281,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) skb->dev->ifindex)) != NULL) { raw_err(raw_sk, skb, info); raw_sk = sk_next(raw_sk); - iph = (struct iphdr *)skb->data; + iph = (const struct iphdr *)skb->data; } } read_unlock(&raw_v4_hashinfo.lock); @@ -314,9 +314,10 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) return 0; } -static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, - struct rtable **rtp, - unsigned int flags) +static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, + void *from, size_t length, + struct rtable **rtp, + unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -327,7 +328,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, struct rtable *rt = *rtp; if (length > rt->dst.dev->mtu) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, + ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, rt->dst.dev->mtu); return -EMSGSIZE; } @@ -372,7 +373,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, if (iphlen >= sizeof(*iph)) { if (!iph->saddr) - iph->saddr = rt->rt_src; + iph->saddr = fl4->saddr; iph->check = 0; iph->tot_len = htons(length); if (!iph->id) @@ -402,7 +403,7 @@ error: return err; } -static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) { struct iovec *iov; u8 __user *type = NULL; @@ -418,7 +419,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) if (!iov) continue; - switch (fl->proto) { + switch (fl4->flowi4_proto) { case IPPROTO_ICMP: /* check if one-byte field is readable or not. */ if (iov->iov_base && iov->iov_len < 1) @@ -433,8 +434,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) code = iov->iov_base; if (type && code) { - if (get_user(fl->fl_icmp_type, type) || - get_user(fl->fl_icmp_code, code)) + if (get_user(fl4->fl4_icmp_type, type) || + get_user(fl4->fl4_icmp_code, code)) return -EFAULT; probed = 1; } @@ -455,11 +456,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; + struct flowi4 fl4; int free = 0; __be32 daddr; __be32 saddr; u8 tos; int err; + struct ip_options_data opt_copy; err = -EMSGSIZE; if (len > 0xFFFF) @@ -520,8 +523,18 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, saddr = ipc.addr; ipc.addr = daddr; - if (!ipc.opt) - ipc.opt = inet->opt; + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } if (ipc.opt) { err = -EINVAL; @@ -530,10 +543,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, */ if (inet->hdrincl) goto done; - if (ipc.opt->srr) { + if (ipc.opt->opt.srr) { if (!daddr) goto done; - daddr = ipc.opt->faddr; + daddr = ipc.opt->opt.faddr; } } tos = RT_CONN_FLAGS(sk); @@ -547,26 +560,24 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, saddr = inet->mc_addr; } - { - struct flowi fl = { .oif = ipc.oif, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = saddr, - .fl4_tos = tos, - .proto = inet->hdrincl ? IPPROTO_RAW : - sk->sk_protocol, - }; - if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl, msg); - if (err) - goto done; - } + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0); - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); + if (!inet->hdrincl) { + err = raw_probe_proto_opt(&fl4, msg); + if (err) + goto done; } - if (err) + + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; goto done; + } err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) @@ -577,19 +588,20 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, back_from_confirm: if (inet->hdrincl) - err = raw_send_hdrinc(sk, msg->msg_iov, len, - &rt, msg->msg_flags); + err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, + &rt, msg->msg_flags); else { if (!ipc.addr) - ipc.addr = rt->rt_dst; + ipc.addr = fl4.daddr; lock_sock(sk); - err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, - &ipc, &rt, msg->msg_flags); + err = ip_append_data(sk, &fl4, ip_generic_getfrag, + msg->msg_iov, len, 0, + &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { - err = ip_push_pending_frames(sk); + err = ip_push_pending_frames(sk, &fl4); if (err == -ENOBUFS && !inet->recverr) err = 0; } @@ -616,7 +628,7 @@ do_confirm: static void raw_close(struct sock *sk, long timeout) { /* - * Raw sockets may have direct kernel refereneces. Kill them. + * Raw sockets may have direct kernel references. Kill them. */ ip_ra_control(sk, 0, NULL); @@ -967,7 +979,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", i, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788a3e7..52b0b95 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -109,8 +109,8 @@ #include <linux/sysctl.h> #endif -#define RT_FL_TOS(oldflp) \ - ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) +#define RT_FL_TOS(oldflp4) \ + ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) #define IP_MAX_MTU 0xFFF0 @@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; - /* * Interface to generic destination cache. */ @@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { } +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + + peer = rt->peer; + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } else { + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } + } + } + return p; +} + static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), @@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = { .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .default_mtu = ipv4_default_mtu, + .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, @@ -171,7 +204,7 @@ static struct dst_ops ipv4_dst_ops = { const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, - ECN_OR_COST(FILLER), + ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, @@ -391,7 +424,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) dst_metric(&r->dst, RTAX_WINDOW), (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + dst_metric(&r->dst, RTAX_RTTVAR)), - r->fl.fl4_tos, + r->rt_key_tos, r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, r->dst.hh ? (r->dst.hh->hh_output == dev_queue_xmit) : 0, @@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; @@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net) if (!pde) goto err2; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif @@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } @@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth) static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->dst.expires; + (rth->peer && rth->peer->pmtu_expires); } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t if (atomic_read(&rth->dst.__refcnt)) goto out; - ret = 1; - if (rth->dst.expires && - time_after_eq(jiffies, rth->dst.expires)) - goto out; - age = jiffies - rth->dst.lastuse; - ret = 0; if ((age <= tmo1 && !rt_fast_clean(rth)) || (age <= tmo2 && rt_valuable(rth))) goto out; @@ -684,22 +711,22 @@ static inline bool rt_caching(const struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count; } -static inline bool compare_hash_inputs(const struct flowi *fl1, - const struct flowi *fl2) +static inline bool compare_hash_inputs(const struct rtable *rt1, + const struct rtable *rt2) { - return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | - ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | - (fl1->iif ^ fl2->iif)) == 0); + return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | + ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | + (rt1->rt_iif ^ rt2->rt_iif)) == 0); } -static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) { - return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | - ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | - (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) | - (fl1->oif ^ fl2->oif) | - (fl1->iif ^ fl2->iif)) == 0; + return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | + ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | + (rt1->rt_mark ^ rt2->rt_mark) | + (rt1->rt_key_tos ^ rt2->rt_key_tos) | + (rt1->rt_oif ^ rt2->rt_oif) | + (rt1->rt_iif ^ rt2->rt_iif)) == 0; } static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) @@ -786,106 +813,15 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) const struct rtable *aux = head; while (aux != rth) { - if (compare_hash_inputs(&aux->fl, &rth->fl)) + if (compare_hash_inputs(aux, rth)) return 0; aux = rcu_dereference_protected(aux->dst.rt_next, 1); } return ONE; } -static void rt_check_expire(void) -{ - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (rcu_dereference_raw(*rthp) == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { - prefetch(rth->dst.rt_next); - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (rth->dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: - tmo >>= 1; - rthp = &rth->dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - length += has_noalias(rt_hash_table[i].chain, rth); - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; - - /* Cleanup aged off entries. */ - *rthp = rth->dst.rt_next; - rt_free(rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; -} - /* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} - -/* - * Pertubation of rt_genid by a small quantity [1..256] + * Perturbation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() * many times (2^24) without giving recent rt_genid. * Jenkins hash is strong enough that litle changes of rt_genid are OK. @@ -1032,10 +968,6 @@ static int rt_garbage_collect(struct dst_ops *ops) break; expire >>= 1; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, - dst_entries_get_fast(&ipv4_dst_ops), goal, i); -#endif if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) goto out; @@ -1056,10 +988,6 @@ work_done: dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) expire = ip_rt_gc_timeout; -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, - dst_entries_get_fast(&ipv4_dst_ops), goal, rover); -#endif out: return 0; } @@ -1078,8 +1006,8 @@ static int slow_chain_length(const struct rtable *head) return length >> FRACT_BITS; } -static int rt_intern_hash(unsigned hash, struct rtable *rt, - struct rtable **rp, struct sk_buff *skb, int ifindex) +static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, + struct sk_buff *skb, int ifindex) { struct rtable *rth, *cand; struct rtable __rcu **rthp, **candp; @@ -1120,7 +1048,7 @@ restart: printk(KERN_WARNING "Neighbour table failure & not caching routes.\n"); ip_rt_put(rt); - return err; + return ERR_PTR(err); } } @@ -1137,7 +1065,7 @@ restart: rt_free(rth); continue; } - if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { + if (compare_keys(rth, rt) && compare_netns(rth, rt)) { /* Put it first */ *rthp = rth->dst.rt_next; /* @@ -1157,11 +1085,9 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); - if (rp) - *rp = rth; - else + if (skb) skb_dst_set(skb, &rth->dst); - return 0; + return rth; } if (!atomic_read(&rth->dst.__refcnt)) { @@ -1202,7 +1128,7 @@ restart: rt_emergency_hash_rebuild(net); spin_unlock_bh(rt_hash_lock_addr(hash)); - hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, + hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, ifindex, rt_genid(net)); goto restart; } @@ -1218,7 +1144,7 @@ restart: if (err != -ENOBUFS) { rt_drop(rt); - return err; + return ERR_PTR(err); } /* Neighbour tables are full and nothing @@ -1239,25 +1165,15 @@ restart: if (net_ratelimit()) printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); rt_drop(rt); - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); } } rt->dst.rt_next = rt_hash_table[hash].chain; -#if RT_CACHE_DEBUG >= 2 - if (rt->dst.rt_next) { - struct rtable *trt; - printk(KERN_DEBUG "rt_cache @%02x: %pI4", - hash, &rt->rt_dst); - for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next) - printk(" . %pI4", &trt->rt_dst); - printk("\n"); - } -#endif /* * Since lookup is lockfree, we must make sure - * previous writes to rt are comitted to memory + * previous writes to rt are committed to memory * before making rt visible to other CPUS. */ rcu_assign_pointer(rt_hash_table[hash].chain, rt); @@ -1265,21 +1181,28 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); skip_hashing: - if (rp) - *rp = rt; - else + if (skb) skb_dst_set(skb, &rt->dst); - return 0; + return rt; +} + +static atomic_t __rt_peer_genid = ATOMIC_INIT(0); + +static u32 rt_peer_genid(void) +{ + return atomic_read(&__rt_peer_genid); } -void rt_bind_peer(struct rtable *rt, int create) +void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) { struct inet_peer *peer; - peer = inet_getpeer_v4(rt->rt_dst, create); + peer = inet_getpeer_v4(daddr, create); if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); + else + rt->rt_peer_genid = rt_peer_genid(); } /* @@ -1308,7 +1231,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) if (rt) { if (rt->peer == NULL) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, rt->rt_dst, 1); /* If peer is attached to destination, it is never detached, so that we need not to grab a lock to dereference it. @@ -1349,13 +1272,8 @@ static void rt_del(unsigned hash, struct rtable *rt) void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { - int i, k; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rth; - struct rtable __rcu **rthp; - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - struct netevent_redirect netevent; + struct inet_peer *peer; struct net *net; if (!in_dev) @@ -1367,9 +1285,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, ipv4_is_zeronet(new_gw)) goto reject_redirect; - if (!rt_caching(net)) - goto reject_redirect; - if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; @@ -1380,91 +1295,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - for (i = 0; i < 2; i++) { - for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rthp = &rt_hash_table[hash].chain; - - while ((rth = rcu_dereference(*rthp)) != NULL) { - struct rtable *rt; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->fl.oif != ikeys[k] || - rt_is_input_route(rth) || - rt_is_expired(rth) || - !net_eq(dev_net(rth->dst.dev), net)) { - rthp = &rth->dst.rt_next; - continue; - } - - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->dst.error || - rth->rt_gateway != old_gw || - rth->dst.dev != dev) - break; - - dst_hold(&rth->dst); - - rt = dst_alloc(&ipv4_dst_ops); - if (rt == NULL) { - ip_rt_put(rth); - return; - } - - /* Copy all the information. */ - *rt = *rth; - rt->dst.__use = 1; - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.child = NULL; - if (rt->dst.dev) - dev_hold(rt->dst.dev); - rt->dst.obsolete = -1; - rt->dst.lastuse = jiffies; - rt->dst.path = &rt->dst; - rt->dst.neighbour = NULL; - rt->dst.hh = NULL; -#ifdef CONFIG_XFRM - rt->dst.xfrm = NULL; -#endif - rt->rt_genid = rt_genid(net); - rt->rt_flags |= RTCF_REDIRECTED; - - /* Gateway is different ... */ - rt->rt_gateway = new_gw; - - /* Redirect received -> path was valid */ - dst_confirm(&rth->dst); - - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - - if (arp_bind_neighbour(&rt->dst) || - !(rt->dst.neighbour->nud_state & - NUD_VALID)) { - if (rt->dst.neighbour) - neigh_event_send(rt->dst.neighbour, NULL); - ip_rt_put(rth); - rt_drop(rt); - goto do_next; - } + peer = inet_getpeer_v4(daddr, 1); + if (peer) { + peer->redirect_learned.a4 = new_gw; - netevent.old = &rth->dst; - netevent.new = &rt->dst; - call_netevent_notifiers(NETEVENT_REDIRECT, - &netevent); + inet_putpeer(peer); - rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) - ip_rt_put(rt); - goto do_next; - } - do_next: - ; - } + atomic_inc(&__rt_peer_genid); } return; @@ -1488,18 +1325,20 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - (rt->dst.expires && - time_after_eq(jiffies, rt->dst.expires))) { - unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, - rt->fl.oif, + } else if (rt->rt_flags & RTCF_REDIRECTED) { + unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, + rt->rt_oif, rt_genid(dev_net(dst->dev))); -#if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", - &rt->rt_dst, rt->fl.fl4_tos); -#endif rt_del(hash, rt); ret = NULL; + } else if (rt->peer && + rt->peer->pmtu_expires && + time_after_eq(jiffies, rt->peer->pmtu_expires)) { + unsigned long orig = rt->peer->pmtu_expires; + + if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) + dst_metric_set(dst, RTAX_MTU, + rt->peer->pmtu_orig); } } return ret; @@ -1525,6 +1364,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; + struct inet_peer *peer; int log_martians; rcu_read_lock(); @@ -1536,36 +1376,44 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + peer = rt->peer; + if (!peer) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + return; + } + /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) - rt->dst.rate_tokens = 0; + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + peer->rate_tokens = 0; /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ - if (rt->dst.rate_tokens >= ip_rt_redirect_number) { - rt->dst.rate_last = jiffies; + if (peer->rate_tokens >= ip_rt_redirect_number) { + peer->rate_last = jiffies; return; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (rt->dst.rate_tokens == 0 || + if (peer->rate_tokens == 0 || time_after(jiffies, - (rt->dst.rate_last + - (ip_rt_redirect_load << rt->dst.rate_tokens)))) { + (peer->rate_last + + (ip_rt_redirect_load << peer->rate_tokens)))) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->dst.rate_last = jiffies; - ++rt->dst.rate_tokens; + peer->rate_last = jiffies; + ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - rt->dst.rate_tokens == ip_rt_redirect_number && + peer->rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", - &rt->rt_src, rt->rt_iif, + &ip_hdr(skb)->saddr, rt->rt_iif, &rt->rt_dst, &rt->rt_gateway); #endif } @@ -1574,7 +1422,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); + struct inet_peer *peer; unsigned long now; + bool send; int code; switch (rt->dst.error) { @@ -1594,15 +1444,24 @@ static int ip_error(struct sk_buff *skb) break; } - now = jiffies; - rt->dst.rate_tokens += now - rt->dst.rate_last; - if (rt->dst.rate_tokens > ip_rt_error_burst) - rt->dst.rate_tokens = ip_rt_error_burst; - rt->dst.rate_last = now; - if (rt->dst.rate_tokens >= ip_rt_error_cost) { - rt->dst.rate_tokens -= ip_rt_error_cost; - icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + peer = rt->peer; + + send = true; + if (peer) { + now = jiffies; + peer->rate_tokens += now - peer->rate_last; + if (peer->rate_tokens > ip_rt_error_burst) + peer->rate_tokens = ip_rt_error_burst; + peer->rate_last = now; + if (peer->rate_tokens >= ip_rt_error_cost) + peer->rate_tokens -= ip_rt_error_cost; + else + send = false; } + if (send) + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb(skb); return 0; @@ -1626,92 +1485,144 @@ static inline unsigned short guess_mtu(unsigned short old_mtu) return 68; } -unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, +unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, unsigned short new_mtu, struct net_device *dev) { - int i, k; unsigned short old_mtu = ntohs(iph->tot_len); - struct rtable *rth; - int ikeys[2] = { dev->ifindex, 0 }; - __be32 skeys[2] = { iph->saddr, 0, }; - __be32 daddr = iph->daddr; unsigned short est_mtu = 0; + struct inet_peer *peer; - for (k = 0; k < 2; k++) { - for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->dst.rt_next)) { - unsigned short mtu = new_mtu; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->rt_dst != daddr || - rth->rt_src != iph->saddr || - rth->fl.oif != ikeys[k] || - rt_is_input_route(rth) || - dst_metric_locked(&rth->dst, RTAX_MTU) || - !net_eq(dev_net(rth->dst.dev), net) || - rt_is_expired(rth)) - continue; + peer = inet_getpeer_v4(iph->daddr, 1); + if (peer) { + unsigned short mtu = new_mtu; - if (new_mtu < 68 || new_mtu >= old_mtu) { + if (new_mtu < 68 || new_mtu >= old_mtu) { + /* BSD 4.2 derived systems incorrectly adjust + * tot_len by the IP header length, and report + * a zero MTU in the ICMP message. + */ + if (mtu == 0 && + old_mtu >= 68 + (iph->ihl << 2)) + old_mtu -= iph->ihl << 2; + mtu = guess_mtu(old_mtu); + } - /* BSD 4.2 compatibility hack :-( */ - if (mtu == 0 && - old_mtu >= dst_mtu(&rth->dst) && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + unsigned long pmtu_expires; - mtu = guess_mtu(old_mtu); - } - if (mtu <= dst_mtu(&rth->dst)) { - if (mtu < dst_mtu(&rth->dst)) { - dst_confirm(&rth->dst); - if (mtu < ip_rt_min_pmtu) { - u32 lock = dst_metric(&rth->dst, - RTAX_LOCK); - mtu = ip_rt_min_pmtu; - lock |= (1 << RTAX_MTU); - dst_metric_set(&rth->dst, RTAX_LOCK, - lock); - } - dst_metric_set(&rth->dst, RTAX_MTU, mtu); - dst_set_expires(&rth->dst, - ip_rt_mtu_expires); - } - est_mtu = mtu; - } - } - rcu_read_unlock(); + pmtu_expires = jiffies + ip_rt_mtu_expires; + if (!pmtu_expires) + pmtu_expires = 1UL; + + est_mtu = mtu; + peer->pmtu_learned = mtu; + peer->pmtu_expires = pmtu_expires; } + + inet_putpeer(peer); + + atomic_inc(&__rt_peer_genid); } return est_mtu ? : new_mtu; } +static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) +{ + unsigned long expires = peer->pmtu_expires; + + if (time_before(jiffies, expires)) { + u32 orig_dst_mtu = dst_mtu(dst); + if (peer->pmtu_learned < orig_dst_mtu) { + if (!peer->pmtu_orig) + peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); + dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); + } + } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) + dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); +} + static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { - if (dst_mtu(dst) > mtu && mtu >= 68 && - !(dst_metric_locked(dst, RTAX_MTU))) { - if (mtu < ip_rt_min_pmtu) { - u32 lock = dst_metric(dst, RTAX_LOCK); + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + + dst_confirm(dst); + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + peer = rt->peer; + if (peer) { + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; - dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU)); + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + unsigned long pmtu_expires; + + pmtu_expires = jiffies + ip_rt_mtu_expires; + if (!pmtu_expires) + pmtu_expires = 1UL; + + peer->pmtu_learned = mtu; + peer->pmtu_expires = pmtu_expires; + + atomic_inc(&__rt_peer_genid); + rt->rt_peer_genid = rt_peer_genid(); } - dst_metric_set(dst, RTAX_MTU, mtu); - dst_set_expires(dst, ip_rt_mtu_expires); - call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); + check_peer_pmtu(dst, peer); } } +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + + dst_confirm(&rt->dst); + + neigh_release(rt->dst.neighbour); + rt->dst.neighbour = NULL; + + rt->rt_gateway = peer->redirect_learned.a4; + if (arp_bind_neighbour(&rt->dst) || + !(rt->dst.neighbour->nud_state & NUD_VALID)) { + if (rt->dst.neighbour) + neigh_event_send(rt->dst.neighbour, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, + rt->dst.neighbour); + } + return 0; +} + static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - if (rt_is_expired((struct rtable *)dst)) + struct rtable *rt = (struct rtable *) dst; + + if (rt_is_expired(rt)) return NULL; + if (rt->rt_peer_genid != rt_peer_genid()) { + struct inet_peer *peer; + + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 0); + + peer = rt->peer; + if (peer && peer->pmtu_expires) + check_peer_pmtu(dst, peer); + + if (peer && peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + if (check_peer_redir(dst, peer)) + return NULL; + } + + rt->rt_peer_genid = rt_peer_genid(); + } return dst; } @@ -1720,6 +1631,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst) struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer = rt->peer; + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } if (peer) { rt->peer = NULL; inet_putpeer(peer); @@ -1734,8 +1649,14 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt) - dst_set_expires(&rt->dst, 0); + if (rt && + rt->peer && + rt->peer->pmtu_expires) { + unsigned long orig = rt->peer->pmtu_expires; + + if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) + dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); + } } static int ip_rt_bug(struct sk_buff *skb) @@ -1744,6 +1665,7 @@ static int ip_rt_bug(struct sk_buff *skb) &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); + WARN_ON(1); return 0; } @@ -1756,17 +1678,30 @@ static int ip_rt_bug(struct sk_buff *skb) in IP options! */ -void ip_rt_get_source(u8 *addr, struct rtable *rt) +void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; - struct fib_result res; if (rt_is_output_route(rt)) - src = rt->rt_src; + src = ip_hdr(skb)->saddr; else { + struct fib_result res; + struct flowi4 fl4; + struct iphdr *iph; + + iph = ip_hdr(skb); + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + fl4.flowi4_tos = iph->tos; + fl4.flowi4_oif = rt->dst.dev->ifindex; + fl4.flowi4_iif = skb->dev->ifindex; + fl4.flowi4_mark = skb->mark; + rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) - src = FIB_RES_PREFSRC(res); + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); @@ -1775,7 +1710,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) @@ -1815,17 +1750,54 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst) return mtu; } -static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, + struct fib_info *fi) +{ + struct inet_peer *peer; + int create = 0; + + /* If a peer entry exists for this destination, we must hook + * it up in order to get at cached metrics. + */ + if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) + create = 1; + + rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); + if (peer) { + rt->rt_peer_genid = rt_peer_genid(); + if (inet_metrics_new(peer)) + memcpy(peer->metrics, fi->fib_metrics, + sizeof(u32) * RTAX_MAX); + dst_init_metrics(&rt->dst, peer->metrics, false); + + if (peer->pmtu_expires) + check_peer_pmtu(&rt->dst, peer); + if (peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + rt->rt_gateway = peer->redirect_learned.a4; + rt->rt_flags |= RTCF_REDIRECTED; + } + } else { + if (fi->fib_metrics != (u32 *) dst_default_metrics) { + rt->fi = fi; + atomic_inc(&fi->fib_clntref); + } + dst_init_metrics(&rt->dst, fi->fib_metrics, true); + } +} + +static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, + const struct fib_result *res, + struct fib_info *fi, u16 type, u32 itag) { struct dst_entry *dst = &rt->dst; - struct fib_info *fi = res->fi; if (fi) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); - dst_import_metrics(dst, fi->fib_metrics); -#ifdef CONFIG_NET_CLS_ROUTE + rt_init_metrics(rt, fl4, fi); +#ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } @@ -1835,13 +1807,21 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif set_class_tag(rt, itag); #endif - rt->rt_type = res->type; +} + +static struct rtable *rt_dst_alloc(struct net_device *dev, + bool nopolicy, bool noxfrm) +{ + return dst_alloc(&ipv4_dst_ops, dev, 1, -1, + DST_HOST | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0)); } /* called in rcu_read_lock() section */ @@ -1869,41 +1849,38 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag, 0); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, + &itag); if (err < 0) goto e_err; } - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(init_net.loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; +#endif rth->dst.output = ip_rt_bug; - rth->dst.obsolete = -1; - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; + rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(dev_net(dev)); + rth->rt_flags = RTCF_MULTICAST; + rth->rt_type = RTN_MULTICAST; + rth->rt_key_tos = tos; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE - rth->dst.tclassid = itag; -#endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; - rth->dst.dev = init_net.loopback_dev; - dev_hold(rth->dst.dev); - rth->fl.oif = 0; + rth->rt_route_iif = dev->ifindex; + rth->rt_iif = dev->ifindex; + rth->rt_oif = 0; + rth->rt_mark = skb->mark; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->rt_genid = rt_genid(dev_net(dev)); - rth->rt_flags = RTCF_MULTICAST; - rth->rt_type = RTN_MULTICAST; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1916,7 +1893,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, RT_CACHE_STAT_INC(in_slow_mc); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); + rth = rt_intern_hash(hash, rth, skb, dev->ifindex); + err = 0; + if (IS_ERR(rth)) + err = PTR_ERR(rth); e_nobufs: return -ENOBUFS; @@ -1959,7 +1939,7 @@ static void ip_handle_martian_source(struct net_device *dev, /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, - struct fib_result *res, + const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) @@ -1981,8 +1961,8 @@ static int __mkroute_input(struct sk_buff *skb, } - err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag, skb->mark); + err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, &spec_dst, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2013,41 +1993,36 @@ static int __mkroute_input(struct sk_buff *skb, } } - - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(out_dev->dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; } - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->dst.flags |= DST_NOXFRM; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; + rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); + rth->rt_flags = flags; + rth->rt_type = res->type; + rth->rt_key_tos = tos; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; rth->rt_src = saddr; + rth->rt_route_iif = in_dev->dev->ifindex; + rth->rt_iif = in_dev->dev->ifindex; + rth->rt_oif = 0; + rth->rt_mark = skb->mark; rth->rt_gateway = daddr; - rth->rt_iif = - rth->fl.iif = in_dev->dev->ifindex; - rth->dst.dev = (out_dev)->dev; - dev_hold(rth->dst.dev); - rth->fl.oif = 0; rth->rt_spec_dst= spec_dst; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; - rth->dst.obsolete = -1; rth->dst.input = ip_forward; rth->dst.output = ip_output; - rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); - - rt_set_nexthop(rth, res, itag); - rth->rt_flags = flags; + rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); *result = rth; err = 0; @@ -2057,7 +2032,7 @@ static int __mkroute_input(struct sk_buff *skb, static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - const struct flowi *fl, + const struct flowi4 *fl4, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { @@ -2066,8 +2041,8 @@ static int ip_mkroute_input(struct sk_buff *skb, unsigned hash; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) - fib_select_multipath(fl, res); + if (res->fi && res->fi->fib_nhs > 1) + fib_select_multipath(res); #endif /* create a routing cache entry */ @@ -2076,9 +2051,12 @@ static int ip_mkroute_input(struct sk_buff *skb, return err; /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif, + hash = rt_hash(daddr, saddr, fl4->flowi4_iif, rt_genid(dev_net(rth->dst.dev))); - return rt_intern_hash(hash, rth, NULL, skb, fl->iif); + rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); + if (IS_ERR(rth)) + return PTR_ERR(rth); + return 0; } /* @@ -2097,12 +2075,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct flowi fl = { .fl4_dst = daddr, - .fl4_src = saddr, - .fl4_tos = tos, - .fl4_scope = RT_SCOPE_UNIVERSE, - .mark = skb->mark, - .iif = dev->ifindex }; + struct flowi4 fl4; unsigned flags = 0; u32 itag = 0; struct rtable * rth; @@ -2139,7 +2112,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* * Now we are ready to route packet. */ - err = fib_lookup(net, &fl, &res); + fl4.flowi4_oif = 0; + fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.daddr = daddr; + fl4.saddr = saddr; + err = fib_lookup(net, &fl4, &res); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; @@ -2152,9 +2132,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; if (res.type == RTN_LOCAL) { - err = fib_validate_source(saddr, daddr, tos, + err = fib_validate_source(skb, saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag, skb->mark); + dev, &spec_dst, &itag); if (err < 0) goto martian_source_keep_err; if (err) @@ -2168,7 +2148,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); out: return err; brd_input: @@ -2178,8 +2158,8 @@ brd_input: if (ipv4_is_zeronet(saddr)) spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag, skb->mark); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, + &itag); if (err < 0) goto martian_source_keep_err; if (err) @@ -2190,43 +2170,47 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(net->loopback_dev, + IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; + rth->dst.input= ip_local_deliver; rth->dst.output= ip_rt_bug; - rth->dst.obsolete = -1; - rth->rt_genid = rt_genid(net); +#ifdef CONFIG_IP_ROUTE_CLASSID + rth->dst.tclassid = itag; +#endif - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; + rth->rt_key_src = saddr; + rth->rt_genid = rt_genid(net); + rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + rth->rt_key_tos = tos; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; - rth->dst.dev = net->loopback_dev; - dev_hold(rth->dst.dev); + rth->rt_route_iif = dev->ifindex; + rth->rt_iif = dev->ifindex; + rth->rt_oif = 0; + rth->rt_mark = skb->mark; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->dst.input= ip_local_deliver; - rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } - rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); + hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); + rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); + err = 0; + if (IS_ERR(rth)) + err = PTR_ERR(rth); goto out; no_route: @@ -2288,12 +2272,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->dst.rt_next)) { - if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | - ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | - (rth->fl.iif ^ iif) | - rth->fl.oif | - (rth->fl.fl4_tos ^ tos)) == 0 && - rth->fl.mark == skb->mark && + if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | + ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | + (rth->rt_iif ^ iif) | + rth->rt_oif | + (rth->rt_key_tos ^ tos)) == 0 && + rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { if (noref) { @@ -2326,8 +2310,8 @@ skip_cache: struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { - int our = ip_check_mc(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); + int our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || @@ -2351,98 +2335,94 @@ skip_cache: EXPORT_SYMBOL(ip_route_input_common); /* called with rcu_read_lock() */ -static int __mkroute_output(struct rtable **result, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +static struct rtable *__mkroute_output(const struct fib_result *res, + const struct flowi4 *fl4, + __be32 orig_daddr, __be32 orig_saddr, + int orig_oif, struct net_device *dev_out, + unsigned int flags) { - struct rtable *rth; + struct fib_info *fi = res->fi; + u32 tos = RT_FL_TOS(fl4); struct in_device *in_dev; - u32 tos = RT_FL_TOS(oldflp); + u16 type = res->type; + struct rtable *rth; - if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) - return -EINVAL; + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); - if (ipv4_is_lbcast(fl->fl4_dst)) - res->type = RTN_BROADCAST; - else if (ipv4_is_multicast(fl->fl4_dst)) - res->type = RTN_MULTICAST; - else if (ipv4_is_zeronet(fl->fl4_dst)) - return -EINVAL; + if (ipv4_is_lbcast(fl4->daddr)) + type = RTN_BROADCAST; + else if (ipv4_is_multicast(fl4->daddr)) + type = RTN_MULTICAST; + else if (ipv4_is_zeronet(fl4->daddr)) + return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) - return -EINVAL; + return ERR_PTR(-EINVAL); - if (res->type == RTN_BROADCAST) { + if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - res->fi = NULL; - } else if (res->type == RTN_MULTICAST) { + fi = NULL; + } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; - if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, - oldflp->proto)) + if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, + fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. */ - if (res->fi && res->prefixlen < 4) - res->fi = NULL; + if (fi && res->prefixlen < 4) + fi = NULL; } - - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(dev_out, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(in_dev, NOXFRM)); if (!rth) - return -ENOBUFS; - - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->dst.flags |= DST_NOXFRM; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - - rth->fl.fl4_dst = oldflp->fl4_dst; - rth->fl.fl4_tos = tos; - rth->fl.fl4_src = oldflp->fl4_src; - rth->fl.oif = oldflp->oif; - rth->fl.mark = oldflp->mark; - rth->rt_dst = fl->fl4_dst; - rth->rt_src = fl->fl4_src; - rth->rt_iif = oldflp->oif ? : dev_out->ifindex; - /* get references to the devices that are to be hold by the routing - cache entry */ - rth->dst.dev = dev_out; - dev_hold(dev_out); - rth->rt_gateway = fl->fl4_dst; - rth->rt_spec_dst= fl->fl4_src; - - rth->dst.output=ip_output; - rth->dst.obsolete = -1; + return ERR_PTR(-ENOBUFS); + + rth->dst.output = ip_output; + + rth->rt_key_dst = orig_daddr; + rth->rt_key_src = orig_saddr; rth->rt_genid = rt_genid(dev_net(dev_out)); + rth->rt_flags = flags; + rth->rt_type = type; + rth->rt_key_tos = tos; + rth->rt_dst = fl4->daddr; + rth->rt_src = fl4->saddr; + rth->rt_route_iif = 0; + rth->rt_iif = orig_oif ? : dev_out->ifindex; + rth->rt_oif = orig_oif; + rth->rt_mark = fl4->flowi4_mark; + rth->rt_gateway = fl4->daddr; + rth->rt_spec_dst= fl4->saddr; + rth->rt_peer_genid = 0; + rth->peer = NULL; + rth->fi = NULL; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) { rth->dst.input = ip_local_deliver; - rth->rt_spec_dst = fl->fl4_dst; + rth->rt_spec_dst = fl4->daddr; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl->fl4_src; + rth->rt_spec_dst = fl4->saddr; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE - if (res->type == RTN_MULTICAST) { + if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !ipv4_is_local_multicast(oldflp->fl4_dst)) { + !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mc_output; } @@ -2450,31 +2430,9 @@ static int __mkroute_output(struct rtable **result, #endif } - rt_set_nexthop(rth, res, 0); + rt_set_nexthop(rth, fl4, res, fi, type, 0); - rth->rt_flags = flags; - *result = rth; - return 0; -} - -/* called with rcu_read_lock() */ -static int ip_mkroute_output(struct rtable **rp, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) -{ - struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); - unsigned hash; - if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, - rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); - } - - return err; + return rth; } /* @@ -2482,34 +2440,37 @@ static int ip_mkroute_output(struct rtable **rp, * called with rcu_read_lock(); */ -static int ip_route_output_slow(struct net *net, struct rtable **rp, - const struct flowi *oldflp) -{ - u32 tos = RT_FL_TOS(oldflp); - struct flowi fl = { .fl4_dst = oldflp->fl4_dst, - .fl4_src = oldflp->fl4_src, - .fl4_tos = tos & IPTOS_RT_MASK, - .fl4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE), - .mark = oldflp->mark, - .iif = net->loopback_dev->ifindex, - .oif = oldflp->oif }; - struct fib_result res; - unsigned int flags = 0; +static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) +{ struct net_device *dev_out = NULL; - int err; - + u32 tos = RT_FL_TOS(fl4); + unsigned int flags = 0; + struct fib_result res; + struct rtable *rth; + __be32 orig_daddr; + __be32 orig_saddr; + int orig_oif; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - if (oldflp->fl4_src) { - err = -EINVAL; - if (ipv4_is_multicast(oldflp->fl4_src) || - ipv4_is_lbcast(oldflp->fl4_src) || - ipv4_is_zeronet(oldflp->fl4_src)) + orig_daddr = fl4->daddr; + orig_saddr = fl4->saddr; + orig_oif = fl4->flowi4_oif; + + fl4->flowi4_iif = net->loopback_dev->ifindex; + fl4->flowi4_tos = tos & IPTOS_RT_MASK; + fl4->flowi4_scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + + rcu_read_lock(); + if (fl4->saddr) { + rth = ERR_PTR(-EINVAL); + if (ipv4_is_multicast(fl4->saddr) || + ipv4_is_lbcast(fl4->saddr) || + ipv4_is_zeronet(fl4->saddr)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2520,11 +2481,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 && - (ipv4_is_multicast(oldflp->fl4_dst) || - ipv4_is_lbcast(oldflp->fl4_dst))) { + if (fl4->flowi4_oif == 0 && + (ipv4_is_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = __ip_dev_find(net, oldflp->fl4_src, false); + dev_out = __ip_dev_find(net, fl4->saddr, false); if (dev_out == NULL) goto out; @@ -2543,60 +2504,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, Luckily, this hack is good workaround. */ - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; goto make_route; } - if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { + if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - if (!__ip_dev_find(net, oldflp->fl4_src, false)) + if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } - if (oldflp->oif) { - dev_out = dev_get_by_index_rcu(net, oldflp->oif); - err = -ENODEV; + if (fl4->flowi4_oif) { + dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); + rth = ERR_PTR(-ENODEV); if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } - if (ipv4_is_local_multicast(oldflp->fl4_dst) || - ipv4_is_lbcast(oldflp->fl4_dst)) { - if (!fl.fl4_src) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (ipv4_is_local_multicast(fl4->daddr) || + ipv4_is_lbcast(fl4->daddr)) { + if (!fl4->saddr) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } - if (!fl.fl4_src) { - if (ipv4_is_multicast(oldflp->fl4_dst)) - fl.fl4_src = inet_select_addr(dev_out, 0, - fl.fl4_scope); - else if (!oldflp->fl4_dst) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (fl4->saddr) { + if (ipv4_is_multicast(fl4->daddr)) + fl4->saddr = inet_select_addr(dev_out, 0, + fl4->flowi4_scope); + else if (!fl4->daddr) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } - if (!fl.fl4_dst) { - fl.fl4_dst = fl.fl4_src; - if (!fl.fl4_dst) - fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); + if (!fl4->daddr) { + fl4->daddr = fl4->saddr; + if (!fl4->daddr) + fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; - fl.oif = net->loopback_dev->ifindex; + fl4->flowi4_oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(net, &fl, &res)) { + if (fib_lookup(net, fl4, &res)) { res.fi = NULL; - if (oldflp->oif) { + if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2615,90 +2576,100 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, likely IPv6, but we do not. */ - if (fl.fl4_src == 0) - fl.fl4_src = inet_select_addr(dev_out, 0, + if (fl4->saddr == 0) + fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } if (res.type == RTN_LOCAL) { - if (!fl.fl4_src) { + if (!fl4->saddr) { if (res.fi->fib_prefsrc) - fl.fl4_src = res.fi->fib_prefsrc; + fl4->saddr = res.fi->fib_prefsrc; else - fl.fl4_src = fl.fl4_dst; + fl4->saddr = fl4->daddr; } dev_out = net->loopback_dev; - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl.oif == 0) - fib_select_multipath(&fl, &res); + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) + fib_select_multipath(&res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(net, &fl, &res); + if (!res.prefixlen && + res.table->tb_num_default > 1 && + res.type == RTN_UNICAST && !fl4->flowi4_oif) + fib_select_default(&res); - if (!fl.fl4_src) - fl.fl4_src = FIB_RES_PREFSRC(res); + if (!fl4->saddr) + fl4->saddr = FIB_RES_PREFSRC(net, res); dev_out = FIB_RES_DEV(res); - fl.oif = dev_out->ifindex; + fl4->flowi4_oif = dev_out->ifindex; make_route: - err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, + dev_out, flags); + if (!IS_ERR(rth)) { + unsigned int hash; -out: return err; + hash = rt_hash(orig_daddr, orig_saddr, orig_oif, + rt_genid(dev_net(dev_out))); + rth = rt_intern_hash(hash, rth, NULL, orig_oif); + } + +out: + rcu_read_unlock(); + return rth; } -int __ip_route_output_key(struct net *net, struct rtable **rp, - const struct flowi *flp) +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) { - unsigned int hash; - int res; struct rtable *rth; + unsigned int hash; if (!rt_caching(net)) goto slow_output; - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); + hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); rcu_read_lock_bh(); for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; rth = rcu_dereference_bh(rth->dst.rt_next)) { - if (rth->fl.fl4_dst == flp->fl4_dst && - rth->fl.fl4_src == flp->fl4_src && + if (rth->rt_key_dst == flp4->daddr && + rth->rt_key_src == flp4->saddr && rt_is_output_route(rth) && - rth->fl.oif == flp->oif && - rth->fl.mark == flp->mark && - !((rth->fl.fl4_tos ^ flp->fl4_tos) & + rth->rt_oif == flp4->flowi4_oif && + rth->rt_mark == flp4->flowi4_mark && + !((rth->rt_key_tos ^ flp4->flowi4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); - *rp = rth; - return 0; + if (!flp4->saddr) + flp4->saddr = rth->rt_src; + if (!flp4->daddr) + flp4->daddr = rth->rt_dst; + return rth; } RT_CACHE_STAT_INC(out_hlist_search); } rcu_read_unlock_bh(); slow_output: - rcu_read_lock(); - res = ip_route_output_slow(net, rp, flp); - rcu_read_unlock(); - return res; + return ip_route_output_slow(net, flp4); } EXPORT_SYMBOL_GPL(__ip_route_output_key); @@ -2716,26 +2687,31 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } +static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) +{ + return NULL; +} + static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, .check = ipv4_blackhole_dst_check, .default_mtu = ipv4_blackhole_default_mtu, + .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, + .cow_metrics = ipv4_rt_blackhole_cow_metrics, }; - -static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) +struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = *rp; - struct rtable *rt = (struct rtable *) - dst_alloc(&ipv4_dst_blackhole_ops); + struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); + struct rtable *ort = (struct rtable *) dst_orig; if (rt) { struct dst_entry *new = &rt->dst; - atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; new->output = dst_discard; @@ -2745,59 +2721,53 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi if (new->dev) dev_hold(new->dev); - rt->fl = ort->fl; + rt->rt_key_dst = ort->rt_key_dst; + rt->rt_key_src = ort->rt_key_src; + rt->rt_key_tos = ort->rt_key_tos; + rt->rt_route_iif = ort->rt_route_iif; + rt->rt_iif = ort->rt_iif; + rt->rt_oif = ort->rt_oif; + rt->rt_mark = ort->rt_mark; rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_dst = ort->rt_dst; rt->rt_src = ort->rt_src; - rt->rt_iif = ort->rt_iif; rt->rt_gateway = ort->rt_gateway; rt->rt_spec_dst = ort->rt_spec_dst; rt->peer = ort->peer; if (rt->peer) atomic_inc(&rt->peer->refcnt); + rt->fi = ort->fi; + if (rt->fi) + atomic_inc(&rt->fi->fib_clntref); dst_free(new); } - dst_release(&(*rp)->dst); - *rp = rt; - return rt ? 0 : -ENOMEM; + dst_release(dst_orig); + + return rt ? &rt->dst : ERR_PTR(-ENOMEM); } -int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, - struct sock *sk, int flags) +struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, + struct sock *sk) { - int err; - - if ((err = __ip_route_output_key(net, rp, flp)) != 0) - return err; + struct rtable *rt = __ip_route_output_key(net, flp4); - if (flp->proto) { - if (!flp->fl4_src) - flp->fl4_src = (*rp)->rt_src; - if (!flp->fl4_dst) - flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, - flags ? XFRM_LOOKUP_WAIT : 0); - if (err == -EREMOTE) - err = ipv4_dst_blackhole(net, rp, flp); + if (IS_ERR(rt)) + return rt; - return err; - } + if (flp4->flowi4_proto) + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); - return 0; + return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) -{ - return ip_route_output_flow(net, rp, flp, NULL, 0); -} -EXPORT_SYMBOL(ip_route_output_key); - static int rt_fill_info(struct net *net, struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) @@ -2816,7 +2786,7 @@ static int rt_fill_info(struct net *net, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->fl.fl4_tos; + r->rtm_tos = rt->rt_key_tos; r->rtm_table = RT_TABLE_MAIN; NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); r->rtm_type = rt->rt_type; @@ -2828,19 +2798,19 @@ static int rt_fill_info(struct net *net, NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); - if (rt->fl.fl4_src) { + if (rt->rt_key_src) { r->rtm_src_len = 32; - NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); + NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); } if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid) NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif if (rt_is_input_route(rt)) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); - else if (rt->rt_src != rt->fl.fl4_src) + else if (rt->rt_src != rt->rt_key_src) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); if (rt->rt_dst != rt->rt_gateway) @@ -2849,11 +2819,12 @@ static int rt_fill_info(struct net *net, if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) goto nla_put_failure; - if (rt->fl.mark) - NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); + if (rt->rt_mark) + NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); error = rt->dst.error; - expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; + expires = (rt->peer && rt->peer->pmtu_expires) ? + rt->peer->pmtu_expires - jiffies : 0; if (rt->peer) { inet_peer_refcheck(rt->peer); id = atomic_read(&rt->peer->ip_id_count) & 0xffff; @@ -2869,7 +2840,9 @@ static int rt_fill_info(struct net *net, if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { - int err = ipmr_get_route(net, skb, r, nowait); + int err = ipmr_get_route(net, skb, + rt->rt_src, rt->rt_dst, + r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) @@ -2883,7 +2856,7 @@ static int rt_fill_info(struct net *net, } } else #endif - NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); + NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); } if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, @@ -2957,14 +2930,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { - struct flowi fl = { - .fl4_dst = dst, - .fl4_src = src, - .fl4_tos = rtm->rtm_tos, - .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, - .mark = mark, + struct flowi4 fl4 = { + .daddr = dst, + .saddr = src, + .flowi4_tos = rtm->rtm_tos, + .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, + .flowi4_mark = mark, }; - err = ip_route_output_key(net, &rt, &fl); + rt = ip_route_output_key(net, &fl4); + + err = 0; + if (IS_ERR(rt)) + err = PTR_ERR(rt); } if (err) @@ -3247,6 +3224,8 @@ static __net_init int rt_genid_init(struct net *net) { get_random_bytes(&net->ipv4.rt_genid, sizeof(net->ipv4.rt_genid)); + get_random_bytes(&net->ipv4.dev_addr_genid, + sizeof(net->ipv4.dev_addr_genid)); return 0; } @@ -3255,9 +3234,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = { }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ +#endif /* CONFIG_IP_ROUTE_CLASSID */ static __initdata unsigned long rhash_entries; static int __init set_rhash_entries(char *str) @@ -3273,7 +3252,7 @@ int __init ip_rt_init(void) { int rc = 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); @@ -3310,14 +3289,6 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4751920..2646149 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -321,10 +321,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * the ACK carries the same options again (see RFC1122 4.2.3.8) */ if (opt && opt->optlen) { - int opt_size = sizeof(struct ip_options) + opt->optlen; + int opt_size = sizeof(struct ip_options_rcu) + opt->optlen; ireq->opt = kmalloc(opt_size, GFP_ATOMIC); - if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) { + if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) { kfree(ireq->opt); ireq->opt = NULL; } @@ -345,17 +345,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * no easy way to do this. */ { - struct flowi fl = { .mark = sk->sk_mark, - .fl4_dst = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .fl4_src = ireq->loc_addr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = IPPROTO_TCP, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = th->dest, - .fl_ip_dport = th->source }; - security_req_classify_flow(req, &fl); - if (ip_route_output_key(sock_net(sk), &rt, &fl)) { + struct flowi4 fl4; + + flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), + RT_SCOPE_UNIVERSE, IPPROTO_TCP, + inet_sk_flowi_flags(sk), + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, th->source, th->dest); + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) { reqsk_free(req); goto out; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1a45665..57d0752 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -13,6 +13,7 @@ #include <linux/seqlock.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/nsproxy.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> @@ -21,6 +22,7 @@ #include <net/udp.h> #include <net/cipso_ipv4.h> #include <net/inet_frag.h> +#include <net/ping.h> static int zero; static int tcp_retr1_max = 255; @@ -30,6 +32,8 @@ static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; static int ip_ttl_min = 1; static int ip_ttl_max = 255; +static int ip_ping_group_range_min[] = { 0, 0 }; +static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ static void set_local_port_range(int range[2]) @@ -68,6 +72,53 @@ static int ipv4_local_port_range(ctl_table *table, int write, return ret; } + +void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) +{ + gid_t *data = table->data; + unsigned seq; + do { + seq = read_seqbegin(&sysctl_local_ports.lock); + + *low = data[0]; + *high = data[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); +} + +/* Update system visible IP port range */ +static void set_ping_group_range(struct ctl_table *table, int range[2]) +{ + gid_t *data = table->data; + write_seqlock(&sysctl_local_ports.lock); + data[0] = range[0]; + data[1] = range[1]; + write_sequnlock(&sysctl_local_ports.lock); +} + +/* Validate changes from /proc interface. */ +static int ipv4_ping_group_range(ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + gid_t range[2]; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &ip_ping_group_range_min, + .extra2 = &ip_ping_group_range_max, + }; + + inet_get_ping_group_range_table(table, range, range + 1); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) + set_ping_group_range(table, range); + + return ret; +} + static int proc_tcp_congestion_control(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -311,7 +362,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_do_large_bitmap, }, -#ifdef CONFIG_IP_MULTICAST { .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, @@ -319,8 +369,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - -#endif { .procname = "igmp_max_msf", .data = &sysctl_igmp_max_msf, @@ -680,6 +728,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ping_group_range", + .data = &init_net.ipv4.sysctl_ping_group_range, + .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), + .mode = 0644, + .proc_handler = ipv4_ping_group_range, + }, { } }; @@ -714,8 +769,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) &net->ipv4.sysctl_icmp_ratemask; table[6].data = &net->ipv4.sysctl_rt_cache_rebuild_count; + table[7].data = + &net->ipv4.sysctl_ping_group_range; + } + /* + * Sane defaults - nobody may create ping sockets. + * Boot scripts should set this to distro-specific group. + */ + net->ipv4.sysctl_ping_group_range[0] = 1; + net->ipv4.sysctl_ping_group_range[1] = 0; + net->ipv4.sysctl_rt_cache_rebuild_count = 4; net->ipv4.ipv4_hdr = register_net_sysctl_table(net, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6c11eec..054a59d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) else answ = tp->write_seq - tp->snd_una; break; + case SIOCOUTQNSD: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_nxt; + break; default: return -ENOIOCTLCMD; } @@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); - TCP_CHECK_TIMER(sk); res = do_tcp_sendpages(sk, &page, offset, size, flags); - TCP_CHECK_TIMER(sk); release_sock(sk); return res; } @@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, long timeo; lock_sock(sk); - TCP_CHECK_TIMER(sk); flags = msg->msg_flags; timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -993,7 +999,8 @@ new_segment: /* We have some space in skb head. Superb! */ if (copy > skb_tailroom(skb)) copy = skb_tailroom(skb); - if ((err = skb_add_data(skb, from, copy)) != 0) + err = skb_add_data_nocache(sk, skb, from, copy); + if (err) goto do_fault; } else { int merge = 0; @@ -1036,8 +1043,8 @@ new_segment: /* Time to copy data. We are close to * the end! */ - err = skb_copy_to_page(sk, from, skb, page, - off, copy); + err = skb_copy_to_page_nocache(sk, from, skb, + page, off, copy); if (err) { /* If this page was new, give it to the * socket so it does not get leaked. @@ -1104,7 +1111,6 @@ wait_for_memory: out: if (copied) tcp_push(sk, flags, mss_now, tp->nonagle); - TCP_CHECK_TIMER(sk); release_sock(sk); return copied; @@ -1123,7 +1129,6 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); - TCP_CHECK_TIMER(sk); release_sock(sk); return err; } @@ -1415,8 +1420,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - TCP_CHECK_TIMER(sk); - err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; @@ -1767,12 +1770,10 @@ skip_copy: /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); - TCP_CHECK_TIMER(sk); release_sock(sk); return copied; out: - TCP_CHECK_TIMER(sk); release_sock(sk); return err; @@ -2653,7 +2654,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct tcphdr *th; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 3b53fd1..6187eb4 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) } -static struct tcp_congestion_ops bictcp = { +static struct tcp_congestion_ops bictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 71d5f2f..f376b05 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -39,7 +39,7 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (2U<<3) +#define HYSTART_DELAY_MIN (4U<<3) #define HYSTART_DELAY_MAX (16U<<3) #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) @@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; +static int hystart_ack_delta __read_mostly = 2; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); +module_param(hystart_ack_delta, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -85,17 +88,18 @@ struct bictcp { u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay */ + u32 delay_min; /* min delay (msec << 3) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ #define ACK_RATIO_SHIFT 4 +#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ u32 end_seq; /* end_seq of the round */ - u32 last_jiffies; /* last time when the ACK spacing is close */ + u32 last_ack; /* last time when the ACK spacing is close */ u32 curr_rtt; /* the minimum rtt of current round */ }; @@ -116,12 +120,21 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } +static inline u32 bictcp_clock(void) +{ +#if HZ < 1000 + return ktime_to_ms(ktime_get_real()); +#else + return jiffies_to_msecs(jiffies); +#endif +} + static inline void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - ca->round_start = ca->last_jiffies = jiffies; + ca->round_start = ca->last_ack = bictcp_clock(); ca->end_seq = tp->snd_nxt; ca->curr_rtt = 0; ca->sample_cnt = 0; @@ -236,8 +249,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) */ /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) - << BICTCP_HZ) / HZ; + t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) + - ca->epoch_start) << BICTCP_HZ) / HZ; if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; @@ -258,6 +271,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 100 * cwnd; /* very small increment*/ } + /* + * The initial growth of cubic function may be too conservative + * when the available bandwidth is still unknown. + */ + if (ca->loss_cwnd == 0 && ca->cnt > 20) + ca->cnt = 20; /* increase cwnd 5% per RTT */ + /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; @@ -339,12 +359,12 @@ static void hystart_update(struct sock *sk, u32 delay) struct bictcp *ca = inet_csk_ca(sk); if (!(ca->found & hystart_detect)) { - u32 curr_jiffies = jiffies; + u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ - if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { - ca->last_jiffies = curr_jiffies; - if (curr_jiffies - ca->round_start >= ca->delay_min>>4) + if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + ca->last_ack = now; + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) ca->found |= HYSTART_ACK_TRAIN; } @@ -379,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) u32 delay; if (icsk->icsk_ca_state == TCP_CA_Open) { - cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ca->delayed_ack += cnt; + u32 ratio = ca->delayed_ack; + + ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ratio += cnt; + + ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ @@ -391,7 +415,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; - delay = usecs_to_jiffies(rtt_us) << 3; + delay = (rtt_us << 3) / USEC_PER_MSEC; if (delay == 0) delay = 1; @@ -405,7 +429,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) hystart_update(sk, delay); } -static struct tcp_congestion_ops cubictcp = { +static struct tcp_congestion_ops cubictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, @@ -447,6 +471,10 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); + /* hystart needs ms clock resolution */ + if (hystart && HZ < 1000) + cubictcp.flags |= TCP_CONG_RTT_STAMP; + return tcp_register_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8b6caaf..30f27f6 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk) } -static struct tcp_congestion_ops tcp_highspeed = { +static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, .cong_avoid = hstcp_cong_avoid, diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 7c94a49..c1a8175 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state) } } -static struct tcp_congestion_ops htcp = { +static struct tcp_congestion_ops htcp __read_mostly = { .init = htcp_init, .ssthresh = htcp_recalc_ssthresh, .cong_avoid = htcp_cong_avoid, diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 377bc93..fe3ecf4 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); } -static struct tcp_congestion_ops tcp_hybla = { +static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, .min_cwnd = tcp_reno_min_cwnd, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 00ca688d..813b43a 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, } } -static struct tcp_congestion_ops tcp_illinois = { +static struct tcp_congestion_ops tcp_illinois __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb7f82e..bef9f04 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) - cwnd = rfc3390_bytes_to_packets(tp->mss_cache); + cwnd = TCP_INIT_CWND; return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -1222,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && + if (dup_sack && tp->undo_marker && tp->undo_retrans && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1299,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + if (tp->undo_marker && tp->undo_retrans && + after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); @@ -2658,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct sock *sk, const int undo) +static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); @@ -2670,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); - if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { + if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); } - tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -2698,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk) * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) mib_idx = LINUX_MIB_TCPLOSSUNDO; else @@ -2725,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk) if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); tp->undo_marker = 0; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } @@ -2778,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); DBGUNDO(sk, "Hoe"); - tcp_undo_cwr(sk, 0); + tcp_undo_cwr(sk, false); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. @@ -2807,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, "partial loss"); tp->lost_out = 0; - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; @@ -2821,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk) static inline void tcp_complete_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_stamp = tcp_time_stamp; + /* Do not moderate cwnd if it's already undone in cwr or recovery */ + if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_time_stamp; + } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } @@ -3349,7 +3352,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, net_invalid_timestamp())) rtt_us = ktime_us_delta(ktime_get_real(), last_ackt); - else if (ca_seq_rtt > 0) + else if (ca_seq_rtt >= 0) rtt_us = jiffies_to_usecs(ca_seq_rtt); } @@ -3493,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) if (flag & FLAG_ECE) tcp_ratehalving_spur_to_response(sk); else - tcp_undo_cwr(sk, 1); + tcp_undo_cwr(sk, true); } /* F-RTO spurious RTO detection algorithm (RFC4138) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 02f583b..a7d6671 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -146,13 +146,15 @@ EXPORT_SYMBOL_GPL(tcp_twsk_unique); /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { + struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct rtable *rt; + __be16 orig_sport, orig_dport; __be32 daddr, nexthop; - int tmp; + struct flowi4 *fl4; + struct rtable *rt; int err; + struct ip_options_rcu *inet_opt; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; @@ -161,20 +163,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; - if (inet->opt && inet->opt->srr) { + inet_opt = rcu_dereference_protected(inet->inet_opt, + sock_owned_by_user(sk)); + if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; - nexthop = inet->opt->faddr; - } - - tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, - RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, - IPPROTO_TCP, - inet->inet_sport, usin->sin_port, sk, 1); - if (tmp < 0) { - if (tmp == -ENETUNREACH) + nexthop = inet_opt->opt.faddr; + } + + orig_sport = inet->inet_sport; + orig_dport = usin->sin_port; + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_TCP, + orig_sport, orig_dport, sk, true); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); - return tmp; + return err; } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { @@ -182,11 +190,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -ENETUNREACH; } - if (!inet->opt || !inet->opt->srr) - daddr = rt->rt_dst; + if (!inet_opt || !inet_opt->opt.srr) + daddr = fl4->daddr; if (!inet->inet_saddr) - inet->inet_saddr = rt->rt_src; + inet->inet_saddr = fl4->saddr; inet->inet_rcv_saddr = inet->inet_saddr; if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { @@ -197,8 +205,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } if (tcp_death_row.sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { - struct inet_peer *peer = rt_get_peer(rt); + !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { + struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); /* * VJ's idea. We save last timestamp seen from * the destination in peer table, when entering state @@ -218,8 +226,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; - if (inet->opt) - inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; + if (inet_opt) + inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; @@ -233,11 +241,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; - err = ip_route_newports(&rt, IPPROTO_TCP, - inet->inet_sport, inet->inet_dport, sk); - if (err) + rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, + inet->inet_sport, inet->inet_dport, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; goto failure; - + } /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); @@ -273,7 +283,7 @@ EXPORT_SYMBOL(tcp_v4_connect); /* * This routine does path mtu discovery as defined in RFC1191. */ -static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) +static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); @@ -335,7 +345,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { - struct iphdr *iph = (struct iphdr *)icmp_skb->data; + const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); struct inet_connection_sock *icsk; struct tcp_sock *tp; @@ -641,7 +651,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; net = dev_net(skb_dst(skb)->dev); - ip_send_reply(net->ipv4.tcp_sock, skb, + ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); @@ -716,7 +726,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; - ip_send_reply(net->ipv4.tcp_sock, skb, + ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); @@ -759,11 +769,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); + struct flowi4 fl4; int err = -1; struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, rvp); @@ -814,17 +825,18 @@ static void syn_flood_warning(const struct sk_buff *skb) /* * Save and compile IPv4 options into the request_sock if needed. */ -static struct ip_options *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) { - struct ip_options *opt = &(IPCB(skb)->opt); - struct ip_options *dopt = NULL; + const struct ip_options *opt = &(IPCB(skb)->opt); + struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { - int opt_size = optlength(opt); + int opt_size = sizeof(*dopt) + opt->optlen; + dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt) { - if (ip_options_echo(dopt, skb)) { + if (ip_options_echo(&dopt->opt, skb)) { kfree(dopt); dopt = NULL; } @@ -1327,6 +1339,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { struct inet_peer *peer = NULL; + struct flowi4 fl4; /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering @@ -1339,9 +1352,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, req)) != NULL && - (peer = rt_get_peer((struct rtable *)dst)) != NULL && - peer->daddr.a4 == saddr) { + (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && + fl4.daddr == saddr && + (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > @@ -1405,19 +1418,16 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif + struct ip_options_rcu *inet_opt; if (sk_acceptq_is_full(sk)) goto exit_overflow; - if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) - goto exit; - newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; - sk_setup_caps(newsk, dst); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); @@ -1425,15 +1435,21 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->inet_daddr = ireq->rmt_addr; newinet->inet_rcv_saddr = ireq->loc_addr; newinet->inet_saddr = ireq->loc_addr; - newinet->opt = ireq->opt; + inet_opt = ireq->opt; + rcu_assign_pointer(newinet->inet_opt, inet_opt); ireq->opt = NULL; newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (newinet->opt) - inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; + if (inet_opt) + inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; + if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) + goto put_and_exit; + + sk_setup_caps(newsk, dst); + tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric_advmss(dst); @@ -1461,10 +1477,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif - if (__inet_inherit_port(sk, newsk) < 0) { - sock_put(newsk); - goto exit; - } + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; __inet_hash_nolisten(newsk, NULL); return newsk; @@ -1476,6 +1490,9 @@ exit_nonewsk: exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; +put_and_exit: + sock_put(newsk); + goto exit; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); @@ -1556,12 +1573,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ sock_rps_save_rxhash(sk, skb->rxhash); - TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } - TCP_CHECK_TIMER(sk); return 0; } @@ -1583,13 +1598,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb->rxhash); - - TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } - TCP_CHECK_TIMER(sk); return 0; reset: @@ -1763,12 +1775,13 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) struct inet_sock *inet = inet_sk(sk); struct inet_peer *peer; - if (!rt || rt->rt_dst != inet->inet_daddr) { + if (!rt || + inet->cork.fl.u.ip4.daddr != inet->inet_daddr) { peer = inet_getpeer_v4(inet->inet_daddr, 1); *release_it = true; } else { if (!rt->peer) - rt_bind_peer(rt, 1); + rt_bind_peer(rt, inet->inet_daddr, 1); peer = rt->peer; *release_it = false; } @@ -2358,7 +2371,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req, int ttd = req->expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", i, ireq->loc_addr, ntohs(inet_sk(sk)->inet_sport), @@ -2413,7 +2426,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", + "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, rx_queue, @@ -2448,7 +2461,7 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw, srcp = ntohs(tw->tw_sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, atomic_read(&tw->tw_refcnt), tw, len); @@ -2526,7 +2539,7 @@ void tcp4_proc_exit(void) struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { - struct iphdr *iph = skb_gro_network_header(skb); + const struct iphdr *iph = skb_gro_network_header(skb); switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -2547,7 +2560,7 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) int tcp4_gro_complete(struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index de87037..72f7218 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -12,7 +12,7 @@ * within cong_avoid. * o Error correcting in remote HZ, therefore remote HZ will be keeped * on checking and updating. - * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne + * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since * OWD have a similar meaning as RTT. Also correct the buggy formular. * o Handle reaction for Early Congestion Indication (ECI) within * pkts_acked, as mentioned within pseudo code. @@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) lp->last_drop = tcp_time_stamp; } -static struct tcp_congestion_ops tcp_lp = { +static struct tcp_congestion_ops tcp_lp __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 406f320..882e0b0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -73,7 +73,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - /* Don't override Nagle indefinately with F-RTO */ + /* Don't override Nagle indefinitely with F-RTO */ if (tp->frto_counter == 2) tp->frto_counter = 3; @@ -899,7 +899,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb); + err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); if (likely(err <= 0)) return err; @@ -1003,7 +1003,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, int nlen; u8 flags; - BUG_ON(len > skb->len); + if (WARN_ON(len > skb->len)) + return -EINVAL; nsize = skb_headlen(skb) - len; if (nsize < 0) @@ -2162,7 +2163,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans++; + tp->undo_retrans += tcp_skb_pcount(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index a765137..8ce55b8 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) } -static struct tcp_congestion_ops tcp_scalable = { +static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 74a6aa0..ecd44b0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -259,7 +259,6 @@ static void tcp_delack_timer(unsigned long data) tcp_send_ack(sk); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); } - TCP_CHECK_TIMER(sk); out: if (tcp_memory_pressure) @@ -481,7 +480,6 @@ static void tcp_write_timer(unsigned long data) tcp_probe_timer(sk); break; } - TCP_CHECK_TIMER(sk); out: sk_mem_reclaim(sk); @@ -589,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp) - elapsed; } - TCP_CHECK_TIMER(sk); sk_mem_reclaim(sk); resched: diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c6743ee..80fa2bf 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tcp_vegas_get_info); -static struct tcp_congestion_ops tcp_vegas = { +static struct tcp_congestion_ops tcp_vegas __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 38bc0b5..ac43cd7 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } -static struct tcp_congestion_ops tcp_veno = { +static struct tcp_congestion_ops tcp_veno __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index a534dda..1b91bf4 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, } -static struct tcp_congestion_ops tcp_westwood = { +static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a0f2403..05c3b6f 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -20,7 +20,7 @@ #define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss #define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion #define TCP_YEAH_PHY 8 //lin maximum delta from base -#define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss +#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss #define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count #define TCP_SCALABLE_AI_CNT 100U @@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { return tp->snd_cwnd - reduction; } -static struct tcp_congestion_ops tcp_yeah = { +static struct tcp_congestion_ops tcp_yeah __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8157b17..abca870 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -189,7 +189,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, * @sk: socket struct in question * @snum: port number to look up * @saddr_comp: AF-dependent comparison of bound local IP addresses - * @hash2_nulladdr: AF-dependant hash value in secondary hash chains, + * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, @@ -578,7 +578,7 @@ found: void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; - struct iphdr *iph = (struct iphdr *)skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; @@ -663,75 +663,71 @@ void udp_flush_pending_frames(struct sock *sk) EXPORT_SYMBOL(udp_flush_pending_frames); /** - * udp4_hwcsum_outgoing - handle outgoing HW checksumming - * @sk: socket we are sending on + * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) + * @src: source IP address + * @dst: destination IP address */ -static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, int len) +static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { - unsigned int offset; struct udphdr *uh = udp_hdr(skb); + struct sk_buff *frags = skb_shinfo(skb)->frag_list; + int offset = skb_transport_offset(skb); + int len = skb->len - offset; + int hlen = len; __wsum csum = 0; - if (skb_queue_len(&sk->sk_write_queue) == 1) { + if (!frags) { /* * Only one fragment on the socket. */ skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); - uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); + uh->check = ~csum_tcpudp_magic(src, dst, len, + IPPROTO_UDP, 0); } else { /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ - offset = skb_transport_offset(skb); - skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + do { + csum = csum_add(csum, frags->csum); + hlen -= frags->len; + } while ((frags = frags->next)); + csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; - skb_queue_walk(&sk->sk_write_queue, skb) { - csum = csum_add(csum, skb->csum); - } - uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } } -/* - * Push out all pending data as one UDP datagram. Socket is locked. - */ -static int udp_push_pending_frames(struct sock *sk) +static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) { - struct udp_sock *up = udp_sk(sk); + struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); - struct flowi *fl = &inet->cork.fl; - struct sk_buff *skb; struct udphdr *uh; int err = 0; int is_udplite = IS_UDPLITE(sk); + int offset = skb_transport_offset(skb); + int len = skb->len - offset; __wsum csum = 0; - /* Grab the skbuff where UDP header space exists. */ - if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) - goto out; - /* * Create a UDP header */ uh = udp_hdr(skb); - uh->source = fl->fl_ip_sport; - uh->dest = fl->fl_ip_dport; - uh->len = htons(up->len); + uh->source = inet->inet_sport; + uh->dest = fl4->fl4_dport; + uh->len = htons(len); uh->check = 0; if (is_udplite) /* UDP-Lite */ - csum = udplite_csum_outgoing(sk, skb); + csum = udplite_csum(skb); else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ @@ -740,20 +736,20 @@ static int udp_push_pending_frames(struct sock *sk) } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ - udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); + udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; - } else /* `normal' UDP */ - csum = udp_csum_outgoing(sk, skb); + } else + csum = udp_csum(skb); /* add protocol-dependent pseudo-header */ - uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, + uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: - err = ip_push_pending_frames(sk); + err = ip_send_skb(skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { UDP_INC_STATS_USER(sock_net(sk), @@ -763,6 +759,26 @@ send: } else UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_OUTDATAGRAMS, is_udplite); + return err; +} + +/* + * Push out all pending data as one UDP datagram. Socket is locked. + */ +static int udp_push_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct flowi4 *fl4 = &inet->cork.fl.u.ip4; + struct sk_buff *skb; + int err = 0; + + skb = ip_finish_skb(sk, fl4); + if (!skb) + goto out; + + err = udp_send_skb(skb, fl4); + out: up->len = 0; up->pending = 0; @@ -774,6 +790,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); + struct flowi4 fl4_stack; + struct flowi4 *fl4; int ulen = len; struct ipcm_cookie ipc; struct rtable *rt = NULL; @@ -785,6 +803,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int err, is_udplite = IS_UDPLITE(sk); int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + struct sk_buff *skb; + struct ip_options_data opt_copy; if (len > 0xFFFF) return -EMSGSIZE; @@ -799,6 +819,9 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.tx_flags = 0; + getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; + + fl4 = &inet->cork.fl.u.ip4; if (up->pending) { /* * There are pending frames. @@ -856,22 +879,32 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, free = 1; connected = 0; } - if (!ipc.opt) - ipc.opt = inet->opt; + if (!ipc.opt) { + struct ip_options_rcu *inet_opt; + + rcu_read_lock(); + inet_opt = rcu_dereference(inet->inet_opt); + if (inet_opt) { + memcpy(&opt_copy, inet_opt, + sizeof(*inet_opt) + inet_opt->opt.optlen); + ipc.opt = &opt_copy.opt; + } + rcu_read_unlock(); + } saddr = ipc.addr; ipc.addr = faddr = daddr; - if (ipc.opt && ipc.opt->srr) { + if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) return -EINVAL; - faddr = ipc.opt->faddr; + faddr = ipc.opt->opt.faddr; connected = 0; } tos = RT_TOS(inet->tos); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || - (ipc.opt && ipc.opt->is_strictroute)) { + (ipc.opt && ipc.opt->opt.is_strictroute)) { tos |= RTO_ONLINK; connected = 0; } @@ -888,20 +921,19 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { - struct flowi fl = { .oif = ipc.oif, - .mark = sk->sk_mark, - .fl4_dst = faddr, - .fl4_src = saddr, - .fl4_tos = tos, - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = dport }; struct net *net = sock_net(sk); - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(net, &rt, &fl, sk, 1); - if (err) { + fl4 = &fl4_stack; + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + faddr, saddr, dport, inet->inet_sport); + + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(net, fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; @@ -919,9 +951,20 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto do_confirm; back_from_confirm: - saddr = rt->rt_src; + saddr = fl4->saddr; if (!ipc.addr) - daddr = ipc.addr = rt->rt_dst; + daddr = ipc.addr = fl4->daddr; + + /* Lockless fast path for the non-corking case. */ + if (!corkreq) { + skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, &rt, + msg->msg_flags); + err = PTR_ERR(skb); + if (skb && !IS_ERR(skb)) + err = udp_send_skb(skb, fl4); + goto out; + } lock_sock(sk); if (unlikely(up->pending)) { @@ -936,18 +979,18 @@ back_from_confirm: /* * Now cork the socket to pend data. */ - inet->cork.fl.fl4_dst = daddr; - inet->cork.fl.fl_ip_dport = dport; - inet->cork.fl.fl4_src = saddr; - inet->cork.fl.fl_ip_sport = inet->inet_sport; + fl4 = &inet->cork.fl.u.ip4; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->fl4_dport = dport; + fl4->fl4_sport = inet->inet_sport; up->pending = AF_INET; do_append_data: up->len += ulen; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; - err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, - sizeof(struct udphdr), &ipc, &rt, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, &rt, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); else if (!corkreq) @@ -987,6 +1030,7 @@ EXPORT_SYMBOL(udp_sendmsg); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); int ret; @@ -1011,7 +1055,8 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, return -EINVAL; } - ret = ip_append_page(sk, page, offset, size, flags); + ret = ip_append_page(sk, &inet->cork.fl.u.ip4, + page, offset, size, flags); if (ret == -EOPNOTSUPP) { release_sock(sk); return sock_no_sendpage(sk->sk_socket, page, offset, @@ -2045,7 +2090,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), @@ -2199,7 +2244,7 @@ int udp4_ufo_send_check(struct sk_buff *skb) return 0; } -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 571aa96..2d51840 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -69,7 +69,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm4_prepare_output); -static int xfrm4_output_finish(struct sk_buff *skb) +int xfrm4_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER if (!skb_dst(skb)->xfrm) { @@ -86,7 +86,11 @@ static int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, skb_dst(skb)->dev, xfrm4_output_finish, + NULL, dst->dev, + x->outer_mode->afinfo->output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b057d40..981e43e 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -18,47 +18,53 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, - xfrm_address_t *saddr, - xfrm_address_t *daddr) +static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, + int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) { - struct flowi fl = { - .fl4_dst = daddr->a4, - .fl4_tos = tos, - }; - struct dst_entry *dst; struct rtable *rt; - int err; + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = daddr->a4; + fl4->flowi4_tos = tos; if (saddr) - fl.fl4_src = saddr->a4; + fl4->saddr = saddr->a4; + + rt = __ip_route_output_key(net, fl4); + if (!IS_ERR(rt)) + return &rt->dst; - err = __ip_route_output_key(net, &rt, &fl); - dst = &rt->dst; - if (err) - dst = ERR_PTR(err); - return dst; + return ERR_CAST(rt); +} + +static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + struct flowi4 fl4; + + return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr); } static int xfrm4_get_saddr(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr) { struct dst_entry *dst; - struct rtable *rt; + struct flowi4 fl4; - dst = xfrm4_dst_lookup(net, 0, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr); if (IS_ERR(dst)) return -EHOSTUNREACH; - rt = (struct rtable *)dst; - saddr->a4 = rt->rt_src; + saddr->a4 = fl4.saddr; dst_release(dst); return 0; } -static int xfrm4_get_tos(struct flowi *fl) +static int xfrm4_get_tos(const struct flowi *fl) { - return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ + return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */ } static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, @@ -68,11 +74,18 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, } static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, - struct flowi *fl) + const struct flowi *fl) { struct rtable *rt = (struct rtable *)xdst->route; + const struct flowi4 *fl4 = &fl->u.ip4; - xdst->u.rt.fl = *fl; + rt->rt_key_dst = fl4->daddr; + rt->rt_key_src = fl4->saddr; + rt->rt_key_tos = fl4->flowi4_tos; + rt->rt_route_iif = fl4->flowi4_iif; + rt->rt_iif = fl4->flowi4_iif; + rt->rt_oif = fl4->flowi4_oif; + rt->rt_mark = fl4->flowi4_mark; xdst->u.dst.dev = dev; dev_hold(dev); @@ -97,11 +110,12 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, static void _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) { - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; + struct flowi4 *fl4 = &fl->u.ip4; - memset(fl, 0, sizeof(struct flowi)); - fl->mark = skb->mark; + memset(fl4, 0, sizeof(struct flowi4)); + fl4->flowi4_mark = skb->mark; if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { switch (iph->protocol) { @@ -114,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports = (__be16 *)xprth; - fl->fl_ip_sport = ports[!!reverse]; - fl->fl_ip_dport = ports[!reverse]; + fl4->fl4_sport = ports[!!reverse]; + fl4->fl4_dport = ports[!reverse]; } break; @@ -123,8 +137,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 2 - skb->data)) { u8 *icmp = xprth; - fl->fl_icmp_type = icmp[0]; - fl->fl_icmp_code = icmp[1]; + fl4->fl4_icmp_type = icmp[0]; + fl4->fl4_icmp_code = icmp[1]; } break; @@ -132,7 +146,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 4 - skb->data)) { __be32 *ehdr = (__be32 *)xprth; - fl->fl_ipsec_spi = ehdr[0]; + fl4->fl4_ipsec_spi = ehdr[0]; } break; @@ -140,7 +154,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 8 - skb->data)) { __be32 *ah_hdr = (__be32*)xprth; - fl->fl_ipsec_spi = ah_hdr[1]; + fl4->fl4_ipsec_spi = ah_hdr[1]; } break; @@ -148,7 +162,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ipcomp_hdr = (__be16 *)xprth; - fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); + fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); } break; @@ -160,20 +174,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (greflags[0] & GRE_KEY) { if (greflags[0] & GRE_CSUM) gre_hdr++; - fl->fl_gre_key = gre_hdr[1]; + fl4->fl4_gre_key = gre_hdr[1]; } } break; default: - fl->fl_ipsec_spi = 0; + fl4->fl4_ipsec_spi = 0; break; } } - fl->proto = iph->protocol; - fl->fl4_dst = reverse ? iph->saddr : iph->daddr; - fl->fl4_src = reverse ? iph->daddr : iph->saddr; - fl->fl4_tos = iph->tos; + fl4->flowi4_proto = iph->protocol; + fl4->daddr = reverse ? iph->saddr : iph->daddr; + fl4->saddr = reverse ? iph->daddr : iph->saddr; + fl4->flowi4_tos = iph->tos; } static inline int xfrm4_garbage_collect(struct dst_ops *ops) @@ -196,8 +210,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + dst_destroy_metrics_generic(dst); + if (likely(xdst->u.rt.peer)) inet_putpeer(xdst->u.rt.peer); + xfrm_dst_destroy(xdst); } @@ -215,6 +232,7 @@ static struct dst_ops xfrm4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, @@ -230,6 +248,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .get_tos = xfrm4_get_tos, .init_path = xfrm4_init_path, .fill_dst = xfrm4_fill_dst, + .blackhole_route = ipv4_blackhole_route, }; #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 4794762..d9ac0a0 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x) } static void -__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) +__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { - sel->daddr.a4 = fl->fl4_dst; - sel->saddr.a4 = fl->fl4_src; - sel->dport = xfrm_flowi_dport(fl); + const struct flowi4 *fl4 = &fl->u.ip4; + + sel->daddr.a4 = fl4->daddr; + sel->saddr.a4 = fl4->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl4->uli); sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl); + sel->sport = xfrm_flowi_sport(fl, &fl4->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET; sel->prefixlen_d = 32; sel->prefixlen_s = 32; - sel->proto = fl->proto; - sel->ifindex = fl->oif; + sel->proto = fl4->flowi4_proto; + sel->ifindex = fl4->flowi4_oif; } static void -xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr) +xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr) { x->id = tmpl->id; if (x->id.daddr.a4 == 0) @@ -53,7 +55,7 @@ xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, int xfrm4_extract_header(struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = iph->id; @@ -76,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .init_tempsel = __xfrm4_init_tempsel, .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, + .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, |