diff options
Diffstat (limited to 'net/ipv4')
41 files changed, 1438 insertions, 535 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 04a14b1..7d12c6a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -262,7 +262,8 @@ static inline int inet_netns_ok(struct net *net, int protocol) * Create an inet socket. */ -static int inet_create(struct net *net, struct socket *sock, int protocol) +static int inet_create(struct net *net, struct socket *sock, int protocol, + int kern) { struct sock *sk; struct inet_protosw *answer; @@ -325,7 +326,7 @@ lookup_protocol: } err = -EPERM; - if (answer->capability > 0 && !capable(answer->capability)) + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) goto out_rcu_unlock; err = -EAFNOSUPPORT; @@ -685,7 +686,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); sin->sin_family = AF_INET; if (peer) { @@ -947,7 +948,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, - .capability = -1, .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, @@ -958,7 +958,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, - .capability = -1, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, @@ -969,7 +968,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, - .capability = CAP_NET_RAW, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index d07b0c1..7ed3e4a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -444,7 +444,7 @@ static int ah_init_state(struct xfrm_state *x) } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; - ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5df2f6a..e312661 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -140,11 +140,11 @@ void in_dev_finish_destroy(struct in_device *idev) #endif dev_put(dev); if (!idev->dead) - printk("Freeing alive in_device %p\n", idev); - else { + pr_err("Freeing alive in_device %p\n", idev); + else kfree(idev); - } } +EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { @@ -159,7 +159,8 @@ static struct in_device *inetdev_init(struct net_device *dev) sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; - if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) + in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); + if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) dev_disable_lro(dev); @@ -405,13 +406,15 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = in_dev_get(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return in_dev; } +EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks. */ @@ -557,7 +560,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg * Determine a default network mask, based on the IP address. */ -static __inline__ int inet_abc_len(__be32 addr) +static inline int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ @@ -646,13 +649,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); ret = -ENODEV; - if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL) + dev = __dev_get_by_name(net, ifr.ifr_name); + if (!dev) goto done; if (colon) *colon = ':'; - if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ @@ -720,7 +725,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; - if ((ifa = inet_alloc_ifa()) == NULL) + ifa = inet_alloc_ifa(); + if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); @@ -822,10 +828,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) struct ifreq ifr; int done = 0; - if (!in_dev || (ifa = in_dev->ifa_list) == NULL) + if (!in_dev) goto out; - for (; ifa; ifa = ifa->ifa_next) { + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { if (!buf) { done += sizeof(ifr); continue; @@ -875,36 +881,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (!addr) addr = ifa->ifa_local; } endfor_ifa(in_dev); -no_in_dev: - rcu_read_unlock(); if (addr) - goto out; + goto out_unlock; +no_in_dev: /* Not loopback addresses on loopback should be preferred in this case. It is importnat that lo is the first interface in dev_base list. */ - read_lock(&dev_base_lock); - rcu_read_lock(); - for_each_netdev(net, dev) { - if ((in_dev = __in_dev_get_rcu(dev)) == NULL) + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) continue; for_primary_ifa(in_dev) { if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) { addr = ifa->ifa_local; - goto out_unlock_both; + goto out_unlock; } } endfor_ifa(in_dev); } -out_unlock_both: - read_unlock(&dev_base_lock); +out_unlock: rcu_read_unlock(); -out: return addr; } +EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) @@ -940,7 +943,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, } } endfor_ifa(in_dev); - return same? addr : 0; + return same ? addr : 0; } /* @@ -961,17 +964,16 @@ __be32 inet_confirm_addr(struct in_device *in_dev, return confirm_addr_indev(in_dev, dst, local, scope); net = dev_net(in_dev->dev); - read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(net, dev) { - if ((in_dev = __in_dev_get_rcu(dev))) { + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); - read_unlock(&dev_base_lock); return addr; } @@ -984,14 +986,16 @@ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } +EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } +EXPORT_SYMBOL(unregister_inetaddr_notifier); -/* Rename ifa_labels for a device name change. Make some effort to preserve existing - * alias numbering and to create unique labels if possible. +/* Rename ifa_labels for a device name change. Make some effort to preserve + * existing alias numbering and to create unique labels if possible. */ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { @@ -1010,11 +1014,10 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) sprintf(old, ":%d", named); dot = old; } - if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) { + if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); - } else { + else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); - } skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } @@ -1061,8 +1064,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { - struct in_ifaddr *ifa; - if ((ifa = inet_alloc_ifa()) != NULL) { + struct in_ifaddr *ifa = inet_alloc_ifa(); + + if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; @@ -1170,38 +1174,54 @@ nla_put_failure: static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int idx, ip_idx; + int h, s_h; + int idx, s_idx; + int ip_idx, s_ip_idx; struct net_device *dev; struct in_device *in_dev; struct in_ifaddr *ifa; - int s_ip_idx, s_idx = cb->args[0]; + struct hlist_head *head; + struct hlist_node *node; - s_ip_idx = ip_idx = cb->args[1]; - idx = 0; - for_each_netdev(net, dev) { - if (idx < s_idx) - goto cont; - if (idx > s_idx) - s_ip_idx = 0; - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) - goto cont; - - for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; - ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + s_ip_idx = ip_idx = cb->args[2]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (idx > s_idx) + s_ip_idx = 0; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto cont; + + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - RTM_NEWADDR, NLM_F_MULTI) <= 0) - goto done; - } + RTM_NEWADDR, NLM_F_MULTI) <= 0) { + rcu_read_unlock(); + goto done; + } + } cont: - idx++; + idx++; + } + rcu_read_unlock(); } done: - cb->args[0] = idx; - cb->args[1] = ip_idx; + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = ip_idx; return skb->len; } @@ -1239,18 +1259,18 @@ static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; - read_lock(&dev_base_lock); - for_each_netdev(net, dev) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { struct in_device *in_dev; - rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; - rcu_read_unlock(); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); } +/* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; @@ -1259,7 +1279,6 @@ static void inet_forward_change(struct net *net) IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; - read_lock(&dev_base_lock); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) @@ -1270,7 +1289,6 @@ static void inet_forward_change(struct net *net) IN_DEV_CONF_SET(in_dev, FORWARDING, on); rcu_read_unlock(); } - read_unlock(&dev_base_lock); } static int devinet_conf_proc(ctl_table *ctl, int write, @@ -1450,6 +1468,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), @@ -1587,7 +1606,7 @@ static __net_init int devinet_init_net(struct net *net) all = &ipv4_devconf; dflt = &ipv4_devconf_dflt; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); if (all == NULL) goto err_alloc_all; @@ -1680,8 +1699,3 @@ void __init devinet_init(void) rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); } -EXPORT_SYMBOL(in_dev_finish_destroy); -EXPORT_SYMBOL(inet_select_addr); -EXPORT_SYMBOL(inetdev_by_index); -EXPORT_SYMBOL(register_inetaddr_notifier); -EXPORT_SYMBOL(unregister_inetaddr_notifier); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 12f7287..1948895 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -530,7 +530,7 @@ static int esp_init_authenc(struct xfrm_state *x) } err = crypto_aead_setauthsize( - aead, aalg_desc->uinfo.auth.icv_truncbits / 8); + aead, x->aalg->alg_trunc_len / 8); if (err) goto free_key; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f73dbed..3323168 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -229,25 +229,29 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, */ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, - struct net_device *dev, __be32 *spec_dst, u32 *itag) + struct net_device *dev, __be32 *spec_dst, + u32 *itag, u32 mark) { struct in_device *in_dev; struct flowi fl = { .nl_u = { .ip4_u = { .daddr = src, .saddr = dst, .tos = tos } }, + .mark = mark, .iif = oif }; + struct fib_result res; - int no_addr, rpf; + int no_addr, rpf, accept_local; int ret; struct net *net; - no_addr = rpf = 0; + no_addr = rpf = accept_local = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; rpf = IN_DEV_RPFILTER(in_dev); + accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); } rcu_read_unlock(); @@ -257,8 +261,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, net = dev_net(dev); if (fib_lookup(net, &fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) - goto e_inval_res; + if (res.type != RTN_UNICAST) { + if (res.type != RTN_LOCAL || !accept_local) + goto e_inval_res; + } *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -892,11 +898,11 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, int force, int delay) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev), delay); arp_ifdown(dev); } @@ -919,7 +925,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. Disable IP. */ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, 1, 0); } else { rt_cache_flush(dev_net(dev), -1); } @@ -934,7 +940,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct in_device *in_dev = __in_dev_get_rtnl(dev); if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, 2, -1); return NOTIFY_DONE; } @@ -952,12 +958,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, 0, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: rt_cache_flush(dev_net(dev), 0); break; + case NETDEV_UNREGISTER_BATCH: + rt_cache_flush_batch(); + break; } return NOTIFY_DONE; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 835262c..ca2d07b 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -284,7 +284,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; - err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); + err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); @@ -301,13 +301,9 @@ int __net_init fib4_rules_init(struct net *net) int err; struct fib_rules_ops *ops; - ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL); - if (ops == NULL) - return -ENOMEM; - INIT_LIST_HEAD(&ops->rules_list); - ops->fro_net = net; - - fib_rules_register(ops); + ops = fib_rules_register(&fib4_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); err = fib_default_rules_init(ops); if (err < 0) @@ -318,12 +314,10 @@ int __net_init fib4_rules_init(struct net *net) fail: /* also cleans all rules already added */ fib_rules_unregister(ops); - kfree(ops); return err; } void __net_exit fib4_rules_exit(struct net *net) { fib_rules_unregister(net->ipv4.rules_ops); - kfree(net->ipv4.rules_ops); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 9b096d6..ed19aa6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -228,7 +228,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) head = &fib_info_hash[hash]; hlist_for_each_entry(fi, node, head, fib_hash) { - if (fi->fib_net != nfi->fib_net) + if (!net_eq(fi->fib_net, nfi->fib_net)) continue; if (fi->fib_nhs != nfi->fib_nhs) continue; @@ -1047,7 +1047,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) return 0; hlist_for_each_entry(fi, node, head, fib_lhash) { - if (fi->fib_net != net) + if (!net_eq(fi->fib_net, net)) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 84adb57..fe11f60 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -501,15 +501,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; + rcu_read_lock(); if (rt->fl.iif && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(net, rt->fl.iif); + dev = dev_get_by_index_rcu(net, rt->fl.iif); - if (dev) { + if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); - dev_put(dev); - } else + else saddr = 0; + rcu_read_unlock(); } tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d41e5de..76c0840 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1899,8 +1899,9 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct err = -EADDRNOTAVAIL; for (pmc=inet->mc_list; pmc; pmc=pmc->next) { - if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr - && pmc->multi.imr_ifindex == imr.imr_ifindex) + if ((pmc->multi.imr_multiaddr.s_addr == + imr.imr_multiaddr.s_addr) && + (pmc->multi.imr_ifindex == imr.imr_ifindex)) break; } if (!pmc) { /* must have a prior join */ @@ -2311,9 +2312,10 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; - for_each_netdev(net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct in_device *in_dev; - in_dev = in_dev_get(state->dev); + + in_dev = __in_dev_get_rcu(state->dev); if (!in_dev) continue; read_lock(&in_dev->mc_list_lock); @@ -2323,7 +2325,6 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) break; } read_unlock(&in_dev->mc_list_lock); - in_dev_put(in_dev); } return im; } @@ -2333,16 +2334,15 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); im = im->next; while (!im) { - if (likely(state->in_dev != NULL)) { + if (likely(state->in_dev != NULL)) read_unlock(&state->in_dev->mc_list_lock); - in_dev_put(state->in_dev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->in_dev = NULL; break; } - state->in_dev = in_dev_get(state->dev); + state->in_dev = __in_dev_get_rcu(state->dev); if (!state->in_dev) continue; read_lock(&state->in_dev->mc_list_lock); @@ -2361,9 +2361,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(rcu) { - read_lock(&dev_base_lock); + rcu_read_lock(); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -2379,16 +2379,15 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(rcu) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); if (likely(state->in_dev != NULL)) { read_unlock(&state->in_dev->mc_list_lock); - in_dev_put(state->in_dev); state->in_dev = NULL; } state->dev = NULL; - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int igmp_mc_seq_show(struct seq_file *seq, void *v) @@ -2462,9 +2461,9 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) state->idev = NULL; state->im = NULL; - for_each_netdev(net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct in_device *idev; - idev = in_dev_get(state->dev); + idev = __in_dev_get_rcu(state->dev); if (unlikely(idev == NULL)) continue; read_lock(&idev->mc_list_lock); @@ -2480,7 +2479,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) spin_unlock_bh(&im->lock); } read_unlock(&idev->mc_list_lock); - in_dev_put(idev); } return psf; } @@ -2494,16 +2492,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l spin_unlock_bh(&state->im->lock); state->im = state->im->next; while (!state->im) { - if (likely(state->idev != NULL)) { + if (likely(state->idev != NULL)) read_unlock(&state->idev->mc_list_lock); - in_dev_put(state->idev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } - state->idev = in_dev_get(state->dev); + state->idev = __in_dev_get_rcu(state->dev); if (!state->idev) continue; read_lock(&state->idev->mc_list_lock); @@ -2528,8 +2525,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) { - read_lock(&dev_base_lock); + rcu_read_lock(); return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -2545,6 +2543,7 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (likely(state->im != NULL)) { @@ -2553,11 +2552,10 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) } if (likely(state->idev != NULL)) { read_unlock(&state->idev->mc_list_lock); - in_dev_put(state->idev); state->idev = NULL; } state->dev = NULL; - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int igmp_mcf_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 26fb50e..ee16475 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -112,7 +112,7 @@ again: hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (ib_net(tb) == net && tb->port == rover) { + if (net_eq(ib_net(tb), net) && tb->port == rover) { if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN && @@ -158,7 +158,7 @@ have_snum: hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (ib_net(tb) == net && tb->port == snum) + if (net_eq(ib_net(tb), net) && tb->port == snum) goto tb_found; } tb = NULL; @@ -531,7 +531,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, &expire, &resend); if (!expire && (!resend || - !req->rsk_ops->rtx_syn_ack(parent, req) || + !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || inet_rsk(req)->acked)) { unsigned long timeo; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 47ad7aa..94ef51a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -454,7 +454,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, * unique enough. */ inet_bind_bucket_for_each(tb, node, &head->chain) { - if (ib_net(tb) == net && tb->port == port) { + if (net_eq(ib_net(tb), net) && + tb->port == port) { if (tb->fastreuse >= 0) goto next_port; WARN_ON(hlist_empty(&tb->owners)); diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 6a667da..47038cb 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -64,15 +64,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, if (iph->ihl != IPH_LEN_WO_OPTIONS) return -1; - if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack - || tcph->rst || tcph->syn || tcph->fin) + if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || + tcph->rst || tcph->syn || tcph->fin) return -1; if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) return -1; - if (tcph->doff != TCPH_LEN_WO_OPTIONS - && tcph->doff != TCPH_LEN_W_TIMESTAMP) + if (tcph->doff != TCPH_LEN_WO_OPTIONS && + tcph->doff != TCPH_LEN_W_TIMESTAMP) return -1; /* check tcp options (only timestamp allowed) */ @@ -262,10 +262,10 @@ static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, struct iphdr *iph, struct tcphdr *tcph) { - if ((lro_desc->iph->saddr != iph->saddr) - || (lro_desc->iph->daddr != iph->daddr) - || (lro_desc->tcph->source != tcph->source) - || (lro_desc->tcph->dest != tcph->dest)) + if ((lro_desc->iph->saddr != iph->saddr) || + (lro_desc->iph->daddr != iph->daddr) || + (lro_desc->tcph->source != tcph->source) || + (lro_desc->tcph->dest != tcph->dest)) return -1; return 0; } @@ -339,9 +339,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, u64 flags; int vlan_hdr_len = 0; - if (!lro_mgr->get_skb_header - || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, - &flags, priv)) + if (!lro_mgr->get_skb_header || + lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, + &flags, priv)) goto out; if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) @@ -351,8 +351,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, if (!lro_desc) goto out; - if ((skb->protocol == htons(ETH_P_8021Q)) - && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + if ((skb->protocol == htons(ETH_P_8021Q)) && + !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) vlan_hdr_len = VLAN_HLEN; if (!lro_desc->active) { /* start new lro session */ @@ -446,9 +446,9 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, int hdr_len = LRO_MAX_PG_HLEN; int vlan_hdr_len = 0; - if (!lro_mgr->get_frag_header - || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, - (void *)&tcph, &flags, priv)) { + if (!lro_mgr->get_frag_header || + lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, + (void *)&tcph, &flags, priv)) { mac_hdr = page_address(frags->page) + frags->page_offset; goto out1; } @@ -472,8 +472,8 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, if (!skb) goto out; - if ((skb->protocol == htons(ETH_P_8021Q)) - && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + if ((skb->protocol == htons(ETH_P_8021Q)) && + !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) vlan_hdr_len = VLAN_HLEN; iph = (void *)(skb->data + vlan_hdr_len); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1f5d508..31f931e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -421,37 +421,46 @@ out: EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); -void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, +void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) { struct inet_timewait_sock *tw; struct sock *sk; struct hlist_nulls_node *node; - int h; + unsigned int slot; - local_bh_disable(); - for (h = 0; h <= hashinfo->ehash_mask; h++) { - struct inet_ehash_bucket *head = - inet_ehash_bucket(hashinfo, h); - spinlock_t *lock = inet_ehash_lockp(hashinfo, h); + for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; +restart_rcu: + rcu_read_lock(); restart: - spin_lock(lock); - sk_nulls_for_each(sk, node, &head->twchain) { - + sk_nulls_for_each_rcu(sk, node, &head->twchain) { tw = inet_twsk(sk); - if (!net_eq(twsk_net(tw), net) || - tw->tw_family != family) + if ((tw->tw_family != family) || + atomic_read(&twsk_net(tw)->count)) + continue; + + if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt))) continue; - atomic_inc(&tw->tw_refcnt); - spin_unlock(lock); + if (unlikely((tw->tw_family != family) || + atomic_read(&twsk_net(tw)->count))) { + inet_twsk_put(tw); + goto restart; + } + + rcu_read_unlock(); inet_twsk_deschedule(tw, twdr); inet_twsk_put(tw); - - goto restart; + goto restart_rcu; } - spin_unlock(lock); + /* If the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto restart; + rcu_read_unlock(); } - local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index b1fbe18..6bcfe52 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -67,9 +67,6 @@ * ip_id_count: idlock */ -/* Exported for inet_getid inline function. */ -DEFINE_SPINLOCK(inet_peer_idlock); - static struct kmem_cache *peer_cachep __read_mostly; #define node_height(x) x->avl_height @@ -390,7 +387,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) n->v4daddr = daddr; atomic_set(&n->refcnt, 1); atomic_set(&n->rid, 0); - n->ip_id_count = secure_ip_id(daddr); + atomic_set(&n->ip_id_count, secure_ip_id(daddr)); n->tcp_ts_stamp = 0; write_lock_bh(&peer_pool_lock); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 575f9bd..c473531 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -206,10 +206,11 @@ static void ip_expire(unsigned long arg) struct sk_buff *head = qp->q.fragments; /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { + rcu_read_lock(); + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (head->dev) icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - dev_put(head->dev); - } + rcu_read_unlock(); } out: spin_unlock(&qp->q.lock); @@ -563,7 +564,7 @@ out_oversize: printk(KERN_INFO "Oversized IP packet from %pI4.\n", &qp->saddr); out_fail: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); return err; } @@ -657,7 +658,7 @@ static int ip4_frags_ns_ctl_register(struct net *net) struct ctl_table_header *hdr; table = ip4_frags_ns_ctl_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; @@ -675,7 +676,7 @@ static int ip4_frags_ns_ctl_register(struct net *net) return 0; err_reg: - if (net != &init_net) + if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a77807d..f36ce15 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -125,7 +125,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev); #define HASH_SIZE 16 -static int ipgre_net_id; +static int ipgre_net_id __read_mostly; struct ipgre_net { struct ip_tunnel *tunnels[4][HASH_SIZE]; @@ -1309,17 +1309,8 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) static int ipgre_init_net(struct net *net) { + struct ipgre_net *ign = net_generic(net, ipgre_net_id); int err; - struct ipgre_net *ign; - - err = -ENOMEM; - ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL); - if (ign == NULL) - goto err_alloc; - - err = net_assign_generic(net, ipgre_net_id, ign); - if (err < 0) - goto err_assign; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", ipgre_tunnel_setup); @@ -1340,10 +1331,6 @@ static int ipgre_init_net(struct net *net) err_reg_dev: free_netdev(ign->fb_tunnel_dev); err_alloc_dev: - /* nothing */ -err_assign: - kfree(ign); -err_alloc: return err; } @@ -1357,12 +1344,13 @@ static void ipgre_exit_net(struct net *net) ipgre_destroy_tunnels(ign, &list); unregister_netdevice_many(&list); rtnl_unlock(); - kfree(ign); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, .exit = ipgre_exit_net, + .id = &ipgre_net_id, + .size = sizeof(struct ipgre_net), }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1476,14 +1464,14 @@ static void ipgre_tap_setup(struct net_device *dev) ether_setup(dev); - dev->netdev_ops = &ipgre_netdev_ops; + dev->netdev_ops = &ipgre_tap_netdev_ops; dev->destructor = free_netdev; dev->iflink = 0; dev->features |= NETIF_F_NETNS_LOCAL; } -static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], +static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel *nt; @@ -1537,25 +1525,29 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], if (t->dev != dev) return -EEXIST; } else { - unsigned nflags = 0; - t = nt; - if (ipv4_is_multicast(p.iph.daddr)) - nflags = IFF_BROADCAST; - else if (p.iph.daddr) - nflags = IFF_POINTOPOINT; + if (dev->type != ARPHRD_ETHER) { + unsigned nflags = 0; - if ((dev->flags ^ nflags) & - (IFF_POINTOPOINT | IFF_BROADCAST)) - return -EINVAL; + if (ipv4_is_multicast(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags ^ nflags) & + (IFF_POINTOPOINT | IFF_BROADCAST)) + return -EINVAL; + } ipgre_tunnel_unlink(ign, t); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + } ipgre_tunnel_link(ign, t); netdev_state_change(dev); } @@ -1678,7 +1670,7 @@ static int __init ipgre_init(void) return -EAGAIN; } - err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops); + err = register_pernet_device(&ipgre_net_ops); if (err < 0) goto gen_device_failed; @@ -1696,7 +1688,7 @@ out: tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); + unregister_pernet_device(&ipgre_net_ops); gen_device_failed: inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); goto out; @@ -1706,7 +1698,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); + unregister_pernet_device(&ipgre_net_ops); if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fdf51ba..c29de98 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -164,7 +164,7 @@ int ip_call_ra_chain(struct sk_buff *skb) if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && - sock_net(sk) == dev_net(dev)) { + net_eq(sock_net(sk), dev_net(dev))) { if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { read_unlock(&ip_ra_lock); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 322b408..e34013a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -264,9 +264,11 @@ int ip_mc_output(struct sk_buff *skb) This check is duplicated in ip_mr_input at the moment. */ - && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) + && + ((rt->rt_flags & RTCF_LOCAL) || + !(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif - ) { + ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, @@ -501,8 +503,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; - truesizes += frag->truesize; } + truesizes += frag->truesize; } /* Everything is OK. Generate! */ diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f8d04c2..4e08b7f 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1172,10 +1172,9 @@ static int __init ic_dynamic(void) schedule_timeout_uninterruptible(1); #ifdef IPCONFIG_DHCP /* DHCP isn't done until we get a DHCPACK. */ - if ((ic_got_reply & IC_BOOTP) - && (ic_proto_enabled & IC_USE_DHCP) - && ic_dhcp_msgtype != DHCPACK) - { + if ((ic_got_reply & IC_BOOTP) && + (ic_proto_enabled & IC_USE_DHCP) && + ic_dhcp_msgtype != DHCPACK) { ic_got_reply = 0; printk(","); continue; @@ -1344,9 +1343,9 @@ static int __init ip_auto_config(void) */ if (ic_myaddr == NONE || #ifdef CONFIG_ROOT_NFS - (root_server_addr == NONE - && ic_servaddr == NONE - && ROOT_DEV == Root_NFS) || + (root_server_addr == NONE && + ic_servaddr == NONE && + ROOT_DEV == Root_NFS) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index a2ca53d..eda04fe 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -119,7 +119,7 @@ #define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) -static int ipip_net_id; +static int ipip_net_id __read_mostly; struct ipip_net { struct ip_tunnel *tunnels_r_l[HASH_SIZE]; struct ip_tunnel *tunnels_r[HASH_SIZE]; @@ -446,25 +446,27 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error; } - if (tiph->frag_off) + df |= old_iph->frag_off & htons(IP_DF); + + if (df) { mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - else - mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - if (mtu < 68) { - stats->collisions++; - ip_rt_put(rt); - goto tx_error; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + if (mtu < 68) { + stats->collisions++; + ip_rt_put(rt); + goto tx_error; + } - df |= (old_iph->frag_off&htons(IP_DF)); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - goto tx_error; + if ((old_iph->frag_off & htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } } if (tunnel->err_count > 0) { @@ -773,17 +775,8 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) static int ipip_init_net(struct net *net) { + struct ipip_net *ipn = net_generic(net, ipip_net_id); int err; - struct ipip_net *ipn; - - err = -ENOMEM; - ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); - if (ipn == NULL) - goto err_alloc; - - err = net_assign_generic(net, ipip_net_id, ipn); - if (err < 0) - goto err_assign; ipn->tunnels[0] = ipn->tunnels_wc; ipn->tunnels[1] = ipn->tunnels_l; @@ -810,29 +803,26 @@ err_reg_dev: free_netdev(ipn->fb_tunnel_dev); err_alloc_dev: /* nothing */ -err_assign: - kfree(ipn); -err_alloc: return err; } static void ipip_exit_net(struct net *net) { - struct ipip_net *ipn; + struct ipip_net *ipn = net_generic(net, ipip_net_id); LIST_HEAD(list); - ipn = net_generic(net, ipip_net_id); rtnl_lock(); ipip_destroy_tunnels(ipn, &list); unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); unregister_netdevice_many(&list); rtnl_unlock(); - kfree(ipn); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit = ipip_exit_net, + .id = &ipip_net_id, + .size = sizeof(struct ipip_net), }; static int __init ipip_init(void) @@ -846,7 +836,7 @@ static int __init ipip_init(void) return -EAGAIN; } - err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops); + err = register_pernet_device(&ipip_net_ops); if (err) xfrm4_tunnel_deregister(&ipip_handler, AF_INET); @@ -858,7 +848,7 @@ static void __exit ipip_fini(void) if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) printk(KERN_INFO "ipip close: can't deregister tunnel\n"); - unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); + unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ef4ee45..54596f7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -494,8 +494,10 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) return -EINVAL; } - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { + dev_put(dev); return -EADDRNOTAVAIL; + } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; ip_rt_multicast_event(in_dev); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 1725dc0..f53cb8d 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -155,10 +155,10 @@ static int nf_ip_reroute(struct sk_buff *skb, if (entry->hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); - if (!(iph->tos == rt_info->tos - && skb->mark == rt_info->mark - && iph->daddr == rt_info->daddr - && iph->saddr == rt_info->saddr)) + if (!(iph->tos == rt_info->tos && + skb->mark == rt_info->mark && + iph->daddr == rt_info->daddr && + iph->saddr == rt_info->saddr)) return ip_route_me_harder(skb, RTN_UNSPEC); } return 0; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 9f07870..49ad447 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -499,7 +499,7 @@ ipq_rcv_nl_event(struct notifier_block *this, if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) __ipq_reset(); write_unlock_bh(&queue_lock); } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 68afc6e..fe1a644 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -750,6 +750,8 @@ static int __init nf_nat_init(void) BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); + BUG_ON(nf_ct_nat_offset != NULL); + rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset); return 0; cleanup_extend: @@ -764,6 +766,7 @@ static void __exit nf_nat_cleanup(void) nf_ct_extend_unregister(&nat_extend); rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); + rcu_assign_pointer(nf_ct_nat_offset, NULL); synchronize_net(); } diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 5bf6a92..7f10a6b 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -69,6 +69,28 @@ adjust_tcp_sequence(u32 seq, DUMP_OFFSET(this_way); } +/* Get the offset value, for conntrack */ +s16 nf_nat_get_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_nat_seq *this_way; + s16 offset; + + if (!nat) + return 0; + + this_way = &nat->seq[dir]; + spin_lock_bh(&nf_nat_seqofs_lock); + offset = after(seq, this_way->correction_pos) + ? this_way->offset_after : this_way->offset_before; + spin_unlock_bh(&nf_nat_seqofs_lock); + + return offset; +} +EXPORT_SYMBOL_GPL(nf_nat_get_offset); + /* Frobs data inside this packet, which is linear. */ static void mangle_contents(struct sk_buff *skb, unsigned int dataoff, @@ -185,11 +207,6 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, adjust_tcp_sequence(ntohl(tcph->seq), (int)rep_len - (int)match_len, ct, ctinfo); - /* Tell TCP window tracking about seq change */ - nf_conntrack_tcp_update(skb, ip_hdrlen(skb), - ct, CTINFO2DIR(ctinfo), - (int)rep_len - (int)match_len); - nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); } return 1; @@ -411,12 +428,7 @@ nf_nat_seq_adjust(struct sk_buff *skb, tcph->seq = newseq; tcph->ack_seq = newack; - if (!nf_nat_sack_adjust(skb, tcph, ct, ctinfo)) - return 0; - - nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, dir, seqoff); - - return 1; + return nf_nat_sack_adjust(skb, tcph, ct, ctinfo); } /* Setup NAT on this expected conntrack so it follows master. */ diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9ef8c08..ce154b4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -351,13 +351,24 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb->ip_summed = CHECKSUM_NONE; skb->transport_header = skb->network_header; - err = memcpy_fromiovecend((void *)iph, from, 0, length); - if (err) - goto error_fault; + err = -EFAULT; + if (memcpy_fromiovecend((void *)iph, from, 0, length)) + goto error_free; - /* We don't modify invalid header */ iphlen = iph->ihl * 4; - if (iphlen >= sizeof(*iph) && iphlen <= length) { + + /* + * We don't want to modify the ip header, but we do need to + * be sure that it won't cause problems later along the network + * stack. Specifically we want to make sure that iph->ihl is a + * sane value. If ihl points beyond the length of the buffer passed + * in, reject the frame as invalid + */ + err = -EINVAL; + if (iphlen > length) + goto error_free; + + if (iphlen >= sizeof(*iph)) { if (!iph->saddr) iph->saddr = rt->rt_src; iph->check = 0; @@ -380,8 +391,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, out: return 0; -error_fault: - err = -EFAULT; +error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 68fb227..90cdcfc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -513,43 +513,42 @@ static const struct file_operations rt_cpu_seq_fops = { }; #ifdef CONFIG_NET_CLS_ROUTE -static int ip_rt_acct_read(char *buffer, char **start, off_t offset, - int length, int *eof, void *data) -{ - unsigned int i; - - if ((offset & 3) || (length & 3)) - return -EIO; - - if (offset >= sizeof(struct ip_rt_acct) * 256) { - *eof = 1; - return 0; - } - - if (offset + length >= sizeof(struct ip_rt_acct) * 256) { - length = sizeof(struct ip_rt_acct) * 256 - offset; - *eof = 1; +static int rt_acct_proc_show(struct seq_file *m, void *v) +{ + struct ip_rt_acct *dst, *src; + unsigned int i, j; + + dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); + if (!dst) + return -ENOMEM; + + for_each_possible_cpu(i) { + src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); + for (j = 0; j < 256; j++) { + dst[j].o_bytes += src[j].o_bytes; + dst[j].o_packets += src[j].o_packets; + dst[j].i_bytes += src[j].i_bytes; + dst[j].i_packets += src[j].i_packets; + } } - offset /= sizeof(u32); - - if (length > 0) { - u32 *dst = (u32 *) buffer; - - *start = buffer; - memset(dst, 0, length); - - for_each_possible_cpu(i) { - unsigned int j; - u32 *src; + seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); + kfree(dst); + return 0; +} - src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; - for (j = 0; j < length/4; j++) - dst[j] += src[j]; - } - } - return length; +static int rt_acct_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rt_acct_proc_show, NULL); } + +static const struct file_operations rt_acct_proc_fops = { + .owner = THIS_MODULE, + .open = rt_acct_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int __net_init ip_rt_do_proc_init(struct net *net) @@ -567,8 +566,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net) goto err2; #ifdef CONFIG_NET_CLS_ROUTE - pde = create_proc_read_entry("rt_acct", 0, net->proc_net, - ip_rt_acct_read, NULL); + pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif @@ -703,7 +701,7 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { - return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); + return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); } static inline int rt_is_expired(struct rtable *rth) @@ -902,6 +900,12 @@ void rt_cache_flush(struct net *net, int delay) rt_do_flush(!in_softirq()); } +/* Flush previous cache invalidated entries from the cache */ +void rt_cache_flush_batch(void) +{ + rt_do_flush(!in_softirq()); +} + /* * We change rt_genid and let gc do the cleanup */ @@ -1346,9 +1350,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, return; net = dev_net(dev); - if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) - || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) - || ipv4_is_zeronet(new_gw)) + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || + ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || + ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!rt_caching(net)) @@ -1851,7 +1855,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag) < 0) + dev, &spec_dst, &itag, 0) < 0) goto e_inval; rth = dst_alloc(&ipv4_dst_ops); @@ -1964,7 +1968,7 @@ static int __mkroute_input(struct sk_buff *skb, err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + in_dev->dev, &spec_dst, &itag, skb->mark); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2138,7 +2142,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, int result; result = fib_validate_source(saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag); + dev, &spec_dst, &itag, skb->mark); if (result < 0) goto martian_source; if (result) @@ -2167,7 +2171,7 @@ brd_input: spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag); + &itag, skb->mark); if (err < 0) goto martian_source; if (err) @@ -2311,10 +2315,11 @@ skip_cache: ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE - || (!ipv4_is_local_multicast(daddr) && - IN_DEV_MFORWARD(in_dev)) + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif - ) { + ) { rcu_read_unlock(); return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); @@ -2511,9 +2516,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 - && (ipv4_is_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + if (oldflp->oif == 0 && + (ipv4_is_multicast(oldflp->fl4_dst) || + oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(net, oldflp->fl4_src); if (dev_out == NULL) @@ -2852,7 +2857,7 @@ static int rt_fill_info(struct net *net, error = rt->u.dst.error; expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; if (rt->peer) { - id = rt->peer->ip_id_count; + id = atomic_read(&rt->peer->ip_id_count) & 0xffff; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; tsage = get_seconds() - rt->peer->tcp_ts_stamp; @@ -3309,7 +3314,7 @@ static __net_init int sysctl_route_net_init(struct net *net) struct ctl_table *tbl; tbl = ipv4_route_flush_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 3146cc4..26399ad 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -253,6 +253,8 @@ EXPORT_SYMBOL(cookie_check_timestamp); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) { + struct tcp_options_received tcp_opt; + u8 *hash_location; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); @@ -263,7 +265,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int mss; struct rtable *rt; __u8 rcv_wscale; - struct tcp_options_received tcp_opt; if (!sysctl_tcp_syncookies || !th->ack) goto out; @@ -341,7 +342,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, 0, &rt->u.dst); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0, &rt->u.dst); if (tcp_opt.saw_tstamp) cookie_check_timestamp(&tcp_opt); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2dcf04d..13f7ab6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -714,6 +714,14 @@ static struct ctl_table ipv4_table[] = { }, { .ctl_name = CTL_UNNUMBERED, + .procname = "tcp_cookie_size", + .data = &sysctl_tcp_cookie_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), @@ -818,7 +826,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) struct ctl_table *table; table = ipv4_net_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (table == NULL) goto err_alloc; @@ -849,7 +857,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) return 0; err_reg: - if (net != &init_net) + if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e0cfa63..c8666b7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -264,6 +264,7 @@ #include <linux/cache.h> #include <linux/err.h> #include <linux/crypto.h> +#include <linux/time.h> #include <net/icmp.h> #include <net/tcp.h> @@ -1183,7 +1184,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) #if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), + KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); #endif if (inet_csk_ack_scheduled(sk)) { @@ -1430,11 +1433,13 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Now that we have two receive queues this * shouldn't happen. */ - if (before(*seq, TCP_SKB_CB(skb)->seq)) { - printk(KERN_INFO "recvmsg bug: copied %X " - "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); + if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), + KERN_INFO "recvmsg bug: copied %X " + "seq %X rcvnxt %X fl %X\n", *seq, + TCP_SKB_CB(skb)->seq, tp->rcv_nxt, + flags)) break; - } + offset = *seq - TCP_SKB_CB(skb)->seq; if (tcp_hdr(skb)->syn) offset--; @@ -1443,8 +1448,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (tcp_hdr(skb)->fin) goto found_fin_ok; WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " - "copied %X seq %X\n", *seq, - TCP_SKB_CB(skb)->seq); + "copied %X seq %X rcvnxt %X fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, + tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ @@ -2054,6 +2060,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; + tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2078,8 +2085,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int val; int err = 0; - /* This is a string value all the others are int's */ - if (optname == TCP_CONGESTION) { + /* These are data/string values, all the others are ints */ + switch (optname) { + case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) @@ -2096,6 +2104,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_COOKIE_TRANSACTIONS: { + struct tcp_cookie_transactions ctd; + struct tcp_cookie_values *cvp = NULL; + + if (sizeof(ctd) > optlen) + return -EINVAL; + if (copy_from_user(&ctd, optval, sizeof(ctd))) + return -EFAULT; + + if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || + ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) + return -EINVAL; + + if (ctd.tcpct_cookie_desired == 0) { + /* default to global value */ + } else if ((0x1 & ctd.tcpct_cookie_desired) || + ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || + ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { + return -EINVAL; + } + + if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { + /* Supercedes all other values */ + lock_sock(sk); + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + tp->rx_opt.cookie_in_always = 0; /* false */ + tp->rx_opt.cookie_out_never = 1; /* true */ + release_sock(sk); + return err; + } + + /* Allocate ancillary memory before locking. + */ + if (ctd.tcpct_used > 0 || + (tp->cookie_values == NULL && + (sysctl_tcp_cookie_size > 0 || + ctd.tcpct_cookie_desired > 0 || + ctd.tcpct_s_data_desired > 0))) { + cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, + GFP_KERNEL); + if (cvp == NULL) + return -ENOMEM; + } + lock_sock(sk); + tp->rx_opt.cookie_in_always = + (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); + tp->rx_opt.cookie_out_never = 0; /* false */ + + if (tp->cookie_values != NULL) { + if (cvp != NULL) { + /* Changed values are recorded by a changed + * pointer, ensuring the cookie will differ, + * without separately hashing each value later. + */ + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + kref_init(&cvp->kref); + tp->cookie_values = cvp; + } else { + cvp = tp->cookie_values; + } + } + if (cvp != NULL) { + cvp->cookie_desired = ctd.tcpct_cookie_desired; + + if (ctd.tcpct_used > 0) { + memcpy(cvp->s_data_payload, ctd.tcpct_value, + ctd.tcpct_used); + cvp->s_data_desired = ctd.tcpct_used; + cvp->s_data_constant = 1; /* true */ + } else { + /* No constant payload data. */ + cvp->s_data_desired = ctd.tcpct_s_data_desired; + cvp->s_data_constant = 0; /* false */ + } + } + release_sock(sk); + return err; + } + default: + /* fallthru */ + break; + }; if (optlen < sizeof(int)) return -EINVAL; @@ -2420,6 +2515,47 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; + + case TCP_COOKIE_TRANSACTIONS: { + struct tcp_cookie_transactions ctd; + struct tcp_cookie_values *cvp = tp->cookie_values; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < sizeof(ctd)) + return -EINVAL; + + memset(&ctd, 0, sizeof(ctd)); + ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? + TCP_COOKIE_IN_ALWAYS : 0) + | (tp->rx_opt.cookie_out_never ? + TCP_COOKIE_OUT_NEVER : 0); + + if (cvp != NULL) { + ctd.tcpct_flags |= (cvp->s_data_in ? + TCP_S_DATA_IN : 0) + | (cvp->s_data_out ? + TCP_S_DATA_OUT : 0); + + ctd.tcpct_cookie_desired = cvp->cookie_desired; + ctd.tcpct_s_data_desired = cvp->s_data_desired; + + /* Cookie(s) saved, return as nonce */ + if (sizeof(ctd.tcpct_value) < cvp->cookie_pair_size) { + /* impossible? */ + return -EINVAL; + } + memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], + cvp->cookie_pair_size); + ctd.tcpct_used = cvp->cookie_pair_size; + } + + if (put_user(sizeof(ctd), optlen)) + return -EFAULT; + if (copy_to_user(optval, &ctd, sizeof(ctd))) + return -EFAULT; + return 0; + } default: return -ENOPROTOOPT; } @@ -2842,6 +2978,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key); #endif +/** + * Each Responder maintains up to two secret values concurrently for + * efficient secret rollover. Each secret value has 4 states: + * + * Generating. (tcp_secret_generating != tcp_secret_primary) + * Generates new Responder-Cookies, but not yet used for primary + * verification. This is a short-term state, typically lasting only + * one round trip time (RTT). + * + * Primary. (tcp_secret_generating == tcp_secret_primary) + * Used both for generation and primary verification. + * + * Retiring. (tcp_secret_retiring != tcp_secret_secondary) + * Used for verification, until the first failure that can be + * verified by the newer Generating secret. At that time, this + * cookie's state is changed to Secondary, and the Generating + * cookie's state is changed to Primary. This is a short-term state, + * typically lasting only one round trip time (RTT). + * + * Secondary. (tcp_secret_retiring == tcp_secret_secondary) + * Used for secondary verification, after primary verification + * failures. This state lasts no more than twice the Maximum Segment + * Lifetime (2MSL). Then, the secret is discarded. + */ +struct tcp_cookie_secret { + /* The secret is divided into two parts. The digest part is the + * equivalent of previously hashing a secret and saving the state, + * and serves as an initialization vector (IV). The message part + * serves as the trailing secret. + */ + u32 secrets[COOKIE_WORKSPACE_WORDS]; + unsigned long expires; +}; + +#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) +#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) +#define TCP_SECRET_LIFE (HZ * 600) + +static struct tcp_cookie_secret tcp_secret_one; +static struct tcp_cookie_secret tcp_secret_two; + +/* Essentially a circular list, without dynamic allocation. */ +static struct tcp_cookie_secret *tcp_secret_generating; +static struct tcp_cookie_secret *tcp_secret_primary; +static struct tcp_cookie_secret *tcp_secret_retiring; +static struct tcp_cookie_secret *tcp_secret_secondary; + +static DEFINE_SPINLOCK(tcp_secret_locker); + +/* Select a pseudo-random word in the cookie workspace. + */ +static inline u32 tcp_cookie_work(const u32 *ws, const int n) +{ + return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; +} + +/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. + * Called in softirq context. + * Returns: 0 for success. + */ +int tcp_cookie_generator(u32 *bakery) +{ + unsigned long jiffy = jiffies; + + if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { + spin_lock_bh(&tcp_secret_locker); + if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { + /* refreshed by another */ + memcpy(bakery, + &tcp_secret_generating->secrets[0], + COOKIE_WORKSPACE_WORDS); + } else { + /* still needs refreshing */ + get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); + + /* The first time, paranoia assumes that the + * randomization function isn't as strong. But, + * this secret initialization is delayed until + * the last possible moment (packet arrival). + * Although that time is observable, it is + * unpredictably variable. Mash in the most + * volatile clock bits available, and expire the + * secret extra quickly. + */ + if (unlikely(tcp_secret_primary->expires == + tcp_secret_secondary->expires)) { + struct timespec tv; + + getnstimeofday(&tv); + bakery[COOKIE_DIGEST_WORDS+0] ^= + (u32)tv.tv_nsec; + + tcp_secret_secondary->expires = jiffy + + TCP_SECRET_1MSL + + (0x0f & tcp_cookie_work(bakery, 0)); + } else { + tcp_secret_secondary->expires = jiffy + + TCP_SECRET_LIFE + + (0xff & tcp_cookie_work(bakery, 1)); + tcp_secret_primary->expires = jiffy + + TCP_SECRET_2MSL + + (0x1f & tcp_cookie_work(bakery, 2)); + } + memcpy(&tcp_secret_secondary->secrets[0], + bakery, COOKIE_WORKSPACE_WORDS); + + rcu_assign_pointer(tcp_secret_generating, + tcp_secret_secondary); + rcu_assign_pointer(tcp_secret_retiring, + tcp_secret_primary); + /* + * Neither call_rcu() nor synchronize_rcu() needed. + * Retiring data is not freed. It is replaced after + * further (locked) pointer updates, and a quiet time + * (minimum 1MSL, maximum LIFE - 2MSL). + */ + } + spin_unlock_bh(&tcp_secret_locker); + } else { + rcu_read_lock_bh(); + memcpy(bakery, + &rcu_dereference(tcp_secret_generating)->secrets[0], + COOKIE_WORKSPACE_WORDS); + rcu_read_unlock_bh(); + } + return 0; +} +EXPORT_SYMBOL(tcp_cookie_generator); + void tcp_done(struct sock *sk) { if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) @@ -2876,6 +3141,7 @@ void __init tcp_init(void) struct sk_buff *skb = NULL; unsigned long nr_pages, limit; int order, i, max_share; + unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -2969,6 +3235,15 @@ void __init tcp_init(void) tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); + + memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); + memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); + tcp_secret_one.expires = jiffy; /* past due */ + tcp_secret_two.expires = jiffy; /* past due */ + tcp_secret_generating = &tcp_secret_one; + tcp_secret_primary = &tcp_secret_one; + tcp_secret_retiring = &tcp_secret_two; + tcp_secret_secondary = &tcp_secret_two; } EXPORT_SYMBOL(tcp_close); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 26d5c7f..7c94a49 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -92,8 +92,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt) if (icsk->icsk_ca_state == TCP_CA_Open) { if (ca->maxRTT < ca->minRTT) ca->maxRTT = ca->minRTT; - if (ca->maxRTT < srtt - && srtt <= ca->maxRTT + msecs_to_jiffies(20)) + if (ca->maxRTT < srtt && + srtt <= ca->maxRTT + msecs_to_jiffies(20)) ca->maxRTT = srtt; } } @@ -123,9 +123,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt ca->packetcount += pkts_acked; - if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) - && now - ca->lasttime >= ca->minRTT - && ca->minRTT > 0) { + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) && + now - ca->lasttime >= ca->minRTT && + ca->minRTT > 0) { __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime); if (htcp_ccount(ca) <= 3) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ba0eab6..57ae96a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -140,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); - if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || + if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows @@ -411,7 +411,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); - hint = min(hint, TCP_MIN_RCVMSS); + hint = min(hint, TCP_MSS_DEFAULT); hint = max(hint, TCP_MIN_MSS); inet_csk(sk)->icsk_ack.rcv_mss = hint; @@ -3698,14 +3698,12 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, - int estab, struct dst_entry *dst) + u8 **hvpp, int estab, struct dst_entry *dst) { unsigned char *ptr; struct tcphdr *th = tcp_hdr(skb); int length = (th->doff * 4) - sizeof(struct tcphdr); - BUG_ON(!estab && !dst); - ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; @@ -3787,7 +3785,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif - } + case TCPOPT_COOKIE: + /* This option is variable length. + */ + switch (opsize) { + case TCPOLEN_COOKIE_BASE: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_PAIR: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_MIN+0: + case TCPOLEN_COOKIE_MIN+2: + case TCPOLEN_COOKIE_MIN+4: + case TCPOLEN_COOKIE_MIN+6: + case TCPOLEN_COOKIE_MAX: + /* 16-bit multiple */ + opt_rx->cookie_plus = opsize; + *hvpp = ptr; + default: + /* ignore option */ + break; + }; + break; + }; ptr += opsize-2; length -= opsize; @@ -3815,17 +3836,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) * If it is wrong it falls back on tcp_parse_options(). */ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) + struct tcp_sock *tp, u8 **hvpp) { - if (th->doff == sizeof(struct tcphdr) >> 2) { + /* In the spirit of fast parsing, compare doff directly to constant + * values. Because equality is used, short doff can be ignored here. + */ + if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; return 0; } else if (tp->rx_opt.tstamp_ok && - th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { if (tcp_parse_aligned_timestamp(tp, th)) return 1; } - tcp_parse_options(skb, &tp->rx_opt, 1, NULL); + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); return 1; } @@ -4854,11 +4878,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) struct tcp_sock *tp = tcp_sk(sk); /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... */ - && __tcp_select_window(sk) >= tp->rcv_wnd) || + __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ @@ -5079,10 +5103,12 @@ out: static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int syn_inerr) { + u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp, &hash_location) && + tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5370,12 +5396,14 @@ discard: static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { - struct tcp_sock *tp = tcp_sk(sk); + u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); - int saved_clamp = tp->rx_opt.mss_clamp; + struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_cookie_values *cvp = tp->cookie_values; + int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, 0, dst); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, dst); if (th->ack) { /* rfc793: @@ -5472,6 +5500,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * Change state from SYN-SENT only after copied_seq * is initialized. */ tp->copied_seq = tp->rcv_nxt; + + if (cvp != NULL && + cvp->cookie_pair_size > 0 && + tp->rx_opt.cookie_plus > 0) { + int cookie_size = tp->rx_opt.cookie_plus + - TCPOLEN_COOKIE_BASE; + int cookie_pair_size = cookie_size + + cvp->cookie_desired; + + /* A cookie extension option was sent and returned. + * Note that each incoming SYNACK replaces the + * Responder cookie. The initial exchange is most + * fragile, as protection against spoofing relies + * entirely upon the sequence and timestamp (above). + * This replacement strategy allows the correct pair to + * pass through, while any others will be filtered via + * Responder verification later. + */ + if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { + memcpy(&cvp->cookie_pair[cvp->cookie_desired], + hash_location, cookie_size); + cvp->cookie_pair_size = cookie_pair_size; + } + } + smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 657ae33..fee9aab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * when trying new connection. */ if (peer != NULL && - peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { + (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } @@ -217,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (inet->opt) inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; - tp->rx_opt.mss_clamp = 536; + tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket @@ -742,8 +742,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, - struct dst_entry *dst) +static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); int err = -1; @@ -753,7 +754,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req); + skb = tcp_make_synack(sk, dst, req, rvp); if (skb) { struct tcphdr *th = tcp_hdr(skb); @@ -774,9 +775,10 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, return err; } -static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req) +static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) { - return __tcp_v4_send_synack(sk, req, NULL); + return __tcp_v4_send_synack(sk, NULL, req, rvp); } /* @@ -1211,13 +1213,16 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct inet_request_sock *ireq; + struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; + u8 *hash_location; struct request_sock *req; + struct inet_request_sock *ireq; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES int want_cookie = 0; #else @@ -1268,16 +1273,50 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = 536; - tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; + tmp_opt.mss_clamp = TCP_MSS_DEFAULT; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst); + + if (tmp_opt.cookie_plus > 0 && + tmp_opt.saw_tstamp && + !tp->rx_opt.cookie_out_never && + (sysctl_tcp_cookie_size > 0 || + (tp->cookie_values != NULL && + tp->cookie_values->cookie_desired > 0))) { + u8 *c; + u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; + int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; + + if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) + goto drop_and_release; + + /* Secret recipe starts with IP addresses */ + *mess++ ^= daddr; + *mess++ ^= saddr; + + /* plus variable length Initiator Cookie */ + c = (u8 *)mess; + while (l-- > 0) + *c++ ^= *hash_location++; - tcp_parse_options(skb, &tmp_opt, 0, dst); +#ifdef CONFIG_SYN_COOKIES + want_cookie = 0; /* not our kind of cookie */ +#endif + tmp_ext.cookie_out_never = 0; /* false */ + tmp_ext.cookie_plus = tmp_opt.cookie_plus; + } else if (!tp->rx_opt.cookie_in_always) { + /* redundant indications, but ensure initialization. */ + tmp_ext.cookie_out_never = 1; /* true */ + tmp_ext.cookie_plus = 0; + } else { + goto drop_and_release; + } + tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); if (security_inet_conn_request(sk, skb, req)) @@ -1308,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_death_row.sysctl_tw_recycle && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { - if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && + if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); @@ -1337,7 +1376,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } tcp_rsk(req)->snt_isn = isn; - if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) + if (__tcp_v4_send_synack(sk, dst, req, + (struct request_values *)&tmp_ext) || + want_cookie) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); @@ -1727,9 +1768,9 @@ int tcp_v4_remember_stamp(struct sock *sk) if (peer) { if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && - peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { - peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; peer->tcp_ts = tp->rx_opt.ts_recent; } if (release_it) @@ -1748,9 +1789,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && - peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; peer->tcp_ts = tcptw->tw_ts_recent; } inet_putpeer(peer); @@ -1815,7 +1856,7 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; - tp->mss_cache = 536; + tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; @@ -1831,6 +1872,19 @@ static int tcp_v4_init_sock(struct sock *sk) tp->af_specific = &tcp_sock_ipv4_specific; #endif + /* TCP Cookie Transactions */ + if (sysctl_tcp_cookie_size > 0) { + /* Default, cookies without s_data_payload. */ + tp->cookie_values = + kzalloc(sizeof(*tp->cookie_values), + sk->sk_allocation); + if (tp->cookie_values != NULL) + kref_init(&tp->cookie_values->kref); + } + /* Presumed zeroed, in order of appearance: + * cookie_in_always, cookie_out_never, + * s_data_constant, s_data_in, s_data_out + */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -1884,6 +1938,13 @@ void tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } + /* TCP Cookie Transactions */ + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + percpu_counter_dec(&tcp_sockets_allocated); } @@ -2468,12 +2529,17 @@ static int __net_init tcp_sk_init(struct net *net) static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); - inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); +} + +static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { - .init = tcp_sk_init, - .exit = tcp_sk_exit, + .init = tcp_sk_init, + .exit = tcp_sk_exit, + .exit_batch = tcp_sk_exit_batch, }; void __init tcp_v4_init(void) diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index ce3c41f..de87037 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -143,8 +143,8 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) goto out; /* we can't calc remote HZ with no different!! */ - if (tp->rx_opt.rcv_tsval == lp->remote_ref_time - || tp->rx_opt.rcv_tsecr == lp->local_ref_time) + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time || + tp->rx_opt.rcv_tsecr == lp->local_ref_time) goto out; m = HZ * (tp->rx_opt.rcv_tsval - diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 463d51b..87accec 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -26,13 +26,7 @@ #include <net/inet_common.h> #include <net/xfrm.h> -#ifdef CONFIG_SYSCTL -#define SYNC_INIT 0 /* let the user enable it */ -#else -#define SYNC_INIT 1 -#endif - -int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; +int sysctl_tcp_syncookies __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_syncookies); int sysctl_tcp_abort_on_overflow __read_mostly; @@ -96,13 +90,14 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { - struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_options_received tmp_opt; + u8 *hash_location; + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); int paws_reject = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tmp_opt.tstamp_ok = 1; - tcp_parse_options(skb, &tmp_opt, 1, NULL); + tcp_parse_options(skb, &tmp_opt, &hash_location, 1, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; @@ -389,14 +384,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp; + struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + struct tcp_cookie_values *oldcvp = oldtp->cookie_values; + + /* TCP Cookie Transactions require space for the cookie pair, + * as it differs for each connection. There is no need to + * copy any s_data_payload stored at the original socket. + * Failure will prevent resuming the connection. + * + * Presumed copied, in order of appearance: + * cookie_in_always, cookie_out_never + */ + if (oldcvp != NULL) { + struct tcp_cookie_values *newcvp = + kzalloc(sizeof(*newtp->cookie_values), + GFP_ATOMIC); + + if (newcvp != NULL) { + kref_init(&newcvp->kref); + newcvp->cookie_desired = + oldcvp->cookie_desired; + newtp->cookie_values = newcvp; + } else { + /* Not Yet Implemented */ + newtp->cookie_values = NULL; + } + } /* Now setup tcp_sock */ - newtp = tcp_sk(newsk); newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; - newtp->snd_up = treq->snt_isn + 1; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = + treq->snt_isn + 1 + tcp_s_data_size(oldtp); tcp_prequeue_init(newtp); @@ -429,8 +453,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); - newtp->write_seq = treq->snt_isn + 1; - newtp->pushed_seq = newtp->write_seq; + newtp->write_seq = newtp->pushed_seq = + treq->snt_isn + 1 + tcp_s_data_size(oldtp); newtp->rx_opt.saw_tstamp = 0; @@ -476,7 +500,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->af_specific->md5_lookup(sk, newsk)) newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif - if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); @@ -495,16 +519,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct request_sock **prev) { + struct tcp_options_received tmp_opt; + u8 *hash_location; + struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; - struct tcp_options_received tmp_opt; - struct sock *child; - struct dst_entry *dst = inet_csk_route_req(sk, req); - tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, 0, dst); + if ((th->doff > (sizeof(*th) >> 2)) && (req->ts_recent)) { + tmp_opt.tstamp_ok = 1; + tcp_parse_options(skb, &tmp_opt, &hash_location, 1, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; @@ -517,8 +541,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, } } - dst_release(dst); - /* Check for pure retransmitted SYN. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && flg == TCP_FLAG_SYN && @@ -540,7 +562,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->rsk_ops->rtx_syn_ack(sk, req); + req->rsk_ops->rtx_syn_ack(sk, req, NULL); return NULL; } @@ -599,7 +621,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Invalid ACK: reset will be sent by listening socket */ if ((flg & TCP_FLAG_ACK) && - (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) + (TCP_SKB_CB(skb)->ack_seq != + tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 616c686..93316a9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -59,6 +59,10 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ +EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); + + /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { @@ -362,15 +366,45 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) +#define OPTION_COOKIE_EXTENSION (1 << 4) struct tcp_out_options { u8 options; /* bit field of OPTION_* */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 hash_size; /* bytes in hash_location */ u16 mss; /* 0 to disable */ __u32 tsval, tsecr; /* need to include OPTION_TS */ + __u8 *hash_location; /* temporary pointer, overloaded */ }; +/* The sysctl int routines are generic, so check consistency here. + */ +static u8 tcp_cookie_size_check(u8 desired) +{ + if (desired > 0) { + /* previously specified */ + return desired; + } + if (sysctl_tcp_cookie_size <= 0) { + /* no default specified */ + return 0; + } + if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { + /* value too small, specify minimum */ + return TCP_COOKIE_MIN; + } + if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { + /* value too large, specify maximum */ + return TCP_COOKIE_MAX; + } + if (0x1 & sysctl_tcp_cookie_size) { + /* 8-bit multiple, illegal, fix it */ + return (u8)(sysctl_tcp_cookie_size + 0x1); + } + return (u8)sysctl_tcp_cookie_size; +} + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -385,17 +419,34 @@ struct tcp_out_options { * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, - const struct tcp_out_options *opts, - __u8 **md5_hash) { - if (unlikely(OPTION_MD5 & opts->options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *)ptr; + struct tcp_out_options *opts) +{ + u8 options = opts->options; /* mungable copy */ + + /* Having both authentication and cookies for security is redundant, + * and there's certainly not enough room. Instead, the cookie-less + * extension variant is proposed. + * + * Consider the pessimal case with authentication. The options + * could look like: + * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 + */ + if (unlikely(OPTION_MD5 & options)) { + if (unlikely(OPTION_COOKIE_EXTENSION & options)) { + *ptr++ = htonl((TCPOPT_COOKIE << 24) | + (TCPOLEN_COOKIE_BASE << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + } else { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + } + options &= ~OPTION_COOKIE_EXTENSION; + /* overload cookie hash location */ + opts->hash_location = (__u8 *)ptr; ptr += 4; - } else { - *md5_hash = NULL; } if (unlikely(opts->mss)) { @@ -404,12 +455,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, opts->mss); } - if (likely(OPTION_TS & opts->options)) { - if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { + if (likely(OPTION_TS & options)) { + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -420,15 +472,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - if (unlikely(OPTION_SACK_ADVERTISE & opts->options && - !(OPTION_TS & opts->options))) { + /* Specification requires after timestamp, so do it now. + * + * Consider the pessimal case without authentication. The options + * could look like: + * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 + */ + if (unlikely(OPTION_COOKIE_EXTENSION & options)) { + __u8 *cookie_copy = opts->hash_location; + u8 cookie_size = opts->hash_size; + + /* 8-bit multiple handled in tcp_cookie_size_check() above, + * and elsewhere. + */ + if (0x2 & cookie_size) { + __u8 *p = (__u8 *)ptr; + + /* 16-bit multiple */ + *p++ = TCPOPT_COOKIE; + *p++ = TCPOLEN_COOKIE_BASE + cookie_size; + *p++ = *cookie_copy++; + *p++ = *cookie_copy++; + ptr++; + cookie_size -= 2; + } else { + /* 32-bit multiple */ + *ptr++ = htonl(((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_COOKIE << 8) | + TCPOLEN_COOKIE_BASE) + + cookie_size); + } + + if (cookie_size > 0) { + memcpy(ptr, cookie_copy, cookie_size); + ptr += (cookie_size / 4); + } + } + + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); } - if (unlikely(OPTION_WSCALE & opts->options)) { + if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | @@ -463,14 +552,18 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); - unsigned size = 0; + struct tcp_cookie_values *cvp = tp->cookie_values; struct dst_entry *dst = __sk_dst_get(sk); + unsigned remaining = MAX_TCP_OPTION_SPACE; + u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? + tcp_cookie_size_check(cvp->cookie_desired) : + 0; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + remaining -= TCPOLEN_MD5SIG_ALIGNED; } #else *md5 = NULL; @@ -486,7 +579,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, * SACKs don't matter, we never delay an ACK when we have any of those * going out. */ opts->mss = tcp_advertise_mss(sk); - size += TCPOLEN_MSS_ALIGNED; + remaining -= TCPOLEN_MSS_ALIGNED; if (likely(sysctl_tcp_timestamps && !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) && @@ -494,22 +587,68 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, opts->options |= OPTION_TS; opts->tsval = TCP_SKB_CB(skb)->when; opts->tsecr = tp->rx_opt.ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; + remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(sysctl_tcp_window_scaling && !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; - size += TCPOLEN_WSCALE_ALIGNED; + remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(sysctl_tcp_sack && !dst_feature(dst, RTAX_FEATURE_NO_SACK))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) - size += TCPOLEN_SACKPERM_ALIGNED; + remaining -= TCPOLEN_SACKPERM_ALIGNED; } - return size; + /* Note that timestamps are required by the specification. + * + * Odd numbers of bytes are prohibited by the specification, ensuring + * that the cookie is 16-bit aligned, and the resulting cookie pair is + * 32-bit aligned. + */ + if (*md5 == NULL && + (OPTION_TS & opts->options) && + cookie_size > 0) { + int need = TCPOLEN_COOKIE_BASE + cookie_size; + + if (0x2 & need) { + /* 32-bit multiple */ + need += 2; /* NOPs */ + + if (need > remaining) { + /* try shrinking cookie to fit */ + cookie_size -= 2; + need -= 4; + } + } + while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { + cookie_size -= 4; + need -= 4; + } + if (TCP_COOKIE_MIN <= cookie_size) { + opts->options |= OPTION_COOKIE_EXTENSION; + opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; + opts->hash_size = cookie_size; + + /* Remember for future incarnations. */ + cvp->cookie_desired = cookie_size; + + if (cvp->cookie_desired != cvp->cookie_pair_size) { + /* Currently use random bytes as a nonce, + * assuming these are completely unpredictable + * by hostile users of the same system. + */ + get_random_bytes(&cvp->cookie_pair[0], + cookie_size); + cvp->cookie_pair_size = cookie_size; + } + + remaining -= need; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ @@ -517,48 +656,77 @@ static unsigned tcp_synack_options(struct sock *sk, struct request_sock *req, unsigned mss, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5) { - unsigned size = 0; + struct tcp_md5sig_key **md5, + struct tcp_extend_values *xvp) +{ struct inet_request_sock *ireq = inet_rsk(req); - char doing_ts; + unsigned remaining = MAX_TCP_OPTION_SPACE; + u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? + xvp->cookie_plus : + 0; + bool doing_ts = ireq->tstamp_ok; #ifdef CONFIG_TCP_MD5SIG *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); if (*md5) { opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + remaining -= TCPOLEN_MD5SIG_ALIGNED; + + /* We can't fit any SACK blocks in a packet with MD5 + TS + * options. There was discussion about disabling SACK + * rather than TS in order to fit in better with old, + * buggy kernels, but that was deemed to be unnecessary. + */ + doing_ts &= !ireq->sack_ok; } #else *md5 = NULL; #endif - /* we can't fit any SACK blocks in a packet with MD5 + TS - options. There was discussion about disabling SACK rather than TS in - order to fit in better with old, buggy kernels, but that was deemed - to be unnecessary. */ - doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); - + /* We always send an MSS option. */ opts->mss = mss; - size += TCPOLEN_MSS_ALIGNED; + remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; opts->options |= OPTION_WSCALE; - size += TCPOLEN_WSCALE_ALIGNED; + remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(doing_ts)) { opts->options |= OPTION_TS; opts->tsval = TCP_SKB_CB(skb)->when; opts->tsecr = req->ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; + remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!doing_ts)) - size += TCPOLEN_SACKPERM_ALIGNED; + remaining -= TCPOLEN_SACKPERM_ALIGNED; } - return size; + /* Similar rationale to tcp_syn_options() applies here, too. + * If the <SYN> options fit, the same options should fit now! + */ + if (*md5 == NULL && + doing_ts && + cookie_plus > TCPOLEN_COOKIE_BASE) { + int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ + + if (0x2 & need) { + /* 32-bit multiple */ + need += 2; /* NOPs */ + } + if (need <= remaining) { + opts->options |= OPTION_COOKIE_EXTENSION; + opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; + remaining -= need; + } else { + /* There's no error return, so flag it. */ + xvp->cookie_out_never = 1; /* true */ + opts->hash_size = 0; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } /* Compute TCP options for ESTABLISHED sockets. This is not the @@ -624,7 +792,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_out_options opts; unsigned tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; - __u8 *md5_hash_location; struct tcphdr *th; int err; @@ -695,7 +862,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } } - tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + tcp_options_write((__be32 *)(th + 1), tp, &opts); if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); @@ -703,7 +870,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Calculate the MD5 hash, as we have all we need now */ if (md5) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - tp->af_specific->calc_md5_hash(md5_hash_location, + tp->af_specific->calc_md5_hash(opts.hash_location, md5, sk, NULL, skb); } #endif @@ -1923,8 +2090,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * case, when window is shrunk to zero. In this case * our retransmit serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) - && TCP_SKB_CB(skb)->seq != tp->snd_una) + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && + TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; if (skb->len > cur_mss) { @@ -2224,16 +2391,17 @@ int tcp_send_synack(struct sock *sk) /* Prepare a SYN-ACK. */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req) + struct request_sock *req, + struct request_values *rvp) { + struct tcp_out_options opts; + struct tcp_extend_values *xvp = tcp_xv(rvp); struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; - int tcp_header_size; - struct tcp_out_options opts; struct sk_buff *skb; struct tcp_md5sig_key *md5; - __u8 *md5_hash_location; + int tcp_header_size; int mss; skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); @@ -2271,8 +2439,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5) + - sizeof(struct tcphdr); + skb, &opts, &md5, xvp) + + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2289,19 +2457,58 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, */ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); + + if (OPTION_COOKIE_EXTENSION & opts.options) { + const struct tcp_cookie_values *cvp = tp->cookie_values; + + if (cvp != NULL && + cvp->s_data_constant && + cvp->s_data_desired > 0) { + u8 *buf = skb_put(skb, cvp->s_data_desired); + + /* copy data directly from the listening socket. */ + memcpy(buf, cvp->s_data_payload, cvp->s_data_desired); + TCP_SKB_CB(skb)->end_seq += cvp->s_data_desired; + } + + if (opts.hash_size > 0) { + __u32 workspace[SHA_WORKSPACE_WORDS]; + u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; + u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; + + /* Secret recipe depends on the Timestamp, (future) + * Sequence and Acknowledgment Numbers, Initiator + * Cookie, and others handled by IP variant caller. + */ + *tail-- ^= opts.tsval; + *tail-- ^= tcp_rsk(req)->rcv_isn + 1; + *tail-- ^= TCP_SKB_CB(skb)->seq + 1; + + /* recommended */ + *tail-- ^= ((th->dest << 16) | th->source); + *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ + + sha_transform((__u32 *)&xvp->cookie_bakery[0], + (char *)mess, + &workspace[0]); + opts.hash_location = + (__u8 *)&xvp->cookie_bakery[0]; + } + } + th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { - tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location, + tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, md5, NULL, req, skb); } #endif diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 7a3cc2f..bb110c5 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -95,8 +95,8 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Only update if port matches */ if ((port == 0 || ntohs(inet->inet_dport) == port || - ntohs(inet->inet_sport) == port) - && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { + ntohs(inet->inet_sport) == port) && + (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { spin_lock(&tcp_probe.lock); /* If log fills, just silently drop */ diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index e9bbff7..b612acf 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -165,9 +165,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * every other rtt. */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (veno->inc - && tp->snd_cwnd < - tp->snd_cwnd_clamp) { + if (veno->inc && + tp->snd_cwnd < tp->snd_cwnd_clamp) { tp->snd_cwnd++; veno->inc = 0; } else diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 66b6821..a0f2403 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -157,8 +157,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) if (queue > TCP_YEAH_ALPHA || rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { - if (queue > TCP_YEAH_ALPHA - && tp->snd_cwnd > yeah->reno_count) { + if (queue > TCP_YEAH_ALPHA && + tp->snd_cwnd > yeah->reno_count) { u32 reduction = min(queue / TCP_YEAH_GAMMA , tp->snd_cwnd >> TCP_YEAH_EPSILON); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4274c1c..1f95348 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -136,33 +136,67 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, struct hlist_nulls_node *node; sk_nulls_for_each(sk2, node, &hslot->head) - if (net_eq(sock_net(sk2), net) && - sk2 != sk && - (bitmap || sk2->sk_hash == num) && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if - || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + (bitmap || udp_sk(sk2)->udp_port_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (*saddr_comp)(sk, sk2)) { if (bitmap) - __set_bit(sk2->sk_hash >> log, bitmap); + __set_bit(udp_sk(sk2)->udp_port_hash >> log, + bitmap); else return 1; } return 0; } +/* + * Note: we still hold spinlock of primary hash chain, so no other writer + * can insert/delete a socket with local_port == num + */ +static int udp_lib_lport_inuse2(struct net *net, __u16 num, + struct udp_hslot *hslot2, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) +{ + struct sock *sk2; + struct hlist_nulls_node *node; + int res = 0; + + spin_lock(&hslot2->lock); + udp_portaddr_for_each_entry(sk2, node, &hslot2->head) + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + (udp_sk(sk2)->udp_port_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) { + res = 1; + break; + } + spin_unlock(&hslot2->lock); + return res; +} + /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @saddr_comp: AF-dependent comparison of bound local IP addresses + * @hash2_nulladdr: AF-dependant hash value in secondary hash chains, + * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + const struct sock *sk2), + unsigned int hash2_nulladdr) { - struct udp_hslot *hslot; + struct udp_hslot *hslot, *hslot2; struct udp_table *udptable = sk->sk_prot->h.udp_table; int error = 1; struct net *net = sock_net(sk); @@ -209,16 +243,49 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, } else { hslot = udp_hashslot(udptable, net, snum); spin_lock_bh(&hslot->lock); + if (hslot->count > 10) { + int exist; + unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; + + slot2 &= udptable->mask; + hash2_nulladdr &= udptable->mask; + + hslot2 = udp_hashslot2(udptable, slot2); + if (hslot->count < hslot2->count) + goto scan_primary_hash; + + exist = udp_lib_lport_inuse2(net, snum, hslot2, + sk, saddr_comp); + if (!exist && (hash2_nulladdr != slot2)) { + hslot2 = udp_hashslot2(udptable, hash2_nulladdr); + exist = udp_lib_lport_inuse2(net, snum, hslot2, + sk, saddr_comp); + } + if (exist) + goto fail_unlock; + else + goto found; + } +scan_primary_hash: if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp, 0)) goto fail_unlock; } found: inet_sk(sk)->inet_num = snum; - sk->sk_hash = snum; + udp_sk(sk)->udp_port_hash = snum; + udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { sk_nulls_add_node_rcu(sk, &hslot->head); + hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + spin_lock(&hslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + hslot2->count++; + spin_unlock(&hslot2->lock); } error = 0; fail_unlock: @@ -237,9 +304,22 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); } +static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, + unsigned int port) +{ + return jhash_1word(saddr, net_hash_mix(net)) ^ port; +} + int udp_v4_get_port(struct sock *sk, unsigned short snum) { - return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); + unsigned int hash2_nulladdr = + udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum); + unsigned int hash2_partial = + udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + + /* precompute partial secondary hash */ + udp_sk(sk)->udp_portaddr_hash = hash2_partial; + return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); } static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, @@ -248,7 +328,7 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, { int score = -1; - if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && !ipv6_only_sock(sk)) { struct inet_sock *inet = inet_sk(sk); @@ -277,6 +357,89 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, return score; } +/* + * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num) + */ +#define SCORE2_MAX (1 + 2 + 2 + 2) +static inline int compute_score2(struct sock *sk, struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->inet_rcv_saddr != daddr) + return -1; + if (inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 2; + } + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score += 2; + } + } + return score; +} + + +/* called with read_rcu_lock() */ +static struct sock *udp4_lib_lookup2(struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, int dif, + struct udp_hslot *hslot2, unsigned int slot2) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + int score, badness; + +begin: + result = NULL; + badness = -1; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + score = compute_score2(sk, net, saddr, sport, + daddr, hnum, dif); + if (score > badness) { + result = sk; + badness = score; + if (score == SCORE2_MAX) + goto exact_match; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot2) + goto begin; + + if (result) { +exact_match: + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score2(result, net, saddr, sport, + daddr, hnum, dif) < badness)) { + sock_put(result); + goto begin; + } + } + return result; +} + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -287,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, struct sock *sk, *result; struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); - unsigned int hash = udp_hashfn(net, hnum, udptable->mask); - struct udp_hslot *hslot = &udptable->hash[hash]; + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness; rcu_read_lock(); + if (hslot->count > 10) { + hash2 = udp4_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp4_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, + hslot2, slot2); + if (!result) { + hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp4_lib_lookup2(net, INADDR_ANY, sport, + daddr, hnum, dif, + hslot2, slot2); + } + rcu_read_unlock(); + return result; + } begin: result = NULL; badness = -1; @@ -308,7 +495,7 @@ begin: * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ - if (get_nulls_value(node) != hash) + if (get_nulls_value(node) != slot) goto begin; if (result) { @@ -358,13 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, sk_nulls_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); - if (!net_eq(sock_net(s), net) || - s->sk_hash != hnum || - (inet->inet_daddr && inet->inet_daddr != rmt_addr) || - (inet->inet_dport != rmt_port && inet->inet_dport) || - (inet->inet_rcv_saddr && - inet->inet_rcv_saddr != loc_addr) || - ipv6_only_sock(s) || + if (!net_eq(sock_net(s), net) || + udp_sk(s)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || + (inet->inet_rcv_saddr && + inet->inet_rcv_saddr != loc_addr) || + ipv6_only_sock(s) || (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) continue; if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) @@ -1005,9 +1192,7 @@ try_again: err = ulen; out_free: - lock_sock(sk); - skb_free_datagram(sk, skb); - release_sock(sk); + skb_free_datagram_locked(sk, skb); out: return err; @@ -1050,13 +1235,22 @@ void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { struct udp_table *udptable = sk->sk_prot->h.udp_table; - struct udp_hslot *hslot = udp_hashslot(udptable, sock_net(sk), - sk->sk_hash); + struct udp_hslot *hslot, *hslot2; + + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); if (sk_nulls_del_node_init_rcu(sk)) { + hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); } spin_unlock_bh(&hslot->lock); } @@ -1192,49 +1386,83 @@ drop: return -1; } + +static void flush_stack(struct sock **stack, unsigned int count, + struct sk_buff *skb, unsigned int final) +{ + unsigned int i; + struct sk_buff *skb1 = NULL; + struct sock *sk; + + for (i = 0; i < count; i++) { + sk = stack[i]; + if (likely(skb1 == NULL)) + skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); + + if (!skb1) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + } + + if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) + skb1 = NULL; + } + if (unlikely(skb1)) + kfree_skb(skb1); +} + /* * Multicasts and broadcasts go to each listener. * - * Note: called only from the BH handler context, - * so we don't need to lock the hashes. + * Note: called only from the BH handler context. */ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, struct udp_table *udptable) { - struct sock *sk; + struct sock *sk, *stack[256 / sizeof(struct sock *)]; struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); int dif; + unsigned int i, count = 0; spin_lock(&hslot->lock); sk = sk_nulls_head(&hslot->head); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { - struct sock *sknext = NULL; - - do { - struct sk_buff *skb1 = skb; - - sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, - daddr, uh->source, saddr, - dif); - if (sknext) - skb1 = skb_clone(skb, GFP_ATOMIC); - - if (skb1) { - int ret = udp_queue_rcv_skb(sk, skb1); - if (ret > 0) - /* we should probably re-process instead - * of dropping packets here. */ - kfree_skb(skb1); - } - sk = sknext; - } while (sknext); - } else - consume_skb(skb); + while (sk) { + stack[count++] = sk; + sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, + daddr, uh->source, saddr, dif); + if (unlikely(count == ARRAY_SIZE(stack))) { + if (!sk) + break; + flush_stack(stack, count, skb, ~0); + count = 0; + } + } + /* + * before releasing chain lock, we must take a reference on sockets + */ + for (i = 0; i < count; i++) + sock_hold(stack[i]); + spin_unlock(&hslot->lock); + + /* + * do the slow work with no lock held + */ + if (count) { + flush_stack(stack, count, skb, count - 1); + + for (i = 0; i < count; i++) + sock_put(stack[i]); + } else { + kfree_skb(skb); + } return 0; } @@ -1844,7 +2072,7 @@ void __init udp_table_init(struct udp_table *table, const char *name) if (!CONFIG_BASE_SMALL) table->hash = alloc_large_system_hash(name, - sizeof(struct udp_hslot), + 2 * sizeof(struct udp_hslot), uhash_entries, 21, /* one slot per 2 MB */ 0, @@ -1856,16 +2084,23 @@ void __init udp_table_init(struct udp_table *table, const char *name) */ if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * - sizeof(struct udp_hslot), GFP_KERNEL); + 2 * sizeof(struct udp_hslot), GFP_KERNEL); if (!table->hash) panic(name); table->log = ilog2(UDP_HTABLE_SIZE_MIN); table->mask = UDP_HTABLE_SIZE_MIN - 1; } + table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } + for (i = 0; i <= table->mask; i++) { + INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + table->hash2[i].count = 0; + spin_lock_init(&table->hash2[i].lock); + } } void __init udp_init(void) diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 470c504..66f7951 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -64,7 +64,6 @@ static struct inet_protosw udplite4_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, - .capability = -1, .no_check = 0, /* must checksum (RFC 3828) */ .flags = INET_PROTOSW_PERMANENT, }; |