summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/af_inet.c 10
-rw-r--r-- net/ipv4/ah4.c 2
-rw-r--r-- net/ipv4/devinet.c 158
-rw-r--r-- net/ipv4/esp4.c 2
-rw-r--r-- net/ipv4/fib_frontend.c 29
-rw-r--r-- net/ipv4/fib_rules.c 14
-rw-r--r-- net/ipv4/fib_semantics.c 4
-rw-r--r-- net/ipv4/icmp.c 9
-rw-r--r-- net/ipv4/igmp.c 50
-rw-r--r-- net/ipv4/inet_connection_sock.c 6
-rw-r--r-- net/ipv4/inet_hashtables.c 3
-rw-r--r-- net/ipv4/inet_lro.c 36
-rw-r--r-- net/ipv4/inet_timewait_sock.c 45
-rw-r--r-- net/ipv4/inetpeer.c 5
-rw-r--r-- net/ipv4/ip_fragment.c 13
-rw-r--r-- net/ipv4/ip_gre.c 56
-rw-r--r-- net/ipv4/ip_input.c 2
-rw-r--r-- net/ipv4/ip_output.c 8
-rw-r--r-- net/ipv4/ipconfig.c 13
-rw-r--r-- net/ipv4/ipip.c 58
-rw-r--r-- net/ipv4/ipmr.c 4
-rw-r--r-- net/ipv4/netfilter.c 8
-rw-r--r-- net/ipv4/netfilter/ip_queue.c 2
-rw-r--r-- net/ipv4/netfilter/nf_nat_core.c 3
-rw-r--r-- net/ipv4/netfilter/nf_nat_helper.c 34
-rw-r--r-- net/ipv4/raw.c 24
-rw-r--r-- net/ipv4/route.c 107
-rw-r--r-- net/ipv4/syncookies.c 5
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c 12
-rw-r--r-- net/ipv4/tcp.c 293
-rw-r--r-- net/ipv4/tcp_htcp.c 10
-rw-r--r-- net/ipv4/tcp_input.c 85
-rw-r--r-- net/ipv4/tcp_ipv4.c 116
-rw-r--r-- net/ipv4/tcp_lp.c 4
-rw-r--r-- net/ipv4/tcp_minisocks.c 77
-rw-r--r-- net/ipv4/tcp_output.c 307
-rw-r--r-- net/ipv4/tcp_probe.c 4
-rw-r--r-- net/ipv4/tcp_veno.c 5
-rw-r--r-- net/ipv4/tcp_yeah.c 4
-rw-r--r-- net/ipv4/udp.c 345
-rw-r--r-- net/ipv4/udplite.c 1
41 files changed, 1438 insertions, 535 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 04a14b1..7d12c6a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -262,7 +262,8 @@ static inline int inet_netns_ok(struct net *net, int protocol)
* Create an inet socket.
*/
-static int inet_create(struct net *net, struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
struct inet_protosw *answer;
@@ -325,7 +326,7 @@ lookup_protocol:
}
err = -EPERM;
- if (answer->capability > 0 && !capable(answer->capability))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
err = -EAFNOSUPPORT;
@@ -685,7 +686,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
sin->sin_family = AF_INET;
if (peer) {
@@ -947,7 +948,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
- .capability = -1,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
@@ -958,7 +958,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
- .capability = -1,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
@@ -969,7 +968,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
- .capability = CAP_NET_RAW,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index d07b0c1..7ed3e4a 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -444,7 +444,7 @@ static int ah_init_state(struct xfrm_state *x)
}
ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
- ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+ ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 5df2f6a..e312661 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -140,11 +140,11 @@ void in_dev_finish_destroy(struct in_device *idev)
#endif
dev_put(dev);
if (!idev->dead)
- printk("Freeing alive in_device %p\n", idev);
- else {
+ pr_err("Freeing alive in_device %p\n", idev);
+ else
kfree(idev);
- }
}
+EXPORT_SYMBOL(in_dev_finish_destroy);
static struct in_device *inetdev_init(struct net_device *dev)
{
@@ -159,7 +159,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
sizeof(in_dev->cnf));
in_dev->cnf.sysctl = NULL;
in_dev->dev = dev;
- if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+ in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+ if (!in_dev->arp_parms)
goto out_kfree;
if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
dev_disable_lro(dev);
@@ -405,13 +406,15 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
struct in_device *in_dev = NULL;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_index(net, ifindex);
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
in_dev = in_dev_get(dev);
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
return in_dev;
}
+EXPORT_SYMBOL(inetdev_by_index);
/* Called only from RTNL semaphored context. No locks. */
@@ -557,7 +560,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
* Determine a default network mask, based on the IP address.
*/
-static __inline__ int inet_abc_len(__be32 addr)
+static inline int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
@@ -646,13 +649,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
rtnl_lock();
ret = -ENODEV;
- if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL)
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev)
goto done;
if (colon)
*colon = ':';
- if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
if (tryaddrmatch) {
/* Matthias Andree */
/* compare label and address (4.4BSD style) */
@@ -720,7 +725,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!ifa) {
ret = -ENOBUFS;
- if ((ifa = inet_alloc_ifa()) == NULL)
+ ifa = inet_alloc_ifa();
+ if (!ifa)
break;
if (colon)
memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
@@ -822,10 +828,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
struct ifreq ifr;
int done = 0;
- if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+ if (!in_dev)
goto out;
- for (; ifa; ifa = ifa->ifa_next) {
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
if (!buf) {
done += sizeof(ifr);
continue;
@@ -875,36 +881,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
if (!addr)
addr = ifa->ifa_local;
} endfor_ifa(in_dev);
-no_in_dev:
- rcu_read_unlock();
if (addr)
- goto out;
+ goto out_unlock;
+no_in_dev:
/* Non-loopback addresses on loopback should be preferred
in this case. It is important that lo is the first interface
in the dev_base list.
*/
- read_lock(&dev_base_lock);
- rcu_read_lock();
- for_each_netdev(net, dev) {
- if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
continue;
for_primary_ifa(in_dev) {
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope) {
addr = ifa->ifa_local;
- goto out_unlock_both;
+ goto out_unlock;
}
} endfor_ifa(in_dev);
}
-out_unlock_both:
- read_unlock(&dev_base_lock);
+out_unlock:
rcu_read_unlock();
-out:
return addr;
}
+EXPORT_SYMBOL(inet_select_addr);
static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
__be32 local, int scope)
@@ -940,7 +943,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
}
} endfor_ifa(in_dev);
- return same? addr : 0;
+ return same ? addr : 0;
}
/*
@@ -961,17 +964,16 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
return confirm_addr_indev(in_dev, dst, local, scope);
net = dev_net(in_dev->dev);
- read_lock(&dev_base_lock);
rcu_read_lock();
- for_each_netdev(net, dev) {
- if ((in_dev = __in_dev_get_rcu(dev))) {
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
addr = confirm_addr_indev(in_dev, dst, local, scope);
if (addr)
break;
}
}
rcu_read_unlock();
- read_unlock(&dev_base_lock);
return addr;
}
@@ -984,14 +986,16 @@ int register_inetaddr_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&inetaddr_chain, nb);
}
+EXPORT_SYMBOL(register_inetaddr_notifier);
int unregister_inetaddr_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
-/* Rename ifa_labels for a device name change. Make some effort to preserve existing
- * alias numbering and to create unique labels if possible.
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
*/
static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
{
@@ -1010,11 +1014,10 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
sprintf(old, ":%d", named);
dot = old;
}
- if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
+ if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
strcat(ifa->ifa_label, dot);
- } else {
+ else
strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
- }
skip:
rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
}
@@ -1061,8 +1064,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (!inetdev_valid_mtu(dev->mtu))
break;
if (dev->flags & IFF_LOOPBACK) {
- struct in_ifaddr *ifa;
- if ((ifa = inet_alloc_ifa()) != NULL) {
+ struct in_ifaddr *ifa = inet_alloc_ifa();
+
+ if (ifa) {
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
@@ -1170,38 +1174,54 @@ nla_put_failure:
static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- int idx, ip_idx;
+ int h, s_h;
+ int idx, s_idx;
+ int ip_idx, s_ip_idx;
struct net_device *dev;
struct in_device *in_dev;
struct in_ifaddr *ifa;
- int s_ip_idx, s_idx = cb->args[0];
+ struct hlist_head *head;
+ struct hlist_node *node;
- s_ip_idx = ip_idx = cb->args[1];
- idx = 0;
- for_each_netdev(net, dev) {
- if (idx < s_idx)
- goto cont;
- if (idx > s_idx)
- s_ip_idx = 0;
- if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
- goto cont;
-
- for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
- ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ s_ip_idx = ip_idx = cb->args[2];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ if (idx > s_idx)
+ s_ip_idx = 0;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ if (inet_fill_ifaddr(skb, ifa,
+ NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
- RTM_NEWADDR, NLM_F_MULTI) <= 0)
- goto done;
- }
+ RTM_NEWADDR, NLM_F_MULTI) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ }
cont:
- idx++;
+ idx++;
+ }
+ rcu_read_unlock();
}
done:
- cb->args[0] = idx;
- cb->args[1] = ip_idx;
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = ip_idx;
return skb->len;
}
@@ -1239,18 +1259,18 @@ static void devinet_copy_dflt_conf(struct net *net, int i)
{
struct net_device *dev;
- read_lock(&dev_base_lock);
- for_each_netdev(net, dev) {
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
struct in_device *in_dev;
- rcu_read_lock();
+
in_dev = __in_dev_get_rcu(dev);
if (in_dev && !test_bit(i, in_dev->cnf.state))
in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
- rcu_read_unlock();
}
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
+/* called with RTNL locked */
static void inet_forward_change(struct net *net)
{
struct net_device *dev;
@@ -1259,7 +1279,6 @@ static void inet_forward_change(struct net *net)
IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
- read_lock(&dev_base_lock);
for_each_netdev(net, dev) {
struct in_device *in_dev;
if (on)
@@ -1270,7 +1289,6 @@ static void inet_forward_change(struct net *net)
IN_DEV_CONF_SET(in_dev, FORWARDING, on);
rcu_read_unlock();
}
- read_unlock(&dev_base_lock);
}
static int devinet_conf_proc(ctl_table *ctl, int write,
@@ -1450,6 +1468,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
"accept_source_route"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
@@ -1587,7 +1606,7 @@ static __net_init int devinet_init_net(struct net *net)
all = &ipv4_devconf;
dflt = &ipv4_devconf_dflt;
- if (net != &init_net) {
+ if (!net_eq(net, &init_net)) {
all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
if (all == NULL)
goto err_alloc_all;
@@ -1680,8 +1699,3 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
}
-EXPORT_SYMBOL(in_dev_finish_destroy);
-EXPORT_SYMBOL(inet_select_addr);
-EXPORT_SYMBOL(inetdev_by_index);
-EXPORT_SYMBOL(register_inetaddr_notifier);
-EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 12f7287..1948895 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -530,7 +530,7 @@ static int esp_init_authenc(struct xfrm_state *x)
}
err = crypto_aead_setauthsize(
- aead, aalg_desc->uinfo.auth.icv_truncbits / 8);
+ aead, x->aalg->alg_trunc_len / 8);
if (err)
goto free_key;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f73dbed..3323168 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -229,25 +229,29 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
*/
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
- struct net_device *dev, __be32 *spec_dst, u32 *itag)
+ struct net_device *dev, __be32 *spec_dst,
+ u32 *itag, u32 mark)
{
struct in_device *in_dev;
struct flowi fl = { .nl_u = { .ip4_u =
{ .daddr = src,
.saddr = dst,
.tos = tos } },
+ .mark = mark,
.iif = oif };
+
struct fib_result res;
- int no_addr, rpf;
+ int no_addr, rpf, accept_local;
int ret;
struct net *net;
- no_addr = rpf = 0;
+ no_addr = rpf = accept_local = 0;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
no_addr = in_dev->ifa_list == NULL;
rpf = IN_DEV_RPFILTER(in_dev);
+ accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
}
rcu_read_unlock();
@@ -257,8 +261,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
net = dev_net(dev);
if (fib_lookup(net, &fl, &res))
goto last_resort;
- if (res.type != RTN_UNICAST)
- goto e_inval_res;
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL || !accept_local)
+ goto e_inval_res;
+ }
*spec_dst = FIB_RES_PREFSRC(res);
fib_combine_itag(itag, &res);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -892,11 +898,11 @@ static void nl_fib_lookup_exit(struct net *net)
net->ipv4.fibnl = NULL;
}
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
if (fib_sync_down_dev(dev, force))
fib_flush(dev_net(dev));
- rt_cache_flush(dev_net(dev), 0);
+ rt_cache_flush(dev_net(dev), delay);
arp_ifdown(dev);
}
@@ -919,7 +925,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
/* Last address was deleted from this interface.
Disable IP.
*/
- fib_disable_ip(dev, 1);
+ fib_disable_ip(dev, 1, 0);
} else {
rt_cache_flush(dev_net(dev), -1);
}
@@ -934,7 +940,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (event == NETDEV_UNREGISTER) {
- fib_disable_ip(dev, 2);
+ fib_disable_ip(dev, 2, -1);
return NOTIFY_DONE;
}
@@ -952,12 +958,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
rt_cache_flush(dev_net(dev), -1);
break;
case NETDEV_DOWN:
- fib_disable_ip(dev, 0);
+ fib_disable_ip(dev, 0, 0);
break;
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
rt_cache_flush(dev_net(dev), 0);
break;
+ case NETDEV_UNREGISTER_BATCH:
+ rt_cache_flush_batch();
+ break;
}
return NOTIFY_DONE;
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 835262c..ca2d07b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -284,7 +284,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops)
{
int err;
- err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT);
+ err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
if (err < 0)
return err;
err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
@@ -301,13 +301,9 @@ int __net_init fib4_rules_init(struct net *net)
int err;
struct fib_rules_ops *ops;
- ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL);
- if (ops == NULL)
- return -ENOMEM;
- INIT_LIST_HEAD(&ops->rules_list);
- ops->fro_net = net;
-
- fib_rules_register(ops);
+ ops = fib_rules_register(&fib4_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
err = fib_default_rules_init(ops);
if (err < 0)
@@ -318,12 +314,10 @@ int __net_init fib4_rules_init(struct net *net)
fail:
/* also cleans all rules already added */
fib_rules_unregister(ops);
- kfree(ops);
return err;
}
void __net_exit fib4_rules_exit(struct net *net)
{
fib_rules_unregister(net->ipv4.rules_ops);
- kfree(net->ipv4.rules_ops);
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9b096d6..ed19aa6 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -228,7 +228,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
head = &fib_info_hash[hash];
hlist_for_each_entry(fi, node, head, fib_hash) {
- if (fi->fib_net != nfi->fib_net)
+ if (!net_eq(fi->fib_net, nfi->fib_net))
continue;
if (fi->fib_nhs != nfi->fib_nhs)
continue;
@@ -1047,7 +1047,7 @@ int fib_sync_down_addr(struct net *net, __be32 local)
return 0;
hlist_for_each_entry(fi, node, head, fib_lhash) {
- if (fi->fib_net != net)
+ if (!net_eq(fi->fib_net, net))
continue;
if (fi->fib_prefsrc == local) {
fi->fib_flags |= RTNH_F_DEAD;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 84adb57..fe11f60 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -501,15 +501,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
if (!(rt->rt_flags & RTCF_LOCAL)) {
struct net_device *dev = NULL;
+ rcu_read_lock();
if (rt->fl.iif &&
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
- dev = dev_get_by_index(net, rt->fl.iif);
+ dev = dev_get_by_index_rcu(net, rt->fl.iif);
- if (dev) {
+ if (dev)
saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- dev_put(dev);
- } else
+ else
saddr = 0;
+ rcu_read_unlock();
}
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d41e5de..76c0840 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1899,8 +1899,9 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
err = -EADDRNOTAVAIL;
for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
- if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
- && pmc->multi.imr_ifindex == imr.imr_ifindex)
+ if ((pmc->multi.imr_multiaddr.s_addr ==
+ imr.imr_multiaddr.s_addr) &&
+ (pmc->multi.imr_ifindex == imr.imr_ifindex))
break;
}
if (!pmc) { /* must have a prior join */
@@ -2311,9 +2312,10 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
state->in_dev = NULL;
- for_each_netdev(net, state->dev) {
+ for_each_netdev_rcu(net, state->dev) {
struct in_device *in_dev;
- in_dev = in_dev_get(state->dev);
+
+ in_dev = __in_dev_get_rcu(state->dev);
if (!in_dev)
continue;
read_lock(&in_dev->mc_list_lock);
@@ -2323,7 +2325,6 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
break;
}
read_unlock(&in_dev->mc_list_lock);
- in_dev_put(in_dev);
}
return im;
}
@@ -2333,16 +2334,15 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
im = im->next;
while (!im) {
- if (likely(state->in_dev != NULL)) {
+ if (likely(state->in_dev != NULL))
read_unlock(&state->in_dev->mc_list_lock);
- in_dev_put(state->in_dev);
- }
- state->dev = next_net_device(state->dev);
+
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->in_dev = NULL;
break;
}
- state->in_dev = in_dev_get(state->dev);
+ state->in_dev = __in_dev_get_rcu(state->dev);
if (!state->in_dev)
continue;
read_lock(&state->in_dev->mc_list_lock);
@@ -2361,9 +2361,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(dev_base_lock)
+ __acquires(rcu)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2379,16 +2379,15 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
- __releases(dev_base_lock)
+ __releases(rcu)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
if (likely(state->in_dev != NULL)) {
read_unlock(&state->in_dev->mc_list_lock);
- in_dev_put(state->in_dev);
state->in_dev = NULL;
}
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp_mc_seq_show(struct seq_file *seq, void *v)
@@ -2462,9 +2461,9 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
state->idev = NULL;
state->im = NULL;
- for_each_netdev(net, state->dev) {
+ for_each_netdev_rcu(net, state->dev) {
struct in_device *idev;
- idev = in_dev_get(state->dev);
+ idev = __in_dev_get_rcu(state->dev);
if (unlikely(idev == NULL))
continue;
read_lock(&idev->mc_list_lock);
@@ -2480,7 +2479,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
spin_unlock_bh(&im->lock);
}
read_unlock(&idev->mc_list_lock);
- in_dev_put(idev);
}
return psf;
}
@@ -2494,16 +2492,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
spin_unlock_bh(&state->im->lock);
state->im = state->im->next;
while (!state->im) {
- if (likely(state->idev != NULL)) {
+ if (likely(state->idev != NULL))
read_unlock(&state->idev->mc_list_lock);
- in_dev_put(state->idev);
- }
- state->dev = next_net_device(state->dev);
+
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
goto out;
}
- state->idev = in_dev_get(state->dev);
+ state->idev = __in_dev_get_rcu(state->dev);
if (!state->idev)
continue;
read_lock(&state->idev->mc_list_lock);
@@ -2528,8 +2525,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2545,6 +2543,7 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
{
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (likely(state->im != NULL)) {
@@ -2553,11 +2552,10 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
}
if (likely(state->idev != NULL)) {
read_unlock(&state->idev->mc_list_lock);
- in_dev_put(state->idev);
state->idev = NULL;
}
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 26fb50e..ee16475 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -112,7 +112,7 @@ again:
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
- if (ib_net(tb) == net && tb->port == rover) {
+ if (net_eq(ib_net(tb), net) && tb->port == rover) {
if (tb->fastreuse > 0 &&
sk->sk_reuse &&
sk->sk_state != TCP_LISTEN &&
@@ -158,7 +158,7 @@ have_snum:
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
- if (ib_net(tb) == net && tb->port == snum)
+ if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;
}
tb = NULL;
@@ -531,7 +531,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
&expire, &resend);
if (!expire &&
(!resend ||
- !req->rsk_ops->rtx_syn_ack(parent, req) ||
+ !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
inet_rsk(req)->acked)) {
unsigned long timeo;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 47ad7aa..94ef51a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -454,7 +454,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* unique enough.
*/
inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (ib_net(tb) == net && tb->port == port) {
+ if (net_eq(ib_net(tb), net) &&
+ tb->port == port) {
if (tb->fastreuse >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 6a667da..47038cb 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -64,15 +64,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
if (iph->ihl != IPH_LEN_WO_OPTIONS)
return -1;
- if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack
- || tcph->rst || tcph->syn || tcph->fin)
+ if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
+ tcph->rst || tcph->syn || tcph->fin)
return -1;
if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
return -1;
- if (tcph->doff != TCPH_LEN_WO_OPTIONS
- && tcph->doff != TCPH_LEN_W_TIMESTAMP)
+ if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
+ tcph->doff != TCPH_LEN_W_TIMESTAMP)
return -1;
/* check tcp options (only timestamp allowed) */
@@ -262,10 +262,10 @@ static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
struct iphdr *iph,
struct tcphdr *tcph)
{
- if ((lro_desc->iph->saddr != iph->saddr)
- || (lro_desc->iph->daddr != iph->daddr)
- || (lro_desc->tcph->source != tcph->source)
- || (lro_desc->tcph->dest != tcph->dest))
+ if ((lro_desc->iph->saddr != iph->saddr) ||
+ (lro_desc->iph->daddr != iph->daddr) ||
+ (lro_desc->tcph->source != tcph->source) ||
+ (lro_desc->tcph->dest != tcph->dest))
return -1;
return 0;
}
@@ -339,9 +339,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
u64 flags;
int vlan_hdr_len = 0;
- if (!lro_mgr->get_skb_header
- || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
- &flags, priv))
+ if (!lro_mgr->get_skb_header ||
+ lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
+ &flags, priv))
goto out;
if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
@@ -351,8 +351,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
if (!lro_desc)
goto out;
- if ((skb->protocol == htons(ETH_P_8021Q))
- && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+ if ((skb->protocol == htons(ETH_P_8021Q)) &&
+ !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
vlan_hdr_len = VLAN_HLEN;
if (!lro_desc->active) { /* start new lro session */
@@ -446,9 +446,9 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
int hdr_len = LRO_MAX_PG_HLEN;
int vlan_hdr_len = 0;
- if (!lro_mgr->get_frag_header
- || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
- (void *)&tcph, &flags, priv)) {
+ if (!lro_mgr->get_frag_header ||
+ lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
+ (void *)&tcph, &flags, priv)) {
mac_hdr = page_address(frags->page) + frags->page_offset;
goto out1;
}
@@ -472,8 +472,8 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
if (!skb)
goto out;
- if ((skb->protocol == htons(ETH_P_8021Q))
- && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+ if ((skb->protocol == htons(ETH_P_8021Q)) &&
+ !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
vlan_hdr_len = VLAN_HLEN;
iph = (void *)(skb->data + vlan_hdr_len);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1f5d508..31f931e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -421,37 +421,46 @@ out:
EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
-void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family)
{
struct inet_timewait_sock *tw;
struct sock *sk;
struct hlist_nulls_node *node;
- int h;
+ unsigned int slot;
- local_bh_disable();
- for (h = 0; h <= hashinfo->ehash_mask; h++) {
- struct inet_ehash_bucket *head =
- inet_ehash_bucket(hashinfo, h);
- spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
+ for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+ rcu_read_lock();
restart:
- spin_lock(lock);
- sk_nulls_for_each(sk, node, &head->twchain) {
-
+ sk_nulls_for_each_rcu(sk, node, &head->twchain) {
tw = inet_twsk(sk);
- if (!net_eq(twsk_net(tw), net) ||
- tw->tw_family != family)
+ if ((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))
+ continue;
+
+ if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
continue;
- atomic_inc(&tw->tw_refcnt);
- spin_unlock(lock);
+ if (unlikely((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))) {
+ inet_twsk_put(tw);
+ goto restart;
+ }
+
+ rcu_read_unlock();
inet_twsk_deschedule(tw, twdr);
inet_twsk_put(tw);
-
- goto restart;
+ goto restart_rcu;
}
- spin_unlock(lock);
+ /* If the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto restart;
+ rcu_read_unlock();
}
- local_bh_enable();
}
EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index b1fbe18..6bcfe52 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -67,9 +67,6 @@
* ip_id_count: idlock
*/
-/* Exported for inet_getid inline function. */
-DEFINE_SPINLOCK(inet_peer_idlock);
-
static struct kmem_cache *peer_cachep __read_mostly;
#define node_height(x) x->avl_height
@@ -390,7 +387,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
n->v4daddr = daddr;
atomic_set(&n->refcnt, 1);
atomic_set(&n->rid, 0);
- n->ip_id_count = secure_ip_id(daddr);
+ atomic_set(&n->ip_id_count, secure_ip_id(daddr));
n->tcp_ts_stamp = 0;
write_lock_bh(&peer_pool_lock);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 575f9bd..c473531 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -206,10 +206,11 @@ static void ip_expire(unsigned long arg)
struct sk_buff *head = qp->q.fragments;
/* Send an ICMP "Fragment Reassembly Timeout" message. */
- if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) {
+ rcu_read_lock();
+ head->dev = dev_get_by_index_rcu(net, qp->iif);
+ if (head->dev)
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- dev_put(head->dev);
- }
+ rcu_read_unlock();
}
out:
spin_unlock(&qp->q.lock);
@@ -563,7 +564,7 @@ out_oversize:
printk(KERN_INFO "Oversized IP packet from %pI4.\n",
&qp->saddr);
out_fail:
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
return err;
}
@@ -657,7 +658,7 @@ static int ip4_frags_ns_ctl_register(struct net *net)
struct ctl_table_header *hdr;
table = ip4_frags_ns_ctl_table;
- if (net != &init_net) {
+ if (!net_eq(net, &init_net)) {
table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
if (table == NULL)
goto err_alloc;
@@ -675,7 +676,7 @@ static int ip4_frags_ns_ctl_register(struct net *net)
return 0;
err_reg:
- if (net != &init_net)
+ if (!net_eq(net, &init_net))
kfree(table);
err_alloc:
return -ENOMEM;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index a77807d..f36ce15 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -125,7 +125,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
#define HASH_SIZE 16
-static int ipgre_net_id;
+static int ipgre_net_id __read_mostly;
struct ipgre_net {
struct ip_tunnel *tunnels[4][HASH_SIZE];
@@ -1309,17 +1309,8 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
static int ipgre_init_net(struct net *net)
{
+ struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int err;
- struct ipgre_net *ign;
-
- err = -ENOMEM;
- ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
- if (ign == NULL)
- goto err_alloc;
-
- err = net_assign_generic(net, ipgre_net_id, ign);
- if (err < 0)
- goto err_assign;
ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
ipgre_tunnel_setup);
@@ -1340,10 +1331,6 @@ static int ipgre_init_net(struct net *net)
err_reg_dev:
free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
- /* nothing */
-err_assign:
- kfree(ign);
-err_alloc:
return err;
}
@@ -1357,12 +1344,13 @@ static void ipgre_exit_net(struct net *net)
ipgre_destroy_tunnels(ign, &list);
unregister_netdevice_many(&list);
rtnl_unlock();
- kfree(ign);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
.exit = ipgre_exit_net,
+ .id = &ipgre_net_id,
+ .size = sizeof(struct ipgre_net),
};
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1476,14 +1464,14 @@ static void ipgre_tap_setup(struct net_device *dev)
ether_setup(dev);
- dev->netdev_ops = &ipgre_netdev_ops;
+ dev->netdev_ops = &ipgre_tap_netdev_ops;
dev->destructor = free_netdev;
dev->iflink = 0;
dev->features |= NETIF_F_NETNS_LOCAL;
}
-static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
+static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
struct ip_tunnel *nt;
@@ -1537,25 +1525,29 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
if (t->dev != dev)
return -EEXIST;
} else {
- unsigned nflags = 0;
-
t = nt;
- if (ipv4_is_multicast(p.iph.daddr))
- nflags = IFF_BROADCAST;
- else if (p.iph.daddr)
- nflags = IFF_POINTOPOINT;
+ if (dev->type != ARPHRD_ETHER) {
+ unsigned nflags = 0;
- if ((dev->flags ^ nflags) &
- (IFF_POINTOPOINT | IFF_BROADCAST))
- return -EINVAL;
+ if (ipv4_is_multicast(p.iph.daddr))
+ nflags = IFF_BROADCAST;
+ else if (p.iph.daddr)
+ nflags = IFF_POINTOPOINT;
+
+ if ((dev->flags ^ nflags) &
+ (IFF_POINTOPOINT | IFF_BROADCAST))
+ return -EINVAL;
+ }
ipgre_tunnel_unlink(ign, t);
t->parms.iph.saddr = p.iph.saddr;
t->parms.iph.daddr = p.iph.daddr;
t->parms.i_key = p.i_key;
- memcpy(dev->dev_addr, &p.iph.saddr, 4);
- memcpy(dev->broadcast, &p.iph.daddr, 4);
+ if (dev->type != ARPHRD_ETHER) {
+ memcpy(dev->dev_addr, &p.iph.saddr, 4);
+ memcpy(dev->broadcast, &p.iph.daddr, 4);
+ }
ipgre_tunnel_link(ign, t);
netdev_state_change(dev);
}
@@ -1678,7 +1670,7 @@ static int __init ipgre_init(void)
return -EAGAIN;
}
- err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
+ err = register_pernet_device(&ipgre_net_ops);
if (err < 0)
goto gen_device_failed;
@@ -1696,7 +1688,7 @@ out:
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
+ unregister_pernet_device(&ipgre_net_ops);
gen_device_failed:
inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
goto out;
@@ -1706,7 +1698,7 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
+ unregister_pernet_device(&ipgre_net_ops);
if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
printk(KERN_INFO "ipgre close: can't remove protocol\n");
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fdf51ba..c29de98 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -164,7 +164,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
- sock_net(sk) == dev_net(dev)) {
+ net_eq(sock_net(sk), dev_net(dev))) {
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) {
read_unlock(&ip_ra_lock);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 322b408..e34013a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -264,9 +264,11 @@ int ip_mc_output(struct sk_buff *skb)
This check is duplicated in ip_mr_input at the moment.
*/
- && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
+ &&
+ ((rt->rt_flags & RTCF_LOCAL) ||
+ !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
- ) {
+ ) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
@@ -501,8 +503,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
- truesizes += frag->truesize;
}
+ truesizes += frag->truesize;
}
/* Everything is OK. Generate! */
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f8d04c2..4e08b7f 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1172,10 +1172,9 @@ static int __init ic_dynamic(void)
schedule_timeout_uninterruptible(1);
#ifdef IPCONFIG_DHCP
/* DHCP isn't done until we get a DHCPACK. */
- if ((ic_got_reply & IC_BOOTP)
- && (ic_proto_enabled & IC_USE_DHCP)
- && ic_dhcp_msgtype != DHCPACK)
- {
+ if ((ic_got_reply & IC_BOOTP) &&
+ (ic_proto_enabled & IC_USE_DHCP) &&
+ ic_dhcp_msgtype != DHCPACK) {
ic_got_reply = 0;
printk(",");
continue;
@@ -1344,9 +1343,9 @@ static int __init ip_auto_config(void)
*/
if (ic_myaddr == NONE ||
#ifdef CONFIG_ROOT_NFS
- (root_server_addr == NONE
- && ic_servaddr == NONE
- && ROOT_DEV == Root_NFS) ||
+ (root_server_addr == NONE &&
+ ic_servaddr == NONE &&
+ ROOT_DEV == Root_NFS) ||
#endif
ic_first_dev->next) {
#ifdef IPCONFIG_DYNAMIC
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index a2ca53d..eda04fe 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -119,7 +119,7 @@
#define HASH_SIZE 16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
-static int ipip_net_id;
+static int ipip_net_id __read_mostly;
struct ipip_net {
struct ip_tunnel *tunnels_r_l[HASH_SIZE];
struct ip_tunnel *tunnels_r[HASH_SIZE];
@@ -446,25 +446,27 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_error;
}
- if (tiph->frag_off)
+ df |= old_iph->frag_off & htons(IP_DF);
+
+ if (df) {
mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
- else
- mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
- if (mtu < 68) {
- stats->collisions++;
- ip_rt_put(rt);
- goto tx_error;
- }
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+ if (mtu < 68) {
+ stats->collisions++;
+ ip_rt_put(rt);
+ goto tx_error;
+ }
- df |= (old_iph->frag_off&htons(IP_DF));
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
- if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
+ if ((old_iph->frag_off & htons(IP_DF)) &&
+ mtu < ntohs(old_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ ip_rt_put(rt);
+ goto tx_error;
+ }
}
if (tunnel->err_count > 0) {
@@ -773,17 +775,8 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
static int ipip_init_net(struct net *net)
{
+ struct ipip_net *ipn = net_generic(net, ipip_net_id);
int err;
- struct ipip_net *ipn;
-
- err = -ENOMEM;
- ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
- if (ipn == NULL)
- goto err_alloc;
-
- err = net_assign_generic(net, ipip_net_id, ipn);
- if (err < 0)
- goto err_assign;
ipn->tunnels[0] = ipn->tunnels_wc;
ipn->tunnels[1] = ipn->tunnels_l;
@@ -810,29 +803,26 @@ err_reg_dev:
free_netdev(ipn->fb_tunnel_dev);
err_alloc_dev:
/* nothing */
-err_assign:
- kfree(ipn);
-err_alloc:
return err;
}
static void ipip_exit_net(struct net *net)
{
- struct ipip_net *ipn;
+ struct ipip_net *ipn = net_generic(net, ipip_net_id);
LIST_HEAD(list);
- ipn = net_generic(net, ipip_net_id);
rtnl_lock();
ipip_destroy_tunnels(ipn, &list);
unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
unregister_netdevice_many(&list);
rtnl_unlock();
- kfree(ipn);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
.exit = ipip_exit_net,
+ .id = &ipip_net_id,
+ .size = sizeof(struct ipip_net),
};
static int __init ipip_init(void)
@@ -846,7 +836,7 @@ static int __init ipip_init(void)
return -EAGAIN;
}
- err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
+ err = register_pernet_device(&ipip_net_ops);
if (err)
xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
@@ -858,7 +848,7 @@ static void __exit ipip_fini(void)
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
printk(KERN_INFO "ipip close: can't deregister tunnel\n");
- unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops);
+ unregister_pernet_device(&ipip_net_ops);
}
module_init(ipip_init);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ef4ee45..54596f7 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -494,8 +494,10 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
return -EINVAL;
}
- if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
+ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
+ dev_put(dev);
return -EADDRNOTAVAIL;
+ }
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
ip_rt_multicast_event(in_dev);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 1725dc0..f53cb8d 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -155,10 +155,10 @@ static int nf_ip_reroute(struct sk_buff *skb,
if (entry->hook == NF_INET_LOCAL_OUT) {
const struct iphdr *iph = ip_hdr(skb);
- if (!(iph->tos == rt_info->tos
- && skb->mark == rt_info->mark
- && iph->daddr == rt_info->daddr
- && iph->saddr == rt_info->saddr))
+ if (!(iph->tos == rt_info->tos &&
+ skb->mark == rt_info->mark &&
+ iph->daddr == rt_info->daddr &&
+ iph->saddr == rt_info->saddr))
return ip_route_me_harder(skb, RTN_UNSPEC);
}
return 0;
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 9f07870..49ad447 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -499,7 +499,7 @@ ipq_rcv_nl_event(struct notifier_block *this,
if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
write_lock_bh(&queue_lock);
- if ((n->net == &init_net) && (n->pid == peer_pid))
+ if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
__ipq_reset();
write_unlock_bh(&queue_lock);
}
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 68afc6e..fe1a644 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -750,6 +750,8 @@ static int __init nf_nat_init(void)
BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
rcu_assign_pointer(nfnetlink_parse_nat_setup_hook,
nfnetlink_parse_nat_setup);
+ BUG_ON(nf_ct_nat_offset != NULL);
+ rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset);
return 0;
cleanup_extend:
@@ -764,6 +766,7 @@ static void __exit nf_nat_cleanup(void)
nf_ct_extend_unregister(&nat_extend);
rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL);
+ rcu_assign_pointer(nf_ct_nat_offset, NULL);
synchronize_net();
}
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 5bf6a92..7f10a6b 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -69,6 +69,28 @@ adjust_tcp_sequence(u32 seq,
DUMP_OFFSET(this_way);
}
+/* Report, for conntrack, the sequence offset recorded for direction @dir
+ * of @ct.  Returns 0 when nfct_nat() finds no NAT data for the conntrack;
+ * otherwise returns offset_after if @seq is after correction_pos, and
+ * offset_before if it is not.
+ */
+s16 nf_nat_get_offset(const struct nf_conn *ct,
+		      enum ip_conntrack_dir dir,
+		      u32 seq)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	struct nf_nat_seq *adj;
+	s16 result;
+
+	if (nat == NULL)
+		return 0;
+
+	adj = &nat->seq[dir];
+
+	/* Choose the offset while holding nf_nat_seqofs_lock. */
+	spin_lock_bh(&nf_nat_seqofs_lock);
+	if (after(seq, adj->correction_pos))
+		result = adj->offset_after;
+	else
+		result = adj->offset_before;
+	spin_unlock_bh(&nf_nat_seqofs_lock);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(nf_nat_get_offset);
+
/* Frobs data inside this packet, which is linear. */
static void mangle_contents(struct sk_buff *skb,
unsigned int dataoff,
@@ -185,11 +207,6 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
adjust_tcp_sequence(ntohl(tcph->seq),
(int)rep_len - (int)match_len,
ct, ctinfo);
- /* Tell TCP window tracking about seq change */
- nf_conntrack_tcp_update(skb, ip_hdrlen(skb),
- ct, CTINFO2DIR(ctinfo),
- (int)rep_len - (int)match_len);
-
nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
}
return 1;
@@ -411,12 +428,7 @@ nf_nat_seq_adjust(struct sk_buff *skb,
tcph->seq = newseq;
tcph->ack_seq = newack;
- if (!nf_nat_sack_adjust(skb, tcph, ct, ctinfo))
- return 0;
-
- nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, dir, seqoff);
-
- return 1;
+ return nf_nat_sack_adjust(skb, tcph, ct, ctinfo);
}
/* Setup NAT on this expected conntrack so it follows master. */
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 9ef8c08..ce154b4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -351,13 +351,24 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
skb->ip_summed = CHECKSUM_NONE;
skb->transport_header = skb->network_header;
- err = memcpy_fromiovecend((void *)iph, from, 0, length);
- if (err)
- goto error_fault;
+ err = -EFAULT;
+ if (memcpy_fromiovecend((void *)iph, from, 0, length))
+ goto error_free;
- /* We don't modify invalid header */
iphlen = iph->ihl * 4;
- if (iphlen >= sizeof(*iph) && iphlen <= length) {
+
+ /*
+ * We don't want to modify the ip header, but we do need to
+ * be sure that it won't cause problems later along the network
+ * stack. Specifically we want to make sure that iph->ihl is a
+ * sane value. If ihl points beyond the length of the buffer passed
+ * in, reject the frame as invalid
+ */
+ err = -EINVAL;
+ if (iphlen > length)
+ goto error_free;
+
+ if (iphlen >= sizeof(*iph)) {
if (!iph->saddr)
iph->saddr = rt->rt_src;
iph->check = 0;
@@ -380,8 +391,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
out:
return 0;
-error_fault:
- err = -EFAULT;
+error_free:
kfree_skb(skb);
error:
IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 68fb227..90cdcfc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -513,43 +513,42 @@ static const struct file_operations rt_cpu_seq_fops = {
};
#ifdef CONFIG_NET_CLS_ROUTE
-static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
- int length, int *eof, void *data)
-{
- unsigned int i;
-
- if ((offset & 3) || (length & 3))
- return -EIO;
-
- if (offset >= sizeof(struct ip_rt_acct) * 256) {
- *eof = 1;
- return 0;
- }
-
- if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
- length = sizeof(struct ip_rt_acct) * 256 - offset;
- *eof = 1;
+static int rt_acct_proc_show(struct seq_file *m, void *v)
+{
+ struct ip_rt_acct *dst, *src;
+ unsigned int i, j;
+
+ dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
+ if (!dst)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
+ for (j = 0; j < 256; j++) {
+ dst[j].o_bytes += src[j].o_bytes;
+ dst[j].o_packets += src[j].o_packets;
+ dst[j].i_bytes += src[j].i_bytes;
+ dst[j].i_packets += src[j].i_packets;
+ }
}
- offset /= sizeof(u32);
-
- if (length > 0) {
- u32 *dst = (u32 *) buffer;
-
- *start = buffer;
- memset(dst, 0, length);
-
- for_each_possible_cpu(i) {
- unsigned int j;
- u32 *src;
+ seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
+ kfree(dst);
+ return 0;
+}
- src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
- for (j = 0; j < length/4; j++)
- dst[j] += src[j];
- }
- }
- return length;
+static int rt_acct_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rt_acct_proc_show, NULL);
}
+
+static const struct file_operations rt_acct_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_acct_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
@@ -567,8 +566,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
goto err2;
#ifdef CONFIG_NET_CLS_ROUTE
- pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
- ip_rt_acct_read, NULL);
+ pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
if (!pde)
goto err3;
#endif
@@ -703,7 +701,7 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
- return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
+ return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
}
static inline int rt_is_expired(struct rtable *rth)
@@ -902,6 +900,12 @@ void rt_cache_flush(struct net *net, int delay)
rt_do_flush(!in_softirq());
}
+/* Flush previously invalidated entries from the route cache.
+ * Takes no arguments; it only calls rt_do_flush(), passing the negation
+ * of in_softirq() as that function's single argument.
+ */
+void rt_cache_flush_batch(void)
+{
+ rt_do_flush(!in_softirq());
+}
+
/*
* We change rt_genid and let gc do the cleanup
*/
@@ -1346,9 +1350,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
return;
net = dev_net(dev);
- if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
- || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
- || ipv4_is_zeronet(new_gw))
+ if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
+ ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
+ ipv4_is_zeronet(new_gw))
goto reject_redirect;
if (!rt_caching(net))
@@ -1851,7 +1855,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto e_inval;
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
} else if (fib_validate_source(saddr, 0, tos, 0,
- dev, &spec_dst, &itag) < 0)
+ dev, &spec_dst, &itag, 0) < 0)
goto e_inval;
rth = dst_alloc(&ipv4_dst_ops);
@@ -1964,7 +1968,7 @@ static int __mkroute_input(struct sk_buff *skb,
err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
- in_dev->dev, &spec_dst, &itag);
+ in_dev->dev, &spec_dst, &itag, skb->mark);
if (err < 0) {
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
@@ -2138,7 +2142,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
int result;
result = fib_validate_source(saddr, daddr, tos,
net->loopback_dev->ifindex,
- dev, &spec_dst, &itag);
+ dev, &spec_dst, &itag, skb->mark);
if (result < 0)
goto martian_source;
if (result)
@@ -2167,7 +2171,7 @@ brd_input:
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
else {
err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
- &itag);
+ &itag, skb->mark);
if (err < 0)
goto martian_source;
if (err)
@@ -2311,10 +2315,11 @@ skip_cache:
ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
- || (!ipv4_is_local_multicast(daddr) &&
- IN_DEV_MFORWARD(in_dev))
+ ||
+ (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
#endif
- ) {
+ ) {
rcu_read_unlock();
return ip_route_input_mc(skb, daddr, saddr,
tos, dev, our);
@@ -2511,9 +2516,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
of another iface. --ANK
*/
- if (oldflp->oif == 0
- && (ipv4_is_multicast(oldflp->fl4_dst) ||
- oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+ if (oldflp->oif == 0 &&
+ (ipv4_is_multicast(oldflp->fl4_dst) ||
+ oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = ip_dev_find(net, oldflp->fl4_src);
if (dev_out == NULL)
@@ -2852,7 +2857,7 @@ static int rt_fill_info(struct net *net,
error = rt->u.dst.error;
expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
if (rt->peer) {
- id = rt->peer->ip_id_count;
+ id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
if (rt->peer->tcp_ts_stamp) {
ts = rt->peer->tcp_ts;
tsage = get_seconds() - rt->peer->tcp_ts_stamp;
@@ -3309,7 +3314,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
struct ctl_table *tbl;
tbl = ipv4_route_flush_table;
- if (net != &init_net) {
+ if (!net_eq(net, &init_net)) {
tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
if (tbl == NULL)
goto err_dup;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3146cc4..26399ad 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -253,6 +253,8 @@ EXPORT_SYMBOL(cookie_check_timestamp);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt)
{
+ struct tcp_options_received tcp_opt;
+ u8 *hash_location;
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
struct tcp_sock *tp = tcp_sk(sk);
@@ -263,7 +265,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
int mss;
struct rtable *rt;
__u8 rcv_wscale;
- struct tcp_options_received tcp_opt;
if (!sysctl_tcp_syncookies || !th->ack)
goto out;
@@ -341,7 +342,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, 0, &rt->u.dst);
+ tcp_parse_options(skb, &tcp_opt, &hash_location, 0, &rt->u.dst);
if (tcp_opt.saw_tstamp)
cookie_check_timestamp(&tcp_opt);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 2dcf04d..13f7ab6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -714,6 +714,14 @@ static struct ctl_table ipv4_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "tcp_cookie_size",
+ .data = &sysctl_tcp_cookie_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
@@ -818,7 +826,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
struct ctl_table *table;
table = ipv4_net_table;
- if (net != &init_net) {
+ if (!net_eq(net, &init_net)) {
table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
if (table == NULL)
goto err_alloc;
@@ -849,7 +857,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
return 0;
err_reg:
- if (net != &init_net)
+ if (!net_eq(net, &init_net))
kfree(table);
err_alloc:
return -ENOMEM;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e0cfa63..c8666b7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,7 @@
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
+#include <linux/time.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -1183,7 +1184,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
#if TCP_DEBUG
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
- WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
+ WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+ KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
#endif
if (inet_csk_ack_scheduled(sk)) {
@@ -1430,11 +1433,13 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Now that we have two receive queues this
* shouldn't happen.
*/
- if (before(*seq, TCP_SKB_CB(skb)->seq)) {
- printk(KERN_INFO "recvmsg bug: copied %X "
- "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+ KERN_INFO "recvmsg bug: copied %X "
+ "seq %X rcvnxt %X fl %X\n", *seq,
+ TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
+ flags))
break;
- }
+
offset = *seq - TCP_SKB_CB(skb)->seq;
if (tcp_hdr(skb)->syn)
offset--;
@@ -1443,8 +1448,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (tcp_hdr(skb)->fin)
goto found_fin_ok;
WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
- "copied %X seq %X\n", *seq,
- TCP_SKB_CB(skb)->seq);
+ "copied %X seq %X rcvnxt %X fl %X\n",
+ *seq, TCP_SKB_CB(skb)->seq,
+ tp->rcv_nxt, flags);
}
/* Well, if we have backlog, try to process it now yet. */
@@ -2054,6 +2060,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
tp->bytes_acked = 0;
+ tp->window_clamp = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
@@ -2078,8 +2085,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
int val;
int err = 0;
- /* This is a string value all the others are int's */
- if (optname == TCP_CONGESTION) {
+ /* These are data/string values, all the others are ints */
+ switch (optname) {
+ case TCP_CONGESTION: {
char name[TCP_CA_NAME_MAX];
if (optlen < 1)
@@ -2096,6 +2104,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
+ case TCP_COOKIE_TRANSACTIONS: {
+ struct tcp_cookie_transactions ctd;
+ struct tcp_cookie_values *cvp = NULL;
+
+ if (sizeof(ctd) > optlen)
+ return -EINVAL;
+ if (copy_from_user(&ctd, optval, sizeof(ctd)))
+ return -EFAULT;
+
+ if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
+ ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
+ return -EINVAL;
+
+ if (ctd.tcpct_cookie_desired == 0) {
+ /* default to global value */
+ } else if ((0x1 & ctd.tcpct_cookie_desired) ||
+ ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
+ ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
+ return -EINVAL;
+ }
+
+ if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
+ /* Supersedes all other values */
+ lock_sock(sk);
+ if (tp->cookie_values != NULL) {
+ kref_put(&tp->cookie_values->kref,
+ tcp_cookie_values_release);
+ tp->cookie_values = NULL;
+ }
+ tp->rx_opt.cookie_in_always = 0; /* false */
+ tp->rx_opt.cookie_out_never = 1; /* true */
+ release_sock(sk);
+ return err;
+ }
+
+ /* Allocate ancillary memory before locking.
+ */
+ if (ctd.tcpct_used > 0 ||
+ (tp->cookie_values == NULL &&
+ (sysctl_tcp_cookie_size > 0 ||
+ ctd.tcpct_cookie_desired > 0 ||
+ ctd.tcpct_s_data_desired > 0))) {
+ cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
+ GFP_KERNEL);
+ if (cvp == NULL)
+ return -ENOMEM;
+ }
+ lock_sock(sk);
+ tp->rx_opt.cookie_in_always =
+ (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
+ tp->rx_opt.cookie_out_never = 0; /* false */
+
+ if (tp->cookie_values != NULL) {
+ if (cvp != NULL) {
+ /* Changed values are recorded by a changed
+ * pointer, ensuring the cookie will differ,
+ * without separately hashing each value later.
+ */
+ kref_put(&tp->cookie_values->kref,
+ tcp_cookie_values_release);
+ kref_init(&cvp->kref);
+ tp->cookie_values = cvp;
+ } else {
+ cvp = tp->cookie_values;
+ }
+ }
+ if (cvp != NULL) {
+ cvp->cookie_desired = ctd.tcpct_cookie_desired;
+
+ if (ctd.tcpct_used > 0) {
+ memcpy(cvp->s_data_payload, ctd.tcpct_value,
+ ctd.tcpct_used);
+ cvp->s_data_desired = ctd.tcpct_used;
+ cvp->s_data_constant = 1; /* true */
+ } else {
+ /* No constant payload data. */
+ cvp->s_data_desired = ctd.tcpct_s_data_desired;
+ cvp->s_data_constant = 0; /* false */
+ }
+ }
+ release_sock(sk);
+ return err;
+ }
+ default:
+ /* fallthru */
+ break;
+ };
if (optlen < sizeof(int))
return -EINVAL;
@@ -2420,6 +2515,47 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
return -EFAULT;
return 0;
+
+ case TCP_COOKIE_TRANSACTIONS: {
+ struct tcp_cookie_transactions ctd;
+ struct tcp_cookie_values *cvp = tp->cookie_values;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < sizeof(ctd))
+ return -EINVAL;
+
+ memset(&ctd, 0, sizeof(ctd));
+ ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
+ TCP_COOKIE_IN_ALWAYS : 0)
+ | (tp->rx_opt.cookie_out_never ?
+ TCP_COOKIE_OUT_NEVER : 0);
+
+ if (cvp != NULL) {
+ ctd.tcpct_flags |= (cvp->s_data_in ?
+ TCP_S_DATA_IN : 0)
+ | (cvp->s_data_out ?
+ TCP_S_DATA_OUT : 0);
+
+ ctd.tcpct_cookie_desired = cvp->cookie_desired;
+ ctd.tcpct_s_data_desired = cvp->s_data_desired;
+
+ /* Cookie(s) saved, return as nonce */
+ if (sizeof(ctd.tcpct_value) < cvp->cookie_pair_size) {
+ /* impossible? */
+ return -EINVAL;
+ }
+ memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
+ cvp->cookie_pair_size);
+ ctd.tcpct_used = cvp->cookie_pair_size;
+ }
+
+ if (put_user(sizeof(ctd), optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &ctd, sizeof(ctd)))
+ return -EFAULT;
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -2842,6 +2978,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover. Each secret value has 4 states:
+ *
+ * Generating. (tcp_secret_generating != tcp_secret_primary)
+ * Generates new Responder-Cookies, but not yet used for primary
+ * verification. This is a short-term state, typically lasting only
+ * one round trip time (RTT).
+ *
+ * Primary. (tcp_secret_generating == tcp_secret_primary)
+ * Used both for generation and primary verification.
+ *
+ * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
+ * Used for verification, until the first failure that can be
+ * verified by the newer Generating secret. At that time, this
+ * cookie's state is changed to Secondary, and the Generating
+ * cookie's state is changed to Primary. This is a short-term state,
+ * typically lasting only one round trip time (RTT).
+ *
+ * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
+ * Used for secondary verification, after primary verification
+ * failures. This state lasts no more than twice the Maximum Segment
+ * Lifetime (2MSL). Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+ /* The secret is divided into two parts. The digest part is the
+ * equivalent of previously hashing a secret and saving the state,
+ * and serves as an initialization vector (IV). The message part
+ * serves as the trailing secret.
+ */
+ u32 secrets[COOKIE_WORKSPACE_WORDS];
+ unsigned long expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Select a pseudo-random word in the cookie workspace: ws[n], masked with
+ * COOKIE_MESSAGE_WORDS-1, is the index (past the first COOKIE_DIGEST_WORDS
+ * words) of the word that is returned.
+ */
+static inline u32 tcp_cookie_work(const u32 *ws, const int n)
+{
+	const u32 slot = (COOKIE_MESSAGE_WORDS-1) & ws[n];
+
+	return ws[COOKIE_DIGEST_WORDS + slot];
+}
+
+/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
+ * Called in softirq context.
+ * Returns: 0 for success.
+ *
+ * Every transfer is sized in bytes for the full workspace, so that all
+ * COOKIE_WORKSPACE_WORDS u32 words are filled, not just the leading bytes.
+ */
+int tcp_cookie_generator(u32 *bakery)
+{
+	/* Byte length of the whole workspace: secrets[] and the caller's
+	 * bakery[] each hold COOKIE_WORKSPACE_WORDS u32 words.
+	 */
+	const size_t bakery_len = COOKIE_WORKSPACE_WORDS * sizeof(*bakery);
+	unsigned long jiffy = jiffies;
+
+	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
+		spin_lock_bh(&tcp_secret_locker);
+		if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
+			/* refreshed by another */
+			memcpy(bakery,
+			       &tcp_secret_generating->secrets[0],
+			       bakery_len);
+		} else {
+			/* still needs refreshing */
+			get_random_bytes(bakery, bakery_len);
+
+			/* The first time, paranoia assumes that the
+			 * randomization function isn't as strong. But,
+			 * this secret initialization is delayed until
+			 * the last possible moment (packet arrival).
+			 * Although that time is observable, it is
+			 * unpredictably variable. Mash in the most
+			 * volatile clock bits available, and expire the
+			 * secret extra quickly.
+			 */
+			if (unlikely(tcp_secret_primary->expires ==
+				     tcp_secret_secondary->expires)) {
+				struct timespec tv;
+
+				getnstimeofday(&tv);
+				bakery[COOKIE_DIGEST_WORDS+0] ^=
+					(u32)tv.tv_nsec;
+
+				tcp_secret_secondary->expires = jiffy
+					+ TCP_SECRET_1MSL
+					+ (0x0f & tcp_cookie_work(bakery, 0));
+			} else {
+				tcp_secret_secondary->expires = jiffy
+					+ TCP_SECRET_LIFE
+					+ (0xff & tcp_cookie_work(bakery, 1));
+				tcp_secret_primary->expires = jiffy
+					+ TCP_SECRET_2MSL
+					+ (0x1f & tcp_cookie_work(bakery, 2));
+			}
+			/* Save the complete new secret before publishing it. */
+			memcpy(&tcp_secret_secondary->secrets[0],
+			       bakery, bakery_len);
+
+			rcu_assign_pointer(tcp_secret_generating,
+					   tcp_secret_secondary);
+			rcu_assign_pointer(tcp_secret_retiring,
+					   tcp_secret_primary);
+			/*
+			 * Neither call_rcu() nor synchronize_rcu() needed.
+			 * Retiring data is not freed. It is replaced after
+			 * further (locked) pointer updates, and a quiet time
+			 * (minimum 1MSL, maximum LIFE - 2MSL).
+			 */
+		}
+		spin_unlock_bh(&tcp_secret_locker);
+	} else {
+		rcu_read_lock_bh();
+		memcpy(bakery,
+		       &rcu_dereference(tcp_secret_generating)->secrets[0],
+		       bakery_len);
+		rcu_read_unlock_bh();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
void tcp_done(struct sock *sk)
{
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
@@ -2876,6 +3141,7 @@ void __init tcp_init(void)
struct sk_buff *skb = NULL;
unsigned long nr_pages, limit;
int order, i, max_share;
+ unsigned long jiffy = jiffies;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -2969,6 +3235,15 @@ void __init tcp_init(void)
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
tcp_register_congestion_control(&tcp_reno);
+
+ memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+ memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+ tcp_secret_one.expires = jiffy; /* past due */
+ tcp_secret_two.expires = jiffy; /* past due */
+ tcp_secret_generating = &tcp_secret_one;
+ tcp_secret_primary = &tcp_secret_one;
+ tcp_secret_retiring = &tcp_secret_two;
+ tcp_secret_secondary = &tcp_secret_two;
}
EXPORT_SYMBOL(tcp_close);
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 26d5c7f..7c94a49 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -92,8 +92,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
if (icsk->icsk_ca_state == TCP_CA_Open) {
if (ca->maxRTT < ca->minRTT)
ca->maxRTT = ca->minRTT;
- if (ca->maxRTT < srtt
- && srtt <= ca->maxRTT + msecs_to_jiffies(20))
+ if (ca->maxRTT < srtt &&
+ srtt <= ca->maxRTT + msecs_to_jiffies(20))
ca->maxRTT = srtt;
}
}
@@ -123,9 +123,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
ca->packetcount += pkts_acked;
- if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1)
- && now - ca->lasttime >= ca->minRTT
- && ca->minRTT > 0) {
+ if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
+ now - ca->lasttime >= ca->minRTT &&
+ ca->minRTT > 0) {
__u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
if (htcp_ccount(ca) <= 3) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ba0eab6..57ae96a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -140,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
* "len" is invariant segment length, including TCP header.
*/
len += skb->data - skb_transport_header(skb);
- if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
+ if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
/* If PSH is not set, packet should be
* full sized, provided peer TCP is not badly broken.
* This observation (if it is correct 8)) allows
@@ -411,7 +411,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd / 2);
- hint = min(hint, TCP_MIN_RCVMSS);
+ hint = min(hint, TCP_MSS_DEFAULT);
hint = max(hint, TCP_MIN_MSS);
inet_csk(sk)->icsk_ack.rcv_mss = hint;
@@ -3698,14 +3698,12 @@ old_ack:
* the fast version below fails.
*/
void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
- int estab, struct dst_entry *dst)
+ u8 **hvpp, int estab, struct dst_entry *dst)
{
unsigned char *ptr;
struct tcphdr *th = tcp_hdr(skb);
int length = (th->doff * 4) - sizeof(struct tcphdr);
- BUG_ON(!estab && !dst);
-
ptr = (unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
@@ -3787,7 +3785,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
*/
break;
#endif
- }
+ case TCPOPT_COOKIE:
+ /* This option is variable length.
+ */
+ switch (opsize) {
+ case TCPOLEN_COOKIE_BASE:
+ /* not yet implemented */
+ break;
+ case TCPOLEN_COOKIE_PAIR:
+ /* not yet implemented */
+ break;
+ case TCPOLEN_COOKIE_MIN+0:
+ case TCPOLEN_COOKIE_MIN+2:
+ case TCPOLEN_COOKIE_MIN+4:
+ case TCPOLEN_COOKIE_MIN+6:
+ case TCPOLEN_COOKIE_MAX:
+ /* 16-bit multiple */
+ opt_rx->cookie_plus = opsize;
+ *hvpp = ptr;
+ default:
+ /* ignore option */
+ break;
+ };
+ break;
+ };
ptr += opsize-2;
length -= opsize;
@@ -3815,17 +3836,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
* If it is wrong it falls back on tcp_parse_options().
*/
static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
- struct tcp_sock *tp)
+ struct tcp_sock *tp, u8 **hvpp)
{
- if (th->doff == sizeof(struct tcphdr) >> 2) {
+ /* In the spirit of fast parsing, compare doff directly to constant
+ * values. Because equality is used, short doff can be ignored here.
+ */
+ if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
return 0;
} else if (tp->rx_opt.tstamp_ok &&
- th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
+ th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
return 1;
}
- tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
+ tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
return 1;
}
@@ -4854,11 +4878,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
struct tcp_sock *tp = tcp_sk(sk);
/* More than one full frame received... */
- if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
+ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise). Or...
*/
- && __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ __tcp_select_window(sk) >= tp->rcv_wnd) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
@@ -5079,10 +5103,12 @@ out:
static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, int syn_inerr)
{
+ u8 *hash_location;
struct tcp_sock *tp = tcp_sk(sk);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+ if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
+ tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5370,12 +5396,14 @@ discard:
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ u8 *hash_location;
struct inet_connection_sock *icsk = inet_csk(sk);
- int saved_clamp = tp->rx_opt.mss_clamp;
+ struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
+ struct tcp_cookie_values *cvp = tp->cookie_values;
+ int saved_clamp = tp->rx_opt.mss_clamp;
- tcp_parse_options(skb, &tp->rx_opt, 0, dst);
+ tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, dst);
if (th->ack) {
/* rfc793:
@@ -5472,6 +5500,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* Change state from SYN-SENT only after copied_seq
* is initialized. */
tp->copied_seq = tp->rcv_nxt;
+
+ if (cvp != NULL &&
+ cvp->cookie_pair_size > 0 &&
+ tp->rx_opt.cookie_plus > 0) {
+ int cookie_size = tp->rx_opt.cookie_plus
+ - TCPOLEN_COOKIE_BASE;
+ int cookie_pair_size = cookie_size
+ + cvp->cookie_desired;
+
+ /* A cookie extension option was sent and returned.
+ * Note that each incoming SYNACK replaces the
+ * Responder cookie. The initial exchange is most
+ * fragile, as protection against spoofing relies
+ * entirely upon the sequence and timestamp (above).
+ * This replacement strategy allows the correct pair to
+ * pass through, while any others will be filtered via
+ * Responder verification later.
+ */
+ if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
+ memcpy(&cvp->cookie_pair[cvp->cookie_desired],
+ hash_location, cookie_size);
+ cvp->cookie_pair_size = cookie_pair_size;
+ }
+ }
+
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 657ae33..fee9aab 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* when trying new connection.
*/
if (peer != NULL &&
- peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
+ (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
@@ -217,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (inet->opt)
inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
- tp->rx_opt.mss_clamp = 536;
+ tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
@@ -742,8 +742,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
* This still operates on a request_sock only, not on a big
* socket.
*/
-static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
+static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req,
+ struct request_values *rvp)
{
const struct inet_request_sock *ireq = inet_rsk(req);
int err = -1;
@@ -753,7 +754,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req);
+ skb = tcp_make_synack(sk, dst, req, rvp);
if (skb) {
struct tcphdr *th = tcp_hdr(skb);
@@ -774,9 +775,10 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
return err;
}
-static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
+static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
+ struct request_values *rvp)
{
- return __tcp_v4_send_synack(sk, req, NULL);
+ return __tcp_v4_send_synack(sk, NULL, req, rvp);
}
/*
@@ -1211,13 +1213,16 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct inet_request_sock *ireq;
+ struct tcp_extend_values tmp_ext;
struct tcp_options_received tmp_opt;
+ u8 *hash_location;
struct request_sock *req;
+ struct inet_request_sock *ireq;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = NULL;
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
- struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
@@ -1268,16 +1273,50 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
goto drop_and_free;
tcp_clear_options(&tmp_opt);
- tmp_opt.mss_clamp = 536;
- tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
+ tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
+ tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst);
+
+ if (tmp_opt.cookie_plus > 0 &&
+ tmp_opt.saw_tstamp &&
+ !tp->rx_opt.cookie_out_never &&
+ (sysctl_tcp_cookie_size > 0 ||
+ (tp->cookie_values != NULL &&
+ tp->cookie_values->cookie_desired > 0))) {
+ u8 *c;
+ u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
+ int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
+
+ if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
+ goto drop_and_release;
+
+ /* Secret recipe starts with IP addresses */
+ *mess++ ^= daddr;
+ *mess++ ^= saddr;
+
+ /* plus variable length Initiator Cookie */
+ c = (u8 *)mess;
+ while (l-- > 0)
+ *c++ ^= *hash_location++;
- tcp_parse_options(skb, &tmp_opt, 0, dst);
+#ifdef CONFIG_SYN_COOKIES
+ want_cookie = 0; /* not our kind of cookie */
+#endif
+ tmp_ext.cookie_out_never = 0; /* false */
+ tmp_ext.cookie_plus = tmp_opt.cookie_plus;
+ } else if (!tp->rx_opt.cookie_in_always) {
+ /* redundant indications, but ensure initialization. */
+ tmp_ext.cookie_out_never = 1; /* true */
+ tmp_ext.cookie_plus = 0;
+ } else {
+ goto drop_and_release;
+ }
+ tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
-
tcp_openreq_init(req, &tmp_opt, skb);
if (security_inet_conn_request(sk, skb, req))
@@ -1308,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_death_row.sysctl_tw_recycle &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
- if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
+ if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
TCP_PAWS_WINDOW) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
@@ -1337,7 +1376,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
}
tcp_rsk(req)->snt_isn = isn;
- if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
+ if (__tcp_v4_send_synack(sk, dst, req,
+ (struct request_values *)&tmp_ext) ||
+ want_cookie)
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
@@ -1727,9 +1768,9 @@ int tcp_v4_remember_stamp(struct sock *sk)
if (peer) {
if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
- (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
- peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
- peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
+ ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
+ peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+ peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
peer->tcp_ts = tp->rx_opt.ts_recent;
}
if (release_it)
@@ -1748,9 +1789,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
- (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
- peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
+ ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
+ peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+ peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
peer->tcp_ts = tcptw->tw_ts_recent;
}
inet_putpeer(peer);
@@ -1815,7 +1856,7 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
- tp->mss_cache = 536;
+ tp->mss_cache = TCP_MSS_DEFAULT;
tp->reordering = sysctl_tcp_reordering;
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -1831,6 +1872,19 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->af_specific = &tcp_sock_ipv4_specific;
#endif
+ /* TCP Cookie Transactions */
+ if (sysctl_tcp_cookie_size > 0) {
+ /* Default, cookies without s_data_payload. */
+ tp->cookie_values =
+ kzalloc(sizeof(*tp->cookie_values),
+ sk->sk_allocation);
+ if (tp->cookie_values != NULL)
+ kref_init(&tp->cookie_values->kref);
+ }
+ /* Presumed zeroed, in order of appearance:
+ * cookie_in_always, cookie_out_never,
+ * s_data_constant, s_data_in, s_data_out
+ */
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
@@ -1884,6 +1938,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
sk->sk_sndmsg_page = NULL;
}
+ /* TCP Cookie Transactions */
+ if (tp->cookie_values != NULL) {
+ kref_put(&tp->cookie_values->kref,
+ tcp_cookie_values_release);
+ tp->cookie_values = NULL;
+ }
+
percpu_counter_dec(&tcp_sockets_allocated);
}
@@ -2468,12 +2529,17 @@ static int __net_init tcp_sk_init(struct net *net)
static void __net_exit tcp_sk_exit(struct net *net)
{
inet_ctl_sock_destroy(net->ipv4.tcp_sock);
- inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
+}
+
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
- .init = tcp_sk_init,
- .exit = tcp_sk_exit,
+ .init = tcp_sk_init,
+ .exit = tcp_sk_exit,
+ .exit_batch = tcp_sk_exit_batch,
};
void __init tcp_v4_init(void)
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index ce3c41f..de87037 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -143,8 +143,8 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
goto out;
/* we can't calc remote HZ with no different!! */
- if (tp->rx_opt.rcv_tsval == lp->remote_ref_time
- || tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+ if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
+ tp->rx_opt.rcv_tsecr == lp->local_ref_time)
goto out;
m = HZ * (tp->rx_opt.rcv_tsval -
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 463d51b..87accec 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -26,13 +26,7 @@
#include <net/inet_common.h>
#include <net/xfrm.h>
-#ifdef CONFIG_SYSCTL
-#define SYNC_INIT 0 /* let the user enable it */
-#else
-#define SYNC_INIT 1
-#endif
-
-int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
+int sysctl_tcp_syncookies __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_syncookies);
int sysctl_tcp_abort_on_overflow __read_mostly;
@@ -96,13 +90,14 @@ enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
const struct tcphdr *th)
{
- struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
struct tcp_options_received tmp_opt;
+ u8 *hash_location;
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
int paws_reject = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
tmp_opt.tstamp_ok = 1;
- tcp_parse_options(skb, &tmp_opt, 1, NULL);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 1, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -389,14 +384,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
- struct tcp_sock *newtp;
+ struct tcp_sock *newtp = tcp_sk(newsk);
+ struct tcp_sock *oldtp = tcp_sk(sk);
+ struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
+
+ /* TCP Cookie Transactions require space for the cookie pair,
+ * as it differs for each connection. There is no need to
+ * copy any s_data_payload stored at the original socket.
+ * Failure will prevent resuming the connection.
+ *
+ * Presumed copied, in order of appearance:
+ * cookie_in_always, cookie_out_never
+ */
+ if (oldcvp != NULL) {
+ struct tcp_cookie_values *newcvp =
+ kzalloc(sizeof(*newtp->cookie_values),
+ GFP_ATOMIC);
+
+ if (newcvp != NULL) {
+ kref_init(&newcvp->kref);
+ newcvp->cookie_desired =
+ oldcvp->cookie_desired;
+ newtp->cookie_values = newcvp;
+ } else {
+ /* Not Yet Implemented */
+ newtp->cookie_values = NULL;
+ }
+ }
/* Now setup tcp_sock */
- newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
- newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
- newtp->snd_up = treq->snt_isn + 1;
+
+ newtp->rcv_wup = newtp->copied_seq =
+ newtp->rcv_nxt = treq->rcv_isn + 1;
+
+ newtp->snd_sml = newtp->snd_una =
+ newtp->snd_nxt = newtp->snd_up =
+ treq->snt_isn + 1 + tcp_s_data_size(oldtp);
tcp_prequeue_init(newtp);
@@ -429,8 +453,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
- newtp->write_seq = treq->snt_isn + 1;
- newtp->pushed_seq = newtp->write_seq;
+ newtp->write_seq = newtp->pushed_seq =
+ treq->snt_isn + 1 + tcp_s_data_size(oldtp);
newtp->rx_opt.saw_tstamp = 0;
@@ -476,7 +500,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
if (newtp->af_specific->md5_lookup(sk, newsk))
newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
- if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
+ if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
TCP_ECN_openreq_child(newtp, req);
@@ -495,16 +519,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev)
{
+ struct tcp_options_received tmp_opt;
+ u8 *hash_location;
+ struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
int paws_reject = 0;
- struct tcp_options_received tmp_opt;
- struct sock *child;
- struct dst_entry *dst = inet_csk_route_req(sk, req);
- tmp_opt.saw_tstamp = 0;
- if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, 0, dst);
+ if ((th->doff > (sizeof(*th) >> 2)) && (req->ts_recent)) {
+ tmp_opt.tstamp_ok = 1;
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 1, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
@@ -517,8 +541,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
}
}
- dst_release(dst);
-
/* Check for pure retransmitted SYN. */
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
flg == TCP_FLAG_SYN &&
@@ -540,7 +562,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* Enforce "SYN-ACK" according to figure 8, figure 6
* of RFC793, fixed by RFC1122.
*/
- req->rsk_ops->rtx_syn_ack(sk, req);
+ req->rsk_ops->rtx_syn_ack(sk, req, NULL);
return NULL;
}
@@ -599,7 +621,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* Invalid ACK: reset will be sent by listening socket
*/
if ((flg & TCP_FLAG_ACK) &&
- (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
+ (TCP_SKB_CB(skb)->ack_seq !=
+ tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 616c686..93316a9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,10 @@ int sysctl_tcp_base_mss __read_mostly = 512;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
+EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
+
+
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
@@ -362,15 +366,45 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_TS (1 << 1)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
+#define OPTION_COOKIE_EXTENSION (1 << 4)
struct tcp_out_options {
u8 options; /* bit field of OPTION_* */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 hash_size; /* bytes in hash_location */
u16 mss; /* 0 to disable */
__u32 tsval, tsecr; /* need to include OPTION_TS */
+ __u8 *hash_location; /* temporary pointer, overloaded */
};
+/* Pick the cookie size to use, in bytes.
+ *
+ * The sysctl integer handlers are generic and do not validate the value,
+ * so sysctl_tcp_cookie_size is normalized here.  A non-zero @desired
+ * (a size chosen previously) is returned untouched; otherwise the sysctl
+ * default is clamped to [TCP_COOKIE_MIN, TCP_COOKIE_MAX] and an odd value
+ * is rounded up to the next even one.  Returns 0 when no default is set.
+ */
+static u8 tcp_cookie_size_check(u8 desired)
+{
+	int size = sysctl_tcp_cookie_size;
+
+	/* previously specified: takes precedence over the default */
+	if (desired != 0)
+		return desired;
+
+	/* zero or negative: no default specified */
+	if (size < 1)
+		return 0;
+
+	if (size <= TCP_COOKIE_MIN)
+		size = TCP_COOKIE_MIN;	/* too small, use the minimum */
+	else if (size >= TCP_COOKIE_MAX)
+		size = TCP_COOKIE_MAX;	/* too large, use the maximum */
+	else
+		size += size & 0x1;	/* odd size is illegal, round up */
+
+	return (u8)size;
+}
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -385,17 +419,34 @@ struct tcp_out_options {
* (but it may well be that other scenarios fail similarly).
*/
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
- const struct tcp_out_options *opts,
- __u8 **md5_hash) {
- if (unlikely(OPTION_MD5 & opts->options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- *md5_hash = (__u8 *)ptr;
+ struct tcp_out_options *opts)
+{
+ u8 options = opts->options; /* mungable copy */
+
+ /* Having both authentication and cookies for security is redundant,
+ * and there's certainly not enough room. Instead, the cookie-less
+ * extension variant is proposed.
+ *
+ * Consider the pessimal case with authentication. The options
+ * could look like:
+ * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
+ */
+ if (unlikely(OPTION_MD5 & options)) {
+ if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+ *ptr++ = htonl((TCPOPT_COOKIE << 24) |
+ (TCPOLEN_COOKIE_BASE << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ } else {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ }
+ options &= ~OPTION_COOKIE_EXTENSION;
+ /* overload cookie hash location */
+ opts->hash_location = (__u8 *)ptr;
ptr += 4;
- } else {
- *md5_hash = NULL;
}
if (unlikely(opts->mss)) {
@@ -404,12 +455,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
opts->mss);
}
- if (likely(OPTION_TS & opts->options)) {
- if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) {
+ if (likely(OPTION_TS & options)) {
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
+ options &= ~OPTION_SACK_ADVERTISE;
} else {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -420,15 +472,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
- if (unlikely(OPTION_SACK_ADVERTISE & opts->options &&
- !(OPTION_TS & opts->options))) {
+ /* Specification requires after timestamp, so do it now.
+ *
+ * Consider the pessimal case without authentication. The options
+ * could look like:
+ * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
+ */
+ if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+ __u8 *cookie_copy = opts->hash_location;
+ u8 cookie_size = opts->hash_size;
+
+ /* 8-bit multiple handled in tcp_cookie_size_check() above,
+ * and elsewhere.
+ */
+ if (0x2 & cookie_size) {
+ __u8 *p = (__u8 *)ptr;
+
+ /* 16-bit multiple */
+ *p++ = TCPOPT_COOKIE;
+ *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
+ *p++ = *cookie_copy++;
+ *p++ = *cookie_copy++;
+ ptr++;
+ cookie_size -= 2;
+ } else {
+ /* 32-bit multiple */
+ *ptr++ = htonl(((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_COOKIE << 8) |
+ TCPOLEN_COOKIE_BASE) +
+ cookie_size);
+ }
+
+ if (cookie_size > 0) {
+ memcpy(ptr, cookie_copy, cookie_size);
+ ptr += (cookie_size / 4);
+ }
+ }
+
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
}
- if (unlikely(OPTION_WSCALE & opts->options)) {
+ if (unlikely(OPTION_WSCALE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
@@ -463,14 +552,18 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5) {
struct tcp_sock *tp = tcp_sk(sk);
- unsigned size = 0;
+ struct tcp_cookie_values *cvp = tp->cookie_values;
struct dst_entry *dst = __sk_dst_get(sk);
+ unsigned remaining = MAX_TCP_OPTION_SPACE;
+ u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
+ tcp_cookie_size_check(cvp->cookie_desired) :
+ 0;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
}
#else
*md5 = NULL;
@@ -486,7 +579,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
* SACKs don't matter, we never delay an ACK when we have any of those
* going out. */
opts->mss = tcp_advertise_mss(sk);
- size += TCPOLEN_MSS_ALIGNED;
+ remaining -= TCPOLEN_MSS_ALIGNED;
if (likely(sysctl_tcp_timestamps &&
!dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) &&
@@ -494,22 +587,68 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->options |= OPTION_TS;
opts->tsval = TCP_SKB_CB(skb)->when;
opts->tsecr = tp->rx_opt.ts_recent;
- size += TCPOLEN_TSTAMP_ALIGNED;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(sysctl_tcp_window_scaling &&
!dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) {
opts->ws = tp->rx_opt.rcv_wscale;
opts->options |= OPTION_WSCALE;
- size += TCPOLEN_WSCALE_ALIGNED;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(sysctl_tcp_sack &&
!dst_feature(dst, RTAX_FEATURE_NO_SACK))) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!(OPTION_TS & opts->options)))
- size += TCPOLEN_SACKPERM_ALIGNED;
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- return size;
+ /* Note that timestamps are required by the specification.
+ *
+ * Odd numbers of bytes are prohibited by the specification, ensuring
+ * that the cookie is 16-bit aligned, and the resulting cookie pair is
+ * 32-bit aligned.
+ */
+ if (*md5 == NULL &&
+ (OPTION_TS & opts->options) &&
+ cookie_size > 0) {
+ int need = TCPOLEN_COOKIE_BASE + cookie_size;
+
+ if (0x2 & need) {
+ /* 32-bit multiple */
+ need += 2; /* NOPs */
+
+ if (need > remaining) {
+ /* try shrinking cookie to fit */
+ cookie_size -= 2;
+ need -= 4;
+ }
+ }
+ while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
+ cookie_size -= 4;
+ need -= 4;
+ }
+ if (TCP_COOKIE_MIN <= cookie_size) {
+ opts->options |= OPTION_COOKIE_EXTENSION;
+ opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
+ opts->hash_size = cookie_size;
+
+ /* Remember for future incarnations. */
+ cvp->cookie_desired = cookie_size;
+
+ if (cvp->cookie_desired != cvp->cookie_pair_size) {
+ /* Currently use random bytes as a nonce,
+ * assuming these are completely unpredictable
+ * by hostile users of the same system.
+ */
+ get_random_bytes(&cvp->cookie_pair[0],
+ cookie_size);
+ cvp->cookie_pair_size = cookie_size;
+ }
+
+ remaining -= need;
+ }
+ }
+ return MAX_TCP_OPTION_SPACE - remaining;
}
/* Set up TCP options for SYN-ACKs. */
@@ -517,48 +656,77 @@ static unsigned tcp_synack_options(struct sock *sk,
struct request_sock *req,
unsigned mss, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5) {
- unsigned size = 0;
+ struct tcp_md5sig_key **md5,
+ struct tcp_extend_values *xvp)
+{
struct inet_request_sock *ireq = inet_rsk(req);
- char doing_ts;
+ unsigned remaining = MAX_TCP_OPTION_SPACE;
+ u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
+ xvp->cookie_plus :
+ 0;
+ bool doing_ts = ireq->tstamp_ok;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
if (*md5) {
opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+ /* We can't fit any SACK blocks in a packet with MD5 + TS
+ * options. There was discussion about disabling SACK
+ * rather than TS in order to fit in better with old,
+ * buggy kernels, but that was deemed to be unnecessary.
+ */
+ doing_ts &= !ireq->sack_ok;
}
#else
*md5 = NULL;
#endif
- /* we can't fit any SACK blocks in a packet with MD5 + TS
- options. There was discussion about disabling SACK rather than TS in
- order to fit in better with old, buggy kernels, but that was deemed
- to be unnecessary. */
- doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
-
+ /* We always send an MSS option. */
opts->mss = mss;
- size += TCPOLEN_MSS_ALIGNED;
+ remaining -= TCPOLEN_MSS_ALIGNED;
if (likely(ireq->wscale_ok)) {
opts->ws = ireq->rcv_wscale;
opts->options |= OPTION_WSCALE;
- size += TCPOLEN_WSCALE_ALIGNED;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(doing_ts)) {
opts->options |= OPTION_TS;
opts->tsval = TCP_SKB_CB(skb)->when;
opts->tsecr = req->ts_recent;
- size += TCPOLEN_TSTAMP_ALIGNED;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(ireq->sack_ok)) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!doing_ts))
- size += TCPOLEN_SACKPERM_ALIGNED;
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- return size;
+ /* Similar rationale to tcp_syn_options() applies here, too.
+ * If the <SYN> options fit, the same options should fit now!
+ */
+ if (*md5 == NULL &&
+ doing_ts &&
+ cookie_plus > TCPOLEN_COOKIE_BASE) {
+ int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
+
+ if (0x2 & need) {
+ /* 32-bit multiple */
+ need += 2; /* NOPs */
+ }
+ if (need <= remaining) {
+ opts->options |= OPTION_COOKIE_EXTENSION;
+ opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
+ remaining -= need;
+ } else {
+ /* There's no error return, so flag it. */
+ xvp->cookie_out_never = 1; /* true */
+ opts->hash_size = 0;
+ }
+ }
+ return MAX_TCP_OPTION_SPACE - remaining;
}
/* Compute TCP options for ESTABLISHED sockets. This is not the
@@ -624,7 +792,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
struct tcp_out_options opts;
unsigned tcp_options_size, tcp_header_size;
struct tcp_md5sig_key *md5;
- __u8 *md5_hash_location;
struct tcphdr *th;
int err;
@@ -695,7 +862,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
}
- tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
@@ -703,7 +870,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
- tp->af_specific->calc_md5_hash(md5_hash_location,
+ tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, NULL, skb);
}
#endif
@@ -1923,8 +2090,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* case, when window is shrunk to zero. In this case
* our retransmit serves as a zero window probe.
*/
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))
- && TCP_SKB_CB(skb)->seq != tp->snd_una)
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
+ TCP_SKB_CB(skb)->seq != tp->snd_una)
return -EAGAIN;
if (skb->len > cur_mss) {
@@ -2224,16 +2391,17 @@ int tcp_send_synack(struct sock *sk)
/* Prepare a SYN-ACK. */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
- struct request_sock *req)
+ struct request_sock *req,
+ struct request_values *rvp)
{
+ struct tcp_out_options opts;
+ struct tcp_extend_values *xvp = tcp_xv(rvp);
struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th;
- int tcp_header_size;
- struct tcp_out_options opts;
struct sk_buff *skb;
struct tcp_md5sig_key *md5;
- __u8 *md5_hash_location;
+ int tcp_header_size;
int mss;
skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
@@ -2271,8 +2439,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
#endif
TCP_SKB_CB(skb)->when = tcp_time_stamp;
tcp_header_size = tcp_synack_options(sk, req, mss,
- skb, &opts, &md5) +
- sizeof(struct tcphdr);
+ skb, &opts, &md5, xvp)
+ + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -2289,19 +2457,58 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
*/
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
+
+ if (OPTION_COOKIE_EXTENSION & opts.options) {
+ const struct tcp_cookie_values *cvp = tp->cookie_values;
+
+ if (cvp != NULL &&
+ cvp->s_data_constant &&
+ cvp->s_data_desired > 0) {
+ u8 *buf = skb_put(skb, cvp->s_data_desired);
+
+ /* copy data directly from the listening socket. */
+ memcpy(buf, cvp->s_data_payload, cvp->s_data_desired);
+ TCP_SKB_CB(skb)->end_seq += cvp->s_data_desired;
+ }
+
+ if (opts.hash_size > 0) {
+ __u32 workspace[SHA_WORKSPACE_WORDS];
+ u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
+ u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
+
+ /* Secret recipe depends on the Timestamp, (future)
+ * Sequence and Acknowledgment Numbers, Initiator
+ * Cookie, and others handled by IP variant caller.
+ */
+ *tail-- ^= opts.tsval;
+ *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
+ *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
+
+ /* recommended */
+ *tail-- ^= ((th->dest << 16) | th->source);
+ *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
+
+ sha_transform((__u32 *)&xvp->cookie_bakery[0],
+ (char *)mess,
+ &workspace[0]);
+ opts.hash_location =
+ (__u8 *)&xvp->cookie_bakery[0];
+ }
+ }
+
th->seq = htonl(TCP_SKB_CB(skb)->seq);
th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rcv_wnd, 65535U));
- tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
if (md5) {
- tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location,
+ tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
md5, NULL, req, skb);
}
#endif
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 7a3cc2f..bb110c5 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -95,8 +95,8 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* Only update if port matches */
if ((port == 0 || ntohs(inet->inet_dport) == port ||
- ntohs(inet->inet_sport) == port)
- && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
+ ntohs(inet->inet_sport) == port) &&
+ (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
spin_lock(&tcp_probe.lock);
/* If log fills, just silently drop */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index e9bbff7..b612acf 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -165,9 +165,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
* every other rtt.
*/
if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (veno->inc
- && tp->snd_cwnd <
- tp->snd_cwnd_clamp) {
+ if (veno->inc &&
+ tp->snd_cwnd < tp->snd_cwnd_clamp) {
tp->snd_cwnd++;
veno->inc = 0;
} else
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 66b6821..a0f2403 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -157,8 +157,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (queue > TCP_YEAH_ALPHA ||
rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
- if (queue > TCP_YEAH_ALPHA
- && tp->snd_cwnd > yeah->reno_count) {
+ if (queue > TCP_YEAH_ALPHA &&
+ tp->snd_cwnd > yeah->reno_count) {
u32 reduction = min(queue / TCP_YEAH_GAMMA ,
tp->snd_cwnd >> TCP_YEAH_EPSILON);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4274c1c..1f95348 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -136,33 +136,67 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
struct hlist_nulls_node *node;
sk_nulls_for_each(sk2, node, &hslot->head)
- if (net_eq(sock_net(sk2), net) &&
- sk2 != sk &&
- (bitmap || sk2->sk_hash == num) &&
- (!sk2->sk_reuse || !sk->sk_reuse) &&
- (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
- || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(*saddr_comp)(sk, sk2)) {
if (bitmap)
- __set_bit(sk2->sk_hash >> log, bitmap);
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log,
+ bitmap);
else
return 1;
}
return 0;
}
+/*
+ * Check whether local port @num is already taken by a conflicting socket
+ * on the secondary hash chain @hslot2.
+ *
+ * The chain is walked under hslot2->lock. 1 is returned as soon as a socket
+ * other than @sk is found that is in the same netns, has
+ * udp_port_hash == num, cannot share the port (sk_reuse is clear on at
+ * least one of the two sockets), is not kept apart by device binding
+ * (either socket has no bound device, or both are bound to the same one)
+ * and for which saddr_comp() returns non-zero. Otherwise 0 is returned.
+ *
+ * Note: we still hold spinlock of primary hash chain, so no other writer
+ * can insert/delete a socket with local_port == num
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+ struct udp_hslot *hslot2,
+ struct sock *sk,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2))
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ int res = 0;
+
+ spin_lock(&hslot2->lock);
+ udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (*saddr_comp)(sk, sk2)) {
+ res = 1;
+ break;
+ }
+ spin_unlock(&hslot2->lock);
+ return res;
+}
+
/**
* udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
*
* @sk: socket struct in question
* @snum: port number to look up
* @saddr_comp: AF-dependent comparison of bound local IP addresses
+ * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
+ * with NULL address
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2))
+ const struct sock *sk2),
+ unsigned int hash2_nulladdr)
{
- struct udp_hslot *hslot;
+ struct udp_hslot *hslot, *hslot2;
struct udp_table *udptable = sk->sk_prot->h.udp_table;
int error = 1;
struct net *net = sock_net(sk);
@@ -209,16 +243,49 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
} else {
hslot = udp_hashslot(udptable, net, snum);
spin_lock_bh(&hslot->lock);
+ if (hslot->count > 10) {
+ int exist;
+ unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+ slot2 &= udptable->mask;
+ hash2_nulladdr &= udptable->mask;
+
+ hslot2 = udp_hashslot2(udptable, slot2);
+ if (hslot->count < hslot2->count)
+ goto scan_primary_hash;
+
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk, saddr_comp);
+ if (!exist && (hash2_nulladdr != slot2)) {
+ hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk, saddr_comp);
+ }
+ if (exist)
+ goto fail_unlock;
+ else
+ goto found;
+ }
+scan_primary_hash:
if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
saddr_comp, 0))
goto fail_unlock;
}
found:
inet_sk(sk)->inet_num = snum;
- sk->sk_hash = snum;
+ udp_sk(sk)->udp_port_hash = snum;
+ udp_sk(sk)->udp_portaddr_hash ^= snum;
if (sk_unhashed(sk)) {
sk_nulls_add_node_rcu(sk, &hslot->head);
+ hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ spin_lock(&hslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
+ hslot2->count++;
+ spin_unlock(&hslot2->lock);
}
error = 0;
fail_unlock:
@@ -237,9 +304,22 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
}
+/*
+ * Secondary-hash value for an IPv4 (address, port) pair: jhash_1word() of
+ * the address, with net_hash_mix(net) as its second argument, XORed with
+ * the port.
+ */
+static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
+				       unsigned int port)
+{
+	unsigned int addr_hash = jhash_1word(saddr, net_hash_mix(net));
+
+	return addr_hash ^ port;
+}
+
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
- return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
+ unsigned int hash2_nulladdr =
+ udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
+ unsigned int hash2_partial =
+ udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+
+ /* precompute partial secondary hash */
+ udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+ return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
}
static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
@@ -248,7 +328,7 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
{
int score = -1;
- if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+ if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
!ipv6_only_sock(sk)) {
struct inet_sock *inet = inet_sk(sk);
@@ -277,6 +357,89 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
return score;
}
+/*
+ * Second scoring variant: the socket must match the packet exactly on
+ * (inet_rcv_saddr, inet_num) == (daddr, hnum).
+ *
+ * Returns -1 on any mismatch. Otherwise the score is 1 for a PF_INET socket
+ * (0 for any other family), plus 2 for each of remote address, remote port
+ * and bound device that is set on the socket and equals the packet's value.
+ * SCORE2_MAX is the score of a socket that matches on all of them.
+ */
+#define SCORE2_MAX (1 + 2 + 2 + 2)
+static inline int compute_score2(struct sock *sk, struct net *net,
+				 __be32 saddr, __be16 sport,
+				 __be32 daddr, unsigned int hnum, int dif)
+{
+	struct inet_sock *inet;
+	int score;
+
+	/* Wrong namespace or IPv6-only socket: never a candidate. */
+	if (!net_eq(sock_net(sk), net) || ipv6_only_sock(sk))
+		return -1;
+
+	inet = inet_sk(sk);
+
+	/* Local address and port must both match exactly. */
+	if (inet->inet_rcv_saddr != daddr || inet->inet_num != hnum)
+		return -1;
+
+	score = 0;
+	if (sk->sk_family == PF_INET)
+		score = 1;
+
+	/* Each optional field must match when set, and then adds 2. */
+	if (inet->inet_daddr) {
+		if (inet->inet_daddr != saddr)
+			return -1;
+		score += 2;
+	}
+	if (inet->inet_dport) {
+		if (inet->inet_dport != sport)
+			return -1;
+		score += 2;
+	}
+	if (sk->sk_bound_dev_if) {
+		if (sk->sk_bound_dev_if != dif)
+			return -1;
+		score += 2;
+	}
+
+	return score;
+}
+
+
+/*
+ * Pick the best-scoring socket on secondary hash chain @hslot2 for a packet
+ * from (saddr, sport) to (daddr, hnum); @dif is checked by compute_score2()
+ * against the socket's sk_bound_dev_if.
+ *
+ * Called with rcu_read_lock() held. @slot2 is the index of @hslot2 and is
+ * compared with the nulls value found at the end of the walk to detect that
+ * the walk ended on a different chain.
+ *
+ * Returns the chosen socket with sk_refcnt incremented, or NULL when no
+ * socket on the chain scores above -1 or the chosen socket's refcount was
+ * already zero. If the socket scores lower once the reference is held, the
+ * reference is dropped and the whole lookup restarts.
+ */
+static struct sock *udp4_lib_lookup2(struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum, int dif,
+ struct udp_hslot *hslot2, unsigned int slot2)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ int score, badness;
+
+begin:
+ result = NULL;
+ badness = -1;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ score = compute_score2(sk, net, saddr, sport,
+ daddr, hnum, dif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ if (score == SCORE2_MAX)
+ goto exact_match;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot2)
+ goto begin;
+
+ if (result) {
+exact_match:
+ if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+ result = NULL;
+ else if (unlikely(compute_score2(result, net, saddr, sport,
+ daddr, hnum, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ return result;
+}
+
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
* harder than this. -DaveM
*/
@@ -287,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
struct sock *sk, *result;
struct hlist_nulls_node *node;
unsigned short hnum = ntohs(dport);
- unsigned int hash = udp_hashfn(net, hnum, udptable->mask);
- struct udp_hslot *hslot = &udptable->hash[hash];
+ unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness;
rcu_read_lock();
+	/*
+	 * Primary chain is long: use the secondary (address, port) hash
+	 * instead, unless the secondary chain is longer than the primary
+	 * one, in which case fall back to scanning the primary chain.
+	 */
+	if (hslot->count > 10) {
+		/* First try sockets bound to the exact destination address. */
+		hash2 = udp4_portaddr_hash(net, daddr, hnum);
+		slot2 = hash2 & udptable->mask;
+		hslot2 = &udptable->hash2[slot2];
+		if (hslot->count < hslot2->count)
+			goto begin;
+
+		result = udp4_lib_lookup2(net, saddr, sport,
+					  daddr, hnum, dif,
+					  hslot2, slot2);
+		if (!result) {
+			/*
+			 * Then try the wildcard chain. compute_score2()
+			 * requires inet_rcv_saddr == daddr, so the local
+			 * address passed here must be INADDR_ANY. The remote
+			 * address stays saddr so that connected sockets
+			 * bound to INADDR_ANY are still matched.
+			 */
+			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+			slot2 = hash2 & udptable->mask;
+			hslot2 = &udptable->hash2[slot2];
+			if (hslot->count < hslot2->count)
+				goto begin;
+
+			result = udp4_lib_lookup2(net, saddr, sport,
+						  htonl(INADDR_ANY), hnum, dif,
+						  hslot2, slot2);
+		}
+		rcu_read_unlock();
+		return result;
+	}
begin:
result = NULL;
badness = -1;
@@ -308,7 +495,7 @@ begin:
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
- if (get_nulls_value(node) != hash)
+ if (get_nulls_value(node) != slot)
goto begin;
if (result) {
@@ -358,13 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
sk_nulls_for_each_from(s, node) {
struct inet_sock *inet = inet_sk(s);
- if (!net_eq(sock_net(s), net) ||
- s->sk_hash != hnum ||
- (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
- (inet->inet_dport != rmt_port && inet->inet_dport) ||
- (inet->inet_rcv_saddr &&
- inet->inet_rcv_saddr != loc_addr) ||
- ipv6_only_sock(s) ||
+ if (!net_eq(sock_net(s), net) ||
+ udp_sk(s)->udp_port_hash != hnum ||
+ (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+ (inet->inet_dport != rmt_port && inet->inet_dport) ||
+ (inet->inet_rcv_saddr &&
+ inet->inet_rcv_saddr != loc_addr) ||
+ ipv6_only_sock(s) ||
(s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
continue;
if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
@@ -1005,9 +1192,7 @@ try_again:
err = ulen;
out_free:
- lock_sock(sk);
- skb_free_datagram(sk, skb);
- release_sock(sk);
+ skb_free_datagram_locked(sk, skb);
out:
return err;
@@ -1050,13 +1235,22 @@ void udp_lib_unhash(struct sock *sk)
{
if (sk_hashed(sk)) {
struct udp_table *udptable = sk->sk_prot->h.udp_table;
- struct udp_hslot *hslot = udp_hashslot(udptable, sock_net(sk),
- sk->sk_hash);
+ struct udp_hslot *hslot, *hslot2;
+
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
spin_lock_bh(&hslot->lock);
if (sk_nulls_del_node_init_rcu(sk)) {
+ hslot->count--;
inet_sk(sk)->inet_num = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
}
spin_unlock_bh(&hslot->lock);
}
@@ -1192,49 +1386,83 @@ drop:
return -1;
}
+
+/*
+ * Hand @skb to each of the @count sockets in @stack via udp_queue_rcv_skb().
+ *
+ * The socket at index @final gets @skb itself; every other socket gets a
+ * clone. If cloning fails, the socket's sk_drops and the RCVBUFERRORS and
+ * INERRORS counters are incremented and the socket is skipped. When
+ * udp_queue_rcv_skb() returns a positive value the buffer is kept and
+ * offered to the next socket; a buffer still held after the loop is freed.
+ */
+static void flush_stack(struct sock **stack, unsigned int count,
+			struct sk_buff *skb, unsigned int final)
+{
+	struct sk_buff *pending = NULL;
+	unsigned int idx;
+
+	for (idx = 0; idx < count; idx++) {
+		struct sock *rcv = stack[idx];
+
+		if (likely(!pending)) {
+			if (idx == final)
+				pending = skb;
+			else
+				pending = skb_clone(skb, GFP_ATOMIC);
+		}
+
+		if (!pending) {
+			atomic_inc(&rcv->sk_drops);
+			UDP_INC_STATS_BH(sock_net(rcv), UDP_MIB_RCVBUFERRORS,
+					 IS_UDPLITE(rcv));
+			UDP_INC_STATS_BH(sock_net(rcv), UDP_MIB_INERRORS,
+					 IS_UDPLITE(rcv));
+			continue;
+		}
+
+		if (udp_queue_rcv_skb(rcv, pending) <= 0)
+			pending = NULL;
+	}
+
+	if (unlikely(pending))
+		kfree_skb(pending);
+}
+
/*
* Multicasts and broadcasts go to each listener.
*
- * Note: called only from the BH handler context,
- * so we don't need to lock the hashes.
+ * Note: called only from the BH handler context.
*/
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udphdr *uh,
__be32 saddr, __be32 daddr,
struct udp_table *udptable)
{
- struct sock *sk;
+ struct sock *sk, *stack[256 / sizeof(struct sock *)];
struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
int dif;
+ unsigned int i, count = 0;
spin_lock(&hslot->lock);
sk = sk_nulls_head(&hslot->head);
dif = skb->dev->ifindex;
sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
- if (sk) {
- struct sock *sknext = NULL;
-
- do {
- struct sk_buff *skb1 = skb;
-
- sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
- daddr, uh->source, saddr,
- dif);
- if (sknext)
- skb1 = skb_clone(skb, GFP_ATOMIC);
-
- if (skb1) {
- int ret = udp_queue_rcv_skb(sk, skb1);
- if (ret > 0)
- /* we should probably re-process instead
- * of dropping packets here. */
- kfree_skb(skb1);
- }
- sk = sknext;
- } while (sknext);
- } else
- consume_skb(skb);
+ while (sk) {
+ stack[count++] = sk;
+ sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
+ daddr, uh->source, saddr, dif);
+ if (unlikely(count == ARRAY_SIZE(stack))) {
+ if (!sk)
+ break;
+ flush_stack(stack, count, skb, ~0);
+ count = 0;
+ }
+ }
+ /*
+ * before releasing chain lock, we must take a reference on sockets
+ */
+ for (i = 0; i < count; i++)
+ sock_hold(stack[i]);
+
spin_unlock(&hslot->lock);
+
+ /*
+ * do the slow work with no lock held
+ */
+ if (count) {
+ flush_stack(stack, count, skb, count - 1);
+
+ for (i = 0; i < count; i++)
+ sock_put(stack[i]);
+ } else {
+ kfree_skb(skb);
+ }
return 0;
}
@@ -1844,7 +2072,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
if (!CONFIG_BASE_SMALL)
table->hash = alloc_large_system_hash(name,
- sizeof(struct udp_hslot),
+ 2 * sizeof(struct udp_hslot),
uhash_entries,
21, /* one slot per 2 MB */
0,
@@ -1856,16 +2084,23 @@ void __init udp_table_init(struct udp_table *table, const char *name)
*/
if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
- sizeof(struct udp_hslot), GFP_KERNEL);
+ 2 * sizeof(struct udp_hslot), GFP_KERNEL);
if (!table->hash)
panic(name);
table->log = ilog2(UDP_HTABLE_SIZE_MIN);
table->mask = UDP_HTABLE_SIZE_MIN - 1;
}
+ table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+ table->hash[i].count = 0;
spin_lock_init(&table->hash[i].lock);
}
+ for (i = 0; i <= table->mask; i++) {
+ INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+ table->hash2[i].count = 0;
+ spin_lock_init(&table->hash2[i].lock);
+ }
}
void __init udp_init(void)
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 470c504..66f7951 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -64,7 +64,6 @@ static struct inet_protosw udplite4_protosw = {
.protocol = IPPROTO_UDPLITE,
.prot = &udplite_prot,
.ops = &inet_dgram_ops,
- .capability = -1,
.no_check = 0, /* must checksum (RFC 3828) */
.flags = INET_PROTOSW_PERMANENT,
};
OpenPOWER on IntegriCloud