summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c13
-rw-r--r--net/ipv4/arp.c112
-rw-r--r--net/ipv4/devinet.c33
-rw-r--r--net/ipv4/fib_frontend.c130
-rw-r--r--net/ipv4/fib_lookup.h4
-rw-r--r--net/ipv4/fib_rules.c23
-rw-r--r--net/ipv4/fib_semantics.c7
-rw-r--r--net/ipv4/fib_trie.c1758
-rw-r--r--net/ipv4/igmp.c48
-rw-r--r--net/ipv4/inet_connection_sock.c160
-rw-r--r--net/ipv4/inet_diag.c428
-rw-r--r--net/ipv4/inet_hashtables.c66
-rw-r--r--net/ipv4/inet_timewait_sock.c5
-rw-r--r--net/ipv4/ip_sockglue.c28
-rw-r--r--net/ipv4/ipmr.c4
-rw-r--r--net/ipv4/netfilter/Kconfig38
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c7
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c17
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c25
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c3
-rw-r--r--net/ipv4/ping.c13
-rw-r--r--net/ipv4/raw.c7
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/ipv4/syncookies.c25
-rw-r--r--net/ipv4/sysctl_net_ipv4.c14
-rw-r--r--net/ipv4/tcp.c10
-rw-r--r--net/ipv4/tcp_cong.c2
-rw-r--r--net/ipv4/tcp_diag.c4
-rw-r--r--net/ipv4/tcp_fastopen.c13
-rw-r--r--net/ipv4/tcp_input.c84
-rw-r--r--net/ipv4/tcp_ipv4.c55
-rw-r--r--net/ipv4/tcp_metrics.c151
-rw-r--r--net/ipv4/tcp_minisocks.c10
-rw-r--r--net/ipv4/tcp_offload.c4
-rw-r--r--net/ipv4/tcp_output.c79
-rw-r--r--net/ipv4/tcp_timer.c13
-rw-r--r--net/ipv4/udp.c29
-rw-r--r--net/ipv4/udp_diag.c22
-rw-r--r--net/ipv4/udp_impl.h4
-rw-r--r--net/ipv4/xfrm4_policy.c1
40 files changed, 1893 insertions, 1558 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d2e49ba..64a9c0f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -716,8 +716,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
}
EXPORT_SYMBOL(inet_getname);
-int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size)
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
@@ -728,7 +727,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+ return sk->sk_prot->sendmsg(sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);
@@ -750,8 +749,8 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
}
EXPORT_SYMBOL(inet_sendpage);
-int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
- size_t size, int flags)
+int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
{
struct sock *sk = sock->sk;
int addr_len = 0;
@@ -759,7 +758,7 @@ int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
sock_rps_record_flow(sk);
- err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
@@ -1675,7 +1674,7 @@ static int __init inet_init(void)
struct list_head *r;
int rc = -EINVAL;
- BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));
+ sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
rc = proto_register(&tcp_prot, 1);
if (rc)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 205e147..5f5c674 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -122,6 +122,7 @@
* Interface to generic neighbour cache.
*/
static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static bool arp_key_eq(const struct neighbour *n, const void *pkey);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -149,18 +150,12 @@ static const struct neigh_ops arp_direct_ops = {
.connected_output = neigh_direct_output,
};
-static const struct neigh_ops arp_broken_ops = {
- .family = AF_INET,
- .solicit = arp_solicit,
- .error_report = arp_error_report,
- .output = neigh_compat_output,
- .connected_output = neigh_compat_output,
-};
-
struct neigh_table arp_tbl = {
.family = AF_INET,
.key_len = 4,
+ .protocol = cpu_to_be16(ETH_P_IP),
.hash = arp_hash,
+ .key_eq = arp_key_eq,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
.id = "arp_cache",
@@ -216,7 +211,12 @@ static u32 arp_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd)
{
- return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd);
+ return arp_hashfn(pkey, dev, hash_rnd);
+}
+
+static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
+{
+ return neigh_key_eq32(neigh, pkey);
}
static int arp_constructor(struct neighbour *neigh)
@@ -260,35 +260,6 @@ static int arp_constructor(struct neighbour *neigh)
in old paradigm.
*/
-#if 1
- /* So... these "amateur" devices are hopeless.
- The only thing, that I can say now:
- It is very sad that we need to keep ugly obsolete
- code to make them happy.
-
- They should be moved to more reasonable state, now
- they use rebuild_header INSTEAD OF hard_start_xmit!!!
- Besides that, they are sort of out of date
- (a lot of redundant clones/copies, useless in 2.1),
- I wonder why people believe that they work.
- */
- switch (dev->type) {
- default:
- break;
- case ARPHRD_ROSE:
-#if IS_ENABLED(CONFIG_AX25)
- case ARPHRD_AX25:
-#if IS_ENABLED(CONFIG_NETROM)
- case ARPHRD_NETROM:
-#endif
- neigh->ops = &arp_broken_ops;
- neigh->output = neigh->ops->output;
- return 0;
-#else
- break;
-#endif
- }
-#endif
if (neigh->type == RTN_MULTICAST) {
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
@@ -433,71 +404,6 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
return flag;
}
-/* OBSOLETE FUNCTIONS */
-
-/*
- * Find an arp mapping in the cache. If not found, post a request.
- *
- * It is very UGLY routine: it DOES NOT use skb->dst->neighbour,
- * even if it exists. It is supposed that skb->dev was mangled
- * by a virtual device (eql, shaper). Nobody but broken devices
- * is allowed to use this function, it is scheduled to be removed. --ANK
- */
-
-static int arp_set_predefined(int addr_hint, unsigned char *haddr,
- __be32 paddr, struct net_device *dev)
-{
- switch (addr_hint) {
- case RTN_LOCAL:
- pr_debug("arp called for own IP address\n");
- memcpy(haddr, dev->dev_addr, dev->addr_len);
- return 1;
- case RTN_MULTICAST:
- arp_mc_map(paddr, haddr, dev, 1);
- return 1;
- case RTN_BROADCAST:
- memcpy(haddr, dev->broadcast, dev->addr_len);
- return 1;
- }
- return 0;
-}
-
-
-int arp_find(unsigned char *haddr, struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- __be32 paddr;
- struct neighbour *n;
-
- if (!skb_dst(skb)) {
- pr_debug("arp_find is called with dst==NULL\n");
- kfree_skb(skb);
- return 1;
- }
-
- paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
- if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
- paddr, dev))
- return 0;
-
- n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
-
- if (n) {
- n->used = jiffies;
- if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
- neigh_ha_snapshot(haddr, n, dev);
- neigh_release(n);
- return 0;
- }
- neigh_release(n);
- } else
- kfree_skb(skb);
- return 1;
-}
-EXPORT_SYMBOL(arp_find);
-
-/* END OF OBSOLETE FUNCTIONS */
-
/*
* Check if we can use proxy ARP for this path
*/
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 3a8985c..975ee5e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -107,7 +107,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
-static u32 inet_addr_hash(struct net *net, __be32 addr)
+static u32 inet_addr_hash(const struct net *net, __be32 addr)
{
u32 val = (__force u32) addr ^ net_hash_mix(net);
@@ -548,6 +548,26 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
return NULL;
}
+static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
+{
+ struct ip_mreqn mreq = {
+ .imr_multiaddr.s_addr = ifa->ifa_address,
+ .imr_ifindex = ifa->ifa_dev->dev->ifindex,
+ };
+ int ret;
+
+ ASSERT_RTNL();
+
+ lock_sock(sk);
+ if (join)
+ ret = ip_mc_join_group(sk, &mreq);
+ else
+ ret = ip_mc_leave_group(sk, &mreq);
+ release_sock(sk);
+
+ return ret;
+}
+
static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
@@ -584,6 +604,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh)
!inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
continue;
+ if (ipv4_is_multicast(ifa->ifa_address))
+ ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa);
__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
return 0;
}
@@ -838,6 +860,15 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
* userspace already relies on not having to provide this.
*/
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
+ if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
+ int ret = ip_mc_config(net->ipv4.mc_autojoin_sk,
+ true, ifa);
+
+ if (ret < 0) {
+ inet_free_ifa(ifa);
+ return ret;
+ }
+ }
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
} else {
inet_free_ifa(ifa);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 57be71d..e5b6b05 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -52,12 +52,12 @@ static int __net_init fib4_rules_init(struct net *net)
{
struct fib_table *local_table, *main_table;
- local_table = fib_trie_table(RT_TABLE_LOCAL);
- if (local_table == NULL)
+ main_table = fib_trie_table(RT_TABLE_MAIN, NULL);
+ if (main_table == NULL)
return -ENOMEM;
- main_table = fib_trie_table(RT_TABLE_MAIN);
- if (main_table == NULL)
+ local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
+ if (local_table == NULL)
goto fail;
hlist_add_head_rcu(&local_table->tb_hlist,
@@ -67,14 +67,14 @@ static int __net_init fib4_rules_init(struct net *net)
return 0;
fail:
- fib_free_table(local_table);
+ fib_free_table(main_table);
return -ENOMEM;
}
#else
struct fib_table *fib_new_table(struct net *net, u32 id)
{
- struct fib_table *tb;
+ struct fib_table *tb, *alias = NULL;
unsigned int h;
if (id == 0)
@@ -83,23 +83,23 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- tb = fib_trie_table(id);
+ if (id == RT_TABLE_LOCAL)
+ alias = fib_new_table(net, RT_TABLE_MAIN);
+
+ tb = fib_trie_table(id, alias);
if (!tb)
return NULL;
switch (id) {
case RT_TABLE_LOCAL:
- net->ipv4.fib_local = tb;
+ rcu_assign_pointer(net->ipv4.fib_local, tb);
break;
-
case RT_TABLE_MAIN:
- net->ipv4.fib_main = tb;
+ rcu_assign_pointer(net->ipv4.fib_main, tb);
break;
-
case RT_TABLE_DEFAULT:
- net->ipv4.fib_default = tb;
+ rcu_assign_pointer(net->ipv4.fib_default, tb);
break;
-
default:
break;
}
@@ -129,16 +129,62 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
+static void fib_replace_table(struct net *net, struct fib_table *old,
+ struct fib_table *new)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ switch (new->tb_id) {
+ case RT_TABLE_LOCAL:
+ rcu_assign_pointer(net->ipv4.fib_local, new);
+ break;
+ case RT_TABLE_MAIN:
+ rcu_assign_pointer(net->ipv4.fib_main, new);
+ break;
+ case RT_TABLE_DEFAULT:
+ rcu_assign_pointer(net->ipv4.fib_default, new);
+ break;
+ default:
+ break;
+ }
+
+#endif
+ /* replace the old table in the hlist */
+ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
+}
+
+int fib_unmerge(struct net *net)
+{
+ struct fib_table *old, *new;
+
+ /* attempt to fetch local table if it has been allocated */
+ old = fib_get_table(net, RT_TABLE_LOCAL);
+ if (!old)
+ return 0;
+
+ new = fib_trie_unmerge(old);
+ if (!new)
+ return -ENOMEM;
+
+ /* replace merged table with clean table */
+ if (new != old) {
+ fib_replace_table(net, old, new);
+ fib_free_table(old);
+ }
+
+ return 0;
+}
+
static void fib_flush(struct net *net)
{
int flushed = 0;
- struct fib_table *tb;
- struct hlist_head *head;
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, head, tb_hlist)
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct hlist_node *tmp;
+ struct fib_table *tb;
+
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
flushed += fib_table_flush(tb);
}
@@ -146,6 +192,19 @@ static void fib_flush(struct net *net)
rt_cache_flush(net);
}
+void fib_flush_external(struct net *net)
+{
+ struct fib_table *tb;
+ struct hlist_head *head;
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, head, tb_hlist)
+ fib_table_flush_external(tb);
+ }
+}
+
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
@@ -665,10 +724,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[0];
s_e = cb->args[1];
+ rcu_read_lock();
+
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
if (e < s_e)
goto next;
if (dumped)
@@ -682,6 +743,8 @@ next:
}
}
out:
+ rcu_read_unlock();
+
cb->args[1] = e;
cb->args[0] = h;
@@ -1111,20 +1174,41 @@ static void ip_fib_net_exit(struct net *net)
{
unsigned int i;
+ rtnl_lock();
+
#ifdef CONFIG_IP_MULTIPLE_TABLES
fib4_rules_exit(net);
#endif
- rtnl_lock();
for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
- struct fib_table *tb;
- struct hlist_head *head;
+ struct hlist_head *head = &net->ipv4.fib_table_hash[i];
struct hlist_node *tmp;
+ struct fib_table *tb;
+
+ /* this is done in two passes as flushing the table could
+ * cause it to be reallocated in order to accommodate new
+ * tnodes at the root as the table shrinks.
+ */
+ hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
+ fib_table_flush(tb);
- head = &net->ipv4.fib_table_hash[i];
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ switch (tb->tb_id) {
+ case RT_TABLE_LOCAL:
+ RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
+ break;
+ case RT_TABLE_MAIN:
+ RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+ break;
+ case RT_TABLE_DEFAULT:
+ RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
+ break;
+ default:
+ break;
+ }
+#endif
hlist_del(&tb->tb_hlist);
- fib_table_flush(tb);
fib_free_table(tb);
}
}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 825981b1..c6211ed 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -6,11 +6,13 @@
#include <net/ip_fib.h>
struct fib_alias {
- struct list_head fa_list;
+ struct hlist_node fa_list;
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
u8 fa_state;
+ u8 fa_slen;
+ u32 tb_id;
struct rcu_head rcu;
};
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index d3db718..e9bc5e4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -174,6 +174,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (frh->tos & ~IPTOS_TOS_MASK)
goto errout;
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
if (rule->table == RT_TABLE_UNSPEC) {
if (rule->action == FR_ACT_TO_TBL) {
struct fib_table *table;
@@ -209,21 +214,31 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
+ fib_flush_external(rule->fr_net);
+
err = 0;
errout:
return err;
}
-static void fib4_rule_delete(struct fib_rule *rule)
+static int fib4_rule_delete(struct fib_rule *rule)
{
struct net *net = rule->fr_net;
-#ifdef CONFIG_IP_ROUTE_CLASSID
- struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+ int err;
- if (rule4->tclassid)
+ /* split local/main if they are not already split */
+ err = fib_unmerge(net);
+ if (err)
+ goto errout;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (((struct fib4_rule *)rule)->tclassid)
net->ipv4.fib_num_tclassid_users--;
#endif
net->ipv4.fib_has_custom_rules = true;
+ fib_flush_external(rule->fr_net);
+errout:
+ return err;
}
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1e2090e..66c1e4f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -213,7 +213,6 @@ static void free_fib_info_rcu(struct rcu_head *head)
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
- release_net(fi->fib_net);
if (fi->fib_metrics != (u32 *) dst_default_metrics)
kfree(fi->fib_metrics);
kfree(fi);
@@ -814,7 +813,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
} else
fi->fib_metrics = (u32 *) dst_default_metrics;
- fi->fib_net = hold_net(net);
+ fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
fi->fib_scope = cfg->fc_scope;
fi->fib_flags = cfg->fc_flags;
@@ -1163,12 +1162,12 @@ int fib_sync_down_dev(struct net_device *dev, int force)
void fib_select_default(struct fib_result *res)
{
struct fib_info *fi = NULL, *last_resort = NULL;
- struct list_head *fa_head = res->fa_head;
+ struct hlist_head *fa_head = res->fa_head;
struct fib_table *tb = res->table;
int order = -1, last_idx = -1;
struct fib_alias *fa;
- list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
if (next_fi->fib_scope != res->scope ||
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3daf022..e3b4aee 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -79,6 +79,7 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/switchdev.h>
#include "fib_lookup.h"
#define MAX_STAT_DEPTH 32
@@ -88,38 +89,35 @@
typedef unsigned int t_key;
-#define IS_TNODE(n) ((n)->bits)
-#define IS_LEAF(n) (!(n)->bits)
+#define IS_TRIE(n) ((n)->pos >= KEYLENGTH)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
-#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
-
-struct tnode {
+struct key_vector {
t_key key;
- unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char slen;
- struct tnode __rcu *parent;
- struct rcu_head rcu;
union {
- /* The fields in this struct are valid if bits > 0 (TNODE) */
- struct {
- t_key empty_children; /* KEYLENGTH bits needed */
- t_key full_children; /* KEYLENGTH bits needed */
- struct tnode __rcu *child[0];
- };
- /* This list pointer if valid if bits == 0 (LEAF) */
- struct hlist_head list;
+ /* This list pointer if valid if (pos | bits) == 0 (LEAF) */
+ struct hlist_head leaf;
+ /* This array is valid if (pos | bits) > 0 (TNODE) */
+ struct key_vector __rcu *tnode[0];
};
};
-struct leaf_info {
- struct hlist_node hlist;
- int plen;
- u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
- struct list_head falh;
+struct tnode {
struct rcu_head rcu;
+ t_key empty_children; /* KEYLENGTH bits needed */
+ t_key full_children; /* KEYLENGTH bits needed */
+ struct key_vector __rcu *parent;
+ struct key_vector kv[1];
+#define tn_bits kv[0].bits
};
+#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n])
+#define LEAF_SIZE TNODE_SIZE(1)
+
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
unsigned int gets;
@@ -142,13 +140,13 @@ struct trie_stat {
};
struct trie {
- struct tnode __rcu *trie;
+ struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats;
#endif
};
-static void resize(struct trie *t, struct tnode *tn);
+static struct key_vector *resize(struct trie *t, struct key_vector *tn);
static size_t tnode_free_size;
/*
@@ -161,41 +159,46 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
+static inline struct tnode *tn_info(struct key_vector *kv)
+{
+ return container_of(kv, struct tnode, kv[0]);
+}
+
/* caller must hold RTNL */
-#define node_parent(n) rtnl_dereference((n)->parent)
+#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
+#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
/* caller must hold RCU read lock or RTNL */
-#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
+#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
+#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
/* wrapper for rcu_assign_pointer */
-static inline void node_set_parent(struct tnode *n, struct tnode *tp)
+static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
if (n)
- rcu_assign_pointer(n->parent, tp);
+ rcu_assign_pointer(tn_info(n)->parent, tp);
}
-#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
/* This provides us with the number of children in this node, in the case of a
* leaf this will return 0 meaning none of the children are accessible.
*/
-static inline unsigned long tnode_child_length(const struct tnode *tn)
+static inline unsigned long child_length(const struct key_vector *tn)
{
return (1ul << tn->bits) & ~(1ul);
}
-/* caller must hold RTNL */
-static inline struct tnode *tnode_get_child(const struct tnode *tn,
- unsigned long i)
-{
- return rtnl_dereference(tn->child[i]);
-}
+#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
-/* caller must hold RCU read lock or RTNL */
-static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
- unsigned long i)
+static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
- return rcu_dereference_rtnl(tn->child[i]);
+ unsigned long index = key ^ kv->key;
+
+ if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
+ return 0;
+
+ return index >> kv->pos;
}
/* To understand this stuff, an understanding of keys and all their bits is
@@ -274,106 +277,104 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
}
#define TNODE_KMALLOC_MAX \
- ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
+ ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
+#define TNODE_VMALLOC_MAX \
+ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
static void __node_free_rcu(struct rcu_head *head)
{
struct tnode *n = container_of(head, struct tnode, rcu);
- if (IS_LEAF(n))
+ if (!n->tn_bits)
kmem_cache_free(trie_leaf_kmem, n);
- else if (n->bits <= TNODE_KMALLOC_MAX)
+ else if (n->tn_bits <= TNODE_KMALLOC_MAX)
kfree(n);
else
vfree(n);
}
-#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
+#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
-static inline void free_leaf_info(struct leaf_info *leaf)
+static struct tnode *tnode_alloc(int bits)
{
- kfree_rcu(leaf, rcu);
-}
+ size_t size;
+
+ /* verify bits is within bounds */
+ if (bits > TNODE_VMALLOC_MAX)
+ return NULL;
+
+ /* determine size and verify it is non-zero and didn't overflow */
+ size = TNODE_SIZE(1ul << bits);
-static struct tnode *tnode_alloc(size_t size)
-{
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else
return vzalloc(size);
}
-static inline void empty_child_inc(struct tnode *n)
+static inline void empty_child_inc(struct key_vector *n)
{
- ++n->empty_children ? : ++n->full_children;
+ ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
}
-static inline void empty_child_dec(struct tnode *n)
+static inline void empty_child_dec(struct key_vector *n)
{
- n->empty_children-- ? : n->full_children--;
+ tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
}
-static struct tnode *leaf_new(t_key key)
+static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
- struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
- if (l) {
- l->parent = NULL;
- /* set key and pos to reflect full key value
- * any trailing zeros in the key should be ignored
- * as the nodes are searched
- */
- l->key = key;
- l->slen = 0;
- l->pos = 0;
- /* set bits to 0 indicating we are not a tnode */
- l->bits = 0;
+ struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+ struct key_vector *l = kv->kv;
- INIT_HLIST_HEAD(&l->list);
- }
- return l;
-}
+ if (!kv)
+ return NULL;
-static struct leaf_info *leaf_info_new(int plen)
-{
- struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
- if (li) {
- li->plen = plen;
- li->mask_plen = ntohl(inet_make_mask(plen));
- INIT_LIST_HEAD(&li->falh);
- }
- return li;
+ /* initialize key vector */
+ l->key = key;
+ l->pos = 0;
+ l->bits = 0;
+ l->slen = fa->fa_slen;
+
+ /* link leaf to fib alias */
+ INIT_HLIST_HEAD(&l->leaf);
+ hlist_add_head(&fa->fa_list, &l->leaf);
+
+ return l;
}
-static struct tnode *tnode_new(t_key key, int pos, int bits)
+static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = offsetof(struct tnode, child[1ul << bits]);
- struct tnode *tn = tnode_alloc(sz);
+ struct tnode *tnode = tnode_alloc(bits);
unsigned int shift = pos + bits;
+ struct key_vector *tn = tnode->kv;
/* verify bits and pos their msb bits clear and values are valid */
BUG_ON(!bits || (shift > KEYLENGTH));
- if (tn) {
- tn->parent = NULL;
- tn->slen = pos;
- tn->pos = pos;
- tn->bits = bits;
- tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
- if (bits == KEYLENGTH)
- tn->full_children = 1;
- else
- tn->empty_children = 1ul << bits;
- }
+ pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
+ sizeof(struct key_vector *) << bits);
+
+ if (!tnode)
+ return NULL;
+
+ if (bits == KEYLENGTH)
+ tnode->full_children = 1;
+ else
+ tnode->empty_children = 1ul << bits;
+
+ tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
+ tn->pos = pos;
+ tn->bits = bits;
+ tn->slen = pos;
- pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct tnode *) << bits);
return tn;
}
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
+static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
{
return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
@@ -381,12 +382,13 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
/* Add a child at position i overwriting the old value.
* Update the value of full_children and empty_children.
*/
-static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
+static void put_child(struct key_vector *tn, unsigned long i,
+ struct key_vector *n)
{
- struct tnode *chi = tnode_get_child(tn, i);
+ struct key_vector *chi = get_child(tn, i);
int isfull, wasfull;
- BUG_ON(i >= tnode_child_length(tn));
+ BUG_ON(i >= child_length(tn));
/* update emptyChildren, overflow into fullChildren */
if (n == NULL && chi != NULL)
@@ -399,23 +401,23 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
isfull = tnode_full(tn, n);
if (wasfull && !isfull)
- tn->full_children--;
+ tn_info(tn)->full_children--;
else if (!wasfull && isfull)
- tn->full_children++;
+ tn_info(tn)->full_children++;
if (n && (tn->slen < n->slen))
tn->slen = n->slen;
- rcu_assign_pointer(tn->child[i], n);
+ rcu_assign_pointer(tn->tnode[i], n);
}
-static void update_children(struct tnode *tn)
+static void update_children(struct key_vector *tn)
{
unsigned long i;
/* update all of the child parent pointers */
- for (i = tnode_child_length(tn); i;) {
- struct tnode *inode = tnode_get_child(tn, --i);
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
if (!inode)
continue;
@@ -431,36 +433,37 @@ static void update_children(struct tnode *tn)
}
}
-static inline void put_child_root(struct tnode *tp, struct trie *t,
- t_key key, struct tnode *n)
+static inline void put_child_root(struct key_vector *tp, t_key key,
+ struct key_vector *n)
{
- if (tp)
- put_child(tp, get_index(key, tp), n);
+ if (IS_TRIE(tp))
+ rcu_assign_pointer(tp->tnode[0], n);
else
- rcu_assign_pointer(t->trie, n);
+ put_child(tp, get_index(key, tp), n);
}
-static inline void tnode_free_init(struct tnode *tn)
+static inline void tnode_free_init(struct key_vector *tn)
{
- tn->rcu.next = NULL;
+ tn_info(tn)->rcu.next = NULL;
}
-static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
+static inline void tnode_free_append(struct key_vector *tn,
+ struct key_vector *n)
{
- n->rcu.next = tn->rcu.next;
- tn->rcu.next = &n->rcu;
+ tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
+ tn_info(tn)->rcu.next = &tn_info(n)->rcu;
}
-static void tnode_free(struct tnode *tn)
+static void tnode_free(struct key_vector *tn)
{
- struct callback_head *head = &tn->rcu;
+ struct callback_head *head = &tn_info(tn)->rcu;
while (head) {
head = head->next;
- tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
+ tnode_free_size += TNODE_SIZE(1ul << tn->bits);
node_free(tn);
- tn = container_of(head, struct tnode, rcu);
+ tn = container_of(head, struct tnode, rcu)->kv;
}
if (tnode_free_size >= PAGE_SIZE * sync_pages) {
@@ -469,14 +472,16 @@ static void tnode_free(struct tnode *tn)
}
}
-static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
+static struct key_vector *replace(struct trie *t,
+ struct key_vector *oldtnode,
+ struct key_vector *tn)
{
- struct tnode *tp = node_parent(oldtnode);
+ struct key_vector *tp = node_parent(oldtnode);
unsigned long i;
/* setup the parent pointer out of and back into this node */
NODE_INIT_PARENT(tn, tp);
- put_child_root(tp, t, tn->key, tn);
+ put_child_root(tp, tn->key, tn);
/* update all of the child parent pointers */
update_children(tn);
@@ -485,18 +490,21 @@ static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
tnode_free(oldtnode);
/* resize children now that oldtnode is freed */
- for (i = tnode_child_length(tn); i;) {
- struct tnode *inode = tnode_get_child(tn, --i);
+ for (i = child_length(tn); i;) {
+ struct key_vector *inode = get_child(tn, --i);
/* resize child node */
if (tnode_full(tn, inode))
- resize(t, inode);
+ tn = resize(t, inode);
}
+
+ return tp;
}
-static int inflate(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *inflate(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *tn;
+ struct key_vector *tn;
unsigned long i;
t_key m;
@@ -504,7 +512,7 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
if (!tn)
- return -ENOMEM;
+ goto notnode;
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
@@ -514,9 +522,9 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
* point to existing tnodes and the links between our allocated
* nodes.
*/
- for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
- struct tnode *inode = tnode_get_child(oldtnode, --i);
- struct tnode *node0, *node1;
+ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
+ struct key_vector *inode = get_child(oldtnode, --i);
+ struct key_vector *node0, *node1;
unsigned long j, k;
/* An empty child */
@@ -534,8 +542,8 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
/* An internal node with two children */
if (inode->bits == 1) {
- put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
- put_child(tn, 2 * i, tnode_get_child(inode, 0));
+ put_child(tn, 2 * i + 1, get_child(inode, 1));
+ put_child(tn, 2 * i, get_child(inode, 0));
continue;
}
@@ -564,11 +572,11 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
tnode_free_append(tn, node0);
/* populate child pointers in new nodes */
- for (k = tnode_child_length(inode), j = k / 2; j;) {
- put_child(node1, --j, tnode_get_child(inode, --k));
- put_child(node0, j, tnode_get_child(inode, j));
- put_child(node1, --j, tnode_get_child(inode, --k));
- put_child(node0, j, tnode_get_child(inode, j));
+ for (k = child_length(inode), j = k / 2; j;) {
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
+ put_child(node1, --j, get_child(inode, --k));
+ put_child(node0, j, get_child(inode, j));
}
/* link new nodes to parent */
@@ -581,25 +589,25 @@ static int inflate(struct trie *t, struct tnode *oldtnode)
}
/* setup the parent pointers into and out of this node */
- replace(t, oldtnode, tn);
-
- return 0;
+ return replace(t, oldtnode, tn);
nomem:
/* all pointers should be clean so we are done */
tnode_free(tn);
- return -ENOMEM;
+notnode:
+ return NULL;
}
-static int halve(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *halve(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *tn;
+ struct key_vector *tn;
unsigned long i;
pr_debug("In halve\n");
tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
if (!tn)
- return -ENOMEM;
+ goto notnode;
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
@@ -609,10 +617,10 @@ static int halve(struct trie *t, struct tnode *oldtnode)
* point to existing tnodes and the links between our allocated
* nodes.
*/
- for (i = tnode_child_length(oldtnode); i;) {
- struct tnode *node1 = tnode_get_child(oldtnode, --i);
- struct tnode *node0 = tnode_get_child(oldtnode, --i);
- struct tnode *inode;
+ for (i = child_length(oldtnode); i;) {
+ struct key_vector *node1 = get_child(oldtnode, --i);
+ struct key_vector *node0 = get_child(oldtnode, --i);
+ struct key_vector *inode;
/* At least one of the children is empty */
if (!node1 || !node0) {
@@ -622,10 +630,8 @@ static int halve(struct trie *t, struct tnode *oldtnode)
/* Two nonempty children */
inode = tnode_new(node0->key, oldtnode->pos, 1);
- if (!inode) {
- tnode_free(tn);
- return -ENOMEM;
- }
+ if (!inode)
+ goto nomem;
tnode_free_append(tn, inode);
/* initialize pointers out of node */
@@ -638,30 +644,36 @@ static int halve(struct trie *t, struct tnode *oldtnode)
}
/* setup the parent pointers into and out of this node */
- replace(t, oldtnode, tn);
-
- return 0;
+ return replace(t, oldtnode, tn);
+nomem:
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
+notnode:
+ return NULL;
}
-static void collapse(struct trie *t, struct tnode *oldtnode)
+static struct key_vector *collapse(struct trie *t,
+ struct key_vector *oldtnode)
{
- struct tnode *n, *tp;
+ struct key_vector *n, *tp;
unsigned long i;
/* scan the tnode looking for that one child that might still exist */
- for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
- n = tnode_get_child(oldtnode, --i);
+ for (n = NULL, i = child_length(oldtnode); !n && i;)
+ n = get_child(oldtnode, --i);
/* compress one level */
tp = node_parent(oldtnode);
- put_child_root(tp, t, oldtnode->key, n);
+ put_child_root(tp, oldtnode->key, n);
node_set_parent(n, tp);
/* drop dead node */
node_free(oldtnode);
+
+ return tp;
}
-static unsigned char update_suffix(struct tnode *tn)
+static unsigned char update_suffix(struct key_vector *tn)
{
unsigned char slen = tn->pos;
unsigned long stride, i;
@@ -671,8 +683,8 @@ static unsigned char update_suffix(struct tnode *tn)
* why we start with a stride of 2 since a stride of 1 would
* represent the nodes with suffix length equal to tn->pos
*/
- for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
- struct tnode *n = tnode_get_child(tn, i);
+ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
+ struct key_vector *n = get_child(tn, i);
if (!n || (n->slen <= slen))
continue;
@@ -704,12 +716,12 @@ static unsigned char update_suffix(struct tnode *tn)
*
* 'high' in this instance is the variable 'inflate_threshold'. It
* is expressed as a percentage, so we multiply it with
- * tnode_child_length() and instead of multiplying by 2 (since the
+ * child_length() and instead of multiplying by 2 (since the
* child array will be doubled by inflate()) and multiplying
* the left-hand side by 100 (to handle the percentage thing) we
* multiply the left-hand side by 50.
*
- * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * The left-hand side may look a bit weird: child_length(tn)
* - tn->empty_children is of course the number of non-null children
* in the current node. tn->full_children is the number of "full"
* children, that is non-null tnodes with a skip value of 0.
@@ -719,10 +731,10 @@ static unsigned char update_suffix(struct tnode *tn)
* A clearer way to write this would be:
*
* to_be_doubled = tn->full_children;
- * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ * not_to_be_doubled = child_length(tn) - tn->empty_children -
* tn->full_children;
*
- * new_child_length = tnode_child_length(tn) * 2;
+ * new_child_length = child_length(tn) * 2;
*
* new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
* new_child_length;
@@ -739,57 +751,57 @@ static unsigned char update_suffix(struct tnode *tn)
* inflate_threshold * new_child_length
*
* expand not_to_be_doubled and to_be_doubled, and shorten:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * 100 * (child_length(tn) - tn->empty_children +
* tn->full_children) >= inflate_threshold * new_child_length
*
* expand new_child_length:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * 100 * (child_length(tn) - tn->empty_children +
* tn->full_children) >=
- * inflate_threshold * tnode_child_length(tn) * 2
+ * inflate_threshold * child_length(tn) * 2
*
* shorten again:
- * 50 * (tn->full_children + tnode_child_length(tn) -
+ * 50 * (tn->full_children + child_length(tn) -
* tn->empty_children) >= inflate_threshold *
- * tnode_child_length(tn)
+ * child_length(tn)
*
*/
-static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
+static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
- threshold *= tp ? inflate_threshold : inflate_threshold_root;
- used -= tn->empty_children;
- used += tn->full_children;
+ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
+ used -= tn_info(tn)->empty_children;
+ used += tn_info(tn)->full_children;
/* if bits == KEYLENGTH then pos = 0, and will fail below */
return (used > 1) && tn->pos && ((50 * used) >= threshold);
}
-static bool should_halve(const struct tnode *tp, const struct tnode *tn)
+static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
- threshold *= tp ? halve_threshold : halve_threshold_root;
- used -= tn->empty_children;
+ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
+ used -= tn_info(tn)->empty_children;
/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}
-static bool should_collapse(const struct tnode *tn)
+static inline bool should_collapse(struct key_vector *tn)
{
- unsigned long used = tnode_child_length(tn);
+ unsigned long used = child_length(tn);
- used -= tn->empty_children;
+ used -= tn_info(tn)->empty_children;
/* account for bits == KEYLENGTH case */
- if ((tn->bits == KEYLENGTH) && tn->full_children)
+ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
used -= KEY_MAX;
/* One child or none, time to drop us from the trie */
@@ -797,10 +809,13 @@ static bool should_collapse(const struct tnode *tn)
}
#define MAX_WORK 10
-static void resize(struct trie *t, struct tnode *tn)
+static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
- struct tnode *tp = node_parent(tn);
- struct tnode __rcu **cptr;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
+ struct key_vector *tp = node_parent(tn);
+ unsigned long cindex = get_index(tn->key, tp);
int max_work = MAX_WORK;
pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
@@ -810,183 +825,123 @@ static void resize(struct trie *t, struct tnode *tn)
* doing it ourselves. This way we can let RCU fully do its
* thing without us interfering
*/
- cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
- BUG_ON(tn != rtnl_dereference(*cptr));
+ BUG_ON(tn != get_child(tp, cindex));
/* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
*/
- while (should_inflate(tp, tn) && max_work) {
- if (inflate(t, tn)) {
+ while (should_inflate(tp, tn) && max_work--) {
+ tp = inflate(t, tn);
+ if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->resize_node_skipped);
+ this_cpu_inc(stats->resize_node_skipped);
#endif
break;
}
- max_work--;
- tn = rtnl_dereference(*cptr);
+ tn = get_child(tp, cindex);
}
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return;
+ return node_parent(tn);
/* Halve as long as the number of empty children in this
* node is above threshold.
*/
- while (should_halve(tp, tn) && max_work) {
- if (halve(t, tn)) {
+ while (should_halve(tp, tn) && max_work--) {
+ tp = halve(t, tn);
+ if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->resize_node_skipped);
+ this_cpu_inc(stats->resize_node_skipped);
#endif
break;
}
- max_work--;
- tn = rtnl_dereference(*cptr);
+ tn = get_child(tp, cindex);
}
/* Only one child remains */
- if (should_collapse(tn)) {
- collapse(t, tn);
- return;
- }
+ if (should_collapse(tn))
+ return collapse(t, tn);
+
+ /* update parent in case inflate or halve failed */
+ tp = node_parent(tn);
/* Return if at least one deflate was run */
if (max_work != MAX_WORK)
- return;
+ return tp;
/* push the suffix length to the parent node */
if (tn->slen > tn->pos) {
unsigned char slen = update_suffix(tn);
- if (tp && (slen > tp->slen))
+ if (slen > tp->slen)
tp->slen = slen;
}
-}
-
-/* readside must use rcu_read_lock currently dump routines
- via get_fa_head and dump */
-
-static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
-{
- struct hlist_head *head = &l->list;
- struct leaf_info *li;
-
- hlist_for_each_entry_rcu(li, head, hlist)
- if (li->plen == plen)
- return li;
-
- return NULL;
-}
-
-static inline struct list_head *get_fa_head(struct tnode *l, int plen)
-{
- struct leaf_info *li = find_leaf_info(l, plen);
- if (!li)
- return NULL;
-
- return &li->falh;
+ return tp;
}
-static void leaf_pull_suffix(struct tnode *l)
+static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
{
- struct tnode *tp = node_parent(l);
-
- while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
+ while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
if (update_suffix(tp) > l->slen)
break;
tp = node_parent(tp);
}
}
-static void leaf_push_suffix(struct tnode *l)
+static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
{
- struct tnode *tn = node_parent(l);
-
/* if this is a new leaf then tn will be NULL and we can sort
* out parent suffix lengths as a part of trie_rebalance
*/
- while (tn && (tn->slen < l->slen)) {
+ while (tn->slen < l->slen) {
tn->slen = l->slen;
tn = node_parent(tn);
}
}
-static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
-{
- /* record the location of the previous list_info entry */
- struct hlist_node **pprev = old->hlist.pprev;
- struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
-
- /* remove the leaf info from the list */
- hlist_del_rcu(&old->hlist);
-
- /* only access li if it is pointing at the last valid hlist_node */
- if (hlist_empty(&l->list) || (*pprev))
- return;
-
- /* update the trie with the latest suffix length */
- l->slen = KEYLENGTH - li->plen;
- leaf_pull_suffix(l);
-}
-
-static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
+/* rcu_read_lock needs to be hold by caller from readside */
+static struct key_vector *fib_find_node(struct trie *t,
+ struct key_vector **tp, u32 key)
{
- struct hlist_head *head = &l->list;
- struct leaf_info *li = NULL, *last = NULL;
-
- if (hlist_empty(head)) {
- hlist_add_head_rcu(&new->hlist, head);
- } else {
- hlist_for_each_entry(li, head, hlist) {
- if (new->plen > li->plen)
- break;
-
- last = li;
- }
- if (last)
- hlist_add_behind_rcu(&new->hlist, &last->hlist);
- else
- hlist_add_before_rcu(&new->hlist, &li->hlist);
- }
+ struct key_vector *pn, *n = t->kv;
+ unsigned long index = 0;
- /* if we added to the tail node then we need to update slen */
- if (l->slen < (KEYLENGTH - new->plen)) {
- l->slen = KEYLENGTH - new->plen;
- leaf_push_suffix(l);
- }
-}
+ do {
+ pn = n;
+ n = get_child_rcu(n, index);
-/* rcu_read_lock needs to be hold by caller from readside */
-static struct tnode *fib_find_node(struct trie *t, u32 key)
-{
- struct tnode *n = rcu_dereference_rtnl(t->trie);
+ if (!n)
+ break;
- while (n) {
- unsigned long index = get_index(key, n);
+ index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the bits in the cindex. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
- * if (index & (~0ul << bits))
+ * if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
*/
- if (index & (~0ul << n->bits))
- return NULL;
-
- /* we have found a leaf. Prefixes have already been compared */
- if (IS_LEAF(n))
+ if (index >= (1ul << n->bits)) {
+ n = NULL;
break;
+ }
- n = tnode_get_child_rcu(n, index);
- }
+ /* keep searching until we find a perfect match leaf or NULL */
+ } while (IS_TNODE(n));
+
+ *tp = pn;
return n;
}
@@ -994,14 +949,23 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
/* Return the first fib alias matching TOS with
* priority less than or equal to PRIO.
*/
-static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
+ u8 tos, u32 prio, u32 tb_id)
{
struct fib_alias *fa;
if (!fah)
return NULL;
- list_for_each_entry(fa, fah, fa_list) {
+ hlist_for_each_entry(fa, fah, fa_list) {
+ if (fa->fa_slen < slen)
+ continue;
+ if (fa->fa_slen != slen)
+ break;
+ if (fa->tb_id > tb_id)
+ continue;
+ if (fa->tb_id != tb_id)
+ break;
if (fa->fa_tos > tos)
continue;
if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
@@ -1011,77 +975,23 @@ static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
return NULL;
}
-static void trie_rebalance(struct trie *t, struct tnode *tn)
+static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
- struct tnode *tp;
-
- while ((tp = node_parent(tn)) != NULL) {
- resize(t, tn);
- tn = tp;
- }
-
- /* Handle last (top) tnode */
- if (IS_TNODE(tn))
- resize(t, tn);
+ while (!IS_TRIE(tn))
+ tn = resize(t, tn);
}
-/* only used from updater-side */
-
-static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
+static int fib_insert_node(struct trie *t, struct key_vector *tp,
+ struct fib_alias *new, t_key key)
{
- struct list_head *fa_head = NULL;
- struct tnode *l, *n, *tp = NULL;
- struct leaf_info *li;
-
- li = leaf_info_new(plen);
- if (!li)
- return NULL;
- fa_head = &li->falh;
-
- n = rtnl_dereference(t->trie);
+ struct key_vector *n, *l;
- /* If we point to NULL, stop. Either the tree is empty and we should
- * just put a new leaf in if, or we have reached an empty child slot,
- * and we should just put our new leaf in that.
- *
- * If we hit a node with a key that does't match then we should stop
- * and create a new tnode to replace that node and insert ourselves
- * and the other node into the new tnode.
- */
- while (n) {
- unsigned long index = get_index(key, n);
-
- /* This bit of code is a bit tricky but it combines multiple
- * checks into a single check. The prefix consists of the
- * prefix plus zeros for the "bits" in the prefix. The index
- * is the difference between the key and this value. From
- * this we can actually derive several pieces of data.
- * if !(index >> bits)
- * we know the value is child index
- * else
- * we have a mismatch in skip bits and failed
- */
- if (index >> n->bits)
- break;
-
- /* we have found a leaf. Prefixes have already been compared */
- if (IS_LEAF(n)) {
- /* Case 1: n is a leaf, and prefixes match*/
- insert_leaf_info(n, li);
- return fa_head;
- }
-
- tp = n;
- n = tnode_get_child_rcu(n, index);
- }
-
- l = leaf_new(key);
- if (!l) {
- free_leaf_info(li);
- return NULL;
- }
+ l = leaf_new(key, new);
+ if (!l)
+ goto noleaf;
- insert_leaf_info(l, li);
+ /* retrieve child from parent node */
+ n = get_child(tp, get_index(key, tp));
/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
*
@@ -1090,21 +1000,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
* leaves us in position for handling as case 3
*/
if (n) {
- struct tnode *tn;
+ struct key_vector *tn;
tn = tnode_new(key, __fls(key ^ n->key), 1);
- if (!tn) {
- free_leaf_info(li);
- node_free(l);
- return NULL;
- }
+ if (!tn)
+ goto notnode;
/* initialize routes out of node */
NODE_INIT_PARENT(tn, tp);
put_child(tn, get_index(key, tn) ^ 1, n);
/* start adding routes into the node */
- put_child_root(tp, t, key, tn);
+ put_child_root(tp, key, tn);
node_set_parent(n, tn);
/* parent now has a NULL spot where the leaf can go */
@@ -1112,69 +1019,93 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
}
/* Case 3: n is NULL, and will just insert a new leaf */
- if (tp) {
- NODE_INIT_PARENT(l, tp);
- put_child(tp, get_index(key, tp), l);
- trie_rebalance(t, tp);
+ NODE_INIT_PARENT(l, tp);
+ put_child_root(tp, key, l);
+ trie_rebalance(t, tp);
+
+ return 0;
+notnode:
+ node_free(l);
+noleaf:
+ return -ENOMEM;
+}
+
+static int fib_insert_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *new,
+ struct fib_alias *fa, t_key key)
+{
+ if (!l)
+ return fib_insert_node(t, tp, new, key);
+
+ if (fa) {
+ hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
} else {
- rcu_assign_pointer(t->trie, l);
+ struct fib_alias *last;
+
+ hlist_for_each_entry(last, &l->leaf, fa_list) {
+ if (new->fa_slen < last->fa_slen)
+ break;
+ if ((new->fa_slen == last->fa_slen) &&
+ (new->tb_id > last->tb_id))
+ break;
+ fa = last;
+ }
+
+ if (fa)
+ hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
+ else
+ hlist_add_head_rcu(&new->fa_list, &l->leaf);
+ }
+
+ /* if we added to the tail node then we need to update slen */
+ if (l->slen < new->fa_slen) {
+ l->slen = new->fa_slen;
+ leaf_push_suffix(tp, l);
}
- return fa_head;
+ return 0;
}
-/*
- * Caller must hold RTNL.
- */
+/* Caller must hold RTNL. */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
- struct trie *t = (struct trie *) tb->tb_data;
+ struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
- struct list_head *fa_head = NULL;
+ struct key_vector *l, *tp;
struct fib_info *fi;
- int plen = cfg->fc_dst_len;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
u8 tos = cfg->fc_tos;
- u32 key, mask;
+ u32 key;
int err;
- struct tnode *l;
- if (plen > 32)
+ if (plen > KEYLENGTH)
return -EINVAL;
key = ntohl(cfg->fc_dst);
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
- mask = ntohl(inet_make_mask(plen));
-
- if (key & ~mask)
+ if ((plen < KEYLENGTH) && (key << plen))
return -EINVAL;
- key = key & mask;
-
fi = fib_create_info(cfg);
if (IS_ERR(fi)) {
err = PTR_ERR(fi);
goto err;
}
- l = fib_find_node(t, key);
- fa = NULL;
-
- if (l) {
- fa_head = get_fa_head(l, plen);
- fa = fib_find_alias(fa_head, tos, fi->fib_priority);
- }
+ l = fib_find_node(t, &tp, key);
+ fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
+ tb->tb_id) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
+ * insert to the tail of the section matching the suffix length
+ * of the new alias.
*/
if (fa && fa->fa_tos == tos &&
@@ -1192,9 +1123,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
*/
fa_match = NULL;
fa_first = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, fa_head, fa_list) {
- if (fa->fa_tos != tos)
+ hlist_for_each_entry_from(fa, fa_list) {
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
@@ -1226,8 +1158,21 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
new_fa->fa_state = state & ~FA_S_ACCESSED;
+ new_fa->fa_slen = fa->fa_slen;
+
+ err = netdev_switch_fib_ipv4_add(key, plen, fi,
+ new_fa->fa_tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ netdev_switch_fib_ipv4_abort(fi);
+ kmem_cache_free(fn_alias_kmem, new_fa);
+ goto out;
+ }
+
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
- list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
@@ -1261,30 +1206,35 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
- /*
- * Insert new entry to the list.
- */
-
- if (!fa_head) {
- fa_head = fib_insert_node(t, key, plen);
- if (unlikely(!fa_head)) {
- err = -ENOMEM;
- goto out_free_new_fa;
- }
+ new_fa->fa_slen = slen;
+ new_fa->tb_id = tb->tb_id;
+
+ /* (Optionally) offload fib entry to switch hardware. */
+ err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
+ cfg->fc_type,
+ cfg->fc_nlflags,
+ tb->tb_id);
+ if (err) {
+ netdev_switch_fib_ipv4_abort(fi);
+ goto out_free_new_fa;
}
+ /* Insert new entry to the list. */
+ err = fib_insert_alias(t, tp, l, new_fa, fa, key);
+ if (err)
+ goto out_sw_fib_del;
+
if (!plen)
tb->tb_num_default++;
- list_add_tail_rcu(&new_fa->fa_list,
- (fa ? &fa->fa_list : fa_head));
-
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, 0);
succeeded:
return 0;
+out_sw_fib_del:
+ netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1293,7 +1243,7 @@ err:
return err;
}
-static inline t_key prefix_mismatch(t_key key, struct tnode *n)
+static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
{
t_key prefix = n->key;
@@ -1304,16 +1254,20 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n)
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
- struct trie *t = (struct trie *)tb->tb_data;
+ struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats = t->stats;
#endif
const t_key key = ntohl(flp->daddr);
- struct tnode *n, *pn;
- struct leaf_info *li;
+ struct key_vector *n, *pn;
+ struct fib_alias *fa;
+ unsigned long index;
t_key cindex;
- n = rcu_dereference(t->trie);
+ pn = t->kv;
+ cindex = 0;
+
+ n = get_child_rcu(pn, cindex);
if (!n)
return -EAGAIN;
@@ -1321,24 +1275,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
this_cpu_inc(stats->gets);
#endif
- pn = n;
- cindex = 0;
-
/* Step 1: Travel to the longest prefix match in the trie */
for (;;) {
- unsigned long index = get_index(key, n);
+ index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the "bits" in the prefix. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
- * if (index & (~0ul << bits))
+ * if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
+ *
+ * This check is safe even if bits == KEYLENGTH due to the
+ * fact that we can only allocate a node with 32 bits if a
+ * long is greater than 32 bits.
*/
- if (index & (~0ul << n->bits))
+ if (index >= (1ul << n->bits))
break;
/* we have found a leaf. Prefixes have already been compared */
@@ -1353,7 +1308,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
cindex = index;
}
- n = tnode_get_child_rcu(n, index);
+ n = get_child_rcu(n, index);
if (unlikely(!n))
goto backtrace;
}
@@ -1361,7 +1316,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
/* Step 2: Sort out leaves and begin backtracing for longest prefix */
for (;;) {
/* record the pointer where our next node pointer is stored */
- struct tnode __rcu **cptr = n->child;
+ struct key_vector __rcu **cptr = n->tnode;
/* This test verifies that none of the bits that differ
* between the key and the prefix exist in the region of
@@ -1393,13 +1348,17 @@ backtrace:
while (!cindex) {
t_key pkey = pn->key;
- pn = node_parent_rcu(pn);
- if (unlikely(!pn))
+ /* If we don't have a parent then there is
+ * nothing for us to do as we do not have any
+ * further nodes to parse.
+ */
+ if (IS_TRIE(pn))
return -EAGAIN;
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->backtrack);
#endif
/* Get Child's index */
+ pn = node_parent_rcu(pn);
cindex = get_index(pkey, pn);
}
@@ -1407,138 +1366,134 @@ backtrace:
cindex &= cindex - 1;
/* grab pointer for next child node */
- cptr = &pn->child[cindex];
+ cptr = &pn->tnode[cindex];
}
}
found:
+ /* this line carries forward the xor from earlier in the function */
+ index = key ^ n->key;
+
/* Step 3: Process the leaf, if that fails fall back to backtracing */
- hlist_for_each_entry_rcu(li, &n->list, hlist) {
- struct fib_alias *fa;
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
- if ((key ^ n->key) & li->mask_plen)
+ if ((index >= (1ul << fa->fa_slen)) &&
+ ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
continue;
-
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- struct fib_info *fi = fa->fa_info;
- int nhsel, err;
-
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
- continue;
- if (fi->fib_dead)
- continue;
- if (fa->fa_info->fib_scope < flp->flowi4_scope)
- continue;
- fib_alias_accessed(fa);
- err = fib_props[fa->fa_type].error;
- if (unlikely(err < 0)) {
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (unlikely(err < 0)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_passed);
+ this_cpu_inc(stats->semantic_match_passed);
#endif
- return err;
- }
- if (fi->fib_flags & RTNH_F_DEAD)
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
continue;
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
- const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
- continue;
-
- if (!(fib_flags & FIB_LOOKUP_NOREF))
- atomic_inc(&fi->fib_clntref);
-
- res->prefixlen = li->plen;
- res->nh_sel = nhsel;
- res->type = fa->fa_type;
- res->scope = fi->fib_scope;
- res->fi = fi;
- res->table = tb;
- res->fa_head = &li->falh;
+
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
+
+ res->prefixlen = KEYLENGTH - fa->fa_slen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fi->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_passed);
+ this_cpu_inc(stats->semantic_match_passed);
#endif
- return err;
- }
+ return err;
}
-
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->semantic_match_miss);
+ this_cpu_inc(stats->semantic_match_miss);
#endif
- }
goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
-/*
- * Remove the leaf and return parent.
- */
-static void trie_leaf_remove(struct trie *t, struct tnode *l)
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old)
{
- struct tnode *tp = node_parent(l);
+ /* record the location of the previous list_info entry */
+ struct hlist_node **pprev = old->fa_list.pprev;
+ struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
- pr_debug("entering trie_leaf_remove(%p)\n", l);
+ /* remove the fib_alias from the list */
+ hlist_del_rcu(&old->fa_list);
- if (tp) {
- put_child(tp, get_index(l->key, tp), NULL);
+ /* if we emptied the list this leaf will be freed and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ if (hlist_empty(&l->leaf)) {
+ put_child_root(tp, l->key, NULL);
+ node_free(l);
trie_rebalance(t, tp);
- } else {
- RCU_INIT_POINTER(t->trie, NULL);
+ return;
}
- node_free(l);
+ /* only access fa if it is pointing at the last valid hlist_node */
+ if (*pprev)
+ return;
+
+ /* update the trie with the latest suffix length */
+ l->slen = fa->fa_slen;
+ leaf_pull_suffix(tp, l);
}
-/*
- * Caller must hold RTNL.
- */
+/* Caller must hold RTNL. */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
- u32 key, mask;
- int plen = cfg->fc_dst_len;
- u8 tos = cfg->fc_tos;
struct fib_alias *fa, *fa_to_delete;
- struct list_head *fa_head;
- struct tnode *l;
- struct leaf_info *li;
+ struct key_vector *l, *tp;
+ u8 plen = cfg->fc_dst_len;
+ u8 slen = KEYLENGTH - plen;
+ u8 tos = cfg->fc_tos;
+ u32 key;
- if (plen > 32)
+ if (plen > KEYLENGTH)
return -EINVAL;
key = ntohl(cfg->fc_dst);
- mask = ntohl(inet_make_mask(plen));
- if (key & ~mask)
+ if ((plen < KEYLENGTH) && (key << plen))
return -EINVAL;
- key = key & mask;
- l = fib_find_node(t, key);
-
+ l = fib_find_node(t, &tp, key);
if (!l)
return -ESRCH;
- li = find_leaf_info(l, plen);
-
- if (!li)
- return -ESRCH;
-
- fa_head = &li->falh;
- fa = fib_find_alias(fa_head, tos, 0);
-
+ fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
if (!fa)
return -ESRCH;
pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, fa_head, fa_list) {
+ hlist_for_each_entry_from(fa, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (fa->fa_tos != tos)
+ if ((fa->fa_slen != slen) ||
+ (fa->tb_id != tb->tb_id) ||
+ (fa->fa_tos != tos))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -1557,240 +1512,397 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
- &cfg->fc_nlinfo, 0);
+ netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
+ cfg->fc_type, tb->tb_id);
- list_del_rcu(&fa->fa_list);
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
+ &cfg->fc_nlinfo, 0);
if (!plen)
tb->tb_num_default--;
- if (list_empty(fa_head)) {
- remove_leaf_info(l, li);
- free_leaf_info(li);
- }
+ fib_remove_alias(t, tp, l, fa_to_delete);
- if (hlist_empty(&l->list))
- trie_leaf_remove(t, l);
-
- if (fa->fa_state & FA_S_ACCESSED)
+ if (fa_to_delete->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
- fib_release_info(fa->fa_info);
- alias_free_mem_rcu(fa);
+ fib_release_info(fa_to_delete->fa_info);
+ alias_free_mem_rcu(fa_to_delete);
return 0;
}
-static int trie_flush_list(struct list_head *head)
+/* Scan for the next leaf starting at the provided key value */
+static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
{
- struct fib_alias *fa, *fa_node;
- int found = 0;
+ struct key_vector *pn, *n = *tn;
+ unsigned long cindex;
- list_for_each_entry_safe(fa, fa_node, head, fa_list) {
- struct fib_info *fi = fa->fa_info;
+ /* this loop is meant to try and find the key in the trie */
+ do {
+ /* record parent and next child index */
+ pn = n;
+ cindex = key ? get_index(key, pn) : 0;
- if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
- list_del_rcu(&fa->fa_list);
- fib_release_info(fa->fa_info);
- alias_free_mem_rcu(fa);
- found++;
+ if (cindex >> pn->bits)
+ break;
+
+ /* descend into the next child */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ break;
+
+ /* guarantee forward progress on the keys */
+ if (IS_LEAF(n) && (n->key >= key))
+ goto found;
+ } while (IS_TNODE(n));
+
+ /* this loop will search for the next leaf with a greater key */
+ while (!IS_TRIE(pn)) {
+ /* if we exhausted the parent node we will need to climb */
+ if (cindex >= (1ul << pn->bits)) {
+ t_key pkey = pn->key;
+
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
+ continue;
}
+
+ /* grab the next available node */
+ n = get_child_rcu(pn, cindex++);
+ if (!n)
+ continue;
+
+ /* no need to compare keys since we bumped the index */
+ if (IS_LEAF(n))
+ goto found;
+
+ /* Rescan start scanning in new node */
+ pn = n;
+ cindex = 0;
}
- return found;
+
+ *tn = pn;
+ return NULL; /* Root of trie */
+found:
+ /* if we are at the limit for keys just return NULL for the tnode */
+ *tn = pn;
+ return n;
}
-static int trie_flush_leaf(struct tnode *l)
+static void fib_trie_free(struct fib_table *tb)
{
- int found = 0;
- struct hlist_head *lih = &l->list;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
struct hlist_node *tmp;
- struct leaf_info *li = NULL;
- unsigned char plen = KEYLENGTH;
+ struct fib_alias *fa;
+
+ /* walk trie in reverse order and free everything */
+ for (;;) {
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ n = pn;
+ pn = node_parent(pn);
- hlist_for_each_entry_safe(li, tmp, lih, hlist) {
- found += trie_flush_list(&li->falh);
+ /* drop emptied tnode */
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+
+ cindex = get_index(pkey, pn);
- if (list_empty(&li->falh)) {
- hlist_del_rcu(&li->hlist);
- free_leaf_info(li);
continue;
}
- plen = li->plen;
- }
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
- l->slen = KEYLENGTH - plen;
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
- return found;
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ }
+
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ free_percpu(t->stats);
+#endif
+ kfree(tb);
}
-/*
- * Scan for the next right leaf starting at node p->child[idx]
- * Since we have back pointer, no recursion necessary.
- */
-static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
+struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
{
- do {
- unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
+ struct trie *ot = (struct trie *)oldtb->tb_data;
+ struct key_vector *l, *tp = ot->kv;
+ struct fib_table *local_tb;
+ struct fib_alias *fa;
+ struct trie *lt;
+ t_key key = 0;
- while (idx < tnode_child_length(p)) {
- c = tnode_get_child_rcu(p, idx++);
- if (!c)
+ if (oldtb->tb_data == oldtb->__data)
+ return oldtb;
+
+ local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
+ if (!local_tb)
+ return NULL;
+
+ lt = (struct trie *)local_tb->tb_data;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ struct key_vector *local_l = NULL, *local_tp;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_alias *new_fa;
+
+ if (local_tb->tb_id != fa->tb_id)
continue;
- if (IS_LEAF(c))
- return c;
+ /* clone fa for new local table */
+ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ if (!new_fa)
+ goto out;
+
+ memcpy(new_fa, fa, sizeof(*fa));
- /* Rescan start scanning in new node */
- p = c;
- idx = 0;
+ /* insert clone into table */
+ if (!local_l)
+ local_l = fib_find_node(lt, &local_tp, l->key);
+
+ if (fib_insert_alias(lt, local_tp, local_l, new_fa,
+ NULL, l->key))
+ goto out;
}
- /* Node empty, walk back up to parent */
- c = p;
- } while ((p = node_parent_rcu(c)) != NULL);
+ /* stop loop if key wrapped back to 0 */
+ key = l->key + 1;
+ if (key < l->key)
+ break;
+ }
- return NULL; /* Root of trie */
+ return local_tb;
+out:
+ fib_trie_free(local_tb);
+
+ return NULL;
}
-static struct tnode *trie_firstleaf(struct trie *t)
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
{
- struct tnode *n = rcu_dereference_rtnl(t->trie);
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
- if (!n)
- return NULL;
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
- if (IS_LEAF(n)) /* trie is just a leaf */
- return n;
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
- return leaf_walk_rcu(n, NULL);
-}
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
-static struct tnode *trie_nextleaf(struct tnode *l)
-{
- struct tnode *p = node_parent_rcu(l);
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
- if (!p)
- return NULL; /* trie with just one leaf */
+ continue;
+ }
- return leaf_walk_rcu(p, l);
-}
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
-static struct tnode *trie_leafindex(struct trie *t, int index)
-{
- struct tnode *l = trie_firstleaf(t);
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
- while (l && index-- > 0)
- l = trie_nextleaf(l);
+ continue;
+ }
- return l;
-}
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ /* if alias was cloned to local then we just
+ * need to remove the local copy from main
+ */
+ if (tb->tb_id != fa->tb_id) {
+ hlist_del_rcu(&fa->fa_list);
+ alias_free_mem_rcu(fa);
+ continue;
+ }
+ /* record local slen */
+ slen = fa->fa_slen;
-/*
- * Caller must hold RTNL.
- */
+ if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL))
+ continue;
+
+ netdev_switch_fib_ipv4_del(n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos,
+ fa->fa_type, tb->tb_id);
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ } else {
+ leaf_pull_suffix(pn, n);
+ }
+ }
+}
+
+/* Caller must hold RTNL. */
int fib_table_flush(struct fib_table *tb)
{
- struct trie *t = (struct trie *) tb->tb_data;
- struct tnode *l, *ll = NULL;
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct hlist_node *tmp;
+ struct fib_alias *fa;
int found = 0;
- for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
- found += trie_flush_leaf(l);
+ /* walk trie in reverse order */
+ for (;;) {
+ unsigned char slen = 0;
+ struct key_vector *n;
+
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
- if (ll) {
- if (hlist_empty(&ll->list))
- trie_leaf_remove(t, ll);
- else
- leaf_pull_suffix(ll);
+ /* cannot resize the trie vector */
+ if (IS_TRIE(pn))
+ break;
+
+ /* resize completed node */
+ pn = resize(t, pn);
+ cindex = get_index(pkey, pn);
+
+ continue;
}
- ll = l;
- }
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
- if (ll) {
- if (hlist_empty(&ll->list))
- trie_leaf_remove(t, ll);
- else
- leaf_pull_suffix(ll);
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ netdev_switch_fib_ipv4_del(n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos,
+ fa->fa_type, tb->tb_id);
+ hlist_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ found++;
+ }
+
+ /* update leaf slen */
+ n->slen = slen;
+
+ if (hlist_empty(&n->leaf)) {
+ put_child_root(pn, n->key, NULL);
+ node_free(n);
+ } else {
+ leaf_pull_suffix(pn, n);
+ }
}
pr_debug("trie_flush found=%d\n", found);
return found;
}
-void fib_free_table(struct fib_table *tb)
+static void __trie_free_rcu(struct rcu_head *head)
{
+ struct fib_table *tb = container_of(head, struct fib_table, rcu);
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie *t = (struct trie *)tb->tb_data;
- free_percpu(t->stats);
+ if (tb->tb_data == tb->__data)
+ free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
kfree(tb);
}
-static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
- struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
+void fib_free_table(struct fib_table *tb)
{
- int i, s_i;
+ call_rcu(&tb->rcu, __trie_free_rcu);
+}
+
+static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ __be32 xkey = htonl(l->key);
struct fib_alias *fa;
- __be32 xkey = htonl(key);
+ int i, s_i;
- s_i = cb->args[5];
+ s_i = cb->args[4];
i = 0;
/* rcu_read_lock is hold by caller */
-
- list_for_each_entry_rcu(fa, fah, fa_list) {
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
if (i < s_i) {
i++;
continue;
}
+ if (tb->tb_id != fa->tb_id) {
+ i++;
+ continue;
+ }
+
if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWROUTE,
tb->tb_id,
fa->fa_type,
xkey,
- plen,
+ KEYLENGTH - fa->fa_slen,
fa->fa_tos,
fa->fa_info, NLM_F_MULTI) < 0) {
- cb->args[5] = i;
- return -1;
- }
- i++;
- }
- cb->args[5] = i;
- return skb->len;
-}
-
-static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct leaf_info *li;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
-
- /* rcu_read_lock is hold by caller */
- hlist_for_each_entry_rcu(li, &l->list, hlist) {
- if (i < s_i) {
- i++;
- continue;
- }
-
- if (i > s_i)
- cb->args[5] = 0;
-
- if (list_empty(&li->falh))
- continue;
-
- if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
cb->args[4] = i;
return -1;
}
@@ -1801,44 +1913,38 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
return skb->len;
}
+/* rcu_read_lock needs to be hold by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct tnode *l;
- struct trie *t = (struct trie *) tb->tb_data;
- t_key key = cb->args[2];
- int count = cb->args[3];
-
- rcu_read_lock();
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
/* Dump starting at last key.
* Note: 0.0.0.0/0 (ie default) is first key.
*/
- if (count == 0)
- l = trie_firstleaf(t);
- else {
- /* Normally, continue from last key, but if that is missing
- * fallback to using slow rescan
- */
- l = fib_find_node(t, key);
- if (!l)
- l = trie_leafindex(t, count);
- }
+ int count = cb->args[2];
+ t_key key = cb->args[3];
- while (l) {
- cb->args[2] = l->key;
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
- cb->args[3] = count;
- rcu_read_unlock();
+ cb->args[3] = key;
+ cb->args[2] = count;
return -1;
}
++count;
- l = trie_nextleaf(l);
+ key = l->key + 1;
+
memset(&cb->args[4], 0,
sizeof(cb->args) - 4*sizeof(cb->args[0]));
+
+ /* stop loop if key wrapped back to 0 */
+ if (key < l->key)
+ break;
}
- cb->args[3] = count;
- rcu_read_unlock();
+
+ cb->args[3] = key;
+ cb->args[2] = count;
return skb->len;
}
@@ -1850,28 +1956,34 @@ void __init fib_trie_init(void)
0, SLAB_PANIC, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
- max(sizeof(struct tnode),
- sizeof(struct leaf_info)),
+ LEAF_SIZE,
0, SLAB_PANIC, NULL);
}
-
-struct fib_table *fib_trie_table(u32 id)
+struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
struct fib_table *tb;
struct trie *t;
+ size_t sz = sizeof(*tb);
+
+ if (!alias)
+ sz += sizeof(struct trie);
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
- GFP_KERNEL);
+ tb = kzalloc(sz, GFP_KERNEL);
if (tb == NULL)
return NULL;
tb->tb_id = id;
tb->tb_default = -1;
tb->tb_num_default = 0;
+ tb->tb_data = (alias ? alias->__data : tb->__data);
+
+ if (alias)
+ return tb;
t = (struct trie *) tb->tb_data;
- RCU_INIT_POINTER(t->trie, NULL);
+ t->kv[0].pos = KEYLENGTH;
+ t->kv[0].slen = KEYLENGTH;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats = alloc_percpu(struct trie_use_stats);
if (!t->stats) {
@@ -1888,65 +2000,63 @@ struct fib_table *fib_trie_table(u32 id)
struct fib_trie_iter {
struct seq_net_private p;
struct fib_table *tb;
- struct tnode *tnode;
+ struct key_vector *tnode;
unsigned int index;
unsigned int depth;
};
-static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
{
unsigned long cindex = iter->index;
- struct tnode *tn = iter->tnode;
- struct tnode *p;
-
- /* A single entry routing table */
- if (!tn)
- return NULL;
+ struct key_vector *pn = iter->tnode;
+ t_key pkey;
pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
iter->tnode, iter->index, iter->depth);
-rescan:
- while (cindex < tnode_child_length(tn)) {
- struct tnode *n = tnode_get_child_rcu(tn, cindex);
- if (n) {
+ while (!IS_TRIE(pn)) {
+ while (cindex < child_length(pn)) {
+ struct key_vector *n = get_child_rcu(pn, cindex++);
+
+ if (!n)
+ continue;
+
if (IS_LEAF(n)) {
- iter->tnode = tn;
- iter->index = cindex + 1;
+ iter->tnode = pn;
+ iter->index = cindex;
} else {
/* push down one level */
iter->tnode = n;
iter->index = 0;
++iter->depth;
}
+
return n;
}
- ++cindex;
- }
-
- /* Current node exhausted, pop back up */
- p = node_parent_rcu(tn);
- if (p) {
- cindex = get_index(tn->key, p) + 1;
- tn = p;
+ /* Current node exhausted, pop back up */
+ pkey = pn->key;
+ pn = node_parent_rcu(pn);
+ cindex = get_index(pkey, pn) + 1;
--iter->depth;
- goto rescan;
}
- /* got root? */
+ /* record root node so further searches know we are done */
+ iter->tnode = pn;
+ iter->index = 0;
+
return NULL;
}
-static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
- struct trie *t)
+static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
+ struct trie *t)
{
- struct tnode *n;
+ struct key_vector *n, *pn = t->kv;
if (!t)
return NULL;
- n = rcu_dereference(t->trie);
+ n = rcu_dereference(pn->tnode[0]);
if (!n)
return NULL;
@@ -1955,7 +2065,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
iter->index = 0;
iter->depth = 1;
} else {
- iter->tnode = NULL;
+ iter->tnode = pn;
iter->index = 0;
iter->depth = 0;
}
@@ -1965,7 +2075,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct tnode *n;
+ struct key_vector *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -1973,20 +2083,20 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
rcu_read_lock();
for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
- struct leaf_info *li;
+ struct fib_alias *fa;
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- hlist_for_each_entry_rcu(li, &n->list, hlist)
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
++s->prefixes;
} else {
s->tnodes++;
if (n->bits < MAX_STAT_DEPTH)
s->nodesizes[n->bits]++;
- s->nullpointers += n->empty_children;
+ s->nullpointers += tn_info(n)->empty_children;
}
}
rcu_read_unlock();
@@ -2009,13 +2119,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
- bytes = sizeof(struct tnode) * stat->leaves;
+ bytes = LEAF_SIZE * stat->leaves;
seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
- bytes += sizeof(struct leaf_info) * stat->prefixes;
+ bytes += sizeof(struct fib_alias) * stat->prefixes;
seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
- bytes += sizeof(struct tnode) * stat->tnodes;
+ bytes += TNODE_SIZE(0) * stat->tnodes;
max = MAX_STAT_DEPTH;
while (max > 0 && stat->nodesizes[max-1] == 0)
@@ -2030,7 +2140,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct tnode *) * pointers;
+ bytes += sizeof(struct key_vector *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2084,7 +2194,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"Basic info: size of leaf:"
" %Zd bytes, size of tnode: %Zd bytes.\n",
- sizeof(struct tnode), sizeof(struct tnode));
+ LEAF_SIZE, TNODE_SIZE(0));
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2123,7 +2233,7 @@ static const struct file_operations fib_triestat_fops = {
.release = single_release_net,
};
-static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2135,7 +2245,7 @@ static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
- struct tnode *n;
+ struct key_vector *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2164,7 +2274,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct tnode *n;
+ struct key_vector *n;
++*pos;
/* next node in same table */
@@ -2250,9 +2360,9 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct tnode *n = v;
+ struct key_vector *n = v;
- if (!node_parent_rcu(n))
+ if (IS_TRIE(node_parent_rcu(n)))
fib_table_print(seq, iter->tb);
if (IS_TNODE(n)) {
@@ -2261,30 +2371,28 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
seq_indent(seq, iter->depth-1);
seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
&prf, KEYLENGTH - n->pos - n->bits, n->bits,
- n->full_children, n->empty_children);
+ tn_info(n)->full_children,
+ tn_info(n)->empty_children);
} else {
- struct leaf_info *li;
__be32 val = htonl(n->key);
+ struct fib_alias *fa;
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
- hlist_for_each_entry_rcu(li, &n->list, hlist) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- char buf1[32], buf2[32];
-
- seq_indent(seq, iter->depth+1);
- seq_printf(seq, " /%d %s %s", li->plen,
- rtn_scope(buf1, sizeof(buf1),
- fa->fa_info->fib_scope),
- rtn_type(buf2, sizeof(buf2),
- fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, " tos=%d", fa->fa_tos);
- seq_putc(seq, '\n');
- }
+ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
+ char buf1[32], buf2[32];
+
+ seq_indent(seq, iter->depth + 1);
+ seq_printf(seq, " /%zu %s %s",
+ KEYLENGTH - fa->fa_slen,
+ rtn_scope(buf1, sizeof(buf1),
+ fa->fa_info->fib_scope),
+ rtn_type(buf2, sizeof(buf2),
+ fa->fa_type));
+ if (fa->fa_tos)
+ seq_printf(seq, " tos=%d", fa->fa_tos);
+ seq_putc(seq, '\n');
}
}
@@ -2314,31 +2422,47 @@ static const struct file_operations fib_trie_fops = {
struct fib_route_iter {
struct seq_net_private p;
- struct trie *main_trie;
+ struct fib_table *main_tb;
+ struct key_vector *tnode;
loff_t pos;
t_key key;
};
-static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
+static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
+ loff_t pos)
{
- struct tnode *l = NULL;
- struct trie *t = iter->main_trie;
+ struct fib_table *tb = iter->main_tb;
+ struct key_vector *l, **tp = &iter->tnode;
+ struct trie *t;
+ t_key key;
- /* use cache location of last found key */
- if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
+ /* use cache location of next-to-find key */
+ if (iter->pos > 0 && pos >= iter->pos) {
pos -= iter->pos;
- else {
+ key = iter->key;
+ } else {
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
iter->pos = 0;
- l = trie_firstleaf(t);
+ key = 0;
}
- while (l && pos-- > 0) {
+ while ((l = leaf_walk_rcu(tp, key)) != NULL) {
+ key = l->key + 1;
iter->pos++;
- l = trie_nextleaf(l);
+
+ if (pos-- <= 0)
+ break;
+
+ l = NULL;
+
+ /* handle unlikely case of a key wrap */
+ if (!key)
+ break;
}
if (l)
- iter->key = pos; /* remember it */
+ iter->key = key; /* remember it */
else
iter->pos = 0; /* forget it */
@@ -2350,37 +2474,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
struct fib_table *tb;
+ struct trie *t;
rcu_read_lock();
+
tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
if (!tb)
return NULL;
- iter->main_trie = (struct trie *) tb->tb_data;
- if (*pos == 0)
- return SEQ_START_TOKEN;
- else
- return fib_route_get_idx(iter, *pos - 1);
+ iter->main_tb = tb;
+
+ if (*pos != 0)
+ return fib_route_get_idx(iter, *pos);
+
+ t = (struct trie *)tb->tb_data;
+ iter->tnode = t->kv;
+ iter->pos = 0;
+ iter->key = 0;
+
+ return SEQ_START_TOKEN;
}
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
- struct tnode *l = v;
+ struct key_vector *l = NULL;
+ t_key key = iter->key;
++*pos;
- if (v == SEQ_START_TOKEN) {
- iter->pos = 0;
- l = trie_firstleaf(iter->main_trie);
- } else {
+
+ /* only allow key of 0 for start of sequence */
+ if ((v == SEQ_START_TOKEN) || key)
+ l = leaf_walk_rcu(&iter->tnode, key);
+
+ if (l) {
+ iter->key = l->key + 1;
iter->pos++;
- l = trie_nextleaf(l);
+ } else {
+ iter->pos = 0;
}
- if (l)
- iter->key = l->key;
- else
- iter->pos = 0;
return l;
}
@@ -2412,8 +2545,11 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
- struct tnode *l = v;
- struct leaf_info *li;
+ struct fib_route_iter *iter = seq->private;
+ struct fib_table *tb = iter->main_tb;
+ struct fib_alias *fa;
+ struct key_vector *l = v;
+ __be32 prefix;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
@@ -2422,45 +2558,43 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
return 0;
}
- hlist_for_each_entry_rcu(li, &l->list, hlist) {
- struct fib_alias *fa;
- __be32 mask, prefix;
+ prefix = htonl(l->key);
- mask = inet_make_mask(li->plen);
- prefix = htonl(l->key);
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ const struct fib_info *fi = fa->fa_info;
+ __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- const struct fib_info *fi = fa->fa_info;
- unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+ if ((fa->fa_type == RTN_BROADCAST) ||
+ (fa->fa_type == RTN_MULTICAST))
+ continue;
- if (fa->fa_type == RTN_BROADCAST
- || fa->fa_type == RTN_MULTICAST)
- continue;
+ if (fa->tb_id != tb->tb_id)
+ continue;
- seq_setwidth(seq, 127);
-
- if (fi)
- seq_printf(seq,
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*",
- prefix,
- fi->fib_nh->nh_gw, flags, 0, 0,
- fi->fib_priority,
- mask,
- (fi->fib_advmss ?
- fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3);
- else
- seq_printf(seq,
- "*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
- prefix, 0, flags, 0, 0, 0,
- mask, 0, 0, 0);
-
- seq_pad(seq, '\n');
- }
+ seq_setwidth(seq, 127);
+
+ if (fi)
+ seq_printf(seq,
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*",
+ prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ seq_printf(seq,
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
+
+ seq_pad(seq, '\n');
}
return 0;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 666cf36..ad3f866 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -97,6 +97,7 @@
#include <net/route.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/inet_common.h>
#include <linux/netfilter_ipv4.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
@@ -1849,30 +1850,28 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
pmc->sfcount[MCAST_EXCLUDE] = 1;
}
-
-/*
- * Join a multicast group
+/* Join a multicast group
*/
-int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
{
- int err;
__be32 addr = imr->imr_multiaddr.s_addr;
- struct ip_mc_socklist *iml = NULL, *i;
+ struct ip_mc_socklist *iml, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
int ifindex;
int count = 0;
+ int err;
+
+ ASSERT_RTNL();
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
in_dev = ip_mc_find_dev(net, imr);
if (!in_dev) {
- iml = NULL;
err = -ENODEV;
goto done;
}
@@ -1900,7 +1899,6 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
ip_mc_inc_group(in_dev, addr);
err = 0;
done:
- rtnl_unlock();
return err;
}
EXPORT_SYMBOL(ip_mc_join_group);
@@ -1925,10 +1923,6 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
return err;
}
-/*
- * Ask a socket to leave a group.
- */
-
int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
@@ -1940,7 +1934,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
u32 ifindex;
int ret = -EADDRNOTAVAIL;
- rtnl_lock();
+ ASSERT_RTNL();
+
in_dev = ip_mc_find_dev(net, imr);
if (!in_dev) {
ret = -ENODEV;
@@ -1964,14 +1959,13 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
*imlp = iml->next_rcu;
ip_mc_dec_group(in_dev, group);
- rtnl_unlock();
+
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
kfree_rcu(iml, rcu);
return 0;
}
out:
- rtnl_unlock();
return ret;
}
EXPORT_SYMBOL(ip_mc_leave_group);
@@ -1993,7 +1987,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
+ ASSERT_RTNL();
imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
imr.imr_address.s_addr = mreqs->imr_interface;
@@ -2107,9 +2101,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
done:
- rtnl_unlock();
if (leavegroup)
- return ip_mc_leave_group(sk, &imr);
+ err = ip_mc_leave_group(sk, &imr);
return err;
}
@@ -2131,7 +2124,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
msf->imsf_fmode != MCAST_EXCLUDE)
return -EINVAL;
- rtnl_lock();
+ ASSERT_RTNL();
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
@@ -2193,7 +2186,6 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
- rtnl_unlock();
if (leavegroup)
err = ip_mc_leave_group(sk, &imr);
return err;
@@ -2724,6 +2716,7 @@ static const struct file_operations igmp_mcf_seq_fops = {
static int __net_init igmp_net_init(struct net *net)
{
struct proc_dir_entry *pde;
+ int err;
pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
if (!pde)
@@ -2732,8 +2725,18 @@ static int __net_init igmp_net_init(struct net *net)
&igmp_mcf_seq_fops);
if (!pde)
goto out_mcfilter;
+ err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
+ SOCK_DGRAM, 0, net);
+ if (err < 0) {
+ pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
+ err);
+ goto out_sock;
+ }
+
return 0;
+out_sock:
+ remove_proc_entry("mcfilter", net->proc_net);
out_mcfilter:
remove_proc_entry("igmp", net->proc_net);
out_igmp:
@@ -2744,6 +2747,7 @@ static void __net_exit igmp_net_exit(struct net *net)
{
remove_proc_entry("mcfilter", net->proc_net);
remove_proc_entry("igmp", net->proc_net);
+ inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
}
static struct pernet_operations igmp_net_ops = {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3e44b9b..844808d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -23,6 +23,7 @@
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
+#include <net/tcp.h>
#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -294,8 +295,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- struct sock *newsk;
struct request_sock *req;
+ struct sock *newsk;
int error;
lock_sock(sk);
@@ -324,9 +325,11 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
newsk = req->sk;
sk_acceptq_removed(sk);
- if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ tcp_rsk(req)->tfo_listener &&
+ queue->fastopenq) {
spin_lock_bh(&queue->fastopenq->lock);
- if (tcp_rsk(req)->listener) {
+ if (tcp_rsk(req)->tfo_listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
* NULL to signify that the child socket is taken
@@ -341,7 +344,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
out:
release_sock(sk);
if (req)
- __reqsk_free(req);
+ reqsk_put(req);
return newsk;
out_err:
newsk = NULL;
@@ -475,33 +478,37 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
-#define AF_INET_FAMILY(fam) 1
+#define AF_INET_FAMILY(fam) true
#endif
-struct request_sock *inet_csk_search_req(const struct sock *sk,
- struct request_sock ***prevp,
- const __be16 rport, const __be32 raddr,
+/* Note: this is temporary :
+ * req sock will no longer be in listener hash table
+*/
+struct request_sock *inet_csk_search_req(struct sock *sk,
+ const __be16 rport,
+ const __be32 raddr,
const __be32 laddr)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
- struct request_sock *req, **prev;
+ struct request_sock *req;
+ u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
+ lopt->nr_table_entries);
- for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
- lopt->nr_table_entries)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
+ write_lock(&icsk->icsk_accept_queue.syn_wait_lock);
+ for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
if (ireq->ir_rmt_port == rport &&
ireq->ir_rmt_addr == raddr &&
ireq->ir_loc_addr == laddr &&
AF_INET_FAMILY(req->rsk_ops->family)) {
+ atomic_inc(&req->rsk_refcnt);
WARN_ON(req->sk);
- *prevp = prev;
break;
}
}
+ write_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
return req;
}
@@ -557,23 +564,23 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
-void inet_csk_reqsk_queue_prune(struct sock *parent,
- const unsigned long interval,
- const unsigned long timeout,
- const unsigned long max_rto)
+static void reqsk_timer_handler(unsigned long data)
{
- struct inet_connection_sock *icsk = inet_csk(parent);
+ struct request_sock *req = (struct request_sock *)data;
+ struct sock *sk_listener = req->rsk_listener;
+ struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct listen_sock *lopt = queue->listen_opt;
- int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
- int thresh = max_retries;
- unsigned long now = jiffies;
- struct request_sock **reqp, *req;
- int i, budget;
+ int expire = 0, resend = 0;
+ int max_retries, thresh;
- if (lopt == NULL || lopt->qlen == 0)
+ if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
+ reqsk_put(req);
return;
+ }
+ max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ thresh = max_retries;
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
* If synack was not acknowledged for 1 second, it means
@@ -591,67 +598,63 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
- if (lopt->qlen>>(lopt->max_qlen_log-1)) {
- int young = (lopt->qlen_young<<1);
+ if (listen_sock_qlen(lopt) >> (lopt->max_qlen_log - 1)) {
+ int young = listen_sock_young(lopt) << 1;
while (thresh > 2) {
- if (lopt->qlen < young)
+ if (listen_sock_qlen(lopt) < young)
break;
thresh--;
young <<= 1;
}
}
-
if (queue->rskq_defer_accept)
max_retries = queue->rskq_defer_accept;
+ syn_ack_recalc(req, thresh, max_retries, queue->rskq_defer_accept,
+ &expire, &resend);
+ req->rsk_ops->syn_ack_timeout(sk_listener, req);
+ if (!expire &&
+ (!resend ||
+ !inet_rtx_syn_ack(sk_listener, req) ||
+ inet_rsk(req)->acked)) {
+ unsigned long timeo;
+
+ if (req->num_timeout++ == 0)
+ atomic_inc(&lopt->young_dec);
+ timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
+ return;
+ }
+ inet_csk_reqsk_queue_drop(sk_listener, req);
+ reqsk_put(req);
+}
- budget = 2 * (lopt->nr_table_entries / (timeout / interval));
- i = lopt->clock_hand;
-
- do {
- reqp=&lopt->syn_table[i];
- while ((req = *reqp) != NULL) {
- if (time_after_eq(now, req->expires)) {
- int expire = 0, resend = 0;
-
- syn_ack_recalc(req, thresh, max_retries,
- queue->rskq_defer_accept,
- &expire, &resend);
- req->rsk_ops->syn_ack_timeout(parent, req);
- if (!expire &&
- (!resend ||
- !inet_rtx_syn_ack(parent, req) ||
- inet_rsk(req)->acked)) {
- unsigned long timeo;
-
- if (req->num_timeout++ == 0)
- lopt->qlen_young--;
- timeo = min(timeout << req->num_timeout,
- max_rto);
- req->expires = now + timeo;
- reqp = &req->dl_next;
- continue;
- }
-
- /* Drop this request */
- inet_csk_reqsk_queue_unlink(parent, req, reqp);
- reqsk_queue_removed(queue, req);
- reqsk_free(req);
- continue;
- }
- reqp = &req->dl_next;
- }
+void reqsk_queue_hash_req(struct request_sock_queue *queue,
+ u32 hash, struct request_sock *req,
+ unsigned long timeout)
+{
+ struct listen_sock *lopt = queue->listen_opt;
- i = (i + 1) & (lopt->nr_table_entries - 1);
+ req->num_retrans = 0;
+ req->num_timeout = 0;
+ req->sk = NULL;
- } while (--budget > 0);
+ /* before letting lookups find us, make sure all req fields
+ * are committed to memory and refcnt initialized.
+ */
+ smp_wmb();
+ atomic_set(&req->rsk_refcnt, 2);
+ setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
+ req->rsk_hash = hash;
- lopt->clock_hand = i;
+ write_lock(&queue->syn_wait_lock);
+ req->dl_next = lopt->syn_table[hash];
+ lopt->syn_table[hash] = req;
+ write_unlock(&queue->syn_wait_lock);
- if (lopt->qlen)
- inet_csk_reset_keepalive_timer(parent, interval);
+ mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+EXPORT_SYMBOL(reqsk_queue_hash_req);
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
@@ -679,6 +682,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
newsk->sk_write_space = sk_stream_write_space;
newsk->sk_mark = inet_rsk(req)->ir_mark;
+ atomic64_set(&newsk->sk_cookie,
+ atomic64_read(&inet_rsk(req)->ir_cookie));
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
@@ -785,8 +790,6 @@ void inet_csk_listen_stop(struct sock *sk)
struct request_sock *acc_req;
struct request_sock *req;
- inet_csk_delete_keepalive_timer(sk);
-
/* make all the listen_opt local to us */
acc_req = reqsk_queue_yank_acceptq(queue);
@@ -816,9 +819,9 @@ void inet_csk_listen_stop(struct sock *sk)
percpu_counter_inc(sk->sk_prot->orphan_count);
- if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(tcp_sk(child)->fastopen_rsk != req);
- BUG_ON(sk != tcp_rsk(req)->listener);
+ BUG_ON(sk != req->rsk_listener);
/* Paranoid, to prevent race condition if
* an inbound pkt destined for child is
@@ -827,7 +830,6 @@ void inet_csk_listen_stop(struct sock *sk)
* tcp_v4_destroy_sock().
*/
tcp_sk(child)->fastopen_rsk = NULL;
- sock_put(sk);
}
inet_csk_destroy_sock(child);
@@ -836,7 +838,7 @@ void inet_csk_listen_stop(struct sock *sk)
sock_put(child);
sk_acceptq_removed(sk);
- __reqsk_free(req);
+ reqsk_put(req);
}
if (queue->fastopenq != NULL) {
/* Free all the reqs queued in rskq_rst_head. */
@@ -846,7 +848,7 @@ void inet_csk_listen_stop(struct sock *sk)
spin_unlock_bh(&queue->fastopenq->lock);
while ((req = acc_req) != NULL) {
acc_req = req->dl_next;
- __reqsk_free(req);
+ reqsk_put(req);
}
}
WARN_ON(sk->sk_ack_backlog);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 592aff3..f984b20 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -38,16 +38,12 @@
static const struct inet_diag_handler **inet_diag_table;
struct inet_diag_entry {
- __be32 *saddr;
- __be32 *daddr;
+ const __be32 *saddr;
+ const __be32 *daddr;
u16 sport;
u16 dport;
u16 family;
u16 userlocks;
-#if IS_ENABLED(CONFIG_IPV6)
- struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
- struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
-#endif
};
static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -65,12 +61,35 @@ static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
return inet_diag_table[proto];
}
-static inline void inet_diag_unlock_handler(
- const struct inet_diag_handler *handler)
+static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
{
mutex_unlock(&inet_diag_table_mutex);
}
+static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+{
+ r->idiag_family = sk->sk_family;
+
+ r->id.idiag_sport = htons(sk->sk_num);
+ r->id.idiag_dport = sk->sk_dport;
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
+ *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ } else
+#endif
+ {
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+ r->id.idiag_src[0] = sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = sk->sk_daddr;
+ }
+}
+
static size_t inet_sk_attr_size(void)
{
return nla_total_size(sizeof(struct tcp_info))
@@ -86,21 +105,21 @@ static size_t inet_sk_attr_size(void)
}
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
- struct sk_buff *skb, struct inet_diag_req_v2 *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ struct sk_buff *skb, const struct inet_diag_req_v2 *req,
+ struct user_namespace *user_ns,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
{
const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_diag_handler *handler;
+ int ext = req->idiag_ext;
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
struct nlattr *attr;
void *info = NULL;
- const struct inet_diag_handler *handler;
- int ext = req->idiag_ext;
handler = inet_diag_table[req->sdiag_protocol];
- BUG_ON(handler == NULL);
+ BUG_ON(!handler);
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
nlmsg_flags);
@@ -108,25 +127,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
return -EMSGSIZE;
r = nlmsg_data(nlh);
- BUG_ON(sk->sk_state == TCP_TIME_WAIT);
+ BUG_ON(!sk_fullsock(sk));
- r->idiag_family = sk->sk_family;
+ inet_diag_msg_common_fill(r, sk);
r->idiag_state = sk->sk_state;
r->idiag_timer = 0;
r->idiag_retrans = 0;
- r->id.idiag_if = sk->sk_bound_dev_if;
- sock_diag_save_cookie(sk, r->id.idiag_cookie);
-
- r->id.idiag_sport = inet->inet_sport;
- r->id.idiag_dport = inet->inet_dport;
-
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
-
- r->id.idiag_src[0] = inet->inet_rcv_saddr;
- r->id.idiag_dst[0] = inet->inet_daddr;
-
if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
goto errout;
@@ -139,10 +146,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
#if IS_ENABLED(CONFIG_IPV6)
if (r->idiag_family == AF_INET6) {
-
- *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
-
if (ext & (1 << (INET_DIAG_TCLASS - 1)))
if (nla_put_u8(skb, INET_DIAG_TCLASS,
inet6_sk(sk)->tclass) < 0)
@@ -169,7 +172,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
goto errout;
- if (icsk == NULL) {
+ if (!icsk) {
handler->idiag_get_info(sk, r, NULL);
goto out;
}
@@ -227,23 +230,25 @@ errout:
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
static int inet_csk_diag_fill(struct sock *sk,
- struct sk_buff *skb, struct inet_diag_req_v2 *req,
+ struct sk_buff *skb,
+ const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- return inet_sk_diag_fill(sk, inet_csk(sk),
- skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
+ user_ns, portid, seq, nlmsg_flags, unlh);
}
-static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
- struct sk_buff *skb, struct inet_diag_req_v2 *req,
+static int inet_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- s32 tmo;
+ struct inet_timewait_sock *tw = inet_twsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
+ s32 tmo;
nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
nlmsg_flags);
@@ -257,21 +262,9 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
if (tmo < 0)
tmo = 0;
- r->idiag_family = tw->tw_family;
+ inet_diag_msg_common_fill(r, sk);
r->idiag_retrans = 0;
- r->id.idiag_if = tw->tw_bound_dev_if;
- sock_diag_save_cookie(tw, r->id.idiag_cookie);
-
- r->id.idiag_sport = tw->tw_sport;
- r->id.idiag_dport = tw->tw_dport;
-
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
-
- r->id.idiag_src[0] = tw->tw_rcv_saddr;
- r->id.idiag_dst[0] = tw->tw_daddr;
-
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
r->idiag_expires = jiffies_to_msecs(tmo);
@@ -279,61 +272,91 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
r->idiag_wqueue = 0;
r->idiag_uid = 0;
r->idiag_inode = 0;
-#if IS_ENABLED(CONFIG_IPV6)
- if (tw->tw_family == AF_INET6) {
- *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr;
- }
-#endif
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ u32 portid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+ nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
+ r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
nlmsg_end(skb, nlh);
return 0;
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- struct inet_diag_req_v2 *r,
+ const struct inet_diag_req_v2 *r,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
+ return inet_twsk_diag_fill(sk, skb, portid, seq,
nlmsg_flags, unlh);
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return inet_req_diag_fill(sk, skb, portid, seq,
+ nlmsg_flags, unlh);
+
return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
nlmsg_flags, unlh);
}
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
- const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
+ struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
{
- int err;
- struct sock *sk;
- struct sk_buff *rep;
struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
err = -EINVAL;
- if (req->sdiag_family == AF_INET) {
+ if (req->sdiag_family == AF_INET)
sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
- }
#if IS_ENABLED(CONFIG_IPV6)
- else if (req->sdiag_family == AF_INET6) {
+ else if (req->sdiag_family == AF_INET6)
sk = inet6_lookup(net, hashinfo,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
req->id.idiag_sport,
req->id.idiag_if);
- }
#endif
- else {
+ else
goto out_nosk;
- }
err = -ENOENT;
- if (sk == NULL)
+ if (!sk)
goto out_nosk;
err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
@@ -371,7 +394,7 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
static int inet_diag_get_exact(struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ const struct inet_diag_req_v2 *req)
{
const struct inet_diag_handler *handler;
int err;
@@ -412,9 +435,8 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
return 1;
}
-
static int inet_diag_bc_run(const struct nlattr *_bc,
- const struct inet_diag_entry *entry)
+ const struct inet_diag_entry *entry)
{
const void *bc = nla_data(_bc);
int len = nla_len(_bc);
@@ -446,10 +468,10 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
break;
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND: {
- struct inet_diag_hostcond *cond;
- __be32 *addr;
+ const struct inet_diag_hostcond *cond;
+ const __be32 *addr;
- cond = (struct inet_diag_hostcond *)(op + 1);
+ cond = (const struct inet_diag_hostcond *)(op + 1);
if (cond->port != -1 &&
cond->port != (op->code == INET_DIAG_BC_S_COND ?
entry->sport : entry->dport)) {
@@ -498,29 +520,36 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
return len == 0;
}
+/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV)
+ */
+static void entry_fill_addrs(struct inet_diag_entry *entry,
+ const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
+ entry->daddr = sk->sk_v6_daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry->saddr = &sk->sk_rcv_saddr;
+ entry->daddr = &sk->sk_daddr;
+ }
+}
+
int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
{
- struct inet_diag_entry entry;
struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_entry entry;
- if (bc == NULL)
+ if (!bc)
return 1;
entry.family = sk->sk_family;
-#if IS_ENABLED(CONFIG_IPV6)
- if (entry.family == AF_INET6) {
-
- entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
- entry.daddr = sk->sk_v6_daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &inet->inet_rcv_saddr;
- entry.daddr = &inet->inet_daddr;
- }
+ entry_fill_addrs(&entry, sk);
entry.sport = inet->inet_num;
entry.dport = ntohs(inet->inet_dport);
- entry.userlocks = sk->sk_userlocks;
+ entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
return inet_diag_bc_run(bc, &entry);
}
@@ -547,8 +576,8 @@ static int valid_cc(const void *bc, int len, int cc)
static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
int *min_len)
{
- int addr_len;
struct inet_diag_hostcond *cond;
+ int addr_len;
/* Check hostcond space. */
*min_len += sizeof(struct inet_diag_hostcond);
@@ -582,8 +611,8 @@ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
}
/* Validate a port comparison operator. */
-static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
- int len, int *min_len)
+static bool valid_port_comparison(const struct inet_diag_bc_op *op,
+ int len, int *min_len)
{
/* Port comparisons put the port in a follow-on inet_diag_bc_op. */
*min_len += sizeof(struct inet_diag_bc_op);
@@ -598,10 +627,9 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
int len = bytecode_len;
while (len > 0) {
- const struct inet_diag_bc_op *op = bc;
int min_len = sizeof(struct inet_diag_bc_op);
+ const struct inet_diag_bc_op *op = bc;
-//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
switch (op->code) {
case INET_DIAG_BC_S_COND:
case INET_DIAG_BC_D_COND:
@@ -642,7 +670,7 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
- struct inet_diag_req_v2 *r,
+ const struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
if (!inet_diag_bc_sk(bc, sk))
@@ -654,139 +682,42 @@ static int inet_csk_diag_dump(struct sock *sk,
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
}
-static int inet_twsk_diag_dump(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb,
- struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
+static void twsk_build_assert(void)
{
- struct inet_timewait_sock *tw = inet_twsk(sk);
-
- if (bc != NULL) {
- struct inet_diag_entry entry;
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
- entry.family = tw->tw_family;
-#if IS_ENABLED(CONFIG_IPV6)
- if (tw->tw_family == AF_INET6) {
- entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
- entry.daddr = tw->tw_v6_daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &tw->tw_rcv_saddr;
- entry.daddr = &tw->tw_daddr;
- }
- entry.sport = tw->tw_num;
- entry.dport = ntohs(tw->tw_dport);
- entry.userlocks = 0;
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
- if (!inet_diag_bc_run(bc, &entry))
- return 0;
- }
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
- return inet_twsk_diag_fill(tw, skb, r,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
-}
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
-/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses
- * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6.
- */
-static inline void inet_diag_req_addrs(const struct sock *sk,
- const struct request_sock *req,
- struct inet_diag_entry *entry)
-{
- struct inet_request_sock *ireq = inet_rsk(req);
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- if (req->rsk_ops->family == AF_INET6) {
- entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
- entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
- } else if (req->rsk_ops->family == AF_INET) {
- ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
- &entry->saddr_storage);
- ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
- &entry->daddr_storage);
- entry->saddr = entry->saddr_storage.s6_addr32;
- entry->daddr = entry->daddr_storage.s6_addr32;
- }
- } else
-#endif
- {
- entry->saddr = &ireq->ir_loc_addr;
- entry->daddr = &ireq->ir_rmt_addr;
- }
-}
-
-static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
- struct request_sock *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq,
- const struct nlmsghdr *unlh)
-{
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct inet_sock *inet = inet_sk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- NLM_F_MULTI);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- r->idiag_family = sk->sk_family;
- r->idiag_state = TCP_SYN_RECV;
- r->idiag_timer = 1;
- r->idiag_retrans = req->num_retrans;
-
- r->id.idiag_if = sk->sk_bound_dev_if;
- sock_diag_save_cookie(req, r->id.idiag_cookie);
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
- tmo = req->expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.idiag_sport = inet->inet_sport;
- r->id.idiag_dport = ireq->ir_rmt_port;
-
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
-
- r->id.idiag_src[0] = ireq->ir_loc_addr;
- r->id.idiag_dst[0] = ireq->ir_rmt_addr;
-
- r->idiag_expires = jiffies_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
- r->idiag_inode = 0;
-#if IS_ENABLED(CONFIG_IPV6)
- if (r->idiag_family == AF_INET6) {
- struct inet_diag_entry entry;
- inet_diag_req_addrs(sk, req, &entry);
- memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr));
- memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr));
- }
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
#endif
-
- nlmsg_end(skb, nlh);
- return 0;
}
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
struct netlink_callback *cb,
- struct inet_diag_req_v2 *r,
+ const struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
- struct inet_diag_entry entry;
struct inet_connection_sock *icsk = inet_csk(sk);
- struct listen_sock *lopt;
struct inet_sock *inet = inet_sk(sk);
- int j, s_j;
- int reqnum, s_reqnum;
+ struct inet_diag_entry entry;
+ int j, s_j, reqnum, s_reqnum;
+ struct listen_sock *lopt;
int err = 0;
s_j = cb->args[3];
@@ -800,10 +731,10 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
lopt = icsk->icsk_accept_queue.listen_opt;
- if (!lopt || !lopt->qlen)
+ if (!lopt || !listen_sock_qlen(lopt))
goto out;
- if (bc != NULL) {
+ if (bc) {
entry.sport = inet->inet_num;
entry.userlocks = sk->sk_userlocks;
}
@@ -822,17 +753,18 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
continue;
if (bc) {
- inet_diag_req_addrs(sk, req, &entry);
+ /* Note: entry.sport and entry.userlocks are already set */
+ entry_fill_addrs(&entry, req_to_sk(req));
entry.dport = ntohs(ireq->ir_rmt_port);
if (!inet_diag_bc_run(bc, &entry))
continue;
}
- err = inet_diag_fill_req(skb, sk, req,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, cb->nlh);
+ err = inet_req_diag_fill(req_to_sk(req), skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, cb->nlh);
if (err < 0) {
cb->args[3] = j + 1;
cb->args[4] = reqnum;
@@ -850,11 +782,11 @@ out:
}
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
- struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
- int i, num;
- int s_i, s_num;
struct net *net = sock_net(skb->sk);
+ int i, num, s_i, s_num;
s_i = cb->args[1];
s_num = num = cb->args[2];
@@ -864,9 +796,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
- struct sock *sk;
- struct hlist_nulls_node *node;
struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+ struct sock *sk;
num = 0;
ilb = &hashinfo->listening_hash[i];
@@ -883,7 +815,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
}
if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
+ sk->sk_family != r->sdiag_family)
goto next_listen;
if (r->id.idiag_sport != inet->inet_sport &&
@@ -931,8 +863,8 @@ skip_listen_ht:
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
- struct sock *sk;
struct hlist_nulls_node *node;
+ struct sock *sk;
num = 0;
@@ -944,8 +876,7 @@ skip_listen_ht:
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &head->chain) {
- int res;
- int state;
+ int state, res;
if (!net_eq(sock_net(sk), net))
continue;
@@ -964,10 +895,16 @@ skip_listen_ht:
if (r->id.idiag_dport != sk->sk_dport &&
r->id.idiag_dport)
goto next_normal;
- if (sk->sk_state == TCP_TIME_WAIT)
- res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
- else
- res = inet_csk_diag_dump(sk, skb, cb, r, bc);
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(bc, sk))
+ goto next_normal;
+
+ res = sk_diag_fill(sk, skb, r,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->nlh);
if (res < 0) {
spin_unlock_bh(lock);
goto done;
@@ -988,7 +925,8 @@ out:
EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc)
{
const struct inet_diag_handler *handler;
int err = 0;
@@ -1005,8 +943,8 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nlattr *bc = NULL;
int hdrlen = sizeof(struct inet_diag_req_v2);
+ struct nlattr *bc = NULL;
if (nlmsg_attrlen(cb->nlh, hdrlen))
bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
@@ -1014,7 +952,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
}
-static inline int inet_diag_type2proto(int type)
+static int inet_diag_type2proto(int type)
{
switch (type) {
case TCPDIAG_GETSOCK:
@@ -1026,12 +964,13 @@ static inline int inet_diag_type2proto(int type)
}
}
-static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
+static int inet_diag_dump_compat(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
struct inet_diag_req *rc = nlmsg_data(cb->nlh);
+ int hdrlen = sizeof(struct inet_diag_req);
struct inet_diag_req_v2 req;
struct nlattr *bc = NULL;
- int hdrlen = sizeof(struct inet_diag_req);
req.sdiag_family = AF_UNSPEC; /* compatibility */
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
@@ -1046,7 +985,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c
}
static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh)
+ const struct nlmsghdr *nlh)
{
struct inet_diag_req *rc = nlmsg_data(nlh);
struct inet_diag_req_v2 req;
@@ -1075,7 +1014,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
attr = nlmsg_find_attr(nlh, hdrlen,
INET_DIAG_REQ_BYTECODE);
- if (attr == NULL ||
+ if (!attr ||
nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
return -EINVAL;
@@ -1102,9 +1041,10 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
if (h->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(h, hdrlen)) {
struct nlattr *attr;
+
attr = nlmsg_find_attr(h, hdrlen,
INET_DIAG_REQ_BYTECODE);
- if (attr == NULL ||
+ if (!attr ||
nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
return -EINVAL;
@@ -1140,7 +1080,7 @@ int inet_diag_register(const struct inet_diag_handler *h)
mutex_lock(&inet_diag_table_mutex);
err = -EEXIST;
- if (inet_diag_table[type] == NULL) {
+ if (!inet_diag_table[type]) {
inet_diag_table[type] = h;
err = 0;
}
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 9111a4e..0fb841b 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -24,9 +24,9 @@
#include <net/secure_seq.h>
#include <net/ip.h>
-static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
+static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
{
static u32 inet_ehash_secret __read_mostly;
@@ -36,17 +36,21 @@ static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
inet_ehash_secret + net_hash_mix(net));
}
-
-static unsigned int inet_sk_ehashfn(const struct sock *sk)
+/* This function handles inet_sock, but also timewait and request sockets
+ * for IPv4/IPv6.
+ */
+u32 sk_ehashfn(const struct sock *sk)
{
- const struct inet_sock *inet = inet_sk(sk);
- const __be32 laddr = inet->inet_rcv_saddr;
- const __u16 lport = inet->inet_num;
- const __be32 faddr = inet->inet_daddr;
- const __be16 fport = inet->inet_dport;
- struct net *net = sock_net(sk);
-
- return inet_ehashfn(net, laddr, lport, faddr, fport);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ return inet6_ehashfn(sock_net(sk),
+ &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+#endif
+ return inet_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
}
/*
@@ -61,7 +65,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb != NULL) {
- write_pnet(&tb->ib_net, hold_net(net));
+ write_pnet(&tb->ib_net, net);
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
@@ -79,7 +83,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
{
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
- release_net(ib_net(tb));
kmem_cache_free(cachep, tb);
}
}
@@ -263,11 +266,19 @@ void sock_gen_put(struct sock *sk)
if (sk->sk_state == TCP_TIME_WAIT)
inet_twsk_free(inet_twsk(sk));
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ reqsk_free(inet_reqsk(sk));
else
sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);
+void sock_edemux(struct sk_buff *skb)
+{
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_edemux);
+
struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
@@ -400,13 +411,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
- spinlock_t *lock;
struct inet_ehash_bucket *head;
+ spinlock_t *lock;
int twrefcnt = 0;
WARN_ON(!sk_unhashed(sk));
- sk->sk_hash = inet_sk_ehashfn(sk);
+ sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
@@ -423,15 +434,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
-static void __inet_hash(struct sock *sk)
+int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
- if (sk->sk_state != TCP_LISTEN) {
- __inet_hash_nolisten(sk, NULL);
- return;
- }
+ if (sk->sk_state != TCP_LISTEN)
+ return __inet_hash_nolisten(sk, tw);
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
@@ -440,13 +449,15 @@ static void __inet_hash(struct sock *sk)
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
spin_unlock(&ilb->lock);
+ return 0;
}
+EXPORT_SYMBOL(__inet_hash);
void inet_hash(struct sock *sk)
{
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- __inet_hash(sk);
+ __inet_hash(sk, NULL);
local_bh_enable();
}
}
@@ -477,8 +488,7 @@ EXPORT_SYMBOL_GPL(inet_unhash);
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
- struct sock *, __u16, struct inet_timewait_sock **),
- int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
+ struct sock *, __u16, struct inet_timewait_sock **))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
const unsigned short snum = inet_sk(sk)->inet_num;
@@ -548,7 +558,7 @@ ok:
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- twrefcnt += hash(sk, tw);
+ twrefcnt += __inet_hash_nolisten(sk, tw);
}
if (tw)
twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
@@ -570,7 +580,7 @@ ok:
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- hash(sk, NULL);
+ __inet_hash_nolisten(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
@@ -590,7 +600,7 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
- __inet_check_established, __inet_hash_nolisten);
+ __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 6d592f8..f38e387 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -98,7 +98,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw)
#ifdef SOCK_REFCNT_DEBUG
pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
#endif
- release_net(twsk_net(tw));
kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
module_put(owner);
}
@@ -195,7 +194,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
tw->tw_ipv6only = 0;
tw->tw_transparent = inet->transparent;
tw->tw_prot = sk->sk_prot_creator;
- twsk_net_set(tw, hold_net(sock_net(sk)));
+ atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
+ twsk_net_set(tw, sock_net(sk));
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
@@ -487,6 +487,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo,
for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
restart_rcu:
+ cond_resched();
rcu_read_lock();
restart:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5cd9927..f6a0d54 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -536,12 +536,34 @@ out:
* Socket option code for IP. This is the end of the line after any
* TCP,UDP etc options on an IP socket.
*/
+static bool setsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_ADD_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_BLOCK_SOURCE:
+ case IP_DROP_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case IP_MSFILTER:
+ case IP_UNBLOCK_SOURCE:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_MSFILTER:
+ case MCAST_JOIN_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MCAST_UNBLOCK_SOURCE:
+ return true;
+ }
+ return false;
+}
static int do_ip_setsockopt(struct sock *sk, int level,
int optname, char __user *optval, unsigned int optlen)
{
struct inet_sock *inet = inet_sk(sk);
int val = 0, err;
+ bool needs_rtnl = setsockopt_needs_rtnl(optname);
switch (optname) {
case IP_PKTINFO:
@@ -584,6 +606,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
return ip_mroute_setsockopt(sk, optname, optval, optlen);
err = 0;
+ if (needs_rtnl)
+ rtnl_lock();
lock_sock(sk);
switch (optname) {
@@ -1118,10 +1142,14 @@ mc_msf_out:
break;
}
release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
return err;
e_inval:
release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
return -EINVAL;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9d78427..5b18883 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -73,9 +73,7 @@
struct mr_table {
struct list_head list;
-#ifdef CONFIG_NET_NS
- struct net *net;
-#endif
+ possible_net_t net;
u32 id;
struct sock __rcu *mroute_sk;
struct timer_list ipmr_expire_timer;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 59f883d..fb20f36 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,24 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
-config NF_LOG_ARP
- tristate "ARP packet logging"
- default m if NETFILTER_ADVANCED=n
- select NF_LOG_COMMON
-
-config NF_LOG_IPV4
- tristate "IPv4 packet logging"
- default m if NETFILTER_ADVANCED=n
- select NF_LOG_COMMON
+if NF_TABLES
config NF_TABLES_IPV4
- depends on NF_TABLES
tristate "IPv4 nf_tables support"
help
This option enables the IPv4 support for nf_tables.
+if NF_TABLES_IPV4
+
config NFT_CHAIN_ROUTE_IPV4
- depends on NF_TABLES_IPV4
tristate "IPv4 nf_tables route chain support"
help
This option enables the "route" chain for IPv4 in nf_tables. This
@@ -61,22 +53,34 @@ config NFT_CHAIN_ROUTE_IPV4
fields such as the source, destination, type of service and
the packet mark.
-config NF_REJECT_IPV4
- tristate "IPv4 packet rejection"
- default m if NETFILTER_ADVANCED=n
-
config NFT_REJECT_IPV4
- depends on NF_TABLES_IPV4
select NF_REJECT_IPV4
default NFT_REJECT
tristate
+endif # NF_TABLES_IPV4
+
config NF_TABLES_ARP
- depends on NF_TABLES
tristate "ARP nf_tables support"
help
This option enables the ARP support for nf_tables.
+endif # NF_TABLES
+
+config NF_LOG_ARP
+ tristate "ARP packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_LOG_IPV4
+ tristate "IPv4 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_REJECT_IPV4
+ tristate "IPv4 packet rejection"
+ default m if NETFILTER_ADVANCED=n
+
config NF_NAT_IPV4
tristate "IPv4 NAT"
depends on NF_CONNTRACK_IPV4
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index e90f83a..f75e9df 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -418,6 +418,13 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
+
+ if (!par->net->xt.clusterip_deprecated_warning) {
+ pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
+ "use xt_cluster instead\n");
+ par->net->xt.clusterip_deprecated_warning = true;
+ }
+
return ret;
}
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 8f48f55..87907d4 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -34,31 +34,32 @@ static unsigned int
reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_reject_info *reject = par->targinfo;
+ int hook = par->hooknum;
switch (reject->with) {
case IPT_ICMP_NET_UNREACHABLE:
- nf_send_unreach(skb, ICMP_NET_UNREACH);
+ nf_send_unreach(skb, ICMP_NET_UNREACH, hook);
break;
case IPT_ICMP_HOST_UNREACHABLE:
- nf_send_unreach(skb, ICMP_HOST_UNREACH);
+ nf_send_unreach(skb, ICMP_HOST_UNREACH, hook);
break;
case IPT_ICMP_PROT_UNREACHABLE:
- nf_send_unreach(skb, ICMP_PROT_UNREACH);
+ nf_send_unreach(skb, ICMP_PROT_UNREACH, hook);
break;
case IPT_ICMP_PORT_UNREACHABLE:
- nf_send_unreach(skb, ICMP_PORT_UNREACH);
+ nf_send_unreach(skb, ICMP_PORT_UNREACH, hook);
break;
case IPT_ICMP_NET_PROHIBITED:
- nf_send_unreach(skb, ICMP_NET_ANO);
+ nf_send_unreach(skb, ICMP_NET_ANO, hook);
break;
case IPT_ICMP_HOST_PROHIBITED:
- nf_send_unreach(skb, ICMP_HOST_ANO);
+ nf_send_unreach(skb, ICMP_HOST_ANO, hook);
break;
case IPT_ICMP_ADMIN_PROHIBITED:
- nf_send_unreach(skb, ICMP_PKT_FILTERED);
+ nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(skb, par->hooknum);
+ nf_send_reset(skb, hook);
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 536da7b..c5b794d 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
const struct sk_buff *oldskb,
- __be16 protocol, int ttl)
+ __u8 protocol, int ttl)
{
struct iphdr *niph, *oiph = ip_hdr(oldskb);
@@ -164,4 +164,27 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
}
EXPORT_SYMBOL_GPL(nf_send_reset);
+void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
+{
+ struct iphdr *iph = ip_hdr(skb_in);
+ u8 proto;
+
+ if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET))
+ return;
+
+ if (skb_csum_unnecessary(skb_in)) {
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+ return;
+ }
+
+ if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP)
+ proto = iph->protocol;
+ else
+ proto = 0;
+
+ if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
+ icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+EXPORT_SYMBOL_GPL(nf_send_unreach);
+
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index d729542..16a5d4d 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -27,7 +27,8 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nf_send_unreach(pkt->skb, priv->icmp_code);
+ nf_send_unreach(pkt->skb, priv->icmp_code,
+ pkt->ops->hooknum);
break;
case NFT_REJECT_TCP_RST:
nf_send_reset(pkt->skb, pkt->ops->hooknum);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 208d543..344e7cd 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -64,11 +64,11 @@ EXPORT_SYMBOL_GPL(pingv6_ops);
static u16 ping_port_rover;
-static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask)
+static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
{
- int res = (num + net_hash_mix(net)) & mask;
+ u32 res = (num + net_hash_mix(net)) & mask;
- pr_debug("hash(%d) = %d\n", num, res);
+ pr_debug("hash(%u) = %u\n", num, res);
return res;
}
EXPORT_SYMBOL_GPL(ping_hash);
@@ -692,8 +692,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
}
EXPORT_SYMBOL_GPL(ping_common_sendmsg);
-static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct net *net = sock_net(sk);
struct flowi4 fl4;
@@ -849,8 +848,8 @@ do_confirm:
goto out;
}
-int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
{
struct inet_sock *isk = inet_sk(sk);
int family = sk->sk_family;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f027a70..923cf53 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -481,8 +481,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
}
-static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct ipcm_cookie ipc;
@@ -709,8 +708,8 @@ out: return ret;
* we return it, otherwise we block.
*/
-static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int noblock, int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ad50643..649c8a3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -152,7 +152,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
- .protocol = cpu_to_be16(ETH_P_IP),
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
.mtu = ipv4_mtu,
@@ -2225,7 +2224,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
static struct dst_ops ipv4_dst_blackhole_ops = {
.family = AF_INET,
- .protocol = cpu_to_be16(ETH_P_IP),
.check = ipv4_blackhole_dst_check,
.mtu = ipv4_blackhole_mtu,
.default_advmss = ipv4_default_advmss,
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 45fe60c..805dc44 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -219,19 +219,20 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_check);
-static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst)
+static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
- if (child)
+ if (child) {
+ atomic_set(&req->rsk_refcnt, 1);
inet_csk_reqsk_queue_add(sk, req, child);
- else
+ } else {
reqsk_free(req);
-
+ }
return child;
}
@@ -325,7 +326,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
if (!req)
goto out;
@@ -336,8 +337,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
req->mss = mss;
ireq->ir_num = ntohs(th->dest);
ireq->ir_rmt_port = th->source;
- ireq->ir_loc_addr = ip_hdr(skb)->daddr;
- ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
ireq->ir_mark = inet_request_mark(sk, skb);
ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
@@ -345,7 +346,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->tstamp_ok = tcp_opt.saw_tstamp;
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
- treq->listener = NULL;
+ treq->tfo_listener = false;
+ ireq->ireq_family = AF_INET;
+
+ ireq->ir_iif = sk->sk_bound_dev_if;
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -357,7 +361,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
}
- req->expires = 0UL;
req->num_retrans = 0;
/*
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d151539..fdf8991 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -883,6 +883,20 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "tcp_probe_threshold",
+ .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_probe_interval",
+ .data = &init_net.ipv4.sysctl_tcp_probe_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 995a225..62f3842 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1060,8 +1060,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
return err;
}
-int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t size)
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1539,8 +1538,8 @@ EXPORT_SYMBOL(tcp_read_sock);
* Probably, code can be easily improved even more.
*/
-int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int nonblock, int flags, int *addr_len)
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len)
{
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0;
@@ -3001,12 +3000,11 @@ static void __init tcp_init_mem(void)
void __init tcp_init(void)
{
- struct sk_buff *skb = NULL;
unsigned long limit;
int max_rshare, max_wshare, cnt;
unsigned int i;
- BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
+ sock_skb_cb_check_size(sizeof(struct tcp_skb_cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 62856e1..7a5ae50 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -83,7 +83,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
ret = -EEXIST;
} else {
list_add_tail_rcu(&ca->list, &tcp_cong_list);
- pr_info("%s registered\n", ca->name);
+ pr_debug("%s registered\n", ca->name);
}
spin_unlock(&tcp_cong_list_lock);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 0d73f9d..86dc119 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -34,13 +34,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
}
static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ const struct inet_diag_req_v2 *req)
{
return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ea82fd4..2eb887e 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -155,12 +155,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
tp = tcp_sk(child);
tp->fastopen_rsk = req;
- /* Do a hold on the listner sk so that if the listener is being
- * closed, the child that has been accepted can live on and still
- * access listen_lock.
- */
- sock_hold(sk);
- tcp_rsk(req)->listener = sk;
+ tcp_rsk(req)->tfo_listener = true;
/* RFC1323: The window in SYN & SYN/ACK segments is never
* scaled. So correct it appropriately.
@@ -174,6 +169,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+ atomic_set(&req->rsk_refcnt, 1);
/* Add the child socket directly into the accept queue */
inet_csk_reqsk_queue_add(sk, req, child);
@@ -221,7 +217,6 @@ static bool tcp_fastopen_create_child(struct sock *sk,
WARN_ON(req->sk == NULL);
return true;
}
-EXPORT_SYMBOL(tcp_fastopen_create_child);
static bool tcp_fastopen_queue_check(struct sock *sk)
{
@@ -245,7 +240,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
struct request_sock *req1;
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
- if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
+ if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
spin_unlock(&fastopenq->lock);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
@@ -254,7 +249,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
fastopenq->rskq_rst_head = req1->dl_next;
fastopenq->qlen--;
spin_unlock(&fastopenq->lock);
- reqsk_free(req1);
+ reqsk_put(req1);
}
return true;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fb4cf8b..95caea7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3321,6 +3321,36 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
return flag;
}
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+ !tcp_hdr(skb)->syn)
+ goto not_rate_limited;
+
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS_BH(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
{
@@ -5664,7 +5694,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+ if (tcp_check_req(sk, skb, req, true) == NULL)
goto discard;
}
@@ -5912,6 +5942,51 @@ static void tcp_ecn_create_request(struct request_sock *req,
inet_rsk(req)->ecn_ok = 1;
}
+static void tcp_openreq_init(struct request_sock *req,
+ const struct tcp_options_received *rx_opt,
+ struct sk_buff *skb, const struct sock *sk)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->cookie_ts = 0;
+ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+ tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->last_oow_ack_time = 0;
+ req->mss = rx_opt->mss_clamp;
+ req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+ ireq->tstamp_ok = rx_opt->tstamp_ok;
+ ireq->sack_ok = rx_opt->sack_ok;
+ ireq->snd_wscale = rx_opt->snd_wscale;
+ ireq->wscale_ok = rx_opt->wscale_ok;
+ ireq->acked = 0;
+ ireq->ecn_ok = 0;
+ ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+}
+
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener)
+{
+ struct request_sock *req = reqsk_alloc(ops, sk_listener);
+
+ if (req) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ kmemcheck_annotate_bitfield(ireq, flags);
+ ireq->opt = NULL;
+ atomic64_set(&ireq->ir_cookie, 0);
+ ireq->ireq_state = TCP_NEW_SYN_RECV;
+ write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+
+ }
+
+ return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
@@ -5949,7 +6024,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
- req = inet_reqsk_alloc(rsk_ops);
+ req = inet_reqsk_alloc(rsk_ops, sk);
if (!req)
goto drop;
@@ -5966,6 +6041,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
+ /* Note: tcp_v6_init_req() might override ir_iif for link locals */
+ inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+
af_ops->init_req(req, sk, skb);
if (security_inet_conn_request(sk, skb, req))
@@ -6038,7 +6116,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (err || want_cookie)
goto drop_and_free;
- tcp_rsk(req)->listener = NULL;
+ tcp_rsk(req)->tfo_listener = false;
af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5a2dfed..5554b8f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -189,7 +189,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!inet->inet_saddr)
inet->inet_saddr = fl4->saddr;
- inet->inet_rcv_saddr = inet->inet_saddr;
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
@@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tcp_fetch_timewait_stamp(sk, &rt->dst);
inet->inet_dport = usin->sin_port;
- inet->inet_daddr = daddr;
+ sk_daddr_set(sk, daddr);
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet_opt)
@@ -458,12 +458,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
switch (sk->sk_state) {
- struct request_sock *req, **prev;
+ struct request_sock *req;
case TCP_LISTEN:
if (sock_owned_by_user(sk))
goto out;
- req = inet_csk_search_req(sk, &prev, th->dest,
+ req = inet_csk_search_req(sk, th->dest,
iph->daddr, iph->saddr);
if (!req)
goto out;
@@ -475,6 +475,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ reqsk_put(req);
goto out;
}
@@ -484,8 +485,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- inet_csk_reqsk_queue_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ reqsk_put(req);
goto out;
case TCP_SYN_SENT:
@@ -1219,15 +1221,16 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
#endif
-static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
+static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
- ireq->ir_loc_addr = ip_hdr(skb)->daddr;
- ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
- ireq->no_srccheck = inet_sk(sk)->transparent;
+ sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+ sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+ ireq->no_srccheck = inet_sk(sk_listener)->transparent;
ireq->opt = tcp_v4_save_options(skb);
+ ireq->ireq_family = AF_INET;
}
static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
@@ -1318,8 +1321,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->inet_daddr = ireq->ir_rmt_addr;
- newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+ sk_daddr_set(newsk, ireq->ir_rmt_addr);
+ sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
newinet->inet_saddr = ireq->ir_loc_addr;
inet_opt = ireq->opt;
rcu_assign_pointer(newinet->inet_opt, inet_opt);
@@ -1391,15 +1394,17 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
+ struct request_sock *req;
struct sock *nsk;
- struct request_sock **prev;
- /* Find possible connection requests. */
- struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
- iph->saddr, iph->daddr);
- if (req)
- return tcp_check_req(sk, skb, req, prev, false);
+
+ req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
+ if (req) {
+ nsk = tcp_check_req(sk, skb, req, false);
+ reqsk_put(req);
+ return nsk;
+ }
nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1517,7 +1522,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
- if (sk->sk_state != TCP_TIME_WAIT) {
+ if (sk_fullsock(sk)) {
struct dst_entry *dst = sk->sk_rx_dst;
if (dst)
@@ -2204,17 +2209,17 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
}
EXPORT_SYMBOL(tcp_proc_unregister);
-static void get_openreq4(const struct sock *sk, const struct request_sock *req,
+static void get_openreq4(const struct request_sock *req,
struct seq_file *f, int i, kuid_t uid)
{
const struct inet_request_sock *ireq = inet_rsk(req);
- long delta = req->expires - jiffies;
+ long delta = req->rsk_timer.expires - jiffies;
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
i,
ireq->ir_loc_addr,
- ntohs(inet_sk(sk)->inet_sport),
+ ireq->ir_num,
ireq->ir_rmt_addr,
ntohs(ireq->ir_rmt_port),
TCP_SYN_RECV,
@@ -2225,7 +2230,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
from_kuid_munged(seq_user_ns(f), uid),
0, /* non standard timer */
0, /* open_requests have no inode */
- atomic_read(&sk->sk_refcnt),
+ 0,
req);
}
@@ -2332,7 +2337,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
get_tcp4_sock(v, seq, st->num);
break;
case TCP_SEQ_STATE_OPENREQ:
- get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid);
+ get_openreq4(v, seq, st->num, st->uid);
break;
}
out:
@@ -2460,6 +2465,8 @@ static int __net_init tcp_sk_init(struct net *net)
}
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+ net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
return 0;
fail:
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index e5f41bd..5bef351 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -40,6 +40,7 @@ struct tcp_fastopen_metrics {
struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next;
+ possible_net_t tcpm_net;
struct inetpeer_addr tcpm_saddr;
struct inetpeer_addr tcpm_daddr;
unsigned long tcpm_stamp;
@@ -52,6 +53,11 @@ struct tcp_metrics_block {
struct rcu_head rcu_head;
};
+static inline struct net *tm_net(struct tcp_metrics_block *tm)
+{
+ return read_pnet(&tm->tcpm_net);
+}
+
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
@@ -91,6 +97,9 @@ struct tcpm_hash_bucket {
struct tcp_metrics_block __rcu *chain;
};
+static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
+static unsigned int tcp_metrics_hash_log __read_mostly;
+
static DEFINE_SPINLOCK(tcp_metrics_lock);
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
@@ -143,6 +152,9 @@ static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst
#define TCP_METRICS_RECLAIM_DEPTH 5
#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
+#define deref_locked(p) \
+ rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))
+
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
struct inetpeer_addr *saddr,
struct inetpeer_addr *daddr,
@@ -171,9 +183,9 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
if (unlikely(reclaim)) {
struct tcp_metrics_block *oldest;
- oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
- for (tm = rcu_dereference(oldest->tcpm_next); tm;
- tm = rcu_dereference(tm->tcpm_next)) {
+ oldest = deref_locked(tcp_metrics_hash[hash].chain);
+ for (tm = deref_locked(oldest->tcpm_next); tm;
+ tm = deref_locked(tm->tcpm_next)) {
if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
oldest = tm;
}
@@ -183,14 +195,15 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
if (!tm)
goto out_unlock;
}
+ write_pnet(&tm->tcpm_net, net);
tm->tcpm_saddr = *saddr;
tm->tcpm_daddr = *daddr;
tcpm_suck_dst(tm, dst, true);
if (likely(!reclaim)) {
- tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
- rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
+ tm->tcpm_next = tcp_metrics_hash[hash].chain;
+ rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
}
out_unlock:
@@ -214,10 +227,11 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s
struct tcp_metrics_block *tm;
int depth = 0;
- for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, saddr) &&
- addr_same(&tm->tcpm_daddr, daddr))
+ addr_same(&tm->tcpm_daddr, daddr) &&
+ net_eq(tm_net(tm), net))
break;
depth++;
}
@@ -252,12 +266,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
}
net = dev_net(dst->dev);
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
- for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, &saddr) &&
- addr_same(&tm->tcpm_daddr, &daddr))
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
break;
}
tcpm_check_stamp(tm, dst);
@@ -299,12 +315,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
return NULL;
net = twsk_net(tw);
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
- for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, &saddr) &&
- addr_same(&tm->tcpm_daddr, &daddr))
+ addr_same(&tm->tcpm_daddr, &daddr) &&
+ net_eq(tm_net(tm), net))
break;
}
return tm;
@@ -347,7 +365,8 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
return NULL;
net = dev_net(dst->dev);
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
if (tm == TCP_METRICS_RECLAIM_PTR)
@@ -898,17 +917,19 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
unsigned int row, s_row = cb->args[0];
int s_col = cb->args[1], col = s_col;
for (row = s_row; row < max_rows; row++, s_col = 0) {
struct tcp_metrics_block *tm;
- struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;
rcu_read_lock();
for (col = 0, tm = rcu_dereference(hb->chain); tm;
tm = rcu_dereference(tm->tcpm_next), col++) {
+ if (!net_eq(tm_net(tm), net))
+ continue;
if (col < s_col)
continue;
if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
@@ -994,13 +1015,15 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
if (!reply)
goto nla_put_failure;
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
ret = -ESRCH;
rcu_read_lock();
- for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
+ for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_daddr, &daddr) &&
- (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
ret = tcp_metrics_fill_info(msg, tm);
break;
}
@@ -1020,34 +1043,27 @@ out_free:
return ret;
}
-#define deref_locked_genl(p) \
- rcu_dereference_protected(p, lockdep_genl_is_held() && \
- lockdep_is_held(&tcp_metrics_lock))
-
-#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
-
-static int tcp_metrics_flush_all(struct net *net)
+static void tcp_metrics_flush_all(struct net *net)
{
- unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
- struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
+ unsigned int max_rows = 1U << tcp_metrics_hash_log;
+ struct tcpm_hash_bucket *hb = tcp_metrics_hash;
struct tcp_metrics_block *tm;
unsigned int row;
for (row = 0; row < max_rows; row++, hb++) {
+ struct tcp_metrics_block __rcu **pp;
spin_lock_bh(&tcp_metrics_lock);
- tm = deref_locked_genl(hb->chain);
- if (tm)
- hb->chain = NULL;
- spin_unlock_bh(&tcp_metrics_lock);
- while (tm) {
- struct tcp_metrics_block *next;
-
- next = deref_genl(tm->tcpm_next);
- kfree_rcu(tm, rcu_head);
- tm = next;
+ pp = &hb->chain;
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
+ if (net_eq(tm_net(tm), net)) {
+ *pp = tm->tcpm_next;
+ kfree_rcu(tm, rcu_head);
+ } else {
+ pp = &tm->tcpm_next;
+ }
}
+ spin_unlock_bh(&tcp_metrics_lock);
}
- return 0;
}
static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
@@ -1064,19 +1080,23 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
ret = parse_nl_addr(info, &daddr, &hash, 1);
if (ret < 0)
return ret;
- if (ret > 0)
- return tcp_metrics_flush_all(net);
+ if (ret > 0) {
+ tcp_metrics_flush_all(net);
+ return 0;
+ }
ret = parse_nl_saddr(info, &saddr);
if (ret < 0)
src = false;
- hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
- hb = net->ipv4.tcp_metrics_hash + hash;
+ hash ^= net_hash_mix(net);
+ hash = hash_32(hash, tcp_metrics_hash_log);
+ hb = tcp_metrics_hash + hash;
pp = &hb->chain;
spin_lock_bh(&tcp_metrics_lock);
- for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) {
+ for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
if (addr_same(&tm->tcpm_daddr, &daddr) &&
- (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
+ (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
+ net_eq(tm_net(tm), net)) {
*pp = tm->tcpm_next;
kfree_rcu(tm, rcu_head);
found = true;
@@ -1126,6 +1146,9 @@ static int __net_init tcp_net_metrics_init(struct net *net)
size_t size;
unsigned int slots;
+ if (!net_eq(net, &init_net))
+ return 0;
+
slots = tcpmhash_entries;
if (!slots) {
if (totalram_pages >= 128 * 1024)
@@ -1134,14 +1157,14 @@ static int __net_init tcp_net_metrics_init(struct net *net)
slots = 8 * 1024;
}
- net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
- size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
+ tcp_metrics_hash_log = order_base_2(slots);
+ size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
- net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!net->ipv4.tcp_metrics_hash)
- net->ipv4.tcp_metrics_hash = vzalloc(size);
+ tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!tcp_metrics_hash)
+ tcp_metrics_hash = vzalloc(size);
- if (!net->ipv4.tcp_metrics_hash)
+ if (!tcp_metrics_hash)
return -ENOMEM;
return 0;
@@ -1149,19 +1172,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)
static void __net_exit tcp_net_metrics_exit(struct net *net)
{
- unsigned int i;
-
- for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
- struct tcp_metrics_block *tm, *next;
-
- tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
- while (tm) {
- next = rcu_dereference_protected(tm->tcpm_next, 1);
- kfree(tm);
- tm = next;
- }
- }
- kvfree(net->ipv4.tcp_metrics_hash);
+ tcp_metrics_flush_all(net);
}
static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
@@ -1175,16 +1186,10 @@ void __init tcp_metrics_init(void)
ret = register_pernet_subsys(&tcp_net_metrics_ops);
if (ret < 0)
- goto cleanup;
+ panic("Could not allocate the tcp_metrics hash table\n");
+
ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
tcp_metrics_nl_ops);
if (ret < 0)
- goto cleanup_subsys;
- return;
-
-cleanup_subsys:
- unregister_pernet_subsys(&tcp_net_metrics_ops);
-
-cleanup:
- return;
+ panic("Could not register tcp_metrics generic netlink\n");
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index dd11ac7..274e96f 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -572,7 +572,6 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct request_sock **prev,
bool fastopen)
{
struct tcp_options_received tmp_opt;
@@ -630,8 +629,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
&tcp_rsk(req)->last_oow_ack_time) &&
!inet_rtx_syn_ack(sk, req))
- req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
- TCP_RTO_MAX) + jiffies;
+ mod_timer_pending(&req->rsk_timer, jiffies +
+ min(TCP_TIMEOUT_INIT << req->num_timeout,
+ TCP_RTO_MAX));
return NULL;
}
@@ -766,7 +766,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (child == NULL)
goto listen_overflow;
- inet_csk_reqsk_queue_unlink(sk, req, prev);
+ inet_csk_reqsk_queue_unlink(sk, req);
inet_csk_reqsk_queue_removed(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
@@ -791,7 +791,7 @@ embryonic_reset:
tcp_reset(sk);
}
if (!fastopen) {
- inet_csk_reqsk_queue_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
}
return NULL;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 9d7930b..3f7c2fc 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -29,8 +29,8 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
}
}
-struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
return ERR_PTR(-EINVAL);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1db253e..c2f0f60 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
}
EXPORT_SYMBOL(tcp_mtup_init);
@@ -1752,20 +1754,23 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
bool *is_cwnd_limited, u32 max_segs)
{
- struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 send_win, cong_win, limit, in_flight;
+ u32 age, send_win, cong_win, limit, in_flight;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ struct sk_buff *head;
int win_divisor;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
goto send_now;
- /* Defer for less than two clock ticks. */
- if (tp->tso_deferred &&
- (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+ /* Avoid bursty behavior by allowing defer
+ * only if the last write was recent.
+ */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1807,11 +1812,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
goto send_now;
}
- /* Ok, it looks like it is advisable to defer.
- * Do not rearm the timer if already set to not break TCP ACK clocking.
- */
- if (!tp->tso_deferred)
- tp->tso_deferred = 1 | (jiffies << 1);
+ head = tcp_write_queue_head(sk);
+ skb_mstamp_get(&now);
+ age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+ /* If next ACK is likely to come too late (half srtt), do not defer */
+ if (age < (tp->srtt_us >> 4))
+ goto send_now;
+
+ /* Ok, it looks like it is advisable to defer. */
if (cong_win < send_win && cong_win < skb->len)
*is_cwnd_limited = true;
@@ -1819,10 +1827,34 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
return true;
send_now:
- tp->tso_deferred = 0;
return false;
}
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 interval;
+ s32 delta;
+
+ interval = net->ipv4.sysctl_tcp_probe_interval;
+ delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ if (unlikely(delta >= interval * HZ)) {
+ int mss = tcp_current_mss(sk);
+
+ /* Update current search range */
+ icsk->icsk_mtup.probe_size = 0;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+ sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+ /* Update probe time stamp */
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ }
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
@@ -1837,11 +1869,13 @@ static int tcp_mtu_probe(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *nskb, *next;
+ struct net *net = sock_net(sk);
int len;
int probe_size;
int size_needed;
int copy;
int mss_now;
+ int interval;
/* Not currently probing/verifying,
* not in recovery,
@@ -1854,12 +1888,25 @@ static int tcp_mtu_probe(struct sock *sk)
tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
- /* Very simple search strategy: just double the MSS. */
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
mss_now = tcp_current_mss(sk);
- probe_size = 2 * tp->mss_cache;
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
- if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
- /* TODO: set timer for probe_converge_event */
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* When misfortune happens, we are reprobing actively,
+ * and then reprobe timer has expired. We stick with current
+ * probing process by not resetting search range to its orignal.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ /* Check whether enough time has elaplased for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
return -1;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0732b78..3daa6b5 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
if (net->ipv4.sysctl_tcp_mtu_probing) {
if (!icsk->icsk_mtup.enabled) {
icsk->icsk_mtup.enabled = 1;
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else {
struct net *net = sock_net(sk);
@@ -538,16 +539,6 @@ static void tcp_write_timer(unsigned long data)
sock_put(sk);
}
-/*
- * Timer for listening sockets
- */
-
-static void tcp_synack_timer(struct sock *sk)
-{
- inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
- TCP_TIMEOUT_INIT, TCP_RTO_MAX);
-}
-
void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
{
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
@@ -582,7 +573,7 @@ static void tcp_keepalive_timer (unsigned long data)
}
if (sk->sk_state == TCP_LISTEN) {
- tcp_synack_timer(sk);
+ pr_err("Hmm... keepalive on a LISTEN ???\n");
goto out;
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 97ef1f8b..294af16 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -318,8 +318,8 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
}
-static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
- unsigned int port)
+static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
+ unsigned int port)
{
return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
}
@@ -421,9 +421,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
return score;
}
-static unsigned int udp_ehashfn(struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
+static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
{
static u32 udp_ehash_secret __read_mostly;
@@ -873,8 +873,7 @@ out:
}
EXPORT_SYMBOL(udp_push_pending_frames);
-int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len)
+int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
@@ -1136,7 +1135,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
* sendpage interface can't pass.
* This will succeed only when the socket is connected.
*/
- ret = udp_sendmsg(NULL, sk, &msg, 0);
+ ret = udp_sendmsg(sk, &msg, 0);
if (ret < 0)
return ret;
}
@@ -1254,8 +1253,8 @@ EXPORT_SYMBOL(udp_ioctl);
* return it, otherwise we block.
*/
-int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
@@ -2525,6 +2524,16 @@ void __init udp_table_init(struct udp_table *table, const char *name)
}
}
+u32 udp_flow_hashrnd(void)
+{
+ static u32 hashrnd __read_mostly;
+
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+
+ return hashrnd;
+}
+EXPORT_SYMBOL(udp_flow_hashrnd);
+
void __init udp_init(void)
{
unsigned long limit;
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 4a000f1..2dbfc1f 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -18,8 +18,9 @@
#include <linux/sock_diag.h>
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
- struct netlink_callback *cb, struct inet_diag_req_v2 *req,
- struct nlattr *bc)
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ struct nlattr *bc)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -31,7 +32,8 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
}
static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
- const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
{
int err = -EINVAL;
struct sock *sk;
@@ -90,8 +92,9 @@ out_nosk:
return err;
}
-static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+static void udp_dump(struct udp_table *table, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
int num, s_num, slot, s_slot;
struct net *net = sock_net(skb->sk);
@@ -144,13 +147,13 @@ done:
}
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
udp_dump(&udp_table, skb, cb, r, bc);
}
static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ const struct inet_diag_req_v2 *req)
{
return udp_dump_one(&udp_table, in_skb, nlh, req);
}
@@ -170,13 +173,14 @@ static const struct inet_diag_handler udp_diag_handler = {
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r,
+ struct nlattr *bc)
{
udp_dump(&udplite_table, skb, cb, r, bc);
}
static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
- struct inet_diag_req_v2 *req)
+ const struct inet_diag_req_v2 *req)
{
return udp_dump_one(&udplite_table, in_skb, nlh, req);
}
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index f3c2789..7e0fe4b 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -21,8 +21,8 @@ int compat_udp_setsockopt(struct sock *sk, int level, int optname,
int compat_udp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
#endif
-int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len);
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
+ int flags, int *addr_len);
int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6156f68..c224c85 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -232,7 +232,6 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static struct dst_ops xfrm4_dst_ops = {
.family = AF_INET,
- .protocol = cpu_to_be16(ETH_P_IP),
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
.redirect = xfrm4_redirect,
OpenPOWER on IntegriCloud