From 457c4cbc5a3dde259d2a1f15d5f9785290397267 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Sep 2007 12:01:34 +0200 Subject: [NET]: Make /proc/net per network namespace This patch makes /proc/net per network namespace. It modifies the global variables proc_net and proc_net_stat to be per network namespace. The proc_net file helpers are modified to take a network namespace argument, and all of their callers are fixed to pass &init_net for that argument. This ensures that all of the /proc/net files are only visible and usable in the initial network namespace until the code behind them has been updated to be handle multiple network namespaces. Making /proc/net per namespace is necessary as at least some files in /proc/net depend upon the set of network devices which is per network namespace, and even more files in /proc/net have contents that are relevant to a single network namespace. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/route.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c7ca94b..efd2a92 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -3011,15 +3012,15 @@ int __init ip_rt_init(void) #ifdef CONFIG_PROC_FS { struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ - if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || + if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) || !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, - proc_net_stat))) { + init_net.proc_net_stat))) { return -ENOMEM; } rtstat_pde->proc_fops = &rt_cpu_seq_fops; } #ifdef CONFIG_NET_CLS_ROUTE - create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); + create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL); #endif #endif #ifdef CONFIG_XFRM -- cgit v1.1 From 881d966b48b035ab3f3aeaae0f3d3f9b584f45b2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 17 Sep 2007 11:56:21 -0700 Subject: [NET]: Make the device list and device lookups per namespace. This patch makes most of the generic device layer network namespace safe. This patch makes dev_base_head a network namespace variable, and then it picks up a few associated variables. The functions: dev_getbyhwaddr dev_getfirsthwbytype dev_get_by_flags dev_get_by_name __dev_get_by_name dev_get_by_index __dev_get_by_index dev_ioctl dev_ethtool dev_load wireless_process_ioctl were modified to take a network namespace argument, and deal with it. vlan_ioctl_set and brioctl_set were modified so their hooks will receive a network namespace argument. So basically anthing in the core of the network stack that was affected to by the change of dev_base was modified to handle multiple network namespaces. The rest of the network stack was simply modified to explicitly use &init_net the initial network namespace. This can be fixed when those components of the network stack are modified to handle multiple network namespaces. For now the ifindex generator is left global. Fundametally ifindex numbers are per namespace, or else we will have corner case problems with migration when we get that far. At the same time there are assumptions in the network stack that the ifindex of a network device won't change. Making the ifindex number global seems a good compromise until the network stack can cope with ifindex changes when you change namespaces, and the like. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index efd2a92..396c631 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2213,7 +2213,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) if (oldflp->oif) { - dev_out = dev_get_by_index(oldflp->oif); + dev_out = dev_get_by_index(&init_net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; @@ -2592,7 +2592,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (iif) { struct net_device *dev; - dev = __dev_get_by_index(iif); + dev = __dev_get_by_index(&init_net, iif); if (dev == NULL) { err = -ENODEV; goto errout_free; -- cgit v1.1 From 39c90ece7565f5c47110c2fa77409d7a9478bd5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Sep 2007 10:55:54 -0700 Subject: [IPV4]: Convert rt_check_expire() from softirq processing to workqueue. On loaded/big hosts, rt_check_expire() if of litle use, because it generally breaks out of its main loop because of a jiffies change. It can take a long time (read : timer invocations) to actually scan the whole hash table, freeing unused entries. Converting it to use a workqueue instead of softirq is a nice move because we can allow rt_check_expire() to do the scan it is supposed to do, without hogging the CPU. This has an impact on the average number of entries in cache, reducing ram usage. Cache is more responsive to parameter changes (/proc/sys/net/ipv4/route/gc_timeout and /proc/sys/net/ipv4/route/gc_interval) Note: Maybe the default value of gc_interval (60 seconds) is too high, since this means we actually need 5 (300/60) invocations to scan the whole table. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 396c631..006d605 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -136,7 +137,8 @@ static unsigned long rt_deadline; #define RTprint(a...) printk(KERN_DEBUG a) static struct timer_list rt_flush_timer; -static struct timer_list rt_periodic_timer; +static void rt_check_expire(struct work_struct *work); +static DECLARE_DELAYED_WORK(expires_work, rt_check_expire); static struct timer_list rt_secret_timer; /* @@ -572,20 +574,19 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) (fl1->iif ^ fl2->iif)) == 0; } -/* This runs via a timer and thus is always in BH context. */ -static void rt_check_expire(unsigned long dummy) +static void rt_check_expire(struct work_struct *work) { static unsigned int rover; unsigned int i = rover, goal; struct rtable *rth, **rthp; - unsigned long now = jiffies; u64 mult; mult = ((u64)ip_rt_gc_interval) << rt_hash_log; if (ip_rt_gc_timeout > 1) do_div(mult, ip_rt_gc_timeout); goal = (unsigned int)mult; - if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + if (goal > rt_hash_mask) + goal = rt_hash_mask + 1; for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; @@ -594,11 +595,11 @@ static void rt_check_expire(unsigned long dummy) if (*rthp == 0) continue; - spin_lock(rt_hash_lock_addr(i)); + spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ - if (time_before_eq(now, rth->u.dst.expires)) { + if (time_before_eq(jiffies, rth->u.dst.expires)) { tmo >>= 1; rthp = &rth->u.dst.rt_next; continue; @@ -613,14 +614,10 @@ static void rt_check_expire(unsigned long dummy) *rthp = rth->u.dst.rt_next; rt_free(rth); } - spin_unlock(rt_hash_lock_addr(i)); - - /* Fallback loop breaker. */ - if (time_after(jiffies, now)) - break; + spin_unlock_bh(rt_hash_lock_addr(i)); } rover = i; - mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); + schedule_delayed_work(&expires_work, ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter @@ -2993,17 +2990,14 @@ int __init ip_rt_init(void) init_timer(&rt_flush_timer); rt_flush_timer.function = rt_run_flush; - init_timer(&rt_periodic_timer); - rt_periodic_timer.function = rt_check_expire; init_timer(&rt_secret_timer); rt_secret_timer.function = rt_secret_rebuild; /* All the timers, started at system startup tend to synchronize. Perturb it a bit. */ - rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval + - ip_rt_gc_interval; - add_timer(&rt_periodic_timer); + schedule_delayed_work(&expires_work, + net_random() % ip_rt_gc_interval + ip_rt_gc_interval); rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + ip_rt_secret_interval; -- cgit v1.1 From de3cb747ffac5f2a4a6bb156e7e2fd5229e688e5 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 25 Sep 2007 19:16:28 -0700 Subject: [NET]: Dynamically allocate the loopback device, part 1. This patch replaces all occurences to the static variable loopback_dev to a pointer loopback_dev. That provides the mindless, trivial, uninteressting change part for the dynamic allocation for the loopback. Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Acked-By: Kirill Korotaev Acked-by: Benjamin Thery Signed-off-by: David S. Miller --- net/ipv4/route.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 006d605..ca2878d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1402,8 +1402,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != &loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(&loopback_dev); + if (dev != loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = in_dev_get(loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1555,7 +1555,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &loopback_dev; + rth->u.dst.dev = loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; @@ -1812,7 +1812,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, - loopback_dev.ifindex, + loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1879,7 +1879,7 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &loopback_dev; + rth->u.dst.dev = loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; @@ -2149,7 +2149,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, - .iif = loopback_dev.ifindex, + .iif = loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; @@ -2243,9 +2243,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); - dev_out = &loopback_dev; + dev_out = loopback_dev; dev_hold(dev_out); - fl.oif = loopback_dev.ifindex; + fl.oif = loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -2290,7 +2290,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); - dev_out = &loopback_dev; + dev_out = loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) -- cgit v1.1 From 2774c7aba6c97a2535be3309a2209770953780b3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 26 Sep 2007 22:10:56 -0700 Subject: [NET]: Make the loopback device per network namespace. This patch makes loopback_dev per network namespace. Adding code to create a different loopback device for each network namespace and adding the code to free a loopback device when a network namespace exits. This patch modifies all users the loopback_dev so they access it as init_net.loopback_dev, keeping all of the code compiling and working. A later pass will be needed to update the users to use something other than the initial network namespace. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/route.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ca2878d..2a9b363 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1402,8 +1402,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(loopback_dev); + if (dev != init_net.loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1555,7 +1555,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = loopback_dev; + rth->u.dst.dev = init_net.loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; @@ -1812,7 +1812,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, - loopback_dev->ifindex, + init_net.loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1879,7 +1879,7 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = loopback_dev; + rth->u.dst.dev = init_net.loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; @@ -2149,7 +2149,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, - .iif = loopback_dev->ifindex, + .iif = init_net.loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; @@ -2243,9 +2243,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); - dev_out = loopback_dev; + dev_out = init_net.loopback_dev; dev_hold(dev_out); - fl.oif = loopback_dev->ifindex; + fl.oif = init_net.loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -2290,7 +2290,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); - dev_out = loopback_dev; + dev_out = init_net.loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) -- cgit v1.1 From cfcabdcc2d5a810208e5bb3974121b7ed60119aa Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 9 Oct 2007 01:59:42 -0700 Subject: [NET]: sparse warning fixes Fix a bunch of sparse warnings. Mostly about 0 used as NULL pointer, and shadowed variable declarations. One notable case was that hash size should have been unsigned. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2a9b363..307e1f1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -246,7 +246,7 @@ static spinlock_t *rt_hash_locks; static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; -static int rt_hash_log; +static unsigned int rt_hash_log; static unsigned int rt_hash_rnd; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); @@ -593,7 +593,7 @@ static void rt_check_expire(struct work_struct *work) i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - if (*rthp == 0) + if (*rthp == NULL) continue; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { -- cgit v1.1 From cf7732e4cc14b56d593ff53352673e1fd5e3ba52 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 10 Oct 2007 02:29:29 -0700 Subject: [NET]: Make core networking code use seq_open_private This concerns the ipv4 and ipv6 code mostly, but also the netlink and unix sockets. The netlink code is an example of how to use the __seq_open_private() call - it saves the net namespace on this private. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/route.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'net/ipv4/route.c') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 307e1f1..21b12de 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -375,23 +375,8 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct rt_cache_iter_state *s; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - goto out; - rc = seq_open(file, &rt_cache_seq_ops); - if (rc) - goto out_kfree; - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &rt_cache_seq_ops, + sizeof(struct rt_cache_iter_state)); } static const struct file_operations rt_cache_seq_fops = { -- cgit v1.1