summaryrefslogtreecommitdiffstats
path: root/net/ipv4/netfilter/ip_tables.c
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2005-12-13 23:13:48 -0800
committerDavid S. Miller <davem@sunset.davemloft.net>2006-01-03 13:10:29 -0800
commit318360646941d6f3d4c6e4ee99107392728a4079 (patch)
tree26ab4ddc68f917dd4e8813ace504956620eba3a8 /net/ipv4/netfilter/ip_tables.c
parentdf3271f3361b61ce02da0026b4a53e63bc2720cb (diff)
downloadop-kernel-dev-318360646941d6f3d4c6e4ee99107392728a4079.zip
op-kernel-dev-318360646941d6f3d4c6e4ee99107392728a4079.tar.gz
[NETFILTER] ip_tables: NUMA-aware allocation
Part of a performance problem with ip_tables is that memory allocation is not NUMA aware, but 'only' SMP aware (ie each CPU normally touch separate cache lines) Even with small iptables rules, the cost of this misplacement can be high on common workloads. Instead of using one vmalloc() area (located in the node of the iptables process), we now allocate an area for each possible CPU, using vmalloc_node() so that memory should be allocated in the CPU's node if possible. Port to arp_tables and ip6_tables by Harald Welte. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/netfilter/ip_tables.c')
-rw-r--r--net/ipv4/netfilter/ip_tables.c199
1 files changed, 133 insertions, 66 deletions
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 45886c8..2a26d16 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex);
context stops packets coming through and allows user context to read
the counters or update the rules.
- To be cache friendly on SMP, we arrange them like so:
- [ n-entries ]
- ... cache-align padding ...
- [ n-entries ]
-
Hence the start of any table is given by get_table() below. */
/* The table itself */
@@ -105,20 +100,15 @@ struct ipt_table_info
unsigned int underflow[NF_IP_NUMHOOKS];
/* ipt_entry tables: one per CPU */
- char entries[0] ____cacheline_aligned;
+ void *entries[NR_CPUS];
};
static LIST_HEAD(ipt_target);
static LIST_HEAD(ipt_match);
static LIST_HEAD(ipt_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
#if 0
#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb,
read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)table->private->entries
- + TABLE_OFFSET(table->private, smp_processor_id());
+ table_base = (void *)table->private->entries[smp_processor_id()];
e = get_entry(table_base, table->private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
@@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip)
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
-mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ipt_table_info *newinfo,
+ unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
struct ipt_entry *e
- = (struct ipt_entry *)(newinfo->entries + pos);
+ = (struct ipt_entry *)(entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
goto next;
e = (struct ipt_entry *)
- (newinfo->entries + pos);
+ (entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
e = (struct ipt_entry *)
- (newinfo->entries + pos + size);
+ (entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
} else {
@@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
newpos = pos + e->next_offset;
}
e = (struct ipt_entry *)
- (newinfo->entries + newpos);
+ (entry0 + newpos);
e->counters.pcnt = pos;
pos = newpos;
}
@@ -855,6 +845,7 @@ static int
translate_table(const char *name,
unsigned int valid_hooks,
struct ipt_table_info *newinfo,
+ void *entry0,
unsigned int size,
unsigned int number,
const unsigned int *hook_entries,
@@ -875,11 +866,11 @@ translate_table(const char *name,
duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
- ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry_size_and_hooks,
newinfo,
- newinfo->entries,
- newinfo->entries + size,
+ entry0,
+ entry0 + size,
hook_entries, underflows, &i);
if (ret != 0)
return ret;
@@ -907,27 +898,24 @@ translate_table(const char *name,
}
}
- if (!mark_source_chains(newinfo, valid_hooks))
+ if (!mark_source_chains(newinfo, valid_hooks, entry0))
return -ELOOP;
/* Finally, each sanity check must pass */
i = 0;
- ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
check_entry, name, size, &i);
if (ret != 0) {
- IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ IPT_ENTRY_ITERATE(entry0, newinfo->size,
cleanup_entry, &i);
return ret;
}
/* And one copy for every other CPU */
for_each_cpu(i) {
- if (i == 0)
- continue;
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
- newinfo->entries,
- SMP_ALIGN(newinfo->size));
+ if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+ memcpy(newinfo->entries[i], entry0, newinfo->size);
}
return ret;
@@ -943,15 +931,12 @@ replace_table(struct ipt_table *table,
#ifdef CONFIG_NETFILTER_DEBUG
{
- struct ipt_entry *table_base;
- unsigned int i;
+ int cpu;
- for_each_cpu(i) {
- table_base =
- (void *)newinfo->entries
- + TABLE_OFFSET(newinfo, i);
-
- table_base->comefrom = 0xdead57ac;
+ for_each_cpu(cpu) {
+ struct ipt_entry *table_base = newinfo->entries[cpu];
+ if (table_base)
+ table_base->comefrom = 0xdead57ac;
}
}
#endif
@@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e,
return 0;
}
+static inline int
+set_entry_to_counter(const struct ipt_entry *e,
+ struct ipt_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+ (*i)++;
+ return 0;
+}
+
static void
get_counters(const struct ipt_table_info *t,
struct ipt_counters counters[])
{
unsigned int cpu;
unsigned int i;
+ unsigned int curcpu;
+
+ /* Instead of clearing (by a previous call to memset())
+ * the counters and using adds, we set the counters
+ * with data used by 'current' CPU
+ * We dont care about preemption here.
+ */
+ curcpu = raw_smp_processor_id();
+
+ i = 0;
+ IPT_ENTRY_ITERATE(t->entries[curcpu],
+ t->size,
+ set_entry_to_counter,
+ counters,
+ &i);
for_each_cpu(cpu) {
+ if (cpu == curcpu)
+ continue;
i = 0;
- IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
@@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size,
struct ipt_entry *e;
struct ipt_counters *counters;
int ret = 0;
+ void *loc_cpu_entry;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct ipt_counters) * table->private->number;
- counters = vmalloc(countersize);
+ counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
return -ENOMEM;
/* First, sum counters... */
- memset(counters, 0, countersize);
write_lock_bh(&table->lock);
get_counters(table->private, counters);
write_unlock_bh(&table->lock);
- /* ... then copy entire thing from CPU 0... */
- if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+ */
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ /* ... then copy entire thing ... */
+ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
}
@@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size,
struct ipt_entry_match *m;
struct ipt_entry_target *t;
- e = (struct ipt_entry *)(table->private->entries + off);
+ e = (struct ipt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
+ offsetof(struct ipt_entry, counters),
&counters[num],
@@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries,
return ret;
}
+static void free_table_info(struct ipt_table_info *info)
+{
+ int cpu;
+ for_each_cpu(cpu) {
+ if (info->size <= PAGE_SIZE)
+ kfree(info->entries[cpu]);
+ else
+ vfree(info->entries[cpu]);
+ }
+ kfree(info);
+}
+
+static struct ipt_table_info *alloc_table_info(unsigned int size)
+{
+ struct ipt_table_info *newinfo;
+ int cpu;
+
+ newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
+ if (!newinfo)
+ return NULL;
+
+ newinfo->size = size;
+
+ for_each_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+ GFP_KERNEL,
+ cpu_to_node(cpu));
+ else
+ newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu));
+ if (newinfo->entries[cpu] == 0) {
+ free_table_info(newinfo);
+ return NULL;
+ }
+ }
+
+ return newinfo;
+}
+
static int
do_replace(void __user *user, unsigned int len)
{
@@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len)
struct ipt_table *t;
struct ipt_table_info *newinfo, *oldinfo;
struct ipt_counters *counters;
+ void *loc_cpu_entry, *loc_cpu_old_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len)
if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
return -ENOMEM;
- newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(tmp.size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
- if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ /* choose the copy that is our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len)
ret = -ENOMEM;
goto free_newinfo;
}
- memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, tmp.size, tmp.num_entries,
+ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
tmp.hook_entry, tmp.underflow);
if (ret != 0)
goto free_newinfo_counters;
@@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len)
/* Get the old counters. */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
- vfree(oldinfo);
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ free_table_info(oldinfo);
if (copy_to_user(tmp.counters, counters,
sizeof(struct ipt_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len)
module_put(t->me);
up(&ipt_mutex);
free_newinfo_counters_untrans:
- IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
free_newinfo_counters:
vfree(counters);
free_newinfo:
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len)
struct ipt_counters_info tmp, *paddc;
struct ipt_table *t;
int ret = 0;
+ void *loc_cpu_entry;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len)
if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
return -EINVAL;
- paddc = vmalloc(len);
+ paddc = vmalloc_node(len, numa_node_id());
if (!paddc)
return -ENOMEM;
@@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len)
}
i = 0;
- IPT_ENTRY_ITERATE(t->private->entries,
+ /* Choose the copy that is on our node */
+ loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_entry,
t->private->size,
add_counter_to_entry,
paddc->counters,
@@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
struct ipt_table_info *newinfo;
static struct ipt_table_info bootstrap
= { 0, 0, 0, { 0 }, { 0 }, { } };
+ void *loc_cpu_entry;
- newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(repl->size) *
- (highest_possible_processor_id()+1));
+ newinfo = alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
- memcpy(newinfo->entries, repl->entries, repl->size);
+ /* choose the copy on our node/cpu
+ * but dont care of preemption
+ */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(table->name, table->valid_hooks,
- newinfo, repl->size,
+ newinfo, loc_cpu_entry, repl->size,
repl->num_entries,
repl->hook_entry,
repl->underflow);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
ret = down_interruptible(&ipt_mutex);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
return ret;
free_unlock:
- vfree(newinfo);
+ free_table_info(newinfo);
goto unlock;
}
void ipt_unregister_table(struct ipt_table *table)
{
+ void *loc_cpu_entry;
+
down(&ipt_mutex);
LIST_DELETE(&ipt_tables, table);
up(&ipt_mutex);
/* Decrease module usage counts and free resources */
- IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
cleanup_entry, NULL);
- vfree(table->private);
+ free_table_info(table->private);
}
/* Returns 1 if the port is matched by the range, 0 otherwise */
OpenPOWER on IntegriCloud