diff options
Diffstat (limited to 'net/ipv4')
29 files changed, 703 insertions, 567 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index bf147f8..a9d84f9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1248,11 +1248,6 @@ module_init(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS -#ifdef CONFIG_IP_FIB_TRIE -extern int fib_stat_proc_init(void); -extern void fib_stat_proc_exit(void); -#endif - static int __init ipv4_proc_init(void) { int rc = 0; @@ -1265,19 +1260,11 @@ static int __init ipv4_proc_init(void) goto out_udp; if (fib_proc_init()) goto out_fib; -#ifdef CONFIG_IP_FIB_TRIE - if (fib_stat_proc_init()) - goto out_fib_stat; -#endif if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: -#ifdef CONFIG_IP_FIB_TRIE - fib_stat_proc_exit(); -out_fib_stat: -#endif fib_proc_exit(); out_fib: udp4_proc_exit(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 514c85b..035ad2c 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -263,10 +263,8 @@ static int ah_init_state(struct xfrm_state *x) error: if (ahp) { - if (ahp->work_icv) - kfree(ahp->work_icv); - if (ahp->tfm) - crypto_free_tfm(ahp->tfm); + kfree(ahp->work_icv); + crypto_free_tfm(ahp->tfm); kfree(ahp); } return -EINVAL; @@ -279,14 +277,10 @@ static void ah_destroy(struct xfrm_state *x) if (!ahp) return; - if (ahp->work_icv) { - kfree(ahp->work_icv); - ahp->work_icv = NULL; - } - if (ahp->tfm) { - crypto_free_tfm(ahp->tfm); - ahp->tfm = NULL; - } + kfree(ahp->work_icv); + ahp->work_icv = NULL; + crypto_free_tfm(ahp->tfm); + ahp->tfm = NULL; kfree(ahp); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b31ffc5..1b5a09d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -343,22 +343,14 @@ static void esp_destroy(struct xfrm_state *x) if (!esp) return; - if (esp->conf.tfm) { - crypto_free_tfm(esp->conf.tfm); - esp->conf.tfm = NULL; - } - if (esp->conf.ivec) { - kfree(esp->conf.ivec); - esp->conf.ivec = NULL; - } - if (esp->auth.tfm) { - crypto_free_tfm(esp->auth.tfm); - esp->auth.tfm = NULL; - } - if (esp->auth.work_icv) { - kfree(esp->auth.work_icv); - esp->auth.work_icv = NULL; - } + crypto_free_tfm(esp->conf.tfm); + esp->conf.tfm = NULL; + kfree(esp->conf.ivec); + esp->conf.ivec = NULL; + crypto_free_tfm(esp->auth.tfm); + esp->auth.tfm = NULL; + kfree(esp->auth.work_icv); + esp->auth.work_icv = NULL; kfree(esp); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b2dea4e..1b63b48 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.402" +#define VERSION "0.403" #include <linux/config.h> #include <asm/uaccess.h> @@ -164,7 +164,6 @@ static struct node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); -static void trie_dump_seq(struct seq_file *seq, struct trie *t); static kmem_cache_t *fn_alias_kmem __read_mostly; static struct trie *trie_local = NULL, *trie_main = NULL; @@ -1971,558 +1970,525 @@ struct fib_table * __init fib_hash_init(int id) return tb; } -/* Trie dump functions */ +#ifdef CONFIG_PROC_FS +/* Depth first Trie walk iterator */ +struct fib_trie_iter { + struct tnode *tnode; + struct trie *trie; + unsigned index; + unsigned depth; +}; -static void putspace_seq(struct seq_file *seq, int n) +static struct node *fib_trie_get_next(struct fib_trie_iter *iter) { - while (n--) - seq_printf(seq, " "); -} + struct tnode *tn = iter->tnode; + unsigned cindex = iter->index; + struct tnode *p; -static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) -{ - while (bits--) - seq_printf(seq, "%s", (v & (1<<bits))?"1":"0"); -} + pr_debug("get_next iter={node=%p index=%d depth=%d}\n", + iter->tnode, iter->index, iter->depth); +rescan: + while (cindex < (1<<tn->bits)) { + struct node *n = tnode_get_child(tn, cindex); -static void printnode_seq(struct seq_file *seq, int indent, struct node *n, - int pend, int cindex, int bits) -{ - putspace_seq(seq, indent); - if (IS_LEAF(n)) - seq_printf(seq, "|"); - else - seq_printf(seq, "+"); - if (bits) { - seq_printf(seq, "%d/", cindex); - printbin_seq(seq, cindex, bits); - seq_printf(seq, ": "); - } else - seq_printf(seq, "<root>: "); - seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); + if (n) { + if (IS_LEAF(n)) { + iter->tnode = tn; + iter->index = cindex + 1; + } else { + /* push down one level */ + iter->tnode = (struct tnode *) n; + iter->index = 0; + ++iter->depth; + } + return n; + } - if (IS_LEAF(n)) { - struct leaf *l = (struct leaf *)n; - struct fib_alias *fa; - int i; + ++cindex; + } - seq_printf(seq, "key=%d.%d.%d.%d\n", - n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); - - for (i = 32; i >= 0; i--) - if (find_leaf_info(&l->list, i)) { - struct list_head *fa_head = get_fa_head(l, i); - - if (!fa_head) - continue; - - if (list_empty(fa_head)) - continue; - - putspace_seq(seq, indent+2); - seq_printf(seq, "{/%d...dumping}\n", i); - - list_for_each_entry_rcu(fa, fa_head, fa_list) { - putspace_seq(seq, indent+2); - if (fa->fa_info == NULL) { - seq_printf(seq, "Error fa_info=NULL\n"); - continue; - } - if (fa->fa_info->fib_nh == NULL) { - seq_printf(seq, "Error _fib_nh=NULL\n"); - continue; - } - - seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", - fa->fa_type, - fa->fa_scope, - fa->fa_tos); - } - } - } else { - struct tnode *tn = (struct tnode *)n; - int plen = ((struct tnode *)n)->pos; - t_key prf = MASK_PFX(n->key, plen); - - seq_printf(seq, "key=%d.%d.%d.%d/%d\n", - prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); - - putspace_seq(seq, indent); seq_printf(seq, "| "); - seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos)); - printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); - seq_printf(seq, "}\n"); - putspace_seq(seq, indent); seq_printf(seq, "| "); - seq_printf(seq, "{pos=%d", tn->pos); - seq_printf(seq, " (skip=%d bits)", tn->pos - pend); - seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits)); - putspace_seq(seq, indent); seq_printf(seq, "| "); - seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children); + /* Current node exhausted, pop back up */ + p = NODE_PARENT(tn); + if (p) { + cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; + tn = p; + --iter->depth; + goto rescan; } + + /* got root? */ + return NULL; } -static void trie_dump_seq(struct seq_file *seq, struct trie *t) +static struct node *fib_trie_get_first(struct fib_trie_iter *iter, + struct trie *t) { - struct node *n; - int cindex = 0; - int indent = 1; - int pend = 0; - int depth = 0; - struct tnode *tn; - - rcu_read_lock(); - n = rcu_dereference(t->trie); - seq_printf(seq, "------ trie_dump of t=%p ------\n", t); + struct node *n = rcu_dereference(t->trie); - if (!n) { - seq_printf(seq, "------ trie is empty\n"); - - rcu_read_unlock(); - return; + if (n && IS_TNODE(n)) { + iter->tnode = (struct tnode *) n; + iter->trie = t; + iter->index = 0; + iter->depth = 0; + return n; } + return NULL; +} - printnode_seq(seq, indent, n, pend, cindex, 0); - - if (!IS_TNODE(n)) { - rcu_read_unlock(); - return; - } - - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); - indent += 3; - depth++; - - while (tn && cindex < (1 << tn->bits)) { - struct node *child = rcu_dereference(tn->child[cindex]); - if (!child) - cindex++; - else { - /* Got a child */ - printnode_seq(seq, indent, child, pend, - cindex, tn->bits); - - if (IS_LEAF(child)) - cindex++; - - else { - /* - * New tnode. Decend one level - */ - - depth++; - n = child; - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - putspace_seq(seq, indent); - seq_printf(seq, "\\--\n"); - indent += 3; - cindex = 0; - } - } - - /* - * Test if we are done - */ - - while (cindex >= (1 << tn->bits)) { - /* - * Move upwards and test for root - * pop off all traversed nodes - */ +static void trie_collect_stats(struct trie *t, struct trie_stat *s) +{ + struct node *n; + struct fib_trie_iter iter; - if (NODE_PARENT(tn) == NULL) { - tn = NULL; - break; - } + memset(s, 0, sizeof(*s)); - cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); - cindex++; - tn = NODE_PARENT(tn); - pend = tn->pos + tn->bits; - indent -= 3; - depth--; + rcu_read_lock(); + for (n = fib_trie_get_first(&iter, t); n; + n = fib_trie_get_next(&iter)) { + if (IS_LEAF(n)) { + s->leaves++; + s->totdepth += iter.depth; + if (iter.depth > s->maxdepth) + s->maxdepth = iter.depth; + } else { + const struct tnode *tn = (const struct tnode *) n; + int i; + + s->tnodes++; + s->nodesizes[tn->bits]++; + for (i = 0; i < (1<<tn->bits); i++) + if (!tn->child[i]) + s->nullpointers++; } } rcu_read_unlock(); } -static struct trie_stat *trie_stat_new(void) +/* + * This outputs /proc/net/fib_triestats + */ +static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { - struct trie_stat *s; - int i; + unsigned i, max, pointers, bytes, avdepth; - s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); - if (!s) - return NULL; + if (stat->leaves) + avdepth = stat->totdepth*100 / stat->leaves; + else + avdepth = 0; - s->totdepth = 0; - s->maxdepth = 0; - s->tnodes = 0; - s->leaves = 0; - s->nullpointers = 0; + seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); + seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); - for (i = 0; i < MAX_CHILDS; i++) - s->nodesizes[i] = 0; + seq_printf(seq, "\tLeaves: %u\n", stat->leaves); - return s; -} + bytes = sizeof(struct leaf) * stat->leaves; + seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes); + bytes += sizeof(struct tnode) * stat->tnodes; -static struct trie_stat *trie_collect_stats(struct trie *t) -{ - struct node *n; - struct trie_stat *s = trie_stat_new(); - int cindex = 0; - int pend = 0; - int depth = 0; + max = MAX_CHILDS-1; + while (max >= 0 && stat->nodesizes[max] == 0) + max--; - if (!s) - return NULL; + pointers = 0; + for (i = 1; i <= max; i++) + if (stat->nodesizes[i] != 0) { + seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); + pointers += (1<<i) * stat->nodesizes[i]; + } + seq_putc(seq, '\n'); + seq_printf(seq, "\tPointers: %d\n", pointers); - rcu_read_lock(); - n = rcu_dereference(t->trie); + bytes += sizeof(struct node *) * pointers; + seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers); + seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024); - if (!n) - return s; +#ifdef CONFIG_IP_FIB_TRIE_STATS + seq_printf(seq, "Counters:\n---------\n"); + seq_printf(seq,"gets = %d\n", t->stats.gets); + seq_printf(seq,"backtracks = %d\n", t->stats.backtrack); + seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); + seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); + seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); + seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); +#ifdef CLEAR_STATS + memset(&(t->stats), 0, sizeof(t->stats)); +#endif +#endif /* CONFIG_IP_FIB_TRIE_STATS */ +} - if (IS_TNODE(n)) { - struct tnode *tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - s->nodesizes[tn->bits]++; - depth++; - - while (tn && cindex < (1 << tn->bits)) { - struct node *ch = rcu_dereference(tn->child[cindex]); - if (ch) { - - /* Got a child */ - - if (IS_LEAF(tn->child[cindex])) { - cindex++; - - /* stats */ - if (depth > s->maxdepth) - s->maxdepth = depth; - s->totdepth += depth; - s->leaves++; - } else { - /* - * New tnode. Decend one level - */ - - s->tnodes++; - s->nodesizes[tn->bits]++; - depth++; - - n = ch; - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - - cindex = 0; - } - } else { - cindex++; - s->nullpointers++; - } +static int fib_triestat_seq_show(struct seq_file *seq, void *v) +{ + struct trie_stat *stat; - /* - * Test if we are done - */ + stat = kmalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) + return -ENOMEM; - while (cindex >= (1 << tn->bits)) { - /* - * Move upwards and test for root - * pop off all traversed nodes - */ + seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", + sizeof(struct leaf), sizeof(struct tnode)); - if (NODE_PARENT(tn) == NULL) { - tn = NULL; - n = NULL; - break; - } + if (trie_local) { + seq_printf(seq, "Local:\n"); + trie_collect_stats(trie_local, stat); + trie_show_stats(seq, stat); + } - cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); - tn = NODE_PARENT(tn); - cindex++; - n = (struct node *)tn; - pend = tn->pos+tn->bits; - depth--; - } - } + if (trie_main) { + seq_printf(seq, "Main:\n"); + trie_collect_stats(trie_main, stat); + trie_show_stats(seq, stat); } + kfree(stat); - rcu_read_unlock(); - return s; + return 0; } -#ifdef CONFIG_PROC_FS - -static struct fib_alias *fib_triestat_get_first(struct seq_file *seq) +static int fib_triestat_seq_open(struct inode *inode, struct file *file) { - return NULL; + return single_open(file, fib_triestat_seq_show, NULL); } -static struct fib_alias *fib_triestat_get_next(struct seq_file *seq) +static struct file_operations fib_triestat_fops = { + .owner = THIS_MODULE, + .open = fib_triestat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, + loff_t pos) { + loff_t idx = 0; + struct node *n; + + for (n = fib_trie_get_first(iter, trie_local); + n; ++idx, n = fib_trie_get_next(iter)) { + if (pos == idx) + return n; + } + + for (n = fib_trie_get_first(iter, trie_main); + n; ++idx, n = fib_trie_get_next(iter)) { + if (pos == idx) + return n; + } return NULL; } -static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) +static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) { - if (!ip_fib_main_table) - return NULL; - - if (*pos) - return fib_triestat_get_next(seq); - else + rcu_read_lock(); + if (*pos == 0) return SEQ_START_TOKEN; + return fib_trie_get_idx(seq->private, *pos - 1); } -static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) +static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct fib_trie_iter *iter = seq->private; + void *l = v; + ++*pos; if (v == SEQ_START_TOKEN) - return fib_triestat_get_first(seq); - else - return fib_triestat_get_next(seq); -} + return fib_trie_get_idx(iter, 0); -static void fib_triestat_seq_stop(struct seq_file *seq, void *v) -{ + v = fib_trie_get_next(iter); + BUG_ON(v == l); + if (v) + return v; -} + /* continue scan in next trie */ + if (iter->trie == trie_local) + return fib_trie_get_first(iter, trie_main); -/* - * This outputs /proc/net/fib_triestats - * - * It always works in backward compatibility mode. - * The format of the file is not supposed to be changed. - */ + return NULL; +} -static void collect_and_show(struct trie *t, struct seq_file *seq) +static void fib_trie_seq_stop(struct seq_file *seq, void *v) { - int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ - int i, max, pointers; - struct trie_stat *stat; - int avdepth; - - stat = trie_collect_stats(t); - - bytes = 0; - seq_printf(seq, "trie=%p\n", t); - - if (stat) { - if (stat->leaves) - avdepth = stat->totdepth*100 / stat->leaves; - else - avdepth = 0; - seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100); - seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); + rcu_read_unlock(); +} - seq_printf(seq, "Leaves: %d\n", stat->leaves); - bytes += sizeof(struct leaf) * stat->leaves; - seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); - bytes += sizeof(struct tnode) * stat->tnodes; +static void seq_indent(struct seq_file *seq, int n) +{ + while (n-- > 0) seq_puts(seq, " "); +} - max = MAX_CHILDS-1; +static inline const char *rtn_scope(enum rt_scope_t s) +{ + static char buf[32]; - while (max >= 0 && stat->nodesizes[max] == 0) - max--; - pointers = 0; + switch(s) { + case RT_SCOPE_UNIVERSE: return "universe"; + case RT_SCOPE_SITE: return "site"; + case RT_SCOPE_LINK: return "link"; + case RT_SCOPE_HOST: return "host"; + case RT_SCOPE_NOWHERE: return "nowhere"; + default: + snprintf(buf, sizeof(buf), "scope=%d", s); + return buf; + } +} - for (i = 1; i <= max; i++) - if (stat->nodesizes[i] != 0) { - seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); - pointers += (1<<i) * stat->nodesizes[i]; - } - seq_printf(seq, "\n"); - seq_printf(seq, "Pointers: %d\n", pointers); - bytes += sizeof(struct node *) * pointers; - seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers); - seq_printf(seq, "Total size: %d kB\n", bytes / 1024); +static const char *rtn_type_names[__RTN_MAX] = { + [RTN_UNSPEC] = "UNSPEC", + [RTN_UNICAST] = "UNICAST", + [RTN_LOCAL] = "LOCAL", + [RTN_BROADCAST] = "BROADCAST", + [RTN_ANYCAST] = "ANYCAST", + [RTN_MULTICAST] = "MULTICAST", + [RTN_BLACKHOLE] = "BLACKHOLE", + [RTN_UNREACHABLE] = "UNREACHABLE", + [RTN_PROHIBIT] = "PROHIBIT", + [RTN_THROW] = "THROW", + [RTN_NAT] = "NAT", + [RTN_XRESOLVE] = "XRESOLVE", +}; - kfree(stat); - } +static inline const char *rtn_type(unsigned t) +{ + static char buf[32]; -#ifdef CONFIG_IP_FIB_TRIE_STATS - seq_printf(seq, "Counters:\n---------\n"); - seq_printf(seq,"gets = %d\n", t->stats.gets); - seq_printf(seq,"backtracks = %d\n", t->stats.backtrack); - seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); - seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); - seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); - seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); -#ifdef CLEAR_STATS - memset(&(t->stats), 0, sizeof(t->stats)); -#endif -#endif /* CONFIG_IP_FIB_TRIE_STATS */ + if (t < __RTN_MAX && rtn_type_names[t]) + return rtn_type_names[t]; + snprintf(buf, sizeof(buf), "type %d", t); + return buf; } -static int fib_triestat_seq_show(struct seq_file *seq, void *v) +/* Pretty print the trie */ +static int fib_trie_seq_show(struct seq_file *seq, void *v) { - char bf[128]; + const struct fib_trie_iter *iter = seq->private; + struct node *n = v; - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", - sizeof(struct leaf), sizeof(struct tnode)); - if (trie_local) - collect_and_show(trie_local, seq); + if (v == SEQ_START_TOKEN) + return 0; - if (trie_main) - collect_and_show(trie_main, seq); - } else { - snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); + if (IS_TNODE(n)) { + struct tnode *tn = (struct tnode *) n; + t_key prf = ntohl(MASK_PFX(tn->key, tn->pos)); - seq_printf(seq, "%-127s\n", bf); + if (!NODE_PARENT(n)) { + if (iter->trie == trie_local) + seq_puts(seq, "<local>:\n"); + else + seq_puts(seq, "<main>:\n"); + } else { + seq_indent(seq, iter->depth-1); + seq_printf(seq, " +-- %d.%d.%d.%d/%d\n", + NIPQUAD(prf), tn->pos); + } + } else { + struct leaf *l = (struct leaf *) n; + int i; + u32 val = ntohl(l->key); + + seq_indent(seq, iter->depth); + seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val)); + for (i = 32; i >= 0; i--) { + struct leaf_info *li = find_leaf_info(&l->list, i); + if (li) { + struct fib_alias *fa; + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + seq_indent(seq, iter->depth+1); + seq_printf(seq, " /%d %s %s", i, + rtn_scope(fa->fa_scope), + rtn_type(fa->fa_type)); + if (fa->fa_tos) + seq_printf(seq, "tos =%d\n", + fa->fa_tos); + seq_putc(seq, '\n'); + } + } + } } + return 0; } -static struct seq_operations fib_triestat_seq_ops = { - .start = fib_triestat_seq_start, - .next = fib_triestat_seq_next, - .stop = fib_triestat_seq_stop, - .show = fib_triestat_seq_show, +static struct seq_operations fib_trie_seq_ops = { + .start = fib_trie_seq_start, + .next = fib_trie_seq_next, + .stop = fib_trie_seq_stop, + .show = fib_trie_seq_show, }; -static int fib_triestat_seq_open(struct inode *inode, struct file *file) +static int fib_trie_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; int rc = -ENOMEM; + struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - rc = seq_open(file, &fib_triestat_seq_ops); + if (!s) + goto out; + + rc = seq_open(file, &fib_trie_seq_ops); if (rc) goto out_kfree; - seq = file->private_data; + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); out: return rc; out_kfree: + kfree(s); goto out; } -static struct file_operations fib_triestat_seq_fops = { - .owner = THIS_MODULE, - .open = fib_triestat_seq_open, - .read = seq_read, - .llseek = seq_lseek, +static struct file_operations fib_trie_fops = { + .owner = THIS_MODULE, + .open = fib_trie_seq_open, + .read = seq_read, + .llseek = seq_lseek, .release = seq_release_private, }; -int __init fib_stat_proc_init(void) -{ - if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops)) - return -ENOMEM; - return 0; -} - -void __init fib_stat_proc_exit(void) +static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi) { - proc_net_remove("fib_triestat"); -} + static unsigned type2flags[RTN_MAX + 1] = { + [7] = RTF_REJECT, [8] = RTF_REJECT, + }; + unsigned flags = type2flags[type]; -static struct fib_alias *fib_trie_get_first(struct seq_file *seq) -{ - return NULL; + if (fi && fi->fib_nh->nh_gw) + flags |= RTF_GATEWAY; + if (mask == 0xFFFFFFFF) + flags |= RTF_HOST; + flags |= RTF_UP; + return flags; } -static struct fib_alias *fib_trie_get_next(struct seq_file *seq) +/* + * This outputs /proc/net/route. + * The format of the file is not supposed to be changed + * and needs to be same as fib_hash output to avoid breaking + * legacy utilities + */ +static int fib_route_seq_show(struct seq_file *seq, void *v) { - return NULL; -} + struct leaf *l = v; + int i; + char bf[128]; -static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) -{ - if (!ip_fib_main_table) - return NULL; + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " + "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" + "\tWindow\tIRTT"); + return 0; + } - if (*pos) - return fib_trie_get_next(seq); - else - return SEQ_START_TOKEN; -} + if (IS_TNODE(l)) + return 0; -static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - if (v == SEQ_START_TOKEN) - return fib_trie_get_first(seq); - else - return fib_trie_get_next(seq); + for (i=32; i>=0; i--) { + struct leaf_info *li = find_leaf_info(&l->list, i); + struct fib_alias *fa; + u32 mask, prefix; -} + if (!li) + continue; -static void fib_trie_seq_stop(struct seq_file *seq, void *v) -{ -} + mask = inet_make_mask(li->plen); + prefix = htonl(l->key); -/* - * This outputs /proc/net/fib_trie. - * - * It always works in backward compatibility mode. - * The format of the file is not supposed to be changed. - */ + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + const struct fib_info *fi = rcu_dereference(fa->fa_info); + unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); -static int fib_trie_seq_show(struct seq_file *seq, void *v) -{ - char bf[128]; + if (fa->fa_type == RTN_BROADCAST + || fa->fa_type == RTN_MULTICAST) + continue; - if (v == SEQ_START_TOKEN) { - if (trie_local) - trie_dump_seq(seq, trie_local); + if (fi) + snprintf(bf, sizeof(bf), + "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + fi->fib_dev ? fi->fib_dev->name : "*", + prefix, + fi->fib_nh->nh_gw, flags, 0, 0, + fi->fib_priority, + mask, + (fi->fib_advmss ? fi->fib_advmss + 40 : 0), + fi->fib_window, + fi->fib_rtt >> 3); + else + snprintf(bf, sizeof(bf), + "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + prefix, 0, flags, 0, 0, 0, + mask, 0, 0, 0); - if (trie_main) - trie_dump_seq(seq, trie_main); - } else { - snprintf(bf, sizeof(bf), - "*\t%08X\t%08X", 200, 400); - seq_printf(seq, "%-127s\n", bf); + seq_printf(seq, "%-127s\n", bf); + } } return 0; } -static struct seq_operations fib_trie_seq_ops = { - .start = fib_trie_seq_start, - .next = fib_trie_seq_next, - .stop = fib_trie_seq_stop, - .show = fib_trie_seq_show, +static struct seq_operations fib_route_seq_ops = { + .start = fib_trie_seq_start, + .next = fib_trie_seq_next, + .stop = fib_trie_seq_stop, + .show = fib_route_seq_show, }; -static int fib_trie_seq_open(struct inode *inode, struct file *file) +static int fib_route_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; int rc = -ENOMEM; + struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - rc = seq_open(file, &fib_trie_seq_ops); + if (!s) + goto out; + + rc = seq_open(file, &fib_route_seq_ops); if (rc) goto out_kfree; - seq = file->private_data; + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); out: return rc; out_kfree: + kfree(s); goto out; } -static struct file_operations fib_trie_seq_fops = { - .owner = THIS_MODULE, - .open = fib_trie_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release= seq_release_private, +static struct file_operations fib_route_fops = { + .owner = THIS_MODULE, + .open = fib_route_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, }; int __init fib_proc_init(void) { - if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops)) - return -ENOMEM; + if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops)) + goto out1; + + if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops)) + goto out2; + + if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops)) + goto out3; + return 0; + +out3: + proc_net_remove("fib_triestat"); +out2: + proc_net_remove("fib_trie"); +out1: + return -ENOMEM; } void __init fib_proc_exit(void) { proc_net_remove("fib_trie"); + proc_net_remove("fib_triestat"); + proc_net_remove("route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index f84ba9c..2fc3fd3 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -100,8 +100,7 @@ DEFINE_SPINLOCK(inet_peer_unused_lock); #define PEER_MAX_CLEANUP_WORK 30 static void peer_check_expire(unsigned long dummy); -static struct timer_list peer_periodic_timer = - TIMER_INITIALIZER(peer_check_expire, 0, 0); +static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); /* Exported for sysctl_net_ipv4. */ int inet_peer_gc_mintime = 10 * HZ, diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9e6e683..e7d26d9 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -457,7 +457,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (pskb_pull(skb, ihl) == NULL) goto err; - if (pskb_trim(skb, end-offset)) + if (pskb_trim_rcsum(skb, end-offset)) goto err; /* Find out which fragments are in front and at the back of us diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index dcb7ee6..fc718df 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -345,8 +345,7 @@ static void ipcomp_free_tfms(struct crypto_tfm **tfms) for_each_cpu(cpu) { struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu); - if (tfm) - crypto_free_tfm(tfm); + crypto_free_tfm(tfm); } free_percpu(tfms); } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 63e1066..953129d 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -54,6 +54,7 @@ #include <linux/major.h> #include <linux/root_dev.h> #include <linux/delay.h> +#include <linux/nfs_fs.h> #include <net/arp.h> #include <net/ip.h> #include <net/ipconfig.h> diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index e046f55..30aa8e2 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -34,6 +34,7 @@ config IP_NF_CT_ACCT config IP_NF_CONNTRACK_MARK bool 'Connection mark tracking support' + depends on IP_NF_CONNTRACK help This option enables support for connection marks, used by the `CONNMARK' target and `connmark' match. Similar to the mark value @@ -85,6 +86,25 @@ config IP_NF_IRC To compile it as a module, choose M here. If unsure, say Y. +config IP_NF_NETBIOS_NS + tristate "NetBIOS name service protocol support (EXPERIMENTAL)" + depends on IP_NF_CONNTRACK && EXPERIMENTAL + help + NetBIOS name service requests are sent as broadcast messages from an + unprivileged port and responded to with unicast messages to the + same port. This make them hard to firewall properly because connection + tracking doesn't deal with broadcasts. This helper tracks locally + originating NetBIOS name service requests and the corresponding + responses. It relies on correct IP address configuration, specifically + netmask and broadcast address. When properly configured, the output + of "ip address show" should look similar to this: + + $ ip -4 address show eth0 + 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000 + inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0 + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TFTP tristate "TFTP protocol support" depends on IP_NF_CONNTRACK diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index a7bd38f..1ba0db7 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o +obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o # NAT helpers obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index be4c9eb..dc20881 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -108,6 +108,7 @@ static int help(struct sk_buff **pskb, } exp->expectfn = NULL; + exp->flags = 0; exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; exp->tuple.src.u.tcp.port = 0; diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a064860..19cba16 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -197,7 +197,7 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, /* ip_conntrack_expect helper functions */ -static void unlink_expect(struct ip_conntrack_expect *exp) +void ip_ct_unlink_expect(struct ip_conntrack_expect *exp) { ASSERT_WRITE_LOCK(&ip_conntrack_lock); IP_NF_ASSERT(!timer_pending(&exp->timeout)); @@ -207,18 +207,12 @@ static void unlink_expect(struct ip_conntrack_expect *exp) ip_conntrack_expect_put(exp); } -void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) -{ - unlink_expect(exp); - ip_conntrack_expect_put(exp); -} - static void expectation_timed_out(unsigned long ul_expect) { struct ip_conntrack_expect *exp = (void *)ul_expect; write_lock_bh(&ip_conntrack_lock); - unlink_expect(exp); + ip_ct_unlink_expect(exp); write_unlock_bh(&ip_conntrack_lock); ip_conntrack_expect_put(exp); } @@ -264,10 +258,14 @@ find_expectation(const struct ip_conntrack_tuple *tuple) master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) - && is_confirmed(i->master) - && del_timer(&i->timeout)) { - unlink_expect(i); - return i; + && is_confirmed(i->master)) { + if (i->flags & IP_CT_EXPECT_PERMANENT) { + atomic_inc(&i->use); + return i; + } else if (del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + return i; + } } } return NULL; @@ -284,7 +282,7 @@ void ip_ct_remove_expectations(struct ip_conntrack *ct) list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { if (i->master == ct && del_timer(&i->timeout)) { - unlink_expect(i); + ip_ct_unlink_expect(i); ip_conntrack_expect_put(i); } } @@ -925,7 +923,7 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) /* choose the the oldest expectation to evict */ list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { if (expect_matches(i, exp) && del_timer(&i->timeout)) { - unlink_expect(i); + ip_ct_unlink_expect(i); write_unlock_bh(&ip_conntrack_lock); ip_conntrack_expect_put(i); return; @@ -934,6 +932,9 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) write_unlock_bh(&ip_conntrack_lock); } +/* We don't increase the master conntrack refcount for non-fulfilled + * conntracks. During the conntrack destruction, the expectations are + * always killed before the conntrack itself */ struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me) { struct ip_conntrack_expect *new; @@ -944,17 +945,14 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me) return NULL; } new->master = me; - atomic_inc(&new->master->ct_general.use); atomic_set(&new->use, 1); return new; } void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) { - if (atomic_dec_and_test(&exp->use)) { - ip_conntrack_put(exp->master); + if (atomic_dec_and_test(&exp->use)) kmem_cache_free(ip_conntrack_expect_cachep, exp); - } } static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) @@ -982,7 +980,7 @@ static void evict_oldest_expect(struct ip_conntrack *master) list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { if (i->master == master) { if (del_timer(&i->timeout)) { - unlink_expect(i); + ip_ct_unlink_expect(i); ip_conntrack_expect_put(i); } break; @@ -1099,7 +1097,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) /* Get rid of expectations */ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { if (exp->master->helper == me && del_timer(&exp->timeout)) { - unlink_expect(exp); + ip_ct_unlink_expect(exp); ip_conntrack_expect_put(exp); } } diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 3a2627d..1b79ec3 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -421,6 +421,7 @@ static int help(struct sk_buff **pskb, { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); exp->expectfn = NULL; + exp->flags = 0; /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 25438ee..d7a8a98 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -221,6 +221,7 @@ static int help(struct sk_buff **pskb, { { 0, { 0 } }, { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); exp->expectfn = NULL; + exp->flags = 0; if (ip_nat_irc_hook) ret = ip_nat_irc_hook(pskb, ctinfo, addr_beg_p - ib_ptr, diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c new file mode 100644 index 0000000..bb72466 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c @@ -0,0 +1,147 @@ +/* + * NetBIOS name service broadcast connection tracking helper + * + * (c) 2005 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +/* + * This helper tracks locally originating NetBIOS name service + * requests by issuing permanent expectations (valid until + * timing out) matching all reply connections from the + * destination network. The only NetBIOS specific thing is + * actually the port number. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); + +static unsigned int timeout = 3; +module_param(timeout, int, 0600); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) +{ + struct ip_conntrack_expect *exp; + struct iphdr *iph = (*pskb)->nh.iph; + struct udphdr _uh, *uh; + struct rtable *rt = (struct rtable *)(*pskb)->dst; + struct in_device *in_dev; + u_int32_t mask = 0; + + /* we're only interested in locally generated packets */ + if ((*pskb)->sk == NULL) + goto out; + if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) + goto out; + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + goto out; + + rcu_read_lock(); + in_dev = __in_dev_get(rt->u.dst.dev); + if (in_dev != NULL) { + for_primary_ifa(in_dev) { + if (ifa->ifa_broadcast == iph->daddr) { + mask = ifa->ifa_mask; + break; + } + } endfor_ifa(in_dev); + } + rcu_read_unlock(); + + if (mask == 0) + goto out; + + uh = skb_header_pointer(*pskb, iph->ihl * 4, sizeof(_uh), &_uh); + BUG_ON(uh == NULL); + + exp = ip_conntrack_expect_alloc(ct); + if (exp == NULL) + goto out; + memset(&exp->tuple, 0, sizeof(exp->tuple)); + exp->tuple.src.ip = iph->daddr & mask; + exp->tuple.dst.ip = iph->saddr; + exp->tuple.dst.u.udp.port = uh->source; + exp->tuple.dst.protonum = IPPROTO_UDP; + + memset(&exp->mask, 0, sizeof(exp->mask)); + exp->mask.src.ip = mask; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.udp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + + exp->expectfn = NULL; + exp->flags = IP_CT_EXPECT_PERMANENT; + + ip_conntrack_expect_related(exp); + ip_conntrack_expect_put(exp); + + ip_ct_refresh_acct(ct, ctinfo, NULL, timeout * HZ); +out: + return NF_ACCEPT; +} + +static struct ip_conntrack_helper helper = { + .name = "netbios-ns", + .tuple = { + .src = { + .u = { + .udp = { + .port = __constant_htons(137), + } + } + }, + .dst = { + .protonum = IPPROTO_UDP, + }, + }, + .mask = { + .src = { + .u = { + .udp = { + .port = 0xFFFF, + } + } + }, + .dst = { + .protonum = 0xFF, + }, + }, + .max_expected = 1, + .me = THIS_MODULE, + .help = help, +}; + +static int __init init(void) +{ + helper.timeout = timeout; + return ip_conntrack_helper_register(&helper); +} + +static void __exit fini(void) +{ + ip_conntrack_helper_unregister(&helper); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index a4e9278..15aef35 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1349,8 +1349,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { if (exp->master->helper == h - && del_timer(&exp->timeout)) - __ip_ct_expect_unlink_destroy(exp); + && del_timer(&exp->timeout)) { + ip_ct_unlink_expect(exp); + ip_conntrack_expect_put(exp); + } } write_unlock(&ip_conntrack_lock); } else { @@ -1358,8 +1360,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, write_lock_bh(&ip_conntrack_lock); list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { - if (del_timer(&exp->timeout)) - __ip_ct_expect_unlink_destroy(exp); + if (del_timer(&exp->timeout)) { + ip_ct_unlink_expect(exp); + ip_conntrack_expect_put(exp); + } } write_unlock_bh(&ip_conntrack_lock); } @@ -1413,6 +1417,7 @@ ctnetlink_create_expect(struct nfattr *cda[]) } exp->expectfn = NULL; + exp->flags = 0; exp->master = ct; memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple)); memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple)); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index f23ef1f..1985abc 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -349,6 +349,7 @@ static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, return 0; nfattr_failure: + read_unlock_bh(&tcp_lock); return -1; } #endif diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index ee5895a..ae3e3e6 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -998,7 +998,7 @@ EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_unexpect_related); EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); -EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy); +EXPORT_SYMBOL_GPL(ip_ct_unlink_expect); EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index f8ff170..d2b5905 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -75,6 +75,7 @@ static int tftp_help(struct sk_buff **pskb, exp->mask.dst.u.udp.port = 0xffff; exp->mask.dst.protonum = 0xff; exp->expectfn = NULL; + exp->flags = 0; DEBUGP("expect: "); DUMP_TUPLE(&exp->tuple); diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c index 60d70fa..cb66b8b 100644 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -255,6 +255,27 @@ alloc_null_binding(struct ip_conntrack *conntrack, return ip_nat_setup_info(conntrack, &range, hooknum); } +unsigned int +alloc_null_binding_confirmed(struct ip_conntrack *conntrack, + struct ip_nat_info *info, + unsigned int hooknum) +{ + u_int32_t ip + = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC + ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip + : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); + u_int16_t all + = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC + ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all + : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all); + struct ip_nat_range range + = { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } }; + + DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n", + conntrack, NIPQUAD(ip)); + return ip_nat_setup_info(conntrack, &range, hooknum); +} + int ip_nat_rule_find(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 89db052..0ff368b 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -123,8 +123,12 @@ ip_nat_fn(unsigned int hooknum, if (!ip_nat_initialized(ct, maniptype)) { unsigned int ret; - /* LOCAL_IN hook doesn't have a chain! */ - if (hooknum == NF_IP_LOCAL_IN) + if (unlikely(is_confirmed(ct))) + /* NAT module was loaded late */ + ret = alloc_null_binding_confirmed(ct, info, + hooknum); + else if (hooknum == NF_IP_LOCAL_IN) + /* LOCAL_IN hook doesn't have a chain! */ ret = alloc_null_binding(ct, info, hooknum); else ret = ip_nat_rule_find(pskb, hooknum, diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2d05caf..7d38913 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -144,7 +144,7 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip, memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); c->num_total_nodes = i->num_total_nodes; c->num_local_nodes = i->num_local_nodes; - memcpy(&c->local_nodes, &i->local_nodes, sizeof(&c->local_nodes)); + memcpy(&c->local_nodes, &i->local_nodes, sizeof(c->local_nodes)); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; atomic_set(&c->refcount, 1); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index f115a84..f057025 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -92,10 +92,7 @@ static inline struct rtable *route_reverse(struct sk_buff *skb, fl.fl_ip_sport = tcph->dest; fl.fl_ip_dport = tcph->source; - if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) { - dst_release(&rt->u.dst); - rt = NULL; - } + xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0); return rt; } diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index c1889f8..0cee286 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/file.h> +#include <linux/rcupdate.h> #include <net/sock.h> #include <linux/netfilter_ipv4/ipt_owner.h> diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8c0b14e..8549f26 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1760,6 +1760,7 @@ static inline int __mkroute_input(struct sk_buff *skb, goto cleanup; } + atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED if (res->fi->fib_nhs > 1) @@ -1820,7 +1821,6 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb, err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); if (err) return err; - atomic_set(&rth->u.dst.__refcnt, 1); /* put it into the cache */ hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); @@ -1834,8 +1834,8 @@ static inline int ip_mkroute_input(struct sk_buff *skb, u32 daddr, u32 saddr, u32 tos) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED - struct rtable* rth = NULL; - unsigned char hop, hopcount, lasthop; + struct rtable* rth = NULL, *rtres; + unsigned char hop, hopcount; int err = -EINVAL; unsigned int hash; @@ -1844,8 +1844,6 @@ static inline int ip_mkroute_input(struct sk_buff *skb, else hopcount = 1; - lasthop = hopcount - 1; - /* distinguish between multipath and singlepath */ if (hopcount < 2) return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, @@ -1855,6 +1853,10 @@ static inline int ip_mkroute_input(struct sk_buff *skb, for (hop = 0; hop < hopcount; hop++) { res->nh_sel = hop; + /* put reference to previous result */ + if (hop) + ip_rt_put(rtres); + /* create a routing cache entry */ err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); @@ -1863,7 +1865,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); - err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + err = rt_intern_hash(hash, rth, &rtres); if (err) return err; @@ -1873,13 +1875,8 @@ static inline int ip_mkroute_input(struct sk_buff *skb, FIB_RES_NETMASK(*res), res->prefixlen, &FIB_RES_NH(*res)); - - /* only for the last hop the reference count is handled - * outside - */ - if (hop == lasthop) - atomic_set(&(skb->dst->__refcnt), 1); } + skb->dst = &rtres->u.dst; return err; #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos); @@ -2208,6 +2205,7 @@ static inline int __mkroute_output(struct rtable **result, goto cleanup; } + atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED if (res->fi) { @@ -2290,8 +2288,6 @@ static inline int ip_mkroute_output_def(struct rtable **rp, if (err == 0) { u32 tos = RT_FL_TOS(oldflp); - atomic_set(&rth->u.dst.__refcnt, 1); - hash = rt_hash_code(oldflp->fl4_dst, oldflp->fl4_src ^ (oldflp->oif << 5), tos); err = rt_intern_hash(hash, rth, rp); @@ -2326,6 +2322,10 @@ static inline int ip_mkroute_output(struct rtable** rp, dev2nexthop = FIB_RES_DEV(*res); dev_hold(dev2nexthop); + /* put reference to previous result */ + if (hop) + ip_rt_put(*rp); + err = __mkroute_output(&rth, res, fl, oldflp, dev2nexthop, flags); @@ -2350,7 +2350,6 @@ static inline int ip_mkroute_output(struct rtable** rp, if (err != 0) return err; } - atomic_set(&(*rp)->u.dst.__refcnt, 1); return err; } else { return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 02fdda6..f3f0013 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -552,8 +552,7 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (sk->sk_forward_alloc < copy && - !sk_stream_mem_schedule(sk, copy, 0)) + if (!sk_stream_wmem_schedule(sk, copy)) goto wait_for_memory; if (can_coalesce) { @@ -770,19 +769,23 @@ new_segment: if (off == PAGE_SIZE) { put_page(page); TCP_PAGE(sk) = page = NULL; + off = 0; } - } + } else + off = 0; + + if (copy > PAGE_SIZE - off) + copy = PAGE_SIZE - off; + + if (!sk_stream_wmem_schedule(sk, copy)) + goto wait_for_memory; if (!page) { /* Allocate new cache page. */ if (!(page = sk_stream_alloc_page(sk))) goto wait_for_memory; - off = 0; } - if (copy > PAGE_SIZE - off) - copy = PAGE_SIZE - off; - /* Time to copy data. We are close to * the end! */ err = skb_copy_to_page(sk, from, skb, page, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1afb080..29222b96 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -923,14 +923,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int flag = 0; int i; - /* So, SACKs for already sent large segments will be lost. - * Not good, but alternative is to resegment the queue. */ - if (sk->sk_route_caps & NETIF_F_TSO) { - sk->sk_route_caps &= ~NETIF_F_TSO; - sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache; - } - if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; @@ -978,20 +970,40 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ flag |= FLAG_DATA_LOST; sk_stream_for_retrans_queue(skb, sk) { - u8 sacked = TCP_SKB_CB(skb)->sacked; - int in_sack; + int in_sack, pcount; + u8 sacked; /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ - if(!before(TCP_SKB_CB(skb)->seq, end_seq)) + if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; - fack_count += tcp_skb_pcount(skb); + pcount = tcp_skb_pcount(skb); + + if (pcount > 1 && + (after(start_seq, TCP_SKB_CB(skb)->seq) || + before(end_seq, TCP_SKB_CB(skb)->end_seq))) { + unsigned int pkt_len; + + if (after(start_seq, TCP_SKB_CB(skb)->seq)) + pkt_len = (start_seq - + TCP_SKB_CB(skb)->seq); + else + pkt_len = (end_seq - + TCP_SKB_CB(skb)->seq); + if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size)) + break; + pcount = tcp_skb_pcount(skb); + } + + fack_count += pcount; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); + sacked = TCP_SKB_CB(skb)->sacked; + /* Account D-SACK for retransmitted packet. */ if ((dup_sack && in_sack) && (sacked & TCPCB_RETRANS) && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 75b6811..15e1134 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -428,11 +428,11 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now) +int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; - int nsize; + int nsize, old_factor; u16 flags; nsize = skb_headlen(skb) - len; @@ -490,18 +490,29 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned tp->left_out -= tcp_skb_pcount(skb); } + old_factor = tcp_skb_pcount(skb); + /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(sk, skb, mss_now); tcp_set_skb_tso_segs(sk, buff, mss_now); - if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { - tp->lost_out += tcp_skb_pcount(skb); - tp->left_out += tcp_skb_pcount(skb); - } + /* If this packet has been sent out already, we must + * adjust the various packet counters. + */ + if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { + int diff = old_factor - tcp_skb_pcount(skb) - + tcp_skb_pcount(buff); - if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { - tp->lost_out += tcp_skb_pcount(buff); - tp->left_out += tcp_skb_pcount(buff); + tp->packets_out -= diff; + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + tp->lost_out -= diff; + tp->left_out -= diff; + } + if (diff > 0) { + tp->fackets_out -= diff; + if ((int)tp->fackets_out < 0) + tp->fackets_out = 0; + } } /* Link BUFF into the send queue. */ @@ -1350,12 +1361,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); - - if (sk->sk_route_caps & NETIF_F_TSO) { - sk->sk_route_caps &= ~NETIF_F_TSO; - sock_set_flag(sk, SOCK_NO_LARGESEND); - } - if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return -ENOMEM; } @@ -1370,22 +1375,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -EAGAIN; if (skb->len > cur_mss) { - int old_factor = tcp_skb_pcount(skb); - int diff; - if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ - - /* New SKB created, account for it. */ - diff = old_factor - tcp_skb_pcount(skb) - - tcp_skb_pcount(skb->next); - tp->packets_out -= diff; - - if (diff > 0) { - tp->fackets_out -= diff; - if ((int)tp->fackets_out < 0) - tp->fackets_out = 0; - } } /* Collapse two adjacent packets if worthwhile and we can. */ @@ -1993,12 +1984,6 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; if (tcp_fragment(sk, skb, seg_size, mss)) return -1; - /* SWS override triggered forced fragmentation. - * Disable TSO, the connection is too sick. */ - if (sk->sk_route_caps & NETIF_F_TSO) { - sock_set_flag(sk, SOCK_NO_LARGESEND); - sk->sk_route_caps &= ~NETIF_F_TSO; - } } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e5beca7..e0bd101 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1141,7 +1141,7 @@ int udp_rcv(struct sk_buff *skb) if (ulen > len || ulen < sizeof(*uh)) goto short_packet; - if (pskb_trim(skb, ulen)) + if (pskb_trim_rcsum(skb, ulen)) goto short_packet; if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) |