From 64f3b9e203bd06855072e295557dca1485a2ecba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 May 2011 10:02:26 +0000 Subject: net: ip_expire() must revalidate route Commit 4a94445c9a5c (net: Use ip_route_input_noref() in input path) added a bug in IP defragmentation handling, in case timeout is fired. When a frame is defragmented, we use last skb dst field when building final skb. Its dst is valid, since we are in rcu read section. But if a timeout occurs, we take first queued fragment to build one ICMP TIME EXCEEDED message. Problem is all queued skb have weak dst pointers, since we escaped RCU critical section after their queueing. icmp_send() might dereference a now freed (and possibly reused) part of memory. Calling skb_dst_drop() and ip_route_input_noref() to revalidate route is the only possible choice. Reported-by: Denys Fedoryshchenko Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a1151b8..b1d282f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -223,31 +223,30 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; + const struct iphdr *iph; + int err; rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out_rcu_unlock; + /* skb dst is stale, drop it, and perform route lookup again */ + skb_dst_drop(head); + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (err) + goto out_rcu_unlock; + /* - * Only search router table for the head fragment, - * when defraging timeout at PRE_ROUTING HOOK. + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { - const struct iphdr *iph = ip_hdr(head); - int err = ip_route_input(head, iph->daddr, iph->saddr, - iph->tos, head->dev); - if (unlikely(err)) - goto out_rcu_unlock; - - /* - * Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (skb_rtable(head)->rt_type != RTN_LOCAL) - goto out_rcu_unlock; + if (qp->user == IP_DEFRAG_CONNTRACK_IN && + skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; - } /* Send an ICMP "Fragment Reassembly Timeout" message. */ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); -- cgit v1.1 From a294865978b701e4d0d90135672749531b9a900d Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Fri, 6 May 2011 03:27:18 +0000 Subject: dccp: handle invalid feature options length A length of zero (after subtracting two for the type and len fields) for the DCCPO_{CHANGE,CONFIRM}_{L,R} options will cause an underflow due to the subtraction. The subsequent code may read past the end of the options value buffer when parsing. I'm unsure of what the consequences of this might be, but it's probably not good. Signed-off-by: Dan Rosenberg Cc: stable@kernel.org Acked-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/options.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dccp/options.c b/net/dccp/options.c index f06ffcf..4b2ab65 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -123,6 +123,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ break; + if (len == 0) + goto out_invalid_option; rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, *value, value + 1, len - 1); if (rc) -- cgit v1.1 From b9f47a3aaeabdce3b42829bbb27765fa340f76ba Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 4 May 2011 10:04:56 +0000 Subject: tcp_cubic: limit delayed_ack ratio to prevent divide error TCP Cubic keeps a metric that estimates the amount of delayed acknowledgements to use in adjusting the window. If an abnormally large number of packets are acknowledged at once, then the update could wrap and reach zero. This kind of ACK could only happen when there was a large window and huge number of ACK's were lost. This patch limits the value of delayed ack ratio. The choice of 32 is just a conservative value since normally it should be range of 1 to 4 packets. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 34340c9..f376b05 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -93,6 +93,7 @@ struct bictcp { u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ #define ACK_RATIO_SHIFT 4 +#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ @@ -398,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) u32 delay; if (icsk->icsk_ca_state == TCP_CA_Open) { - cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ca->delayed_ack += cnt; + u32 ratio = ca->delayed_ack; + + ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ratio += cnt; + + ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ -- cgit v1.1 From 315c34dae0069d0c67abd714bb846cd466289c7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Apr 2011 10:55:07 +0200 Subject: netfilter: ctnetlink: fix timestamp support for new conntracks This patch fixes the missing initialization of the start time if the timestamp support is enabled. libnetfilter_conntrack/utils# conntrack -E & libnetfilter_conntrack/utils# ./conntrack_create tcp 6 109 ESTABLISHED src=1.1.1.1 dst=2.2.2.2 sport=1025 dport=21 packets=0 bytes=0 [UNREPLIED] src=2.2.2.2 dst=1.1.1.1 sport=21 dport=1025 packets=0 bytes=0 mark=0 delta-time=1303296401 use=2 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_netlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 30bf8a1..482e90c 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1334,6 +1334,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, struct nf_conn *ct; int err = -EINVAL; struct nf_conntrack_helper *helper; + struct nf_conn_tstamp *tstamp; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) @@ -1451,6 +1452,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, __set_bit(IPS_EXPECTED_BIT, &ct->status); ct->master = master_ct; } + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_to_ns(ktime_get_real()); add_timer(&ct->timeout); nf_conntrack_hash_insert(ct); -- cgit v1.1 From 5a6351eecf8c87afed9c883bb6341d09406d74ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Apr 2011 10:57:21 +0200 Subject: netfilter: fix ebtables compat support commit 255d0dc34068a976 (netfilter: x_table: speedup compat operations) made ebtables not working anymore. 1) xt_compat_calc_jump() is not an exact match lookup 2) compat_table_info() has a typo in xt_compat_init_offsets() call 3) compat_do_replace() misses a xt_compat_init_offsets() call Reported-by: dann frazier Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 3 ++- net/netfilter/x_tables.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 893669c..9707079 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1766,7 +1766,7 @@ static int compat_table_info(const struct ebt_table_info *info, newinfo->entries_size = size; - xt_compat_init_offsets(AF_INET, info->nentries); + xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } @@ -2240,6 +2240,7 @@ static int compat_do_replace(struct net *net, void __user *user, xt_compat_lock(NFPROTO_BRIDGE); + xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (ret < 0) goto out_unlock; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a9adf4c..8a025a5 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -455,6 +455,7 @@ void xt_compat_flush_offsets(u_int8_t af) vfree(xt[af].compat_tab); xt[af].compat_tab = NULL; xt[af].number = 0; + xt[af].cur = 0; } } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); @@ -473,8 +474,7 @@ int xt_compat_calc_jump(u_int8_t af, unsigned int offset) else return mid ? tmp[mid - 1].delta : 0; } - WARN_ON_ONCE(1); - return 0; + return left ? tmp[left - 1].delta : 0; } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); -- cgit v1.1 From 103a9778e07bcc0cd34b5c35a87281454eec719e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Apr 2011 10:58:25 +0200 Subject: netfilter: ebtables: only call xt_compat_add_offset once per rule The optimizations in commit 255d0dc34068a976 (netfilter: x_table: speedup compat operations) assume that xt_compat_add_offset is called once per rule. ebtables however called it for each match/target found in a rule. The match/watcher/target parser already returns the needed delta, so it is sufficient to move the xt_compat_add_offset call to a more reasonable location. While at it, also get rid of the unused COMPAT iterator macros. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 61 ++++++----------------------------------- 1 file changed, 9 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9707079..1a92b36 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1882,7 +1882,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, struct xt_match *match; struct xt_target *wt; void *dst = NULL; - int off, pad = 0, ret = 0; + int off, pad = 0; unsigned int size_kern, entry_offset, match_size = mwt->match_size; strlcpy(name, mwt->u.name, sizeof(name)); @@ -1935,13 +1935,6 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, break; } - if (!dst) { - ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, - off + ebt_compat_entry_padsize()); - if (ret < 0) - return ret; - } - state->buf_kern_offset += match_size + off; state->buf_user_offset += match_size; pad = XT_ALIGN(size_kern) - size_kern; @@ -2016,50 +2009,6 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, return growth; } -#define EBT_COMPAT_WATCHER_ITERATE(e, fn, args...) \ -({ \ - unsigned int __i; \ - int __ret = 0; \ - struct compat_ebt_entry_mwt *__watcher; \ - \ - for (__i = e->watchers_offset; \ - __i < (e)->target_offset; \ - __i += __watcher->watcher_size + \ - sizeof(struct compat_ebt_entry_mwt)) { \ - __watcher = (void *)(e) + __i; \ - __ret = fn(__watcher , ## args); \ - if (__ret != 0) \ - break; \ - } \ - if (__ret == 0) { \ - if (__i != (e)->target_offset) \ - __ret = -EINVAL; \ - } \ - __ret; \ -}) - -#define EBT_COMPAT_MATCH_ITERATE(e, fn, args...) \ -({ \ - unsigned int __i; \ - int __ret = 0; \ - struct compat_ebt_entry_mwt *__match; \ - \ - for (__i = sizeof(struct ebt_entry); \ - __i < (e)->watchers_offset; \ - __i += __match->match_size + \ - sizeof(struct compat_ebt_entry_mwt)) { \ - __match = (void *)(e) + __i; \ - __ret = fn(__match , ## args); \ - if (__ret != 0) \ - break; \ - } \ - if (__ret == 0) { \ - if (__i != (e)->watchers_offset) \ - __ret = -EINVAL; \ - } \ - __ret; \ -}) - /* called for all ebt_entry structures. */ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, unsigned int *total, @@ -2132,6 +2081,14 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, } } + if (state->buf_kern_start == NULL) { + unsigned int offset = buf_start - (char *) base; + + ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset); + if (ret < 0) + return ret; + } + startoff = state->buf_user_offset - startoff; BUG_ON(*total < startoff); -- cgit v1.1 From 1ae132b0347907ac95b8bc9dba37934f59d2a508 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 3 May 2011 22:09:30 +0200 Subject: IPVS: Change of socket usage to enable name space exit. If the sync daemons run in a name space while it crashes or get killed, there is no way to stop them except for a reboot. When all patches are there, ip_vs_core will handle register_pernet_(), i.e. ip_vs_sync_init() and ip_vs_sync_cleanup() will be removed. Kernel threads should not increment the use count of a socket. By calling sk_change_net() after creating a socket this is avoided. sock_release cant be used intead sk_release_kernel() should be used. Thanks Eric W Biederman for your advices. Signed-off-by: Hans Schillstrom [horms@verge.net.au: minor edit to changelog] Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 58 ++++++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 07accf6..a0791dc 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1896,7 +1896,7 @@ static int __net_init __ip_vs_init(struct net *net) static void __net_exit __ip_vs_cleanup(struct net *net) { - IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen); + IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); } static struct pernet_operations ipvs_core_ops = { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3e7961e..0cce953 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1303,13 +1303,18 @@ static struct socket *make_send_sock(struct net *net) struct socket *sock; int result; - /* First create a socket */ - result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); + /* First create a socket move it to right name space later */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1334,8 +1339,8 @@ static struct socket *make_send_sock(struct net *net) return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -1350,12 +1355,17 @@ static struct socket *make_receive_sock(struct net *net) int result; /* First create a socket */ - result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = 1; @@ -1377,8 +1387,8 @@ static struct socket *make_receive_sock(struct net *net) return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -1473,7 +1483,7 @@ static int sync_thread_master(void *data) ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo); return 0; @@ -1513,7 +1523,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo->buf); kfree(tinfo); @@ -1601,7 +1611,7 @@ outtinfo: outbuf: kfree(buf); outsocket: - sock_release(sock); + sk_release_kernel(sock->sk); out: return result; } @@ -1610,6 +1620,7 @@ out: int stop_sync_thread(struct net *net, int state) { struct netns_ipvs *ipvs = net_ipvs(net); + int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); @@ -1629,7 +1640,7 @@ int stop_sync_thread(struct net *net, int state) spin_lock_bh(&ipvs->sync_lock); ipvs->sync_state &= ~IP_VS_STATE_MASTER; spin_unlock_bh(&ipvs->sync_lock); - kthread_stop(ipvs->master_thread); + retc = kthread_stop(ipvs->master_thread); ipvs->master_thread = NULL; } else if (state == IP_VS_STATE_BACKUP) { if (!ipvs->backup_thread) @@ -1639,16 +1650,14 @@ int stop_sync_thread(struct net *net, int state) task_pid_nr(ipvs->backup_thread)); ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(ipvs->backup_thread); + retc = kthread_stop(ipvs->backup_thread); ipvs->backup_thread = NULL; - } else { - return -EINVAL; } /* decrease the module use count */ ip_vs_use_count_dec(); - return 0; + return retc; } /* @@ -1670,8 +1679,15 @@ static int __net_init __ip_vs_sync_init(struct net *net) static void __ip_vs_sync_cleanup(struct net *net) { - stop_sync_thread(net, IP_VS_STATE_MASTER); - stop_sync_thread(net, IP_VS_STATE_BACKUP); + int retc; + + retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); } static struct pernet_operations ipvs_sync_ops = { @@ -1682,10 +1698,10 @@ static struct pernet_operations ipvs_sync_ops = { int __init ip_vs_sync_init(void) { - return register_pernet_subsys(&ipvs_sync_ops); + return register_pernet_device(&ipvs_sync_ops); } void ip_vs_sync_cleanup(void) { - unregister_pernet_subsys(&ipvs_sync_ops); + unregister_pernet_device(&ipvs_sync_ops); } -- cgit v1.1 From 7a4f0761fce32ff4918a7c23b08db564ad33092d Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 3 May 2011 22:09:31 +0200 Subject: IPVS: init and cleanup restructuring DESCRIPTION This patch tries to restore the initial init and cleanup sequences that was before namspace patch. Netns also requires action when net devices unregister which has never been implemented. I.e this patch also covers when a device moves into a network namespace, and has to be released. IMPLEMENTATION The number of calls to register_pernet_device have been reduced to one for the ip_vs.ko Schedulers still have their own calls. This patch adds a function __ip_vs_service_cleanup() and an enable flag for the netfilter hooks. The nf hooks will be enabled when the first service is loaded and never disabled again, except when a namespace exit starts. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov [horms@verge.net.au: minor edit to changelog] Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_app.c | 15 +---- net/netfilter/ipvs/ip_vs_conn.c | 12 +--- net/netfilter/ipvs/ip_vs_core.c | 101 +++++++++++++++++++++++++++++--- net/netfilter/ipvs/ip_vs_ctl.c | 120 ++++++++++++++++++++++++++++++++------- net/netfilter/ipvs/ip_vs_est.c | 14 +---- net/netfilter/ipvs/ip_vs_proto.c | 11 +--- net/netfilter/ipvs/ip_vs_sync.c | 13 +---- 7 files changed, 206 insertions(+), 80 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 2dc6de1..51f3af7 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -576,7 +576,7 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -static int __net_init __ip_vs_app_init(struct net *net) +int __net_init __ip_vs_app_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -585,26 +585,17 @@ static int __net_init __ip_vs_app_init(struct net *net) return 0; } -static void __net_exit __ip_vs_app_cleanup(struct net *net) +void __net_exit __ip_vs_app_cleanup(struct net *net) { proc_net_remove(net, "ip_vs_app"); } -static struct pernet_operations ip_vs_app_ops = { - .init = __ip_vs_app_init, - .exit = __ip_vs_app_cleanup, -}; - int __init ip_vs_app_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_app_ops); - return rv; + return 0; } void ip_vs_app_cleanup(void) { - unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c97bd45..d3fd91b 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1258,22 +1258,17 @@ int __net_init __ip_vs_conn_init(struct net *net) return 0; } -static void __net_exit __ip_vs_conn_cleanup(struct net *net) +void __net_exit __ip_vs_conn_cleanup(struct net *net) { /* flush all the connection entries first */ ip_vs_conn_flush(net); proc_net_remove(net, "ip_vs_conn"); proc_net_remove(net, "ip_vs_conn_sync"); } -static struct pernet_operations ipvs_conn_ops = { - .init = __ip_vs_conn_init, - .exit = __ip_vs_conn_cleanup, -}; int __init ip_vs_conn_init(void) { int idx; - int retc; /* Compute size and mask */ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; @@ -1309,17 +1304,14 @@ int __init ip_vs_conn_init(void) rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - retc = register_pernet_subsys(&ipvs_conn_ops); - /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return retc; + return 0; } void ip_vs_conn_cleanup(void) { - unregister_pernet_subsys(&ipvs_conn_ops); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a0791dc..a74dae6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1113,6 +1113,9 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) return NF_ACCEPT; net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1343,6 +1346,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) return NF_ACCEPT; /* The packet looks wrong, ignore */ net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->protocol); if (!pd) return NF_ACCEPT; @@ -1529,6 +1533,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); /* Bad... Do not break raw sockets */ @@ -1562,7 +1571,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } - net = skb_net(skb); /* Protocol supported? */ pd = ip_vs_proto_data_get(net, iph.protocol); if (unlikely(!pd)) @@ -1588,7 +1596,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - net = skb_net(skb); ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { @@ -1743,10 +1750,16 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + return ip_vs_in_icmp(skb, &r, hooknum); } @@ -1757,10 +1770,16 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) return NF_ACCEPT; + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + return ip_vs_in_icmp_v6(skb, &r, hooknum); } #endif @@ -1884,21 +1903,72 @@ static int __net_init __ip_vs_init(struct net *net) pr_err("%s(): no memory.\n", __func__); return -ENOMEM; } + /* Hold the beast until a service is registerd */ + ipvs->enable = 0; ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; + + if (__ip_vs_estimator_init(net) < 0) + goto estimator_fail; + + if (__ip_vs_control_init(net) < 0) + goto control_fail; + + if (__ip_vs_protocol_init(net) < 0) + goto protocol_fail; + + if (__ip_vs_app_init(net) < 0) + goto app_fail; + + if (__ip_vs_conn_init(net) < 0) + goto conn_fail; + + if (__ip_vs_sync_init(net) < 0) + goto sync_fail; + printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", sizeof(struct netns_ipvs), ipvs->gen); return 0; +/* + * Error handling + */ + +sync_fail: + __ip_vs_conn_cleanup(net); +conn_fail: + __ip_vs_app_cleanup(net); +app_fail: + __ip_vs_protocol_cleanup(net); +protocol_fail: + __ip_vs_control_cleanup(net); +control_fail: + __ip_vs_estimator_cleanup(net); +estimator_fail: + return -ENOMEM; } static void __net_exit __ip_vs_cleanup(struct net *net) { + __ip_vs_service_cleanup(net); /* ip_vs_flush() with locks */ + __ip_vs_conn_cleanup(net); + __ip_vs_app_cleanup(net); + __ip_vs_protocol_cleanup(net); + __ip_vs_control_cleanup(net); + __ip_vs_estimator_cleanup(net); IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); } +static void __net_exit __ip_vs_dev_cleanup(struct net *net) +{ + EnterFunction(2); + net_ipvs(net)->enable = 0; /* Disable packet reception */ + __ip_vs_sync_cleanup(net); + LeaveFunction(2); +} + static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, .exit = __ip_vs_cleanup, @@ -1906,6 +1976,10 @@ static struct pernet_operations ipvs_core_ops = { .size = sizeof(struct netns_ipvs), }; +static struct pernet_operations ipvs_core_dev_ops = { + .exit = __ip_vs_dev_cleanup, +}; + /* * Initialize IP Virtual Server */ @@ -1913,10 +1987,6 @@ static int __init ip_vs_init(void) { int ret; - ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ - if (ret < 0) - return ret; - ip_vs_estimator_init(); ret = ip_vs_control_init(); if (ret < 0) { @@ -1944,15 +2014,28 @@ static int __init ip_vs_init(void) goto cleanup_conn; } + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + goto cleanup_sync; + + ret = register_pernet_device(&ipvs_core_dev_ops); + if (ret < 0) + goto cleanup_sub; + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { pr_err("can't register hooks.\n"); - goto cleanup_sync; + goto cleanup_dev; } pr_info("ipvs loaded.\n"); + return ret; +cleanup_dev: + unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: + unregister_pernet_subsys(&ipvs_core_ops); cleanup_sync: ip_vs_sync_cleanup(); cleanup_conn: @@ -1964,20 +2047,20 @@ cleanup_sync: ip_vs_control_cleanup(); cleanup_estimator: ip_vs_estimator_cleanup(); - unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ return ret; } static void __exit ip_vs_cleanup(void) { nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + unregister_pernet_device(&ipvs_core_dev_ops); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_sync_cleanup(); ip_vs_conn_cleanup(); ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); ip_vs_estimator_cleanup(); - unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ae47090..ea72281 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -69,6 +69,11 @@ int ip_vs_get_debug_level(void) } #endif + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc); + + #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ static int __ip_vs_addr_is_local_v6(struct net *net, @@ -1214,6 +1219,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; return 0; @@ -1472,6 +1479,84 @@ static int ip_vs_flush(struct net *net) return 0; } +/* + * Delete service by {netns} in the service table. + * Called by __ip_vs_cleanup() + */ +void __ip_vs_service_cleanup(struct net *net) +{ + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + ip_vs_flush(net); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} +/* + * Release dst hold by dst_cache + */ +static inline void +__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +{ + spin_lock_bh(&dest->dst_lock); + if (dest->dst_cache && dest->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + ip_vs_dst_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* + * Netdev event receiver + * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to + * a device that is "unregister" it must be released. + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + } + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + + } + } + + list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { + __ip_vs_dev_reset(dest, dev); + } + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} /* * Zero counters in a service or all services @@ -3588,6 +3673,10 @@ void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { } #endif +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +}; + int __net_init __ip_vs_control_init(struct net *net) { int idx; @@ -3626,7 +3715,7 @@ err: return -ENOMEM; } -static void __net_exit __ip_vs_control_cleanup(struct net *net) +void __net_exit __ip_vs_control_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -3639,11 +3728,6 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) free_percpu(ipvs->tot_stats.cpustats); } -static struct pernet_operations ipvs_control_ops = { - .init = __ip_vs_control_init, - .exit = __ip_vs_control_cleanup, -}; - int __init ip_vs_control_init(void) { int idx; @@ -3657,33 +3741,32 @@ int __init ip_vs_control_init(void) INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); } - ret = register_pernet_subsys(&ipvs_control_ops); - if (ret) { - pr_err("cannot register namespace.\n"); - goto err; - } - smp_wmb(); /* Do we really need it now ? */ ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); - goto err_net; + goto err_sock; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); - nf_unregister_sockopt(&ip_vs_sockopts); - goto err_net; + goto err_genl; } + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + goto err_notf; + LeaveFunction(2); return 0; -err_net: - unregister_pernet_subsys(&ipvs_control_ops); -err: +err_notf: + ip_vs_genl_unregister(); +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: return ret; } @@ -3691,7 +3774,6 @@ err: void ip_vs_control_cleanup(void) { EnterFunction(2); - unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 8c8766c..508cce9 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -192,7 +192,7 @@ void ip_vs_read_estimator(struct ip_vs_stats_user *dst, dst->outbps = (e->outbps + 0xF) >> 5; } -static int __net_init __ip_vs_estimator_init(struct net *net) +int __net_init __ip_vs_estimator_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -203,24 +203,16 @@ static int __net_init __ip_vs_estimator_init(struct net *net) return 0; } -static void __net_exit __ip_vs_estimator_exit(struct net *net) +void __net_exit __ip_vs_estimator_cleanup(struct net *net) { del_timer_sync(&net_ipvs(net)->est_timer); } -static struct pernet_operations ip_vs_app_ops = { - .init = __ip_vs_estimator_init, - .exit = __ip_vs_estimator_exit, -}; int __init ip_vs_estimator_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_app_ops); - return rv; + return 0; } void ip_vs_estimator_cleanup(void) { - unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 17484a4..eb86028 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -316,7 +316,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, /* * per network name-space init */ -static int __net_init __ip_vs_protocol_init(struct net *net) +int __net_init __ip_vs_protocol_init(struct net *net) { #ifdef CONFIG_IP_VS_PROTO_TCP register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); @@ -336,7 +336,7 @@ static int __net_init __ip_vs_protocol_init(struct net *net) return 0; } -static void __net_exit __ip_vs_protocol_cleanup(struct net *net) +void __net_exit __ip_vs_protocol_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; @@ -349,11 +349,6 @@ static void __net_exit __ip_vs_protocol_cleanup(struct net *net) } } -static struct pernet_operations ipvs_proto_ops = { - .init = __ip_vs_protocol_init, - .exit = __ip_vs_protocol_cleanup, -}; - int __init ip_vs_protocol_init(void) { char protocols[64]; @@ -382,7 +377,6 @@ int __init ip_vs_protocol_init(void) REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); - return register_pernet_subsys(&ipvs_proto_ops); return 0; } @@ -393,7 +387,6 @@ void ip_vs_protocol_cleanup(void) struct ip_vs_protocol *pp; int i; - unregister_pernet_subsys(&ipvs_proto_ops); /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 0cce953..e292e5b 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1663,7 +1663,7 @@ int stop_sync_thread(struct net *net, int state) /* * Initialize data struct for each netns */ -static int __net_init __ip_vs_sync_init(struct net *net) +int __net_init __ip_vs_sync_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1677,7 +1677,7 @@ static int __net_init __ip_vs_sync_init(struct net *net) return 0; } -static void __ip_vs_sync_cleanup(struct net *net) +void __ip_vs_sync_cleanup(struct net *net) { int retc; @@ -1690,18 +1690,11 @@ static void __ip_vs_sync_cleanup(struct net *net) pr_err("Failed to stop Backup Daemon\n"); } -static struct pernet_operations ipvs_sync_ops = { - .init = __ip_vs_sync_init, - .exit = __ip_vs_sync_cleanup, -}; - - int __init ip_vs_sync_init(void) { - return register_pernet_device(&ipvs_sync_ops); + return 0; } void ip_vs_sync_cleanup(void) { - unregister_pernet_device(&ipvs_sync_ops); } -- cgit v1.1 From 4319cc0cf5bb894b7368008cdf6dd20eb8868018 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 10 May 2011 09:55:44 +0200 Subject: netfilter: IPv6: initialize TOS field in REJECT target module The IPv6 header is not zeroed out in alloc_skb so we must initialize it properly unless we want to see IPv6 packets with random TOS fields floating around. The current implementation resets the flow label but this could be changed if deemed necessary. We stumbled upon this issue when trying to apply a mangle rule to the RST packet generated by the REJECT target module. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_REJECT.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 28e7448..a5a4c5d 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -45,6 +45,8 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) int tcphoff, needs_ack; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; struct dst_entry *dst = NULL; u8 proto; struct flowi6 fl6; @@ -124,7 +126,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) skb_put(nskb, sizeof(struct ipv6hdr)); skb_reset_network_header(nskb); ip6h = ipv6_hdr(nskb); - ip6h->version = 6; + *(__be32 *)ip6h = htonl(0x60000000 | (tclass << 20)); ip6h->hop_limit = ip6_dst_hoplimit(dst); ip6h->nexthdr = IPPROTO_TCP; ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); -- cgit v1.1 From 1ed2f73d90fb49bcf5704aee7e9084adb882bfc5 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 10 May 2011 10:00:21 +0200 Subject: netfilter: IPv6: fix DSCP mangle code The mask indicates the bits one wants to zero out, so it needs to be inverted before applying to the original TOS field. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_DSCP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 0a22919..ae82716 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t orig, nv; orig = ipv6_get_dsfield(iph); - nv = (orig & info->tos_mask) ^ info->tos_value; + nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (!skb_make_writable(skb, sizeof(struct iphdr))) -- cgit v1.1 From 93bbce1ad0cd788190dd7d6c17d289f771fe3d0d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 May 2011 12:13:36 +0200 Subject: netfilter: revert a2361c8735e07322023aedc36e4938b35af31eb0 This patch reverts a2361c8735e07322023aedc36e4938b35af31eb0: "[PATCH] netfilter: xt_conntrack: warn about use in raw table" Florian Wesphal says: "... when the packet was sent from the local machine the skb already has ->nfct attached, and -m conntrack seems to do the right thing." Acked-by: Jan Engelhardt Reported-by: Florian Wesphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_conntrack.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 481a86f..61805d7 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -272,11 +272,6 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; - if (strcmp(par->table, "raw") == 0) { - pr_info("state is undetermined at the time of raw table\n"); - return -EINVAL; - } - ret = nf_ct_l3proto_try_module_get(par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", -- cgit v1.1 From 55aee10dec477254241e4f72968f92e0543b33ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 May 2011 12:22:54 -0700 Subject: vlan: fix GVRP at dismantle time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ip link add link eth2 eth2.103 type vlan id 103 gvrp on loose_binding on ip link set eth2.103 up rmmod tg3 # driver providing eth2 BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] garp_request_leave+0x3e/0xc0 [garp] PGD 11d251067 PUD 11b9e0067 PMD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/virtual/net/eth2.104/ifindex CPU 0 Modules linked in: tg3(-) 8021q garp nfsd lockd auth_rpcgss sunrpc libphy sg [last unloaded: x_tables] Pid: 11494, comm: rmmod Tainted: G W 2.6.39-rc6-00261-gfd71257-dirty #580 HP ProLiant BL460c G6 RIP: 0010:[] [] garp_request_leave+0x3e/0xc0 [garp] RSP: 0018:ffff88007a19bae8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88011b5e2000 RCX: 0000000000000002 RDX: 0000000000000000 RSI: 0000000000000175 RDI: ffffffffa0030d5b RBP: ffff88007a19bb18 R08: 0000000000000001 R09: ffff88011bd64a00 R10: ffff88011d34ec00 R11: 0000000000000000 R12: 0000000000000002 R13: ffff88007a19bc48 R14: ffff88007a19bb88 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88011fc00000(0063) knlGS:00000000f77d76c0 CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000011a675000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process rmmod (pid: 11494, threadinfo ffff88007a19a000, task ffff8800798595c0) Stack: ffff88007a19bb36 ffff88011c84b800 ffff88011b5e2000 ffff88007a19bc48 ffff88007a19bb88 0000000000000006 ffff88007a19bb38 ffffffffa003a5f6 ffff88007a19bb38 670088007a19bba8 ffff88007a19bb58 ffffffffa00397e7 Call Trace: [] vlan_gvrp_request_leave+0x46/0x50 [8021q] [] vlan_dev_stop+0xb7/0xc0 [8021q] [] __dev_close_many+0x87/0xe0 [] dev_close_many+0x87/0x110 [] rollback_registered_many+0xa0/0x240 [] unregister_netdevice_many+0x19/0x60 [] vlan_device_event+0x53b/0x550 [8021q] [] ? ip6mr_device_event+0xa8/0xd0 [] notifier_call_chain+0x53/0x80 [] __raw_notifier_call_chain+0x9/0x10 [] raw_notifier_call_chain+0x11/0x20 [] call_netdevice_notifiers+0x32/0x60 [] rollback_registered_many+0x10f/0x240 [] rollback_registered+0x2f/0x40 [] unregister_netdevice_queue+0x58/0x90 [] unregister_netdev+0x1b/0x30 [] tg3_remove_one+0x6f/0x10b [tg3] We should call vlan_gvrp_request_leave() from unregister_vlan_dev(), not from vlan_dev_stop(), because vlan_gvrp_uninit_applicant() is called right after unregister_netdevice_queue(). In batch mode, unregister_netdevice_queue() doesn’t immediately call vlan_dev_stop(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 +++ net/8021q/vlan_dev.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 7850412..0eb1a88 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -124,6 +124,9 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) grp->nr_vlans--; + if (vlan->flags & VLAN_FLAG_GVRP) + vlan_gvrp_request_leave(dev); + vlan_group_set_device(grp, vlan_id, NULL); if (!grp->killall) synchronize_net(); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e34ea9e..b2ff6c8 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -487,9 +487,6 @@ static int vlan_dev_stop(struct net_device *dev) struct vlan_dev_info *vlan = vlan_dev_info(dev); struct net_device *real_dev = vlan->real_dev; - if (vlan->flags & VLAN_FLAG_GVRP) - vlan_gvrp_request_leave(dev); - dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) -- cgit v1.1 From e14a599335427f81bbb0008963e59aa9c6449dce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 May 2011 12:26:06 -0700 Subject: net: dev_close() should check IFF_UP Commit 443457242beb (factorize sync-rcu call in unregister_netdevice_many) mistakenly removed one test from dev_close() Following actions trigger a BUG : modprobe bonding modprobe dummy ifconfig bond0 up ifenslave bond0 dummy0 rmmod dummy dev_close() must not close a non IFF_UP device. With help from Frank Blaschka and Einar EL Lueck Reported-by: Frank Blaschka Reported-by: Einar EL Lueck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 856b6ee..9200944 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1284,11 +1284,13 @@ static int dev_close_many(struct list_head *head) */ int dev_close(struct net_device *dev) { - LIST_HEAD(single); + if (dev->flags & IFF_UP) { + LIST_HEAD(single); - list_add(&dev->unreg_list, &single); - dev_close_many(&single); - list_del(&single); + list_add(&dev->unreg_list, &single); + dev_close_many(&single); + list_del(&single); + } return 0; } EXPORT_SYMBOL(dev_close); -- cgit v1.1 From 43a4dea4c9d44baae38ddc14b9b6d86fde4c8b88 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 May 2011 19:36:38 +0000 Subject: xfrm: Assign the inner mode output function to the dst entry As it is, we assign the outer modes output function to the dst entry when we create the xfrm bundle. This leads to two problems on interfamily scenarios. We might insert ipv4 packets into ip6_fragment when called from xfrm6_output. The system crashes if we try to fragment an ipv4 packet with ip6_fragment. This issue was introduced with git commit ad0081e4 (ipv6: Fragment locally generated tunnel-mode IPSec6 packets as needed). The second issue is, that we might insert ipv4 packets in netfilter6 and vice versa on interfamily scenarios. With this patch we assign the inner mode output function to the dst entry when we create the xfrm bundle. So xfrm4_output/xfrm6_output from the inner mode is used and the right fragmentation and netfilter functions are called. We switch then to outer mode with the output_finish functions. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/xfrm4_output.c | 8 ++++++-- net/ipv4/xfrm4_state.c | 1 + net/ipv6/xfrm6_output.c | 6 +++--- net/ipv6/xfrm6_state.c | 1 + net/xfrm/xfrm_policy.c | 14 +++++++++++++- 5 files changed, 24 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 571aa96..2d51840 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -69,7 +69,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm4_prepare_output); -static int xfrm4_output_finish(struct sk_buff *skb) +int xfrm4_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER if (!skb_dst(skb)->xfrm) { @@ -86,7 +86,11 @@ static int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, skb_dst(skb)->dev, xfrm4_output_finish, + NULL, dst->dev, + x->outer_mode->afinfo->output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 1717c64..805d63e 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -78,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .init_tempsel = __xfrm4_init_tempsel, .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, + .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8e688b3..49a91c5f 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -79,7 +79,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm6_prepare_output); -static int xfrm6_output_finish(struct sk_buff *skb) +int xfrm6_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; @@ -97,9 +97,9 @@ static int __xfrm6_output(struct sk_buff *skb) if ((x && x->props.mode == XFRM_MODE_TUNNEL) && ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)))) { - return ip6_fragment(skb, xfrm6_output_finish); + return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); } - return xfrm6_output_finish(skb); + return x->outer_mode->afinfo->output_finish(skb); } int xfrm6_output(struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index afe941e..248f0b2 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -178,6 +178,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .tmpl_sort = __xfrm6_tmpl_sort, .state_sort = __xfrm6_state_sort, .output = xfrm6_output, + .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 15792d8..b4d745e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1406,6 +1406,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; + struct xfrm_mode *inner_mode; struct dst_entry *dst_prev = NULL; struct dst_entry *dst0 = NULL; int i = 0; @@ -1436,6 +1437,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } + if (xfrm[i]->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(xfrm[i], + xfrm_af2proto(family)); + if (!inner_mode) { + err = -EAFNOSUPPORT; + dst_release(dst); + goto put_states; + } + } else + inner_mode = xfrm[i]->inner_mode; + if (!dst_prev) dst0 = dst1; else { @@ -1464,7 +1476,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->lastuse = now; dst1->input = dst_discard; - dst1->output = xfrm[i]->outer_mode->afinfo->output; + dst1->output = inner_mode->afinfo->output; dst1->next = dst_prev; dst_prev = dst1; -- cgit v1.1 From 6fa5ddcc675b937f94d05628e8997c07a80c6cb9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 May 2011 19:43:05 +0000 Subject: xfrm: Don't allow esn with disabled anti replay detection Unlike the standard case, disabled anti replay detection needs some nontrivial extra treatment on ESN. RFC 4303 states: Note: If a receiver chooses to not enable anti-replay for an SA, then the receiver SHOULD NOT negotiate ESN in an SA management protocol. Use of ESN creates a need for the receiver to manage the anti-replay window (in order to determine the correct value for the high-order bits of the ESN, which are employed in the ICV computation), which is generally contrary to the notion of disabling anti-replay for an SA. So return an error if an ESN state with disabled anti replay detection is inserted for now and add the extra treatment later if we need it. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_replay.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index e8a7814..47f1b86 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -535,6 +535,9 @@ int xfrm_init_replay(struct xfrm_state *x) replay_esn->bmp_len * sizeof(__u32) * 8) return -EINVAL; + if ((x->props.flags & XFRM_STATE_ESN) && replay_esn->replay_window == 0) + return -EINVAL; + if ((x->props.flags & XFRM_STATE_ESN) && x->replay_esn) x->repl = &xfrm_replay_esn; else -- cgit v1.1