From c936e8bd1de2fa50c49e3df6fa5036bf07870b67 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 31 May 2010 16:41:09 +0200 Subject: netfilter: don't xt_jumpstack_alloc twice in xt_register_table In xt_register_table, xt_jumpstack_alloc is called first, later xt_replace_table is used. But in xt_replace_table, xt_jumpstack_alloc will be used again. Then the memory allocated by previous xt_jumpstack_alloc will be leaked. We can simply remove the previous xt_jumpstack_alloc because there aren't any users of newinfo between xt_jumpstack_alloc and xt_replace_table. Signed-off-by: Xiaotian Feng Cc: Patrick McHardy Cc: "David S. Miller" Cc: Jan Engelhardt Cc: Andrew Morton Cc: Rusty Russell Cc: Alexey Dobriyan Acked-By: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 445de70..47b1e79 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -844,10 +844,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table_info *private; struct xt_table *t, *table; - ret = xt_jumpstack_alloc(newinfo); - if (ret < 0) - return ERR_PTR(ret); - /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); if (!table) { -- cgit v1.1 From 7489aec8eed4f2f1eb3b4d35763bd3ea30b32ef5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 May 2010 16:41:35 +0200 Subject: netfilter: xtables: stackptr should be percpu commit f3c5c1bfd4 (netfilter: xtables: make ip_tables reentrant) introduced a performance regression, because stackptr array is shared by all cpus, adding cache line ping pongs. (16 cpus share a 64 bytes cache line) Fix this using alloc_percpu() Signed-off-by: Eric Dumazet Acked-By: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- net/netfilter/x_tables.c | 13 +++---------- 3 files changed, 5 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 63958f3..4b6c5ca 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -336,7 +336,7 @@ ipt_do_table(struct sk_buff *skb, cpu = smp_processor_id(); table_base = private->entries[cpu]; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; - stackptr = &private->stackptr[cpu]; + stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 6f517bd..9d2d68f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -363,7 +363,7 @@ ip6t_do_table(struct sk_buff *skb, cpu = smp_processor_id(); table_base = private->entries[cpu]; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; - stackptr = &private->stackptr[cpu]; + stackptr = per_cpu_ptr(private->stackptr, cpu); origptr = *stackptr; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 47b1e79..e34622f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -699,10 +699,8 @@ void xt_free_table_info(struct xt_table_info *info) vfree(info->jumpstack); else kfree(info->jumpstack); - if (sizeof(unsigned int) * nr_cpu_ids > PAGE_SIZE) - vfree(info->stackptr); - else - kfree(info->stackptr); + + free_percpu(info->stackptr); kfree(info); } @@ -753,14 +751,9 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) unsigned int size; int cpu; - size = sizeof(unsigned int) * nr_cpu_ids; - if (size > PAGE_SIZE) - i->stackptr = vmalloc(size); - else - i->stackptr = kmalloc(size, GFP_KERNEL); + i->stackptr = alloc_percpu(unsigned int); if (i->stackptr == NULL) return -ENOMEM; - memset(i->stackptr, 0, size); size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) -- cgit v1.1 From b1faf5666438090a4dc4fceac8502edc7788b7e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 May 2010 23:44:05 -0700 Subject: net: sock_queue_err_skb() dont mess with sk_forward_alloc Correct sk_forward_alloc handling for error_queue would need to use a backlog of frames that softirq handler could not deliver because socket is owned by user thread. Or extend backlog processing to be able to process normal and error packets. Another possibility is to not use mem charge for error queue, this is what I implemented in this patch. Note: this reverts commit 29030374 (net: fix sk_forward_alloc corruptions), since we dont need to lock socket anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 30 ++++++++++++++++++++++++++++-- net/ipv4/udp.c | 6 ++---- net/ipv6/udp.c | 6 ++---- 3 files changed, 32 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4e7ac09..9f07e74 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2965,6 +2965,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) } EXPORT_SYMBOL_GPL(skb_cow_data); +static void sock_rmem_free(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +} + +/* + * Note: We dont mem charge error packets (no sk_forward_alloc changes) + */ +int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) +{ + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) + return -ENOMEM; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_rmem_free; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + + skb_queue_tail(&sk->sk_error_queue, skb); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + return 0; +} +EXPORT_SYMBOL(sock_queue_err_skb); + void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps) { @@ -2997,9 +3025,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - bh_lock_sock(sk); err = sock_queue_err_skb(sk, skb); - bh_unlock_sock(sk); if (err) kfree_skb(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 50678f9..eec4ff4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -633,11 +633,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) if (!inet->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; - } else { - bh_lock_sock(sk); + } else ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); - bh_unlock_sock(sk); - } + sk->sk_err = err; sk->sk_error_report(sk); out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3048f90..87be586 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -466,11 +466,9 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) goto out; - if (np->recverr) { - bh_lock_sock(sk); + if (np->recverr) ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); - bh_unlock_sock(sk); - } + sk->sk_err = err; sk->sk_error_report(sk); out: -- cgit v1.1 From 288fcee8b7aa98796d96cd5b1b2e8005639328bf Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 31 May 2010 23:48:19 -0700 Subject: net/ipv4/tcp_input.c: fix compilation breakage when FASTRETRANS_DEBUG > 1 Commit: c720c7e8383aff1cb219bddf474ed89d850336e3 missed these. Signed-off-by: Joe Perches Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e6dafc..548d575 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2639,7 +2639,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) if (sk->sk_family == AF_INET) { printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", msg, - &inet->daddr, ntohs(inet->dport), + &inet->inet_daddr, ntohs(inet->inet_dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); @@ -2649,7 +2649,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) struct ipv6_pinfo *np = inet6_sk(sk); printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, - &np->daddr, ntohs(inet->dport), + &np->daddr, ntohs(inet->inet_dport), tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); -- cgit v1.1 From 8ae5977ff95c03fe6c36a5721c57dcb4bfe4f290 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 30 May 2010 14:52:58 +0200 Subject: mac80211: fix blockack-req processing Daniel reported that the paged RX changes had broken blockack request frame processing due to using data that wasn't really part of the skb data. Fix this using skb_copy_bits() for the needed data. As a side effect, this adds a check on processing too short frames, which previously this code could do. Reported-by: Daniel Halperin Signed-off-by: Johannes Berg Acked-by: Daniel Halperin Signed-off-by: John W. Linville --- net/mac80211/rx.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6e2a7bc..5e0b654 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1818,17 +1818,26 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) return RX_CONTINUE; if (ieee80211_is_back_req(bar->frame_control)) { + struct { + __le16 control, start_seq_num; + } __packed bar_data; + if (!rx->sta) return RX_DROP_MONITOR; + + if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control), + &bar_data, sizeof(bar_data))) + return RX_DROP_MONITOR; + spin_lock(&rx->sta->lock); - tid = le16_to_cpu(bar->control) >> 12; + tid = le16_to_cpu(bar_data.control) >> 12; if (!rx->sta->ampdu_mlme.tid_active_rx[tid]) { spin_unlock(&rx->sta->lock); return RX_DROP_MONITOR; } tid_agg_rx = rx->sta->ampdu_mlme.tid_rx[tid]; - start_seq_num = le16_to_cpu(bar->start_seq_num) >> 4; + start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; /* reset session timer */ if (tid_agg_rx->timeout) -- cgit v1.1 From 51a0d38de26226f2779912d92f155b93d539da9a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 31 May 2010 12:00:12 +0200 Subject: mac80211: fix dialog token allocator The dialog token allocator has apparently been broken since b83f4e15 ("mac80211: fix deadlock in sta->lock") because it got moved out under the spinlock. Fix it. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/agg-tx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index c163d0a..98258b7 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -332,14 +332,16 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid) IEEE80211_QUEUE_STOP_REASON_AGGREGATION); spin_unlock(&local->ampdu_lock); - spin_unlock_bh(&sta->lock); - /* send an addBA request */ + /* prepare tid data */ sta->ampdu_mlme.dialog_token_allocator++; sta->ampdu_mlme.tid_tx[tid]->dialog_token = sta->ampdu_mlme.dialog_token_allocator; sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num; + spin_unlock_bh(&sta->lock); + + /* send AddBA request */ ieee80211_send_addba_request(sdata, pubsta->addr, tid, sta->ampdu_mlme.tid_tx[tid]->dialog_token, sta->ampdu_mlme.tid_tx[tid]->ssn, -- cgit v1.1 From fafeeb6c80e3842c6dc19d05de09a23f23eef0d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Jun 2010 10:04:49 +0000 Subject: xfrm: force a dst reference in __xfrm_route_forward() Packets going through __xfrm_route_forward() have a not refcounted dst entry, since we enabled a noref forwarding path. xfrm_lookup() might incorrectly release this dst entry. It's a bit late to make invasive changes in xfrm_lookup(), so lets force a refcount in this path. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d965a2b..4bf27d9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2153,6 +2153,7 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) return 0; } + skb_dst_force(skb); dst = skb_dst(skb); res = xfrm_lookup(net, &dst, &fl, NULL, 0) == 0; -- cgit v1.1 From 194dbcc8a1a97cbac9a619a563e5f6b7f7d5a485 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 12 May 2010 21:31:06 +0000 Subject: net: init_vlan should not copy slave or master flags The vlan device should not copy the slave or master flags from the real device. It is not in the bond until added nor is it a master. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 55be908..5298426 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -708,7 +708,8 @@ static int vlan_dev_init(struct net_device *dev) netif_carrier_off(dev); /* IFF_BROADCAST|IFF_MULTICAST; ??? */ - dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI); + dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | + IFF_MASTER | IFF_SLAVE); dev->iflink = real_dev->ifindex; dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))) | -- cgit v1.1 From 2df4a0fa1540c460ec69788ab2a901cc72a75644 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 12 May 2010 21:31:11 +0000 Subject: net: fix conflict between null_or_orig and null_or_bond If a skb is received on an inactive bond that does not meet the special cases checked for by skb_bond_should_drop it should only be delivered to exact matches as the comment in netif_receive_skb() says. However because null_or_bond could also be null this is not always true. This patch renames null_or_bond to orig_or_bond and initializes it to orig_dev. This keeps the intent of null_or_bond to pass frames received on VLAN interfaces stacked on bonding interfaces without invalidating the statement for null_or_orig. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1845b08..d03470f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2795,7 +2795,7 @@ static int __netif_receive_skb(struct sk_buff *skb) struct net_device *orig_dev; struct net_device *master; struct net_device *null_or_orig; - struct net_device *null_or_bond; + struct net_device *orig_or_bond; int ret = NET_RX_DROP; __be16 type; @@ -2868,10 +2868,10 @@ ncls: * device that may have registered for a specific ptype. The * handler may have to adjust skb->dev and orig_dev. */ - null_or_bond = NULL; + orig_or_bond = orig_dev; if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) { - null_or_bond = vlan_dev_real_dev(skb->dev); + orig_or_bond = vlan_dev_real_dev(skb->dev); } type = skb->protocol; @@ -2879,7 +2879,7 @@ ncls: &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { if (ptype->type == type && (ptype->dev == null_or_orig || ptype->dev == skb->dev || ptype->dev == orig_dev || - ptype->dev == null_or_bond)) { + ptype->dev == orig_or_bond)) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; -- cgit v1.1 From 33c29dde7d04dc0ec0edb649d20ccf1351c13a06 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 29 May 2010 14:26:59 +0000 Subject: act_nat: fix the wrong checksum when addr isn't in old_addr/mask fix the wrong checksum when addr isn't in old_addr/mask For TCP and UDP packets, when addr isn't in old_addr/mask we don't do SNAT or DNAT, and we should not update layer 4 checksum. Signed-off-by: Changli Gao ---- net/sched/act_nat.c | 4 ++++ 1 file changed, 4 insertions(+) Signed-off-by: David S. Miller --- net/sched/act_nat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index d885ba3..5709494 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -159,6 +159,9 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a, iph->daddr = new_addr; csum_replace4(&iph->check, addr, new_addr); + } else if ((iph->frag_off & htons(IP_OFFSET)) || + iph->protocol != IPPROTO_ICMP) { + goto out; } ihl = iph->ihl * 4; @@ -247,6 +250,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a, break; } +out: return action; drop: -- cgit v1.1 From edafe502404f3669d364b6e96d79b54067b634b4 Mon Sep 17 00:00:00 2001 From: Daniele Lacamera Date: Wed, 2 Jun 2010 02:02:04 +0000 Subject: TCP: tcp_hybla: Fix integer overflow in slow start increment For large values of rtt, 2^rho operation may overflow u32. Clamp down the increment to 2^16. Signed-off-by: Daniele Lacamera Signed-off-by: David S. Miller --- net/ipv4/tcp_hybla.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index c209e05..377bc93 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -126,8 +126,8 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * calculate 2^fract in a <<7 value. */ is_slowstart = 1; - increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) - - 128; + increment = ((1 << min(ca->rho, 16U)) * + hybla_fraction(rho_fractions)) - 128; } else { /* * congestion avoidance -- cgit v1.1 From fbc2e7d9cf49e0bf89b9e91fd60a06851a855c5d Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Wed, 2 Jun 2010 07:32:42 -0700 Subject: cls_u32: use skb_header_pointer() to dereference data safely use skb_header_pointer() to dereference data safely the original skb->data dereference isn't safe, as there isn't any skb->len or skb_is_nonlinear() check. skb_header_pointer() is used instead in this patch. And when the skb isn't long enough, we terminate the function u32_classify() immediately with -1. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 9627542..4f52214 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -98,11 +98,11 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re { struct { struct tc_u_knode *knode; - u8 *ptr; + unsigned int off; } stack[TC_U32_MAXDEPTH]; struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; - u8 *ptr = skb_network_header(skb); + unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; int off2 = 0; @@ -134,8 +134,14 @@ next_knode: #endif for (i = n->sel.nkeys; i>0; i--, key++) { - - if ((*(__be32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + unsigned int toff; + __be32 *data, _data; + + toff = off + key->off + (off2 & key->offmask); + data = skb_header_pointer(skb, toff, 4, &_data); + if (!data) + goto out; + if ((*data ^ key->val) & key->mask) { n = n->next; goto next_knode; } @@ -174,29 +180,45 @@ check_terminal: if (sdepth >= TC_U32_MAXDEPTH) goto deadloop; stack[sdepth].knode = n; - stack[sdepth].ptr = ptr; + stack[sdepth].off = off; sdepth++; ht = n->ht_down; sel = 0; - if (ht->divisor) - sel = ht->divisor&u32_hash_fold(*(__be32*)(ptr+n->sel.hoff), &n->sel,n->fshift); - + if (ht->divisor) { + __be32 *data, _data; + + data = skb_header_pointer(skb, off + n->sel.hoff, 4, + &_data); + if (!data) + goto out; + sel = ht->divisor & u32_hash_fold(*data, &n->sel, + n->fshift); + } if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) goto next_ht; if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { off2 = n->sel.off + 3; - if (n->sel.flags&TC_U32_VAROFFSET) - off2 += ntohs(n->sel.offmask & *(__be16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + if (n->sel.flags & TC_U32_VAROFFSET) { + __be16 *data, _data; + + data = skb_header_pointer(skb, + off + n->sel.offoff, + 2, &_data); + if (!data) + goto out; + off2 += ntohs(n->sel.offmask & *data) >> + n->sel.offshift; + } off2 &= ~3; } if (n->sel.flags&TC_U32_EAT) { - ptr += off2; + off += off2; off2 = 0; } - if (ptr < skb_tail_pointer(skb)) + if (off < skb->len) goto next_ht; } @@ -204,9 +226,10 @@ check_terminal: if (sdepth--) { n = stack[sdepth].knode; ht = n->ht_up; - ptr = stack[sdepth].ptr; + off = stack[sdepth].off; goto check_terminal; } +out: return -1; deadloop: -- cgit v1.1 From db2c24175d149b55784f7cb2c303622ce962c1ae Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Wed, 2 Jun 2010 04:55:02 +0000 Subject: act_pedit: access skb->data safely access skb->data safely we should use skb_header_pointer() and skb_store_bits() to access skb->data to handle small or non-linear skbs. Signed-off-by: Changli Gao ---- net/sched/act_pedit.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index fdbd0b7..50e3d94 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -125,7 +125,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a, { struct tcf_pedit *p = a->priv; int i, munged = 0; - u8 *pptr; + unsigned int off; if (!(skb->tc_verd & TC_OK2MUNGE)) { /* should we set skb->cloned? */ @@ -134,7 +134,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a, } } - pptr = skb_network_header(skb); + off = skb_network_offset(skb); spin_lock(&p->tcf_lock); @@ -144,17 +144,17 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a, struct tc_pedit_key *tkey = p->tcfp_keys; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { - u32 *ptr; + u32 *ptr, _data; int offset = tkey->off; if (tkey->offmask) { - if (skb->len > tkey->at) { - char *j = pptr + tkey->at; - offset += ((*j & tkey->offmask) >> - tkey->shift); - } else { + char *d, _d; + + d = skb_header_pointer(skb, off + tkey->at, 1, + &_d); + if (!d) goto bad; - } + offset += (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { @@ -169,9 +169,13 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a, goto bad; } - ptr = (u32 *)(pptr+offset); + ptr = skb_header_pointer(skb, off + offset, 4, &_data); + if (!ptr) + goto bad; /* just do it, baby */ *ptr = ((*ptr & tkey->mask) ^ tkey->val); + if (ptr == &_data) + skb_store_bits(skb, off + offset, ptr, 4); munged++; } -- cgit v1.1 From 8764ab2ca7ab5055e1ca80f9cfa4970c34acb804 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 4 Jun 2010 01:57:38 +0000 Subject: net: check for refcount if pop a stacked dst_entry xfrm triggers a warning if dst_pop() drops a refcount on a noref dst. This patch changes dst_pop() to skb_dst_pop(). skb_dst_pop() drops the refcnt only on a refcounted dst. Also we don't clone the child dst_entry, so it is not refcounted and we can use skb_dst_set_noref() in xfrm_output_one(). Signed-off-by: Steffen Klassert Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/xfrm/xfrm_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 6a32915..a3cca0a 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -95,13 +95,13 @@ resume: goto error_nolock; } - dst = dst_pop(dst); + dst = skb_dst_pop(skb); if (!dst) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); err = -EHOSTUNREACH; goto error_nolock; } - skb_dst_set(skb, dst); + skb_dst_set_noref(skb, dst); x = dst->xfrm; } while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)); -- cgit v1.1 From 57f1553ee5d9f093660cc49098f494e17ed11668 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 3 Jun 2010 00:42:30 +0000 Subject: syncookies: remove Kconfig text line about disabled-by-default syncookies default to on since e994b7c901ded7200b525a707c6da71f2cf6d4bb (tcp: Don't make syn cookies initial setting depend on CONFIG_SYSCTL). Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 8e3a1fd..7c3a7d1 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -303,7 +303,7 @@ config ARPD If unsure, say N. config SYN_COOKIES - bool "IP: TCP syncookie support (disabled per default)" + bool "IP: TCP syncookie support" ---help--- Normal TCP/IP networking is open to an attack known as "SYN flooding". This denial-of-service attack prevents legitimate remote @@ -328,13 +328,13 @@ config SYN_COOKIES server is really overloaded. If this happens frequently better turn them off. - If you say Y here, note that SYN cookies aren't enabled by default; - you can enable them by saying Y to "/proc file system support" and + If you say Y here, you can disable SYN cookies at run time by + saying Y to "/proc file system support" and "Sysctl support" below and executing the command - echo 1 >/proc/sys/net/ipv4/tcp_syncookies + echo 0 > /proc/sys/net/ipv4/tcp_syncookies - at boot time after the /proc file system has been mounted. + after the /proc file system has been mounted. If unsure, say N. -- cgit v1.1 From ca55158c6ecb7832a6ad80ac44a14d23bab8cdfc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Jun 2010 09:03:58 +0000 Subject: rps: tcp: fix rps_sock_flow_table table updates I believe a moderate SYN flood attack can corrupt RFS flow table (rps_sock_flow_table), making RPS/RFS much less effective. Even in a normal situation, server handling short lived sessions suffer from bad steering for the first data packet of a session, if another SYN packet is received for another session. We do following action in tcp_v4_rcv() : sock_rps_save_rxhash(sk, skb->rxhash); We should _not_ do this if sk is a LISTEN socket, as about each packet received on a LISTEN socket has a different rxhash than previous one. -> RPS_NO_CPU markers are spread all over rps_sock_flow_table. Also, it makes sense to protect sk->rxhash field changes with socket lock (We currently can change it even if user thread owns the lock and might use rxhash) This patch moves sock_rps_save_rxhash() to a sock locked section, and only for non LISTEN sockets. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 202cf09..fe193e5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1555,6 +1555,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + sock_rps_save_rxhash(sk, skb->rxhash); TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; @@ -1579,7 +1580,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } return 0; } - } + } else + sock_rps_save_rxhash(sk, skb->rxhash); + TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { @@ -1672,8 +1675,6 @@ process: skb->dev = NULL; - sock_rps_save_rxhash(sk, skb->rxhash); - bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -- cgit v1.1 From c44649216522cd607a4027d2ebf4a8147d3fa94c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Jun 2010 05:45:47 +0000 Subject: tcp: use correct net ns in cookie_v4_check() Its better to make a route lookup in appropriate namespace. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 5c24db4..9f6b222 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -347,7 +347,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, { .sport = th->dest, .dport = th->source } } }; security_req_classify_flow(req, &fl); - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(sock_net(sk), &rt, &fl)) { reqsk_free(req); goto out; } -- cgit v1.1 From 72e09ad107e78d69ff4d3b97a69f0aad2b77280f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 5 Jun 2010 03:03:30 -0700 Subject: ipv6: avoid high order allocations With mtu=9000, mld_newpack() use order-2 GFP_ATOMIC allocations, that are very unreliable, on machines where PAGE_SIZE=4K Limit allocated skbs to be at most one page. (order-0 allocations) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 59f1881..ab1622d 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1356,7 +1356,10 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size) IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - skb = sock_alloc_send_skb(sk, size + LL_ALLOCATED_SPACE(dev), 1, &err); + size += LL_ALLOCATED_SPACE(dev); + /* limit our allocations to order-0 page */ + size = min_t(int, size, SKB_MAX_ORDER(0, 0)); + skb = sock_alloc_send_skb(sk, size, 1, &err); if (!skb) return NULL; -- cgit v1.1 From 8ffb335e8d696affc04f963bf73ce2196f80edb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 6 Jun 2010 15:34:40 -0700 Subject: ip6mr: fix a typo in ip6mr_for_each_table() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 073071f..89c0b07 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -120,7 +120,7 @@ static void mroute_clean_tables(struct mr6_table *mrt); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES -#define ip6mr_for_each_table(mrt, met) \ +#define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list) static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) -- cgit v1.1