From d48102007d068df7ba3055cdc1723e12db1ba30f Mon Sep 17 00:00:00 2001
From: Evgeniy Polyakov
Date: Wed, 18 May 2005 22:51:45 -0700
Subject: [XFRM]: skb_cow_data() does not set proper owner for new skbs.

It looks like skb_cow_data() does not set the proper owner for newly
created skbs. If we have several fragments for an skb and some of them
are shared(?) or cloned (like in async IPsec), there might be a
situation where we require recreating the skb and thus using skb_copy()
for it. The newly created skb has neither a destructor nor a socket
associated with it, which must be copied from the old skb. As far as I
can see, the current code sets the destructor and socket for the first
skb only, and uses the truesize of the first skb only to increment the
sk_wmem_alloc value.

If the above "analysis" is correct then the attached patch fixes that.

Signed-off-by: Evgeniy Polyakov
Acked-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/xfrm/xfrm_algo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 080aae2..2f4531f 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -698,7 +698,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 			return -ENOMEM;
 
 		if (skb1->sk)
-			skb_set_owner_w(skb, skb1->sk);
+			skb_set_owner_w(skb2, skb1->sk);
 
 		/* Looking around. Are we still alive?
 		 * OK, link new skb, drop old one */
-- cgit v1.1

From 2fdba6b085eb7068e9594cfa55ffe40466184b4d Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Wed, 18 May 2005 22:52:33 -0700
Subject: [IPV4/IPV6] Ensure all frag_list members have NULL sk

Having frag_list members which hold wmem of an sk leads to nightmares
with partially cloned frag skb's. The reason is that once you unleash
an skb with a frag_list that has individual sk ownerships into the
stack, you can never undo those ownerships safely as they may have been
cloned by things like netfilter. Since we have to undo them in order to
make skb_linearize happy, this approach leads to a dead end.

So let's go the other way and make this an invariant:

	For any skb on a frag_list, skb->sk must be NULL.

That is, the socket ownership always belongs to the head skb. It turns
out that the implementation is actually pretty simple.

The above invariant is violated in the following patch for a short
duration inside ip_fragment. This is OK because the offending frag_list
member is either destroyed at the end of the slow path without being
sent anywhere, or it is detached from the frag_list before being sent.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/ipv4/ip_output.c  |  8 ++++++++
 net/ipv6/ip6_output.c | 14 ++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index daebd93..760dc82 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -490,6 +490,14 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 			/* Partially cloned skb? */
 			if (skb_shared(frag))
 				goto slow_path;
+
+			BUG_ON(frag->sk);
+			if (skb->sk) {
+				sock_hold(skb->sk);
+				frag->sk = skb->sk;
+				frag->destructor = sock_wfree;
+				skb->truesize -= frag->truesize;
+			}
 		}
 
 		/* Everything is OK. Generate! */
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0f07114..b78a535 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -552,13 +552,17 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		    skb_headroom(frag) < hlen)
 			goto slow_path;
 
-		/* Correct socket ownership. */
-		if (frag->sk == NULL)
-			goto slow_path;
-
 		/* Partially cloned skb? */
 		if (skb_shared(frag))
 			goto slow_path;
+
+		BUG_ON(frag->sk);
+		if (skb->sk) {
+			sock_hold(skb->sk);
+			frag->sk = skb->sk;
+			frag->destructor = sock_wfree;
+			skb->truesize -= frag->truesize;
+		}
 	}
 
 	err = 0;
@@ -1116,12 +1120,10 @@ int ip6_push_pending_frames(struct sock *sk)
 		tail_skb = &(tmp_skb->next);
 		skb->len += tmp_skb->len;
 		skb->data_len += tmp_skb->len;
-#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
 		skb->truesize += tmp_skb->truesize;
 		__sock_put(tmp_skb->sk);
 		tmp_skb->destructor = NULL;
 		tmp_skb->sk = NULL;
-#endif
 	}
 
 	ipv6_addr_copy(final_dst, &fl->fl6_dst);
-- cgit v1.1
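Both fixes above hinge on how write ownership is accounted. For
reference, skb_set_owner_w() of that era looked roughly like the sketch
below (simplified from include/net/sock.h; shown for illustration, not
part of either patch): it charges skb->truesize against the socket's
sk_wmem_alloc and installs sock_wfree() to undo the charge when the skb
is freed.

	static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
	{
		sock_hold(sk);			/* pin the socket */
		skb->sk = sk;
		skb->destructor = sock_wfree;	/* uncharges sk_wmem_alloc on free */
		atomic_add(skb->truesize, &sk->sk_wmem_alloc);
	}

Passing the old skb instead of the freshly allocated skb2 therefore
left the copy with no destructor and no socket while the head skb was
charged a second time; that is the imbalance the one-liner corrects,
and the frag_list invariant in the second patch keeps such
per-fragment ownership from arising in the first place.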
From f81a0bffa116ea22149aa7cfb0b4ee09096d9d92 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 19 May 2005 12:26:43 -0700
Subject: [AF_UNIX]: Use lookup_create().

Currently it open-codes it, but that's in the way of changing the
lookup_hash interface. I'd prefer to disallow modular af_unix over
exporting lookup_create, but I'll leave that to you.

Signed-off-by: Christoph Hellwig
Signed-off-by: David S. Miller
---
 net/unix/af_unix.c | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c478fc8..c420eba 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -770,33 +770,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
 		if (err)
 			goto out_mknod_parent;
-		/*
-		 * Yucky last component or no last component at all?
-		 * (foo/., foo/.., /////)
-		 */
-		err = -EEXIST;
-		if (nd.last_type != LAST_NORM)
-			goto out_mknod;
-		/*
-		 * Lock the directory.
-		 */
-		down(&nd.dentry->d_inode->i_sem);
-		/*
-		 * Do the final lookup.
-		 */
-		dentry = lookup_hash(&nd.last, nd.dentry);
+
+		dentry = lookup_create(&nd, 0);
 		err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_mknod_unlock;
-		err = -ENOENT;
-		/*
-		 * Special case - lookup gave negative, but... we had foo/bar/
-		 * From the vfs_mknod() POV we just have a negative dentry -
-		 * all is fine. Let's be bastards - you had / on the end, you've
-		 * been asking for (non-existent) directory. -ENOENT for you.
-		 */
-		if (nd.last.name[nd.last.len] && !dentry->d_inode)
-			goto out_mknod_dput;
+
 		/*
 		 * All right, let's create it.
 		 */
@@ -845,7 +824,6 @@ out_mknod_dput:
 	dput(dentry);
 out_mknod_unlock:
 	up(&nd.dentry->d_inode->i_sem);
-out_mknod:
 	path_release(&nd);
 out_mknod_parent:
 	if (err==-EEXIST)
-- cgit v1.1
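The conversion is easier to review with lookup_create() in front of
you. Roughly, as it stood in fs/namei.c at the time (a sketch, for
illustration only; not authoritative):

	struct dentry *lookup_create(struct nameidata *nd, int is_dir)
	{
		struct dentry *dentry;

		down(&nd->dentry->d_inode->i_sem);	/* lock the parent */
		dentry = ERR_PTR(-EEXIST);
		if (nd->last_type != LAST_NORM)		/* foo/., foo/.., ///// */
			goto fail;
		dentry = lookup_hash(&nd->last, nd->dentry);
		if (IS_ERR(dentry))
			goto fail;
		/* trailing slash on a name to be created: -ENOENT */
		if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
			goto enoent;
		return dentry;
	enoent:
		dput(dentry);
		dentry = ERR_PTR(-ENOENT);
	fail:
		return dentry;
	}

Note that it returns with the parent's i_sem held even on error, which
is why the failure path in unix_bind() still goes through
out_mknod_unlock.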
Miller" Date: Thu, 19 May 2005 12:36:33 -0700 Subject: [NETFILTER]: Do not be clever about SKB ownership in ip_ct_gather_frags(). Just do an skb_orphan() and be done with it. Based upon discussions with Herbert Xu on netdev. Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 28d9425..09e8246 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -940,37 +940,25 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) { - struct sock *sk = skb->sk; #ifdef CONFIG_NETFILTER_DEBUG unsigned int olddebug = skb->nf_debug; #endif - if (sk) { - sock_hold(sk); - skb_orphan(skb); - } + skb_orphan(skb); local_bh_disable(); skb = ip_defrag(skb, user); local_bh_enable(); - if (!skb) { - if (sk) - sock_put(sk); - return skb; - } - - if (sk) { - skb_set_owner_w(skb, sk); - sock_put(sk); - } - - ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; + if (skb) { + ip_send_check(skb->nh.iph); + skb->nfcache |= NFC_ALTERED; #ifdef CONFIG_NETFILTER_DEBUG - /* Packet path as if nothing had happened. */ - skb->nf_debug = olddebug; + /* Packet path as if nothing had happened. */ + skb->nf_debug = olddebug; #endif + } + return skb; } -- cgit v1.1 From b9e9dead05b19e7f52c9aa00cd3a5b7ac4fcacf4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 May 2005 12:39:04 -0700 Subject: [IPSEC]: Fixed alg_key_len usage in attach_one_algo The variable alg_key_len is in bits and not bytes. The function attach_one_algo is currently using it as if it were in bytes. This causes it to read memory which may not be there. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5ddda2c..15ba086 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -162,6 +162,7 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, struct rtattr *rta = u_arg; struct xfrm_algo *p, *ualg; struct xfrm_algo_desc *algo; + int len; if (!rta) return 0; @@ -173,11 +174,12 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, return -ENOSYS; *props = algo->desc.sadb_alg_id; - p = kmalloc(sizeof(*ualg) + ualg->alg_key_len, GFP_KERNEL); + len = sizeof(*ualg) + (ualg->alg_key_len + 7U) / 8; + p = kmalloc(len, GFP_KERNEL); if (!p) return -ENOMEM; - memcpy(p, ualg, sizeof(*ualg) + ualg->alg_key_len); + memcpy(p, ualg, len); *algpp = p; return 0; } -- cgit v1.1 From 31c26852cb2ac77f1d4acb37bcf31f165fd5eb68 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 May 2005 12:39:49 -0700 Subject: [IPSEC]: Verify key payload in verify_one_algo We need to verify that the payload contains enough data so that attach_one_algo can copy alg_key_len bits from the payload. Signed-off-by: Herbert Xu Signed-off-by: David S. 
From 31c26852cb2ac77f1d4acb37bcf31f165fd5eb68 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Thu, 19 May 2005 12:39:49 -0700
Subject: [IPSEC]: Verify key payload in verify_one_algo

We need to verify that the payload contains enough data so that
attach_one_algo can copy alg_key_len bits from the payload.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/xfrm/xfrm_user.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 15ba086..9750901 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -34,14 +34,21 @@ static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type)
 {
 	struct rtattr *rt = xfrma[type - 1];
 	struct xfrm_algo *algp;
+	int len;
 
 	if (!rt)
 		return 0;
 
-	if ((rt->rta_len - sizeof(*rt)) < sizeof(*algp))
+	len = (rt->rta_len - sizeof(*rt)) - sizeof(*algp);
+	if (len < 0)
 		return -EINVAL;
 
 	algp = RTA_DATA(rt);
+
+	len -= (algp->alg_key_len + 7U) / 8;
+	if (len < 0)
+		return -EINVAL;
+
 	switch (type) {
 	case XFRMA_ALG_AUTH:
 		if (!algp->alg_key_len &&
-- cgit v1.1

From db61ecc3352d72513c1b07805bd6f760e30c001b Mon Sep 17 00:00:00 2001
From: "Tommy S. Christensen"
Date: Thu, 19 May 2005 12:46:59 -0700
Subject: [NETLINK]: Fix race with recvmsg().

This bug causes:

	assertion (!atomic_read(&sk->sk_rmem_alloc)) failed
	at net/netlink/af_netlink.c (122)

What's happening is that:

1) The skb is sent to socket 1.
2) Someone does a recvmsg on socket 1 and drops the ref on the skb.
   Note that the rmem_alloc charge is not returned at this point,
   since the skb is still referenced.
3) The same skb is now sent to socket 2.

This version of the fix resurrects the skb_orphan call that was moved
out last time we had 'shared-skb troubles'. It is practically a no-op
in the common case, but still prevents the possible race with recvmsg.

Signed-off-by: Tommy S. Christensen
Acked-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/netlink/af_netlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 733bf52..b40c9a9 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -697,6 +697,7 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
 	    !test_bit(0, &nlk->state)) {
+		skb_orphan(skb);
 		skb_set_owner_r(skb, sk);
 		skb_queue_tail(&sk->sk_receive_queue, skb);
 		sk->sk_data_ready(sk, skb->len);
-- cgit v1.1
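The race is easiest to see next to the two helpers involved. Roughly,
as they stood in the headers then (sketches, slightly simplified, for
illustration):

	static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
	{
		skb->sk = sk;
		skb->destructor = sock_rfree;	/* uncharges sk_rmem_alloc */
		atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	}

	static inline void skb_orphan(struct sk_buff *skb)
	{
		if (skb->destructor)
			skb->destructor(skb);	/* settle the old owner's account */
		skb->destructor = NULL;
		skb->sk = NULL;
	}

Without the skb_orphan() call, delivering the same skb to socket 2
overwrites the destructor while socket 1's sk_rmem_alloc charge is
still outstanding; that unreturned charge is what the assertion at
af_netlink.c line 122 catches.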
From 68acc024ea7391e03c2c695ba0b9fb31baa974bf Mon Sep 17 00:00:00 2001
From: "Tommy S. Christensen"
Date: Thu, 19 May 2005 13:06:35 -0700
Subject: [NETLINK]: Move broadcast skb_orphan to the skb_get path.

Cloned packets don't need the orphan call.

Signed-off-by: Tommy S. Christensen
Acked-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/netlink/af_netlink.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b40c9a9..4b91f4b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -697,7 +697,6 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
 	    !test_bit(0, &nlk->state)) {
-		skb_orphan(skb);
 		skb_set_owner_r(skb, sk);
 		skb_queue_tail(&sk->sk_receive_queue, skb);
 		sk->sk_data_ready(sk, skb->len);
@@ -736,11 +735,15 @@ static inline int do_one_broadcast(struct sock *sk,
 	sock_hold(sk);
 	if (p->skb2 == NULL) {
-		if (atomic_read(&p->skb->users) != 1) {
+		if (skb_shared(p->skb)) {
 			p->skb2 = skb_clone(p->skb, p->allocation);
 		} else {
-			p->skb2 = p->skb;
-			atomic_inc(&p->skb->users);
+			p->skb2 = skb_get(p->skb);
+			/*
+			 * skb ownership may have been set when
+			 * delivered to a previous socket.
+			 */
+			skb_orphan(p->skb2);
 		}
 	}
 	if (p->skb2 == NULL) {
-- cgit v1.1

From aa1c6a6f7f0518b42994d02756a41cbfdcac1916 Mon Sep 17 00:00:00 2001
From: "Tommy S. Christensen"
Date: Thu, 19 May 2005 13:07:32 -0700
Subject: [NETLINK]: Defer socket destruction a bit

In netlink_broadcast() we're sending shared skb's to netlink listeners
when possible (saves some copying). This is OK, since we hold the only
other reference to the skb.

However, this implies that we must drop our reference on the skb
before allowing a receiving socket to disappear. Otherwise, the socket
buffer accounting is disrupted.

Signed-off-by: Tommy S. Christensen
Acked-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/netlink/af_netlink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 4b91f4b..e41ce45 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -789,11 +789,12 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
 	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
 		do_one_broadcast(sk, &info);
 
+	kfree_skb(skb);
+
 	netlink_unlock_table();
 
 	if (info.skb2)
 		kfree_skb(info.skb2);
-	kfree_skb(skb);
 
 	if (info.delivered) {
 		if (info.congested && (allocation & __GFP_WAIT))
-- cgit v1.1
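The ordering matters more than the one-line move suggests. After the
patch the tail of netlink_broadcast() effectively reads (sketch of the
patched flow, simplified):

	netlink_lock_table();
	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
		do_one_broadcast(sk, &info);	/* receivers take their own refs */

	kfree_skb(skb);		/* drop our ref while the table lock
				 * still pins the receiving sockets */
	netlink_unlock_table();	/* only now may receivers go away */

	if (info.skb2)
		kfree_skb(info.skb2);

If our reference were dropped only after the unlock, a receiving
socket could be torn down while the shared skb still held its
sk_rmem_alloc charge, and sock_rfree() would later credit a socket
that no longer exists.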
From 314324121f9b94b2ca657a494cf2b9cb0e4a28cc Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Mon, 23 May 2005 12:03:06 -0700
Subject: [TCP]: Fix stretch ACK performance killer when doing ucopy.

When we are doing ucopy, we try to defer the ACK generation to
cleanup_rbuf(). This works very well most of the time, but if the
ucopy prequeue is large, this ACKing behavior kills performance.

With TSO, it is possible to fill the prequeue so full that by the time
the ACK is sent and gets back to the sender, most of the window has
emptied of data and performance suffers significantly.

This behavior does help in some cases, so we should think about
re-enabling this trick in the future, using some kind of limit in
order to avoid the bug case.

Signed-off-by: David S. Miller
---
 net/ipv4/tcp_input.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 79835a6..5bad504 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4355,16 +4355,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			goto no_ack;
 		}
 
-		if (eaten) {
-			if (tcp_in_quickack_mode(tp)) {
-				tcp_send_ack(sk);
-			} else {
-				tcp_send_delayed_ack(sk);
-			}
-		} else {
-			__tcp_ack_snd_check(sk, 0);
-		}
-
+		__tcp_ack_snd_check(sk, 0);
 no_ack:
 		if (eaten)
 			__kfree_skb(skb);
-- cgit v1.1

From 180e42503300629692b513daeb55a6bb0b51500c Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Mon, 23 May 2005 13:11:07 -0700
Subject: [IPV6]: Fix xfrm tunnel oops with large packets

Signed-off-by: Herbert Xu
Acked-by: Hideaki YOSHIFUJI
Signed-off-by: David S. Miller
---
 net/ipv6/xfrm6_output.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 601a148..6b98677 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -84,6 +84,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 		mtu = IPV6_MIN_MTU;
 
 	if (skb->len > mtu) {
+		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 		ret = -EMSGSIZE;
 	}
-- cgit v1.1

From 0afb51e72855971dba83b3c6b70c547c2d1161fd Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Thu, 26 May 2005 12:53:49 -0700
Subject: [PKT_SCHED]: netem: reinsert for duplication

Handle duplication of packets in netem by re-inserting at the top of
the qdisc tree. This avoids problems with qlen accounting with nested
qdiscs. The recursion requires no additional locking but will
potentially increase stack depth.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/sched/sch_netem.c | 53 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e0c9fbe..5c0f0c2 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -203,42 +203,47 @@ static int netem_run(struct Qdisc *sch)
 	return 0;
 }
 
+/*
+ * Insert one skb into qdisc.
+ * Note: parent depends on return value to account for queue length.
+ * 	NET_XMIT_DROP: queue length didn't change.
+ *	NET_XMIT_SUCCESS: one skb was queued.
+ */
 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb2;
 	int ret;
+	int count = 1;
 
 	pr_debug("netem_enqueue skb=%p\n", skb);
 
+	/* Random duplication */
+	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+		++count;
+
 	/* Random packet drop 0 => none, ~0 => all */
-	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
-		pr_debug("netem_enqueue: random loss\n");
+	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
+		--count;
+
+	if (count == 0) {
 		sch->qstats.drops++;
 		kfree_skb(skb);
-		return 0;	/* lie about loss so TCP doesn't know */
+		return NET_XMIT_DROP;
 	}
 
-	/* Random duplication */
-	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
-		struct sk_buff *skb2;
-
-		skb2 = skb_clone(skb, GFP_ATOMIC);
-		if (skb2 && netem_delay(sch, skb2) == NET_XMIT_SUCCESS) {
-			struct Qdisc *qp;
-
-			/* Since one packet can generate two packets in the
-			 * queue, the parent's qlen accounting gets confused,
-			 * so fix it.
-			 */
-			qp = qdisc_lookup(sch->dev, TC_H_MAJ(sch->parent));
-			if (qp)
-				qp->q.qlen++;
-
-			sch->q.qlen++;
-			sch->bstats.bytes += skb2->len;
-			sch->bstats.packets++;
-		} else
-			sch->qstats.drops++;
+	/*
+	 * If we need to duplicate packet, then re-insert at top of the
+	 * qdisc tree, since parent queuer expects that only one
+	 * skb will be queued.
+	 */
+	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
+		struct Qdisc *rootq = sch->dev->qdisc;
+		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+		q->duplicate = 0;
+
+		rootq->enqueue(skb2, rootq);
+		q->duplicate = dupsave;
 	}
 
 	/* If doing simple delay then gap == 0 so all packets
-- cgit v1.1
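Both the loss and duplication decisions compare a configured
probability against get_crandom(). For reference, netem's correlated
random source of the same vintage looks roughly like this (sketch from
sch_netem.c, for illustration):

	/* Next pseudo-random value, correlated with the previous one;
	 * rho == 0 means fully independent draws. */
	static unsigned long get_crandom(struct crndstate *state)
	{
		u64 value, rho;
		unsigned long answer;

		if (state->rho == 0)	/* no correlation */
			return net_random();

		value = net_random();
		rho = state->rho;
		answer = (value * ((1ull << 32) - rho) + state->last * rho) >> 32;
		state->last = answer;
		return answer;
	}

The dupsave dance is what keeps duplication from cascading: the clone
re-enters netem_enqueue() through the root qdisc, and with
q->duplicate temporarily zeroed the clone cannot itself be duplicated.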
From 0f9f32ac65ee4a452a912a8440cebbc4dff73852 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Thu, 26 May 2005 12:55:01 -0700
Subject: [PKT_SCHED] netem: use only inner qdisc -- no private skbuff queue

Netem works better if packets are just queued in the inner discipline
rather than being held in a separate delayed queue. Change to use
dequeue/requeue to peek, like TBF does.

By doing this, the potential qlen problems with the old method are
avoided. The problems happened when the netem_run that moved packets
from the private delayed queue to the inner discipline failed (because
the inner queue was full). This happened in dequeue, so the effective
qlen of the netem would be decreased (because of the drop), but there
was no way to keep the outer qdisc (the caller of netem dequeue) in
sync.

The problem window is still there, since this patch doesn't address
the issue of requeue failing in netem_dequeue, but that shouldn't
happen since the sequence dequeue/requeue should always work.

The long-term correct fix is to implement qdisc->peek in all the
qdiscs to allow for this (needed by several other qdiscs as well).

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/sched/sch_netem.c | 124 +++++++++++++++-----------------------------------
 1 file changed, 36 insertions(+), 88 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 5c0f0c2..48360f7 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -53,7 +53,6 @@
 struct netem_sched_data {
 	struct Qdisc	*qdisc;
-	struct sk_buff_head delayed;
 	struct timer_list timer;
 
 	u32 latency;
@@ -137,72 +136,6 @@ static long tabledist(unsigned long mu, long sigma,
 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 }
 
-/* Put skb in the private delayed queue. */
-static int netem_delay(struct Qdisc *sch, struct sk_buff *skb)
-{
-	struct netem_sched_data *q = qdisc_priv(sch);
-	psched_tdiff_t td;
-	psched_time_t now;
-
-	PSCHED_GET_TIME(now);
-	td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
-
-	/* Always queue at tail to keep packets in order */
-	if (likely(q->delayed.qlen < q->limit)) {
-		struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
-
-		PSCHED_TADD2(now, td, cb->time_to_send);
-
-		pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb,
-			 now, cb->time_to_send);
-
-		__skb_queue_tail(&q->delayed, skb);
-		return NET_XMIT_SUCCESS;
-	}
-
-	pr_debug("netem_delay: queue over limit %d\n", q->limit);
-	sch->qstats.overlimits++;
-	kfree_skb(skb);
-	return NET_XMIT_DROP;
-}
-
-/*
- * Move a packet that is ready to send from the delay holding
- * list to the underlying qdisc.
- */
-static int netem_run(struct Qdisc *sch)
-{
-	struct netem_sched_data *q = qdisc_priv(sch);
-	struct sk_buff *skb;
-	psched_time_t now;
-
-	PSCHED_GET_TIME(now);
-
-	skb = skb_peek(&q->delayed);
-	if (skb) {
-		const struct netem_skb_cb *cb
-			= (const struct netem_skb_cb *)skb->cb;
-		long delay
-			= PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
-		pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
-
-		/* if more time remaining? */
-		if (delay > 0) {
-			mod_timer(&q->timer, jiffies + delay);
-			return 1;
-		}
-
-		__skb_unlink(skb, &q->delayed);
-
-		if (q->qdisc->enqueue(skb, q->qdisc)) {
-			sch->q.qlen--;
-			sch->qstats.drops++;
-		}
-	}
-
-	return 0;
-}
-
 /*
  * Insert one skb into qdisc.
  * Note: parent depends on return value to account for queue length.
@@ -212,6 +145,7 @@ static int netem_run(struct Qdisc *sch)
 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
+	struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
 	struct sk_buff *skb2;
 	int ret;
 	int count = 1;
@@ -246,18 +180,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->duplicate = dupsave;
 	}
 
-	/* If doing simple delay then gap == 0 so all packets
-	 * go into the delayed holding queue
-	 * otherwise if doing out of order only "1 out of gap"
-	 * packets will be delayed.
+	/*
+	 * Do re-ordering by putting one out of N packets at the front
+	 * of the queue.
+	 * gap == 0 is special case for no-reordering.
 	 */
-	if (q->counter < q->gap) {
+	if (q->gap == 0 || q->counter != q->gap) {
+		psched_time_t now;
+		PSCHED_GET_TIME(now);
+		PSCHED_TADD2(now,
+			     tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist),
+			     cb->time_to_send);
+
 		++q->counter;
 		ret = q->qdisc->enqueue(skb, q->qdisc);
 	} else {
 		q->counter = 0;
-		ret = netem_delay(sch, skb);
-		netem_run(sch);
+		PSCHED_GET_TIME(cb->time_to_send);
+		ret = q->qdisc->ops->requeue(skb, q->qdisc);
 	}
@@ -301,22 +241,33 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
-	int pending;
-
-	pending = netem_run(sch);
 
 	skb = q->qdisc->dequeue(q->qdisc);
 	if (skb) {
-		pr_debug("netem_dequeue: return skb=%p\n", skb);
-		sch->q.qlen--;
-		sch->flags &= ~TCQ_F_THROTTLED;
-	}
-	else if (pending) {
-		pr_debug("netem_dequeue: throttling\n");
+		const struct netem_skb_cb *cb
+			= (const struct netem_skb_cb *)skb->cb;
+		psched_time_t now;
+		long delay;
+
+		/* if more time remaining? */
+		PSCHED_GET_TIME(now);
+		delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
+		pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
+		if (delay <= 0) {
+			pr_debug("netem_dequeue: return skb=%p\n", skb);
+			sch->q.qlen--;
+			sch->flags &= ~TCQ_F_THROTTLED;
+			return skb;
+		}
+
+		mod_timer(&q->timer, jiffies + delay);
 		sch->flags |= TCQ_F_THROTTLED;
-	}
-	return skb;
+
+		if (q->qdisc->ops->requeue(skb, q->qdisc) != 0)
+			sch->qstats.drops++;
+	}
+
+	return NULL;
 }
 
 static void netem_watchdog(unsigned long arg)
@@ -333,8 +284,6 @@ static void netem_reset(struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 
 	qdisc_reset(q->qdisc);
-	skb_queue_purge(&q->delayed);
-
 	sch->q.qlen = 0;
 	sch->flags &= ~TCQ_F_THROTTLED;
 	del_timer_sync(&q->timer);
@@ -460,7 +409,6 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
 	if (!opt)
 		return -EINVAL;
 
-	skb_queue_head_init(&q->delayed);
 	init_timer(&q->timer);
 	q->timer.function = netem_watchdog;
 	q->timer.data = (unsigned long) sch;
-- cgit v1.1
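Two details above are worth pulling out. First, the per-packet state:
netem stamps each packet's departure time into the skb control block
(skb->cb), scratch space the current owner of an skb may use freely,
via the overlay the diffs reference:

	struct netem_skb_cb {
		psched_time_t	time_to_send;
	};

Second, the "peek" itself: qdiscs of this vintage have no ->peek
operation, so netem inspects the head packet by actually dequeueing
it; if the packet is not yet due it is pushed back with ->requeue,
which inserts at the head rather than the tail and is expected to
succeed immediately after a dequeue. That expectation is exactly the
remaining "problem window" the commit message concedes.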
From 0dca51d362b8e4af6b0dbc9e54d1e5165341918a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Thu, 26 May 2005 12:55:48 -0700
Subject: [PKT_SCHED] netem: allow random reordering (with fix)

Here is a fixed-up version of the reorder feature of netem. It is the
same as the earlier patch plus the bugfix from Julio merged in. It has
the expected backwards-compatibility behaviour.

Go ahead and merge this one; the TCP strangeness I was seeing was due
to the reordering bug and the previous version of the TSO patch.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/sched/sch_netem.c | 54 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 48360f7..bb9bf8d 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -62,11 +62,12 @@ struct netem_sched_data {
 	u32 gap;
 	u32 jitter;
 	u32 duplicate;
+	u32 reorder;
 
 	struct crndstate {
 		unsigned long last;
 		unsigned long rho;
-	} delay_cor, loss_cor, dup_cor;
+	} delay_cor, loss_cor, dup_cor, reorder_cor;
 
 	struct disttable {
 		u32 size;
@@ -180,23 +181,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->duplicate = dupsave;
 	}
 
-	/*
-	 * Do re-ordering by putting one out of N packets at the front
-	 * of the queue.
-	 * gap == 0 is special case for no-reordering.
-	 */
-	if (q->gap == 0 || q->counter != q->gap) {
+	if (q->gap == 0 		/* not doing reordering */
+	    || q->counter < q->gap 	/* inside last reordering gap */
+	    || q->reorder < get_crandom(&q->reorder_cor)) {
 		psched_time_t now;
 		PSCHED_GET_TIME(now);
-		PSCHED_TADD2(now,
-			     tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist),
+		PSCHED_TADD2(now, tabledist(q->latency, q->jitter,
+					    &q->delay_cor, q->delay_dist),
 			     cb->time_to_send);
-
 		++q->counter;
 		ret = q->qdisc->enqueue(skb, q->qdisc);
 	} else {
-		q->counter = 0;
+		/*
+		 * Do re-ordering by putting one out of N packets at the front
+		 * of the queue.
+		 */
 		PSCHED_GET_TIME(cb->time_to_send);
+		q->counter = 0;
 		ret = q->qdisc->ops->requeue(skb, q->qdisc);
 	}
@@ -351,6 +352,19 @@ static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
 	return 0;
 }
 
+static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_reorder *r = RTA_DATA(attr);
+
+	if (RTA_PAYLOAD(attr) != sizeof(*r))
+		return -EINVAL;
+
+	q->reorder = r->probability;
+	init_crandom(&q->reorder_cor, r->correlation);
+	return 0;
+}
+
 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -371,9 +385,15 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 	q->jitter = qopt->jitter;
 	q->limit = qopt->limit;
 	q->gap = qopt->gap;
+	q->counter = 0;
 	q->loss = qopt->loss;
 	q->duplicate = qopt->duplicate;
 
+	/* for compatiablity with earlier versions.
+	 * if gap is set, need to assume 100% probablity
+	 */
+	q->reorder = ~0;
+
 	/* Handle nested options after initial queue options.
 	 * Should have put all options in nested format but too late now.
 	 */
@@ -395,6 +415,11 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 			if (ret)
 				return ret;
 		}
+		if (tb[TCA_NETEM_REORDER-1]) {
+			ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
+			if (ret)
+				return ret;
+		}
 	}
 
@@ -412,7 +437,6 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
 	init_timer(&q->timer);
 	q->timer.function = netem_watchdog;
 	q->timer.data = (unsigned long) sch;
-	q->counter = 0;
 
 	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
 	if (!q->qdisc) {
@@ -444,6 +468,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct rtattr *rta = (struct rtattr *) b;
 	struct tc_netem_qopt qopt;
 	struct tc_netem_corr cor;
+	struct tc_netem_reorder reorder;
 
 	qopt.latency = q->latency;
 	qopt.jitter = q->jitter;
@@ -457,6 +482,11 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	cor.loss_corr = q->loss_cor.rho;
 	cor.dup_corr = q->dup_cor.rho;
 	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
+
+	reorder.probability = q->reorder;
+	reorder.correlation = q->reorder_cor.rho;
+	RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+
 	rta->rta_len = skb->tail - b;
 
 	return skb->len;
-- cgit v1.1

From 92d63decc0b6a5d600f792fcf5f3ff9718c09a3d Mon Sep 17 00:00:00 2001
From: Hideaki YOSHIFUJI
Date: Thu, 26 May 2005 12:58:04 -0700
Subject: [XFRM] Call dst_check() with appropriate cookie

From: Kazunori Miyazawa

This fixes an infinite loop issue with IPv6 tunnel mode.

Signed-off-by: Kazunori Miyazawa
Signed-off-by: Hideaki YOSHIFUJI
Signed-off-by: David S. Miller
---
 net/ipv6/xfrm6_policy.c | 4 ++++
 net/xfrm/xfrm_policy.c  | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 4429b1a..cf1d91e 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -113,6 +113,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 			xdst = (struct xfrm_dst *)dst1;
 			xdst->route = &rt->u.dst;
+			if (rt->rt6i_node)
+				xdst->route_cookie = rt->rt6i_node->fn_sernum;
 
 			dst1->next = dst_prev;
 			dst_prev = dst1;
@@ -137,6 +139,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 
 	dst_prev->child = &rt->u.dst;
 	dst->path = &rt->u.dst;
+	if (rt->rt6i_node)
+		((struct xfrm_dst *)dst)->path_cookie = rt->rt6i_node->fn_sernum;
 
 	*dst_p = dst;
 	dst = dst_prev;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 55ed979..d07f5ce 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1136,7 +1136,7 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
 	struct xfrm_dst *last;
 	u32 mtu;
 
-	if (!dst_check(dst->path, 0) ||
+	if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
 	    (dst->dev && !netif_running(dst->dev)))
 		return 0;
 
@@ -1156,7 +1156,7 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
 			xdst->child_mtu_cached = mtu;
 		}
 
-		if (!dst_check(xdst->route, 0))
+		if (!dst_check(xdst->route, xdst->route_cookie))
 			return 0;
 		mtu = dst_mtu(xdst->route);
 		if (xdst->route_mtu_cached != mtu) {
-- cgit v1.1
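A word on the cookie mechanism. dst_check() is, roughly (sketch of the
inline from that era, for illustration):

	static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
	{
		if (dst->obsolete)
			dst = dst->ops->check(dst, cookie);	/* may return NULL */
		return dst;
	}

For IPv6 the fib6 node's fn_sernum acts as a generation number:
ip6_dst_check() validates a route only when the caller's cookie
matches the current sernum. With the cookie hardwired to 0, as before
this patch, the check could never succeed, so every bundle looked
permanently stale and xfrm would rebuild and re-check it forever;
recording the sernum at bundle-creation time is what breaks the loop.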
From c6b3365391c626206f6789354794a81a010cb7a1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Thu, 26 May 2005 12:59:05 -0700
Subject: [TOKENRING]: be'ify trh_hdr, trllc, rif_cache_s

Signed-off-by: Alexey Dobriyan
Signed-off-by: David S. Miller
---
 net/802/tr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/802/tr.c b/net/802/tr.c
index 85293cc..3bfb862 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -50,8 +50,8 @@ static void rif_check_expire(unsigned long dummy);
 struct rif_cache_s {
 	unsigned char addr[TR_ALEN];
 	int iface;
-	__u16 rcf;
-	__u16 rseg[8];
+	__be16 rcf;
+	__be16 rseg[8];
 	struct rif_cache_s *next;
 	unsigned long last_used;
 	unsigned char local_ring;
-- cgit v1.1

From c8b35d2a29ec3c93e3b9c1e70d649a77a214b1c1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Thu, 26 May 2005 12:59:42 -0700
Subject: [TOKENRING]: net/802/tr.c: s/struct rif_cache_s/struct rif_cache/

The "_s" suffix is certainly of hungarian origin.

Signed-off-by: Alexey Dobriyan
Signed-off-by: David S. Miller
---
 net/802/tr.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/802/tr.c b/net/802/tr.c
index 3bfb862..a755e88 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -47,12 +47,12 @@ static void rif_check_expire(unsigned long dummy);
  *	Each RIF entry we learn is kept this way
  */
 
-struct rif_cache_s {
+struct rif_cache {
 	unsigned char addr[TR_ALEN];
 	int iface;
 	__be16 rcf;
 	__be16 rseg[8];
-	struct rif_cache_s *next;
+	struct rif_cache *next;
 	unsigned long last_used;
 	unsigned char local_ring;
 };
@@ -64,7 +64,7 @@ struct rif_cache_s {
  *	up a lot.
 */
 
-static struct rif_cache_s *rif_table[RIF_TABLE_SIZE];
+static struct rif_cache *rif_table[RIF_TABLE_SIZE];
 
 static DEFINE_SPINLOCK(rif_lock);
@@ -249,7 +249,7 @@ void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device *
 {
 	int slack;
 	unsigned int hash;
-	struct rif_cache_s *entry;
+	struct rif_cache *entry;
 	unsigned char *olddata;
 	static const unsigned char mcast_func_addr[] = {0xC0,0x00,0x00,0x04,0x00,0x00};
@@ -337,7 +337,7 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
 static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev)
 {
 	unsigned int hash, rii_p = 0;
-	struct rif_cache_s *entry;
+	struct rif_cache *entry;
 
 	spin_lock_bh(&rif_lock);
@@ -373,7 +373,7 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
 	 * FIXME: We ought to keep some kind of cache size
 	 * limiting and adjust the timers to suit.
 	 */
-	entry=kmalloc(sizeof(struct rif_cache_s),GFP_ATOMIC);
+	entry=kmalloc(sizeof(struct rif_cache),GFP_ATOMIC);
 
 	if(!entry)
 	{
@@ -435,7 +435,7 @@ static void rif_check_expire(unsigned long dummy)
 	spin_lock_bh(&rif_lock);
 	for(i =0; i < RIF_TABLE_SIZE; i++) {
-		struct rif_cache_s *entry, **pentry;
+		struct rif_cache *entry, **pentry;
 
 		pentry = rif_table+i;
 		while((entry=*pentry) != NULL) {
@@ -467,10 +467,10 @@ static void rif_check_expire(unsigned long dummy)
 
 #ifdef CONFIG_PROC_FS
 
-static struct rif_cache_s *rif_get_idx(loff_t pos)
+static struct rif_cache *rif_get_idx(loff_t pos)
 {
 	int i;
-	struct rif_cache_s *entry;
+	struct rif_cache *entry;
 	loff_t off = 0;
 
 	for(i = 0; i < RIF_TABLE_SIZE; i++)
@@ -493,7 +493,7 @@ static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
 static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	int i;
-	struct rif_cache_s *ent = v;
+	struct rif_cache *ent = v;
 
 	++*pos;
@@ -522,7 +522,7 @@ static void rif_seq_stop(struct seq_file *seq, void *v)
 static int rif_seq_show(struct seq_file *seq, void *v)
 {
 	int j, rcf_len, segment, brdgnmb;
-	struct rif_cache_s *entry = v;
+	struct rif_cache *entry = v;
 
 	if (v == SEQ_START_TOKEN)
 		seq_puts(seq,
-- cgit v1.1
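A closing note on what __be16 buys: the annotation marks values kept
in network (big-endian) byte order, so sparse can flag any use that
skips ntohs()/htons(). In the style of tr.c (hypothetical helper, for
illustration; not part of the patch):

	/* The RIF length field lives in bits 8..12 of the routing
	 * control field, which the cache keeps in wire order. */
	static unsigned int rif_len_octets(const struct rif_cache *entry)
	{
		return (ntohs(entry->rcf) & TR_RCF_LEN_MASK) >> 8;
	}

With a plain __u16 the unconverted expression entry->rcf &
TR_RCF_LEN_MASK would compile silently and extract the wrong bits on
little-endian machines.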