From 8d34ce10c59b40e0ce2685341c4e93416f505e45 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 27 Sep 2013 14:20:01 -0700
Subject: pkt_sched: fq: qdisc dismantle fixes

fq_reset() should drops all packets in queue, including
throttled flows.

This patch moves code from fq_destroy() to fq_reset()
to do the cleaning.

fq_change() must stop calling fq_dequeue() if all remaining
packets are from throttled flows.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq.c | 57 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 20 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 32ad015..fc6de56 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -285,7 +285,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 
 
 /* remove one skb from head of flow queue */
-static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
+static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 {
 	struct sk_buff *skb = flow->head;
 
@@ -293,6 +293,8 @@ static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
 		flow->head = skb->next;
 		skb->next = NULL;
 		flow->qlen--;
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		sch->q.qlen--;
 	}
 	return skb;
 }
@@ -419,7 +421,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	struct sk_buff *skb;
 	struct fq_flow *f;
 
-	skb = fq_dequeue_head(&q->internal);
+	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
 		goto out;
 	fq_check_throttled(q, now);
@@ -449,7 +451,7 @@ begin:
 		goto begin;
 	}
 
-	skb = fq_dequeue_head(f);
+	skb = fq_dequeue_head(sch, f);
 	if (!skb) {
 		head->first = f->next;
 		/* force a pass through old_flows to prevent starvation */
@@ -490,19 +492,44 @@ begin:
 		}
 	}
 out:
-	sch->qstats.backlog -= qdisc_pkt_len(skb);
 	qdisc_bstats_update(sch, skb);
-	sch->q.qlen--;
 	qdisc_unthrottled(sch);
 	return skb;
 }
 
 static void fq_reset(struct Qdisc *sch)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *root;
 	struct sk_buff *skb;
+	struct rb_node *p;
+	struct fq_flow *f;
+	unsigned int idx;
 
-	while ((skb = fq_dequeue(sch)) != NULL)
+	while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
 		kfree_skb(skb);
+
+	if (!q->fq_root)
+		return;
+
+	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+		root = &q->fq_root[idx];
+		while ((p = rb_first(root)) != NULL) {
+			f = container_of(p, struct fq_flow, fq_node);
+			rb_erase(p, root);
+
+			while ((skb = fq_dequeue_head(sch, f)) != NULL)
+				kfree_skb(skb);
+
+			kmem_cache_free(fq_flow_cachep, f);
+		}
+	}
+	q->new_flows.first	= NULL;
+	q->old_flows.first	= NULL;
+	q->delayed		= RB_ROOT;
+	q->flows		= 0;
+	q->inactive_flows	= 0;
+	q->throttled_flows	= 0;
 }
 
 static void fq_rehash(struct fq_sched_data *q,
@@ -645,6 +672,8 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = fq_dequeue(sch);
 
+		if (!skb)
+			break;
 		kfree_skb(skb);
 		drop_count++;
 	}
@@ -657,21 +686,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 static void fq_destroy(struct Qdisc *sch)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
-	struct rb_root *root;
-	struct rb_node *p;
-	unsigned int idx;
 
-	if (q->fq_root) {
-		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
-			root = &q->fq_root[idx];
-			while ((p = rb_first(root)) != NULL) {
-				rb_erase(p, root);
-				kmem_cache_free(fq_flow_cachep,
-						container_of(p, struct fq_flow, fq_node));
-			}
-		}
-		kfree(q->fq_root);
-	}
+	fq_reset(sch);
+	kfree(q->fq_root);
 	qdisc_watchdog_cancel(&q->watchdog);
 }
 
-- 
cgit v1.1


From 0eab5eb7a3a9a6ccfcdecbffff00d60a86a004bb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 1 Oct 2013 09:10:16 -0700
Subject: pkt_sched: fq: rate limiting improvements

FQ rate limiting suffers from two problems, reported
by Steinar :

1) FQ enforces a delay when flow quantum is exhausted in order
to reduce cpu overhead. But if packets are small, current
delay computation is slightly wrong, and observed rates can
be too high.

Steinar had this problem because he disabled TSO and GSO,
and default FQ quantum is 2*1514.

(Of course, I wish recent TSO auto sizing changes will help
to not having to disable TSO in the first place)

2) maxrate was not used for forwarded flows (skbs not attached
to a socket)

Tested:

tc qdisc add dev eth0 root est 1sec 4sec fq maxrate 8Mbit
netperf -H lpq84 -l 1000 &
sleep 10 ; tc -s qdisc show dev eth0
qdisc fq 8003: root refcnt 32 limit 10000p flow_limit 100p buckets 1024
 quantum 3028 initial_quantum 15140 maxrate 8000Kbit
 Sent 16819357 bytes 11258 pkt (dropped 0, overlimits 0 requeues 0)
 rate 7831Kbit 653pps backlog 7570b 5p requeues 0
  44 flows (43 inactive, 1 throttled), next packet delay 2977352 ns
  0 gc, 0 highprio, 5545 throttled

lpq83:~# tcpdump -p -i eth0 host lpq84 -c 12
09:02:52.079484 IP lpq83 > lpq84: . 1389536928:1389538376(1448) ack 3808678021 win 457 <nop,nop,timestamp 961812 572609068>
09:02:52.079499 IP lpq83 > lpq84: . 1448:2896(1448) ack 1 win 457 <nop,nop,timestamp 961812 572609068>
09:02:52.079906 IP lpq84 > lpq83: . ack 2896 win 16384 <nop,nop,timestamp 572609080 961812>
09:02:52.082568 IP lpq83 > lpq84: . 2896:4344(1448) ack 1 win 457 <nop,nop,timestamp 961815 572609071>
09:02:52.082581 IP lpq83 > lpq84: . 4344:5792(1448) ack 1 win 457 <nop,nop,timestamp 961815 572609071>
09:02:52.083017 IP lpq84 > lpq83: . ack 5792 win 16384 <nop,nop,timestamp 572609083 961815>
09:02:52.085678 IP lpq83 > lpq84: . 5792:7240(1448) ack 1 win 457 <nop,nop,timestamp 961818 572609074>
09:02:52.085693 IP lpq83 > lpq84: . 7240:8688(1448) ack 1 win 457 <nop,nop,timestamp 961818 572609074>
09:02:52.086117 IP lpq84 > lpq83: . ack 8688 win 16384 <nop,nop,timestamp 572609086 961818>
09:02:52.088792 IP lpq83 > lpq84: . 8688:10136(1448) ack 1 win 457 <nop,nop,timestamp 961821 572609077>
09:02:52.088806 IP lpq83 > lpq84: . 10136:11584(1448) ack 1 win 457 <nop,nop,timestamp 961821 572609077>
09:02:52.089217 IP lpq84 > lpq83: . ack 11584 win 16384 <nop,nop,timestamp 572609090 961821>

Reported-by: Steinar H. Gunderson <sesse@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq.c | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index fc6de56..a2fef8b 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -420,6 +420,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
+	u32 rate;
 
 	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
@@ -468,28 +469,34 @@ begin:
 	f->time_next_packet = now;
 	f->credit -= qdisc_pkt_len(skb);
 
-	if (f->credit <= 0 &&
-	    q->rate_enable &&
-	    skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
-		u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
+	if (f->credit > 0 || !q->rate_enable)
+		goto out;
 
-		rate = min(rate, q->flow_max_rate);
-		if (rate) {
-			u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;
-
-			do_div(len, rate);
-			/* Since socket rate can change later,
-			 * clamp the delay to 125 ms.
-			 * TODO: maybe segment the too big skb, as in commit
-			 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
-			 */
-			if (unlikely(len > 125 * NSEC_PER_MSEC)) {
-				len = 125 * NSEC_PER_MSEC;
-				q->stat_pkts_too_long++;
-			}
+	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
+		rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
 
-			f->time_next_packet = now + len;
+		rate = min(rate, q->flow_max_rate);
+	} else {
+		rate = q->flow_max_rate;
+		if (rate == ~0U)
+			goto out;
+	}
+	if (rate) {
+		u32 plen = max(qdisc_pkt_len(skb), q->quantum);
+		u64 len = (u64)plen * NSEC_PER_SEC;
+
+		do_div(len, rate);
+		/* Since socket rate can change later,
+		 * clamp the delay to 125 ms.
+		 * TODO: maybe segment the too big skb, as in commit
+		 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+		 */
+		if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+			len = 125 * NSEC_PER_MSEC;
+			q->stat_pkts_too_long++;
 		}
+
+		f->time_next_packet = now + len;
 	}
 out:
 	qdisc_bstats_update(sch, skb);
-- 
cgit v1.1