From 44142fac3446d08c08c5d717ec11d50a737e8640 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:01 +0200 Subject: sched: fix sysctl_sched_child_runs_first flag fix the sched_child_runs_first flag: always call into ->task_new() if we are on the same CPU, as SCHED_OTHER tasks depend on it for correct initial setup. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 6c10fa7..2054e55 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1688,10 +1688,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) else p->sched_class = &fair_sched_class; - if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || - (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || - !current->se.on_rq) { - + if (task_cpu(p) != this_cpu || !p->sched_class->task_new || + !current->se.on_rq) { activate_task(rq, p, 0); } else { /* -- cgit v1.1 From bb61c210835db95b0e9fb612a316422e7cc675e3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: resched task in task_new_fair() to get full child-runs-first semantics make sure the parent is rescheduled. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 67c67a8..0990b20 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1191,6 +1191,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) se->wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); + resched_task(rq->curr); } #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.1 From 2e45874c5aabe573b6ab4328f303c765701394f9 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: use list_for_each_entry_safe() in __wake_up_common() Use list_for_each_entry_safe() instead of list_for_each_safe() in __wake_up_common() Signed-off-by: Matthias Kaehlcke Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 2054e55..e92b185 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3634,10 +3634,9 @@ EXPORT_SYMBOL(default_wake_function); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { - struct list_head *tmp, *next; + wait_queue_t *curr, *next; - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, sync, key) && -- cgit v1.1 From a4b29ba2f72673aaa60ba11ced74d579771dd578 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: small sched_debug cleanup small kernel/sched_debug.c cleanup - break up multi-variable assignment. 
no code changed: text data bss dec hex filename 38869 3550 24 42443 a5cb sched.o.before 38869 3550 24 42443 a5cb sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index c3ee38b..94915f1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -279,9 +279,13 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; - p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.wait_max = 0; + p->se.wait_runtime_overruns = 0; + p->se.wait_runtime_underruns = 0; #endif - p->se.sum_exec_runtime = 0; + p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; } -- cgit v1.1 From eba1ed4b7e52720e3099325874811c38a5ec1562 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: debug: track maximum 'slice' track the maximum amount of time a task has executed while the CPU load was at least 2x. (i.e. at least two nice-0 tasks were runnable) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 1 + kernel/sched.c | 1 + kernel/sched_debug.c | 2 ++ kernel/sched_fair.c | 11 +++++++++++ 4 files changed, 15 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 833f7dc..9761b16 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -921,6 +921,7 @@ struct sched_entity { u64 block_start; u64 block_max; u64 exec_max; + u64 slice_max; unsigned long wait_runtime_overruns; unsigned long wait_runtime_underruns; diff --git a/kernel/sched.c b/kernel/sched.c index e92b185..282d037 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1603,6 +1603,7 @@ static void __sched_fork(struct task_struct *p) p->se.sleep_max = 0; p->se.block_max = 0; p->se.exec_max = 0; + p->se.slice_max = 0; p->se.wait_max = 0; p->se.wait_runtime_overruns = 0; p->se.wait_runtime_underruns = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 94915f1..fd080f6 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -254,6 +254,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.sleep_max); P(se.block_max); P(se.exec_max); + P(se.slice_max); P(se.wait_max); P(se.wait_runtime_overruns); P(se.wait_runtime_underruns); @@ -282,6 +283,7 @@ void proc_sched_set_task(struct task_struct *p) p->se.sleep_max = 0; p->se.block_max = 0; p->se.exec_max = 0; + p->se.slice_max = 0; p->se.wait_max = 0; p->se.wait_runtime_overruns = 0; p->se.wait_runtime_underruns = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0990b20..5c15d8a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -739,6 +739,17 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); set_cfs_rq_curr(cfs_rq, se); +#ifdef CONFIG_SCHEDSTATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. 
dont track it + * when there are only lesser-weight tasks around): + */ + if (rq_of(cfs_rq)->ls.load.weight >= 2*se->load.weight) { + se->slice_max = max(se->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime); + } +#endif se->prev_sum_exec_runtime = se->sum_exec_runtime; } -- cgit v1.1 From 38ad464d410dadceda1563f36bdb0be7fe4c8938 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: uniform tunings use the same defaults on both UP and SMP. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 282d037..2520923 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4898,32 +4898,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 100000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; - - sysctl_sched_runtime_limit = sysctl_sched_latency; - sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; -} - #ifdef CONFIG_SMP /* * This is how migration works: @@ -6491,12 +6465,10 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); - sched_init_granularity(); } #else void __init sched_init_smp(void) { - sched_init_granularity(); } #endif /* CONFIG_SMP */ -- cgit v1.1 From 2bd8e6d422a4f44c0994f909317eba80b0fe08a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:02 +0200 Subject: sched: use constants if !CONFIG_SCHED_DEBUG use constants if !CONFIG_SCHED_DEBUG. 
this speeds up the code and reduces code-size: text data bss dec hex filename 27464 3014 16 30494 771e sched.o.before 26929 3010 20 29959 7507 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 5 ++++- kernel/sched.c | 6 ------ kernel/sched_fair.c | 28 ++++++++++++++++++++-------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9761b16..befca3f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1402,15 +1402,18 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); +#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; extern unsigned int sysctl_sched_runtime_limit; -extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; +#endif + +extern unsigned int sysctl_sched_compat_yield; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 2520923..ae1544f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1659,12 +1659,6 @@ void sched_fork(struct task_struct *p, int clone_flags) } /* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. - */ -unsigned int __read_mostly sysctl_sched_child_runs_first = 1; - -/* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5c15d8a..2e84aaf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,15 @@ */ /* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/* * Targeted preemption latency for CPU-bound tasks: * (default: 20ms, units: nanoseconds) * @@ -34,7 +43,13 @@ * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; +const_debug unsigned int sysctl_sched_latency = 20000000ULL; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +const_debug unsigned int sysctl_sched_child_runs_first = 1; /* * Minimal preemption granularity for CPU-bound tasks: @@ -58,7 +73,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; +const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; /* * SCHED_OTHER wake-up granularity. @@ -68,13 +83,10 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; +const_debug unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int sysctl_sched_stat_granularity __read_mostly; +const_debug unsigned int sysctl_sched_stat_granularity; -/* - * Initialized in sched_init_granularity() [to 5 times the base granularity]: - */ unsigned int sysctl_sched_runtime_limit __read_mostly; /* @@ -89,7 +101,7 @@ enum { SCHED_FEAT_SKIP_INITIAL = 32, }; -unsigned int sysctl_sched_features __read_mostly = +const_debug unsigned int sysctl_sched_features = SCHED_FEAT_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | -- cgit v1.1 From 8ebc91d93669af39dbed50914d7daf457eeb43be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: remove stat_gran remove the stat_gran code - it was disabled by default and it causes unnecessary overhead. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 3 --- kernel/sched.c | 5 +---- kernel/sched_fair.c | 46 ++++++++++++++-------------------------------- kernel/sysctl.c | 11 ----------- 4 files changed, 15 insertions(+), 50 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index befca3f..3c38a50 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -895,9 +895,6 @@ struct load_weight { */ struct sched_entity { long wait_runtime; - unsigned long delta_fair_run; - unsigned long delta_fair_sleep; - unsigned long delta_exec; s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; diff --git a/kernel/sched.c b/kernel/sched.c index ae1544f..d4dabfc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -829,7 +829,7 @@ static void update_curr_load(struct rq *rq) * Stagger updates to ls->delta_fair. Very frequent updates * can be expensive. */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) + if (ls->delta_stat) __update_curr_load(rq, ls); } @@ -1588,9 +1588,6 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.delta_exec = 0; - p->se.delta_fair_run = 0; - p->se.delta_fair_sleep = 0; p->se.wait_runtime = 0; p->se.sleep_start_fair = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e84aaf..2138c40 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -85,8 +85,6 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; */ const_debug unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -const_debug unsigned int sysctl_sched_stat_granularity; - unsigned int sysctl_sched_runtime_limit __read_mostly; /* @@ -360,13 +358,13 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) * are not in our scheduling class. 
*/ static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned long delta_exec) { - unsigned long delta, delta_exec, delta_fair, delta_mine; + unsigned long delta, delta_fair, delta_mine; struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; - delta_exec = curr->delta_exec; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; @@ -400,6 +398,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq_curr(cfs_rq); + u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; if (unlikely(!curr)) @@ -410,15 +409,10 @@ static void update_curr(struct cfs_rq *cfs_rq) * since the last time we changed load (this cannot * overflow on 32 bits): */ - delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); - - curr->delta_exec += delta_exec; + delta_exec = (unsigned long)(now - curr->exec_start); - if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr); - curr->delta_exec = 0; - } - curr->exec_start = rq_of(cfs_rq)->clock; + __update_curr(cfs_rq, curr, delta_exec); + curr->exec_start = now; } static inline void @@ -494,10 +488,9 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) * Note: must be called with a freshly updated rq->fair_clock. */ static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long delta_fair) { - unsigned long delta_fair = se->delta_fair_run; - schedstat_set(se->wait_max, max(se->wait_max, rq_of(cfs_rq)->clock - se->wait_start)); @@ -519,12 +512,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - se->delta_fair_run += delta_fair; - if (unlikely(abs(se->delta_fair_run) >= - sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se); - se->delta_fair_run = 0; - } + __update_stats_wait_end(cfs_rq, se, delta_fair); se->wait_start_fair = 0; schedstat_set(se->wait_start, 0); @@ -567,9 +555,10 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long delta_fair) { - unsigned long load = cfs_rq->load.weight, delta_fair; + unsigned long load = cfs_rq->load.weight; long prev_runtime; /* @@ -582,8 +571,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) load = rq_of(cfs_rq)->cpu_load[2]; - delta_fair = se->delta_fair_sleep; - /* * Fix up delta_fair with the effect of us running * during the whole sleep period: @@ -618,12 +605,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - se->delta_fair_sleep += delta_fair; - if (unlikely(abs(se->delta_fair_sleep) >= - sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se); - se->delta_fair_sleep = 0; - } + __enqueue_sleeper(cfs_rq, se, delta_fair); 
se->sleep_start_fair = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6c97259..9b1b0d4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -266,17 +266,6 @@ static ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_stat_granularity_ns", - .data = &sysctl_sched_stat_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_runtime_limit_ns", .data = &sysctl_sched_runtime_limit, .maxlen = sizeof(unsigned int), -- cgit v1.1 From a25707f3aef9cf68c341eba5960d580f364e4e6f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: remove precise CPU load CPU load calculations are statistical anyway, and there's little benefit from having it calculated on every scheduling event. So remove this code, it gets rid of a divide from the scheduler wakeup and context-switch fastpath. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 42 +++++++----------------------------------- kernel/sched_debug.c | 2 -- kernel/sched_fair.c | 6 ++---- 3 files changed, 9 insertions(+), 41 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d4dabfc..25cc9b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1972,42 +1972,11 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; unsigned long total_load = this_rq->ls.load.weight; unsigned long this_load = total_load; - struct load_stat *ls = &this_rq->ls; int i, scale; this_rq->nr_load_updates++; - if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) - goto do_avg; - - /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq); - - fair_delta64 = ls->delta_fair + 1; - ls->delta_fair = 0; - - exec_delta64 = ls->delta_exec + 1; - ls->delta_exec = 0; - - sample_interval64 = this_rq->clock - ls->load_update_last; - ls->load_update_last = this_rq->clock; - - if ((s64)sample_interval64 < (s64)TICK_NSEC) - sample_interval64 = TICK_NSEC; - - if (exec_delta64 > sample_interval64) - exec_delta64 = sample_interval64; - - idle_delta64 = sample_interval64 - exec_delta64; - - tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); - tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); - - this_load = (unsigned long)tmp64; - -do_avg: /* Update our load: */ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2017,7 +1986,13 @@ do_avg: old_load = this_rq->cpu_load[i]; new_load = this_load; - + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
+ */ + if (new_load > old_load) + new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } } @@ -6484,7 +6459,6 @@ static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) void __init sched_init(void) { - u64 now = sched_clock(); int highest_cpu = 0; int i, j; @@ -6509,8 +6483,6 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); #endif - rq->ls.load_update_last = now; - rq->ls.load_update_start = now; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index fd080f6..6b789da 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -145,8 +145,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", rq->ls.load.weight); - P(ls.delta_fair); - P(ls.delta_exec); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2138c40..105d57b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -94,16 +94,14 @@ enum { SCHED_FEAT_FAIR_SLEEPERS = 1, SCHED_FEAT_SLEEPER_AVG = 2, SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_PRECISE_CPU_LOAD = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, + SCHED_FEAT_START_DEBIT = 8, + SCHED_FEAT_SKIP_INITIAL = 16, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_PRECISE_CPU_LOAD *1 | SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_SKIP_INITIAL *0; -- cgit v1.1 From 53df556e06d85245cf6aacedaba8e4da684859c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: remove precise CPU load calculations #2 continued removal of precise CPU load calculations. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 25cc9b2..f6a8106 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -173,8 +173,6 @@ struct rt_prio_array { struct load_stat { struct load_weight load; - u64 load_update_start, load_update_last; - unsigned long delta_fair, delta_exec, delta_stat; }; /* CFS-related fields in a runqueue */ @@ -793,15 +791,6 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, #define sched_class_highest (&rt_sched_class) -static void __update_curr_load(struct rq *rq, struct load_stat *ls) -{ - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } -} - /* * Update delta_exec, delta_fair fields for rq. * @@ -817,31 +806,13 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls) * This function is called /before/ updating rq->ls.load * and when switching tasks. */ -static void update_curr_load(struct rq *rq) -{ - struct load_stat *ls = &rq->ls; - u64 start; - - start = ls->load_update_start; - ls->load_update_start = rq->clock; - ls->delta_stat += rq->clock - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. 
- */ - if (ls->delta_stat) - __update_curr_load(rq, ls); -} - static inline void inc_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); update_load_add(&rq->ls.load, p->se.load.weight); } static inline void dec_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); update_load_sub(&rq->ls.load, p->se.load.weight); } @@ -1972,8 +1943,7 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - unsigned long total_load = this_rq->ls.load.weight; - unsigned long this_load = total_load; + unsigned long this_load = this_rq->ls.load.weight; int i, scale; this_rq->nr_load_updates++; -- cgit v1.1 From 62160e3f4a06d948ec89665d29f1173e551deedc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: track cfs_rq->curr on !group-scheduling too Noticed by Roman Zippel: use cfs_rq->curr in the !group-scheduling case too. Small micro-optimization and cleanup effect: text data bss dec hex filename 36269 3482 24 39775 9b5f sched.o.before 36177 3486 24 39687 9b07 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 2 +- kernel/sched_fair.c | 31 +++++++++---------------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f6a8106..3209e2c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -189,11 +189,11 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; struct rb_node *rb_load_balance_curr; -#ifdef CONFIG_FAIR_GROUP_SCHED /* 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; +#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 105d57b..335faf0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -111,51 +111,38 @@ extern struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - /* currently running entity (if any) on this cfs_rq */ static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) { return cfs_rq->curr; } -/* An entity is a task if it doesn't "own" a runqueue */ -#define entity_is_task(se) (!se->my_q) - static inline void set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { cfs_rq->curr = se; } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +/* cpu runqueue to which this cfs_rq is attached */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { - return container_of(cfs_rq, struct rq, cfs); + return cfs_rq->rq; } -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) - if (unlikely(rq->curr->sched_class != &fair_sched_class)) - return NULL; +#else /* CONFIG_FAIR_GROUP_SCHED */ - return &rq->curr->se; +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); } #define entity_is_task(se) 1 -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - 
#endif /* CONFIG_FAIR_GROUP_SCHED */ static inline struct task_struct *task_of(struct sched_entity *se) -- cgit v1.1 From 429d43bcc026b92b9dfaccd3577fec290f6a67ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: cleanup: simplify cfs_rq_curr() methods cleanup: simplify cfs_rq_curr() methods - now that the cfs_rq->curr pointer is unconditionally present, remove the wrappers. kernel/sched.o: text data bss dec hex filename 11784 224 2012 14020 36c4 sched.o.before 11784 224 2012 14020 36c4 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 335faf0..74d47e6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -111,18 +111,6 @@ extern struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -/* currently running entity (if any) on this cfs_rq */ -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - return cfs_rq->curr; -} - -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->curr = se; -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -382,7 +370,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { - struct sched_entity *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; @@ -440,7 +428,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); /* * Update the key: @@ -511,7 +499,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) * Mark the end of the wait period if dequeueing a * waiting task: */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); } @@ -717,7 +705,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); - set_cfs_rq_curr(cfs_rq, se); + cfs_rq->curr = se; #ifdef CONFIG_SCHEDSTATS /* * Track our maximum slice length, if the CPU's load is at @@ -754,7 +742,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_stats_wait_start(cfs_rq, prev); - set_cfs_rq_curr(cfs_rq, NULL); + cfs_rq->curr = NULL; } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) @@ -1153,7 +1141,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *se = &p->se, *curr = cfs_rq->curr; sched_info_queued(p); -- cgit v1.1 From e59c80c5bbc0d3d6b0772edb347ce2dd303121b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:03 +0200 Subject: sched: simplify SCHED_FEAT_* code Peter Zijlstra suggested to simplify SCHED_FEAT_* checks via the sched_feat(x) macro. 
No code changed: text data bss dec hex filename 38895 3550 24 42469 a5e5 sched.o.before 38895 3550 24 42469 a5e5 sched.o.after Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 74d47e6..2488f6f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -105,6 +105,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_SKIP_INITIAL *0; +#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + extern struct sched_class fair_sched_class; /************************************************************** @@ -541,14 +543,14 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) return; - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) + if (sched_feat(SLEEPER_LOAD_AVG)) load = rq_of(cfs_rq)->cpu_load[2]; /* * Fix up delta_fair with the effect of us running * during the whole sleep period: */ - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) + if (sched_feat(SLEEPER_AVG)) delta_fair = div64_likely32((u64)delta_fair * load, load + se->load.weight); @@ -572,7 +574,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) unsigned long delta_fair; if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) + !sched_feat(FAIR_SLEEPERS)) return; delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), @@ -1158,14 +1160,14 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: */ - if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) + if (sched_feat(SKIP_INITIAL)) se->wait_start_fair = 0; /* * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) + if (sched_feat(START_DEBIT)) se->wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); -- cgit v1.1 From 19ccd97a03a026c2341b35af3ed2078a83c4a22b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: uninline __enqueue_entity()/__dequeue_entity() suggested by Roman Zippel: uninline __enqueue_entity() and __dequeue_entity(). 
this reduces code size: text data bss dec hex filename 25385 2386 16 27787 6c8b sched.o.before 25257 2386 16 27659 6c0b sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2488f6f..91a227b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -148,7 +148,7 @@ static inline struct task_struct *task_of(struct sched_entity *se) /* * Enqueue an entity into the rb-tree: */ -static inline void +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; @@ -191,7 +191,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -static inline void +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) -- cgit v1.1 From 1091985b482fdd577a5c511059b9d7b4467bd15d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: speed up update_load_add/_sub() speed up update_load_add/_sub() by not delaying the division - this reduces CPU pipeline dependencies. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3209e2c..992a1fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -697,16 +697,17 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); } -static void update_load_add(struct load_weight *lw, unsigned long inc) +static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - lw->inv_weight = 0; + lw->inv_weight = WMULT_CONST / lw->weight; } -static void update_load_sub(struct load_weight *lw, unsigned long dec) +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - lw->inv_weight = 0; + if (likely(lw->weight)) + lw->inv_weight = WMULT_CONST / lw->weight; } /* -- cgit v1.1 From 08e2388aa1e40cb06f7d04ac621e2ae94e1d8fdc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: clean up calc_weighted() clean up calc_weighted() - we always use the normalized shift so it's not needed to pass that in. Also, push the non-nice0 branch into the function. 
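As a rough worked example of the weighting rule above (a standalone sketch rather than the kernel code; NICE_0_LOAD = 1024 and NICE_0_SHIFT = 10 are assumptions here, matching the nice-0 load scale of this era):

	#include <stdint.h>
	#include <stdio.h>

	#define NICE_0_SHIFT	10
	#define NICE_0_LOAD	(1UL << NICE_0_SHIFT)

	/* a nice-0 entity keeps its delta unchanged, a half-weight
	 * entity is credited with half of it: */
	static unsigned long calc_weighted(unsigned long delta,
					   unsigned long weight)
	{
		if (weight != NICE_0_LOAD)
			return (uint64_t)delta * weight >> NICE_0_SHIFT;
		return delta;
	}

	int main(void)
	{
		printf("%lu\n", calc_weighted(1000000UL, NICE_0_LOAD));     /* 1000000 */
		printf("%lu\n", calc_weighted(1000000UL, NICE_0_LOAD / 2)); /*  500000 */
		return 0;
	}
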
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 91a227b..b46f807 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -397,27 +397,16 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } -/* - * We calculate fair deltas here, so protect against the random effects - * of a multiplication overflow by capping it to the runtime limit: - */ -#if BITS_PER_LONG == 32 static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) +calc_weighted(unsigned long delta, struct sched_entity *se) { - u64 tmp = (u64)delta * weight >> shift; + unsigned long weight = se->load.weight; - if (unlikely(tmp > sysctl_sched_runtime_limit*2)) - return sysctl_sched_runtime_limit*2; - return tmp; + if (unlikely(weight != NICE_0_LOAD)) + return (u64)delta * se->load.weight >> NICE_0_SHIFT; + else + return delta; } -#else -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - return delta * weight >> shift; -} -#endif /* * Task is being enqueued - update stats: @@ -469,9 +458,7 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, schedstat_set(se->wait_max, max(se->wait_max, rq_of(cfs_rq)->clock - se->wait_start)); - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); + delta_fair = calc_weighted(delta_fair, se); add_wait_runtime(cfs_rq, se, delta_fair); } @@ -554,9 +541,7 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, delta_fair = div64_likely32((u64)delta_fair * load, load + se->load.weight); - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); + delta_fair = calc_weighted(delta_fair, se); prev_runtime = se->wait_runtime; __add_wait_runtime(cfs_rq, se, delta_fair); -- cgit v1.1 From e9acbff6484df51fd880e0f5fe0224e8be34c17b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: introduce se->vruntime introduce se->vruntime as a sum of weighted delta-exec's, and use that as the key into the tree. the idea to use absolute virtual time as the basic metric of scheduling has been first raised by William Lee Irwin, advanced by Tong Li and first prototyped by Roman Zippel in the "Really Fair Scheduler" (RFS) patchset. also see: http://lkml.org/lkml/2007/9/2/76 for a simpler variant of this patch. 
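In short, vruntime is execution time scaled by NICE_0_LOAD/weight, so heavier (lower nice) entities accumulate it more slowly. A minimal standalone sketch of that rule (simplified - the kernel path goes through calc_delta_fair() and the load_weight reciprocal; NICE_0_LOAD = 1024 is an assumption here):

	#include <stdint.h>

	#define NICE_0_LOAD	1024UL

	static uint64_t vruntime_delta(uint64_t delta_exec, unsigned long weight)
	{
		if (weight == NICE_0_LOAD)
			return delta_exec;			/* nice-0: wall-clock rate */
		return delta_exec * NICE_0_LOAD / weight;	/* heavier => advances slower */
	}

	/* usage: se->vruntime += vruntime_delta(delta_exec, se->load.weight);
	 * the rbtree is keyed on vruntime, so the leftmost entity is the one
	 * that has received the least weighted CPU time so far. */
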
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 1 + kernel/sched.c | 1 + kernel/sched_fair.c | 81 ++++++++++++++++++++++++++++++--------------------- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c38a50..5e5c457 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -902,6 +902,7 @@ struct sched_entity { u64 exec_start; u64 sum_exec_runtime; + u64 vruntime; u64 prev_sum_exec_runtime; u64 wait_start_fair; u64 sleep_start_fair; diff --git a/kernel/sched.c b/kernel/sched.c index 992a1fa..8f80eba 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -182,6 +182,7 @@ struct cfs_rq { s64 fair_clock; u64 exec_clock; + u64 min_vruntime; s64 wait_runtime; u64 sleeper_bonus; unsigned long wait_runtime_overruns, wait_runtime_underruns; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b46f807..a2af09c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -92,14 +92,16 @@ unsigned int sysctl_sched_runtime_limit __read_mostly; */ enum { SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_SLEEPER_AVG = 2, - SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_START_DEBIT = 8, - SCHED_FEAT_SKIP_INITIAL = 16, + SCHED_FEAT_NEW_FAIR_SLEEPERS = 2, + SCHED_FEAT_SLEEPER_AVG = 4, + SCHED_FEAT_SLEEPER_LOAD_AVG = 8, + SCHED_FEAT_START_DEBIT = 16, + SCHED_FEAT_SKIP_INITIAL = 32, }; const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_FAIR_SLEEPERS *1 | + SCHED_FEAT_FAIR_SLEEPERS *0 | + SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | SCHED_FEAT_START_DEBIT *1 | @@ -145,6 +147,19 @@ static inline struct task_struct *task_of(struct sched_entity *se) * Scheduling class tree data structure manipulation methods: */ +static inline void +set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) +{ + struct sched_entity *se; + + cfs_rq->rb_leftmost = leftmost; + if (leftmost) { + se = rb_entry(leftmost, struct sched_entity, run_node); + cfs_rq->min_vruntime = max(se->vruntime, + cfs_rq->min_vruntime); + } +} + /* * Enqueue an entity into the rb-tree: */ @@ -180,7 +195,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * used): */ if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; + set_leftmost(cfs_rq, &se->run_node); rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -195,7 +210,8 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); + set_leftmost(cfs_rq, rb_next(&se->run_node)); + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; @@ -336,7 +352,7 @@ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { - unsigned long delta, delta_fair, delta_mine; + unsigned long delta, delta_fair, delta_mine, delta_exec_weighted; struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; @@ -344,6 +360,12 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; cfs_rq->exec_clock += delta_exec; + delta_exec_weighted = delta_exec; + if (unlikely(curr->load.weight != NICE_0_LOAD)) { + delta_exec_weighted = calc_delta_fair(delta_exec_weighted, + &curr->load); + } + curr->vruntime += delta_exec_weighted; if (unlikely(!load)) 
return; @@ -413,8 +435,6 @@ calc_weighted(unsigned long delta, struct sched_entity *se) */ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 key; - /* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) @@ -424,28 +444,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Update the key: */ - key = cfs_rq->fair_clock; - - /* - * Optimize the common nice 0 case: - */ - if (likely(se->load.weight == NICE_0_LOAD)) { - key -= se->wait_runtime; - } else { - u64 tmp; - - if (se->wait_runtime < 0) { - tmp = -se->wait_runtime; - key += (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } else { - tmp = se->wait_runtime; - key -= (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } - } - - se->fair_key = key; + se->fair_key = se->vruntime; } /* @@ -615,8 +614,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) */ update_curr(cfs_rq); - if (wakeup) + if (wakeup) { + u64 min_runtime, latency; + + min_runtime = cfs_rq->min_vruntime; + min_runtime += sysctl_sched_latency/2; + + if (sched_feat(NEW_FAIR_SLEEPERS)) { + latency = calc_weighted(sysctl_sched_latency, se); + if (min_runtime > latency) + min_runtime -= latency; + } + + se->vruntime = max(se->vruntime, min_runtime); + enqueue_sleeper(cfs_rq, se); + } update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); @@ -1155,6 +1168,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) if (sched_feat(START_DEBIT)) se->wait_runtime = -(sched_granularity(cfs_rq) / 2); + se->vruntime = cfs_rq->min_vruntime; + update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); resched_task(rq->curr); } -- cgit v1.1 From bf5c91ba8c629b84413c761f529627195fd0a935 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: move sched_feat() definitions move sched_feat() definitions so that it can be used sooner by generic code too. 
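With the feature bits and sched_feat() now living in kernel/sched.c, the core code can branch on them directly. Illustrative usage only (the helper below is hypothetical, the macro and the feature names are the ones defined above):

	/* sched_feat(x) expands to (sysctl_sched_features & SCHED_FEAT_##x) */
	static inline int fair_sleepers_enabled(void)
	{
		return sched_feat(FAIR_SLEEPERS) || sched_feat(NEW_FAIR_SLEEPERS);
	}
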
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 31 +++++++++++++++++++++++++++++++ kernel/sched_fair.c | 31 ------------------------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 8f80eba..a5dd035 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -382,6 +382,37 @@ static void update_rq_clock(struct rq *rq) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) /* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/* + * Debugging: various feature bits + */ +enum { + SCHED_FEAT_FAIR_SLEEPERS = 1, + SCHED_FEAT_NEW_FAIR_SLEEPERS = 2, + SCHED_FEAT_SLEEPER_AVG = 4, + SCHED_FEAT_SLEEPER_LOAD_AVG = 8, + SCHED_FEAT_START_DEBIT = 16, + SCHED_FEAT_SKIP_INITIAL = 32, +}; + +const_debug unsigned int sysctl_sched_features = + SCHED_FEAT_FAIR_SLEEPERS *0 | + SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | + SCHED_FEAT_SLEEPER_AVG *0 | + SCHED_FEAT_SLEEPER_LOAD_AVG *1 | + SCHED_FEAT_START_DEBIT *1 | + SCHED_FEAT_SKIP_INITIAL *0; + +#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + +/* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a2af09c..a566a45 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,15 +21,6 @@ */ /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: - */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug static const -#endif - -/* * Targeted preemption latency for CPU-bound tasks: * (default: 20ms, units: nanoseconds) * @@ -87,28 +78,6 @@ const_debug unsigned int sysctl_sched_wakeup_granularity = 1000000UL; unsigned int sysctl_sched_runtime_limit __read_mostly; -/* - * Debugging: various feature bits - */ -enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_NEW_FAIR_SLEEPERS = 2, - SCHED_FEAT_SLEEPER_AVG = 4, - SCHED_FEAT_SLEEPER_LOAD_AVG = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, -}; - -const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_FAIR_SLEEPERS *0 | - SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *0 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; - -#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) - extern struct sched_class fair_sched_class; /************************************************************** -- cgit v1.1 From 6cb58195143b55d4c427d92f8425bec2b0d9c56c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: optimize vruntime based scheduling optimize vruntime based scheduling. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 5 +++-- kernel/sched_fair.c | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index a5dd035..5594e65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -732,13 +732,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - lw->inv_weight = WMULT_CONST / lw->weight; + if (sched_feat(FAIR_SLEEPERS)) + lw->inv_weight = WMULT_CONST / lw->weight; } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - if (likely(lw->weight)) + if (sched_feat(FAIR_SLEEPERS) && likely(lw->weight)) lw->inv_weight = WMULT_CONST / lw->weight; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a566a45..7041dc6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -336,6 +336,9 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, } curr->vruntime += delta_exec_weighted; + if (!sched_feat(FAIR_SLEEPERS)) + return; + if (unlikely(!load)) return; -- cgit v1.1 From 4d78e7b656aa6440c337302fe065338ce840a64e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:04 +0200 Subject: sched: new task placement for vruntime add proper new task placement for the vruntime based math too. ( note: introduces a swap() macro, but the swap token is too widely used in the kernel namespace for a generic version to be added without changing non-scheduler code - so this cleanup will be done separately. ) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7041dc6..95487e3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -203,6 +203,20 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +static u64 __sched_period(unsigned long nr_running) +{ + u64 period = sysctl_sched_latency; + unsigned long nr_latency = + sysctl_sched_latency / sysctl_sched_min_granularity; + + if (unlikely(nr_running > nr_latency)) { + period *= nr_running; + do_div(period, nr_latency); + } + + return period; +} + /* * Calculate the preemption granularity needed to schedule every * runnable task once per sysctl_sched_latency amount of time. @@ -1103,6 +1117,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) } } +#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) + /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks @@ -1118,14 +1134,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) sched_info_queued(p); update_curr(cfs_rq); + se->vruntime = cfs_rq->min_vruntime; update_stats_enqueue(cfs_rq, se); - /* - * Child runs first: we let it run before the parent - * until it reschedules once. 
We set up the key so that - * it will preempt the parent: - */ - se->fair_key = curr->fair_key - - niced_granularity(curr, sched_granularity(cfs_rq)) - 1; + /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: @@ -1138,9 +1149,16 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sched_feat(START_DEBIT)) - se->wait_runtime = -(sched_granularity(cfs_rq) / 2); + se->wait_runtime = -(__sched_period(cfs_rq->nr_running+1) / 2); + + if (sysctl_sched_child_runs_first && + curr->vruntime < se->vruntime) { + + dequeue_entity(cfs_rq, curr, 0); + swap(curr->vruntime, se->vruntime); + enqueue_entity(cfs_rq, curr, 0); + } - se->vruntime = cfs_rq->min_vruntime; update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); resched_task(rq->curr); -- cgit v1.1 From 6d0f0ebd063e36cd0ebae9be15973b02c4245a99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: simplify adaptive latency simplify adaptive latency. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 113 +++++----------------------------------------------- 1 file changed, 9 insertions(+), 104 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 95487e3..3179d11 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -217,77 +217,14 @@ static u64 __sched_period(unsigned long nr_running) return period; } -/* - * Calculate the preemption granularity needed to schedule every - * runnable task once per sysctl_sched_latency amount of time. - * (down to a sensible low limit on granularity) - * - * For example, if there are 2 tasks running and latency is 10 msecs, - * we switch tasks every 5 msecs. If we have 3 tasks running, we have - * to switch tasks every 3.33 msecs to get a 10 msecs observed latency - * for each task. We do finer and finer scheduling up to until we - * reach the minimum granularity value. - * - * To achieve this we use the following dynamic-granularity rule: - * - * gran = lat/nr - lat/nr/nr - * - * This comes out of the following equations: - * - * kA1 + gran = kB1 - * kB2 + gran = kA2 - * kA2 = kA1 - * kB2 = kB1 - d + d/nr - * lat = d * nr - * - * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), - * '1' is start of time, '2' is end of time, 'd' is delay between - * 1 and 2 (during which task B was running), 'nr' is number of tasks - * running, 'lat' is the the period of each task. ('lat' is the - * sched_latency that we aim for.) 
- */ -static long -sched_granularity(struct cfs_rq *cfs_rq) +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned int gran = sysctl_sched_latency; - unsigned int nr = cfs_rq->nr_running; - - if (nr > 1) { - gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_min_granularity); - } + u64 period = __sched_period(cfs_rq->nr_running); - return gran; -} + period *= se->load.weight; + do_div(period, cfs_rq->load.weight); -/* - * We rescale the rescheduling granularity of tasks according to their - * nice level, but only linearly, not exponentially: - */ -static long -niced_granularity(struct sched_entity *curr, unsigned long granularity) -{ - u64 tmp; - - if (likely(curr->load.weight == NICE_0_LOAD)) - return granularity; - /* - * Positive nice levels get the same granularity as nice-0: - */ - if (likely(curr->load.weight < NICE_0_LOAD)) { - tmp = curr->load.weight * (u64)granularity; - return (long) (tmp >> NICE_0_SHIFT); - } - /* - * Negative nice level tasks get linearly finer - * granularity: - */ - tmp = curr->load.inv_weight * (u64)granularity; - - /* - * It will always fit into 'long': - */ - return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); + return period; } static inline void @@ -646,36 +583,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) */ static void __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr, unsigned long granularity) + struct sched_entity *curr) { - s64 __delta = curr->fair_key - se->fair_key; unsigned long ideal_runtime, delta_exec; - /* - * ideal_runtime is compared against sum_exec_runtime, which is - * walltime, hence do not scale. - */ - ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, - (unsigned long)sysctl_sched_min_granularity); - - /* - * If we executed more than what the latency constraint suggests, - * reduce the rescheduling granularity. This way the total latency - * of how much a task is not scheduled converges to - * sysctl_sched_latency: - */ + ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) - granularity = 0; - - /* - * Take scheduling granularity into account - do not - * preempt the current task unless the best task has - * a larger than sched_granularity fairness advantage: - * - * scale granularity as key space is in fair_clock. 
- */ - if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); } @@ -749,8 +663,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, - sched_granularity(cfs_rq)); + __check_preempt_curr_fair(cfs_rq, next, curr); } /************************************************** @@ -944,7 +857,6 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - unsigned long gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -953,15 +865,8 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) return; } - gran = sysctl_sched_wakeup_granularity; - /* - * Batch tasks prefer throughput over latency: - */ - if (unlikely(p->policy == SCHED_BATCH)) - gran = sysctl_sched_batch_wakeup_granularity; - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); + __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.1 From 5c6b5964a0629bd39fbf4e5648a8aca32de5bcaf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: simplify check_preempt() methods simplify the check_preempt() methods. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3179d11..45c7493 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -582,8 +582,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * Preempt the current task with a newly woken task if needed: */ static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr) +__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; @@ -663,7 +662,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr); + __check_preempt_curr_fair(cfs_rq, curr); } /************************************************** @@ -866,7 +865,7 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) } if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se); + __check_preempt_curr_fair(cfs_rq, &curr->se); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.1 From 2e09bf556fbe1a4cd8d837a3e6607de55f7cf4fd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: wakeup granularity increase increase wakeup granularity - we were overscheduling a bit. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 45c7493..a60b1da 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -74,7 +74,7 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -const_debug unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; unsigned int sysctl_sched_runtime_limit __read_mostly; @@ -582,7 +582,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * Preempt the current task with a newly woken task if needed: */ static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *curr) +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; @@ -646,8 +646,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *next; - /* * Dequeue and enqueue the task to update its * position within the tree: @@ -655,14 +653,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) dequeue_entity(cfs_rq, curr, 0); enqueue_entity(cfs_rq, curr, 0); - /* - * Reschedule if another task tops the current one. - */ - next = __pick_next_entity(cfs_rq); - if (next == curr) - return; - - __check_preempt_curr_fair(cfs_rq, curr); + if (cfs_rq->nr_running > 1) + check_preempt_tick(cfs_rq, curr); } /************************************************** @@ -852,7 +844,7 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -863,9 +855,12 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + if (is_same_group(curr, p)) { + s64 delta = curr->se.vruntime - p->se.vruntime; - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &curr->se); + if (delta > (s64)sysctl_sched_wakeup_granularity) + resched_task(curr); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1095,7 +1090,7 @@ struct sched_class fair_sched_class __read_mostly = { .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, - .check_preempt_curr = check_preempt_curr_fair, + .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, -- cgit v1.1 From aeb73b040399f94698b4f64dd058cae39187e18d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: clean up new task placement clean up new task placement. 
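The description is terse, so a rough model of the place_entity() helper added in the diff below may help. This is a simplified sketch with reduced types: the sched_feat() checks and entity weighting are omitted, and it is not meant to match the kernel source line for line.

	/*
	 * Simplified model of the new placement rule: start a newly placed
	 * entity halfway between the left and right edge of the timeline,
	 * debit fresh forks by half a latency period (START_DEBIT), credit
	 * waking sleepers by up to one full period, and never move an
	 * entity backwards in virtual time.
	 */
	typedef unsigned long long u64;

	struct entity { u64 vruntime; };
	struct queue  { u64 min_vruntime; struct entity *rightmost; };

	static u64 place(const struct queue *q, const struct entity *se,
			 int initial, u64 latency /* sysctl_sched_latency */)
	{
		u64 target = q->min_vruntime;

		if (q->rightmost) {
			target = (target + q->rightmost->vruntime) / 2;
			if (initial)
				target += latency / 2;	/* START_DEBIT */
		}

		if (!initial) {				/* NEW_FAIR_SLEEPERS */
			if (target > latency)
				target -= latency;
			else
				target = 0;
		}

		return se->vruntime > target ? se->vruntime : target;
	}

Concentrating this in one helper lets enqueue_entity() and task_new_fair() share the same policy; the START_DEBIT term is presumably what keeps a freshly forked child from landing right at min_vruntime and displacing everything already queued.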
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith --- kernel/sched_fair.c | 57 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a60b1da..cc447fb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -199,6 +199,21 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); } +static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *se = NULL; + struct rb_node *parent; + + while (*link) { + parent = *link; + se = rb_entry(parent, struct sched_entity, run_node); + link = &parent->rb_right; + } + + return se; +} + /************************************************************** * Scheduling class statistics methods: */ @@ -530,6 +545,31 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +{ + struct sched_entity *last = __pick_last_entity(cfs_rq); + u64 min_runtime, latency; + + min_runtime = cfs_rq->min_vruntime; + if (last) { + min_runtime += last->vruntime; + min_runtime >>= 1; + if (initial && sched_feat(START_DEBIT)) + min_runtime += sysctl_sched_latency/2; + } + + if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { + latency = sysctl_sched_latency; + if (min_runtime > latency) + min_runtime -= latency; + else + min_runtime = 0; + } + + se->vruntime = max(se->vruntime, min_runtime); +} + +static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* @@ -538,19 +578,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) update_curr(cfs_rq); if (wakeup) { - u64 min_runtime, latency; - - min_runtime = cfs_rq->min_vruntime; - min_runtime += sysctl_sched_latency/2; - - if (sched_feat(NEW_FAIR_SLEEPERS)) { - latency = calc_weighted(sysctl_sched_latency, se); - if (min_runtime > latency) - min_runtime -= latency; - } - - se->vruntime = max(se->vruntime, min_runtime); - + place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -1033,8 +1061,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) sched_info_queued(p); update_curr(cfs_rq); - se->vruntime = cfs_rq->min_vruntime; - update_stats_enqueue(cfs_rq, se); + place_entity(cfs_rq, se, 1); /* * The first wait is dominated by the child-runs-first logic, -- cgit v1.1 From 67e12eac328b276dca7e61640632ed996ff1a93a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: add se->vruntime debugging debug se->vruntime fields. 
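For orientation, the new per-cfs_rq fields appear in /proc/sched_debug in the layout produced by the SEQ_printf() calls in the diff below. The numbers here are invented purely for illustration (nanoseconds of virtual runtime); both edge values print as -1 when the queue is empty:

	 .MIN_vruntime                  : 52801442185
	 .max_vruntime                  : 52803107990
	 .spread                        : 1665805

spread is just the distance between the leftmost and rightmost queued entity, so for equal-weight tasks it should stay roughly on the order of one scheduling period on a healthy queue.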
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- kernel/sched_debug.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6b789da..75ccf7a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -44,7 +44,8 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", + SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld %15Ld\n", + (long long)p->se.vruntime, (long long)p->se.sum_exec_runtime, (long long)p->se.sum_wait_runtime, (long long)p->se.sum_sleep_runtime, @@ -64,10 +65,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "\nrunnable tasks:\n" " task PID tree-key delta waiting" " switches prio" - " sum-exec sum-wait sum-sleep" + " exec-runtime sum-exec sum-wait sum-sleep" " wait-overrun wait-underrun\n" "------------------------------------------------------------------" - "----------------" + "--------------------------------" "------------------------------------------------" "--------------------------------\n"); @@ -108,6 +109,11 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { + s64 MIN_vruntime = -1, max_vruntime = -1, spread; + struct rq *rq = &per_cpu(runqueues, cpu); + struct sched_entity *last; + unsigned long flags; + SEQ_printf(m, "\ncfs_rq\n"); #define P(x) \ @@ -115,6 +121,23 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) P(fair_clock); P(exec_clock); + P(min_vruntime); + + spin_lock_irqsave(&rq->lock, flags); + if (cfs_rq->rb_leftmost) + MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld\n", "MIN_vruntime", + (long long)MIN_vruntime); + SEQ_printf(m, " .%-30s: %Ld\n", "max_vruntime", + (long long)max_vruntime); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld\n", "spread", + (long long)spread); + P(wait_runtime); P(wait_runtime_overruns); P(wait_runtime_underruns); @@ -243,6 +266,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.wait_start_fair); P(se.exec_start); P(se.sleep_start_fair); + P(se.vruntime); P(se.sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS -- cgit v1.1 From 28a1f6fa2f7ecec7e5da28b03a24abbecbd2e864 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: remove SCHED_FEAT_SKIP_INITIAL remove SCHED_FEAT_SKIP_INITIAL - it was off by default and even when enabled it never made any real difference. 
Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched.c | 4 +--- kernel/sched_fair.c | 7 ------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5594e65..bf85b4b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -399,7 +399,6 @@ enum { SCHED_FEAT_SLEEPER_AVG = 4, SCHED_FEAT_SLEEPER_LOAD_AVG = 8, SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, }; const_debug unsigned int sysctl_sched_features = @@ -407,8 +406,7 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; + SCHED_FEAT_START_DEBIT *1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cc447fb..c8c6b05 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1064,13 +1064,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) place_entity(cfs_rq, se, 1); /* - * The first wait is dominated by the child-runs-first logic, - * so do not credit it with that waiting time yet: - */ - if (sched_feat(SKIP_INITIAL)) - se->wait_start_fair = 0; - - /* * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ -- cgit v1.1 From 94dfb5e75ef59068a8cf68fa6e18f25ebdcd20b9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: add tree based averages add support for tree based vruntime averages. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 6 +++++- kernel/sched_fair.c | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index bf85b4b..198b07a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -399,6 +399,8 @@ enum { SCHED_FEAT_SLEEPER_AVG = 4, SCHED_FEAT_SLEEPER_LOAD_AVG = 8, SCHED_FEAT_START_DEBIT = 16, + SCHED_FEAT_USE_TREE_AVG = 32, + SCHED_FEAT_APPROX_AVG = 64, }; const_debug unsigned int sysctl_sched_features = @@ -406,7 +408,9 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_START_DEBIT *1; + SCHED_FEAT_START_DEBIT *1 | + SCHED_FEAT_USE_TREE_AVG *0 | + SCHED_FEAT_APPROX_AVG *0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c8c6b05..86e5e8c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -547,16 +547,22 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - struct sched_entity *last = __pick_last_entity(cfs_rq); u64 min_runtime, latency; min_runtime = cfs_rq->min_vruntime; - if (last) { - min_runtime += last->vruntime; - min_runtime >>= 1; - if (initial && sched_feat(START_DEBIT)) - min_runtime += sysctl_sched_latency/2; - } + + if (sched_feat(USE_TREE_AVG)) { + struct sched_entity *last = __pick_last_entity(cfs_rq); + if (last) { + min_runtime = __pick_next_entity(cfs_rq)->vruntime; + min_runtime += last->vruntime; + min_runtime >>= 1; + } + } else if (sched_feat(APPROX_AVG)) + min_runtime += sysctl_sched_latency/2; + + if (initial && sched_feat(START_DEBIT)) + min_runtime += sched_slice(cfs_rq, se); if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { latency = 
sysctl_sched_latency; -- cgit v1.1 From 9014623c0e3545be58a7f19f55793f6517bdc274 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:05 +0200 Subject: sched: handle vruntime 64-bit overflow Handle vruntime overflow by centering the key space around min_vruntime. ( otherwise we could overflow 64-bit vruntime in a few days with SCHED_IDLE tasks - or in a few years with nice +19. ) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 86e5e8c..895fef7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -124,11 +124,18 @@ set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) cfs_rq->rb_leftmost = leftmost; if (leftmost) { se = rb_entry(leftmost, struct sched_entity, run_node); - cfs_rq->min_vruntime = max(se->vruntime, - cfs_rq->min_vruntime); + if ((se->vruntime > cfs_rq->min_vruntime) || + (cfs_rq->min_vruntime > (1ULL << 61) && + se->vruntime < (1ULL << 50))) + cfs_rq->min_vruntime = se->vruntime; } } +s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return se->fair_key - cfs_rq->min_vruntime; +} + /* * Enqueue an entity into the rb-tree: */ @@ -138,7 +145,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = se->fair_key; + s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* @@ -151,7 +158,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * We dont care about collisions. Nodes with * the same key stay together. */ - if (key - entry->fair_key < 0) { + if (key < entity_key(cfs_rq, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; -- cgit v1.1 From 86d9560cb6bd85986e98b4c63705daec94406bd4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: add more vruntime statistics add more vruntime statistics. 
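In terms of what gets printed, the two derived values are plain differences taken under the runqueue lock (names as in the diff below):

	spread  = max_vruntime - MIN_vruntime      /* right edge minus left edge of this CPU's tree */
	spread0 = min_vruntime - rq0_min_vruntime  /* this CPU's min_vruntime relative to CPU 0's   */

spread shows how far apart the runnable entities on one queue have drifted, while spread0 makes the relative offset between per-CPU virtual clocks visible - the kind of offset that the SMP migration latency fix later in this series compensates for when a task changes CPUs.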
Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 75ccf7a..7a61706 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -109,7 +109,8 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, max_vruntime = -1, spread; + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; struct rq *rq = &per_cpu(runqueues, cpu); struct sched_entity *last; unsigned long flags; @@ -121,7 +122,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) P(fair_clock); P(exec_clock); - P(min_vruntime); spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) @@ -129,14 +129,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; + min_vruntime = rq->cfs.min_vruntime; + rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld\n", "MIN_vruntime", (long long)MIN_vruntime); + SEQ_printf(m, " .%-30s: %Ld\n", "min_vruntime", + (long long)min_vruntime); SEQ_printf(m, " .%-30s: %Ld\n", "max_vruntime", (long long)max_vruntime); spread = max_vruntime - MIN_vruntime; SEQ_printf(m, " .%-30s: %Ld\n", "spread", (long long)spread); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld\n", "spread0", + (long long)spread0); P(wait_runtime); P(wait_runtime_overruns); -- cgit v1.1 From 7a62eabc4d60980eb39fff659f168d903b55c6d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: debug: update exec_clock only when SCHED_DEBUG micro-optimization: update cfs_rq->exec_clock only if CONFIG_SCHED_DEBUG=y. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 895fef7..ce79eb0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -301,7 +301,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; - cfs_rq->exec_clock += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = delta_exec; if (unlikely(curr->load.weight != NICE_0_LOAD)) { delta_exec_weighted = calc_delta_fair(delta_exec_weighted, -- cgit v1.1 From 495eca494aa6006df55e3a04e105462c5940ca17 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: clean up struct load_stat 'struct load_stat' is redundant now so let's get rid of it. 
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched.c | 18 +++++++----------- kernel/sched_debug.c | 2 +- kernel/sched_fair.c | 2 +- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 198b07a..3a4ac0b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -171,10 +171,6 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -struct load_stat { - struct load_weight load; -}; - /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -236,7 +232,7 @@ struct rq { #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif - struct load_stat ls; /* capture load from *all* tasks on this cpu */ + struct load_weight load; /* capture load from *all* tasks on this cpu */ unsigned long nr_load_updates; u64 nr_switches; @@ -831,7 +827,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * Update delta_exec, delta_fair fields for rq. * * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while + * total load (rq->load.weight) on the runqueue, while * delta_exec advances at the same rate as wall-clock (provided * cpu is not idle). * @@ -839,17 +835,17 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * runqueue over any given interval. This (smoothened) load is used * during load balance. * - * This function is called /before/ updating rq->ls.load + * This function is called /before/ updating rq->load * and when switching tasks. */ static inline void inc_load(struct rq *rq, const struct task_struct *p) { - update_load_add(&rq->ls.load, p->se.load.weight); + update_load_add(&rq->load, p->se.load.weight); } static inline void dec_load(struct rq *rq, const struct task_struct *p) { - update_load_sub(&rq->ls.load, p->se.load.weight); + update_load_sub(&rq->load, p->se.load.weight); } static void inc_nr_running(struct task_struct *p, struct rq *rq) @@ -996,7 +992,7 @@ inline int task_curr(const struct task_struct *p) /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->ls.load.weight; + return cpu_rq(cpu)->load.weight; } static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) @@ -1979,7 +1975,7 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - unsigned long this_load = this_rq->ls.load.weight; + unsigned long this_load = this_rq->load.weight; int i, scale; this_rq->nr_load_updates++; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7a61706..62965f0 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -174,7 +174,7 @@ static void print_cpu(struct seq_file *m, int cpu) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->ls.load.weight); + rq->load.weight); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ce79eb0..72f202a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -652,7 +652,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->ls.load.weight >= 2*se->load.weight) { + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->slice_max = max(se->slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } -- cgit v1.1 From e22f5bbf86d8cce710d5c8ba5bf57832e73aab8c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: remove wait_runtime limit remove the wait_runtime-limit fields and the code depending on it, now that the math has been changed over to rely on the vruntime metric. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 1 - kernel/sched.c | 18 ---------- kernel/sched_debug.c | 2 -- kernel/sched_fair.c | 97 +++------------------------------------------------ kernel/sysctl.c | 11 ------ 5 files changed, 5 insertions(+), 124 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5e5c457..353630d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -905,7 +905,6 @@ struct sched_entity { u64 vruntime; u64 prev_sum_exec_runtime; u64 wait_start_fair; - u64 sleep_start_fair; #ifdef CONFIG_SCHEDSTATS u64 wait_start; diff --git a/kernel/sched.c b/kernel/sched.c index 3a4ac0b..21cc3b2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -180,7 +180,6 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; s64 wait_runtime; - u64 sleeper_bonus; unsigned long wait_runtime_overruns, wait_runtime_underruns; struct rb_root tasks_timeline; @@ -673,19 +672,6 @@ static inline void resched_task(struct task_struct *p) } #endif -static u64 div64_likely32(u64 divident, unsigned long divisor) -{ -#if BITS_PER_LONG == 32 - if (likely(divident <= 0xffffffffULL)) - return (u32)divident / divisor; - do_div(divident, divisor); - - return divident; -#else - return divident / divisor; -#endif -} - #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) #else @@ -1016,8 +1002,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->se.wait_start_fair) p->se.wait_start_fair -= fair_clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1592,7 +1576,6 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.wait_runtime = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -6582,7 +6565,6 @@ void normalize_rt_tasks(void) p->se.wait_runtime = 0; p->se.exec_start = 0; p->se.wait_start_fair = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; p->se.sleep_start = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 62965f0..3350169 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -148,7 +148,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) P(wait_runtime); P(wait_runtime_overruns); P(wait_runtime_underruns); - P(sleeper_bonus); #undef P print_cfs_rq_runtime_sum(m, cpu, cfs_rq); @@ -272,7 +271,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.wait_runtime); P(se.wait_start_fair); P(se.exec_start); - P(se.sleep_start_fair); P(se.vruntime); P(se.sum_exec_runtime); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 72f202a..a94189c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -249,41 +249,11 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 
return period; } -static inline void -limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - long limit = sysctl_sched_runtime_limit; - - /* - * Niced tasks have the same history dynamic range as - * non-niced tasks: - */ - if (unlikely(se->wait_runtime > limit)) { - se->wait_runtime = limit; - schedstat_inc(se, wait_runtime_overruns); - schedstat_inc(cfs_rq, wait_runtime_overruns); - } - if (unlikely(se->wait_runtime < -limit)) { - se->wait_runtime = -limit; - schedstat_inc(se, wait_runtime_underruns); - schedstat_inc(cfs_rq, wait_runtime_underruns); - } -} - -static inline void -__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - se->wait_runtime += delta; - schedstat_add(se, sum_wait_runtime, delta); - limit_wait_runtime(cfs_rq, se); -} - static void add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) { - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); - __add_wait_runtime(cfs_rq, se, delta); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); + se->wait_runtime += delta; + schedstat_add(cfs_rq, wait_runtime, delta); } /* @@ -294,7 +264,7 @@ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { - unsigned long delta, delta_fair, delta_mine, delta_exec_weighted; + unsigned long delta_fair, delta_mine, delta_exec_weighted; struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; @@ -318,14 +288,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { - delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); - delta = min(delta, (unsigned long)( - (long)sysctl_sched_runtime_limit - curr->wait_runtime)); - cfs_rq->sleeper_bonus -= delta; - delta_mine -= delta; - } - cfs_rq->fair_clock += delta_fair; /* * We executed delta_exec amount of time on the CPU, @@ -461,58 +423,8 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long delta_fair) -{ - unsigned long load = cfs_rq->load.weight; - long prev_runtime; - - /* - * Do not boost sleepers if there's too much bonus 'in flight' - * already: - */ - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - return; - - if (sched_feat(SLEEPER_LOAD_AVG)) - load = rq_of(cfs_rq)->cpu_load[2]; - - /* - * Fix up delta_fair with the effect of us running - * during the whole sleep period: - */ - if (sched_feat(SLEEPER_AVG)) - delta_fair = div64_likely32((u64)delta_fair * load, - load + se->load.weight); - - delta_fair = calc_weighted(delta_fair, se); - - prev_runtime = se->wait_runtime; - __add_wait_runtime(cfs_rq, se, delta_fair); - delta_fair = se->wait_runtime - prev_runtime; - - /* - * Track the amount of bonus we've given to sleepers: - */ - cfs_rq->sleeper_bonus += delta_fair; -} - static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *tsk = task_of(se); - unsigned long delta_fair; - - if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !sched_feat(FAIR_SLEEPERS)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - - __enqueue_sleeper(cfs_rq, se, delta_fair); - - se->sleep_start_fair = 0; - #ifdef CONFIG_SCHEDSTATS if 
(se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; @@ -544,6 +456,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) * time that the task spent sleeping: */ if (unlikely(prof_on == SLEEP_PROFILING)) { + struct task_struct *tsk = task_of(se); + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } @@ -604,7 +518,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { update_stats_dequeue(cfs_rq, se); if (sleep) { - se->sleep_start_fair = cfs_rq->fair_clock; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9b1b0d4..97b15c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -266,17 +266,6 @@ static ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_runtime_limit_ns", - .data = &sysctl_sched_runtime_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), -- cgit v1.1 From bbdba7c0e1161934ae881ad00e4db49830f5ef59 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: remove wait_runtime fields and features remove wait_runtime based fields and features, now that the CFS math has been changed over to the vruntime metric. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 9 ------- kernel/sched.c | 38 ++++---------------------- kernel/sched_debug.c | 54 ++++--------------------------------- kernel/sched_fair.c | 74 +++------------------------------------------------ 4 files changed, 14 insertions(+), 161 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 353630d..572df1b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -888,13 +888,9 @@ struct load_weight { * 4 se->block_start * 4 se->run_node * 4 se->sleep_start - * 4 se->sleep_start_fair * 6 se->load.weight - * 7 se->delta_fair - * 15 se->wait_runtime */ struct sched_entity { - long wait_runtime; s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; @@ -904,12 +900,10 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; - u64 wait_start_fair; #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; - s64 sum_wait_runtime; u64 sleep_start; u64 sleep_max; @@ -919,9 +913,6 @@ struct sched_entity { u64 block_max; u64 exec_max; u64 slice_max; - - unsigned long wait_runtime_overruns; - unsigned long wait_runtime_underruns; #endif #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched.c b/kernel/sched.c index 21cc3b2..0f0cf37 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -176,11 +176,8 @@ struct cfs_rq { struct load_weight load; unsigned long nr_running; - s64 fair_clock; u64 exec_clock; u64 min_vruntime; - s64 wait_runtime; - unsigned long wait_runtime_overruns, wait_runtime_underruns; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; @@ -389,20 +386,14 @@ static void update_rq_clock(struct rq *rq) * Debugging: various feature bits */ enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_NEW_FAIR_SLEEPERS = 2, - SCHED_FEAT_SLEEPER_AVG = 4, - SCHED_FEAT_SLEEPER_LOAD_AVG = 8, - 
SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_USE_TREE_AVG = 32, - SCHED_FEAT_APPROX_AVG = 64, + SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, + SCHED_FEAT_START_DEBIT = 2, + SCHED_FEAT_USE_TREE_AVG = 4, + SCHED_FEAT_APPROX_AVG = 8, }; const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_FAIR_SLEEPERS *0 | SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *0 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_USE_TREE_AVG *0 | SCHED_FEAT_APPROX_AVG *0; @@ -716,15 +707,11 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - if (sched_feat(FAIR_SLEEPERS)) - lw->inv_weight = WMULT_CONST / lw->weight; } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - if (sched_feat(FAIR_SLEEPERS) && likely(lw->weight)) - lw->inv_weight = WMULT_CONST / lw->weight; } /* @@ -848,8 +835,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) static void set_load_weight(struct task_struct *p) { - p->se.wait_runtime = 0; - if (task_has_rt_policy(p)) { p->se.load.weight = prio_to_weight[0] * 2; p->se.load.inv_weight = prio_to_wmult[0] >> 1; @@ -995,13 +980,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - u64 clock_offset, fair_clock_offset; + u64 clock_offset; clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; - - if (p->se.wait_start_fair) - p->se.wait_start_fair -= fair_clock_offset; #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1571,15 +1552,12 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { - p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.wait_runtime = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; - p->se.sum_wait_runtime = 0; p->se.sum_sleep_runtime = 0; p->se.sleep_start = 0; p->se.block_start = 0; @@ -1588,8 +1566,6 @@ static void __sched_fork(struct task_struct *p) p->se.exec_max = 0; p->se.slice_max = 0; p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; #endif INIT_LIST_HEAD(&p->run_list); @@ -6436,7 +6412,6 @@ int in_sched_functions(unsigned long addr) static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; - cfs_rq->fair_clock = 1; #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif @@ -6562,15 +6537,12 @@ void normalize_rt_tasks(void) read_lock_irq(&tasklist_lock); do_each_thread(g, p) { p->se.fair_key = 0; - p->se.wait_runtime = 0; p->se.exec_start = 0; - p->se.wait_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->cfs.fair_clock = 0; task_rq(p)->clock = 0; if (!rt_task(p)) { diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3350169..e3b6232 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -36,21 +36,16 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", + SEQ_printf(m, "%15s %5d %15Ld %13Ld %5d ", p->comm, p->pid, (long long)p->se.fair_key, - (long long)(p->se.fair_key - rq->cfs.fair_clock), - (long long)p->se.wait_runtime, (long long)(p->nvcsw + p->nivcsw), p->prio); 
#ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld %15Ld\n", + SEQ_printf(m, "%15Ld %15Ld %15Ld\n", (long long)p->se.vruntime, (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_wait_runtime, - (long long)p->se.sum_sleep_runtime, - (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns); + (long long)p->se.sum_sleep_runtime); #else SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", 0LL, 0LL, 0LL, 0LL, 0LL); @@ -63,10 +58,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key delta waiting" - " switches prio" - " exec-runtime sum-exec sum-wait sum-sleep" - " wait-overrun wait-underrun\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" "------------------------------------------------------------------" "--------------------------------" "------------------------------------------------" @@ -84,29 +77,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irq(&tasklist_lock); } -static void -print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ - s64 wait_runtime_rq_sum = 0; - struct task_struct *p; - struct rb_node *curr; - unsigned long flags; - struct rq *rq = &per_cpu(runqueues, cpu); - - spin_lock_irqsave(&rq->lock, flags); - curr = first_fair(cfs_rq); - while (curr) { - p = rb_entry(curr, struct task_struct, se.run_node); - wait_runtime_rq_sum += p->se.wait_runtime; - - curr = rb_next(curr); - } - spin_unlock_irqrestore(&rq->lock, flags); - - SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", - (long long)wait_runtime_rq_sum); -} - void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -120,7 +90,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - P(fair_clock); P(exec_clock); spin_lock_irqsave(&rq->lock, flags); @@ -144,13 +113,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld\n", "spread0", (long long)spread0); - - P(wait_runtime); - P(wait_runtime_overruns); - P(wait_runtime_underruns); #undef P - - print_cfs_rq_runtime_sum(m, cpu, cfs_rq); } static void print_cpu(struct seq_file *m, int cpu) @@ -268,8 +231,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) - P(se.wait_runtime); - P(se.wait_start_fair); P(se.exec_start); P(se.vruntime); P(se.sum_exec_runtime); @@ -283,9 +244,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.exec_max); P(se.slice_max); P(se.wait_max); - P(se.wait_runtime_overruns); - P(se.wait_runtime_underruns); - P(se.sum_wait_runtime); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -312,8 +270,6 @@ void proc_sched_set_task(struct task_struct *p) p->se.exec_max = 0; p->se.slice_max = 0; p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; #endif p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a94189c..2df5a64 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -178,8 +178,6 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); 
cfs_rq->nr_running++; se->on_rq = 1; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static void @@ -192,8 +190,6 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; - - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -249,13 +245,6 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) return period; } -static void -add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - se->wait_runtime += delta; - schedstat_add(cfs_rq, wait_runtime, delta); -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -264,9 +253,7 @@ static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { - unsigned long delta_fair, delta_mine, delta_exec_weighted; - struct load_weight *lw = &cfs_rq->load; - unsigned long load = lw->weight; + unsigned long delta_exec_weighted; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -278,25 +265,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, &curr->load); } curr->vruntime += delta_exec_weighted; - - if (!sched_feat(FAIR_SLEEPERS)) - return; - - if (unlikely(!load)) - return; - - delta_fair = calc_delta_fair(delta_exec, lw); - delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - - cfs_rq->fair_clock += delta_fair; - /* - * We executed delta_exec amount of time on the CPU, - * but we were only entitled to delta_mine amount of - * time during that period (if nr_running == 1 then - * the two values are equal) - * [Note: delta_mine - delta_exec is negative]: - */ - add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); } static void update_curr(struct cfs_rq *cfs_rq) @@ -322,7 +290,6 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->wait_start_fair = cfs_rq->fair_clock; schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } @@ -354,35 +321,11 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) se->fair_key = se->vruntime; } -/* - * Note: must be called with a freshly updated rq->fair_clock. - */ -static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long delta_fair) -{ - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - - delta_fair = calc_weighted(delta_fair, se); - - add_wait_runtime(cfs_rq, se, delta_fair); -} - static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long delta_fair; - - if (unlikely(!se->wait_start_fair)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - - __update_stats_wait_end(cfs_rq, se, delta_fair); - - se->wait_start_fair = 0; + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); schedstat_set(se->wait_start, 0); } @@ -552,9 +495,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Any task has to be enqueued before it get to execute on * a CPU. So account for the time it spent waiting on the - * runqueue. 
(note, here we rely on pick_next_task() having - * done a put_prev_task_fair() shortly before this, which - * updated rq->fair_clock - used by update_stats_wait_end()) + * runqueue. */ update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); @@ -989,13 +930,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) update_curr(cfs_rq); place_entity(cfs_rq, se, 1); - /* - * The statistical average of wait_runtime is about - * -granularity/2, so initialize the task with that: - */ - if (sched_feat(START_DEBIT)) - se->wait_runtime = -(__sched_period(cfs_rq->nr_running+1) / 2); - if (sysctl_sched_child_runs_first && curr->vruntime < se->vruntime) { -- cgit v1.1 From db36cc7d6d9e538481e60fae7f56646b92557526 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:06 +0200 Subject: sched: clean up schedstat block in dequeue_entity() Better placement of #ifdef CONFIG_SCHEDSTAT block in dequeue_entity(). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2df5a64..e3081fb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -460,8 +460,8 @@ static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { update_stats_dequeue(cfs_rq, se); - if (sleep) { #ifdef CONFIG_SCHEDSTATS + if (sleep) { if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -470,8 +470,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } -#endif } +#endif __dequeue_entity(cfs_rq, se); } -- cgit v1.1 From 35a6ff5417bf94c9e19b6b55a9eb6eea14cc7be7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: x86: allow single-depth wchan output sched.o gets smaller and faster if we compile it with -fomit-frame-pointers, so make this a config option. The cost is the loss of multi-depth wchan lookups - but SysRq-T is a sufficient replacement for them anyway, so their utility is much lower these days. the size difference is significant: text data bss dec hex filename 34005 3462 24 37491 9273 sched.o.before 33470 3462 24 36956 905c sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- arch/i386/Kconfig | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f1486f8..bf9aafa 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -214,6 +214,17 @@ config X86_ES7000 endchoice +config SCHED_NO_NO_OMIT_FRAME_POINTER + bool "Single-depth WCHAN output" + default y + help + Calculate simpler /proc//wchan values. If this option + is disabled then wchan values will recurse back to the + caller function. This provides more accurate wchan values, + at the expense of slightly more scheduling overhead. + + If in doubt, say "Y". + config PARAVIRT bool "Paravirtualization support (EXPERIMENTAL)" depends on EXPERIMENTAL -- cgit v1.1 From 02e0431a3db554019b816936b597d618256b705d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: better min_vruntime tracking Better min_vruntime tracking: update it every time 'curr' is updated - not just when a task is enqueued into the tree. 
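Because min_vruntime is now advanced from the accounting path every time the running entity's vruntime moves, the wrap-tolerant comparison becomes the one place that decides whether a value counts as "newer". Below is a self-contained userspace illustration of that helper; it has the same shape as the kernel function in the diff that follows, but the surrounding program and the sample values are invented for demonstration:

	#include <stdint.h>
	#include <stdio.h>

	typedef uint64_t u64;

	/* same shape as the kernel's max_vruntime() in this patch */
	static u64 max_vruntime(u64 min_vruntime, u64 vruntime)
	{
		if ((vruntime > min_vruntime) ||
		    (min_vruntime > (1ULL << 61) && vruntime < (1ULL << 50)))
			min_vruntime = vruntime;

		return min_vruntime;
	}

	int main(void)
	{
		u64 near_wrap = ~0ULL - 100;	/* virtual clock just before 2^64 */
		u64 wrapped   = 50;		/* same clock shortly after wrapping */

		/* without the second clause, the wrapped value would look "older" */
		printf("%llu\n", (unsigned long long)max_vruntime(near_wrap, wrapped));
		return 0;
	}

The 2^61 / 2^50 window treats a tiny value as having wrapped past a huge one; a more general way to get the same wrap-safe ordering, for values within 2^63 of each other, is the signed-difference test (s64)(vruntime - min_vruntime) > 0.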
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e3081fb..ec445ca 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -116,22 +116,28 @@ static inline struct task_struct *task_of(struct sched_entity *se) * Scheduling class tree data structure manipulation methods: */ +static inline u64 +max_vruntime(u64 min_vruntime, u64 vruntime) +{ + if ((vruntime > min_vruntime) || + (min_vruntime > (1ULL << 61) && vruntime < (1ULL << 50))) + min_vruntime = vruntime; + + return min_vruntime; +} + static inline void set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) { struct sched_entity *se; cfs_rq->rb_leftmost = leftmost; - if (leftmost) { + if (leftmost) se = rb_entry(leftmost, struct sched_entity, run_node); - if ((se->vruntime > cfs_rq->min_vruntime) || - (cfs_rq->min_vruntime > (1ULL << 61) && - se->vruntime < (1ULL << 50))) - cfs_rq->min_vruntime = se->vruntime; - } } -s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline s64 +entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return se->fair_key - cfs_rq->min_vruntime; } @@ -254,6 +260,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; + u64 next_vruntime, min_vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -265,6 +272,25 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, &curr->load); } curr->vruntime += delta_exec_weighted; + + /* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. + */ + if (first_fair(cfs_rq)) { + next_vruntime = __pick_next_entity(cfs_rq)->vruntime; + + /* min_vruntime() := !max_vruntime() */ + min_vruntime = max_vruntime(curr->vruntime, next_vruntime); + if (min_vruntime == next_vruntime) + min_vruntime = curr->vruntime; + else + min_vruntime = next_vruntime; + } else + min_vruntime = curr->vruntime; + + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, min_vruntime); } static void update_curr(struct cfs_rq *cfs_rq) -- cgit v1.1 From 119fe5e06800afc197781ebc8c2d8ca7d03497c8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: fix SMP migration latencies fix SMP migration latencies: the vruntimes of different CPUs are at incompatible offsets so they have to be fixed up when migrating a task across CPUs. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 0f0cf37..4ad789d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -992,6 +992,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->se.block_start) p->se.block_start -= clock_offset; #endif + if (likely(new_rq->cfs.min_vruntime)) + p->se.vruntime -= old_rq->cfs.min_vruntime - + new_rq->cfs.min_vruntime; __set_task_cpu(p, new_cpu); } -- cgit v1.1 From 29f59db3a74b0bdf78a1f5b53ef773caa82692dc Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: group-scheduler core Add interface to control cpu bandwidth allocation to task-groups. 
(not yet configurable, due to missing CONFIG_CONTAINERS) Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- init/Kconfig | 9 ++ kernel/sched.c | 346 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched_fair.c | 3 +- kernel/sched_idletask.c | 5 + kernel/sched_rt.c | 5 + 5 files changed, 350 insertions(+), 18 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index d54d0ca..11c6762 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -281,6 +281,15 @@ config CPUSETS Say N if unsure. +config FAIR_GROUP_SCHED + bool "Fair group scheduler" + depends on EXPERIMENTAL && CONTAINERS + help + This option enables you to group tasks and control CPU resource + allocation to such groups. + + Say N if unsure. + config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y diff --git a/kernel/sched.c b/kernel/sched.c index 4ad789d..b2688ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -171,6 +171,58 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +#ifdef CONFIG_FAIR_GROUP_SCHED + +#include + +struct cfs_rq; + +/* task group related information */ +struct task_grp { + struct container_subsys_state css; + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; +}; + +/* Default task group's sched entity on each cpu */ +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); +/* Default task group's cfs_rq on each cpu */ +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; + +static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS]; + +/* Default task group. + * Every task in system belong to this group at bootup. + */ +static struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, + }; + +/* return group to which a task belongs */ +static inline struct task_grp *task_grp(struct task_struct *p) +{ + return container_of(task_subsys_state(p, cpu_subsys_id), + struct task_grp, css); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_cfs_rq(struct task_struct *p) +{ + p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)]; + p->se.parent = task_grp(p)->se[task_cpu(p)]; +} + +#else + +static inline void set_task_cfs_rq(struct task_struct *p) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -197,6 +249,7 @@ struct cfs_rq { * list is used during load balance. */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? 
*/ + struct task_grp *tg; /* group that "owns" this runqueue */ #endif }; @@ -419,18 +472,6 @@ unsigned long long cpu_clock(int cpu) return now; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Change a task's ->cfs_rq if it moves across CPUs */ -static inline void set_task_cfs_rq(struct task_struct *p) -{ - p->se.cfs_rq = &task_rq(p)->cfs; -} -#else -static inline void set_task_cfs_rq(struct task_struct *p) -{ -} -#endif - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -970,8 +1011,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { #ifdef CONFIG_SMP task_thread_info(p)->cpu = cpu; - set_task_cfs_rq(p); #endif + set_task_cfs_rq(p); } #ifdef CONFIG_SMP @@ -3885,8 +3926,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + if (task_running(rq, p)) + p->sched_class->put_prev_task(rq, p); + } if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3905,6 +3949,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); + p->sched_class->set_curr_task(rq); } else { check_preempt_curr(rq, p); } @@ -4190,8 +4235,11 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { deactivate_task(rq, p, 0); + if (task_running(rq, p)) + p->sched_class->put_prev_task(rq, p); + } oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); if (on_rq) { @@ -4204,6 +4252,7 @@ recheck: if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); + p->sched_class->set_curr_task(rq); } else { check_preempt_curr(rq, p); } @@ -6444,7 +6493,25 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + { + struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); + struct sched_entity *se = + &per_cpu(init_sched_entity, i); + + init_cfs_rq_p[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = &init_task_grp; + list_add(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + + init_sched_entity_p[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + init_task_grp.shares = NICE_0_LOAD; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6632,3 +6699,250 @@ void set_curr_task(int cpu, struct task_struct *p) } #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* return corresponding task_grp object of a container */ +static inline struct task_grp *container_tg(struct container *cont) +{ + return container_of(container_subsys_state(cont, cpu_subsys_id), + struct task_grp, css); +} + +/* allocate runqueue etc for a new task group */ +static struct container_subsys_state * +sched_create_group(struct container_subsys *ss, struct container *cont) +{ + struct task_grp *tg; + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + if (!cont->parent) { + /* This is early initialization for the top container */ + init_task_grp.css.container = cont; + return &init_task_grp.css; + } + + /* we support only 1-level deep hierarchical scheduler atm */ + if (cont->parent->parent) + return ERR_PTR(-EINVAL); + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = 
kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL); + if (!tg->se) + goto err; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, + cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, + cpu_to_node(i)); + if (!se) + goto err; + + memset(cfs_rq, 0, sizeof(struct cfs_rq)); + memset(se, 0, sizeof(struct sched_entity)); + + tg->cfs_rq[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + + tg->shares = NICE_0_LOAD; + + /* Bind the container to task_grp object we just created */ + tg->css.container = cont; + + return &tg->css; + +err: + for_each_possible_cpu(i) { + if (tg->cfs_rq && tg->cfs_rq[i]) + kfree(tg->cfs_rq[i]); + if (tg->se && tg->se[i]) + kfree(tg->se[i]); + } + if (tg->cfs_rq) + kfree(tg->cfs_rq); + if (tg->se) + kfree(tg->se); + if (tg) + kfree(tg); + + return ERR_PTR(-ENOMEM); +} + + +/* destroy runqueue etc associated with a task group */ +static void sched_destroy_group(struct container_subsys *ss, + struct container *cont) +{ + struct task_grp *tg = container_tg(cont); + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + /* wait for possible concurrent references to cfs_rqs complete */ + synchronize_sched(); + + /* now it should be safe to free those cfs_rqs */ + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + kfree(cfs_rq); + + se = tg->se[i]; + kfree(se); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); +} + +static int sched_can_attach(struct container_subsys *ss, + struct container *cont, struct task_struct *tsk) +{ + /* We don't support RT-tasks being in separate groups */ + if (tsk->sched_class != &fair_sched_class) + return -EINVAL; + + return 0; +} + +/* change task's runqueue when it moves between groups */ +static void sched_move_task(struct container_subsys *ss, struct container *cont, + struct container *old_cont, struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + if (tsk->sched_class != &fair_sched_class) + goto done; + + update_rq_clock(rq); + + running = task_running(rq, tsk); + on_rq = tsk->se.on_rq; + + if (on_rq) { + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + } + + set_task_cfs_rq(tsk); + + if (on_rq) { + enqueue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + } + +done: + task_rq_unlock(rq, &flags); +} + +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + int on_rq; + + spin_lock_irq(&rq->lock); + + on_rq = se->on_rq; + if (on_rq) + dequeue_entity(cfs_rq, se, 0); + + se->load.weight = shares; + se->load.inv_weight = div64_64((1ULL<<32), shares); + + if (on_rq) + enqueue_entity(cfs_rq, se, 0); + + spin_unlock_irq(&rq->lock); +} + +static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype, + struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) +{ + int i; + unsigned long shareval; + struct task_grp *tg = container_tg(cont); + char buffer[2*sizeof(unsigned 
long) + 1]; + + if (nbytes > 2*sizeof(unsigned long)) /* safety check */ + return -E2BIG; + + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + shareval = simple_strtoul(buffer, NULL, 10); + + tg->shares = shareval; + for_each_possible_cpu(i) + set_se_shares(tg->se[i], shareval); + + return nbytes; +} + +static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft) +{ + struct task_grp *tg = container_tg(cont); + + return (u64) tg->shares; +} + +struct cftype cpuctl_share = { + .name = "shares", + .read_uint = cpu_shares_read_uint, + .write = cpu_shares_write, +}; + +static int sched_populate(struct container_subsys *ss, struct container *cont) +{ + return container_add_file(cont, ss, &cpuctl_share); +} + +struct container_subsys cpu_subsys = { + .name = "cpu", + .create = sched_create_group, + .destroy = sched_destroy_group, + .can_attach = sched_can_attach, + .attach = sched_move_task, + .populate = sched_populate, + .subsys_id = cpu_subsys_id, + .early_init = 1, +}; + +#endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec445ca..12ab933 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -610,8 +610,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) */ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) { - /* A later patch will take group into account */ - return &cpu_rq(this_cpu)->cfs; + return cfs_rq->tg->cfs_rq[this_cpu]; } /* Iterate thr' all leaf cfs_rq's on a runqueue */ diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3503fb2..5ebf829 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) { } +static void set_curr_task_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = { .load_balance = load_balance_idle, + .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4b87476..45b339f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -218,6 +218,10 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) } } +static void set_curr_task_rt(struct rq *rq) +{ +} + static struct sched_class rt_sched_class __read_mostly = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -230,5 +234,6 @@ static struct sched_class rt_sched_class __read_mostly = { .load_balance = load_balance_rt, + .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, }; -- cgit v1.1 From d02e5ed8d55e2a2b2735232ea1da40ffbf4c0932 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: sched_setscheduler() fix Fix a problem in the 'sched-group' patch for !CONFIG_FAIR_GROUP_SCHED. description: sched_setscheduler() { ... if (task_running()) p->sched_class->put_prev_entity(); [ this one sets up cfs_rq->curr to NULL ] ... if (task_running) p->sched_class->set_curr_task(); [ and this one is a _NOP_ (empty) for !CONFIG_FAIR_GROUP_SCHED ] As a result, the task continues to run with cfs_rq->curr == NULL... no crashes (due to checks for !NULL in place) but e.g. update_curr() effectively becomes a NOP... i.e. runtime statistics for this task are not accounted until it's rescheduled anew.
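A minimal userspace sketch of the failure mode described above (the types and helpers below are simplified stand-ins, not the kernel's): while cfs_rq->curr is left NULL, the accounting path returns early and no runtime is charged until the task is scheduled in again.

#include <stdio.h>

struct toy_entity {
	unsigned long long sum_exec_runtime;
};

struct toy_cfs_rq {
	struct toy_entity *curr;	/* what set_curr_task() is meant to restore */
	unsigned long long clock;	/* timestamp of the last accounting update  */
};

/* simplified stand-in for update_curr(): charge runtime to 'curr' */
static void toy_update_curr(struct toy_cfs_rq *cfs_rq, unsigned long long now)
{
	if (!cfs_rq->curr) {		/* curr == NULL: accounting silently skipped */
		cfs_rq->clock = now;
		return;
	}
	cfs_rq->curr->sum_exec_runtime += now - cfs_rq->clock;
	cfs_rq->clock = now;
}

int main(void)
{
	struct toy_entity se = { 0 };
	struct toy_cfs_rq cfs_rq = { .curr = NULL, .clock = 0 };

	toy_update_curr(&cfs_rq, 100);	/* task runs, but curr is NULL */
	printf("charged while curr == NULL: %llu\n", se.sum_exec_runtime);

	cfs_rq.curr = &se;		/* what the fixed set_curr_task_fair() does */
	toy_update_curr(&cfs_rq, 200);
	printf("charged after curr is set: %llu\n", se.sum_exec_runtime);

	return 0;
}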
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 12ab933..144f3ef 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -984,6 +984,10 @@ static void set_curr_task_fair(struct rq *rq) #else static void set_curr_task_fair(struct rq *rq) { + struct sched_entity *se = &rq->curr->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->curr = se; } #endif -- cgit v1.1 From 7074badbcb4212d404a243e5c50efeb778ec3fc6 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: add set_curr_task() calls p->sched_class->set_curr_task() has to be called before activate_task()/enqueue_task() in rt_mutex_setprio(), sched_setschedule() and sched_move_task() in order to set up 'cfs_rq->curr'. The logic of enqueueing depends on whether a task to be inserted is 'current' or not. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index b2688ce..6d18921 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3915,8 +3915,8 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { + int oldprio, on_rq, running; unsigned long flags; - int oldprio, on_rq; struct rq *rq; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3926,9 +3926,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; + running = task_running(rq, p); if (on_rq) { dequeue_task(rq, p, 0); - if (task_running(rq, p)) + if (running) p->sched_class->put_prev_task(rq, p); } @@ -3940,16 +3941,17 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); - p->sched_class->set_curr_task(rq); } else { check_preempt_curr(rq, p); } @@ -4153,7 +4155,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq; + int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; struct rq *rq; @@ -4235,24 +4237,26 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; + running = task_running(rq, p); if (on_rq) { deactivate_task(rq, p, 0); - if (task_running(rq, p)) + if (running) p->sched_class->put_prev_task(rq, p); } oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); - p->sched_class->set_curr_task(rq); } else { check_preempt_curr(rq, p); } @@ -6861,9 +6865,9 @@ static void sched_move_task(struct 
container_subsys *ss, struct container *cont, set_task_cfs_rq(tsk); if (on_rq) { - enqueue_task(rq, tsk, 0); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); + enqueue_task(rq, tsk, 0); } done: -- cgit v1.1 From 30cfdcfc5f180fc21a3dad6ae3b7b2a9ee112186 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:07 +0200 Subject: sched: do not keep current in the tree and get rid of sched_entity::fair_key Get rid of 'sched_entity::fair_key'. As a side effect, 'current' is not kept withing the tree for SCHED_NORMAL/BATCH tasks anymore. This simplifies some parts of code (e.g. entity_tick() and yield_task_fair()) and also somewhat optimizes them (e.g. a single update_curr() now vs. dequeue/enqueue() before in entity_tick()). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 1 - kernel/sched.c | 1 - kernel/sched_debug.c | 2 +- kernel/sched_fair.c | 54 +++++++++++++++++++++++++++++++++------------------ 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 572df1b..f776a30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -891,7 +891,6 @@ struct load_weight { * 6 se->load.weight */ struct sched_entity { - s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; diff --git a/kernel/sched.c b/kernel/sched.c index 6d18921..3b10463 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,7 +6610,6 @@ void normalize_rt_tasks(void) read_lock_irq(&tasklist_lock); do_each_thread(g, p) { - p->se.fair_key = 0; p->se.exec_start = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e3b6232..bb34b81 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -38,7 +38,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15s %5d %15Ld %13Ld %5d ", p->comm, p->pid, - (long long)p->se.fair_key, + (long long)p->se.vruntime, (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 144f3ef..b9e426a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -139,7 +139,7 @@ set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return se->fair_key - cfs_rq->min_vruntime; + return se->vruntime - cfs_rq->min_vruntime; } /* @@ -181,9 +181,6 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); - update_load_add(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -193,9 +190,6 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) set_leftmost(cfs_rq, rb_next(&se->run_node)); rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - update_load_sub(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -341,10 +335,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); - /* - * Update the key: - */ - se->fair_key = se->vruntime; } static void @@ -392,6 +382,22 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: 
*/ +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_add(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running++; + se->on_rq = 1; +} + +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running--; + se->on_rq = 0; +} + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -479,7 +485,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) } update_stats_enqueue(cfs_rq, se); - __enqueue_entity(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); } static void @@ -498,7 +506,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) } } #endif - __dequeue_entity(cfs_rq, se); + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); } /* @@ -544,6 +554,10 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + /* 'current' is not kept within the tree. */ + if (se) + __dequeue_entity(cfs_rq, se); + set_next_entity(cfs_rq, se); return se; @@ -560,19 +574,20 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_curr_end(cfs_rq, prev); - if (prev->on_rq) + if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + } cfs_rq->curr = NULL; } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Update run-time statistics of the 'current'. */ - dequeue_entity(cfs_rq, curr, 0); - enqueue_entity(cfs_rq, curr, 0); + update_curr(cfs_rq); if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); @@ -749,7 +764,7 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) /* * Minimally necessary key value to be last in the tree: */ - se->fair_key = rightmost->fair_key + 1; + se->vruntime = rightmost->vruntime + 1; if (cfs_rq->rb_leftmost == &se->run_node) cfs_rq->rb_leftmost = rb_next(&se->run_node); @@ -965,6 +980,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); resched_task(rq->curr); } -- cgit v1.1 From 75d4ef16a6aa84f708188bada182315f80aab6fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: fix delay accounting performance regression fix delay accounting performance regression - those sched_clock() calls are not needed. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_stats.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c20a94d..1d9ec98 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_set(var, val) do { } while (0) #endif -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given * the cpu. 
We should note that with the exception of interactive @@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +#endif /* CONFIG_SCHEDSTATS */ -- cgit v1.1 From 87fefa381ef27f46c1182622ea01eb9504cd2e24 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: optimize task_new_fair() due to the fact that we no longer keep the 'current' within the tree, dequeue/enqueue_entity() is useless for the 'current' in task_new_fair(). We are about to reschedule and sched_class->put_prev_task() will put the 'current' back into the tree, based on its new key. text data bss dec hex filename 24388 2734 20 27142 6a06 sched.o.before 24341 2734 20 27095 69d7 sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b9e426a..827a063 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -972,10 +972,11 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) if (sysctl_sched_child_runs_first && curr->vruntime < se->vruntime) { - - dequeue_entity(cfs_rq, curr, 0); + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. + */ swap(curr->vruntime, se->vruntime); - enqueue_entity(cfs_rq, curr, 0); } update_stats_enqueue(cfs_rq, se); -- cgit v1.1 From 4530d7ab0fb8d5056b68c376949e2d5c4db7817e Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: simplify sched_class::yield_task() the 'p' (task_struct) parameter in the sched_class :: yield_task() is redundant as the caller is always the 'current'. Get rid of it. 
text data bss dec hex filename 24341 2734 20 27095 69d7 sched.o.before 24330 2734 20 27084 69cc sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 2 +- kernel/sched.c | 2 +- kernel/sched_fair.c | 10 +++++----- kernel/sched_rt.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f776a30..6616900 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -858,7 +858,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); - void (*yield_task) (struct rq *rq, struct task_struct *p); + void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index 3b10463..e1f784f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4537,7 +4537,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq); /* * Since we are going to call schedule() anyway, there's diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 827a063..4dd256d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -722,11 +722,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) * * If compat_yield is turned on then we requeue to the end of the tree. */ -static void yield_task_fair(struct rq *rq, struct task_struct *p) +static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = &rq->cfs; struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *rightmost, *se = &p->se; + struct sched_entity *rightmost, *se = &rq->curr->se; struct rb_node *parent; /* @@ -741,8 +741,8 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + dequeue_entity(cfs_rq, se, 0); + enqueue_entity(cfs_rq, se, 0); return; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 45b339f..b86944c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p) } static void -yield_task_rt(struct rq *rq, struct task_struct *p) +yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, p); + requeue_task_rt(rq, rq->curr); } /* -- cgit v1.1 From f6b53205e17c8ca481c69ed579a35a650a4b481a Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: rework enqueue/dequeue_entity() to get rid of set_curr_task() rework enqueue/dequeue_entity() to get rid of sched_class::set_curr_task(). This simplifies sched_setscheduler(), rt_mutex_setprio() and sched_move_tasks(). 
text data bss dec hex filename 24330 2734 20 27084 69cc sched.o.before 24233 2730 20 26983 6967 sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 1 - kernel/sched.c | 36 +++++++++----------------------- kernel/sched_fair.c | 55 ++++++++++++++++++++++--------------------------- kernel/sched_idletask.c | 5 ----- kernel/sched_rt.c | 5 ----- 5 files changed, 35 insertions(+), 67 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6616900..abcb027 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -871,7 +871,6 @@ struct sched_class { struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio); - void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); }; diff --git a/kernel/sched.c b/kernel/sched.c index e1f784f..72c936d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3915,8 +3915,8 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running; unsigned long flags; + int oldprio, on_rq; struct rq *rq; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3926,12 +3926,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; - running = task_running(rq, p); - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3941,15 +3937,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -4155,7 +4149,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq, running; + int retval, oldprio, oldpolicy = -1, on_rq; unsigned long flags; struct rq *rq; @@ -4237,24 +4231,20 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - running = task_running(rq, p); - if (on_rq) { + if (on_rq) deactivate_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -6855,19 +6845,13 @@ static void sched_move_task(struct container_subsys *ss, struct container *cont, running = task_running(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - } set_task_cfs_rq(tsk); - if (on_rq) { - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); + if (on_rq) enqueue_task(rq, tsk, 
0); - } done: task_rq_unlock(rq, &flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4dd256d..568e922 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -472,9 +472,20 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup, int set_curr) { /* + * In case of the 'current'. + */ + if (unlikely(set_curr)) { + update_stats_curr_start(cfs_rq, se); + cfs_rq->curr = se; + account_entity_enqueue(cfs_rq, se); + return; + } + + /* * Update the fair clock. */ update_curr(cfs_rq); @@ -485,8 +496,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) } update_stats_enqueue(cfs_rq, se); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); + __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); } @@ -506,8 +516,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) } } #endif - if (se != cfs_rq->curr) + if (likely(se != cfs_rq->curr)) __dequeue_entity(cfs_rq, se); + else { + update_stats_curr_end(cfs_rq, se); + cfs_rq->curr = NULL; + } account_entity_dequeue(cfs_rq, se); } @@ -689,12 +703,17 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int set_curr = 0; + + /* Are we enqueuing the current task? */ + if (unlikely(task_running(rq, p))) + set_curr = 1; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); + enqueue_entity(cfs_rq, se, wakeup, set_curr); } } @@ -742,7 +761,7 @@ static void yield_task_fair(struct rq *rq) * position within the tree: */ dequeue_entity(cfs_rq, se, 0); - enqueue_entity(cfs_rq, se, 0); + enqueue_entity(cfs_rq, se, 0, 1); return; } @@ -985,29 +1004,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. 
- */ -static void set_curr_task_fair(struct rq *rq) -{ - struct sched_entity *se = &rq->curr->se; - - for_each_sched_entity(se) - set_next_entity(cfs_rq_of(se), se); -} -#else -static void set_curr_task_fair(struct rq *rq) -{ - struct sched_entity *se = &rq->curr->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - cfs_rq->curr = se; -} -#endif - /* * All the scheduling class methods: */ @@ -1023,7 +1019,6 @@ struct sched_class fair_sched_class __read_mostly = { .load_balance = load_balance_fair, - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, }; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 5ebf829..3503fb2 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -50,10 +50,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) { } -static void set_curr_task_idle(struct rq *rq) -{ -} - /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -70,7 +66,6 @@ static struct sched_class idle_sched_class __read_mostly = { .load_balance = load_balance_idle, - .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b86944c..3c77c03 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -218,10 +218,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) } } -static void set_curr_task_rt(struct rq *rq) -{ -} - static struct sched_class rt_sched_class __read_mostly = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -234,6 +230,5 @@ static struct sched_class rt_sched_class __read_mostly = { .load_balance = load_balance_rt, - .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, }; -- cgit v1.1 From 1a75b94f7bda591f4c53af86baa50e1eaee35927 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: prettify /proc/sched_debug output print the correct amount of dashes in /proc/sched_debug. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index bb34b81..22cf74c 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -60,10 +60,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "\nrunnable tasks:\n" " task PID tree-key switches prio" " exec-runtime sum-exec sum-sleep\n" - "------------------------------------------------------------------" - "--------------------------------" - "------------------------------------------------" - "--------------------------------\n"); + "------------------------------------------------------" + "------------------------------------------------"); read_lock_irq(&tasklist_lock); -- cgit v1.1 From ef83a5714d9a817b2e9b97f04a6d070fbd6ecf80 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: enhance debug output enhance debug output by changing 12345678 nsecs to 12.345678 output, this is more human-readable. 
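The conversion itself is a division by 10^6 with the remainder kept for the fractional part; a plain userspace illustration follows (the kernel helpers additionally handle negative values and use do_div(), since plain 64-bit division is not available on all architectures inside the kernel).

#include <stdio.h>

int main(void)
{
	long long nsec = 12345678;		/* example value from the changelog */
	long long msec = nsec / 1000000;	/* whole milliseconds               */
	long long frac = nsec % 1000000;	/* remainder, printed as 6 digits   */

	/* prints "12345678 nsecs -> 12.345678 msecs" */
	printf("%lld nsecs -> %lld.%06lld msecs\n", nsec, msec, frac);

	return 0;
}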
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 108 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 22cf74c..e2c1e0d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -28,6 +28,31 @@ printk(x); \ } while (0) +/* + * Ease the printing of nsec fields: + */ +static long long nsec_high(long long nsec) +{ + if (nsec < 0) { + nsec = -nsec; + do_div(nsec, 1000000); + return -nsec; + } + do_div(nsec, 1000000); + + return nsec; +} + +static unsigned long nsec_low(long long nsec) +{ + if (nsec < 0) + nsec = -nsec; + + return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -36,19 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %5d ", + SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, p->pid, - (long long)p->se.vruntime, + SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld\n", - (long long)p->se.vruntime, - (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_sleep_runtime); + SEQ_printf(m, "%15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - 0LL, 0LL, 0LL, 0LL, 0LL); + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif } @@ -85,10 +110,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, "\ncfs_rq\n"); -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - - P(exec_clock); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) @@ -99,19 +122,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) min_vruntime = rq->cfs.min_vruntime; rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; spin_unlock_irqrestore(&rq->lock, flags); - SEQ_printf(m, " .%-30s: %Ld\n", "MIN_vruntime", - (long long)MIN_vruntime); - SEQ_printf(m, " .%-30s: %Ld\n", "min_vruntime", - (long long)min_vruntime); - SEQ_printf(m, " .%-30s: %Ld\n", "max_vruntime", - (long long)max_vruntime); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld\n", "spread", - (long long)spread); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld\n", "spread0", - (long long)spread0); -#undef P + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); } static void print_cpu(struct seq_file *m, int cpu) @@ -131,6 +153,8 @@ static void print_cpu(struct seq_file *m, int cpu) #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", @@ -139,21 +163,22 @@ static void 
print_cpu(struct seq_file *m, int cpu) P(nr_load_updates); P(nr_uninterruptible); SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); - P(next_balance); + PN(next_balance); P(curr->pid); - P(clock); - P(idle_clock); - P(prev_clock_raw); + PN(clock); + PN(idle_clock); + PN(prev_clock_raw); P(clock_warps); P(clock_overflows); P(clock_deep_idle_events); - P(clock_max_delta); + PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); #undef P +#undef PN print_cfs_stats(m, cpu); @@ -170,7 +195,7 @@ static int sched_debug_show(struct seq_file *m, void *v) (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); + SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); for_each_online_cpu(cpu) print_cpu(m, cpu); @@ -228,20 +253,22 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "----------------------------------------------\n"); #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) +#define PN(F) \ + SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) - P(se.exec_start); - P(se.vruntime); - P(se.sum_exec_runtime); + PN(se.exec_start); + PN(se.vruntime); + PN(se.sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - P(se.wait_start); - P(se.sleep_start); - P(se.block_start); - P(se.sleep_max); - P(se.block_max); - P(se.exec_max); - P(se.slice_max); - P(se.wait_max); + PN(se.wait_start); + PN(se.sleep_start); + PN(se.block_start); + PN(se.sleep_max); + PN(se.block_max); + PN(se.exec_max); + PN(se.slice_max); + PN(se.wait_max); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -249,6 +276,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(policy); P(prio); #undef P +#undef PN { u64 t0, t1; -- cgit v1.1 From c86da3a3d40f6e7a032edfaea191fb51e9626c8f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: fix formatting of /proc/sched_debug fix formatting of /proc/sched_debug Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e2c1e0d..4eaaf96 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -67,7 +67,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.sum_sleep_runtime)); @@ -83,10 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key switches prio" - " exec-runtime sum-exec sum-sleep\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" "------------------------------------------------------" - "------------------------------------------------"); + "----------------------------------------------------\n"); read_lock_irq(&tasklist_lock); -- cgit v1.1 From edcb60a309769a5f6e7c9e76d7c98b34d1757448 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: kernel/sched_fair.c whitespace cleanups some trivial whitespace cleanups. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 568e922..9f93a5c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -476,8 +476,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup, int set_curr) { /* - * In case of the 'current'. - */ + * In case of the 'current'. + */ if (unlikely(set_curr)) { update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; @@ -992,9 +992,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) if (sysctl_sched_child_runs_first && curr->vruntime < se->vruntime) { /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. + */ swap(curr->vruntime, se->vruntime); } -- cgit v1.1 From 83b699ed20f5218580a1b7042064082e2e05f8c5 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: revert recent removal of set_curr_task() Revert removal of set_curr_task. Use put_prev_task/set_curr_task when changing groups/policies Signed-off-by: Srivatsa Vaddagiri < vatsa@linux.vnet.ibm.com> Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 1 + kernel/sched.c | 34 +++++++++++++++++++------ kernel/sched_fair.c | 68 +++++++++++++++++++++++-------------------------- kernel/sched_idletask.c | 5 ++++ kernel/sched_rt.c | 8 ++++++ 5 files changed, 72 insertions(+), 44 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index abcb027..6616900 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -871,6 +871,7 @@ struct sched_class { struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio); + void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); }; diff --git a/kernel/sched.c b/kernel/sched.c index 72c936d..ee7ac71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3916,7 +3916,7 @@ EXPORT_SYMBOL(sleep_on_timeout); void rt_mutex_setprio(struct task_struct *p, int prio) { unsigned long flags; - int oldprio, on_rq; + int oldprio, on_rq, running; struct rq *rq; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3926,8 +3926,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; - if (on_rq) + running = task_running(rq, p); + if (on_rq) { dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3937,13 +3941,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -4149,7 +4155,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq; + 
int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; struct rq *rq; @@ -4231,20 +4237,26 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - if (on_rq) + running = task_running(rq, p); + if (on_rq) { deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -6845,13 +6857,19 @@ static void sched_move_task(struct container_subsys *ss, struct container *cont, running = task_running(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + } set_task_cfs_rq(tsk); - if (on_rq) + if (on_rq) { + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); enqueue_task(rq, tsk, 0); + } done: task_rq_unlock(rq, &flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9f93a5c..92563cd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -472,20 +472,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int wakeup, int set_curr) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* - * In case of the 'current'. - */ - if (unlikely(set_curr)) { - update_stats_curr_start(cfs_rq, se); - cfs_rq->curr = se; - account_entity_enqueue(cfs_rq, se); - return; - } - - /* * Update the fair clock. */ update_curr(cfs_rq); @@ -496,7 +485,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } update_stats_enqueue(cfs_rq, se); - __enqueue_entity(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); } @@ -516,12 +506,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) } } #endif - if (likely(se != cfs_rq->curr)) + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); - else { - update_stats_curr_end(cfs_rq, se); - cfs_rq->curr = NULL; - } account_entity_dequeue(cfs_rq, se); } @@ -539,15 +525,20 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) resched_task(rq_of(cfs_rq)->curr); } -static inline void +static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. - */ - update_stats_wait_end(cfs_rq, se); + /* 'current' is not kept within the tree. */ + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + } + update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; #ifdef CONFIG_SCHEDSTATS @@ -568,10 +559,6 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); - /* 'current' is not kept within the tree. 
*/ - if (se) - __dequeue_entity(cfs_rq, se); - set_next_entity(cfs_rq, se); return se; @@ -703,17 +690,12 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int set_curr = 0; - - /* Are we enqueuing the current task? */ - if (unlikely(task_running(rq, p))) - set_curr = 1; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup, set_curr); + enqueue_entity(cfs_rq, se, wakeup); } } @@ -761,7 +743,7 @@ static void yield_task_fair(struct rq *rq) * position within the tree: */ dequeue_entity(cfs_rq, se, 0); - enqueue_entity(cfs_rq, se, 0, 1); + enqueue_entity(cfs_rq, se, 0); return; } @@ -1004,6 +986,19 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +/* Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_curr_task_fair(struct rq *rq) +{ + struct sched_entity *se = &rq->curr->se; + + for_each_sched_entity(se) + set_next_entity(cfs_rq_of(se), se); +} + /* * All the scheduling class methods: */ @@ -1019,6 +1014,7 @@ struct sched_class fair_sched_class __read_mostly = { .load_balance = load_balance_fair, + .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, }; diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3503fb2..5ebf829 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr) { } +static void set_curr_task_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = { .load_balance = load_balance_idle, + .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3c77c03..e1d5f1c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -218,6 +218,13 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) } } +static void set_curr_task_rt(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock; +} + static struct sched_class rt_sched_class __read_mostly = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -230,5 +237,6 @@ static struct sched_class rt_sched_class __read_mostly = { .load_balance = load_balance_rt, + .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, }; -- cgit v1.1 From 72ea22f8fbc893425faefa60641f45a4cdef2261 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:08 +0200 Subject: sched: fix minor bug in yield - fix a minor bug in yield (seen for CONFIG_FAIR_GROUP_SCHED), group scheduling would skew when yield was called. 
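A toy model of why the top-level queue is the wrong one here (simplified types, not the kernel's): with group scheduling enabled, the yielding task's entity sits on its group's cfs_rq, so requeueing it via &rq->cfs would manipulate a queue the entity is not even on.

#include <stdio.h>

struct toy_cfs_rq {
	const char *name;
};

struct toy_entity {
	struct toy_cfs_rq *cfs_rq;	/* the queue this entity is enqueued on */
};

struct toy_task {
	const char *comm;
	struct toy_entity se;
};

/* analogous to task_cfs_rq(p): the task's own, possibly per-group, queue */
static struct toy_cfs_rq *toy_task_cfs_rq(struct toy_task *p)
{
	return p->se.cfs_rq;
}

int main(void)
{
	struct toy_cfs_rq top_level = { "rq->cfs (top level)" };
	struct toy_cfs_rq group_q   = { "group's cfs_rq" };
	struct toy_task yielder     = { "yielder", { &group_q } };

	printf("&rq->cfs points at:      %s\n", top_level.name);
	printf("task_cfs_rq() points at: %s\n", toy_task_cfs_rq(&yielder)->name);

	return 0;
}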
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 92563cd..d8d2e2f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -725,7 +725,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) */ static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = &rq->cfs; + struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct sched_entity *rightmost, *se = &rq->curr->se; struct rb_node *parent; -- cgit v1.1 From 545f3b18152355acbb8da59873506fcf66c7c60e Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: print nr_running and load in /proc/sched_debug - print nr_running and load information for cfs_rq in /proc/sched_debug Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4eaaf96..3e47e87 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -134,6 +134,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); } static void print_cpu(struct seq_file *m, int cpu) -- cgit v1.1 From 75c28ace9f2b2f403674e045939424a77c95b47c Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: print &rq->cfs stats - Print &rq->cfs statistics as well (useful for group scheduling) Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d8d2e2f..556942c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1024,6 +1024,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; +#ifdef CONFIG_FAIR_GROUP_SCHED + print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); +#endif for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); } -- cgit v1.1 From 9b5b77512dce239fa168183fa71896712232e95a Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: clean up code under CONFIG_FAIR_GROUP_SCHED With the view of supporting user-id based fair scheduling (and not just container-based fair scheduling), this patch renames several functions and makes them independent of whether they are being used for container or user-id based fair scheduling. Also fix a problem reported by KAMEZAWA Hiroyuki (wrt allocating less-sized array for tg->cfs_rq[] and tf->se[]). 
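The allocation fix mentioned last is, judging from the patch below, the usual mismatch between the number of possible CPUs and the highest possible CPU id: the per-group arrays are indexed by CPU id, so sizing them by num_possible_cpus() comes up short when the possible map is sparse. A hypothetical userspace sketch of the pitfall:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 8	/* compile-time maximum, analogous to the kernel constant */

int main(void)
{
	/* a sparse "possible" map: three CPUs, but ids reach up to 6 */
	int possible_cpus[] = { 0, 3, 6 };
	int num_possible = sizeof(possible_cpus) / sizeof(possible_cpus[0]);
	int i;

	/* sized by the count of possible CPUs: only 3 slots */
	int *by_count = calloc(num_possible, sizeof(int));
	/* sized by NR_CPUS: safe to index with any possible CPU id */
	int *by_id = calloc(NR_CPUS, sizeof(int));

	for (i = 0; i < num_possible; i++) {
		int cpu = possible_cpus[i];

		by_id[cpu] = 1;			/* always in bounds */
		if (cpu >= num_possible)
			printf("cpu %d would overflow the %d-slot array\n",
			       cpu, num_possible);
		else
			by_count[cpu] = 1;
	}

	free(by_count);
	free(by_id);
	return 0;
}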
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 12 ++++ init/Kconfig | 11 ++-- kernel/sched.c | 172 ++++++++++++++++++-------------------------------- kernel/sched_fair.c | 5 +- 4 files changed, 83 insertions(+), 117 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6616900..03c13b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -136,6 +136,7 @@ extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; +struct task_grp; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); @@ -1834,6 +1835,17 @@ extern int sched_mc_power_savings, sched_smt_power_savings; extern void normalize_rt_tasks(void); +#ifdef CONFIG_FAIR_GROUP_SCHED + +extern struct task_grp init_task_grp; + +extern struct task_grp *sched_create_group(void); +extern void sched_destroy_group(struct task_grp *tg); +extern void sched_move_task(struct task_struct *tsk); +extern int sched_group_set_shares(struct task_grp *tg, unsigned long shares); + +#endif + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/init/Kconfig b/init/Kconfig index 11c6762..ef90a15 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -282,13 +282,12 @@ config CPUSETS Say N if unsure. config FAIR_GROUP_SCHED - bool "Fair group scheduler" - depends on EXPERIMENTAL && CONTAINERS + bool "Fair group cpu scheduler" + default n + depends on EXPERIMENTAL help - This option enables you to group tasks and control CPU resource - allocation to such groups. - - Say N if unsure. + This feature lets cpu scheduler recognize task groups and control cpu + bandwidth allocation to such task groups. config SYSFS_DEPRECATED bool "Create deprecated sysfs files" diff --git a/kernel/sched.c b/kernel/sched.c index ee7ac71..e10c403 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -173,13 +173,10 @@ struct rt_prio_array { #ifdef CONFIG_FAIR_GROUP_SCHED -#include - struct cfs_rq; /* task group related information */ struct task_grp { - struct container_subsys_state css; /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ @@ -192,22 +189,28 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS]; +static struct sched_entity *init_sched_entity_p[NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; /* Default task group. * Every task in system belong to this group at bootup. 
*/ -static struct task_grp init_task_grp = { - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, - }; +struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, + }; + +#define INIT_TASK_GRP_LOAD NICE_0_LOAD +static int init_task_grp_load = INIT_TASK_GRP_LOAD; /* return group to which a task belongs */ static inline struct task_grp *task_grp(struct task_struct *p) { - return container_of(task_subsys_state(p, cpu_subsys_id), - struct task_grp, css); + struct task_grp *tg; + + tg = &init_task_grp; + + return tg; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -250,6 +253,7 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ struct task_grp *tg; /* group that "owns" this runqueue */ + struct rcu_head rcu; #endif }; @@ -6513,11 +6517,12 @@ void __init sched_init(void) init_sched_entity_p[i] = se; se->cfs_rq = &rq->cfs; se->my_q = cfs_rq; - se->load.weight = NICE_0_LOAD; - se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->load.weight = init_task_grp_load; + se->load.inv_weight = + div64_64(1ULL<<32, init_task_grp_load); se->parent = NULL; } - init_task_grp.shares = NICE_0_LOAD; + init_task_grp.shares = init_task_grp_load; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6707,45 +6712,28 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED -/* return corresponding task_grp object of a container */ -static inline struct task_grp *container_tg(struct container *cont) -{ - return container_of(container_subsys_state(cont, cpu_subsys_id), - struct task_grp, css); -} - /* allocate runqueue etc for a new task group */ -static struct container_subsys_state * -sched_create_group(struct container_subsys *ss, struct container *cont) +struct task_grp *sched_create_group(void) { struct task_grp *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; + struct rq *rq; int i; - if (!cont->parent) { - /* This is early initialization for the top container */ - init_task_grp.css.container = cont; - return &init_task_grp.css; - } - - /* we support only 1-level deep hierarchical scheduler atm */ - if (cont->parent->parent) - return ERR_PTR(-EINVAL); - tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) return ERR_PTR(-ENOMEM); - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL); + tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; for_each_possible_cpu(i) { - struct rq *rq = cpu_rq(i); + rq = cpu_rq(i); cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); @@ -6763,7 +6751,6 @@ sched_create_group(struct container_subsys *ss, struct container *cont) tg->cfs_rq[i] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[i] = se; se->cfs_rq = &rq->cfs; @@ -6773,12 +6760,15 @@ sched_create_group(struct container_subsys *ss, struct container *cont) se->parent = NULL; } - tg->shares = NICE_0_LOAD; + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } - /* Bind the container to task_grp object we just created */ - tg->css.container = cont; + tg->shares = NICE_0_LOAD; - return &tg->css; + return tg; err: for_each_possible_cpu(i) { @@ -6797,24 +6787,14 @@ err: return ERR_PTR(-ENOMEM); } - -/* destroy 
runqueue etc associated with a task group */ -static void sched_destroy_group(struct container_subsys *ss, - struct container *cont) +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group(struct rcu_head *rhp) { - struct task_grp *tg = container_tg(cont); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); + struct task_grp *tg = cfs_rq->tg; struct sched_entity *se; int i; - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - } - - /* wait for possible concurrent references to cfs_rqs complete */ - synchronize_sched(); - /* now it should be safe to free those cfs_rqs */ for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; @@ -6829,19 +6809,29 @@ static void sched_destroy_group(struct container_subsys *ss, kfree(tg); } -static int sched_can_attach(struct container_subsys *ss, - struct container *cont, struct task_struct *tsk) +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_grp *tg) { - /* We don't support RT-tasks being in separate groups */ - if (tsk->sched_class != &fair_sched_class) - return -EINVAL; + struct cfs_rq *cfs_rq; + int i; - return 0; + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + cfs_rq = tg->cfs_rq[0]; + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&cfs_rq->rcu, free_sched_group); } -/* change task's runqueue when it moves between groups */ -static void sched_move_task(struct container_subsys *ss, struct container *cont, - struct container *old_cont, struct task_struct *tsk) +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. 
+ */ +void sched_move_task(struct task_struct *tsk) { int on_rq, running; unsigned long flags; @@ -6896,58 +6886,20 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) spin_unlock_irq(&rq->lock); } -static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype, - struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +int sched_group_set_shares(struct task_grp *tg, unsigned long shares) { int i; - unsigned long shareval; - struct task_grp *tg = container_tg(cont); - char buffer[2*sizeof(unsigned long) + 1]; - - if (nbytes > 2*sizeof(unsigned long)) /* safety check */ - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + if (tg->shares == shares) + return 0; - buffer[nbytes] = 0; /* nul-terminate */ - shareval = simple_strtoul(buffer, NULL, 10); + /* return -EINVAL if the new value is not sane */ - tg->shares = shareval; + tg->shares = shares; for_each_possible_cpu(i) - set_se_shares(tg->se[i], shareval); - - return nbytes; -} - -static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft) -{ - struct task_grp *tg = container_tg(cont); - - return (u64) tg->shares; -} + set_se_shares(tg->se[i], shares); -struct cftype cpuctl_share = { - .name = "shares", - .read_uint = cpu_shares_read_uint, - .write = cpu_shares_write, -}; - -static int sched_populate(struct container_subsys *ss, struct container *cont) -{ - return container_add_file(cont, ss, &cpuctl_share); + return 0; } -struct container_subsys cpu_subsys = { - .name = "cpu", - .create = sched_create_group, - .destroy = sched_destroy_group, - .can_attach = sched_can_attach, - .attach = sched_move_task, - .populate = sched_populate, - .subsys_id = cpu_subsys_id, - .early_init = 1, -}; - -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 556942c..abd65ed 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -877,7 +877,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) if (!cfs_rq->nr_running) return MAX_PRIO; - curr = __pick_next_entity(cfs_rq); + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); + p = task_of(curr); return p->prio; -- cgit v1.1 From 24e377a83220ef05c9b5bec7e01d65eed6609aa6 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: add fair-user scheduler Enable user-id based fair group scheduling. This is useful for anyone who wants to test the group scheduler w/o having to enable CONFIG_CGROUPS. A separate scheduling group (i.e struct task_grp) is automatically created for every new user added to the system. Upon uid change for a task, it is made to move to the corresponding scheduling group. A /proc tunable (/proc/root_user_share) is also provided to tune root user's quota of cpu bandwidth. 
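A hypothetical usage sketch for the new tunable (plain C, minimal error handling; the exact input format accepted by the proc handler is an assumption):

#include <stdio.h>

int main(void)
{
	unsigned long shares = 0;
	FILE *f = fopen("/proc/root_user_share", "r+");

	if (!f) {
		perror("/proc/root_user_share");
		return 1;
	}

	if (fscanf(f, "%lu", &shares) == 1)
		printf("current root user share: %lu\n", shares);

	/* e.g. double the root user's share of cpu bandwidth */
	rewind(f);
	fprintf(f, "%lu\n", shares * 2);

	fclose(f);
	return 0;
}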
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 4 ++++ init/Kconfig | 13 +++++++++++++ kernel/sched.c | 9 +++++++++ kernel/sched_debug.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/user.c | 43 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 03c13b6..d0cc583 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -597,6 +597,10 @@ struct user_struct { /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + +#ifdef CONFIG_FAIR_USER_SCHED + struct task_grp *tg; +#endif }; extern struct user_struct *find_user(uid_t); diff --git a/init/Kconfig b/init/Kconfig index ef90a15..37711fe 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -289,6 +289,19 @@ config FAIR_GROUP_SCHED This feature lets cpu scheduler recognize task groups and control cpu bandwidth allocation to such task groups. +choice + depends on FAIR_GROUP_SCHED + prompt "Basis for grouping tasks" + default FAIR_USER_SCHED + + config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping + tasks, thus providing equal cpu bandwidth to each user. + +endchoice + config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y diff --git a/kernel/sched.c b/kernel/sched.c index e10c403..f33608e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -200,7 +200,12 @@ struct task_grp init_task_grp = { .cfs_rq = init_cfs_rq_p, }; +#ifdef CONFIG_FAIR_USER_SCHED +#define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +#else #define INIT_TASK_GRP_LOAD NICE_0_LOAD +#endif + static int init_task_grp_load = INIT_TASK_GRP_LOAD; /* return group to which a task belongs */ @@ -208,7 +213,11 @@ static inline struct task_grp *task_grp(struct task_struct *p) { struct task_grp *tg; +#ifdef CONFIG_FAIR_USER_SCHED + tg = p->user->tg; +#else tg = &init_task_grp; +#endif return tg; } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3e47e87..57ee9d5 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -212,6 +212,49 @@ static void sysrq_sched_debug_show(void) sched_debug_show(NULL, NULL); } +#ifdef CONFIG_FAIR_USER_SCHED + +static DEFINE_MUTEX(root_user_share_mutex); + +static int +root_user_share_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len; + + len = sprintf(page, "%d\n", init_task_grp_load); + + return len; +} + +static int +root_user_share_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned long shares; + char kbuf[sizeof(unsigned long)+1]; + int rc = 0; + + if (copy_from_user(kbuf, buffer, sizeof(kbuf))) + return -EFAULT; + + shares = simple_strtoul(kbuf, NULL, 0); + + if (!shares) + shares = NICE_0_LOAD; + + mutex_lock(&root_user_share_mutex); + + init_task_grp_load = shares; + rc = sched_group_set_shares(&init_task_grp, shares); + + mutex_unlock(&root_user_share_mutex); + + return (rc < 0 ? 
rc : count); +} + +#endif /* CONFIG_FAIR_USER_SCHED */ + static int sched_debug_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_debug_show, NULL); @@ -234,6 +277,15 @@ static int __init init_sched_debug_procfs(void) pe->proc_fops = &sched_debug_fops; +#ifdef CONFIG_FAIR_USER_SCHED + pe = create_proc_entry("root_user_share", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->read_proc = root_user_share_read_proc; + pe->write_proc = root_user_share_write_proc; +#endif + return 0; } diff --git a/kernel/user.c b/kernel/user.c index 9ca2848..c6387fa 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -50,8 +50,41 @@ struct user_struct root_user = { .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif +#ifdef CONFIG_FAIR_USER_SCHED + .tg = &init_task_grp, +#endif }; +#ifdef CONFIG_FAIR_USER_SCHED +static void sched_destroy_user(struct user_struct *up) +{ + sched_destroy_group(up->tg); +} + +static int sched_create_user(struct user_struct *up) +{ + int rc = 0; + + up->tg = sched_create_group(); + if (IS_ERR(up->tg)) + rc = -ENOMEM; + + return rc; +} + +static void sched_switch_user(struct task_struct *p) +{ + sched_move_task(p); +} + +#else /* CONFIG_FAIR_USER_SCHED */ + +static void sched_destroy_user(struct user_struct *up) { } +static int sched_create_user(struct user_struct *up) { return 0; } +static void sched_switch_user(struct task_struct *p) { } + +#endif /* CONFIG_FAIR_USER_SCHED */ + /* * These routines must be called with the uidhash spinlock held! */ @@ -109,6 +142,7 @@ void free_uid(struct user_struct *up) if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); @@ -150,6 +184,13 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) return NULL; } + if (sched_create_user(new) < 0) { + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + return NULL; + } + /* * Before adding this, check whether we raced * on adding the same user already.. @@ -157,6 +198,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + sched_destroy_user(new); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -184,6 +226,7 @@ void switch_uid(struct user_struct *new_user) atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + sched_switch_user(current); /* * We need to synchronize with __sigqueue_alloc() -- cgit v1.1 From 7ed2be459b61c66fcc4926ffb073a25fc077d51f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: fair-group sched, cleanups fair-group sched, cleanups. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- init/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 37711fe..b680733 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -294,10 +294,10 @@ choice prompt "Basis for grouping tasks" default FAIR_USER_SCHED - config FAIR_USER_SCHED - bool "user id" - help - This option will choose userid as the basis for grouping + config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping tasks, thus providing equal cpu bandwidth to each user. 
endchoice -- cgit v1.1 From de8d585a12aef40676f12ddc63e97daaf7752ba1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:09 +0200 Subject: sched: enable CONFIG_FAIR_GROUP_SCHED=y by default enable CONFIG_FAIR_GROUP_SCHED=y by default. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index b680733..faed9a0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -283,7 +283,7 @@ config CPUSETS config FAIR_GROUP_SCHED bool "Fair group cpu scheduler" - default n + default y depends on EXPERIMENTAL help This feature lets cpu scheduler recognize task groups and control cpu -- cgit v1.1 From b8efb56172bc55082b8490778b07ef73eea0b551 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: BKL usage statistics add per task and per rq BKL usage statistics. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 4 ++++ kernel/sched.c | 9 +++++++++ kernel/sched_debug.c | 4 ++++ 3 files changed, 17 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index d0cc583..920eb73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -621,6 +621,10 @@ struct sched_info { /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ +#ifdef CONFIG_SCHEDSTATS + /* BKL stats */ + unsigned long bkl_cnt; +#endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ diff --git a/kernel/sched.c b/kernel/sched.c index f33608e..5004dff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -356,6 +356,9 @@ struct rq { /* try_to_wake_up() stats */ unsigned long ttwu_cnt; unsigned long ttwu_local; + + /* BKL stats */ + unsigned long bkl_cnt; #endif struct lock_class_key rq_lock_key; }; @@ -3414,6 +3417,12 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq(), sched_cnt); +#ifdef CONFIG_SCHEDSTATS + if (unlikely(prev->lock_depth >= 0)) { + schedstat_inc(this_rq(), bkl_cnt); + schedstat_inc(prev, sched_info.bkl_cnt); + } +#endif } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 57ee9d5..823b63a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -136,6 +136,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); + SEQ_printf(m, " .%-30s: %ld\n", "bkl_cnt", + rq->bkl_cnt); } static void print_cpu(struct seq_file *m, int cpu) @@ -323,6 +325,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_max); PN(se.slice_max); PN(se.wait_max); + P(sched_info.bkl_cnt); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -350,6 +353,7 @@ void proc_sched_set_task(struct task_struct *p) p->se.exec_max = 0; p->se.slice_max = 0; p->se.wait_max = 0; + p->sched_info.bkl_cnt = 0; #endif p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; -- cgit v1.1 From fdd71d132badad542a9ab99ab4a9c3c08fa6412f Mon Sep 17 00:00:00 2001 From: "S.Caglar Onur" Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: BKL usage statistics, fix build fix for the SCHED_DEBUG && !SCHEDSTATS case. 
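The breakage is the usual one for optional statistics fields: rq->bkl_cnt only exists when CONFIG_SCHEDSTATS is set, so an unconditional reader fails to build with SCHED_DEBUG && !SCHEDSTATS. A minimal sketch of the guard pattern the fix applies (hypothetical, self-contained names; CONFIG_SCHEDSTATS stands in for the kernel config option):

  #include <stdio.h>

  struct toy_rq {
  #ifdef CONFIG_SCHEDSTATS
          unsigned long bkl_cnt;          /* only compiled in with schedstats */
  #endif
          unsigned long nr_running;
  };

  static void print_rq(const struct toy_rq *rq)
  {
  #ifdef CONFIG_SCHEDSTATS
          /* readers must be guarded exactly like the field they touch */
          printf("bkl_cnt:    %lu\n", rq->bkl_cnt);
  #endif
          printf("nr_running: %lu\n", rq->nr_running);
  }

  int main(void)
  {
          struct toy_rq rq = { .nr_running = 3 };

          print_rq(&rq);
          return 0;
  }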
Signed-off-by: S.Ceglar Onur Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 823b63a..b6d0a94 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -136,8 +136,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_SCHEDSTATS SEQ_printf(m, " .%-30s: %ld\n", "bkl_cnt", rq->bkl_cnt); +#endif } static void print_cpu(struct seq_file *m, int cpu) -- cgit v1.1 From c18b8a7cbcbac46497ee1ce656b0e68197c7581d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched: remove unneeded tunables remove unneeded tunables. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 2 -- kernel/sched_fair.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 920eb73..2c33227 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1403,8 +1403,6 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; -extern unsigned int sysctl_sched_stat_granularity; -extern unsigned int sysctl_sched_runtime_limit; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index abd65ed..5db7bd1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -76,8 +76,6 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; */ const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; -unsigned int sysctl_sched_runtime_limit __read_mostly; - extern struct sched_class fair_sched_class; /************************************************************** -- cgit v1.1 From 1aa4731eff7dab7bd01747b46f654f449f1cfc2c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: print settings print the current value of all tunables in /proc/sched_debug output. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index b6d0a94..d79e1ec 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -203,6 +203,19 @@ static int sched_debug_show(struct seq_file *m, void *v) SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_latency); + PN(sysctl_sched_min_granularity); + PN(sysctl_sched_wakeup_granularity); + PN(sysctl_sched_batch_wakeup_granularity); + PN(sysctl_sched_child_runs_first); + P(sysctl_sched_features); +#undef PN +#undef P + for_each_online_cpu(cpu) print_cpu(m, cpu); -- cgit v1.1 From 67e9fb2a39a1d454218d50383094940982be138f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched: add vslice add vslice: the load-dependent "virtual slice" a task should run ideally, so that the observed latency stays within the sched_latency window. 
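To make the arithmetic concrete, here is a small standalone model of the period/vslice relationship (an illustrative aside, not part of the patch; the 20 ms latency target and 2 ms minimum granularity are assumed defaults used only for the example):

  #include <stdio.h>
  #include <stdint.h>

  #define SCHED_LATENCY_NS        20000000ULL    /* latency target, assumed 20 ms */
  #define MIN_GRANULARITY_NS       2000000ULL    /* minimum slice, assumed 2 ms   */

  /* the period grows once there are more tasks than latency/min_granularity,
   * instead of letting individual slices shrink below the granularity */
  static uint64_t sched_period(unsigned long nr_running)
  {
          uint64_t period = SCHED_LATENCY_NS;
          unsigned long nr_latency = SCHED_LATENCY_NS / MIN_GRANULARITY_NS;

          if (nr_running > nr_latency) {
                  period *= nr_running;
                  period /= nr_latency;
          }
          return period;
  }

  /* equal-weight approximation of the vslice: each task's ideal share
   * of one period of virtual time */
  static uint64_t sched_vslice(unsigned long nr_running)
  {
          return sched_period(nr_running) / nr_running;
  }

  int main(void)
  {
          for (unsigned long nr = 1; nr <= 32; nr *= 2)
                  printf("nr_running=%2lu  period=%3llu ms  vslice=%5.2f ms\n",
                         nr,
                         (unsigned long long)(sched_period(nr) / 1000000),
                         sched_vslice(nr) / 1e6);
          return 0;
  }

With these values, up to ten runnable tasks fit inside one latency period and each task's slice shrinks; beyond that the period stretches so that no slice drops below the minimum granularity, which is how observed latency is kept tied to sched_latency.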
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 1 + kernel/sched.c | 2 ++ kernel/sched_fair.c | 44 +++++++++++++++++++++++++++++--------------- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c33227..d74830c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -908,6 +908,7 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; + u64 last_min_vruntime; #ifdef CONFIG_SCHEDSTATS u64 wait_start; diff --git a/kernel/sched.c b/kernel/sched.c index 5004dff..fe1165b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1615,6 +1615,7 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.last_min_vruntime = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -6495,6 +6496,7 @@ static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } void __init sched_init(void) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5db7bd1..87acc5c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -243,6 +243,15 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) return period; } +static u64 __sched_vslice(unsigned long nr_running) +{ + u64 period = __sched_period(nr_running); + + do_div(period, nr_running); + + return period; +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -441,32 +450,33 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - u64 min_runtime, latency; + u64 vruntime; - min_runtime = cfs_rq->min_vruntime; + vruntime = cfs_rq->min_vruntime; if (sched_feat(USE_TREE_AVG)) { struct sched_entity *last = __pick_last_entity(cfs_rq); if (last) { - min_runtime = __pick_next_entity(cfs_rq)->vruntime; - min_runtime += last->vruntime; - min_runtime >>= 1; + vruntime += last->vruntime; + vruntime >>= 1; } - } else if (sched_feat(APPROX_AVG)) - min_runtime += sysctl_sched_latency/2; + } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) + vruntime += __sched_vslice(cfs_rq->nr_running)/2; if (initial && sched_feat(START_DEBIT)) - min_runtime += sched_slice(cfs_rq, se); + vruntime += __sched_vslice(cfs_rq->nr_running + 1); if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { - latency = sysctl_sched_latency; - if (min_runtime > latency) - min_runtime -= latency; + s64 latency = cfs_rq->min_vruntime - se->last_min_vruntime; + if (latency < 0 || !cfs_rq->nr_running) + latency = 0; else - min_runtime = 0; + latency = min_t(s64, latency, sysctl_sched_latency); + vruntime -= latency; } - se->vruntime = max(se->vruntime, min_runtime); + se->vruntime = vruntime; + } static void @@ -478,6 +488,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) update_curr(cfs_rq); if (wakeup) { + /* se->vruntime += cfs_rq->min_vruntime; */ place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -492,8 +503,8 @@ static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { update_stats_dequeue(cfs_rq, se); -#ifdef CONFIG_SCHEDSTATS if (sleep) { +#ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -502,8 +513,11 @@ dequeue_entity(struct 
cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } - } #endif + /* se->vruntime = entity_key(cfs_rq, se); */ + se->last_min_vruntime = cfs_rq->min_vruntime; + } + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); -- cgit v1.1 From d822cecedad88b69a7d68aa8d49e1f238aa320c7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: more width for parameter printouts more width for parameter printouts in /proc/sched_debug. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index d79e1ec..b24f17d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -204,9 +204,9 @@ static int sched_debug_show(struct seq_file *m, void *v) SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); #define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(x)) + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); -- cgit v1.1 From ddc972975091ba5f839bf24d0f9ef54fe90ee741 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:10 +0200 Subject: sched debug: check spread debug feature: check how well we schedule within a reasonable vruntime 'spread' range. (note that CPU overload can increase the spread, so this is not a hard condition, but normal loads should be within the spread.) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 3 +++ kernel/sched_debug.c | 2 ++ kernel/sched_fair.c | 17 +++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index fe1165b..213294f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -250,6 +250,9 @@ struct cfs_rq { * It is set to NULL otherwise (i.e when none are currently running). 
*/ struct sched_entity *curr; + + unsigned long nr_spread_over; + #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index b24f17d..4659c90 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -140,6 +140,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "bkl_cnt", rq->bkl_cnt); #endif + SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + cfs_rq->nr_spread_over); } static void print_cpu(struct seq_file *m, int cpu) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 87acc5c..8ea4c9b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -447,6 +447,19 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) #endif } +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) + d = -d; + + if (d > 3*sysctl_sched_latency) + schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { @@ -494,6 +507,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) } update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); @@ -587,6 +601,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) update_stats_curr_end(cfs_rq, prev); + check_spread(cfs_rq, prev); if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ @@ -996,6 +1011,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) } update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + check_spread(cfs_rq, curr); __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); resched_task(rq->curr); -- cgit v1.1 From 8465e792e82c567b80358e38732164b770ed4b7f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: entity_key() fix entity_key() fix - we'd occasionally end up with a 0 vruntime in the !initial case. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8ea4c9b..926491f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -479,13 +479,16 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (initial && sched_feat(START_DEBIT)) vruntime += __sched_vslice(cfs_rq->nr_running + 1); - if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { - s64 latency = cfs_rq->min_vruntime - se->last_min_vruntime; - if (latency < 0 || !cfs_rq->nr_running) - latency = 0; - else - latency = min_t(s64, latency, sysctl_sched_latency); - vruntime -= latency; + if (!initial) { + if (sched_feat(NEW_FAIR_SLEEPERS)) { + s64 latency = cfs_rq->min_vruntime - se->last_min_vruntime; + if (latency < 0 || !cfs_rq->nr_running) + latency = 0; + else + latency = min_t(s64, latency, sysctl_sched_latency); + vruntime -= latency; + } + vruntime = max(vruntime, se->vruntime); } se->vruntime = vruntime; -- cgit v1.1 From 785c29ef9573d98b31493c9a68c3589449082108 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: remove condition from set_task_cpu() remove condition from set_task_cpu(). 
Now that ->vruntime is not global anymore, it should (and does) work fine without it too. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 213294f..c779bf9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1052,9 +1052,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->se.block_start) p->se.block_start -= clock_offset; #endif - if (likely(new_rq->cfs.min_vruntime)) - p->se.vruntime -= old_rq->cfs.min_vruntime - - new_rq->cfs.min_vruntime; + p->se.vruntime -= old_rq->cfs.min_vruntime - new_rq->cfs.min_vruntime; __set_task_cpu(p, new_cpu); } -- cgit v1.1 From dc1f31c90cfa067af6f7000db7a5383c7667ccba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: remove last_min_vruntime effect remove last_min_vruntime use - prepare to remove it. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 926491f..0228de1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -481,7 +481,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { if (sched_feat(NEW_FAIR_SLEEPERS)) { - s64 latency = cfs_rq->min_vruntime - se->last_min_vruntime; + s64 latency = cfs_rq->min_vruntime - se->vruntime; if (latency < 0 || !cfs_rq->nr_running) latency = 0; else -- cgit v1.1 From 94359f05cb7e1fed0deccc83ebc30a1175a9ae16 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: undo some of the recent changes undo some of the recent changes that are not needed after all, such as last_min_vruntime. 
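The net effect of this stretch of place_entity() changes can be summarized with a toy model (illustrative only; the 20 ms latency value is an assumption and the tree-average feature bits are ignored): on wakeup a sleeper is placed at most one latency period behind min_vruntime, and the final max() makes sure a task is never moved backwards in virtual time. The real code does the comparison as signed, which is what the sign-check fix a little further down addresses.

  #include <stdio.h>
  #include <stdint.h>

  #define SCHED_LATENCY_NS 20000000ULL   /* assumed 20 ms latency target */

  /* simplified stand-in for place_entity() in the !initial (wakeup) case */
  static uint64_t place_on_wakeup(uint64_t min_vruntime, uint64_t old_vruntime)
  {
          uint64_t vruntime = min_vruntime - SCHED_LATENCY_NS; /* sleeper credit */

          /* never move a task backwards in virtual time; the kernel compares
           * as signed (max_t(s64, ...)) to cope with small/wrapped values */
          return (int64_t)(vruntime - old_vruntime) > 0 ? vruntime : old_vruntime;
  }

  int main(void)
  {
          uint64_t min_vr = 100000000ULL;  /* 100 ms of virtual time */

          /* a task that slept for a long time gets the full credit */
          printf("long sleeper : %llu\n",
                 (unsigned long long)place_on_wakeup(min_vr, 10000000ULL));
          /* a task that barely slept keeps its own, larger vruntime */
          printf("short sleeper: %llu\n",
                 (unsigned long long)place_on_wakeup(min_vr, 95000000ULL));
          return 0;
  }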
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 1 - kernel/sched.c | 1 - kernel/sched_fair.c | 13 +++---------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d74830c..2c33227 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -908,7 +908,6 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; - u64 last_min_vruntime; #ifdef CONFIG_SCHEDSTATS u64 wait_start; diff --git a/kernel/sched.c b/kernel/sched.c index c779bf9..744bd50 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1616,7 +1616,6 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.last_min_vruntime = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0228de1..62a9ee8d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -480,14 +480,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += __sched_vslice(cfs_rq->nr_running + 1); if (!initial) { - if (sched_feat(NEW_FAIR_SLEEPERS)) { - s64 latency = cfs_rq->min_vruntime - se->vruntime; - if (latency < 0 || !cfs_rq->nr_running) - latency = 0; - else - latency = min_t(s64, latency, sysctl_sched_latency); - vruntime -= latency; - } + if (sched_feat(NEW_FAIR_SLEEPERS)) + vruntime -= sysctl_sched_latency; + vruntime = max(vruntime, se->vruntime); } @@ -531,8 +526,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) se->block_start = rq_of(cfs_rq)->clock; } #endif - /* se->vruntime = entity_key(cfs_rq, se); */ - se->last_min_vruntime = cfs_rq->min_vruntime; } if (se != cfs_rq->curr) -- cgit v1.1 From b8487b924177385e3932f846f430b73ce8e69bba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: fix sign check error in place_entity() fix sign check error in place_entity() - we'd get excessive latencies due to negatives being converted to large u64's. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 62a9ee8d..2bd9625 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -483,7 +483,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (sched_feat(NEW_FAIR_SLEEPERS)) vruntime -= sysctl_sched_latency; - vruntime = max(vruntime, se->vruntime); + vruntime = max_t(s64, vruntime, se->vruntime); } se->vruntime = vruntime; -- cgit v1.1 From 02e4bac2a5b097e23d757bf2953740b3d51b7976 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: fix sched_fork() fix sched_fork(): large latencies at new task creation time because the ->vruntime was not fixed up cross-CPU, if the parent got migrated after the child's CPU got set up. 
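The reason switching from __set_task_cpu() to set_task_cpu() matters is the vruntime re-basing the latter performs: a vruntime is only meaningful relative to its own runqueue's min_vruntime, so on migration the task's offset is preserved rather than its raw value. A self-contained toy model of that adjustment (illustrative names only):

  #include <stdio.h>
  #include <stdint.h>

  struct toy_rq { uint64_t min_vruntime; };
  struct toy_se { uint64_t vruntime; };

  /* mirror of the re-basing done when a task changes runqueues:
   * keep the offset from min_vruntime constant, not the raw value */
  static void migrate(struct toy_se *se, const struct toy_rq *from,
                      const struct toy_rq *to)
  {
          se->vruntime -= from->min_vruntime - to->min_vruntime;
  }

  int main(void)
  {
          struct toy_rq cpu0  = { .min_vruntime = 1000000000ULL };
          struct toy_rq cpu1  = { .min_vruntime =   40000000ULL };
          struct toy_se child = { .vruntime     = 1002000000ULL }; /* 2 ms ahead on cpu0 */

          migrate(&child, &cpu0, &cpu1);
          printf("vruntime on cpu1: %llu (still 2 ms ahead of min_vruntime)\n",
                 (unsigned long long)child.vruntime);
          return 0;
  }

Without this fixup the child keeps a vruntime expressed against the parent's old runqueue and can start far behind or ahead of its new queue's clock, which is exactly the large-latency symptom being fixed.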
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 744bd50..36484da 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1657,7 +1657,7 @@ void sched_fork(struct task_struct *p, int clone_flags) #ifdef CONFIG_SMP cpu = sched_balance_self(cpu, SD_BALANCE_FORK); #endif - __set_task_cpu(p, cpu); + set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: -- cgit v1.1 From 368059a977871def0f88a92eefb6ecc1f7b6132f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: max_vruntime() simplification max_vruntime() simplification. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2bd9625..91664d6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -117,8 +117,8 @@ static inline struct task_struct *task_of(struct sched_entity *se) static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) { - if ((vruntime > min_vruntime) || - (min_vruntime > (1ULL << 61) && vruntime < (1ULL << 50))) + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) min_vruntime = vruntime; return min_vruntime; -- cgit v1.1 From 2ddbf952508fb9911036c484a87f6351106b917c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: clean up sched_fork() The adjusting sched_class is a missing part of the already existing "do not leak PI boosting priority to the child" at the sched_fork(). This patch moves the adjusting sched_class from wake_up_new_task() to sched_fork(). this also shrinks the code a bit: text data bss dec hex filename 40111 4018 292 44421 ad85 sched.o.before 40102 4018 292 44412 ad7c sched.o.after Signed-off-by: Hiroshi Shimamoto Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 36484da..cd2b494 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1663,6 +1663,8 @@ void sched_fork(struct task_struct *p, int clone_flags) * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; + if (!rt_prio(p->prio)) + p->sched_class = &fair_sched_class; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -1698,11 +1700,6 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->prio = effective_prio(p); - if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - if (task_cpu(p) != this_cpu || !p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); -- cgit v1.1 From 57cb499df26d80ec11cd49e56d20835334ac4ab9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:11 +0200 Subject: sched: remove set_leftmost() Lee Schermerhorn noticed that set_leftmost() contains dead code, remove this. 
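One more aside on the max_vruntime() simplification a few patches up: the signed-delta form is not just shorter, it is also wraparound-safe. Casting the difference of two u64 counters to s64 yields the right ordering even after the counters wrap, as long as the two values stay within 2^63 of each other (standalone demo with illustrative values):

  #include <stdio.h>
  #include <stdint.h>

  /* same shape as the simplified kernel helper */
  static uint64_t max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
  {
          int64_t delta = (int64_t)(vruntime - min_vruntime);

          if (delta > 0)
                  min_vruntime = vruntime;
          return min_vruntime;
  }

  int main(void)
  {
          uint64_t before_wrap = UINT64_MAX - 100;   /* just below the wrap */
          uint64_t after_wrap  = 50;                 /* just past the wrap  */

          /* a plain '>' would pick before_wrap; the signed delta picks the
           * logically newer value on the other side of the wrap */
          printf("winner is after_wrap: %s\n",
                 max_vruntime(before_wrap, after_wrap) == after_wrap ? "yes" : "no");
          return 0;
  }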
Reported-by: Lee Schermerhorn Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 91664d6..48c6921 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -124,16 +124,6 @@ max_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline void -set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) -{ - struct sched_entity *se; - - cfs_rq->rb_leftmost = leftmost; - if (leftmost) - se = rb_entry(leftmost, struct sched_entity, run_node); -} - static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -175,7 +165,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * used): */ if (leftmost) - set_leftmost(cfs_rq, &se->run_node); + cfs_rq->rb_leftmost = &se->run_node; rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -185,7 +175,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) - set_leftmost(cfs_rq, rb_next(&se->run_node)); + cfs_rq->rb_leftmost = rb_next(&se->run_node); rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -- cgit v1.1 From 8651a86c342ab79a956afec0c5971acaad38d3a1 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler wakeup latency fix group scheduler wakeup latency fix: when checking for preemption we must check cross-group too, not just intra-group. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 48c6921..5384a97 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -796,7 +796,8 @@ static void yield_task_fair(struct rq *rq) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct cfs_rq *cfs_rq = task_cfs_rq(curr), *pcfs_rq; + struct sched_entity *se = &curr->se, *pse = &p->se; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -804,11 +805,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } - if (is_same_group(curr, p)) { - s64 delta = curr->se.vruntime - p->se.vruntime; - if (delta > (s64)sysctl_sched_wakeup_granularity) - resched_task(curr); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + pcfs_rq = cfs_rq_of(pse); + + if (cfs_rq == pcfs_rq) { + s64 delta = se->vruntime - pse->vruntime; + + if (delta > (s64)sysctl_sched_wakeup_granularity) + resched_task(curr); + break; + } +#ifdef CONFIG_FAIR_GROUP_SCHED + pse = pse->parent; +#endif } } -- cgit v1.1 From 2b1e315dd2822c99793485f9e53a73459fb399c1 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: yield fix fix yield bugs due to the current-not-in-rbtree changes: the task is not in the rbtree so rbtree-removal is a no-no. [ From: Srivatsa Vaddagiri : build fix. 
] also, nice code size reduction: kernel/sched.o: text data bss dec hex filename 38323 3506 24 41853 a37d sched.o.before 38236 3506 24 41766 a326 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Dmitry Adamushko Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5384a97..fcd6900 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -739,9 +739,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) static void yield_task_fair(struct rq *rq) { struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct sched_entity *rightmost, *se = &rq->curr->se; - struct rb_node *parent; /* * Are we the only task in the tree? @@ -755,39 +753,26 @@ static void yield_task_fair(struct rq *rq) * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, se, 0); - enqueue_entity(cfs_rq, se, 0); + update_curr(cfs_rq); return; } /* * Find the rightmost entry in the rbtree: */ - do { - parent = *link; - link = &parent->rb_right; - } while (*link); - - rightmost = rb_entry(parent, struct sched_entity, run_node); + rightmost = __pick_last_entity(cfs_rq); /* * Already in the rightmost position? */ - if (unlikely(rightmost == se)) + if (unlikely(rightmost->vruntime < se->vruntime)) return; /* * Minimally necessary key value to be last in the tree: + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. */ se->vruntime = rightmost->vruntime + 1; - - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); - /* - * Relink the task to the rightmost position: - */ - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } /* -- cgit v1.1 From 2d72376b3af1e7d4d4515ebfd0f4383f2e92c343 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: clean up schedstats, cnt -> count rename all 'cnt' fields and variables to the less yucky 'count' name. yuckage noticed by Andrew Morton. 
no change in code, other than the /proc/sched_debug bkl_count string got a bit larger: text data bss dec hex filename 38236 3506 24 41766 a326 sched.o.before 38240 3506 24 41770 a32a sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- fs/proc/base.c | 2 +- include/linux/sched.h | 12 ++++++------ kernel/delayacct.c | 2 +- kernel/sched.c | 24 ++++++++++++------------ kernel/sched_debug.c | 8 ++++---- kernel/sched_stats.h | 24 ++++++++++++------------ 6 files changed, 36 insertions(+), 36 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 19489b0..e5d0953 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) return sprintf(buffer, "%llu %llu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, - task->sched_info.pcnt); + task->sched_info.pcount); } #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c33227..d5daca4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -614,7 +614,7 @@ struct reclaim_state; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ - unsigned long pcnt; /* # of times run on this cpu */ + unsigned long pcount; /* # of times run on this cpu */ unsigned long long cpu_time, /* time spent on the cpu */ run_delay; /* time spent waiting on a runqueue */ @@ -623,7 +623,7 @@ struct sched_info { last_queued; /* when we were last queued to run */ #ifdef CONFIG_SCHEDSTATS /* BKL stats */ - unsigned long bkl_cnt; + unsigned long bkl_count; #endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -759,7 +759,7 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; + unsigned long lb_count[CPU_MAX_IDLE_TYPES]; unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; @@ -769,17 +769,17 @@ struct sched_domain { unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; /* Active load balancing */ - unsigned long alb_cnt; + unsigned long alb_count; unsigned long alb_failed; unsigned long alb_pushed; /* SD_BALANCE_EXEC stats */ - unsigned long sbe_cnt; + unsigned long sbe_count; unsigned long sbe_balanced; unsigned long sbe_pushed; /* SD_BALANCE_FORK stats */ - unsigned long sbf_cnt; + unsigned long sbf_count; unsigned long sbf_balanced; unsigned long sbf_pushed; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 81e6978..09e9574 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) * No locking available for sched_info (and too expensive to add one) * Mitigate by taking snapshot of values */ - t1 = tsk->sched_info.pcnt; + t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; t3 = tsk->sched_info.cpu_time; diff --git a/kernel/sched.c b/kernel/sched.c index cd2b494..ba9fa6c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -349,19 +349,19 @@ struct rq { unsigned long yld_exp_empty; unsigned long yld_act_empty; unsigned long yld_both_empty; - unsigned long yld_cnt; + unsigned long yld_count; /* schedule() stats */ unsigned long sched_switch; - unsigned long sched_cnt; + unsigned long sched_count; unsigned long sched_goidle; /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; + unsigned long ttwu_count; unsigned long ttwu_local; /* BKL stats */ - unsigned long bkl_cnt; + unsigned long bkl_count; 
#endif struct lock_class_key rq_lock_key; }; @@ -1481,7 +1481,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) new_cpu = cpu; - schedstat_inc(rq, ttwu_cnt); + schedstat_inc(rq, ttwu_count); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); goto out_set_cpu; @@ -2637,7 +2637,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[idle]); + schedstat_inc(sd, lb_count[idle]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, @@ -2790,7 +2790,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, &cpus, NULL); @@ -2924,7 +2924,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) } if (likely(sd)) { - schedstat_inc(sd, alb_cnt); + schedstat_inc(sd, alb_count); if (move_one_task(target_rq, target_cpu, busiest_rq, sd, CPU_IDLE)) @@ -3414,11 +3414,11 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_cnt); + schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_cnt); - schedstat_inc(prev, sched_info.bkl_cnt); + schedstat_inc(this_rq(), bkl_count); + schedstat_inc(prev, sched_info.bkl_count); } #endif } @@ -4558,7 +4558,7 @@ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_cnt); + schedstat_inc(rq, yld_count); current->sched_class->yield_task(rq); /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4659c90..be79cd6 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -137,8 +137,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, " .%-30s: %ld\n", "bkl_cnt", - rq->bkl_cnt); + SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", + rq->bkl_count); #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); @@ -342,7 +342,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_max); PN(se.slice_max); PN(se.wait_max); - P(sched_info.bkl_cnt); + P(sched_info.bkl_count); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -370,7 +370,7 @@ void proc_sched_set_task(struct task_struct *p) p->se.exec_max = 0; p->se.slice_max = 0; p->se.wait_max = 0; - p->sched_info.bkl_cnt = 0; + p->sched_info.bkl_count = 0; #endif p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 1d9ec98..1c08484 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v) struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_SMP struct sched_domain *sd; - int dcnt = 0; + int dcount = 0; #endif /* runqueue-specific stats */ seq_printf(seq, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, 
rq->ttwu_local, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@ -39,12 +39,12 @@ static int show_schedstat(struct seq_file *seq, void *v) char mask_str[NR_CPUS]; cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); + seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " "%lu", - sd->lb_cnt[itype], + sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], sd->lb_imbalance[itype], @@ -55,9 +55,9 @@ static int show_schedstat(struct seq_file *seq, void *v) } seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } @@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta) { if (rq) { rq->rq_sched_info.run_delay += delta; - rq->rq_sched_info.pcnt++; + rq->rq_sched_info.pcount++; } } @@ -164,7 +164,7 @@ static void sched_info_arrive(struct task_struct *t) sched_info_dequeued(t); t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; - t->sched_info.pcnt++; + t->sched_info.pcount++; rq_sched_info_arrive(task_rq(t), delta); } -- cgit v1.1 From 2830cf8c90f37526d401f1999250312df970bfa3 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler SMP migration fix group scheduler SMP migration fix: use task_cfs_rq(p) to get to the relevant fair-scheduling runqueue of a task, rq->cfs is not the right one. Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index ba9fa6c..e1657e0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1040,6 +1040,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + struct cfs_rq *old_cfsrq = task_cfs_rq(p), + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); u64 clock_offset; clock_offset = old_rq->clock - new_rq->clock; @@ -1052,7 +1054,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->se.block_start) p->se.block_start -= clock_offset; #endif - p->se.vruntime -= old_rq->cfs.min_vruntime - new_rq->cfs.min_vruntime; + p->se.vruntime -= old_cfsrq->min_vruntime - + new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } -- cgit v1.1 From b0ffd246ea947a037746e725bd461bb7e809a4b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: clean up min_vruntime use clean up min_vruntime use. 
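What the cleaned-up code computes can be shown with a few illustrative numbers (toy model, not the kernel code): cfs_rq->min_vruntime follows the smaller of the current task's vruntime and the leftmost queued vruntime, but the outer max_vruntime() only ever lets it move forward, so it remains a monotonic per-queue clock even when a freshly placed sleeper is queued behind it.

  #include <stdio.h>
  #include <stdint.h>

  static uint64_t max_vr(uint64_t cur, uint64_t v)
  {
          return (int64_t)(v - cur) > 0 ? v : cur;
  }

  static uint64_t min_vr(uint64_t cur, uint64_t v)
  {
          return (int64_t)(v - cur) < 0 ? v : cur;
  }

  int main(void)
  {
          uint64_t min_vruntime = 0;
          uint64_t curr[]     = { 100, 180, 260 };
          uint64_t leftmost[] = { 150, 160, 140 };  /* 140: sleeper placed behind */

          for (int i = 0; i < 3; i++) {
                  min_vruntime = max_vr(min_vruntime,
                                        min_vr(curr[i], leftmost[i]));
                  printf("update %d: min_vruntime = %llu\n",
                         i, (unsigned long long)min_vruntime);
          }
          return 0;
  }

In the last step the sleeper's 140 does not drag min_vruntime backwards; it stays at 160.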
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fcd6900..ec0569e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -124,6 +124,16 @@ max_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } +static inline u64 +min_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta < 0) + min_vruntime = vruntime; + + return min_vruntime; +} + static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -251,7 +261,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; - u64 next_vruntime, min_vruntime; + u64 vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -269,19 +279,13 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, * value tracking the leftmost vruntime in the tree. */ if (first_fair(cfs_rq)) { - next_vruntime = __pick_next_entity(cfs_rq)->vruntime; - - /* min_vruntime() := !max_vruntime() */ - min_vruntime = max_vruntime(curr->vruntime, next_vruntime); - if (min_vruntime == next_vruntime) - min_vruntime = curr->vruntime; - else - min_vruntime = next_vruntime; + vruntime = min_vruntime(curr->vruntime, + __pick_next_entity(cfs_rq)->vruntime); } else - min_vruntime = curr->vruntime; + vruntime = curr->vruntime; cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, min_vruntime); + max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) -- cgit v1.1 From 5f6d858ecca78f71755859a346d845e302973cd1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: speed up and simplify vslice calculations speed up and simplify vslice calculations. 
[ From: Mike Galbraith : build fix ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched_debug.c | 2 +- kernel/sched_fair.c | 15 +++++++++------ kernel/sysctl.c | 9 +++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d5daca4..97f736b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,7 +1400,7 @@ extern void sched_idle_next(void); #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_nr_latency; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index be79cd6..995bbd3 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -210,7 +210,7 @@ static int sched_debug_show(struct seq_file *m, void *v) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); - PN(sysctl_sched_min_granularity); + PN(sysctl_sched_nr_latency); PN(sysctl_sched_wakeup_granularity); PN(sysctl_sched_batch_wakeup_granularity); PN(sysctl_sched_child_runs_first); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec0569e..ae2d4b0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -46,7 +46,7 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +const_debug unsigned int sysctl_sched_nr_latency = 20; /* * sys_sched_yield() compat mode @@ -222,8 +222,7 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) static u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; - unsigned long nr_latency = - sysctl_sched_latency / sysctl_sched_min_granularity; + unsigned long nr_latency = sysctl_sched_nr_latency; if (unlikely(nr_running > nr_latency)) { period *= nr_running; @@ -245,11 +244,15 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) static u64 __sched_vslice(unsigned long nr_running) { - u64 period = __sched_period(nr_running); + unsigned long period = sysctl_sched_latency; + unsigned long nr_latency = sysctl_sched_nr_latency; - do_div(period, nr_running); + if (unlikely(nr_running > nr_latency)) + nr_running = nr_latency; - return period; + period /= nr_running; + + return (u64)period; } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97b15c2..230ca4e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -222,14 +222,11 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, + .procname = "sched_nr_latency", + .data = &sysctl_sched_nr_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, + .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, -- cgit v1.1 From b39c5dd7f938775fd0a1df5b4b1c26f854d15231 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: cleanup, remove stale comment cleanup, remove stale comment. 
Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ae2d4b0..c44a295 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -496,7 +496,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) update_curr(cfs_rq); if (wakeup) { - /* se->vruntime += cfs_rq->min_vruntime; */ place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } -- cgit v1.1 From fb615581c78efee25e4d04f1145e8fa8ec705dc3 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler, fix coding style issues Fix coding style issues reported by Randy Dunlap and others Signed-off-by: Dhaval Giani Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- init/Kconfig | 14 +++++++------- kernel/sched_debug.c | 8 ++------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index faed9a0..54f31a1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -282,11 +282,11 @@ config CPUSETS Say N if unsure. config FAIR_GROUP_SCHED - bool "Fair group cpu scheduler" + bool "Fair group CPU scheduler" default y depends on EXPERIMENTAL help - This feature lets cpu scheduler recognize task groups and control cpu + This feature lets CPU scheduler recognize task groups and control CPU bandwidth allocation to such task groups. choice @@ -294,11 +294,11 @@ choice prompt "Basis for grouping tasks" default FAIR_USER_SCHED - config FAIR_USER_SCHED - bool "user id" - help - This option will choose userid as the basis for grouping - tasks, thus providing equal cpu bandwidth to each user. +config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping + tasks, thus providing equal CPU bandwidth to each user. endchoice diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 995bbd3..48748d0 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -239,11 +239,7 @@ static int root_user_share_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len; - - len = sprintf(page, "%d\n", init_task_grp_load); - - return len; + return sprintf(page, "%d\n", init_task_grp_load); } static int @@ -297,7 +293,7 @@ static int __init init_sched_debug_procfs(void) pe->proc_fops = &sched_debug_fops; #ifdef CONFIG_FAIR_USER_SCHED - pe = create_proc_entry("root_user_share", 0644, NULL); + pe = create_proc_entry("root_user_cpu_share", 0644, NULL); if (!pe) return -ENOMEM; -- cgit v1.1 From fad095a7b963d9e914e0cdb73e27355c47709441 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler, fix bloat Recent fix to check_preempt_wakeup() to check for preemption at higher levels caused a size bloat for !CONFIG_FAIR_GROUP_SCHED. Fix the problem. 
42277 10598 320 53195 cfcb kernel/sched.o-before_this_patch 42216 10598 320 53134 cf8e kernel/sched.o-after_this_patch Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c44a295..57e7f36 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -652,15 +652,21 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) -/* Do the two (enqueued) tasks belong to the same group ? */ -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { - if (curr->se.cfs_rq == p->se.cfs_rq) + if (se->cfs_rq == pse->cfs_rq) return 1; return 0; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -693,11 +699,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { return 1; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -787,8 +799,9 @@ static void yield_task_fair(struct rq *rq) static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr), *pcfs_rq; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; + s64 delta; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -797,21 +810,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - pcfs_rq = cfs_rq_of(pse); + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } - if (cfs_rq == pcfs_rq) { - s64 delta = se->vruntime - pse->vruntime; + delta = se->vruntime - pse->vruntime; - if (delta > (s64)sysctl_sched_wakeup_granularity) - resched_task(curr); - break; - } -#ifdef CONFIG_FAIR_GROUP_SCHED - pse = pse->parent; -#endif - } + if (delta > (s64)sysctl_sched_wakeup_granularity) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.1 From b9fa3df33f9166daf81bfa8253d339f5a7726122 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: group scheduler, fix latency There is a possibility that because of task of a group moving from one cpu to another, it may gain more cpu time that desired. See http://marc.info/?l=linux-kernel&m=119073197730334 for details. This is an attempt to fix that problem. Basically it simulates dequeue of higher level entities as if they are going to sleep. Similarly it simulate wakeup of higher level entities as if they are waking up from sleep. 
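In sketch form (editorial toy model with made-up names, ignoring the on_rq early exit), the two one-line additions change the walk up the entity hierarchy so that every level above the first is enqueued as if it were waking from sleep, and symmetrically dequeued as if going to sleep, so a group entity gets re-placed relative to the destination queue instead of keeping a stale position:

  #include <stdio.h>

  struct toy_entity {
          const char *name;
          struct toy_entity *parent;      /* hierarchy: task entity -> group entity */
  };

  static void enqueue_task_toy(struct toy_entity *se, int wakeup)
  {
          for (; se; se = se->parent) {
                  printf("enqueue %-5s wakeup=%d\n", se->name, wakeup);
                  /* the fix: parent entities are always treated as waking up */
                  wakeup = 1;
          }
  }

  int main(void)
  {
          struct toy_entity group = { "group", NULL   };
          struct toy_entity task  = { "task",  &group };

          enqueue_task_toy(&task, 0);     /* plain, non-wakeup enqueue */
          return 0;
  }

The task entity itself is enqueued with the caller's wakeup flag, while its group entity is enqueued with wakeup=1 and therefore goes through the sleeper placement path.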
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 57e7f36..de13a6f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -727,6 +727,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); + wakeup = 1; } } @@ -746,6 +747,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; + sleep = 1; } } -- cgit v1.1 From 5522d5d5f70005faeffff3ffc0cfa8eec0155de4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: mark scheduling classes as const mark scheduling classes as const. The speeds up the code a bit and shrinks it: text data bss dec hex filename 40027 4018 292 44337 ad31 sched.o.before 40190 3842 292 44324 ad24 sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- include/linux/sched.h | 4 ++-- kernel/sched.c | 17 +++++------------ kernel/sched_fair.c | 5 ++--- kernel/sched_idletask.c | 3 ++- kernel/sched_rt.c | 3 ++- 5 files changed, 13 insertions(+), 19 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 97f736b..47e3717 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -863,7 +863,7 @@ struct rq; struct sched_domain; struct sched_class { - struct sched_class *next; + const struct sched_class *next; void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); @@ -949,7 +949,7 @@ struct task_struct { int prio, static_prio, normal_prio; struct list_head run_list; - struct sched_class *sched_class; + const struct sched_class *sched_class; struct sched_entity se; #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/kernel/sched.c b/kernel/sched.c index e1657e0..f582e2c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -847,9 +847,9 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, int *this_best_prio, struct rq_iterator *iterator); #include "sched_stats.h" -#include "sched_rt.c" -#include "sched_fair.c" #include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif @@ -2251,7 +2251,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { - struct sched_class *class = sched_class_highest; + const struct sched_class *class = sched_class_highest; unsigned long total_load_moved = 0; int this_best_prio = this_rq->curr->prio; @@ -2276,7 +2276,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { - struct sched_class *class; + const struct sched_class *class; int this_best_prio = MAX_PRIO; for (class = sched_class_highest; class; class = class->next) @@ -3432,7 +3432,7 @@ static inline void schedule_debug(struct task_struct *prev) static inline struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev) { - struct sched_class *class; + const struct sched_class *class; struct task_struct *p; /* @@ -6504,13 +6504,6 @@ void __init sched_init(void) int highest_cpu = 0; int i, 
j; - /* - * Link up the scheduling class hierarchy: - */ - rt_sched_class.next = &fair_sched_class; - fair_sched_class.next = &idle_sched_class; - idle_sched_class.next = NULL; - for_each_possible_cpu(i) { struct rt_prio_array *array; struct rq *rq; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index de13a6f..32fd976 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -76,8 +76,6 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; */ const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; -extern struct sched_class fair_sched_class; - /************************************************************** * CFS operations on generic schedulable entities: */ @@ -1031,7 +1029,8 @@ static void set_curr_task_fair(struct rq *rq) /* * All the scheduling class methods: */ -struct sched_class fair_sched_class __read_mostly = { +static const struct sched_class fair_sched_class = { + .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 5ebf829..6e2ead4 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -57,7 +57,8 @@ static void set_curr_task_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -static struct sched_class idle_sched_class __read_mostly = { +const struct sched_class idle_sched_class = { + /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e1d5f1c..dbe4d8c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -225,7 +225,8 @@ static void set_curr_task_rt(struct rq *rq) p->se.exec_start = rq->clock; } -static struct sched_class rt_sched_class __read_mostly = { +const struct sched_class rt_sched_class = { + .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, -- cgit v1.1 From 3a2520157234d58abce89526756a32c272824f3f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:12 +0200 Subject: sched: whitespace cleanups more whitespace cleanups. No code changed: text data bss dec hex filename 26553 2790 288 29631 73bf sched.o.before 26553 2790 288 29631 73bf sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f582e2c..e717047 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -193,17 +193,17 @@ static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; /* Default task group. - * Every task in system belong to this group at bootup. + * Every task in system belong to this group at bootup. 
*/ -struct task_grp init_task_grp = { - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, - }; +struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, +}; #ifdef CONFIG_FAIR_USER_SCHED -#define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD #else -#define INIT_TASK_GRP_LOAD NICE_0_LOAD +# define INIT_TASK_GRP_LOAD NICE_0_LOAD #endif static int init_task_grp_load = INIT_TASK_GRP_LOAD; @@ -6516,25 +6516,25 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - { - struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); - struct sched_entity *se = - &per_cpu(init_sched_entity, i); - - init_cfs_rq_p[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = &init_task_grp; - list_add(&cfs_rq->leaf_cfs_rq_list, + { + struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); + struct sched_entity *se = + &per_cpu(init_sched_entity, i); + + init_cfs_rq_p[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = &init_task_grp; + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - init_sched_entity_p[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = init_task_grp_load; + init_sched_entity_p[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = init_task_grp_load; se->load.inv_weight = div64_64(1ULL<<32, init_task_grp_load); - se->parent = NULL; - } + se->parent = NULL; + } init_task_grp.shares = init_task_grp_load; #endif @@ -6840,9 +6840,9 @@ void sched_destroy_group(struct task_grp *tg) } /* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. */ void sched_move_task(struct task_struct *tsk) { @@ -6915,4 +6915,4 @@ int sched_group_set_shares(struct task_grp *tg, unsigned long shares) return 0; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.1 From 647e7cac2d215fb8890f79252d7eaee3d6743d66 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: vslice fixups for non-0 nice levels Make vslice accurate wrt nice levels, and add some comments while we're at it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 53 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 32fd976..1f14b56 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -217,6 +217,15 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ + +/* + * The idea is to set a period in which each task runs once. + * + * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * this period because otherwise the slices get too small. + * + * p = (nr <= nl) ? l : l*nr/nl + */ static u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; @@ -230,27 +239,45 @@ static u64 __sched_period(unsigned long nr_running) return period; } +/* + * We calculate the wall-time slice from the period by taking a part + * proportional to the weight. 
+ * + * s = p*w/rw + */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 period = __sched_period(cfs_rq->nr_running); + u64 slice = __sched_period(cfs_rq->nr_running); - period *= se->load.weight; - do_div(period, cfs_rq->load.weight); + slice *= se->load.weight; + do_div(slice, cfs_rq->load.weight); - return period; + return slice; } -static u64 __sched_vslice(unsigned long nr_running) +/* + * We calculate the vruntime slice. + * + * vs = s/w = p/rw + */ +static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) { - unsigned long period = sysctl_sched_latency; - unsigned long nr_latency = sysctl_sched_nr_latency; + u64 vslice = __sched_period(nr_running); - if (unlikely(nr_running > nr_latency)) - nr_running = nr_latency; + do_div(vslice, rq_weight); - period /= nr_running; + return vslice; +} - return (u64)period; +static u64 sched_vslice(struct cfs_rq *cfs_rq) +{ + return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); +} + +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return __sched_vslice(cfs_rq->load.weight + se->load.weight, + cfs_rq->nr_running + 1); } /* @@ -469,10 +496,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime >>= 1; } } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) - vruntime += __sched_vslice(cfs_rq->nr_running)/2; + vruntime += sched_vslice(cfs_rq)/2; if (initial && sched_feat(START_DEBIT)) - vruntime += __sched_vslice(cfs_rq->nr_running + 1); + vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { if (sched_feat(NEW_FAIR_SLEEPERS)) -- cgit v1.1 From 08ec3df5109e0555da5b9deb4382fd29733c852c Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: fix __pick_next_entity() The thing is that __pick_next_entity() must never be called when first_fair(cfs_rq) == NULL. It wouldn't be a problem, should 'run_node' be the very first field of 'struct sched_entity' (and it's the second). The 'nr_running != 0' check is _not_ enough, due to the fact that 'current' is not within the tree. Generic paths are ok (e.g. schedule() as put_prev_task() is called previously)... I'm more worried about e.g. migration_call() -> CPU_DEAD_FROZEN -> migrate_dead_tasks()... if 'current' == rq->idle, no problems.. if it's one of the SCHED_NORMAL tasks (or imagine, some other use-cases in the future -- i.e. we should not make outer world dependent on internal details of sched_fair class) -- it may be "Houston, we've got a problem" case. it's +16 bytes to the ".text". Another variant is to make 'run_node' the first data member of 'struct sched_entity' but an additional check (se ! = NULL) is still needed in pick_next_entity(). 
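To spell out the layout point: __pick_next_entity() is essentially rb_entry(first_fair(cfs_rq), struct sched_entity, run_node), and rb_entry() is container_of(), i.e. plain pointer arithmetic with no NULL check. Illustration (not from the patch) of why a NULL leftmost node turns into a bogus non-NULL pointer when run_node is not the first member:

	/* rb_entry(node, type, member) == container_of(node, type, member): */
	se = (struct sched_entity *)((char *)node -
			offsetof(struct sched_entity, run_node));
	/*
	 * node == NULL and offsetof() != 0  =>  'se' is a small bogus
	 * address rather than NULL, so a later 'se != NULL' test cannot
	 * catch it -- hence the explicit first_fair() guard added below.
	 */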
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched_fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1f14b56..fa78686 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -600,9 +600,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = NULL; - set_next_entity(cfs_rq, se); + if (first_fair(cfs_rq)) { + se = __pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + } return se; } -- cgit v1.1 From 1e819950660e6a811b549422ffb652273257e45e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: optimize schedule() a bit on SMP optimize schedule() a bit on SMP, by moving the rq-clock update outside the rq lock. code size is the same: text data bss dec hex filename 25725 2666 96 28487 6f47 sched.o.before 25725 2666 96 28487 6f47 sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e717047..4f13d37 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3481,9 +3481,13 @@ need_resched_nonpreemptible: schedule_debug(prev); - spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); + /* + * Do the rq-clock update outside the rq lock: + */ + local_irq_disable(); __update_rq_clock(rq); + spin_lock(&rq->lock); + clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && -- cgit v1.1 From 155bb293ae8387526e6e07d42b1691104e55d9a2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: tweak wakeup granularity tweak wakeup granularity. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fa78686..0856701 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -58,23 +58,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_BATCH wake-up granularity. - * (default: 25 msec, units: nanoseconds) + * (default: 10 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; +const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 1 msec, units: nanoseconds) + * (default: 10 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; +const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; /************************************************************** * CFS operations on generic schedulable entities: -- cgit v1.1 From a9957449b08ab561a33e1e038df06843b8d8dd9f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: uninline scheduler * save ~300 bytes * activate_idle_task() was moved to avoid a warning bloat-o-meter output: add/remove: 6/0 grow/shrink: 0/16 up/down: 438/-733 (-295) <=== function old new delta __enqueue_entity - 165 +165 finish_task_switch - 110 +110 update_curr_rt - 79 +79 __load_balance_iterator - 32 +32 __task_rq_unlock - 28 +28 find_process_by_pid - 24 +24 do_sched_setscheduler 133 123 -10 sys_sched_rr_get_interval 176 165 -11 sys_sched_getparam 156 145 -11 normalize_rt_tasks 482 470 -12 sched_getaffinity 112 99 -13 sys_sched_getscheduler 86 72 -14 sched_setaffinity 226 212 -14 sched_setscheduler 666 642 -24 load_balance_start_fair 33 9 -24 load_balance_next_fair 33 9 -24 dequeue_task_rt 133 67 -66 put_prev_task_rt 97 28 -69 schedule_tail 133 50 -83 schedule 682 594 -88 enqueue_entity 499 366 -133 task_new_fair 317 180 -137 Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- kernel/sched.c | 44 ++++++++++++++++++++++---------------------- kernel/sched_fair.c | 2 +- kernel/sched_rt.c | 2 +- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 4f13d37..ce9bb7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -608,7 +608,7 @@ repeat_lock_task: return rq; } -static inline void __task_rq_unlock(struct rq *rq) +static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { spin_unlock(&rq->lock); @@ -623,7 +623,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) /* * this_rq_lock - lock this runqueue and disable interrupts. */ -static inline struct rq *this_rq_lock(void) +static struct rq *this_rq_lock(void) __acquires(rq->lock) { struct rq *rq; @@ -986,20 +986,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) } /* - * activate_idle_task - move idle task to the _front_ of runqueue. - */ -static inline void activate_idle_task(struct task_struct *p, struct rq *rq) -{ - update_rq_clock(rq); - - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, 0); - inc_nr_running(p, rq); -} - -/* * deactivate_task - remove a task from the runqueue. */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) @@ -1206,7 +1192,7 @@ void kick_process(struct task_struct *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1221,7 +1207,7 @@ static inline unsigned long source_load(int cpu, int type) * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. */ -static inline unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1813,7 +1799,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.) 
*/ -static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) +static void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; @@ -3020,7 +3006,7 @@ static DEFINE_SPINLOCK(balancing); * * Balancing parameters are set up in arch_init_sched_domains. */ -static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(int cpu, enum cpu_idle_type idle) { int balance = 1; struct rq *rq = cpu_rq(cpu); @@ -4140,7 +4126,7 @@ struct task_struct *idle_task(int cpu) * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ -static inline struct task_struct *find_process_by_pid(pid_t pid) +static struct task_struct *find_process_by_pid(pid_t pid) { return pid ? find_task_by_pid(pid) : current; } @@ -5157,6 +5143,20 @@ static void migrate_live_tasks(int src_cpu) } /* + * activate_idle_task - move idle task to the _front_ of runqueue. + */ +static void activate_idle_task(struct task_struct *p, struct rq *rq) +{ + update_rq_clock(rq); + + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, 0); + inc_nr_running(p, rq); +} + +/* * Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of the runqueue. Used by CPU offline code. @@ -6494,7 +6494,7 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0856701..48604ea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -892,7 +892,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) * achieve that by always pre-iterating before returning * the current task: */ -static inline struct task_struct * +static struct task_struct * __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) { struct task_struct *p; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index dbe4d8c..2f26c3d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -7,7 +7,7 @@ * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ -static inline void update_curr_rt(struct rq *rq) +static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; u64 delta_exec; -- cgit v1.1 From a4ec24b48ddef1e93f7578be53270f0b95ad666c Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: tidy up SCHED_RR - make timeslices of SCHED_RR tasks constant and not dependent on task's static_prio [1] ; - remove obsolete code (timeslice related bits); - make sched_rr_get_interval() return something more meaningful [2] for SCHED_OTHER tasks. [1] according to the following link, it's not compliant with SUSv3 (not sure though, what is the reference for us :-) http://lkml.org/lkml/2007/3/7/656 [2] the interval is dynamic and can be depicted as follows "should a task be one of the runnable tasks at this particular moment, it would expect to run for this interval of time before being re-scheduled by the scheduler tick". (i.e. it's more precise if a task is runnable at the moment) yeah, this seems to require task_rq_lock/unlock() but this is not a hot path. 
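The rr_interval test program used in the results below is not part of the patch; a minimal equivalent built on the sched_rr_get_interval() syscall (hypothetical reconstruction, output format matched to the results) would be:

	#include <sched.h>
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts;

		/* pid 0 means the calling task */
		if (sched_rr_get_interval(0, &ts)) {
			perror("sched_rr_get_interval");
			return 1;
		}
		printf("time_slice: %ld : %ld\n", (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}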
results: (SCHED_FIFO) dimm@earth:~/storage/prog$ sudo chrt -f 10 ./rr_interval time_slice: 0 : 0 (SCHED_RR) dimm@earth:~/storage/prog$ sudo chrt 10 ./rr_interval time_slice: 0 : 99984800 (SCHED_NORMAL) dimm@earth:~/storage/prog$ ./rr_interval time_slice: 0 : 19996960 (SCHED_NORMAL + a cpu_hog of similar 'weight' on the same CPU --- so should be a half of the previous result) dimm@earth:~/storage/prog$ taskset 1 ./rr_interval time_slice: 0 : 9998480 Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +++++++++++++++++------------------------ kernel/sched_rt.c | 2 +- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index ce9bb7a..f370f10 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) /* * Some helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) #define NICE_0_LOAD SCHED_LOAD_SCALE @@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) /* * These are the 'tuning knobs' of the scheduler: * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire. */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) #ifdef CONFIG_SMP @@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) } #endif -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -/* - * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - */ -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio == NICE_TO_PRIO(19)) - return 1; - - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) @@ -4746,6 +4726,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { struct task_struct *p; + unsigned int time_slice; int retval = -EINVAL; struct timespec t; @@ -4762,9 +4743,21 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) if (retval) goto out_unlock; - jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : static_prio_timeslice(p->static_prio), &t); + if (p->policy == SCHED_FIFO) + time_slice = 0; + else if (p->policy == SCHED_RR) + time_slice = DEF_TIMESLICE; + else { + struct sched_entity *se = &p->se; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + task_rq_unlock(rq, &flags); + } read_unlock(&tasklist_lock); + jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; out_nounlock: return retval; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2f26c3d..d0097a0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) if (--p->time_slice) return; - p->time_slice = static_prio_timeslice(p->static_prio); + p->time_slice = DEF_TIMESLICE; /* * Requeue to the end of queue if we are not the only element -- cgit v1.1 From a03c9061d93822f66eb6287f8e9cf5833a12b49c Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: cleanup, remove calc_weighted() remove obsolete code -- calc_weighted() Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 48604ea..d8502ec 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -342,17 +342,6 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } -static inline unsigned long -calc_weighted(unsigned long delta, struct sched_entity *se) -{ - unsigned long weight = se->load.weight; - - if (unlikely(weight != NICE_0_LOAD)) - return (u64)delta * se->load.weight >> NICE_0_SHIFT; - else - return delta; -} - /* * Task is being enqueued - update stats: */ -- cgit v1.1 From a2a2d680735ad7c3b5250704b3863abf54ff4020 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: cleanup, make dequeue_entity() and update_stats_wait_end() similar make dequeue_entity() / enqueue_entity() and update_stats_dequeue() / update_stats_enqueue() look similar, structure-wise. zero effect, functionality-wise: text data bss dec hex filename 34550 3026 100 37676 932c sched.o.before 34550 3026 100 37676 932c sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d8502ec..7826e18 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -366,7 +366,6 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_curr(cfs_rq); /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -505,7 +504,7 @@ static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* - * Update the fair clock. + * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); @@ -524,6 +523,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + update_stats_dequeue(cfs_rq, se); if (sleep) { #ifdef CONFIG_SCHEDSTATS @@ -787,8 +791,7 @@ static void yield_task_fair(struct rq *rq) if (likely(!sysctl_sched_compat_yield)) { __update_rq_clock(rq); /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); -- cgit v1.1 From af92723262f3e0c431083f668b605a1dcdbe8f3d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: cleanup, remove the TASK_NONINTERACTIVE flag Here's another piece of low hanging obsolete fruit. 
Remove obsolete TASK_NONINTERACTIVE. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- fs/pipe.c | 3 +-- include/linux/sched.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 6b3d91a..f1fa2b4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *pipe) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(&pipe->wait, &wait, - TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); diff --git a/include/linux/sched.h b/include/linux/sched.h index 47e3717..49c7b37 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -175,8 +175,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 /* in tsk->state again */ -#define TASK_NONINTERACTIVE 64 -#define TASK_DEAD 128 +#define TASK_DEAD 64 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) -- cgit v1.1 From 3e9830dcabdeb3656855ec1b678b6bcf3b50261c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: run sched_domain_debug() if CONFIG_SCHED_DEBUG=y run sched_domain_debug() if CONFIG_SCHED_DEBUG=y, instead of relying on the hand-crafted SCHED_DOMAIN_DEBUG switch. Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f370f10..1a80ac1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5476,8 +5476,7 @@ int __init migration_init(void) int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); -#undef SCHED_DOMAIN_DEBUG -#ifdef SCHED_DOMAIN_DEBUG +#ifdef CONFIG_SCHED_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; -- cgit v1.1 From 26797a34a24cfeab9951a6f42f27432c0b2546af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: break out if printing a warning in sched_domain_debug() checkpatch.pl and Andy Whitcroft noticed the following bug: we did not break out after printing an error. Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 1a80ac1..7fefd8a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5534,16 +5534,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk("\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); + break; } if (!cpus_weight(group->cpumask)) { printk("\n"); printk(KERN_ERR "ERROR: empty group\n"); + break; } if (cpus_intersects(groupmask, group->cpumask)) { printk("\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; } cpus_or(groupmask, groupmask, group->cpumask); -- cgit v1.1 From 8927f49479756c1aff76e8202ad32733c965864f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: style cleanup fix up __setup() style bug - noticed via checkpatch.pl. 
Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7fefd8a..10b7bed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5680,7 +5680,7 @@ static int __init isolated_cpu_setup(char *str) return 1; } -__setup ("isolcpus=", isolated_cpu_setup); +__setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer -- cgit v1.1 From a65914b3658043da27c159b8a28c5811bb0a88c9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: kfree(NULL) is valid kfree(NULL) is valid. pointed out by checkpatch.pl. the fix shrinks the code a bit: text data bss dec hex filename 40024 3842 100 43966 abbe sched.o.before 40002 3842 100 43944 aba8 sched.o.after Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 10b7bed..23da933 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6784,17 +6784,14 @@ struct task_grp *sched_create_group(void) err: for_each_possible_cpu(i) { - if (tg->cfs_rq && tg->cfs_rq[i]) + if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se && tg->se[i]) + if (tg->se) kfree(tg->se[i]); } - if (tg->cfs_rq) - kfree(tg->cfs_rq); - if (tg->se) - kfree(tg->se); - if (tg) - kfree(tg); + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); return ERR_PTR(-ENOMEM); } -- cgit v1.1 From 06877c33fe9261ccdf143492c28de93c56493079 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:13 +0200 Subject: sched: cleanup: rename SCHED_FEAT_USE_TREE_AVG to SCHED_FEAT_TREE_AVG cleanup: rename SCHED_FEAT_USE_TREE_AVG to SCHED_FEAT_TREE_AVG, to make SCHED_FEAT_ names more consistent. Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- kernel/sched_fair.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 23da933..5bfe1df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -440,14 +440,14 @@ static void update_rq_clock(struct rq *rq) enum { SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, SCHED_FEAT_START_DEBIT = 2, - SCHED_FEAT_USE_TREE_AVG = 4, + SCHED_FEAT_TREE_AVG = 4, SCHED_FEAT_APPROX_AVG = 8, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_USE_TREE_AVG *0 | + SCHED_FEAT_TREE_AVG *0 | SCHED_FEAT_APPROX_AVG *0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7826e18..14a9b9b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -477,7 +477,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime = cfs_rq->min_vruntime; - if (sched_feat(USE_TREE_AVG)) { + if (sched_feat(TREE_AVG)) { struct sched_entity *last = __pick_last_entity(cfs_rq); if (last) { vruntime += last->vruntime; -- cgit v1.1 From 4cf86d77f5942336e7cd9de874b38b3c83b54d5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: cleanup: rename task_grp to task_group cleanup: rename task_grp to task_group. No need to save two characters and 'grp' is annoying to read. 
Signed-off-by: Ingo Molnar --- include/linux/sched.h | 12 ++++++------ kernel/sched.c | 36 ++++++++++++++++++------------------ kernel/sched_debug.c | 6 +++--- kernel/user.c | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 49c7b37..3cddbfc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -136,7 +136,7 @@ extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; -struct task_grp; +struct task_group; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); @@ -598,7 +598,7 @@ struct user_struct { uid_t uid; #ifdef CONFIG_FAIR_USER_SCHED - struct task_grp *tg; + struct task_group *tg; #endif }; @@ -1842,12 +1842,12 @@ extern void normalize_rt_tasks(void); #ifdef CONFIG_FAIR_GROUP_SCHED -extern struct task_grp init_task_grp; +extern struct task_group init_task_group; -extern struct task_grp *sched_create_group(void); -extern void sched_destroy_group(struct task_grp *tg); +extern struct task_group *sched_create_group(void); +extern void sched_destroy_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); -extern int sched_group_set_shares(struct task_grp *tg, unsigned long shares); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); #endif diff --git a/kernel/sched.c b/kernel/sched.c index 5bfe1df..f2b8db4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -156,7 +156,7 @@ struct rt_prio_array { struct cfs_rq; /* task group related information */ -struct task_grp { +struct task_group { /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ @@ -175,7 +175,7 @@ static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_grp init_task_grp = { +struct task_group init_task_group = { .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, }; @@ -186,17 +186,17 @@ struct task_grp init_task_grp = { # define INIT_TASK_GRP_LOAD NICE_0_LOAD #endif -static int init_task_grp_load = INIT_TASK_GRP_LOAD; +static int init_task_group_load = INIT_TASK_GRP_LOAD; /* return group to which a task belongs */ -static inline struct task_grp *task_grp(struct task_struct *p) +static inline struct task_group *task_group(struct task_struct *p) { - struct task_grp *tg; + struct task_group *tg; #ifdef CONFIG_FAIR_USER_SCHED tg = p->user->tg; #else - tg = &init_task_grp; + tg = &init_task_group; #endif return tg; @@ -205,8 +205,8 @@ static inline struct task_grp *task_grp(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_cfs_rq(struct task_struct *p) { - p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)]; - p->se.parent = task_grp(p)->se[task_cpu(p)]; + p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; + p->se.parent = task_group(p)->se[task_cpu(p)]; } #else @@ -244,7 +244,7 @@ struct cfs_rq { * list is used during load balance. */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? 
*/ - struct task_grp *tg; /* group that "owns" this runqueue */ + struct task_group *tg; /* group that "owns" this runqueue */ struct rcu_head rcu; #endif }; @@ -6522,19 +6522,19 @@ void __init sched_init(void) init_cfs_rq_p[i] = cfs_rq; init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = &init_task_grp; + cfs_rq->tg = &init_task_group; list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); init_sched_entity_p[i] = se; se->cfs_rq = &rq->cfs; se->my_q = cfs_rq; - se->load.weight = init_task_grp_load; + se->load.weight = init_task_group_load; se->load.inv_weight = - div64_64(1ULL<<32, init_task_grp_load); + div64_64(1ULL<<32, init_task_group_load); se->parent = NULL; } - init_task_grp.shares = init_task_grp_load; + init_task_group.shares = init_task_group_load; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6725,9 +6725,9 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED /* allocate runqueue etc for a new task group */ -struct task_grp *sched_create_group(void) +struct task_group *sched_create_group(void) { - struct task_grp *tg; + struct task_group *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; struct rq *rq; @@ -6800,7 +6800,7 @@ err: static void free_sched_group(struct rcu_head *rhp) { struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); - struct task_grp *tg = cfs_rq->tg; + struct task_group *tg = cfs_rq->tg; struct sched_entity *se; int i; @@ -6819,7 +6819,7 @@ static void free_sched_group(struct rcu_head *rhp) } /* Destroy runqueue etc associated with a task group */ -void sched_destroy_group(struct task_grp *tg) +void sched_destroy_group(struct task_group *tg) { struct cfs_rq *cfs_rq; int i; @@ -6895,7 +6895,7 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) spin_unlock_irq(&rq->lock); } -int sched_group_set_shares(struct task_grp *tg, unsigned long shares) +int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 48748d0..6f87b31 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -239,7 +239,7 @@ static int root_user_share_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - return sprintf(page, "%d\n", init_task_grp_load); + return sprintf(page, "%d\n", init_task_group_load); } static int @@ -260,8 +260,8 @@ root_user_share_write_proc(struct file *file, const char __user *buffer, mutex_lock(&root_user_share_mutex); - init_task_grp_load = shares; - rc = sched_group_set_shares(&init_task_grp, shares); + init_task_group_load = shares; + rc = sched_group_set_shares(&init_task_group, shares); mutex_unlock(&root_user_share_mutex); diff --git a/kernel/user.c b/kernel/user.c index c6387fa..0c9a787 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,7 +51,7 @@ struct user_struct root_user = { .session_keyring = &root_session_keyring, #endif #ifdef CONFIG_FAIR_USER_SCHED - .tg = &init_task_grp, + .tg = &init_task_group, #endif }; -- cgit v1.1 From 0702e3ebc1e42576a04d29f8adacf13be825b800 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: cleanup: function prototype cleanups noticed by Thomas Gleixner: cleanup: function prototype cleanups - move into single line wherever possible. 
Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 14a9b9b..a9dfb77 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -112,8 +112,7 @@ static inline struct task_struct *task_of(struct sched_entity *se) * Scheduling class tree data structure manipulation methods: */ -static inline u64 -max_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta > 0) @@ -122,8 +121,7 @@ max_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline u64 -min_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) @@ -132,8 +130,7 @@ min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline s64 -entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return se->vruntime - cfs_rq->min_vruntime; } @@ -141,8 +138,7 @@ entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * Enqueue an entity into the rb-tree: */ -static void -__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; @@ -179,8 +175,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } -static void -__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) cfs_rq->rb_leftmost = rb_next(&se->run_node); -- cgit v1.1 From 00bf7bfc2eaf775b634774e9ec435d720b6ecee7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: fix: move the CPU check into ->task_new_fair() noticed by Peter Zijlstra: fix: move the CPU check into ->task_new_fair(), this way we can call place_entity() and get child ->vruntime right at initial wakeup time. 
(without this there can be large latencies) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 5 +---- kernel/sched_fair.c | 3 ++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f2b8db4..b41ef66 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1660,17 +1660,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int this_cpu; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); /* parent's CPU */ update_rq_clock(rq); p->prio = effective_prio(p); - if (task_cpu(p) != this_cpu || !p->sched_class->task_new || - !current->se.on_rq) { + if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) { activate_task(rq, p, 0); } else { /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a9dfb77..f5f49176 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1007,13 +1007,14 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; + int this_cpu = smp_processor_id(); sched_info_queued(p); update_curr(cfs_rq); place_entity(cfs_rq, se, 1); - if (sysctl_sched_child_runs_first && + if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && curr->vruntime < se->vruntime) { /* * Upon rescheduling, sched_class::put_prev_task() will place -- cgit v1.1 From a58f6f253d268f7b9712bd13c344a1fd89a3192f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: export cpu_clock() export cpu_clock() - the preferred API instead of sched_clock(). Signed-off-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched.c b/kernel/sched.c index b41ef66..a3c3ec8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -470,6 +470,7 @@ unsigned long long cpu_clock(int cpu) return now; } +EXPORT_SYMBOL_GPL(cpu_clock); #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) -- cgit v1.1 From 810e95ccd58d91369191aa4ecc9e6d4a10d8d0c8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: another wakeup_granularity fix unit mis-match: wakeup_gran was used against a vruntime Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f5f49176..3ecbfd0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -818,7 +818,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - s64 delta; + s64 delta, gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -833,8 +833,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) } delta = se->vruntime - pse->vruntime; + gran = sysctl_sched_wakeup_granularity; + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); - if (delta > (s64)sysctl_sched_wakeup_granularity) + if (delta > gran) resched_task(curr); } -- cgit v1.1 From 8ca0e14ffb12c257de591571a9e96102acdb1c64 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: disable sleeper_fairness on 
SCHED_BATCH disable sleeper fairness for batch tasks - they are about batch processing after all. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3ecbfd0..410b77a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -485,7 +485,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { - if (sched_feat(NEW_FAIR_SLEEPERS)) + struct task_struct *p = container_of(se, struct task_struct, se); + + if (sched_feat(NEW_FAIR_SLEEPERS) && p->policy != SCHED_BATCH) vruntime -= sysctl_sched_latency; vruntime = max_t(s64, vruntime, se->vruntime); -- cgit v1.1 From 5cb350baf580017da38199625b7365b1763d7180 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: group scheduling, sysfs tunables Add tunables in sysfs to modify a user's cpu share. A directory is created in sysfs for each new user in the system. /sys/kernel/uids//cpu_share Reading this file returns the cpu shares granted for the user. Writing into this file modifies the cpu share for the user. Only an administrator is allowed to modify a user's cpu share. Ex: # cd /sys/kernel/uids/ # cat 512/cpu_share 1024 # echo 2048 > 512/cpu_share # cat 512/cpu_share 2048 # Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar --- Documentation/sched-design-CFS.txt | 67 +++++++++++ include/linux/sched.h | 11 ++ kernel/ksysfs.c | 8 ++ kernel/sched.c | 14 ++- kernel/sched_debug.c | 48 -------- kernel/user.c | 240 ++++++++++++++++++++++++++++++++----- 6 files changed, 309 insertions(+), 79 deletions(-) diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt index 84901e7..88bcb87 100644 --- a/Documentation/sched-design-CFS.txt +++ b/Documentation/sched-design-CFS.txt @@ -117,3 +117,70 @@ Some implementation details: iterators of the scheduling modules are used. The balancing code got quite a bit simpler as a result. + +Group scheduler extension to CFS +================================ + +Normally the scheduler operates on individual tasks and strives to provide +fair CPU time to each task. Sometimes, it may be desirable to group tasks +and provide fair CPU time to each such task group. For example, it may +be desirable to first provide fair CPU time to each user on the system +and then to each task belonging to a user. + +CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets +SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such +groups. At present, there are two (mutually exclusive) mechanisms to group +tasks for CPU bandwidth control purpose: + + - Based on user id (CONFIG_FAIR_USER_SCHED) + In this option, tasks are grouped according to their user id. + - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) + This options lets the administrator create arbitrary groups + of tasks, using the "cgroup" pseudo filesystem. See + Documentation/cgroups.txt for more information about this + filesystem. + +Only one of these options to group tasks can be chosen and not both. + +Group scheduler tunables: + +When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for +each new user and a "cpu_share" file is added in that directory. 
+ + # cd /sys/kernel/uids + # cat 512/cpu_share # Display user 512's CPU share + 1024 + # echo 2048 > 512/cpu_share # Modify user 512's CPU share + # cat 512/cpu_share # Display user 512's CPU share + 2048 + # + +CPU bandwidth between two users are divided in the ratio of their CPU shares. +For ex: if you would like user "root" to get twice the bandwidth of user +"guest", then set the cpu_share for both the users such that "root"'s +cpu_share is twice "guest"'s cpu_share + + +When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created +for each group created using the pseudo filesystem. See example steps +below to create task groups and modify their CPU share using the "cgroups" +pseudo filesystem + + # mkdir /dev/cpuctl + # mount -t cgroup -ocpu none /dev/cpuctl + # cd /dev/cpuctl + + # mkdir multimedia # create "multimedia" group of tasks + # mkdir browser # create "browser" group of tasks + + # #Configure the multimedia group to receive twice the CPU bandwidth + # #that of browser group + + # echo 2048 > multimedia/cpu.shares + # echo 1024 > browser/cpu.shares + + # firefox & # Launch firefox and move it to "browser" group + # echo > browser/tasks + + # #Launch gmplayer (or your favourite movie player) + # echo > multimedia/tasks diff --git a/include/linux/sched.h b/include/linux/sched.h index 3cddbfc..04233c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -87,6 +87,7 @@ struct sched_param { #include #include #include +#include #include @@ -599,9 +600,18 @@ struct user_struct { #ifdef CONFIG_FAIR_USER_SCHED struct task_group *tg; + struct kset kset; + struct subsys_attribute user_attr; + struct work_struct work; #endif }; +#ifdef CONFIG_FAIR_USER_SCHED +extern int uids_kobject_init(void); +#else +static inline int uids_kobject_init(void) { return 0; } +#endif + extern struct user_struct *find_user(uid_t); extern struct user_struct root_user; @@ -1848,6 +1858,7 @@ extern struct task_group *sched_create_group(void); extern void sched_destroy_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern unsigned long sched_group_shares(struct task_group *tg); #endif diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d0e5c48..6046939 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -116,6 +117,13 @@ static int __init ksysfs_init(void) ¬es_attr); } + /* + * Create "/sys/kernel/uids" directory and corresponding root user's + * directory under it. 
+ */ + if (!error) + error = uids_kobject_init(); + return error; } diff --git a/kernel/sched.c b/kernel/sched.c index a3c3ec8..9ac9989 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -162,6 +162,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + /* spinlock to serialize modification to shares */ + spinlock_t lock; }; /* Default task group's sched entity on each cpu */ @@ -6533,6 +6535,7 @@ void __init sched_init(void) se->parent = NULL; } init_task_group.shares = init_task_group_load; + spin_lock_init(&init_task_group.lock); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6777,6 +6780,7 @@ struct task_group *sched_create_group(void) } tg->shares = NICE_0_LOAD; + spin_lock_init(&tg->lock); return tg; @@ -6897,8 +6901,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + spin_lock(&tg->lock); if (tg->shares == shares) - return 0; + goto done; /* return -EINVAL if the new value is not sane */ @@ -6906,7 +6911,14 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); +done: + spin_unlock(&tg->lock); return 0; } +unsigned long sched_group_shares(struct task_group *tg) +{ + return tg->shares; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6f87b31..0aab455 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -231,45 +231,6 @@ static void sysrq_sched_debug_show(void) sched_debug_show(NULL, NULL); } -#ifdef CONFIG_FAIR_USER_SCHED - -static DEFINE_MUTEX(root_user_share_mutex); - -static int -root_user_share_read_proc(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - return sprintf(page, "%d\n", init_task_group_load); -} - -static int -root_user_share_write_proc(struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - unsigned long shares; - char kbuf[sizeof(unsigned long)+1]; - int rc = 0; - - if (copy_from_user(kbuf, buffer, sizeof(kbuf))) - return -EFAULT; - - shares = simple_strtoul(kbuf, NULL, 0); - - if (!shares) - shares = NICE_0_LOAD; - - mutex_lock(&root_user_share_mutex); - - init_task_group_load = shares; - rc = sched_group_set_shares(&init_task_group, shares); - - mutex_unlock(&root_user_share_mutex); - - return (rc < 0 ? rc : count); -} - -#endif /* CONFIG_FAIR_USER_SCHED */ - static int sched_debug_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_debug_show, NULL); @@ -292,15 +253,6 @@ static int __init init_sched_debug_procfs(void) pe->proc_fops = &sched_debug_fops; -#ifdef CONFIG_FAIR_USER_SCHED - pe = create_proc_entry("root_user_cpu_share", 0644, NULL); - if (!pe) - return -ENOMEM; - - pe->read_proc = root_user_share_read_proc; - pe->write_proc = root_user_share_write_proc; -#endif - return 0; } diff --git a/kernel/user.c b/kernel/user.c index 0c9a787..74cadea 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,7 +55,41 @@ struct user_struct root_user = { #endif }; +/* + * These routines must be called with the uidhash spinlock held! 
+ */ +static inline void uid_hash_insert(struct user_struct *up, + struct hlist_head *hashent) +{ + hlist_add_head(&up->uidhash_node, hashent); +} + +static inline void uid_hash_remove(struct user_struct *up) +{ + hlist_del_init(&up->uidhash_node); +} + +static inline struct user_struct *uid_hash_find(uid_t uid, + struct hlist_head *hashent) +{ + struct user_struct *user; + struct hlist_node *h; + + hlist_for_each_entry(user, h, hashent, uidhash_node) { + if (user->uid == uid) { + atomic_inc(&user->__count); + return user; + } + } + + return NULL; +} + #ifdef CONFIG_FAIR_USER_SCHED + +static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ +static DEFINE_MUTEX(uids_mutex); + static void sched_destroy_user(struct user_struct *up) { sched_destroy_group(up->tg); @@ -77,42 +111,173 @@ static void sched_switch_user(struct task_struct *p) sched_move_task(p); } -#else /* CONFIG_FAIR_USER_SCHED */ +static inline void uids_mutex_lock(void) +{ + mutex_lock(&uids_mutex); +} -static void sched_destroy_user(struct user_struct *up) { } -static int sched_create_user(struct user_struct *up) { return 0; } -static void sched_switch_user(struct task_struct *p) { } +static inline void uids_mutex_unlock(void) +{ + mutex_unlock(&uids_mutex); +} -#endif /* CONFIG_FAIR_USER_SCHED */ +/* return cpu shares held by the user */ +ssize_t cpu_shares_show(struct kset *kset, char *buffer) +{ + struct user_struct *up = container_of(kset, struct user_struct, kset); -/* - * These routines must be called with the uidhash spinlock held! + return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); +} + +/* modify cpu shares held by the user */ +ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) +{ + struct user_struct *up = container_of(kset, struct user_struct, kset); + unsigned long shares; + int rc; + + sscanf(buffer, "%lu", &shares); + + rc = sched_group_set_shares(up->tg, shares); + + return (rc ? rc : size); +} + +static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) +{ + sa->attr.name = name; + sa->attr.mode = mode; + sa->show = cpu_shares_show; + sa->store = cpu_shares_store; +} + +/* Create "/sys/kernel/uids/" directory and + * "/sys/kernel/uids//cpu_share" file for this user. 
*/ -static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) +static int user_kobject_create(struct user_struct *up) { - hlist_add_head(&up->uidhash_node, hashent); + struct kset *kset = &up->kset; + struct kobject *kobj = &kset->kobj; + int error; + + memset(kset, 0, sizeof(struct kset)); + kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ + kobject_set_name(kobj, "%d", up->uid); + kset_init(kset); + user_attr_init(&up->user_attr, "cpu_share", 0644); + + error = kobject_add(kobj); + if (error) + goto done; + + error = sysfs_create_file(kobj, &up->user_attr.attr); + if (error) + kobject_del(kobj); + +done: + return error; } -static inline void uid_hash_remove(struct user_struct *up) +/* create these in sysfs filesystem: + * "/sys/kernel/uids" directory + * "/sys/kernel/uids/0" directory (for root user) + * "/sys/kernel/uids/0/cpu_share" file (for root user) + */ +int __init uids_kobject_init(void) { - hlist_del_init(&up->uidhash_node); + int error; + + /* create under /sys/kernel dir */ + uids_kobject.parent = &kernel_subsys.kobj; + kobject_set_name(&uids_kobject, "uids"); + kobject_init(&uids_kobject); + + error = kobject_add(&uids_kobject); + if (!error) + error = user_kobject_create(&root_user); + + return error; } -static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) +/* work function to remove sysfs directory for a user and free up + * corresponding structures. + */ +static void remove_user_sysfs_dir(struct work_struct *w) { - struct user_struct *user; - struct hlist_node *h; + struct user_struct *up = container_of(w, struct user_struct, work); + struct kobject *kobj = &up->kset.kobj; + unsigned long flags; + int remove_user = 0; - hlist_for_each_entry(user, h, hashent, uidhash_node) { - if(user->uid == uid) { - atomic_inc(&user->__count); - return user; - } + /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() + * atomic. + */ + uids_mutex_lock(); + + local_irq_save(flags); + + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { + uid_hash_remove(up); + remove_user = 1; + spin_unlock_irqrestore(&uidhash_lock, flags); + } else { + local_irq_restore(flags); } - return NULL; + if (!remove_user) + goto done; + + sysfs_remove_file(kobj, &up->user_attr.attr); + kobject_del(kobj); + + sched_destroy_user(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); + +done: + uids_mutex_unlock(); +} + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. + */ +static inline void free_user(struct user_struct *up, unsigned long flags) +{ + /* restore back the count */ + atomic_inc(&up->__count); + spin_unlock_irqrestore(&uidhash_lock, flags); + + INIT_WORK(&up->work, remove_user_sysfs_dir); + schedule_work(&up->work); } +#else /* CONFIG_FAIR_USER_SCHED */ + +static void sched_destroy_user(struct user_struct *up) { } +static int sched_create_user(struct user_struct *up) { return 0; } +static void sched_switch_user(struct task_struct *p) { } +static inline int user_kobject_create(struct user_struct *up) { return 0; } +static inline void uids_mutex_lock(void) { } +static inline void uids_mutex_unlock(void) { } + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. 
+ */ +static inline void free_user(struct user_struct *up, unsigned long flags) +{ + uid_hash_remove(up); + spin_unlock_irqrestore(&uidhash_lock, flags); + sched_destroy_user(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); +} + +#endif /* CONFIG_FAIR_USER_SCHED */ + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). @@ -139,16 +304,10 @@ void free_uid(struct user_struct *up) return; local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { - uid_hash_remove(up); - spin_unlock_irqrestore(&uidhash_lock, flags); - sched_destroy_user(up); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - } else { + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + free_user(up, flags); + else local_irq_restore(flags); - } } struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) @@ -156,6 +315,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; + /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() + * atomic. + */ + uids_mutex_lock(); + spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -191,6 +355,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) return NULL; } + if (user_kobject_create(new)) { + sched_destroy_user(new); + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + uids_mutex_unlock(); + return NULL; + } + /* * Before adding this, check whether we raced * on adding the same user already.. @@ -198,7 +371,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - sched_destroy_user(new); + /* This case is not possible when CONFIG_FAIR_USER_SCHED + * is defined, since we serialize alloc_uid() using + * uids_mutex. Hence no need to call + * sched_destroy_user() or remove_user_sysfs_dir(). + */ key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -209,6 +386,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_unlock_irq(&uidhash_lock); } + + uids_mutex_unlock(); + return up; } -- cgit v1.1 From 638e13ac37a1a89473415f407cbffc1688a20fe2 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: fix rt ptracer monopolizing CPU yield() in wait_task_inactive(), can cause a high priority thread to be scheduled back in, and there by loop forever while it is waiting for some lower priority thread which is unfortunately still on the runqueue. Use schedule_timeout_uninterruptible(1) instead. Signed-off-by: Gautham R Shenoy Credit: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 9ac9989..48fc74b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1133,7 +1133,7 @@ repeat: * yield - it could be a while. 
*/ if (unlikely(on_rq)) { - yield(); + schedule_timeout_uninterruptible(1); goto repeat; } -- cgit v1.1 From ace8b3d633f93da8535921bf3e3679db3c619578 Mon Sep 17 00:00:00 2001 From: Zou Nan hai Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: some proc entries are missed in sched_domain sys_ctl debug code cache_nice_tries and flags entry do not appear in proc fs sched_domain directory, because ctl_table entry is skipped. This patch fixes the issue. Signed-off-by: Zou Nan hai Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 48fc74b..b7dff36 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5284,7 +5284,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(12); set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); @@ -5304,10 +5304,10 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "cache_nice_tries", + set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[12], "flags", &sd->flags, + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); return table; -- cgit v1.1 From e62dd02ed0af35631c6ca473e50758c9594773cf Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: fix group scheduling for SCHED_BATCH The following patch (sched: disable sleeper_fairness on SCHED_BATCH) seems to break GROUP_SCHED. Although, it may be 'oops'-less due to the possibility of 'p' being always a valid address. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 410b77a..3ac096e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -485,9 +485,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { - struct task_struct *p = container_of(se, struct task_struct, se); - - if (sched_feat(NEW_FAIR_SLEEPERS) && p->policy != SCHED_BATCH) + if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && + task_of(se)->policy != SCHED_BATCH) vruntime -= sysctl_sched_latency; vruntime = max_t(s64, vruntime, se->vruntime); -- cgit v1.1 From ce6c131131df442f0d49d064129ecc52d9fe8ca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: disable forced preemption by default Implement feature bit to disable forced preemption. This way it can be checked whether a workload is overscheduling or not. 
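[ A minimal sketch of the feature-bit mechanism this patch relies on, condensed from the hunks below; an illustration, not the literal patch: ]

enum {
	SCHED_FEAT_WAKEUP_PREEMPT	= 16,
};

/*
 * OR-ed default mask; a feature is turned off by multiplying its bit
 * by 0 instead of 1 (see the hunk below):
 */
const_debug unsigned int sysctl_sched_features =
		SCHED_FEAT_WAKEUP_PREEMPT *1;

#define sched_feat(x)	(sysctl_sched_features & SCHED_FEAT_##x)

/*
 * check_preempt_wakeup() then only calls resched_task(curr) when
 * sched_feat(WAKEUP_PREEMPT) is set, and entity_tick() falls back to
 * tick-driven preemption checks when it is clear.
 */

With CONFIG_SCHED_DEBUG (where const_debug variables stay writable), the bit can be cleared to compare a workload's context-switch rate with and without wakeup preemption.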
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- kernel/sched_fair.c | 24 +++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index b7dff36..0bd8f2c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -444,13 +444,15 @@ enum { SCHED_FEAT_START_DEBIT = 2, SCHED_FEAT_TREE_AVG = 4, SCHED_FEAT_APPROX_AVG = 8, + SCHED_FEAT_WAKEUP_PREEMPT = 16, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_TREE_AVG *0 | - SCHED_FEAT_APPROX_AVG *0; + SCHED_FEAT_APPROX_AVG *0 | + SCHED_FEAT_WAKEUP_PREEMPT *1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3ac096e..3843ec7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -626,7 +626,7 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) */ update_curr(cfs_rq); - if (cfs_rq->nr_running > 1) + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); } @@ -828,18 +828,20 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); - } + if (sched_feat(WAKEUP_PREEMPT)) { + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } - delta = se->vruntime - pse->vruntime; - gran = sysctl_sched_wakeup_granularity; - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + delta = se->vruntime - pse->vruntime; + gran = sysctl_sched_wakeup_granularity; + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); - if (delta > gran) - resched_task(curr); + if (delta > gran) + resched_task(curr); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) -- cgit v1.1 From 95938a35c5562afa7af7252821e44132391a3db8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: prevent wakeup over-scheduling Prevent wakeup over-scheduling. Once a task has been preempted by a task of the same or lower priority, it becomes ineligible for repeated preemption by same until it has been ticked, or slept. Instead, the task is marked for preemption at the next tick. Tasks of higher priority still preempt immediately. 
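[ Sketch of the resulting decision points, condensed from the hunks below with comments added; not the literal patch: ]

/* wakeup path (check_preempt_wakeup): */
if (delta > gran) {
	int now = !sched_feat(PREEMPT_RESTRICT);

	/*
	 * Preempt immediately if the restriction is disabled, if the
	 * waking task has higher priority, or if this is the first
	 * peer preemption since the last tick/sleep; otherwise only
	 * mark the running task via peer_preempt.
	 */
	if (now || p->prio < curr->prio || !se->peer_preempt++)
		resched_task(curr);
}

/* tick path (check_preempt_tick): a marked task is rescheduled here */
if (delta_exec > ideal_runtime ||
    (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
	resched_task(rq_of(cfs_rq)->curr);
curr->peer_preempt = 0;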
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 4 +++- kernel/sched_fair.c | 14 +++++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 04233c8..8be5b57 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -912,6 +912,7 @@ struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; + int peer_preempt; u64 exec_start; u64 sum_exec_runtime; diff --git a/kernel/sched.c b/kernel/sched.c index 0bd8f2c..e8051bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -445,6 +445,7 @@ enum { SCHED_FEAT_TREE_AVG = 4, SCHED_FEAT_APPROX_AVG = 8, SCHED_FEAT_WAKEUP_PREEMPT = 16, + SCHED_FEAT_PREEMPT_RESTRICT = 32, }; const_debug unsigned int sysctl_sched_features = @@ -452,7 +453,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_START_DEBIT *1 | SCHED_FEAT_TREE_AVG *0 | SCHED_FEAT_APPROX_AVG *0 | - SCHED_FEAT_WAKEUP_PREEMPT *1; + SCHED_FEAT_WAKEUP_PREEMPT *1 | + SCHED_FEAT_PREEMPT_RESTRICT *1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3843ec7..f819f943 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -526,6 +526,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { + se->peer_preempt = 0; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -553,8 +554,10 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) + if (delta_exec > ideal_runtime || + (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) resched_task(rq_of(cfs_rq)->curr); + curr->peer_preempt = 0; } static void @@ -839,8 +842,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se->load.weight != NICE_0_LOAD)) gran = calc_delta_fair(gran, &se->load); - if (delta > gran) - resched_task(curr); + if (delta > gran) { + int now = !sched_feat(PREEMPT_RESTRICT); + + if (now || p->prio < curr->prio || !se->peer_preempt++) + resched_task(curr); + } } } @@ -1034,6 +1041,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) check_spread(cfs_rq, curr); __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); + se->peer_preempt = 0; resched_task(rq->curr); } -- cgit v1.1 From d274a4cee190c880ec25b60501efe50c4435b3d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: update comment update comment: clarify time-slices and remove obsolete tuning detail. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f819f943..ec1592e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -25,14 +25,12 @@ * (default: 20ms, units: nanoseconds) * * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length. - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches field) + * 'timeslice length' - timeslices in CFS are of variable length + * and have no persistent notion like in traditional, time-slice + * based scheduling concepts. 
* - * On SMP systems the value of this is multiplied by the log2 of the - * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way - * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) - * Targeted preemption latency for CPU-bound tasks: + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches (cs) field) */ const_debug unsigned int sysctl_sched_latency = 20000000ULL; -- cgit v1.1 From 3a5c359a58c39801d838c508f127bdb228af28b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: cleanup: remove unnecessary gotos Replace loops implemented with gotos with real loops. Replace err = ...; goto x; x: return err; with return ...; No functional changes. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar --- kernel/sched.c | 327 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 162 insertions(+), 165 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e8051bd..4c15b17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -562,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { - struct rq *rq; - -repeat_lock_task: - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { + for (;;) { + struct rq *rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; spin_unlock(&rq->lock); - goto repeat_lock_task; } - return rq; } /* @@ -584,15 +581,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) { struct rq *rq; -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { + for (;;) { + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; } - return rq; } static void __task_rq_unlock(struct rq *rq) @@ -1083,69 +1079,71 @@ void wait_task_inactive(struct task_struct *p) int running, on_rq; struct rq *rq; -repeat: - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) - cpu_relax(); + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) + cpu_relax(); - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. 
If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - on_rq = p->se.on_rq; - task_rq_unlock(rq, &flags); + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + on_rq = p->se.on_rq; + task_rq_unlock(rq, &flags); - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - goto repeat; - } + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it wa still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); - goto repeat; - } + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it wa still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(on_rq)) { + schedule_timeout_uninterruptible(1); + continue; + } - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! 
+ */ + break; + } } /*** @@ -1236,7 +1234,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* Skip over this group if it has no CPUs allowed */ if (!cpus_intersects(group->cpumask, p->cpus_allowed)) - goto nextgroup; + continue; local_group = cpu_isset(this_cpu, group->cpumask); @@ -1264,9 +1262,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) min_load = avg_load; idlest = group; } -nextgroup: - group = group->next; - } while (group != sd->groups); + } while (group = group->next, group != sd->groups); if (!idlest || 100*this_load < imbalance*min_load) return NULL; @@ -3517,27 +3513,30 @@ asmlinkage void __sched preempt_schedule(void) if (likely(ti->preempt_count || irqs_disabled())) return; -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ + do { + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ #ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; #endif - schedule(); + schedule(); #ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; + task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); } EXPORT_SYMBOL(preempt_schedule); @@ -3557,29 +3556,32 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ + do { + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ #ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; #endif - local_irq_enable(); - schedule(); - local_irq_disable(); + local_irq_enable(); + schedule(); + local_irq_disable(); #ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; + task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. 
+ */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); } #endif /* CONFIG_PREEMPT */ @@ -4324,10 +4326,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) asmlinkage long sys_sched_getscheduler(pid_t pid) { struct task_struct *p; - int retval = -EINVAL; + int retval; if (pid < 0) - goto out_nounlock; + return -EINVAL; retval = -ESRCH; read_lock(&tasklist_lock); @@ -4338,8 +4340,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) retval = p->policy; } read_unlock(&tasklist_lock); - -out_nounlock: return retval; } @@ -4352,10 +4352,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) { struct sched_param lp; struct task_struct *p; - int retval = -EINVAL; + int retval; if (!param || pid < 0) - goto out_nounlock; + return -EINVAL; read_lock(&tasklist_lock); p = find_process_by_pid(pid); @@ -4375,7 +4375,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) */ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -out_nounlock: return retval; out_unlock: @@ -4731,11 +4730,11 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { struct task_struct *p; unsigned int time_slice; - int retval = -EINVAL; + int retval; struct timespec t; if (pid < 0) - goto out_nounlock; + return -EINVAL; retval = -ESRCH; read_lock(&tasklist_lock); @@ -4763,8 +4762,8 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; -out_nounlock: return retval; + out_unlock: read_unlock(&tasklist_lock); return retval; @@ -5070,35 +5069,34 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) struct rq *rq; int dest_cpu; -restart: - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu == NR_CPUS) { - rq = task_rq_lock(p, &flags); - cpus_setall(p->cpus_allowed); - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); + do { + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, p->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(p->cpus_allowed); + + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(p, &flags); + cpus_setall(p->cpus_allowed); + dest_cpu = any_online_cpu(p->cpus_allowed); + task_rq_unlock(rq, &flags); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - p->pid, p->comm, dead_cpu); - } - if (!__migrate_task(p, dead_cpu, dest_cpu)) - goto restart; + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. 
+ */ + if (p->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + p->pid, p->comm, dead_cpu); + } + } while (!__migrate_task(p, dead_cpu, dest_cpu)); } /* @@ -5913,24 +5911,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) if (!sg) return; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; + do { + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); - } - sg = sg->next; - if (sg != group_head) - goto next_sg; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); + } + sg = sg->next; + } while (sg != group_head); } #endif -- cgit v1.1 From 8cbbe86dfcfd68ad69916164bdc838d9e09adca8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Oct 2007 17:00:14 +0200 Subject: sched: cleanup: refactor common code of sleep_on / wait_for_completion Refactor common code of sleep_on / wait_for_completion These functions were largely cut'n'pasted. This moves the common code into single helpers instead. Advantage is about 1k less code on x86-64 and 91 lines of code removed. It adds one function call to the non timeout version of the functions; i don't expect this to be measurable. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar --- kernel/sched.c | 188 +++++++++++++++------------------------------------------ 1 file changed, 49 insertions(+), 139 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 4c15b17..db88b56 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3697,206 +3697,116 @@ void fastcall complete_all(struct completion *x) } EXPORT_SYMBOL(complete_all); -void fastcall __sched wait_for_completion(struct completion *x) +static inline long __sched +do_wait_for_common(struct completion *x, long timeout, int state) { - might_sleep(); - - spin_lock_irq(&x->wait.lock); if (!x->done) { DECLARE_WAITQUEUE(wait, current); wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - schedule(); - spin_lock_irq(&x->wait.lock); - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; - spin_unlock_irq(&x->wait.lock); -} -EXPORT_SYMBOL(wait_for_completion); - -unsigned long fastcall __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - __set_current_state(TASK_UNINTERRUPTIBLE); + if (state == TASK_INTERRUPTIBLE && + signal_pending(current)) { + __remove_wait_queue(&x->wait, &wait); + return -ERESTARTSYS; + } + __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); if (!timeout) { __remove_wait_queue(&x->wait, &wait); - goto out; + return timeout; } } while (!x->done); __remove_wait_queue(&x->wait, &wait); } x->done--; -out: - spin_unlock_irq(&x->wait.lock); return timeout; } -EXPORT_SYMBOL(wait_for_completion_timeout); -int fastcall __sched wait_for_completion_interruptible(struct completion *x) +static long 
__sched +wait_for_common(struct completion *x, long timeout, int state) { - int ret = 0; - might_sleep(); spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending(current)) { - ret = -ERESTARTSYS; - __remove_wait_queue(&x->wait, &wait); - goto out; - } - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - schedule(); - spin_lock_irq(&x->wait.lock); - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; -out: + timeout = do_wait_for_common(x, timeout, state); spin_unlock_irq(&x->wait.lock); + return timeout; +} - return ret; +void fastcall __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } -EXPORT_SYMBOL(wait_for_completion_interruptible); +EXPORT_SYMBOL(wait_for_completion); unsigned long fastcall __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) +wait_for_completion_timeout(struct completion *x, unsigned long timeout) { - might_sleep(); - - spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending(current)) { - timeout = -ERESTARTSYS; - __remove_wait_queue(&x->wait, &wait); - goto out; - } - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - goto out; - } - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; -out: - spin_unlock_irq(&x->wait.lock); - return timeout; + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); } -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +EXPORT_SYMBOL(wait_for_completion_timeout); -static inline void -sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) +int __sched wait_for_completion_interruptible(struct completion *x) { - spin_lock_irqsave(&q->lock, *flags); - __add_wait_queue(q, wait); - spin_unlock(&q->lock); + return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); } +EXPORT_SYMBOL(wait_for_completion_interruptible); -static inline void -sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) +unsigned long fastcall __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) { - spin_lock_irq(&q->lock); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, *flags); + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); } +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); -void __sched interruptible_sleep_on(wait_queue_head_t *q) +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) { unsigned long flags; wait_queue_t wait; init_waitqueue_entry(&wait, current); - current->state = TASK_INTERRUPTIBLE; + __set_current_state(state); - sleep_on_head(q, &wait, &flags); - schedule(); - sleep_on_tail(q, &wait, &flags); + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, &wait); + spin_unlock(&q->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&q->lock); + __remove_wait_queue(q, &wait); + spin_unlock_irqrestore(&q->lock, flags); + + return timeout; +} + +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_INTERRUPTIBLE, 
MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(interruptible_sleep_on); long __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_INTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); - - return timeout; + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); } EXPORT_SYMBOL(interruptible_sleep_on_timeout); void __sched sleep_on(wait_queue_head_t *q) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_UNINTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - schedule(); - sleep_on_tail(q, &wait, &flags); + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(sleep_on); long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_UNINTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); - - return timeout; + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); } EXPORT_SYMBOL(sleep_on_timeout); -- cgit v1.1 From 3a5e4dc12f23fb96fafd4f5d0f61e6c3070f80a5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Oct 2007 17:00:15 +0200 Subject: sched: cleanup: refactor normalize_rt_tasks Replace a particularly ugly ifdef with an inline and a new macro. Also split up the function to be easier to read. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index db88b56..2c6295b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,6 +75,12 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)jiffies * (1000000000 / HZ); } +#ifdef CONFIG_SMP +#define is_migration_thread(p, rq) ((p) == (rq)->migration_thread) +#else +#define is_migration_thread(p, rq) 0 +#endif + /* * Convert user-nice values [ -20 ... 0 ... 
19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -6532,12 +6538,25 @@ EXPORT_SYMBOL(__might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ + int on_rq; + update_rq_clock(rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(rq, p, 0); + resched_task(rq->curr); + } +} + void normalize_rt_tasks(void) { struct task_struct *g, *p; unsigned long flags; struct rq *rq; - int on_rq; read_lock_irq(&tasklist_lock); do_each_thread(g, p) { @@ -6561,26 +6580,10 @@ void normalize_rt_tasks(void) spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); -#ifdef CONFIG_SMP - /* - * Do not touch the migration thread: - */ - if (p == rq->migration_thread) - goto out_unlock; -#endif - update_rq_clock(rq); - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); - if (on_rq) { - activate_task(rq, p, 0); - resched_task(rq->curr); - } -#ifdef CONFIG_SMP - out_unlock: -#endif + if (!is_migration_thread(p, rq)) + normalize_task(rq, p); + __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); } while_each_thread(g, p); -- cgit v1.1 From d5036e89dcf7c19b3d03219d7d385bc96965b7fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:15 +0200 Subject: sched: clean up is_migration_thread() clean up is_migration_thread() and turn it into an inline function. Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 2c6295b..7ef66bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,12 +75,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)jiffies * (1000000000 / HZ); } -#ifdef CONFIG_SMP -#define is_migration_thread(p, rq) ((p) == (rq)->migration_thread) -#else -#define is_migration_thread(p, rq) 0 -#endif - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -371,6 +365,15 @@ static inline int cpu_of(struct rq *rq) #endif } +static inline int is_migration_thread(struct task_struct *p, struct rq *rq) +{ +#ifdef CONFIG_SMP + return p == rq->migration_thread; +#else + return 0; +#endif +} + /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: -- cgit v1.1 From 1666703af948ae87c87c2bc7121aa34271cc52ab Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: remove stale comment from sched_group_set_shares() remove stale comment from sched_group_set_shares(). Function never returns -EINVAL. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7ef66bd..fc61b1f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6822,8 +6822,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (tg->shares == shares) goto done; - /* return -EINVAL if the new value is not sane */ - tg->shares = shares; for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); -- cgit v1.1 From 178be793485d70d871a0fd46b29e9e3e7da636ad Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: do not normalize kernel threads via SysRq-N do not normalize kernel threads via SysRq-N: the migration threads, softlockup threads, etc. 
might be essential for the system to function properly. So only zap user tasks. pointed out by Andi Kleen. Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index fc61b1f..791dd08 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -365,15 +365,6 @@ static inline int cpu_of(struct rq *rq) #endif } -static inline int is_migration_thread(struct task_struct *p, struct rq *rq) -{ -#ifdef CONFIG_SMP - return p == rq->migration_thread; -#else - return 0; -#endif -} - /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: @@ -6563,6 +6554,12 @@ void normalize_rt_tasks(void) read_lock_irq(&tasklist_lock); do_each_thread(g, p) { + /* + * Only normalize user tasks: + */ + if (!p->mm) + continue; + p->se.exec_start = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -6584,8 +6581,7 @@ void normalize_rt_tasks(void) spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); - if (!is_migration_thread(p, rq)) - normalize_task(rq, p); + normalize_task(rq, p); __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); -- cgit v1.1 From fb7dde37ece82e13de383afd7042c45df67a9d17 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: generate uevents for user creation/destruction Generate uevents when a user is being created/destroyed. These events can be used to configure cpu share of a new user. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar --- kernel/user.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/user.c b/kernel/user.c index 74cadea..f0e561e 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -174,6 +174,8 @@ static int user_kobject_create(struct user_struct *up) if (error) kobject_del(kobj); + kobject_uevent(kobj, KOBJ_ADD); + done: return error; } @@ -189,6 +191,7 @@ int __init uids_kobject_init(void) /* create under /sys/kernel dir */ uids_kobject.parent = &kernel_subsys.kobj; + uids_kobject.kset = &kernel_subsys; kobject_set_name(&uids_kobject, "uids"); kobject_init(&uids_kobject); @@ -228,6 +231,7 @@ static void remove_user_sysfs_dir(struct work_struct *w) goto done; sysfs_remove_file(kobj, &up->user_attr.attr); + kobject_uevent(kobj, KOBJ_REMOVE); kobject_del(kobj); sched_destroy_user(up); -- cgit v1.1 From 91c234b4e3419c786cac2d5b7a7b96443e512e3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: do not wakeup-preempt with SCHED_BATCH tasks do not wakeup-preempt with SCHED_BATCH tasks, their preemption is batched too, driven by the tick. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec1592e..c240b72 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -828,6 +828,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + /* + * Batch tasks do not preempt (their preemption is driven by + * the tick): + */ + if (unlikely(p->policy == SCHED_BATCH)) + return; if (sched_feat(WAKEUP_PREEMPT)) { while (!is_same_group(se, pse)) { -- cgit v1.1 From e5f32a3856caabe745381279f7f32e3b581b59dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: speed up context-switches a bit speed up context-switches a bit by not clearing p->exec_start. 
(as a side-effect, this also makes p->exec_start a universal timestamp available to cache-hot estimations.) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c240b72..cea1fa3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -379,15 +379,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) se->exec_start = rq_of(cfs_rq)->clock; } -/* - * We are descheduling a task - update its stats: - */ -static inline void -update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - se->exec_start = 0; -} - /************************************************** * Scheduling class queueing methods: */ @@ -609,8 +600,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_curr(cfs_rq); - update_stats_curr_end(cfs_rq, prev); - check_spread(cfs_rq, prev); if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); -- cgit v1.1 From da84d96176729fb48a8458561e5d8647103168b8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: reintroduce cache-hot affinity reintroduce a simplified version of cache-hot/cold scheduling affinity. This improves performance with certain SMP workloads, such as sysbench. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 27 +++++++++++++++++++++++++++ kernel/sched_fair.c | 2 ++ kernel/sysctl.c | 8 ++++++++ 4 files changed, 38 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 8be5b57..fcc9a5a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1415,6 +1415,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; +extern unsigned int sysctl_sched_migration_cost; #endif extern unsigned int sysctl_sched_compat_yield; diff --git a/kernel/sched.c b/kernel/sched.c index 791dd08..089d8b1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2119,6 +2119,17 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, } /* + * Is this task likely cache-hot: + */ +static inline int +task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +{ + s64 delta = now - p->se.exec_start; + + return delta < (long long)sysctl_sched_migration_cost; +} + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static @@ -2139,6 +2150,22 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (task_running(rq, p)) return 0; + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. 
+ */ + + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->clock, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + return 1; + } + + if (task_hot(p, rq->clock, sd)) + return 0; return 1; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cea1fa3..a17b785 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -74,6 +74,8 @@ const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; */ const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + /************************************************************** * CFS operations on generic schedulable entities: */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 230ca4e..ec14aa8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -277,6 +277,14 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_migration_cost", + .data = &sysctl_sched_migration_cost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.1 From ff56b2f01537aef7237d5ac8bf6bfbb409c1a127 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: activate task_hot() only on fair-scheduled tasks activate task_hot() only for fair-scheduled tasks (i.e. disable it for RT tasks). Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 089d8b1..945ab13 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2122,11 +2122,16 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Is this task likely cache-hot: */ static inline int -task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { - s64 delta = now - p->se.exec_start; + s64 delta; - return delta < (long long)sysctl_sched_migration_cost; + if (p->sched_class != &fair_sched_class) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; } /* -- cgit v1.1 From 2d92f22784b7b8879ebe3254e44c92cb8792b0dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: debug: increase width of debug line increase width of debug line - in preparation of more debugging info. 
Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 0aab455..7558159 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -198,7 +198,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -271,11 +271,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); - SEQ_printf(m, "----------------------------------------------\n"); + SEQ_printf(m, + "---------------------------------------------------------\n"); #define P(F) \ - SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) #define PN(F) \ - SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) PN(se.exec_start); PN(se.vruntime); @@ -292,7 +293,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.wait_max); P(sched_info.bkl_count); #endif - SEQ_printf(m, "%-25s:%20Ld\n", + SEQ_printf(m, "%-35s:%21Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); P(se.load.weight); P(policy); @@ -305,7 +306,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) t0 = sched_clock(); t1 = sched_clock(); - SEQ_printf(m, "%-25s:%20Ld\n", + SEQ_printf(m, "%-35s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } } -- cgit v1.1 From cc367732ff0b1c63d0d7bdd11e6d1661794ef6a3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: debug, improve migration statistics add new migration statistics when SCHED_DEBUG and SCHEDSTATS is enabled. Available in /proc//sched. 
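[ The new fields are accumulated with the existing schedstat_inc() helper, which compiles away unless CONFIG_SCHEDSTATS is set - roughly the following pattern (the exact definition in this tree may differ slightly): ]

#ifdef CONFIG_SCHEDSTATS
# define schedstat_inc(p, field)	do { (p)->field++; } while (0)
#else
# define schedstat_inc(p, field)	do { } while (0)
#endif

	/* e.g. in try_to_wake_up(), taken from the hunks below: */
	schedstat_inc(p, se.nr_wakeups);
	if (orig_cpu != cpu)
		schedstat_inc(p, se.nr_wakeups_migrate);

The accumulated counters are then printed per task by proc_sched_show_task() alongside the existing se.* statistics.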
Signed-off-by: Ingo Molnar --- include/linux/sched.h | 18 +++++++++++ kernel/sched.c | 76 +++++++++++++++++++++++++++++++------------- kernel/sched_debug.c | 87 +++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 149 insertions(+), 32 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index fcc9a5a..3a6e05e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -931,6 +931,24 @@ struct sched_entity { u64 block_max; u64 exec_max; u64 slice_max; + + u64 nr_migrations; + u64 nr_migrations_cold; + u64 nr_failed_migrations_affine; + u64 nr_failed_migrations_running; + u64 nr_failed_migrations_hot; + u64 nr_forced_migrations; + u64 nr_forced2_migrations; + + u64 nr_wakeups; + u64 nr_wakeups_sync; + u64 nr_wakeups_migrate; + u64 nr_wakeups_local; + u64 nr_wakeups_remote; + u64 nr_wakeups_affine; + u64 nr_wakeups_affine_attempts; + u64 nr_wakeups_passive; + u64 nr_wakeups_idle; #endif #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched.c b/kernel/sched.c index 945ab13..3b27c3a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1005,6 +1005,23 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #ifdef CONFIG_SMP +/* + * Is this task likely cache-hot: + */ +static inline int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +{ + s64 delta; + + if (p->sched_class != &fair_sched_class) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); @@ -1022,6 +1039,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; + if (old_cpu != new_cpu) { + schedstat_inc(p, se.nr_migrations); + if (task_hot(p, old_rq->clock, NULL)) + schedstat_inc(p, se.nr_forced2_migrations); + } #endif p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -1394,8 +1416,13 @@ static int wake_idle(int cpu, struct task_struct *p) if (sd->flags & SD_WAKE_IDLE) { cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { - if (idle_cpu(i)) + if (idle_cpu(i)) { + if (i != task_cpu(p)) { + schedstat_inc(p, + se.nr_wakeups_idle); + } return i; + } } } else { break; @@ -1426,7 +1453,7 @@ static inline int wake_idle(int cpu, struct task_struct *p) */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { - int cpu, this_cpu, success = 0; + int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; long old_state; struct rq *rq; @@ -1445,6 +1472,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) goto out_running; cpu = task_cpu(p); + orig_cpu = cpu; this_cpu = smp_processor_id(); #ifdef CONFIG_SMP @@ -1488,6 +1516,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) unsigned long tl = this_load; unsigned long tl_per_task; + schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); /* @@ -1507,6 +1536,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) * there is no bad imbalance. 
*/ schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); goto out_set_cpu; } } @@ -1518,6 +1548,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_BALANCE) { if (imbalance*this_load <= 100*load) { schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); goto out_set_cpu; } } @@ -1543,6 +1574,15 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ + schedstat_inc(p, se.nr_wakeups); + if (sync) + schedstat_inc(p, se.nr_wakeups_sync); + if (orig_cpu != cpu) + schedstat_inc(p, se.nr_wakeups_migrate); + if (cpu == this_cpu) + schedstat_inc(p, se.nr_wakeups_local); + else + schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); /* @@ -2119,22 +2159,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, } /* - * Is this task likely cache-hot: - */ -static inline int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - if (p->sched_class != &fair_sched_class) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - -/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static @@ -2148,12 +2172,16 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpu_isset(this_cpu, p->cpus_allowed)) + if (!cpu_isset(this_cpu, p->cpus_allowed)) { + schedstat_inc(p, se.nr_failed_migrations_affine); return 0; + } *all_pinned = 0; - if (task_running(rq, p)) + if (task_running(rq, p)) { + schedstat_inc(p, se.nr_failed_migrations_running); return 0; + } /* * Aggressive migration if: @@ -2163,14 +2191,18 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS - if (task_hot(p, rq->clock, sd)) + if (task_hot(p, rq->clock, sd)) { schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(p, se.nr_forced_migrations); + } #endif return 1; } - if (task_hot(p, rq->clock, sd)) + if (task_hot(p, rq->clock, sd)) { + schedstat_inc(p, se.nr_failed_migrations_hot); return 0; + } return 1; } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7558159..27e82cb 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -260,6 +260,7 @@ __initcall(init_sched_debug_procfs); void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { + unsigned long nr_switches; unsigned long flags; int num_threads = 1; @@ -273,8 +274,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); SEQ_printf(m, "---------------------------------------------------------\n"); +#define __P(F) \ + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) #define P(F) \ SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) @@ -282,6 +287,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.vruntime); PN(se.sum_exec_runtime); + nr_switches = p->nvcsw + p->nivcsw; + #ifdef CONFIG_SCHEDSTATS PN(se.wait_start); PN(se.sleep_start); @@ -292,14 +299,55 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.slice_max); PN(se.wait_max); 
P(sched_info.bkl_count); + P(se.nr_migrations); + P(se.nr_migrations_cold); + P(se.nr_failed_migrations_affine); + P(se.nr_failed_migrations_running); + P(se.nr_failed_migrations_hot); + P(se.nr_forced_migrations); + P(se.nr_forced2_migrations); + P(se.nr_wakeups); + P(se.nr_wakeups_sync); + P(se.nr_wakeups_migrate); + P(se.nr_wakeups_local); + P(se.nr_wakeups_remote); + P(se.nr_wakeups_affine); + P(se.nr_wakeups_affine_attempts); + P(se.nr_wakeups_passive); + P(se.nr_wakeups_idle); + + { + u64 avg_atom, avg_per_cpu; + + avg_atom = p->se.sum_exec_runtime; + if (nr_switches) + do_div(avg_atom, nr_switches); + else + avg_atom = -1LL; + + avg_per_cpu = p->se.sum_exec_runtime; + if (p->se.nr_migrations) + avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations); + else + avg_per_cpu = -1LL; + + __PN(avg_atom); + __PN(avg_per_cpu); + } #endif + __P(nr_switches); SEQ_printf(m, "%-35s:%21Ld\n", - "nr_switches", (long long)(p->nvcsw + p->nivcsw)); + "nr_voluntary_switches", (long long)p->nvcsw); + SEQ_printf(m, "%-35s:%21Ld\n", + "nr_involuntary_switches", (long long)p->nivcsw); + P(se.load.weight); P(policy); P(prio); -#undef P #undef PN +#undef __PN +#undef P +#undef __P { u64 t0, t1; @@ -314,13 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.sleep_max = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.wait_max = 0; - p->sched_info.bkl_count = 0; + p->se.wait_max = 0; + p->se.sleep_max = 0; + p->se.sum_sleep_runtime = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + p->se.nr_migrations = 0; + p->se.nr_migrations_cold = 0; + p->se.nr_failed_migrations_affine = 0; + p->se.nr_failed_migrations_running = 0; + p->se.nr_failed_migrations_hot = 0; + p->se.nr_forced_migrations = 0; + p->se.nr_forced2_migrations = 0; + p->se.nr_wakeups = 0; + p->se.nr_wakeups_sync = 0; + p->se.nr_wakeups_migrate = 0; + p->se.nr_wakeups_local = 0; + p->se.nr_wakeups_remote = 0; + p->se.nr_wakeups_affine = 0; + p->se.nr_wakeups_affine_attempts = 0; + p->se.nr_wakeups_passive = 0; + p->se.nr_wakeups_idle = 0; + p->sched_info.bkl_count = 0; #endif - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->nvcsw = 0; + p->nivcsw = 0; } -- cgit v1.1 From 6bc1665ba71de0f207391b01b187b21b2619c15c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:18 +0200 Subject: sched: allow the immediate migration of cache-cold tasks allow the immediate migration of cache-cold tasks. Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3b27c3a..7506127 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1016,6 +1016,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sched_class) return 0; + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; + delta = now - p->se.exec_start; return delta < (s64)sysctl_sched_migration_cost; @@ -2189,7 +2194,8 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. 
*/ - if (sd->nr_balance_failed > sd->cache_nice_tries) { + if (!task_hot(p, rq->clock, sd) || + sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (task_hot(p, rq->clock, sd)) { schedstat_inc(sd, lb_hot_gained[idle]); -- cgit v1.1 From 95dbb421d12fdd9796ed153853daf3679809274f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: reintroduce topology.h tunings reintroduce the 2.6.22 topology.h tunings again - they result in slightly better balancing. Signed-off-by: Ingo Molnar --- include/linux/topology.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/linux/topology.h b/include/linux/topology.h index 525d437..865a63e 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -98,7 +98,7 @@ .cache_nice_tries = 0, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 0, \ + .newidle_idx = 1, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -128,15 +128,14 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ @@ -159,15 +158,14 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ | BALANCE_FOR_PKG_POWER,\ .last_balance = jiffies, \ .balance_interval = 1, \ -- cgit v1.1 From 7a6c6bcee029a978f866511d6e41dbc7301fde4c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: enable wake-idle on CONFIG_SCHED_MC=y most multicore CPUs today have shared L2 caches, so tune things so that the spreading amongst cores is more aggressive. Signed-off-by: Ingo Molnar --- include/linux/topology.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/topology.h b/include/linux/topology.h index 865a63e..47729f1 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -98,7 +98,7 @@ .cache_nice_tries = 0, \ .busy_idx = 0, \ .idle_idx = 0, \ - .newidle_idx = 1, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ @@ -128,14 +128,15 @@ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ + .idle_idx = 0, \ + .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ -- cgit v1.1 From 0dbee3a6b006dbe814d002cb18e94bf24a216451 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: Make scheduler debug file operations const In general, struct file_operations are const in the kernel, to not have false cacheline sharing and to catch bugs at compiletime with accidental writes to them. The new scheduler code introduces a new non-const one; fix this up. 
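For readers unfamiliar with the convention, here is a minimal, hedged sketch of the pattern the fix below enforces: a read-only seq_file interface whose operations table is declared const, so accidental writes become compile-time errors and the structure can live in read-only data. All example_* names are hypothetical; only the shape (single_open() plus a const struct file_operations) mirrors the scheduler debug file.

#include <linux/fs.h>
#include <linux/seq_file.h>

static int example_debug_show(struct seq_file *m, void *v)
{
	seq_printf(m, "example debug output\n");
	return 0;
}

static int example_debug_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, example_debug_show, NULL);
}

/* const: never written at runtime, so no false cacheline sharing and
 * any accidental write is caught by the compiler */
static const struct file_operations example_debug_fops = {
	.open		= example_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
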
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 27e82cb..a5e517e 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -236,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp) return single_open(filp, sched_debug_show, NULL); } -static struct file_operations sched_debug_fops = { +static const struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, -- cgit v1.1 From 5cf9f062c8e33d5a09eaa447550330162b2a96ed Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: use kcalloc() kcalloc() checks n * sizeof(element) for overflow and returns zeroed memory. Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7506127..d29950a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5243,10 +5243,9 @@ static struct ctl_table sd_ctl_root[] = { static struct ctl_table *sd_alloc_ctl_entry(int n) { struct ctl_table *entry = - kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); BUG_ON(!entry); - memset(entry, 0, n * sizeof(struct ctl_table)); return entry; } @@ -6018,7 +6017,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, + sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); -- cgit v1.1 From 97b6ea7b6369d51a451a7d5747a7939a593fdd9c Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: use for_each_online_cpu() init_sched_domain_sysctl was walking cpus 0-n and referencing per_cpu variables. If the cpus_possible mask is not contiguous this will result in a crash referencing unallocated data. If the online mask is not contiguous then we would show offline cpus and miss online ones. Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index d29950a..374f421 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5326,11 +5326,12 @@ static void init_sched_domain_sysctl(void) sd_ctl_dir[0].child = entry; - for (i = 0; i < cpu_num; i++, entry++) { + for_each_online_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; entry->child = sd_alloc_ctl_cpu_table(i); + entry++; } sd_sysctl_header = register_sysctl_table(sd_ctl_root); } -- cgit v1.1 From 6382bc90f5664c450afc1f896e7ddb35ba182af9 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: unregister the sysctl table before domains Unregister and free the sysctl table before destroying domains, then rebuild and register after creating the new domains. This prevents the sysctl table from pointing to freed memory for root to write.
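The ordering being enforced can be shown in isolation. This is only a sketch with made-up example_* symbols (the real code uses the sd_* helpers modified below): tear down in the order unregister-then-free, and rebuild in the order allocate-then-register, so a registered table never points at freed memory.

#include <linux/slab.h>
#include <linux/sysctl.h>

static struct ctl_table_header *example_header;
static struct ctl_table *example_table;

static void example_unregister(void)
{
	/* 1) unregister first, so /proc/sys can no longer reach the table */
	unregister_sysctl_table(example_header);
	example_header = NULL;
	/* 2) only then free the backing storage */
	kfree(example_table);
	example_table = NULL;
}

static void example_register(void)
{
	/* 3) rebuild the data, then register the fresh table */
	example_table = kcalloc(2, sizeof(*example_table), GFP_KERNEL);
	if (example_table)
		example_header = register_sysctl_table(example_table);
}
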
Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 374f421..a2dd054 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5250,6 +5250,18 @@ static struct ctl_table *sd_alloc_ctl_entry(int n) return entry; } +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry = *tablep; + + for (entry = *tablep; entry->procname; entry++) + if (entry->child) + sd_free_ctl_entry(&entry->child); + + kfree(*tablep); + *tablep = NULL; +} + static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, @@ -5318,7 +5330,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) } static struct ctl_table_header *sd_sysctl_header; -static void init_sched_domain_sysctl(void) +static void register_sched_domain_sysctl(void) { int i, cpu_num = num_online_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); @@ -5335,8 +5347,18 @@ static void init_sched_domain_sysctl(void) } sd_sysctl_header = register_sysctl_table(sd_ctl_root); } + +static void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} #else -static void init_sched_domain_sysctl(void) +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) { } #endif @@ -6271,6 +6293,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) err = build_sched_domains(&cpu_default_map); + register_sched_domain_sysctl(); + return err; } @@ -6287,6 +6311,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) { int i; + unregister_sched_domain_sysctl(); + for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, i); synchronize_sched(); @@ -6317,6 +6343,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) if (!err && !cpus_empty(*partition2)) err = build_sched_domains(partition2); + register_sched_domain_sysctl(); + return err; } @@ -6448,8 +6476,6 @@ void __init sched_init_smp(void) /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); - init_sched_domain_sysctl(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); -- cgit v1.1 From ad1cdc1d7883e88f936f7888a092e4e3e6d8c631 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: do not crash on allocation failure Now that we are calling this at runtime, a more relaxed error path is suggested. If an allocation fails, we just register the partial table, which will show empty directories. 
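A minimal sketch of the relaxed error path described above, again with hypothetical example_* names: return NULL instead of BUG_ON() and let the caller skip registration, leaving an empty directory rather than crashing.

#include <linux/slab.h>
#include <linux/sysctl.h>

static struct ctl_table *example_alloc_ctl_entry(int n)
{
	/* may return NULL under memory pressure; callers must cope */
	return kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
}

static void example_register_cpu_dir(struct ctl_table *parent)
{
	struct ctl_table *entry = example_alloc_ctl_entry(8);

	if (entry == NULL)
		return;	/* degrade to an empty directory, do not crash */
	parent->child = entry;
	/* ... fill in the entries as usual ... */
}
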
Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index a2dd054..f40fe02 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5245,8 +5245,6 @@ static struct ctl_table *sd_alloc_ctl_entry(int n) struct ctl_table *entry = kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - BUG_ON(!entry); - return entry; } @@ -5279,6 +5277,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) { struct ctl_table *table = sd_alloc_ctl_entry(12); + if (table == NULL) + return NULL; + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[1], "max_interval", &sd->max_interval, @@ -5316,6 +5317,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) for_each_domain(cpu, sd) domain_num++; entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; i = 0; for_each_domain(cpu, sd) { @@ -5336,6 +5339,9 @@ static void register_sched_domain_sysctl(void) struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; + if (entry == NULL) + return; + sd_ctl_dir[0].child = entry; for_each_online_cpu(i) { -- cgit v1.1 From 6323469f9b72530eb90c96ba162cc70f2f4611de Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: domain sysctl fixes: add terminator comment we had an incorrect-terminator bug in sd_alloc_ctl_domain_table() before, so add a comment that documents it. Signed-off-by: Milton Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched.c b/kernel/sched.c index f40fe02..9887ca00 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5303,6 +5303,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + /* &table[11] is terminator */ return table; } -- cgit v1.1 From 5e84cfde51cf303d368fcb48f22059f37b3872de Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: guest CPU accounting: add guest-CPU /proc/stat field as recent CPUs introduce a third running state, after "user" and "system", we need a new field, "guest", in cpustat to store the time used by the CPU to run virtual CPU. Modify /proc/stat to display this new field. 
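From userspace, the new column is simply a ninth field on each "cpu" line. The following is a hedged sketch of a reader, assuming a kernel with this patch applied (older kernels print only eight fields, so the fscanf() match count must be checked):

#include <stdio.h>

int main(void)
{
	unsigned long long user, nice, system, idle, iowait, irq, softirq, steal, guest;
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	/* aggregate line: cpu user nice system idle iowait irq softirq steal guest */
	if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		   &user, &nice, &system, &idle, &iowait, &irq, &softirq,
		   &steal, &guest) == 9)
		printf("guest ticks: %llu (guest time is also counted in user: %llu)\n",
		       guest, user);
	fclose(f);
	return 0;
}
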
Signed-off-by: Laurent Vivier Acked-by: Avi Kivity Signed-off-by: Ingo Molnar --- fs/proc/proc_misc.c | 15 +++++++++++---- include/linux/kernel_stat.h | 1 + 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index bee251c..b872a01 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -443,6 +443,7 @@ static int show_stat(struct seq_file *p, void *v) int i; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t guest; u64 sum = 0; struct timespec boottime; unsigned int *per_irq_sum; @@ -453,6 +454,7 @@ static int show_stat(struct seq_file *p, void *v) user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; + guest = cputime64_zero; getboottime(&boottime); jif = boottime.tv_sec; @@ -467,6 +469,7 @@ static int show_stat(struct seq_file *p, void *v) irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); + guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); for (j = 0; j < NR_IRQS; j++) { unsigned int temp = kstat_cpu(i).irqs[j]; sum += temp; @@ -474,7 +477,7 @@ static int show_stat(struct seq_file *p, void *v) } } - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", + seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), (unsigned long long)cputime64_to_clock_t(system), @@ -482,7 +485,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ @@ -494,7 +498,9 @@ static int show_stat(struct seq_file *p, void *v) irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; steal = kstat_cpu(i).cpustat.steal; - seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", + guest = kstat_cpu(i).cpustat.guest; + seq_printf(p, + "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -503,7 +509,8 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + (unsigned long long)cputime64_to_clock_t(guest)); } seq_printf(p, "intr %llu", (unsigned long long)sum); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 43e895f..12bf44f 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -23,6 +23,7 @@ struct cpu_usage_stat { cputime64_t idle; cputime64_t iowait; cputime64_t steal; + cputime64_t guest; }; struct kernel_stat { -- cgit v1.1 From 9ac52315d4cf5f561f36dabaf0720c00d3553162 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: guest CPU accounting: add guest-CPU /proc//stat fields like for cpustat, introduce the "gtime" (guest time of the task) and "cgtime" (guest time of the task children) fields for 
the tasks. Modify signal_struct and task_struct. Modify /proc//stat to display these new fields. Signed-off-by: Laurent Vivier Acked-by: Avi Kivity Signed-off-by: Ingo Molnar --- fs/proc/array.c | 17 +++++++++++++++-- include/linux/sched.h | 3 +++ kernel/exit.c | 6 ++++++ kernel/fork.c | 3 +++ 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index ee4814d..27b59f5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -370,6 +370,11 @@ static cputime_t task_stime(struct task_struct *p) } #endif +static cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + static int do_task_stat(struct task_struct *task, char *buffer, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; @@ -385,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) unsigned long cmin_flt = 0, cmaj_flt = 0; unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; + cputime_t cgtime, gtime; unsigned long rsslim = 0; char tcomm[sizeof(task->comm)]; unsigned long flags; @@ -403,6 +409,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) sigemptyset(&sigign); sigemptyset(&sigcatch); cutime = cstime = utime = stime = cputime_zero; + cgtime = gtime = cputime_zero; rcu_read_lock(); if (lock_task_sighand(task, &flags)) { @@ -420,6 +427,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) cmaj_flt = sig->cmaj_flt; cutime = sig->cutime; cstime = sig->cstime; + cgtime = sig->cgtime; rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; /* add up live thread stats at the group level */ @@ -430,6 +438,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) maj_flt += t->maj_flt; utime = cputime_add(utime, task_utime(t)); stime = cputime_add(stime, task_stime(t)); + gtime = cputime_add(gtime, task_gtime(t)); t = next_thread(t); } while (t != task); @@ -437,6 +446,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) maj_flt += sig->maj_flt; utime = cputime_add(utime, sig->utime); stime = cputime_add(stime, sig->stime); + gtime += cputime_add(gtime, sig->gtime); } sid = signal_session(sig); @@ -454,6 +464,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) maj_flt = task->maj_flt; utime = task_utime(task); stime = task_stime(task); + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ @@ -471,7 +482,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", task->pid, tcomm, state, @@ -516,7 +527,9 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole) task_cpu(task), task->rt_priority, task->policy, - (unsigned long long)delayacct_blkio_ticks(task)); + (unsigned long long)delayacct_blkio_ticks(task), + cputime_to_clock_t(gtime), + cputime_to_clock_t(cgtime)); if (mm) mmput(mm); return res; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3a6e05e..fefce22 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -517,6 +517,8 @@ struct signal_struct { * in __exit_signal, except for the group leader. 
*/ cputime_t utime, stime, cutime, cstime; + cputime_t gtime; + cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; @@ -1048,6 +1050,7 @@ struct task_struct { unsigned int rt_priority; cputime_t utime, stime; + cputime_t gtime; unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ struct timespec real_start_time; /* boot based time */ diff --git a/kernel/exit.c b/kernel/exit.c index 993369e..7f7959d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk) */ sig->utime = cputime_add(sig->utime, tsk->utime); sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, cputime_add(p->stime, cputime_add(sig->stime, sig->cstime))); + psig->cgtime = + cputime_add(psig->cgtime, + cputime_add(p->gtime, + cputime_add(sig->gtime, + sig->cgtime))); psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += diff --git a/kernel/fork.c b/kernel/fork.c index 5e67f90..3fc3c13 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->tty_old_pgrp = NULL; sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->gtime = cputime_zero; + sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; @@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = cputime_zero; p->stime = cputime_zero; + p->gtime = cputime_zero; #ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ -- cgit v1.1 From 94886b84b1bcdc95f34f70e7fce407efefe472e1 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: guest CPU accounting: maintain stats in account_system_time() modify account_system_time() to add cputime to cpustat->guest if we are running a VCPU. We add this cputime to cpustat->user instead of cpustat->system because this part of KVM code is in fact user code although it is executed in the kernel. We duplicate VCPU time between guest and user to allow an unmodified "top(1)" to display correct value. A modified "top(1)" is able to display good cpu user time and cpu guest time by subtracting cpu guest time from cpu user time. Update "gtime" in task_struct accordingly. 
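Because guest time is deliberately counted in both the guest and user buckets, a monitoring tool only needs a subtraction to separate them. A small hedged sketch (not part of the patch) of what a modified top(1) would compute from the /proc/stat fields:

/* user already includes guest, so the non-guest share is user - guest */
static unsigned long long user_excluding_guest(unsigned long long user,
					       unsigned long long guest)
{
	return user >= guest ? user - guest : 0;
}
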
Signed-off-by: Laurent Vivier Acked-by: Avi Kivity Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index fefce22..228e0a8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1342,6 +1342,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ +#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ diff --git a/kernel/sched.c b/kernel/sched.c index 9887ca00..5a91fe0b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3303,6 +3303,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime) } /* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + */ +void account_guest_time(struct task_struct *p, cputime_t cputime) +{ + cputime64_t tmp; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + tmp = cputime_to_cputime64(cputime); + + p->utime = cputime_add(p->utime, cputime); + p->gtime = cputime_add(p->gtime, cputime); + + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); +} + +/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3315,6 +3334,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset, struct rq *rq = this_rq(); cputime64_t tmp; + if (p->flags & PF_VCPU) { + account_guest_time(p, cputime); + p->flags &= ~PF_VCPU; + return; + } + p->stime = cputime_add(p->stime, cputime); /* Add system time to cpustat. */ -- cgit v1.1 From d172fcd3ae1ca7ac27ec8904242fd61e0e11d332 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: guest CPU accounting: maintain guest state in KVM Modify KVM to update guest time accounting. [ mingo@elte.hu: ported to 2.6.24 KVM. 
] Signed-off-by: Laurent Vivier Acked-by: Avi Kivity Signed-off-by: Ingo Molnar --- drivers/kvm/kvm.h | 10 ++++++++++ drivers/kvm/kvm_main.c | 2 ++ 2 files changed, 12 insertions(+) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index ad08138..3b0bc4b 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -624,6 +624,16 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu); int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); +static inline void kvm_guest_enter(void) +{ + current->flags |= PF_VCPU; +} + +static inline void kvm_guest_exit(void) +{ + current->flags &= ~PF_VCPU; +} + static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 353e585..af2d288 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2046,6 +2046,7 @@ again: kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); vcpu->guest_mode = 1; + kvm_guest_enter(); if (vcpu->requests) if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) @@ -2053,6 +2054,7 @@ again: kvm_x86_ops->run(vcpu, kvm_run); + kvm_guest_exit(); vcpu->guest_mode = 0; local_irq_enable(); -- cgit v1.1 From 71e20f1873d46e138c26ce83f8fe54b7221f572f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:19 +0200 Subject: sched: affine sync wakeups make sync wakeups affine for cache-cold tasks: if a cache-cold task is woken up by a sync wakeup then use the opportunity to migrate it straight away. (the two tasks are 'related' because they communicate) Signed-off-by: Ingo Molnar --- fs/pipe.c | 6 +++--- kernel/sched.c | 8 +++++++- net/unix/af_unix.c | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index f1fa2b4..e66ec48 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -382,7 +382,7 @@ redo: /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) @@ -555,7 +555,7 @@ redo2: out: mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) @@ -649,7 +649,7 @@ pipe_release(struct inode *inode, int decr, int decw) if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(&pipe->wait); + wake_up_interruptible_sync(&pipe->wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } diff --git a/kernel/sched.c b/kernel/sched.c index 5a91fe0b..7fd3434 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1521,6 +1521,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) unsigned long tl = this_load; unsigned long tl_per_task; + /* + * Attract cache-cold tasks on sync wakeups: + */ + if (sync && !task_hot(p, rq->clock, this_sd)) + goto out_set_cpu; + schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); @@ -1598,7 +1604,7 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) 
*/ - if (!sync || cpu != this_cpu) + if (!sync || rq->curr == rq->idle) check_preempt_curr(rq, p); success = 1; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2b57eaf..6996cba 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -334,7 +334,7 @@ static void unix_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, 2, POLL_OUT); } read_unlock(&sk->sk_callback_lock); @@ -1639,7 +1639,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (!skb) goto out_unlock; - wake_up_interruptible(&u->peer_wait); + wake_up_interruptible_sync(&u->peer_wait); if (msg->msg_name) unix_copy_addr(msg, skb->sk); -- cgit v1.1 From 9c63d9c021f375a2708ad79043d6f4dd1291a085 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Oct 2007 17:00:20 +0200 Subject: sched: sync wakeups preempt too make sure sync wakeups preempt too - the scheduler will not overschedule as we've got various throttles against that. As a result, sync wakeups can be used more widely in the kernel (to signal wakeup affinity between tasks), and no arbitrary latencies will be introduced either. Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7fd3434..bba57ad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1596,16 +1596,7 @@ out_activate: schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ - if (!sync || rq->curr == rq->idle) - check_preempt_curr(rq, p); + check_preempt_curr(rq, p); success = 1; out_running: -- cgit v1.1
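Taken together, the last two changes make the sync hint both affine and preempting, so it becomes safe to use wherever the waker is about to block itself. A hedged sketch of such a call site follows; the example_* names are hypothetical, only wake_up_interruptible_sync() and wait_event_interruptible() are the real kernel primitives (as used in the pipe and af_unix hunks above).

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);
static int example_data_ready;

static void example_produce(void)
{
	example_data_ready = 1;
	/* sync wakeup: the producer is about to sleep, so the woken
	 * consumer may be pulled to (and preempt on) this CPU */
	wake_up_interruptible_sync(&example_wait);
	/* ... producer now blocks waiting for buffer space ... */
}

static int example_consume(void)
{
	return wait_event_interruptible(example_wait, example_data_ready);
}
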