From f9bcf1e0e0145323ba2cf72ecad5264ff3883eb1 Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Thu, 11 Aug 2016 13:36:35 +0800
Subject: sched/cputime: Fix steal time accounting

Commit:

  57430218317 ("sched/cputime: Count actually elapsed irq & softirq time")

... didn't take steal time into consideration with passing the noirqtime
kernel parameter.

As Paolo pointed out before:

| Why not? If idle=poll, for example, any time the guest is suspended (and
| thus cannot poll) does count as stolen time.

This patch fixes it by reducing steal time from idle time accounting when
the noirqtime parameter is true. The average idle time drops from 56.8% to
54.75% for nohz idle kvm guest (noirqtime, idle=poll, four vCPUs running
on one pCPU).

Signed-off-by: Wanpeng Li
Cc: Frederic Weisbecker
Cc: Linus Torvalds
Cc: Paolo Bonzini
Cc: Peter Zijlstra (Intel)
Cc: Peter Zijlstra
Cc: Radim
Cc: Rik van Riel
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/1470893795-3527-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1934f65..8b9bcc5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -508,13 +508,20 @@ void account_process_tick(struct task_struct *p, int user_tick)
  */
 void account_idle_ticks(unsigned long ticks)
 {
-
+        cputime_t cputime, steal;
         if (sched_clock_irqtime) {
                 irqtime_account_idle_ticks(ticks);
                 return;
         }
 
-        account_idle_time(jiffies_to_cputime(ticks));
+        cputime = cputime_one_jiffy;
+        steal = steal_account_process_time(cputime);
+
+        if (steal >= cputime)
+                return;
+
+        cputime -= steal;
+        account_idle_time(cputime);
 }
 
 /*
--
cgit v1.1

From 26f2c75cd2cf10a6120ef02ca9a94db77cc9c8e0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 11 Aug 2016 14:58:24 +0200
Subject: sched/cputime: Fix omitted ticks passed in parameter

Commit:

  f9bcf1e0e014 ("sched/cputime: Fix steal time accounting")

... fixes a leak on steal time accounting but forgets to account the
ticks passed in parameters, assuming there is only one to take into
account. Let's consider that parameter back.
Signed-off-by: Frederic Weisbecker
Acked-by: Wanpeng Li
Cc: Linus Torvalds
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Radim
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Wanpeng Li
Cc: linux-tip-commits@vger.kernel.org
Link: http://lkml.kernel.org/r/20160811125822.GB4214@lerouge
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8b9bcc5..9858266 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -509,12 +509,13 @@ void account_process_tick(struct task_struct *p, int user_tick)
 void account_idle_ticks(unsigned long ticks)
 {
         cputime_t cputime, steal;
+
         if (sched_clock_irqtime) {
                 irqtime_account_idle_ticks(ticks);
                 return;
         }
 
-        cputime = cputime_one_jiffy;
+        cputime = jiffies_to_cputime(ticks);
         steal = steal_account_process_time(cputime);
 
         if (steal >= cputime)
--
cgit v1.1

From 173be9a14f7b2e901cf77c18b1aafd4d672e9d9e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 15 Aug 2016 18:38:42 +0200
Subject: sched/cputime: Fix NO_HZ_FULL getrusage() monotonicity regression

Mike reports:

  Roughly 10% of the time, ltp testcase getrusage04 fails:
  getrusage04 0 TINFO : Expected timers granularity is 4000 us
  getrusage04 0 TINFO : Using 1 as multiply factor for max [us]time increment (1000+4000us)!
  getrusage04 0 TINFO : utime: 0us; stime: 179us
  getrusage04 0 TINFO : utime: 3751us; stime: 0us
  getrusage04 1 TFAIL : getrusage04.c:133: stime increased > 5000us:

And tracked it down to the case where the task simply doesn't get _any_
[us]time ticks.

Update the code to assume all rtime is utime when we lack information,
thus ensuring a task that elides the tick gets time accounted.

Reported-by: Mike Galbraith
Tested-by: Mike Galbraith
Signed-off-by: Peter Zijlstra (Intel)
Cc: Frederic Weisbecker
Cc: Fredrik Markstrom
Cc: Linus Torvalds
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Radim
Cc: Rik van Riel
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: Vince Weaver
Cc: Wanpeng Li
Cc: stable@vger.kernel.org # 4.3+
Fixes: 9d7fb0427648 ("sched/cputime: Guarantee stime + utime == rtime")
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9858266..2ee83b2 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -614,19 +614,25 @@ static void cputime_adjust(struct task_cputime *curr,
         stime = curr->stime;
         utime = curr->utime;
 
-        if (utime == 0) {
-                stime = rtime;
+        /*
+         * If either stime or both stime and utime are 0, assume all runtime is
+         * userspace. Once a task gets some ticks, the monotonicy code at
+         * 'update' will ensure things converge to the observed ratio.
+         */
+        if (stime == 0) {
+                utime = rtime;
                 goto update;
         }
 
-        if (stime == 0) {
-                utime = rtime;
+        if (utime == 0) {
+                stime = rtime;
                 goto update;
         }
 
         stime = scale_stime((__force u64)stime, (__force u64)rtime,
                             (__force u64)(stime + utime));
 
+update:
         /*
          * Make sure stime doesn't go backwards; this preserves monotonicity
          * for utime because rtime is monotonic.
@@ -649,7 +655,6 @@ static void cputime_adjust(struct task_cputime *curr,
                 stime = rtime - utime;
         }
 
-update:
         prev->stime = stime;
         prev->utime = utime;
 out:
--
cgit v1.1

From 03cbc732639ddcad15218c4b2046d255851ff1e3 Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Wed, 17 Aug 2016 10:05:46 +0800
Subject: sched/cputime: Resync steal time when guest & host lose sync

Commit:

  57430218317e ("sched/cputime: Count actually elapsed irq & softirq time")

... fixed a bug but also triggered a regression:

On an i5 laptop with 4 pCPUs and 4 vCPUs for one full dynticks guest,
there are four CPU hog processes (for loop) running in the guest. I
hot-unplug the pCPUs on the host one by one until there is only one
left, then observe CPU utilization via 'top' in the guest; it shows:

  100% st for cpu0 (housekeeping)
   75% st for other CPUs (nohz full mode)

However, w/o this commit it shows the correct 75% for all four CPUs.

When a guest is interrupted for a longer amount of time, missed clock
ticks are not redelivered later. Because of that, we should not limit
the amount of steal time accounted to the amount of time that the
calling functions think has passed.

However, the interval returned by account_other_time() is NOT rounded
down to the nearest jiffy, while the base interval in get_vtime_delta()
that it is subtracted from is, so the max cputime limit is required to
avoid underflow.

This patch fixes the regression by limiting the account_other_time()
called from get_vtime_delta() to avoid underflow, and lets the other
three call sites (in account_other_time() and steal_account_process_time())
account however much steal time the host told us elapsed.

Suggested-by: Rik van Riel
Suggested-by: Paolo Bonzini
Signed-off-by: Wanpeng Li
Reviewed-by: Rik van Riel
Cc: Frederic Weisbecker
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Radim Krcmar
Cc: Thomas Gleixner
Cc: kvm@vger.kernel.org
Link: http://lkml.kernel.org/r/1471399546-4069-1-git-send-email-wanpeng.li@hotmail.com
[ Improved the changelog. ]
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel/sched')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2ee83b2..a846cf8 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
                 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
          * idle, or potentially user or system time. Due to rounding,
          * other time can exceed ticks occasionally.
          */
-        other = account_other_time(cputime);
+        other = account_other_time(ULONG_MAX);
         if (other >= cputime)
                 return;
         cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
         }
 
         cputime = cputime_one_jiffy;
-        steal = steal_account_process_time(cputime);
+        steal = steal_account_process_time(ULONG_MAX);
 
         if (steal >= cputime)
                 return;
@@ -516,7 +521,7 @@ void account_idle_ticks(unsigned long ticks)
         }
 
         cputime = jiffies_to_cputime(ticks);
-        steal = steal_account_process_time(cputime);
+        steal = steal_account_process_time(ULONG_MAX);
 
         if (steal >= cputime)
                 return;
@@ -699,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
         unsigned long now = READ_ONCE(jiffies);
         cputime_t delta, other;
 
+        /*
+         * Unlike tick based timing, vtime based timing never has lost
+         * ticks, and no need for steal time accounting to make up for
+         * lost ticks. Vtime accounts a rounded version of actual
+         * elapsed time. Limit account_other_time to prevent rounding
+         * errors from causing elapsed vtime to go negative.
+         */
         delta = jiffies_to_cputime(now - tsk->vtime_snap);
         other = account_other_time(delta);
         WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
--
cgit v1.1
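
Illustration (a minimal sketch, not part of the patches above): the first,
second and fourth patches all converge on the same idle-tick logic: ask the
host for however much steal time it reports, as
steal_account_process_time(ULONG_MAX) does after the last patch, then subtract
it from the current batch of idle ticks without letting the result go
negative. The userspace C model below uses hypothetical helpers
(host_reported_steal, steal_account, account_idle_ticks_sim); none of these
are kernel symbols, and the "host" is just a counter standing in for
paravirt_steal_clock().

/*
 * Userspace sketch of the clamped idle/steal accounting described above.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t host_reported_steal;   /* total steal the host claims, in cputime units */
static uint64_t steal_accounted;       /* steal we have already charged */

/* Stand-in for steal_account_process_time(ULONG_MAX): take all new steal. */
static uint64_t steal_account(void)
{
        uint64_t steal = host_reported_steal - steal_accounted;

        steal_accounted += steal;
        return steal;
}

/* Stand-in for account_idle_ticks() after the fixes: returns idle time charged. */
static uint64_t account_idle_ticks_sim(uint64_t ticks, uint64_t cputime_per_jiffy)
{
        uint64_t cputime = ticks * cputime_per_jiffy;  /* jiffies_to_cputime(ticks) */
        uint64_t steal = steal_account();

        if (steal >= cputime)
                return 0;               /* everything was stolen, nothing is idle */

        return cputime - steal;         /* the remainder is genuine idle time */
}

int main(void)
{
        host_reported_steal = 3;        /* host stole 3 units so far */
        printf("idle charged: %llu\n",
               (unsigned long long)account_idle_ticks_sim(5, 1));  /* 5 - 3 = 2 */

        host_reported_steal = 10;       /* 7 new stolen units, more than the 4 ticks */
        printf("idle charged: %llu\n",
               (unsigned long long)account_idle_ticks_sim(4, 1));  /* clamped to 0 */
        return 0;
}

Compiled standalone this prints 2 and then 0: the second batch of ticks is
entirely consumed by steal time, which is exactly the case handled by the
"if (steal >= cputime) return;" guard in account_idle_ticks().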
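
A second illustrative sketch, for the getrusage() monotonicity patch:
cputime_adjust() splits the scheduler runtime rtime into stime/utime in the
ratio of the observed ticks, treats a task with no stime sample (or no ticks
at all) as pure user time, and never lets either reported value go backwards.
The plain C model below is an assumption, not the kernel source: scale_stime
is simplified to a multiply/divide, while the real version also guards
against 64-bit overflow.

#include <stdio.h>
#include <stdint.h>

struct prev_cputime {
        uint64_t utime;
        uint64_t stime;
};

/* Simplified scale_stime(): distribute rtime in the stime:(stime+utime) ratio. */
static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
        return total ? stime * rtime / total : 0;
}

static void cputime_adjust_sim(uint64_t stime, uint64_t utime, uint64_t rtime,
                               struct prev_cputime *prev)
{
        if (prev->stime + prev->utime >= rtime)
                return;                 /* nothing new to distribute */

        if (stime == 0) {               /* no system ticks seen: assume all user */
                utime = rtime;
                goto update;
        }
        if (utime == 0) {               /* no user ticks seen: assume all system */
                stime = rtime;
                goto update;
        }

        stime = scale_stime(stime, rtime, stime + utime);

update:
        if (stime < prev->stime)        /* keep stime monotonic ... */
                stime = prev->stime;
        utime = rtime - stime;          /* ... and derive utime from rtime */
        if (utime < prev->utime) {      /* keep utime monotonic too */
                utime = prev->utime;
                stime = rtime - utime;
        }
        prev->stime = stime;
        prev->utime = utime;
}

int main(void)
{
        struct prev_cputime prev = { 0, 0 };

        /* No ticks at all yet: the whole runtime is reported as utime. */
        cputime_adjust_sim(0, 0, 1000, &prev);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)prev.utime, (unsigned long long)prev.stime);

        /* Ticks arrive in a 3:7 stime:utime ratio; rtime is split accordingly. */
        cputime_adjust_sim(300, 700, 2000, &prev);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)prev.utime, (unsigned long long)prev.stime);
        return 0;
}

The first call mirrors the failing getrusage04 case (a task that elides the
tick) and reports the full 1000 units as utime; the second distributes the
accumulated 2000 units in the observed ratio while keeping both counters
monotonic, which is the behaviour the relocated 'update' label enforces.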