From 1dda606c5f94b14a8f36c220d1d8844bab68a720 Mon Sep 17 00:00:00 2001
From: Alexander Graf
Date: Wed, 8 Jun 2011 02:45:37 +0200
Subject: KVM: Add compat ioctl for KVM_SET_SIGNAL_MASK

KVM has an ioctl to define which signal mask should be used while running
inside VCPU_RUN. At least for big endian systems, this mask is different on
32-bit and 64-bit systems (though the size is identical).

Add a compat wrapper that converts the mask to whatever the kernel accepts,
allowing 32-bit kvm user space to set signal masks.

This patch fixes qemu with --enable-io-thread on ppc64 hosts when running
32-bit user land.

Signed-off-by: Alexander Graf
Signed-off-by: Avi Kivity
---
 kernel/compat.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093..18197ae 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -890,6 +890,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
 	case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
 	}
 }
+EXPORT_SYMBOL_GPL(sigset_from_compat);
 
 asmlinkage long
 compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
--
cgit v1.1
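To see why the mask layout differs on big-endian hosts and a plain byte
copy of the user buffer would not do, consider a stand-alone user-space
sketch (illustrative only, not part of the patch; the 0xff mask covering
signals 1..8 is an arbitrary example value):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        /* 32-bit user space hands the kernel two 32-bit words. */
        uint32_t compat_sig[2] = { 0x000000ffu, 0x0 };  /* signals 1..8 */
        uint64_t converted, raw;

        /* What sigset_from_compat() computes, endian-independent: */
        converted = (uint64_t)compat_sig[0] | ((uint64_t)compat_sig[1] << 32);

        /* What a naive copy of the user buffer would yield: */
        memcpy(&raw, compat_sig, sizeof raw);

        printf("converted: 0x%016llx\n", (unsigned long long)converted);
        printf("raw copy : 0x%016llx\n", (unsigned long long)raw);
        /* On little-endian both print 0x00000000000000ff; on big-endian
         * the raw copy becomes 0x000000ff00000000 -- the wrong signals. */
        return 0;
    }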
From c9aaa8957f203bd6df83b002fb40b98390bed078 Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Mon, 11 Jul 2011 15:28:14 -0400
Subject: KVM: Steal time implementation

To implement steal time, we need the hypervisor to pass the guest
information about how much time was spent running other processes outside
the VM, while the vcpu had meaningful work to do - halt time does not
count.

This information is acquired through the run_delay field of the
delayacct/schedstats infrastructure, which counts time spent in a runqueue
but not running.

Steal time is per-cpu information, so the traditional MSR-based
infrastructure is used. A new MSR, MSR_KVM_STEAL_TIME, holds the address of
the memory area containing information about steal time.

This patch contains the hypervisor part of the steal time infrastructure,
and can be backported independently of the guest portion.

[avi, yongjie: export delayacct_on, to avoid build failures in some configs]

Signed-off-by: Glauber Costa
Tested-by: Eric B Munson
CC: Rik van Riel
CC: Jeremy Fitzhardinge
CC: Peter Zijlstra
CC: Anthony Liguori
Signed-off-by: Yongjie Ren
Signed-off-by: Avi Kivity
---
 kernel/delayacct.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b61..418b3f7 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
 #include <linux/time.h>
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
+#include <linux/module.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
+EXPORT_SYMBOL_GPL(delayacct_on);
 struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)
--
cgit v1.1

From e6e6685accfa81f509fadfc9624bc7c3862d75c4 Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Mon, 11 Jul 2011 15:28:17 -0400
Subject: KVM guest: Steal time accounting

This patch accounts steal time in account_process_tick. If one or more
ticks are considered stolen in the current accounting cycle, user/system
accounting is skipped. Idle is fine, since the hypervisor does not report
steal time if the guest is halted.

Accounting steal time from the core scheduler gives us the advantage of
direct access to the runqueue data. At a later opportunity, it can be used
to tweak cpu power and make the scheduler aware of the time it lost.

[avi: doesn't exist on many archs]

Signed-off-by: Glauber Costa
Acked-by: Rik van Riel
Acked-by: Peter Zijlstra
Tested-by: Eric B Munson
CC: Jeremy Fitzhardinge
CC: Anthony Liguori
Signed-off-by: Avi Kivity
---
 kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3f2e502..f98a28b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
@@ -528,6 +531,9 @@ struct rq {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64 prev_irq_time;
 #endif
+#ifdef CONFIG_PARAVIRT
+	u64 prev_steal_time;
+#endif
 
 	/* calc_load related fields */
 	unsigned long calc_load_update;
@@ -1953,6 +1959,18 @@ void account_system_vtime(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+	if (unlikely(steal > NSEC_PER_SEC))
+		return div_u64(steal, TICK_NSEC);
+
+	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	s64 irq_delta;
@@ -3845,6 +3863,25 @@ void account_idle_time(cputime_t cputime)
 		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
 
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+	if (static_branch(&paravirt_steal_enabled)) {
+		u64 steal, st = 0;
+
+		steal = paravirt_steal_clock(smp_processor_id());
+		steal -= this_rq()->prev_steal_time;
+
+		st = steal_ticks(steal);
+		this_rq()->prev_steal_time += st * TICK_NSEC;
+
+		account_steal_time(st);
+		return st;
+	}
+#endif
+	return false;
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3876,6 +3913,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
+	if (steal_account_process_tick())
+		return;
+
 	if (irqtime_account_hi_update()) {
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	} else if (irqtime_account_si_update()) {
@@ -3929,6 +3969,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		return;
 	}
 
+	if (steal_account_process_tick())
+		return;
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
--
cgit v1.1
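The steal_ticks() helper introduced above converts stolen nanoseconds into
whole ticks and deliberately leaves the sub-tick remainder unaccounted, so
it carries over to a later tick (prev_steal_time only ever advances by
st * TICK_NSEC). A stand-alone sketch of that arithmetic for the common
sub-second case, assuming HZ=1000 (user-space C, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL
    #define HZ           1000ULL              /* assumed for the example */
    #define TICK_NSEC    (NSEC_PER_SEC / HZ)  /* 1,000,000 ns per tick */

    /* Same result as the kernel helper when steal < NSEC_PER_SEC. */
    static uint64_t steal_ticks(uint64_t steal)
    {
        return steal / TICK_NSEC;
    }

    int main(void)
    {
        uint64_t steal = 3500000;  /* 3.5 ms stolen since the last read */
        uint64_t st = steal_ticks(steal);

        /* prev_steal_time advances only by whole ticks, so the 0.5 ms
         * remainder is re-read and accounted on a later tick. */
        printf("ticks accounted now : %llu\n", (unsigned long long)st);
        printf("nanoseconds carried : %llu\n",
               (unsigned long long)(steal - st * TICK_NSEC));
        return 0;
    }

Running it prints 3 ticks accounted and 500000 ns carried over, which is
exactly the bookkeeping steal_account_process_tick() performs per cpu.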
From 095c0aa83e52d6c3dd7168610746703921f570af Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Mon, 11 Jul 2011 15:28:18 -0400
Subject: sched: adjust scheduler cpu power for stolen time

This patch makes update_rq_clock() aware of steal time. The mechanism of
operation is not different from irq_time, and follows the same principles.
This lives in a CONFIG option of its own, and can be compiled out
independently of the rest of steal time reporting. The effect of disabling
it is that the scheduler will still report steal time (that cannot be
disabled), but won't use this information for cpu power adjustments.

Every time update_rq_clock_task() is invoked, we query information about
how much time was stolen since the last call, and feed it into
sched_rt_avg_update().

Although steal time reporting in account_process_tick() keeps track of the
last time we read the steal clock, in prev_steal_time, this patch does it
independently using another field, prev_steal_time_rq. This is because
otherwise, information about time accounted in account_process_tick() would
never reach us in update_rq_clock().

Signed-off-by: Glauber Costa
Acked-by: Rik van Riel
Acked-by: Peter Zijlstra
Tested-by: Eric B Munson
CC: Jeremy Fitzhardinge
CC: Anthony Liguori
Signed-off-by: Avi Kivity
---
 kernel/sched.c          | 47 +++++++++++++++++++++++++++++++++++++----------
 kernel/sched_features.h |  4 ++--
 2 files changed, 39 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f98a28b..b35ac50 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -534,6 +534,9 @@ struct rq {
 #ifdef CONFIG_PARAVIRT
 	u64 prev_steal_time;
 #endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	u64 prev_steal_time_rq;
+#endif
 
 	/* calc_load related fields */
 	unsigned long calc_load_update;
@@ -1973,8 +1976,14 @@ static inline u64 steal_ticks(u64 steal)
 
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-	s64 irq_delta;
-
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
@@ -1997,12 +2006,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	if (static_branch((&paravirt_steal_rq_enabled))) {
+		u64 st;
+
+		steal = paravirt_steal_clock(cpu_of(rq));
+		steal -= rq->prev_steal_time_rq;
+
+		if (unlikely(steal > delta))
+			steal = delta;
+
+		st = steal_ticks(steal);
+		steal = st * TICK_NSEC;
+
+		rq->prev_steal_time_rq += steal;
+
+		delta -= steal;
+	}
+#endif
+
 	rq->clock_task += delta;
 
-	if (irq_delta && sched_feat(NONIRQ_POWER))
-		sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+		sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2037,12 +2069,7 @@ static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime	(0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-	rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f73..ca3b025 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(OWNER_SPIN, 1)
 
 /*
- * Decrement CPU power based on irq activity
+ * Decrement CPU power based on time not spent running tasks
  */
-SCHED_FEAT(NONIRQ_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, 1)
 
 /*
  * Queue remote wakeups on the target CPU and process them
--
cgit v1.1
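To make the clamping in update_rq_clock_task() concrete: stolen time is
capped at the observed wall-clock delta, clock_task advances only by what
remains, and the stolen share is fed to sched_rt_avg_update() so reported
cpu power shrinks accordingly. A stand-alone sketch of that arithmetic with
made-up numbers (user-space C, not kernel code; the tick rounding of steal
is omitted for brevity):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t delta = 4000000;  /* 4 ms of wall clock since last update */
        int64_t steal = 6000000;  /* hypervisor reports 6 ms stolen */

        /* The unlikely(steal > delta) clamp: clock_task never outruns
         * the wall clock, and never goes backwards. */
        if (steal > delta)
            steal = delta;

        delta -= steal;           /* clock_task advances by the rest */

        printf("clock_task advances by     %lld ns\n", (long long)delta);
        printf("fed to sched_rt_avg_update %lld ns\n", (long long)steal);
        /* Prints 0 and 4000000: the whole interval was stolen, so with
         * NONTASK_POWER set the scheduler treats this CPU as having had
         * no capacity for tasks during the interval. */
        return 0;
    }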