From 0306e481d479a58eff17c27adf213fbb5822946b Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Wed, 3 Feb 2016 01:11:40 +0530 Subject: cpufreq: powernv/tracing: Add powernv_throttle tracepoint This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Signed-off-by: Rafael J. Wysocki --- kernel/trace/power-traces.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a..81b8745 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- cgit v1.1 From f7b382b988233b5851eddf4531651ffe4133e88c Mon Sep 17 00:00:00 2001 From: Abhilash Jindal Date: Sun, 31 Jan 2016 14:29:01 -0500 Subject: PM/freezer: y2038, use boottime to compare tstamps Wall time obtained from do_gettimeofday gives 32 bit timeval which can only represent time until January 2038. This patch moves to ktime_t, a 64-bit time. Also, wall time is susceptible to sudden jumps due to user setting the time or due to NTP. Boot time is constantly increasing time better suited for subtracting two timestamps. Signed-off-by: Abhilash Jindal Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786..df058be 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only) unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); -- cgit v1.1 From 22e09b333f0b395b3eb6ab6efa4b3284e2c06810 Mon Sep 17 00:00:00 2001 From: saurabh Date: Wed, 28 Oct 2015 08:54:01 +0530 Subject: PM / suspend: replacing printk replacing printk(s) with appropriate pr_info and pr_err in order to fix checkpatch.pl warnings Signed-off-by: Saurabh Sengar Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133..230a772 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,7 @@ static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); return 1; @@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: late suspend of devices failed\n"); + pr_err("PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + pr_err("PM: noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); -- cgit v1.1 From e237a5518425155faa508a087f28269f58074b92 Mon Sep 17 00:00:00 2001 From: Chen Fan Date: Mon, 15 Feb 2016 12:52:01 +0800 Subject: x86/ACPI/PCI: Recognize that Interrupt Line 255 means "not connected" Per the x86-specific footnote to PCI spec r3.0, sec 6.2.4, the value 255 in the Interrupt Line register means "unknown" or "no connection." Previously, when we couldn't derive an IRQ from the _PRT, we fell back to using the value from Interrupt Line as an IRQ. It's questionable whether we should do that at all, but the spec clearly suggests we shouldn't do it for the value 255 on x86. Calling request_irq() with IRQ 255 may succeed, but the driver won't receive any interrupts. Or, if IRQ 255 is shared with another device, it may succeed, and the driver's ISR will be called at random times when the *other* device interrupts. Or it may fail if another device is using IRQ 255 with incompatible flags. What we *want* is for request_irq() to fail predictably so the driver can fall back to polling. On x86, assume 255 in the Interrupt Line means the INTx line is not connected. In that case, set dev->irq to IRQ_NOTCONNECTED so request_irq() will fail gracefully with -ENOTCONN. We found this problem on a system where Secure Boot firmware assigned Interrupt Line 255 to an i801_smbus device and another device was already using MSI-X IRQ 255. This was in v3.10, where i801_probe() fails if request_irq() fails: i801_smbus 0000:00:1f.3: enabling device (0140 -> 0143) i801_smbus 0000:00:1f.3: can't derive routing for PCI INT C i801_smbus 0000:00:1f.3: PCI INT C: no GSI genirq: Flags mismatch irq 255. 00000080 (i801_smbus) vs. 00000000 (megasa) CPU: 0 PID: 2487 Comm: kworker/0:1 Not tainted 3.10.0-229.el7.x86_64 #1 Hardware name: FUJITSU PRIMEQUEST 2800E2/D3736, BIOS PRIMEQUEST 2000 Serie5 Call Trace: dump_stack+0x19/0x1b __setup_irq+0x54a/0x570 request_threaded_irq+0xcc/0x170 i801_probe+0x32f/0x508 [i2c_i801] local_pci_probe+0x45/0xa0 i801_smbus 0000:00:1f.3: Failed to allocate irq 255: -16 i801_smbus: probe of 0000:00:1f.3 failed with error -16 After aeb8a3d16ae0 ("i2c: i801: Check if interrupts are disabled"), i801_probe() will fall back to polling if request_irq() fails. But we still need this patch because request_irq() may succeed or fail depending on other devices in the system. If request_irq() fails, i801_smbus will work by falling back to polling, but if it succeeds, i801_smbus won't work because it expects interrupts that it may not receive. Signed-off-by: Chen Fan Acked-by: Thomas Gleixner Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- kernel/irq/manage.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8411872..e79e60f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1609,6 +1609,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out @@ -1699,9 +1702,13 @@ EXPORT_SYMBOL(request_threaded_irq); int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; int ret; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + desc = irq_to_desc(irq); if (!desc) return -EINVAL; -- cgit v1.1 From 34e2c555f3e13c90e9284e23d00f03be8a6e06c5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 20:20:42 +0100 Subject: cpufreq: Add mechanism for registering utilization update callbacks Introduce a mechanism by which parts of the cpufreq subsystem ("setpolicy" drivers or the core) can register callbacks to be executed from cpufreq_update_util() which is invoked by the scheduler's update_load_avg() on CPU utilization changes. This allows the "setpolicy" drivers to dispense with their timers and do all of the computations they need and frequency/voltage adjustments in the update_load_avg() code path, among other things. The update_load_avg() changes were suggested by Peter Zijlstra. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar --- kernel/sched/deadline.c | 4 ++++ kernel/sched/fair.c | 26 +++++++++++++++++++++++++- kernel/sched/rt.c | 4 ++++ kernel/sched/sched.h | 1 + 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c97..21a0aa6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b..e2987a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int cpu = cpu_of(rq_of(cfs_rq)); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Track task load average for carrying it to new CPU after migrated, and @@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86ab..27f5b03 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f1637..f042190 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "cpupri.h" #include "cpudeadline.h" -- cgit v1.1 From adaf9fcd136970e480d7ca834c0cf25ce922ea74 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Mar 2016 20:44:47 +0100 Subject: cpufreq: Move scheduler-related code to the sched directory Create cpufreq.c under kernel/sched/ and move the cpufreq code related to the scheduler to that file and to sched.h. Redefine cpufreq_update_util() as a static inline function to avoid function calls at its call sites in the scheduler code (as suggested by Peter Zijlstra). Also move the definition of struct update_util_data and declaration of cpufreq_set_update_util_data() from include/linux/cpufreq.h to include/linux/sched.h. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- kernel/sched/Makefile | 1 + kernel/sched/cpufreq.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/cpufreq.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 6768797..9507522 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 0000000..928c4ba --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + if (WARN_ON(data && !data->func)) + return; + + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f042190..faf7e27 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,7 +9,6 @@ #include #include #include -#include #include "cpupri.h" #include "cpudeadline.h" @@ -1739,3 +1738,51 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ -- cgit v1.1