From bb6ab52f2befe1fb29ac198f27d8a6aadf510f81 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 31 Mar 2016 17:42:15 +0200 Subject: intel_pstate: Do not set utilization update hook too early The utilization update hook in the intel_pstate driver is set too early, as it only should be set after the policy has been fully initialized by the core. That may cause intel_pstate_update_util() to use incorrect data and put the CPUs into incorrect P-states as a result. To prevent that from happening, make intel_pstate_set_policy() set the utilization update hook instead of intel_pstate_init_cpu() so intel_pstate_update_util() only runs when all things have been initialized as appropriate. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 4b64452..81057e4 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1103,7 +1103,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum) intel_pstate_sample(cpu, 0); cpu->update_util.func = intel_pstate_update_util; - cpufreq_set_update_util_data(cpunum, &cpu->update_util); pr_debug("intel_pstate: controlling: cpu %d\n", cpunum); @@ -1122,18 +1121,29 @@ static unsigned int intel_pstate_get(unsigned int cpu_num) return get_avg_frequency(cpu); } +static void intel_pstate_set_update_util_hook(unsigned int cpu) +{ + cpufreq_set_update_util_data(cpu, &all_cpu_data[cpu]->update_util); +} + +static void intel_pstate_clear_update_util_hook(unsigned int cpu) +{ + cpufreq_set_update_util_data(cpu, NULL); + synchronize_sched(); +} + static int intel_pstate_set_policy(struct cpufreq_policy *policy) { if (!policy->cpuinfo.max_freq) return -ENODEV; + intel_pstate_clear_update_util_hook(policy->cpu); + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE && policy->max >= policy->cpuinfo.max_freq) { pr_debug("intel_pstate: set performance\n"); limits = &performance_limits; - if (hwp_active) - intel_pstate_hwp_set(policy->cpus); - return 0; + goto out; } pr_debug("intel_pstate: set powersave\n"); @@ -1163,6 +1173,9 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) limits->max_perf = div_fp(int_tofp(limits->max_perf_pct), int_tofp(100)); + out: + intel_pstate_set_update_util_hook(policy->cpu); + if (hwp_active) intel_pstate_hwp_set(policy->cpus); @@ -1187,8 +1200,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) pr_debug("intel_pstate: CPU %d exiting\n", cpu_num); - cpufreq_set_update_util_data(cpu_num, NULL); - synchronize_sched(); + intel_pstate_clear_update_util_hook(cpu_num); if (hwp_active) return; @@ -1455,8 +1467,7 @@ out: get_online_cpus(); for_each_online_cpu(cpu) { if (all_cpu_data[cpu]) { - cpufreq_set_update_util_data(cpu, NULL); - synchronize_sched(); + intel_pstate_clear_update_util_hook(cpu); kfree(all_cpu_data[cpu]); } } -- cgit v1.1 From febce40febcff3ccdb33f63456ffc4cfc61640c8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 2 Apr 2016 01:06:21 +0200 Subject: intel_pstate: Avoid extra invocation of intel_pstate_sample() The initialization of intel_pstate for a given CPU involves populating the fields of its struct cpudata that represent the previous sample, but currently that is done in a problematic way. Namely, intel_pstate_init_cpu() makes an extra call to intel_pstate_sample() so it reads the current register values that will be used to populate the "previous sample" record during the next invocation of intel_pstate_sample(). However, after commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) that doesn't work for last_sample_time, because the time value is passed to intel_pstate_sample() as an argument now. Passing 0 to it from intel_pstate_init_cpu() is problematic, because that causes cpu->last_sample_time == 0 to be visible in get_target_pstate_use_performance() (and hence the extra cpu->last_sample_time > 0 check in there) and effectively allows the first invocation of intel_pstate_sample() from intel_pstate_update_util() to happen immediately after the initialization which may lead to a significant "turn on" effect in the governor algorithm. To mitigate that issue, rework the initialization to avoid the extra intel_pstate_sample() call from intel_pstate_init_cpu(). Instead, make intel_pstate_sample() return false if it has been called with cpu->sample.time equal to zero, which will make intel_pstate_update_util() skip the sample in that case, and reset cpu->sample.time from intel_pstate_set_update_util_hook() to make the algorithm start properly every time the hook is set. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 81057e4..9ae1596 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -910,7 +910,14 @@ static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time) cpu->prev_aperf = aperf; cpu->prev_mperf = mperf; cpu->prev_tsc = tsc; - return true; + /* + * First time this function is invoked in a given cycle, all of the + * previous sample data fields are equal to zero or stale and they must + * be populated with meaningful numbers for things to work, so assume + * that sample.time will always be reset before setting the utilization + * update hook and make the caller skip the sample then. + */ + return !!cpu->last_sample_time; } static inline int32_t get_avg_frequency(struct cpudata *cpu) @@ -984,8 +991,7 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) * enough period of time to adjust our busyness. */ duration_ns = cpu->sample.time - cpu->last_sample_time; - if ((s64)duration_ns > pid_params.sample_rate_ns * 3 - && cpu->last_sample_time > 0) { + if ((s64)duration_ns > pid_params.sample_rate_ns * 3) { sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns), int_tofp(duration_ns)); core_busy = mul_fp(core_busy, sample_ratio); @@ -1100,7 +1106,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum) intel_pstate_get_cpu_pstates(cpu); intel_pstate_busy_pid_reset(cpu); - intel_pstate_sample(cpu, 0); cpu->update_util.func = intel_pstate_update_util; @@ -1121,9 +1126,13 @@ static unsigned int intel_pstate_get(unsigned int cpu_num) return get_avg_frequency(cpu); } -static void intel_pstate_set_update_util_hook(unsigned int cpu) +static void intel_pstate_set_update_util_hook(unsigned int cpu_num) { - cpufreq_set_update_util_data(cpu, &all_cpu_data[cpu]->update_util); + struct cpudata *cpu = all_cpu_data[cpu_num]; + + /* Prevent intel_pstate_update_util() from using stale data. */ + cpu->sample.time = 0; + cpufreq_set_update_util_data(cpu_num, &cpu->update_util); } static void intel_pstate_clear_update_util_hook(unsigned int cpu) -- cgit v1.1 From 30a3915385f124a6e3c81df4070f531d4f84299b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 3 Apr 2016 19:42:11 -0700 Subject: cpufreq: intel_pstate: fix inconsistency in setting policy limits When user sets performance policy using cpufreq interface, it is possible that because of policy->max limits, the actual performance is still limited. But the current implementation will silently switch the policy to powersave and start using powersave limits. If user modifies any limits using intel_pstate sysfs, this is actually changing powersave limits. The current implementation tracks limits under powersave and performance policy using two different variables. When policy->max is less than policy->cpuinfo.max_freq, only powersave limit variable is used. This fix causes the performance limits variable to be used always when the policy is performance. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9ae1596..c6c169a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1141,6 +1141,20 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu) synchronize_sched(); } +static void intel_pstate_set_performance_limits(struct perf_limits *limits) +{ + limits->no_turbo = 0; + limits->turbo_disabled = 0; + limits->max_perf_pct = 100; + limits->max_perf = int_tofp(1); + limits->min_perf_pct = 100; + limits->min_perf = int_tofp(1); + limits->max_policy_pct = 100; + limits->max_sysfs_pct = 100; + limits->min_policy_pct = 0; + limits->min_sysfs_pct = 0; +} + static int intel_pstate_set_policy(struct cpufreq_policy *policy) { if (!policy->cpuinfo.max_freq) @@ -1148,15 +1162,18 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) intel_pstate_clear_update_util_hook(policy->cpu); - if (policy->policy == CPUFREQ_POLICY_PERFORMANCE && - policy->max >= policy->cpuinfo.max_freq) { - pr_debug("intel_pstate: set performance\n"); + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { limits = &performance_limits; - goto out; + if (policy->max >= policy->cpuinfo.max_freq) { + pr_debug("intel_pstate: set performance\n"); + intel_pstate_set_performance_limits(limits); + goto out; + } + } else { + pr_debug("intel_pstate: set powersave\n"); + limits = &powersave_limits; } - pr_debug("intel_pstate: set powersave\n"); - limits = &powersave_limits; limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq; limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100); limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, -- cgit v1.1 From 13ad7701f9d0ab7806eb91f1fe1ca43d41b31fa2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 3 Apr 2016 13:06:46 -0700 Subject: cpufreq: intel_pstate: Documenation for structures No code change. Only added kernel doc style comments for structures. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 135 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index c6c169a..8b5a415 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -64,6 +64,25 @@ static inline int ceiling_fp(int32_t x) return ret; } +/** + * struct sample - Store performance sample + * @core_pct_busy: Ratio of APERF/MPERF in percent, which is actual + * performance during last sample period + * @busy_scaled: Scaled busy value which is used to calculate next + * P state. This can be different than core_pct_busy + * to account for cpu idle period + * @aperf: Difference of actual performance frequency clock count + * read from APERF MSR between last and current sample + * @mperf: Difference of maximum performance frequency clock count + * read from MPERF MSR between last and current sample + * @tsc: Difference of time stamp counter between last and + * current sample + * @freq: Effective frequency calculated from APERF/MPERF + * @time: Current time from scheduler + * + * This structure is used in the cpudata structure to store performance sample + * data for choosing next P State. + */ struct sample { int32_t core_pct_busy; int32_t busy_scaled; @@ -74,6 +93,20 @@ struct sample { u64 time; }; +/** + * struct pstate_data - Store P state data + * @current_pstate: Current requested P state + * @min_pstate: Min P state possible for this platform + * @max_pstate: Max P state possible for this platform + * @max_pstate_physical:This is physical Max P state for a processor + * This can be higher than the max_pstate which can + * be limited by platform thermal design power limits + * @scaling: Scaling factor to convert frequency to cpufreq + * frequency units + * @turbo_pstate: Max Turbo P state possible for this platform + * + * Stores the per cpu model P state limits and current P state. + */ struct pstate_data { int current_pstate; int min_pstate; @@ -83,6 +116,19 @@ struct pstate_data { int turbo_pstate; }; +/** + * struct vid_data - Stores voltage information data + * @min: VID data for this platform corresponding to + * the lowest P state + * @max: VID data corresponding to the highest P State. + * @turbo: VID data for turbo P state + * @ratio: Ratio of (vid max - vid min) / + * (max P state - Min P State) + * + * Stores the voltage data for DVFS (Dynamic Voltage and Frequency Scaling) + * This data is used in Atom platforms, where in addition to target P state, + * the voltage data needs to be specified to select next P State. + */ struct vid_data { int min; int max; @@ -90,6 +136,18 @@ struct vid_data { int32_t ratio; }; +/** + * struct _pid - Stores PID data + * @setpoint: Target set point for busyness or performance + * @integral: Storage for accumulated error values + * @p_gain: PID proportional gain + * @i_gain: PID integral gain + * @d_gain: PID derivative gain + * @deadband: PID deadband + * @last_err: Last error storage for integral part of PID calculation + * + * Stores PID coefficients and last error for PID controller. + */ struct _pid { int setpoint; int32_t integral; @@ -100,6 +158,23 @@ struct _pid { int32_t last_err; }; +/** + * struct cpudata - Per CPU instance data storage + * @cpu: CPU number for this instance data + * @update_util: CPUFreq utility callback information + * @pstate: Stores P state limits for this CPU + * @vid: Stores VID limits for this CPU + * @pid: Stores PID parameters for this CPU + * @last_sample_time: Last Sample time + * @prev_aperf: Last APERF value read from APERF MSR + * @prev_mperf: Last MPERF value read from MPERF MSR + * @prev_tsc: Last timestamp counter (TSC) value + * @prev_cummulative_iowait: IO Wait time difference from last and + * current sample + * @sample: Storage for storing last Sample data + * + * This structure stores per CPU instance data for all CPUs. + */ struct cpudata { int cpu; @@ -118,6 +193,19 @@ struct cpudata { }; static struct cpudata **all_cpu_data; + +/** + * struct pid_adjust_policy - Stores static PID configuration data + * @sample_rate_ms: PID calculation sample rate in ms + * @sample_rate_ns: Sample rate calculation in ns + * @deadband: PID deadband + * @setpoint: PID Setpoint + * @p_gain_pct: PID proportional gain + * @i_gain_pct: PID integral gain + * @d_gain_pct: PID derivative gain + * + * Stores per CPU model static PID configuration data. + */ struct pstate_adjust_policy { int sample_rate_ms; s64 sample_rate_ns; @@ -128,6 +216,20 @@ struct pstate_adjust_policy { int i_gain_pct; }; +/** + * struct pstate_funcs - Per CPU model specific callbacks + * @get_max: Callback to get maximum non turbo effective P state + * @get_max_physical: Callback to get maximum non turbo physical P state + * @get_min: Callback to get minimum P state + * @get_turbo: Callback to get turbo P state + * @get_scaling: Callback to get frequency scaling factor + * @get_val: Callback to convert P state to actual MSR write value + * @get_vid: Callback to get VID data for Atom platforms + * @get_target_pstate: Callback to a function to calculate next P state to use + * + * Core and Atom CPU models have different way to get P State limits. This + * structure is used to store those callbacks. + */ struct pstate_funcs { int (*get_max)(void); int (*get_max_physical)(void); @@ -139,6 +241,11 @@ struct pstate_funcs { int32_t (*get_target_pstate)(struct cpudata *); }; +/** + * struct cpu_defaults- Per CPU model default config data + * @pid_policy: PID config data + * @funcs: Callback function data + */ struct cpu_defaults { struct pstate_adjust_policy pid_policy; struct pstate_funcs funcs; @@ -151,6 +258,34 @@ static struct pstate_adjust_policy pid_params; static struct pstate_funcs pstate_funcs; static int hwp_active; + +/** + * struct perf_limits - Store user and policy limits + * @no_turbo: User requested turbo state from intel_pstate sysfs + * @turbo_disabled: Platform turbo status either from msr + * MSR_IA32_MISC_ENABLE or when maximum available pstate + * matches the maximum turbo pstate + * @max_perf_pct: Effective maximum performance limit in percentage, this + * is minimum of either limits enforced by cpufreq policy + * or limits from user set limits via intel_pstate sysfs + * @min_perf_pct: Effective minimum performance limit in percentage, this + * is maximum of either limits enforced by cpufreq policy + * or limits from user set limits via intel_pstate sysfs + * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct + * This value is used to limit max pstate + * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct + * This value is used to limit min pstate + * @max_policy_pct: The maximum performance in percentage enforced by + * cpufreq setpolicy interface + * @max_sysfs_pct: The maximum performance in percentage enforced by + * intel pstate sysfs interface + * @min_policy_pct: The minimum performance in percentage enforced by + * cpufreq setpolicy interface + * @min_sysfs_pct: The minimum performance in percentage enforced by + * intel pstate sysfs interface + * + * Storage for user and policy defined limits. + */ struct perf_limits { int no_turbo; int turbo_disabled; -- cgit v1.1