diff options
-rw-r--r-- | MAINTAINERS | 3 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-powernv.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle.c | 19 | ||||
-rw-r--r-- | drivers/cpuidle/dt_idle_states.c | 6 | ||||
-rw-r--r-- | drivers/cpuidle/governor.c | 4 | ||||
-rw-r--r-- | drivers/cpuidle/governors/ladder.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/governors/menu.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/sysfs.c | 4 | ||||
-rw-r--r-- | drivers/idle/intel_idle.c | 154 | ||||
-rw-r--r-- | drivers/thermal/intel_powerclamp.c | 359 | ||||
-rw-r--r-- | include/linux/cpu.h | 2 | ||||
-rw-r--r-- | include/linux/cpuidle.h | 9 | ||||
-rw-r--r-- | include/linux/sched.h | 3 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/sched/idle.c | 175 |
16 files changed, 425 insertions, 322 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index d951d67..e89a188 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3389,6 +3389,7 @@ M: Daniel Lezcano <daniel.lezcano@linaro.org> L: linux-pm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git +B: https://bugzilla.kernel.org F: drivers/cpuidle/* F: include/linux/cpuidle.h @@ -6298,9 +6299,11 @@ S: Maintained F: drivers/platform/x86/intel-vbtn.c INTEL IDLE DRIVER +M: Jacob Pan <jacob.jun.pan@linux.intel.com> M: Len Brown <lenb@kernel.org> L: linux-pm@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git +B: https://bugzilla.kernel.org S: Supported F: drivers/idle/intel_idle.c diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 7fe442c..0835a37 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -22,7 +22,7 @@ #define POWERNV_THRESHOLD_LATENCY_NS 200000 -struct cpuidle_driver powernv_idle_driver = { +static struct cpuidle_driver powernv_idle_driver = { .name = "powernv_idle", .owner = THIS_MODULE, }; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207a..62810ff 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,23 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/** + * cpuidle_use_deepest_state - Set/clear governor override flag. + * @enable: New value of the flag. + * + * Set/unset the current CPU to use the deepest idle state (override governors + * going forward if set). + */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. @@ -109,6 +125,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c index a5c111b..ffca4fc 100644 --- a/drivers/cpuidle/dt_idle_states.c +++ b/drivers/cpuidle/dt_idle_states.c @@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state, * state enter function. */ idle_state->enter = match_id->data; + /* + * Since this is not a "coupled" state, it's safe to assume interrupts + * won't be enabled when it exits allowing the tick to be frozen + * safely. So enter() can be also enter_freeze() callback. + */ + idle_state->enter_freeze = match_id->data; err = of_property_read_u32(state_node, "wakeup-latency-us", &idle_state->exit_latency); diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index fb9f511..4e78263 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -9,7 +9,6 @@ */ #include <linux/mutex.h> -#include <linux/module.h> #include <linux/cpuidle.h> #include "cpuidle.h" @@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov) if (cpuidle_curr_governor) { list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_disable_device(dev); - module_put(cpuidle_curr_governor->owner); } cpuidle_curr_governor = gov; if (gov) { - if (!try_module_get(cpuidle_curr_governor->owner)) - return -EINVAL; list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_enable_device(dev); cpuidle_install_idle_handler(); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 63bd5a4..fe8f089 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -15,7 +15,6 @@ #include <linux/kernel.h> #include <linux/cpuidle.h> #include <linux/pm_qos.h> -#include <linux/module.h> #include <linux/jiffies.h> #include <linux/tick.h> @@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = { .enable = ladder_enable_device, .select = ladder_select_state, .reflect = ladder_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 03d38c2..d9b5b93 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,7 +19,6 @@ #include <linux/tick.h> #include <linux/sched.h> #include <linux/math64.h> -#include <linux/module.h> /* * Please note when changing the tuning values: @@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = { .enable = menu_enable_device, .select = menu_select, .reflect = menu_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 832a2c3..c5adc8c 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -403,8 +403,10 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) /* state statistics */ for (i = 0; i < drv->state_count; i++) { kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); - if (!kobj) + if (!kobj) { + ret = -ENOMEM; goto error_state; + } kobj->state = &drv->states[i]; kobj->state_usage = &device->states_usage[i]; init_completion(&kobj->kobj_unregister); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4466a2f..7d8ea3d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -98,8 +98,6 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); static void intel_idle_freeze(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); -static int intel_idle_cpu_init(int cpu); - static struct cpuidle_state *cpuidle_state_table; /* @@ -724,6 +722,50 @@ static struct cpuidle_state atom_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state tangier_cstates[] = { + { + .name = "C1-TNG", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 4, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C4-TNG", + .desc = "MWAIT 0x30", + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 100, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-TNG", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 560, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-TNG", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 1200, + .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C9-TNG", + .desc = "MWAIT 0x64", + .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 20000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; static struct cpuidle_state avn_cstates[] = { { .name = "C1-AVN", @@ -907,51 +949,15 @@ static void intel_idle_freeze(struct cpuidle_device *dev, mwait_idle_with_hints(eax, ecx); } -static void __setup_broadcast_timer(void *arg) +static void __setup_broadcast_timer(bool on) { - unsigned long on = (unsigned long)arg; - if (on) tick_broadcast_enable(); else tick_broadcast_disable(); } -static int cpu_hotplug_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct cpuidle_device *dev; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - - if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) - smp_call_function_single(hotcpu, __setup_broadcast_timer, - (void *)true, 1); - - /* - * Some systems can hotplug a cpu at runtime after - * the kernel has booted, we have to initialize the - * driver in this case - */ - dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu); - if (dev->registered) - break; - - if (intel_idle_cpu_init(hotcpu)) - return NOTIFY_BAD; - - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_hotplug_notifier = { - .notifier_call = cpu_hotplug_notify, -}; - -static void auto_demotion_disable(void *dummy) +static void auto_demotion_disable(void) { unsigned long long msr_bits; @@ -959,7 +965,7 @@ static void auto_demotion_disable(void *dummy) msr_bits &= ~(icpu->auto_demotion_disable_flags); wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); } -static void c1e_promotion_disable(void *dummy) +static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -978,6 +984,10 @@ static const struct idle_cpu idle_cpu_atom = { .state_table = atom_cstates, }; +static const struct idle_cpu idle_cpu_tangier = { + .state_table = tangier_cstates, +}; + static const struct idle_cpu idle_cpu_lincroft = { .state_table = atom_cstates, .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, @@ -1066,6 +1076,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), + ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), @@ -1084,6 +1095,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl), ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), + ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl), ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), {} @@ -1373,12 +1385,11 @@ static void __init intel_idle_cpuidle_driver_init(void) * allocate, initialize, register cpuidle_devices * @cpu: cpu/core to initialize */ -static int intel_idle_cpu_init(int cpu) +static int intel_idle_cpu_init(unsigned int cpu) { struct cpuidle_device *dev; dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); - dev->cpu = cpu; if (cpuidle_register_device(dev)) { @@ -1387,17 +1398,36 @@ static int intel_idle_cpu_init(int cpu) } if (icpu->auto_demotion_disable_flags) - smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); + auto_demotion_disable(); if (icpu->disable_promotion_to_c1e) - smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1); + c1e_promotion_disable(); + + return 0; +} + +static int intel_idle_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev; + + if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) + __setup_broadcast_timer(true); + + /* + * Some systems can hotplug a cpu at runtime after + * the kernel has booted, we have to initialize the + * driver in this case + */ + dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); + if (!dev->registered) + return intel_idle_cpu_init(cpu); return 0; } static int __init intel_idle_init(void) { - int retval, i; + int retval; /* Do not load intel_idle at all for now if idle= is passed */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) @@ -1417,35 +1447,29 @@ static int __init intel_idle_init(void) struct cpuidle_driver *drv = cpuidle_get_driver(); printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", drv ? drv->name : "none"); - free_percpu(intel_idle_cpuidle_devices); - return retval; + goto init_driver_fail; } - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - retval = intel_idle_cpu_init(i); - if (retval) { - intel_idle_cpuidle_devices_uninit(); - cpu_notifier_register_done(); - cpuidle_unregister_driver(&intel_idle_driver); - free_percpu(intel_idle_cpuidle_devices); - return retval; - } - } - __register_cpu_notifier(&cpu_hotplug_notifier); - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; - else - on_each_cpu(__setup_broadcast_timer, (void *)true, 1); - cpu_notifier_register_done(); + retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", + intel_idle_cpu_online, NULL); + if (retval < 0) + goto hp_setup_fail; pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", lapic_timer_reliable_states); return 0; + +hp_setup_fail: + intel_idle_cpuidle_devices_uninit(); + cpuidle_unregister_driver(&intel_idle_driver); +init_driver_fail: + free_percpu(intel_idle_cpuidle_devices); + return retval; + } device_initcall(intel_idle_init); diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index afada65..83e6971 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -43,7 +43,6 @@ #include <linux/kernel.h> #include <linux/delay.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/cpu.h> #include <linux/thermal.h> #include <linux/slab.h> @@ -86,11 +85,26 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update */ static bool clamping; +static const struct sched_param sparam = { + .sched_priority = MAX_USER_RT_PRIO / 2, +}; +struct powerclamp_worker_data { + struct kthread_worker *worker; + struct kthread_work balancing_work; + struct kthread_delayed_work idle_injection_work; + unsigned int cpu; + unsigned int count; + unsigned int guard; + unsigned int window_size_now; + unsigned int target_ratio; + unsigned int duration_jiffies; + bool clamping; +}; -static struct task_struct * __percpu *powerclamp_thread; +static struct powerclamp_worker_data * __percpu worker_data; static struct thermal_cooling_device *cooling_dev; static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu - * clamping thread + * clamping kthread worker */ static unsigned int duration; @@ -262,11 +276,6 @@ static u64 pkg_state_counter(void) return count; } -static void noop_timer(unsigned long foo) -{ - /* empty... just the fact that we get the interrupt wakes us up */ -} - static unsigned int get_compensation(int ratio) { unsigned int comp = 0; @@ -368,103 +377,79 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, return set_target_ratio + guard <= current_ratio; } -static int clamp_thread(void *arg) +static void clamp_balancing_func(struct kthread_work *work) { - int cpunr = (unsigned long)arg; - DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0); - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; - unsigned int count = 0; - unsigned int target_ratio; + struct powerclamp_worker_data *w_data; + int sleeptime; + unsigned long target_jiffies; + unsigned int compensated_ratio; + int interval; /* jiffies to sleep for each attempt */ - set_bit(cpunr, cpu_clamping_mask); - set_freezable(); - init_timer_on_stack(&wakeup_timer); - sched_setscheduler(current, SCHED_FIFO, ¶m); - - while (true == clamping && !kthread_should_stop() && - cpu_online(cpunr)) { - int sleeptime; - unsigned long target_jiffies; - unsigned int guard; - unsigned int compensated_ratio; - int interval; /* jiffies to sleep for each attempt */ - unsigned int duration_jiffies = msecs_to_jiffies(duration); - unsigned int window_size_now; - - try_to_freeze(); - /* - * make sure user selected ratio does not take effect until - * the next round. adjust target_ratio if user has changed - * target such that we can converge quickly. - */ - target_ratio = set_target_ratio; - guard = 1 + target_ratio/20; - window_size_now = window_size; - count++; - - /* - * systems may have different ability to enter package level - * c-states, thus we need to compensate the injected idle ratio - * to achieve the actual target reported by the HW. - */ - compensated_ratio = target_ratio + - get_compensation(target_ratio); - if (compensated_ratio <= 0) - compensated_ratio = 1; - interval = duration_jiffies * 100 / compensated_ratio; - - /* align idle time */ - target_jiffies = roundup(jiffies, interval); - sleeptime = target_jiffies - jiffies; - if (sleeptime <= 0) - sleeptime = 1; - schedule_timeout_interruptible(sleeptime); - /* - * only elected controlling cpu can collect stats and update - * control parameters. - */ - if (cpunr == control_cpu && !(count%window_size_now)) { - should_skip = - powerclamp_adjust_controls(target_ratio, - guard, window_size_now); - smp_mb(); - } + w_data = container_of(work, struct powerclamp_worker_data, + balancing_work); - if (should_skip) - continue; - - target_jiffies = jiffies + duration_jiffies; - mod_timer(&wakeup_timer, target_jiffies); - if (unlikely(local_softirq_pending())) - continue; - /* - * stop tick sched during idle time, interrupts are still - * allowed. thus jiffies are updated properly. - */ - preempt_disable(); - /* mwait until target jiffies is reached */ - while (time_before(jiffies, target_jiffies)) { - unsigned long ecx = 1; - unsigned long eax = target_mwait; - - /* - * REVISIT: may call enter_idle() to notify drivers who - * can save power during cpu idle. same for exit_idle() - */ - local_touch_nmi(); - stop_critical_timings(); - mwait_idle_with_hints(eax, ecx); - start_critical_timings(); - atomic_inc(&idle_wakeup_counter); - } - preempt_enable(); + /* + * make sure user selected ratio does not take effect until + * the next round. adjust target_ratio if user has changed + * target such that we can converge quickly. + */ + w_data->target_ratio = READ_ONCE(set_target_ratio); + w_data->guard = 1 + w_data->target_ratio / 20; + w_data->window_size_now = window_size; + w_data->duration_jiffies = msecs_to_jiffies(duration); + w_data->count++; + + /* + * systems may have different ability to enter package level + * c-states, thus we need to compensate the injected idle ratio + * to achieve the actual target reported by the HW. + */ + compensated_ratio = w_data->target_ratio + + get_compensation(w_data->target_ratio); + if (compensated_ratio <= 0) + compensated_ratio = 1; + interval = w_data->duration_jiffies * 100 / compensated_ratio; + + /* align idle time */ + target_jiffies = roundup(jiffies, interval); + sleeptime = target_jiffies - jiffies; + if (sleeptime <= 0) + sleeptime = 1; + + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_delayed_work(w_data->worker, + &w_data->idle_injection_work, + sleeptime); +} + +static void clamp_idle_injection_func(struct kthread_work *work) +{ + struct powerclamp_worker_data *w_data; + + w_data = container_of(work, struct powerclamp_worker_data, + idle_injection_work.work); + + /* + * only elected controlling cpu can collect stats and update + * control parameters. + */ + if (w_data->cpu == control_cpu && + !(w_data->count % w_data->window_size_now)) { + should_skip = + powerclamp_adjust_controls(w_data->target_ratio, + w_data->guard, + w_data->window_size_now); + smp_mb(); } - del_timer_sync(&wakeup_timer); - clear_bit(cpunr, cpu_clamping_mask); - return 0; + if (should_skip) + goto balance; + + play_idle(jiffies_to_msecs(w_data->duration_jiffies)); + +balance: + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_work(w_data->worker, &w_data->balancing_work); } /* @@ -508,10 +493,60 @@ static void poll_pkg_cstate(struct work_struct *dummy) schedule_delayed_work(&poll_pkg_cstate_work, HZ); } +static void start_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + struct kthread_worker *worker; + + worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu); + if (IS_ERR(worker)) + return; + + w_data->worker = worker; + w_data->count = 0; + w_data->cpu = cpu; + w_data->clamping = true; + set_bit(cpu, cpu_clamping_mask); + sched_setscheduler(worker->task, SCHED_FIFO, &sparam); + kthread_init_work(&w_data->balancing_work, clamp_balancing_func); + kthread_init_delayed_work(&w_data->idle_injection_work, + clamp_idle_injection_func); + kthread_queue_work(w_data->worker, &w_data->balancing_work); +} + +static void stop_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + + if (!w_data->worker) + return; + + w_data->clamping = false; + /* + * Make sure that all works that get queued after this point see + * the clamping disabled. The counter part is not needed because + * there is an implicit memory barrier when the queued work + * is proceed. + */ + smp_wmb(); + kthread_cancel_work_sync(&w_data->balancing_work); + kthread_cancel_delayed_work_sync(&w_data->idle_injection_work); + /* + * The balancing work still might be queued here because + * the handling of the "clapming" variable, cancel, and queue + * operations are not synchronized via a lock. But it is not + * a big deal. The balancing work is fast and destroy kthread + * will wait for it. + */ + clear_bit(w_data->cpu, cpu_clamping_mask); + kthread_destroy_worker(w_data->worker); + + w_data->worker = NULL; +} + static int start_power_clamp(void) { unsigned long cpu; - struct task_struct *thread; set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); /* prevent cpu hotplug */ @@ -525,22 +560,9 @@ static int start_power_clamp(void) clamping = true; schedule_delayed_work(&poll_pkg_cstate_work, 0); - /* start one thread per online cpu */ + /* start one kthread worker per online cpu */ for_each_online_cpu(cpu) { - struct task_struct **p = - per_cpu_ptr(powerclamp_thread, cpu); - - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%ld", cpu); - /* bind to cpu here */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *p = thread; - } - + start_power_clamp_worker(cpu); } put_online_cpus(); @@ -550,71 +572,49 @@ static int start_power_clamp(void) static void end_power_clamp(void) { int i; - struct task_struct *thread; - clamping = false; /* - * make clamping visible to other cpus and give per cpu clamping threads - * sometime to exit, or gets killed later. + * Block requeuing in all the kthread workers. They will flush and + * stop faster. */ - smp_mb(); - msleep(20); + clamping = false; if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) { for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) { - pr_debug("clamping thread for cpu %d alive, kill\n", i); - thread = *per_cpu_ptr(powerclamp_thread, i); - kthread_stop(thread); + pr_debug("clamping worker for cpu %d alive, destroy\n", + i); + stop_power_clamp_worker(i); } } } -static int powerclamp_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int powerclamp_cpu_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; - struct task_struct *thread; - struct task_struct **percpu_thread = - per_cpu_ptr(powerclamp_thread, cpu); - - if (false == clamping) - goto exit_ok; - - switch (action) { - case CPU_ONLINE: - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%lu", cpu); - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *percpu_thread = thread; - } - /* prefer BSP as controlling CPU */ - if (cpu == 0) { - control_cpu = 0; - smp_mb(); - } - break; - case CPU_DEAD: - if (test_bit(cpu, cpu_clamping_mask)) { - pr_err("cpu %lu dead but powerclamping thread is not\n", - cpu); - kthread_stop(*percpu_thread); - } - if (cpu == control_cpu) { - control_cpu = smp_processor_id(); - smp_mb(); - } + if (clamping == false) + return 0; + start_power_clamp_worker(cpu); + /* prefer BSP as controlling CPU */ + if (cpu == 0) { + control_cpu = 0; + smp_mb(); } - -exit_ok: - return NOTIFY_OK; + return 0; } -static struct notifier_block powerclamp_cpu_notifier = { - .notifier_call = powerclamp_cpu_callback, -}; +static int powerclamp_cpu_predown(unsigned int cpu) +{ + if (clamping == false) + return 0; + + stop_power_clamp_worker(cpu); + if (cpu != control_cpu) + return 0; + + control_cpu = cpumask_first(cpu_online_mask); + if (control_cpu == cpu) + control_cpu = cpumask_next(cpu, cpu_online_mask); + smp_mb(); + return 0; +} static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) @@ -742,6 +742,8 @@ file_error: debugfs_remove_recursive(debug_dir); } +static enum cpuhp_state hp_state; + static int __init powerclamp_init(void) { int retval; @@ -759,10 +761,17 @@ static int __init powerclamp_init(void) /* set default limit, maybe adjusted during runtime based on feedback */ window_size = 2; - register_hotcpu_notifier(&powerclamp_cpu_notifier); + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "thermal/intel_powerclamp:online", + powerclamp_cpu_online, + powerclamp_cpu_predown); + if (retval < 0) + goto exit_free; + + hp_state = retval; - powerclamp_thread = alloc_percpu(struct task_struct *); - if (!powerclamp_thread) { + worker_data = alloc_percpu(struct powerclamp_worker_data); + if (!worker_data) { retval = -ENOMEM; goto exit_unregister; } @@ -782,9 +791,9 @@ static int __init powerclamp_init(void) return 0; exit_free_thread: - free_percpu(powerclamp_thread); + free_percpu(worker_data); exit_unregister: - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); + cpuhp_remove_state_nocalls(hp_state); exit_free: kfree(cpu_clamping_mask); return retval; @@ -793,9 +802,9 @@ module_init(powerclamp_init); static void __exit powerclamp_exit(void) { - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); end_power_clamp(); - free_percpu(powerclamp_thread); + cpuhp_remove_state_nocalls(hp_state); + free_percpu(worker_data); thermal_cooling_device_unregister(cooling_dev); kfree(cpu_clamping_mask); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b886dc1..ac0efae 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -245,6 +245,8 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); +void play_idle(unsigned long duration_ms); + #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb31373..da346f2 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -74,6 +74,7 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; + unsigned int use_deepest_state:1; unsigned int cpu; int last_residency; @@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif -#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) +#ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev); +extern void cpuidle_use_deepest_state(bool enable); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) +{ +} #endif /* kernel/sched/idle.c */ @@ -235,8 +240,6 @@ struct cpuidle_governor { int (*select) (struct cpuidle_driver *drv, struct cpuidle_device *dev); void (*reflect) (struct cpuidle_device *dev, int index); - - struct module *owner; }; #ifdef CONFIG_CPU_IDLE diff --git a/include/linux/sched.h b/include/linux/sched.h index e9c009d..a3d338e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2254,6 +2254,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ @@ -2611,7 +2612,7 @@ extern struct task_struct *idle_task(int cpu); */ static inline bool is_idle_task(const struct task_struct *p) { - return p->pid == 0; + return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index 997ac1d..a8eb821 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1540,7 +1540,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 154fd68..c95fbcd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5279,6 +5279,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1d8718d..6a4bae0 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -164,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); @@ -202,76 +205,65 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); - - while (1) { - /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. - */ - - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. + */ - while (!need_resched()) { - check_pgt_cache(); - rmb(); + __current_set_polling(); + tick_nohz_idle_enter(); - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } + while (!need_resched()) { + check_pgt_cache(); + rmb(); - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); } - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); + local_irq_disable(); + arch_cpu_idle_enter(); /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ - smp_mb__after_atomic(); - - sched_ttwu_pending(); - schedule_preempt_disabled(); + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); } bool cpu_in_idle(unsigned long pc) @@ -280,6 +272,56 @@ bool cpu_in_idle(unsigned long pc) pc < (unsigned long)__cpuidle_text_end; } +struct idle_timer { + struct hrtimer timer; + int done; +}; + +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); + + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. + */ + WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(play_idle); + void cpu_startup_entry(enum cpuhp_state state) { /* @@ -299,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } |