Diffstat (limited to 'drivers'): 48 files changed, 3493 insertions, 1334 deletions
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index bb01dea..f0b4a98 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -157,7 +157,7 @@ static void acpi_processor_ppc_ost(acpi_handle handle, int status) status, NULL); } -int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) +void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) { int ret; @@ -168,7 +168,7 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) */ if (event_flag) acpi_processor_ppc_ost(pr->handle, 1); - return 0; + return; } ret = acpi_processor_get_platform_limit(pr); @@ -182,10 +182,8 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) else acpi_processor_ppc_ost(pr->handle, 0); } - if (ret < 0) - return (ret); - else - return cpufreq_update_policy(pr->id); + if (ret >= 0) + cpufreq_update_policy(pr->id); } int acpi_processor_get_bios_limit(int cpu, unsigned int *limit) @@ -465,11 +463,33 @@ int acpi_processor_get_performance_info(struct acpi_processor *pr) return result; } EXPORT_SYMBOL_GPL(acpi_processor_get_performance_info); -int acpi_processor_notify_smm(struct module *calling_module) + +int acpi_processor_pstate_control(void) { acpi_status status; - static int is_done = 0; + if (!acpi_gbl_FADT.smi_command || !acpi_gbl_FADT.pstate_control) + return 0; + + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Writing pstate_control [0x%x] to smi_command [0x%x]\n", + acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command)); + + status = acpi_os_write_port(acpi_gbl_FADT.smi_command, + (u32)acpi_gbl_FADT.pstate_control, 8); + if (ACPI_SUCCESS(status)) + return 1; + + ACPI_EXCEPTION((AE_INFO, status, + "Failed to write pstate_control [0x%x] to smi_command [0x%x]", + acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command)); + return -EIO; +} + +int acpi_processor_notify_smm(struct module *calling_module) +{ + static int is_done = 0; + int result; if (!(acpi_processor_ppc_status & PPC_REGISTERED)) return -EBUSY; @@ -492,26 +512,15 @@ int acpi_processor_notify_smm(struct module *calling_module) is_done = -EIO; - /* Can't write pstate_control to smi_command if either value is zero */ - if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) { + result = acpi_processor_pstate_control(); + if (!result) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_control\n")); module_put(calling_module); return 0; } - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Writing pstate_control [0x%x] to smi_command [0x%x]\n", - acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command)); - - status = acpi_os_write_port(acpi_gbl_FADT.smi_command, - (u32) acpi_gbl_FADT.pstate_control, 8); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "Failed to write pstate_control [0x%x] to " - "smi_command [0x%x]", acpi_gbl_FADT.pstate_control, - acpi_gbl_FADT.smi_command)); + if (result < 0) { module_put(calling_module); - return status; + return result; } /* Success. If there's no _PPC, we need to fear nothing, so diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 54abb26..9b6cebe 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -674,6 +674,14 @@ static void acpi_sleep_suspend_setup(void) if (acpi_sleep_state_supported(i)) sleep_states[i] = 1; + /* + * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and + * the default suspend mode was not selected from the command line. 
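The acpi_processor_pstate_control() helper factored out above has a tri-state contract: 0 when there is nothing to write (no SMI port or no pstate_control in the FADT), 1 after a successful port write, and a negative errno on failure. A minimal stand-alone sketch of how a caller branches on it, mirroring acpi_processor_notify_smm(); the stub body and the FADT values are hypothetical:

#include <stdio.h>

/*
 * Stub mirroring the tri-state return of acpi_processor_pstate_control():
 *   0  - no smi_command port or no pstate_control value, nothing done
 *   1  - pstate_control successfully written to smi_command
 *  <0  - the port write failed (-EIO in the real function)
 */
static int pstate_control_stub(unsigned int smi_command,
                               unsigned int pstate_control)
{
    if (!smi_command || !pstate_control)
        return 0;
    /* The kernel function performs an 8-bit I/O port write here. */
    return 1;
}

int main(void)
{
    int result = pstate_control_stub(0xb2, 0x80); /* hypothetical values */

    if (!result)
        printf("No SMI port or pstate_control\n");
    else if (result < 0)
        printf("write failed: %d\n", result);
    else
        printf("pstate control written\n");
    return 0;
}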
+ */ + if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 && + mem_sleep_default > PM_SUSPEND_MEM) + mem_sleep_default = PM_SUSPEND_FREEZE; + suspend_set_ops(old_suspend_ordering ? &acpi_suspend_ops_old : &acpi_suspend_ops); freeze_set_ops(&acpi_freeze_ops); diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index e023066..5711708 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -39,6 +39,105 @@ static LIST_HEAD(gpd_list); static DEFINE_MUTEX(gpd_list_lock); +struct genpd_lock_ops { + void (*lock)(struct generic_pm_domain *genpd); + void (*lock_nested)(struct generic_pm_domain *genpd, int depth); + int (*lock_interruptible)(struct generic_pm_domain *genpd); + void (*unlock)(struct generic_pm_domain *genpd); +}; + +static void genpd_lock_mtx(struct generic_pm_domain *genpd) +{ + mutex_lock(&genpd->mlock); +} + +static void genpd_lock_nested_mtx(struct generic_pm_domain *genpd, + int depth) +{ + mutex_lock_nested(&genpd->mlock, depth); +} + +static int genpd_lock_interruptible_mtx(struct generic_pm_domain *genpd) +{ + return mutex_lock_interruptible(&genpd->mlock); +} + +static void genpd_unlock_mtx(struct generic_pm_domain *genpd) +{ + return mutex_unlock(&genpd->mlock); +} + +static const struct genpd_lock_ops genpd_mtx_ops = { + .lock = genpd_lock_mtx, + .lock_nested = genpd_lock_nested_mtx, + .lock_interruptible = genpd_lock_interruptible_mtx, + .unlock = genpd_unlock_mtx, +}; + +static void genpd_lock_spin(struct generic_pm_domain *genpd) + __acquires(&genpd->slock) +{ + unsigned long flags; + + spin_lock_irqsave(&genpd->slock, flags); + genpd->lock_flags = flags; +} + +static void genpd_lock_nested_spin(struct generic_pm_domain *genpd, + int depth) + __acquires(&genpd->slock) +{ + unsigned long flags; + + spin_lock_irqsave_nested(&genpd->slock, flags, depth); + genpd->lock_flags = flags; +} + +static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd) + __acquires(&genpd->slock) +{ + unsigned long flags; + + spin_lock_irqsave(&genpd->slock, flags); + genpd->lock_flags = flags; + return 0; +} + +static void genpd_unlock_spin(struct generic_pm_domain *genpd) + __releases(&genpd->slock) +{ + spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags); +} + +static const struct genpd_lock_ops genpd_spin_ops = { + .lock = genpd_lock_spin, + .lock_nested = genpd_lock_nested_spin, + .lock_interruptible = genpd_lock_interruptible_spin, + .unlock = genpd_unlock_spin, +}; + +#define genpd_lock(p) p->lock_ops->lock(p) +#define genpd_lock_nested(p, d) p->lock_ops->lock_nested(p, d) +#define genpd_lock_interruptible(p) p->lock_ops->lock_interruptible(p) +#define genpd_unlock(p) p->lock_ops->unlock(p) + +#define genpd_is_irq_safe(genpd) (genpd->flags & GENPD_FLAG_IRQ_SAFE) + +static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev, + struct generic_pm_domain *genpd) +{ + bool ret; + + ret = pm_runtime_is_irq_safe(dev) && !genpd_is_irq_safe(genpd); + + /* Warn once for each IRQ safe dev in no sleep domain */ + if (ret) + dev_warn_once(dev, "PM domain %s will not be powered off\n", + genpd->name); + + return ret; +} + /* * Get the generic PM domain for a particular struct device. 
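The genpd_lock_ops table introduced above is a small strategy pattern: the backend (mutex or spinlock) is chosen once at init time, and every call site dispatches through the genpd_lock*() macros. A stand-alone sketch of the same dispatch shape; all names here are illustrative, not kernel API:

#include <stdio.h>

struct domain;

struct lock_ops {
    void (*lock)(struct domain *d);
    void (*unlock)(struct domain *d);
};

struct domain {
    const char *name;
    int irq_safe;               /* selects the ops below */
    const struct lock_ops *ops;
};

static void mutex_lock_op(struct domain *d)   { printf("%s: mutex lock\n", d->name); }
static void mutex_unlock_op(struct domain *d) { printf("%s: mutex unlock\n", d->name); }
static void spin_lock_op(struct domain *d)    { printf("%s: spinlock, IRQs off\n", d->name); }
static void spin_unlock_op(struct domain *d)  { printf("%s: spinlock, IRQs restored\n", d->name); }

static const struct lock_ops mutex_ops = { mutex_lock_op, mutex_unlock_op };
static const struct lock_ops spin_ops  = { spin_lock_op, spin_unlock_op };

/* Mirrors genpd_lock_init(): pick the backend once, at init time. */
static void domain_lock_init(struct domain *d)
{
    d->ops = d->irq_safe ? &spin_ops : &mutex_ops;
}

#define domain_lock(d)   (d)->ops->lock(d)
#define domain_unlock(d) (d)->ops->unlock(d)

int main(void)
{
    struct domain d = { .name = "mydomain", .irq_safe = 1 };

    domain_lock_init(&d);
    domain_lock(&d);
    /* ... critical section ... */
    domain_unlock(&d);
    return 0;
}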
* This validates the struct device pointer, the PM domain pointer, @@ -200,9 +299,9 @@ static int genpd_poweron(struct generic_pm_domain *genpd, unsigned int depth) genpd_sd_counter_inc(master); - mutex_lock_nested(&master->lock, depth + 1); + genpd_lock_nested(master, depth + 1); ret = genpd_poweron(master, depth + 1); - mutex_unlock(&master->lock); + genpd_unlock(master); if (ret) { genpd_sd_counter_dec(master); @@ -255,9 +354,9 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb, spin_unlock_irq(&dev->power.lock); if (!IS_ERR(genpd)) { - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd->max_off_time_changed = true; - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } dev = dev->parent; @@ -303,7 +402,12 @@ static int genpd_poweroff(struct generic_pm_domain *genpd, bool is_async) if (stat > PM_QOS_FLAGS_NONE) return -EBUSY; - if (!pm_runtime_suspended(pdd->dev) || pdd->dev->power.irq_safe) + /* + * Do not allow PM domain to be powered off, when an IRQ safe + * device is part of a non-IRQ safe domain. + */ + if (!pm_runtime_suspended(pdd->dev) || + irq_safe_dev_in_no_sleep_domain(pdd->dev, genpd)) not_suspended++; } @@ -354,9 +458,9 @@ static void genpd_power_off_work_fn(struct work_struct *work) genpd = container_of(work, struct generic_pm_domain, power_off_work); - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd_poweroff(genpd, true); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } /** @@ -466,15 +570,15 @@ static int genpd_runtime_suspend(struct device *dev) } /* - * If power.irq_safe is set, this routine will be run with interrupts - * off, so it can't use mutexes. + * If power.irq_safe is set, this routine may be run with + * IRQs disabled, so suspend only if the PM domain also is irq_safe. */ - if (dev->power.irq_safe) + if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) return 0; - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd_poweroff(genpd, false); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); return 0; } @@ -503,15 +607,18 @@ static int genpd_runtime_resume(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - /* If power.irq_safe, the PM domain is never powered off. */ - if (dev->power.irq_safe) { + /* + * As we don't power off a non IRQ safe domain, which holds + * an IRQ safe device, we don't need to restore power to it. 
+ */ + if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) { timed = false; goto out; } - mutex_lock(&genpd->lock); + genpd_lock(genpd); ret = genpd_poweron(genpd, 0); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); if (ret) return ret; @@ -546,10 +653,11 @@ static int genpd_runtime_resume(struct device *dev) err_stop: genpd_stop_dev(genpd, dev); err_poweroff: - if (!dev->power.irq_safe) { - mutex_lock(&genpd->lock); + if (!pm_runtime_is_irq_safe(dev) || + (pm_runtime_is_irq_safe(dev) && genpd_is_irq_safe(genpd))) { + genpd_lock(genpd); genpd_poweroff(genpd, 0); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } return ret; @@ -732,20 +840,20 @@ static int pm_genpd_prepare(struct device *dev) if (resume_needed(dev, genpd)) pm_runtime_resume(dev); - mutex_lock(&genpd->lock); + genpd_lock(genpd); if (genpd->prepared_count++ == 0) genpd->suspended_count = 0; - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); ret = pm_generic_prepare(dev); if (ret) { - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd->prepared_count--; - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } return ret; @@ -936,13 +1044,13 @@ static void pm_genpd_complete(struct device *dev) pm_generic_complete(dev); - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd->prepared_count--; if (!genpd->prepared_count) genpd_queue_power_off_work(genpd); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } /** @@ -1071,7 +1179,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, if (IS_ERR(gpd_data)) return PTR_ERR(gpd_data); - mutex_lock(&genpd->lock); + genpd_lock(genpd); if (genpd->prepared_count > 0) { ret = -EAGAIN; @@ -1088,7 +1196,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, list_add_tail(&gpd_data->base.list_node, &genpd->dev_list); out: - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); if (ret) genpd_free_dev_data(dev, gpd_data); @@ -1130,7 +1238,7 @@ static int genpd_remove_device(struct generic_pm_domain *genpd, gpd_data = to_gpd_data(pdd); dev_pm_qos_remove_notifier(dev, &gpd_data->nb); - mutex_lock(&genpd->lock); + genpd_lock(genpd); if (genpd->prepared_count > 0) { ret = -EAGAIN; @@ -1145,14 +1253,14 @@ static int genpd_remove_device(struct generic_pm_domain *genpd, list_del_init(&pdd->list_node); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); genpd_free_dev_data(dev, gpd_data); return 0; out: - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); dev_pm_qos_add_notifier(dev, &gpd_data->nb); return ret; @@ -1183,12 +1291,23 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd, || genpd == subdomain) return -EINVAL; + /* + * If the domain can be powered on/off in an IRQ safe + * context, ensure that the subdomain can also be + * powered on/off in that context. 
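Marking a domain IRQ safe is the provider's opt-in: setting GENPD_FLAG_IRQ_SAFE before pm_genpd_init() is what makes genpd_lock_init() (further down in this diff) pick the spinlock ops, so the domain can be powered on and off from atomic context. A hedged provider-side sketch; the domain name and callbacks are hypothetical:

#include <linux/pm_domain.h>

static int my_pd_power_on(struct generic_pm_domain *domain)
{
    /* May now be called with IRQs disabled; must not sleep. */
    return 0;
}

static int my_pd_power_off(struct generic_pm_domain *domain)
{
    return 0;
}

static struct generic_pm_domain my_pd = {
    .name      = "mydomain",
    .power_on  = my_pd_power_on,
    .power_off = my_pd_power_off,
};

static int my_pd_register(void)
{
    my_pd.flags |= GENPD_FLAG_IRQ_SAFE;
    return pm_genpd_init(&my_pd, NULL, true /* start powered off */);
}

Note the asymmetry enforced just below in genpd_add_subdomain(): an IRQ-safe subdomain requires an IRQ-safe parent, while an IRQ-safe device inside a non-IRQ-safe domain is tolerated but keeps that domain powered on (irq_safe_dev_in_no_sleep_domain()).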
+ */ + if (!genpd_is_irq_safe(genpd) && genpd_is_irq_safe(subdomain)) { + WARN(1, "Parent %s of subdomain %s must be IRQ safe\n", + genpd->name, subdomain->name); + return -EINVAL; + } + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) return -ENOMEM; - mutex_lock(&subdomain->lock); - mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); + genpd_lock(subdomain); + genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING); if (genpd->status == GPD_STATE_POWER_OFF && subdomain->status != GPD_STATE_POWER_OFF) { @@ -1211,8 +1330,8 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd, genpd_sd_counter_inc(genpd); out: - mutex_unlock(&genpd->lock); - mutex_unlock(&subdomain->lock); + genpd_unlock(genpd); + genpd_unlock(subdomain); if (ret) kfree(link); return ret; @@ -1250,8 +1369,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(subdomain)) return -EINVAL; - mutex_lock(&subdomain->lock); - mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); + genpd_lock(subdomain); + genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING); if (!list_empty(&subdomain->master_links) || subdomain->device_count) { pr_warn("%s: unable to remove subdomain %s\n", genpd->name, @@ -1275,13 +1394,39 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, } out: - mutex_unlock(&genpd->lock); - mutex_unlock(&subdomain->lock); + genpd_unlock(genpd); + genpd_unlock(subdomain); return ret; } EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain); +static int genpd_set_default_power_state(struct generic_pm_domain *genpd) +{ + struct genpd_power_state *state; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + genpd->states = state; + genpd->state_count = 1; + genpd->free = state; + + return 0; +} + +static void genpd_lock_init(struct generic_pm_domain *genpd) +{ + if (genpd->flags & GENPD_FLAG_IRQ_SAFE) { + spin_lock_init(&genpd->slock); + genpd->lock_ops = &genpd_spin_ops; + } else { + mutex_init(&genpd->mlock); + genpd->lock_ops = &genpd_mtx_ops; + } +} + /** * pm_genpd_init - Initialize a generic I/O PM domain object. * @genpd: PM domain object to initialize. 
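genpd_set_default_power_state() above replaces the old fixed-size state array: a provider that declares no states now gets one dynamically allocated "off" state, freed again in genpd_remove() via genpd->free. Providers with DT-described idle states can instead fill genpd->states before init using of_genpd_parse_idle_states(), added later in this diff. Continuing the provider sketch above, a hedged fragment (np is assumed to be the provider's device node; error handling trimmed):

    struct genpd_power_state *states;
    int nr_states;
    int ret;

    ret = of_genpd_parse_idle_states(np, &states, &nr_states);
    if (ret == 0) {
        /* The parser allocates the array; the genpd takes it over. */
        my_pd.states = states;
        my_pd.state_count = nr_states;
    }

    ret = pm_genpd_init(&my_pd, NULL, true);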
@@ -1293,13 +1438,15 @@ EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain); int pm_genpd_init(struct generic_pm_domain *genpd, struct dev_power_governor *gov, bool is_off) { + int ret; + if (IS_ERR_OR_NULL(genpd)) return -EINVAL; INIT_LIST_HEAD(&genpd->master_links); INIT_LIST_HEAD(&genpd->slave_links); INIT_LIST_HEAD(&genpd->dev_list); - mutex_init(&genpd->lock); + genpd_lock_init(genpd); genpd->gov = gov; INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); atomic_set(&genpd->sd_count, 0); @@ -1325,19 +1472,12 @@ int pm_genpd_init(struct generic_pm_domain *genpd, genpd->dev_ops.start = pm_clk_resume; } - if (genpd->state_idx >= GENPD_MAX_NUM_STATES) { - pr_warn("Initial state index out of bounds.\n"); - genpd->state_idx = GENPD_MAX_NUM_STATES - 1; - } - - if (genpd->state_count > GENPD_MAX_NUM_STATES) { - pr_warn("Limiting states to %d\n", GENPD_MAX_NUM_STATES); - genpd->state_count = GENPD_MAX_NUM_STATES; - } - /* Use only one "off" state if there were no states declared */ - if (genpd->state_count == 0) - genpd->state_count = 1; + if (genpd->state_count == 0) { + ret = genpd_set_default_power_state(genpd); + if (ret) + return ret; + } mutex_lock(&gpd_list_lock); list_add(&genpd->gpd_list_node, &gpd_list); @@ -1354,16 +1494,16 @@ static int genpd_remove(struct generic_pm_domain *genpd) if (IS_ERR_OR_NULL(genpd)) return -EINVAL; - mutex_lock(&genpd->lock); + genpd_lock(genpd); if (genpd->has_provider) { - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); pr_err("Provider present, unable to remove %s\n", genpd->name); return -EBUSY; } if (!list_empty(&genpd->master_links) || genpd->device_count) { - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); pr_err("%s: unable to remove %s\n", __func__, genpd->name); return -EBUSY; } @@ -1375,8 +1515,9 @@ static int genpd_remove(struct generic_pm_domain *genpd) } list_del(&genpd->gpd_list_node); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); cancel_work_sync(&genpd->power_off_work); + kfree(genpd->free); pr_debug("%s: removed %s\n", __func__, genpd->name); return 0; @@ -1890,21 +2031,117 @@ int genpd_dev_pm_attach(struct device *dev) mutex_unlock(&gpd_list_lock); if (ret < 0) { - dev_err(dev, "failed to add to PM domain %s: %d", - pd->name, ret); + if (ret != -EPROBE_DEFER) + dev_err(dev, "failed to add to PM domain %s: %d", + pd->name, ret); goto out; } dev->pm_domain->detach = genpd_dev_pm_detach; dev->pm_domain->sync = genpd_dev_pm_sync; - mutex_lock(&pd->lock); + genpd_lock(pd); ret = genpd_poweron(pd, 0); - mutex_unlock(&pd->lock); + genpd_unlock(pd); out: return ret ? 
-EPROBE_DEFER : 0; } EXPORT_SYMBOL_GPL(genpd_dev_pm_attach); + +static const struct of_device_id idle_state_match[] = { + { .compatible = "domain-idle-state", }, + { } +}; + +static int genpd_parse_state(struct genpd_power_state *genpd_state, + struct device_node *state_node) +{ + int err; + u32 residency; + u32 entry_latency, exit_latency; + const struct of_device_id *match_id; + + match_id = of_match_node(idle_state_match, state_node); + if (!match_id) + return -EINVAL; + + err = of_property_read_u32(state_node, "entry-latency-us", + &entry_latency); + if (err) { + pr_debug(" * %s missing entry-latency-us property\n", + state_node->full_name); + return -EINVAL; + } + + err = of_property_read_u32(state_node, "exit-latency-us", + &exit_latency); + if (err) { + pr_debug(" * %s missing exit-latency-us property\n", + state_node->full_name); + return -EINVAL; + } + + err = of_property_read_u32(state_node, "min-residency-us", &residency); + if (!err) + genpd_state->residency_ns = 1000 * residency; + + genpd_state->power_on_latency_ns = 1000 * exit_latency; + genpd_state->power_off_latency_ns = 1000 * entry_latency; + genpd_state->fwnode = &state_node->fwnode; + + return 0; +} + +/** + * of_genpd_parse_idle_states: Return array of idle states for the genpd. + * + * @dn: The genpd device node + * @states: The pointer to which the state array will be saved. + * @n: The count of elements in the array returned from this function. + * + * Returns the device states parsed from the OF node. The memory for the states + * is allocated by this function and is the responsibility of the caller to + * free the memory after use. + */ +int of_genpd_parse_idle_states(struct device_node *dn, + struct genpd_power_state **states, int *n) +{ + struct genpd_power_state *st; + struct device_node *np; + int i = 0; + int err, ret; + int count; + struct of_phandle_iterator it; + + count = of_count_phandle_with_args(dn, "domain-idle-states", NULL); + if (count <= 0) + return -EINVAL; + + st = kcalloc(count, sizeof(*st), GFP_KERNEL); + if (!st) + return -ENOMEM; + + /* Loop over the phandles until all the requested entry is found */ + of_for_each_phandle(&it, err, dn, "domain-idle-states", NULL, 0) { + np = it.node; + ret = genpd_parse_state(&st[i++], np); + if (ret) { + pr_err + ("Parsing idle state node %s failed with err %d\n", + np->full_name, ret); + of_node_put(np); + kfree(st); + return ret; + } + } + + *n = count; + *states = st; + + return 0; +} +EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states); + #endif /* CONFIG_PM_GENERIC_DOMAINS_OF */ @@ -1958,7 +2195,7 @@ static int pm_genpd_summary_one(struct seq_file *s, char state[16]; int ret; - ret = mutex_lock_interruptible(&genpd->lock); + ret = genpd_lock_interruptible(genpd); if (ret) return -ERESTARTSYS; @@ -1984,7 +2221,9 @@ static int pm_genpd_summary_one(struct seq_file *s, } list_for_each_entry(pm_data, &genpd->dev_list, list_node) { - kobj_path = kobject_get_path(&pm_data->dev->kobj, GFP_KERNEL); + kobj_path = kobject_get_path(&pm_data->dev->kobj, + genpd_is_irq_safe(genpd) ? 
+ GFP_ATOMIC : GFP_KERNEL); if (kobj_path == NULL) continue; @@ -1995,7 +2234,7 @@ static int pm_genpd_summary_one(struct seq_file *s, seq_puts(s, "\n"); exit: - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); return 0; } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2932a5b..eb474c8 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1460,10 +1460,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) dpm_watchdog_clear(&wd); Complete: - complete_all(&dev->power.completion); if (error) async_error = error; + complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index 4c7c6da..35ff062 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -93,6 +93,8 @@ struct opp_table *_find_opp_table(struct device *dev) * Return: voltage in micro volt corresponding to the opp, else * return 0 * + * This is useful only for devices with single power supply. + * * Locking: This function must be called under rcu_read_lock(). opp is a rcu * protected pointer. This means that opp which could have been fetched by * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are @@ -112,7 +114,7 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp) if (IS_ERR_OR_NULL(tmp_opp)) pr_err("%s: Invalid parameters\n", __func__); else - v = tmp_opp->u_volt; + v = tmp_opp->supplies[0].u_volt; return v; } @@ -210,6 +212,24 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev) } EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency); +static int _get_regulator_count(struct device *dev) +{ + struct opp_table *opp_table; + int count; + + rcu_read_lock(); + + opp_table = _find_opp_table(dev); + if (!IS_ERR(opp_table)) + count = opp_table->regulator_count; + else + count = 0; + + rcu_read_unlock(); + + return count; +} + /** * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds * @dev: device for which we do this operation @@ -222,34 +242,51 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) { struct opp_table *opp_table; struct dev_pm_opp *opp; - struct regulator *reg; + struct regulator *reg, **regulators; unsigned long latency_ns = 0; - unsigned long min_uV = ~0, max_uV = 0; - int ret; + int ret, i, count; + struct { + unsigned long min; + unsigned long max; + } *uV; + + count = _get_regulator_count(dev); + + /* Regulator may not be required for the device */ + if (!count) + return 0; + + regulators = kmalloc_array(count, sizeof(*regulators), GFP_KERNEL); + if (!regulators) + return 0; + + uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL); + if (!uV) + goto free_regulators; rcu_read_lock(); opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) { rcu_read_unlock(); - return 0; + goto free_uV; } - reg = opp_table->regulator; - if (IS_ERR(reg)) { - /* Regulator may not be required for device */ - rcu_read_unlock(); - return 0; - } + memcpy(regulators, opp_table->regulators, count * sizeof(*regulators)); - list_for_each_entry_rcu(opp, &opp_table->opp_list, node) { - if (!opp->available) - continue; + for (i = 0; i < count; i++) { + uV[i].min = ~0; + uV[i].max = 0; + + list_for_each_entry_rcu(opp, &opp_table->opp_list, node) { + if (!opp->available) + continue; - if (opp->u_volt_min < min_uV) - min_uV = opp->u_volt_min; - if (opp->u_volt_max > max_uV) - max_uV = opp->u_volt_max; + if (opp->supplies[i].u_volt_min < uV[i].min) + uV[i].min = 
opp->supplies[i].u_volt_min; + if (opp->supplies[i].u_volt_max > uV[i].max) + uV[i].max = opp->supplies[i].u_volt_max; + } } rcu_read_unlock(); @@ -258,9 +295,16 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) * The caller needs to ensure that opp_table (and hence the regulator) * isn't freed, while we are executing this routine. */ - ret = regulator_set_voltage_time(reg, min_uV, max_uV); - if (ret > 0) - latency_ns = ret * 1000; + for (i = 0; reg = regulators[i], i < count; i++) { + ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max); + if (ret > 0) + latency_ns += ret * 1000; + } + +free_uV: + kfree(uV); +free_regulators: + kfree(regulators); return latency_ns; } @@ -542,8 +586,7 @@ unlock: } static int _set_opp_voltage(struct device *dev, struct regulator *reg, - unsigned long u_volt, unsigned long u_volt_min, - unsigned long u_volt_max) + struct dev_pm_opp_supply *supply) { int ret; @@ -554,14 +597,78 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg, return 0; } - dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min, - u_volt, u_volt_max); + dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, + supply->u_volt_min, supply->u_volt, supply->u_volt_max); - ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt, - u_volt_max); + ret = regulator_set_voltage_triplet(reg, supply->u_volt_min, + supply->u_volt, supply->u_volt_max); if (ret) dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n", - __func__, u_volt_min, u_volt, u_volt_max, ret); + __func__, supply->u_volt_min, supply->u_volt, + supply->u_volt_max, ret); + + return ret; +} + +static inline int +_generic_set_opp_clk_only(struct device *dev, struct clk *clk, + unsigned long old_freq, unsigned long freq) +{ + int ret; + + ret = clk_set_rate(clk, freq); + if (ret) { + dev_err(dev, "%s: failed to set clock rate: %d\n", __func__, + ret); + } + + return ret; +} + +static int _generic_set_opp(struct dev_pm_set_opp_data *data) +{ + struct dev_pm_opp_supply *old_supply = data->old_opp.supplies; + struct dev_pm_opp_supply *new_supply = data->new_opp.supplies; + unsigned long old_freq = data->old_opp.rate, freq = data->new_opp.rate; + struct regulator *reg = data->regulators[0]; + struct device *dev= data->dev; + int ret; + + /* This function only supports single regulator per device */ + if (WARN_ON(data->regulator_count > 1)) { + dev_err(dev, "multiple regulators are not supported\n"); + return -EINVAL; + } + + /* Scaling up? Scale voltage before frequency */ + if (freq > old_freq) { + ret = _set_opp_voltage(dev, reg, new_supply); + if (ret) + goto restore_voltage; + } + + /* Change frequency */ + ret = _generic_set_opp_clk_only(dev, data->clk, old_freq, freq); + if (ret) + goto restore_voltage; + + /* Scaling down? 
Scale voltage after frequency */ + if (freq < old_freq) { + ret = _set_opp_voltage(dev, reg, new_supply); + if (ret) + goto restore_freq; + } + + return 0; + +restore_freq: + if (_generic_set_opp_clk_only(dev, data->clk, freq, old_freq)) + dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n", + __func__, old_freq); +restore_voltage: + /* This shouldn't harm even if the voltages weren't updated earlier */ + if (old_supply->u_volt) + _set_opp_voltage(dev, reg, old_supply); return ret; } @@ -579,12 +686,13 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg, int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) { struct opp_table *opp_table; + unsigned long freq, old_freq; + int (*set_opp)(struct dev_pm_set_opp_data *data); struct dev_pm_opp *old_opp, *opp; - struct regulator *reg; + struct regulator **regulators; + struct dev_pm_set_opp_data *data; struct clk *clk; - unsigned long freq, old_freq; - unsigned long u_volt, u_volt_min, u_volt_max; - int ret; + int ret, size; if (unlikely(!target_freq)) { dev_err(dev, "%s: Invalid target frequency %lu\n", __func__, @@ -633,55 +741,41 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) return ret; } - u_volt = opp->u_volt; - u_volt_min = opp->u_volt_min; - u_volt_max = opp->u_volt_max; + dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__, + old_freq, freq); - reg = opp_table->regulator; + regulators = opp_table->regulators; - rcu_read_unlock(); - - /* Scaling up? Scale voltage before frequency */ - if (freq > old_freq) { - ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min, - u_volt_max); - if (ret) - goto restore_voltage; - } - - /* Change frequency */ - - dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", - __func__, old_freq, freq); - - ret = clk_set_rate(clk, freq); - if (ret) { - dev_err(dev, "%s: failed to set clock rate: %d\n", __func__, - ret); - goto restore_voltage; + /* Only frequency scaling */ + if (!regulators) { + rcu_read_unlock(); + return _generic_set_opp_clk_only(dev, clk, old_freq, freq); } - /* Scaling down? 
Scale voltage after frequency */ - if (freq < old_freq) { - ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min, - u_volt_max); - if (ret) - goto restore_freq; - } + if (opp_table->set_opp) + set_opp = opp_table->set_opp; + else + set_opp = _generic_set_opp; + + data = opp_table->set_opp_data; + data->regulators = regulators; + data->regulator_count = opp_table->regulator_count; + data->clk = clk; + data->dev = dev; + + data->old_opp.rate = old_freq; + size = sizeof(*opp->supplies) * opp_table->regulator_count; + if (IS_ERR(old_opp)) + memset(data->old_opp.supplies, 0, size); + else + memcpy(data->old_opp.supplies, old_opp->supplies, size); - return 0; + data->new_opp.rate = freq; + memcpy(data->new_opp.supplies, opp->supplies, size); -restore_freq: - if (clk_set_rate(clk, old_freq)) - dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n", - __func__, old_freq); -restore_voltage: - /* This shouldn't harm even if the voltages weren't updated earlier */ - if (!IS_ERR(old_opp)) - _set_opp_voltage(dev, reg, old_opp->u_volt, - old_opp->u_volt_min, old_opp->u_volt_max); + rcu_read_unlock(); - return ret; + return set_opp(data); } EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate); @@ -764,9 +858,6 @@ static struct opp_table *_add_opp_table(struct device *dev) _of_init_opp_table(opp_table, dev); - /* Set regulator to a non-NULL error value */ - opp_table->regulator = ERR_PTR(-ENXIO); - /* Find clk for the device */ opp_table->clk = clk_get(dev, NULL); if (IS_ERR(opp_table->clk)) { @@ -815,7 +906,10 @@ static void _remove_opp_table(struct opp_table *opp_table) if (opp_table->prop_name) return; - if (!IS_ERR(opp_table->regulator)) + if (opp_table->regulators) + return; + + if (opp_table->set_opp) return; /* Release clk */ @@ -924,34 +1018,50 @@ struct dev_pm_opp *_allocate_opp(struct device *dev, struct opp_table **opp_table) { struct dev_pm_opp *opp; + int count, supply_size; + struct opp_table *table; - /* allocate new OPP node */ - opp = kzalloc(sizeof(*opp), GFP_KERNEL); - if (!opp) + table = _add_opp_table(dev); + if (!table) return NULL; - INIT_LIST_HEAD(&opp->node); + /* Allocate space for at least one supply */ + count = table->regulator_count ? 
table->regulator_count : 1; + supply_size = sizeof(*opp->supplies) * count; - *opp_table = _add_opp_table(dev); - if (!*opp_table) { - kfree(opp); + /* allocate new OPP node and supplies structures */ + opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL); + if (!opp) { + kfree(table); return NULL; } + /* Put the supplies at the end of the OPP structure as an empty array */ + opp->supplies = (struct dev_pm_opp_supply *)(opp + 1); + INIT_LIST_HEAD(&opp->node); + + *opp_table = table; + return opp; } static bool _opp_supported_by_regulators(struct dev_pm_opp *opp, struct opp_table *opp_table) { - struct regulator *reg = opp_table->regulator; - - if (!IS_ERR(reg) && - !regulator_is_supported_voltage(reg, opp->u_volt_min, - opp->u_volt_max)) { - pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n", - __func__, opp->u_volt_min, opp->u_volt_max); - return false; + struct regulator *reg; + int i; + + for (i = 0; i < opp_table->regulator_count; i++) { + reg = opp_table->regulators[i]; + + if (!regulator_is_supported_voltage(reg, + opp->supplies[i].u_volt_min, + opp->supplies[i].u_volt_max)) { + pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n", + __func__, opp->supplies[i].u_volt_min, + opp->supplies[i].u_volt_max); + return false; + } } return true; @@ -983,11 +1093,13 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, /* Duplicate OPPs */ dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n", - __func__, opp->rate, opp->u_volt, opp->available, - new_opp->rate, new_opp->u_volt, new_opp->available); + __func__, opp->rate, opp->supplies[0].u_volt, + opp->available, new_opp->rate, + new_opp->supplies[0].u_volt, new_opp->available); - return opp->available && new_opp->u_volt == opp->u_volt ? - 0 : -EEXIST; + /* Should we compare voltages for all regulators here ? */ + return opp->available && + new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? 0 : -EEXIST; } new_opp->opp_table = opp_table; @@ -1054,9 +1166,9 @@ int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt, /* populate the opp table */ new_opp->rate = freq; tol = u_volt * opp_table->voltage_tolerance_v1 / 100; - new_opp->u_volt = u_volt; - new_opp->u_volt_min = u_volt - tol; - new_opp->u_volt_max = u_volt + tol; + new_opp->supplies[0].u_volt = u_volt; + new_opp->supplies[0].u_volt_min = u_volt - tol; + new_opp->supplies[0].u_volt_max = u_volt + tol; new_opp->available = true; new_opp->dynamic = dynamic; @@ -1300,13 +1412,47 @@ unlock: } EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); +static int _allocate_set_opp_data(struct opp_table *opp_table) +{ + struct dev_pm_set_opp_data *data; + int len, count = opp_table->regulator_count; + + if (WARN_ON(!count)) + return -EINVAL; + + /* space for set_opp_data */ + len = sizeof(*data); + + /* space for old_opp.supplies and new_opp.supplies */ + len += 2 * sizeof(struct dev_pm_opp_supply) * count; + + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->old_opp.supplies = (void *)(data + 1); + data->new_opp.supplies = data->old_opp.supplies + count; + + opp_table->set_opp_data = data; + + return 0; +} + +static void _free_set_opp_data(struct opp_table *opp_table) +{ + kfree(opp_table->set_opp_data); + opp_table->set_opp_data = NULL; +} + /** - * dev_pm_opp_set_regulator() - Set regulator name for the device + * dev_pm_opp_set_regulators() - Set regulator names for the device * @dev: Device for which regulator name is being set. 
- * @name: Name of the regulator. + * @names: Array of pointers to the names of the regulator. + * @count: Number of regulators. * * In order to support OPP switching, OPP layer needs to know the name of the - * device's regulator, as the core would be required to switch voltages as well. + * device's regulators, as the core would be required to switch voltages as + * well. * * This must be called before any OPPs are initialized for the device. * @@ -1316,11 +1462,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); * that this function is *NOT* called under RCU protection or in contexts where * mutex cannot be locked. */ -int dev_pm_opp_set_regulator(struct device *dev, const char *name) +struct opp_table *dev_pm_opp_set_regulators(struct device *dev, + const char * const names[], + unsigned int count) { struct opp_table *opp_table; struct regulator *reg; - int ret; + int ret, i; mutex_lock(&opp_table_lock); @@ -1336,22 +1484,146 @@ int dev_pm_opp_set_regulator(struct device *dev, const char *name) goto err; } - /* Already have a regulator set */ - if (WARN_ON(!IS_ERR(opp_table->regulator))) { + /* Already have regulators set */ + if (opp_table->regulators) { ret = -EBUSY; goto err; } - /* Allocate the regulator */ - reg = regulator_get_optional(dev, name); - if (IS_ERR(reg)) { - ret = PTR_ERR(reg); - if (ret != -EPROBE_DEFER) - dev_err(dev, "%s: no regulator (%s) found: %d\n", - __func__, name, ret); + + opp_table->regulators = kmalloc_array(count, + sizeof(*opp_table->regulators), + GFP_KERNEL); + if (!opp_table->regulators) { + ret = -ENOMEM; + goto err; + } + + for (i = 0; i < count; i++) { + reg = regulator_get_optional(dev, names[i]); + if (IS_ERR(reg)) { + ret = PTR_ERR(reg); + if (ret != -EPROBE_DEFER) + dev_err(dev, "%s: no regulator (%s) found: %d\n", + __func__, names[i], ret); + goto free_regulators; + } + + opp_table->regulators[i] = reg; + } + + opp_table->regulator_count = count; + + /* Allocate block only once to pass to set_opp() routines */ + ret = _allocate_set_opp_data(opp_table); + if (ret) + goto free_regulators; + + mutex_unlock(&opp_table_lock); + return opp_table; + +free_regulators: + while (i != 0) + regulator_put(opp_table->regulators[--i]); + + kfree(opp_table->regulators); + opp_table->regulators = NULL; + opp_table->regulator_count = 0; +err: + _remove_opp_table(opp_table); +unlock: + mutex_unlock(&opp_table_lock); + + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators); + +/** + * dev_pm_opp_put_regulators() - Releases resources blocked for regulator + * @opp_table: OPP table returned from dev_pm_opp_set_regulators(). + * + * Locking: The internal opp_table and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks + * to keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex cannot be locked. 
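Consumer-side usage of the reworked API: dev_pm_opp_set_regulators() takes an array of supply names and returns the opp_table as a token, which dev_pm_opp_put_regulators() later releases, unlike the old per-device dev_pm_opp_set_regulator()/dev_pm_opp_put_regulator() pair. A sketch with made-up supply names; the call must come before any OPPs are added, and the put would normally live in the remove path:

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/pm_opp.h>

static const char * const my_reg_names[] = { "vdd", "vddio" };

static int my_probe_opp_setup(struct device *dev)
{
    struct opp_table *opp_table;

    opp_table = dev_pm_opp_set_regulators(dev, my_reg_names,
                                          ARRAY_SIZE(my_reg_names));
    if (IS_ERR(opp_table))
        return PTR_ERR(opp_table);

    /* ... register OPPs, call dev_pm_opp_set_rate(), ... */

    dev_pm_opp_put_regulators(opp_table);
    return 0;
}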
+ */ +void dev_pm_opp_put_regulators(struct opp_table *opp_table) +{ + int i; + + mutex_lock(&opp_table_lock); + + if (!opp_table->regulators) { + pr_err("%s: Doesn't have regulators set\n", __func__); + goto unlock; + } + + /* Make sure there are no concurrent readers while updating opp_table */ + WARN_ON(!list_empty(&opp_table->opp_list)); + + for (i = opp_table->regulator_count - 1; i >= 0; i--) + regulator_put(opp_table->regulators[i]); + + _free_set_opp_data(opp_table); + + kfree(opp_table->regulators); + opp_table->regulators = NULL; + opp_table->regulator_count = 0; + + /* Try freeing opp_table if this was the last blocking resource */ + _remove_opp_table(opp_table); + +unlock: + mutex_unlock(&opp_table_lock); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators); + +/** + * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper + * @dev: Device for which the helper is getting registered. + * @set_opp: Custom set OPP helper. + * + * This is useful to support complex platforms (like platforms with multiple + * regulators per device), instead of the generic OPP set rate helper. + * + * This must be called before any OPPs are initialized for the device. + * + * Locking: The internal opp_table and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks + * to keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex cannot be locked. + */ +int dev_pm_opp_register_set_opp_helper(struct device *dev, + int (*set_opp)(struct dev_pm_set_opp_data *data)) +{ + struct opp_table *opp_table; + int ret; + + if (!set_opp) + return -EINVAL; + + mutex_lock(&opp_table_lock); + + opp_table = _add_opp_table(dev); + if (!opp_table) { + ret = -ENOMEM; + goto unlock; + } + + /* This should be called before OPPs are initialized */ + if (WARN_ON(!list_empty(&opp_table->opp_list))) { + ret = -EBUSY; goto err; } - opp_table->regulator = reg; + /* Already have custom set_opp helper */ + if (WARN_ON(opp_table->set_opp)) { + ret = -EBUSY; + goto err; + } + + opp_table->set_opp = set_opp; mutex_unlock(&opp_table_lock); return 0; @@ -1363,11 +1635,12 @@ unlock: return ret; } -EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator); +EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper); /** - * dev_pm_opp_put_regulator() - Releases resources blocked for regulator - * @dev: Device for which regulator was set. + * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for + * set_opp helper + * @dev: Device for which custom set_opp helper has to be cleared. * * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks @@ -1375,7 +1648,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator); * that this function is *NOT* called under RCU protection or in contexts where * mutex cannot be locked. 
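The helper hook is the escape hatch for platforms with more than one regulator per device, since _generic_set_opp() above refuses regulator_count > 1. A hedged sketch of a platform callback that keeps the same ordering (voltage before frequency when scaling up, after it when scaling down); everything except the dev_pm_* API and the dev_pm_set_opp_data fields is hypothetical:

#include <linux/clk.h>
#include <linux/pm_opp.h>
#include <linux/regulator/consumer.h>

/* Hypothetical platform helper: program every supply to its new value,
 * in the order the regulators were registered. */
static int my_set_supplies(struct regulator **regulators,
                           struct dev_pm_opp_supply *supplies, int count)
{
    int i, ret;

    for (i = 0; i < count; i++) {
        ret = regulator_set_voltage_triplet(regulators[i],
                                            supplies[i].u_volt_min,
                                            supplies[i].u_volt,
                                            supplies[i].u_volt_max);
        if (ret)
            return ret;
    }
    return 0;
}

static int my_set_opp(struct dev_pm_set_opp_data *data)
{
    unsigned long old_freq = data->old_opp.rate;
    unsigned long freq = data->new_opp.rate;
    int ret;

    /* Scaling up? Raise the supplies before the clock. */
    if (freq > old_freq) {
        ret = my_set_supplies(data->regulators,
                              data->new_opp.supplies,
                              data->regulator_count);
        if (ret)
            return ret;
    }

    ret = clk_set_rate(data->clk, freq);
    if (ret)
        return ret;

    /* Scaling down? Lower the supplies after the clock. */
    if (freq < old_freq)
        ret = my_set_supplies(data->regulators,
                              data->new_opp.supplies,
                              data->regulator_count);

    return ret;
}

The callback is registered once in probe, before any OPPs exist, via dev_pm_opp_register_set_opp_helper(dev, my_set_opp).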
*/ -void dev_pm_opp_put_regulator(struct device *dev) +void dev_pm_opp_register_put_opp_helper(struct device *dev) { struct opp_table *opp_table; @@ -1389,16 +1662,16 @@ void dev_pm_opp_put_regulator(struct device *dev) goto unlock; } - if (IS_ERR(opp_table->regulator)) { - dev_err(dev, "%s: Doesn't have regulator set\n", __func__); + if (!opp_table->set_opp) { + dev_err(dev, "%s: Doesn't have custom set_opp helper set\n", + __func__); goto unlock; } /* Make sure there are no concurrent readers while updating opp_table */ WARN_ON(!list_empty(&opp_table->opp_list)); - regulator_put(opp_table->regulator); - opp_table->regulator = ERR_PTR(-ENXIO); + opp_table->set_opp = NULL; /* Try freeing opp_table if this was the last blocking resource */ _remove_opp_table(opp_table); @@ -1406,7 +1679,7 @@ void dev_pm_opp_put_regulator(struct device *dev) unlock: mutex_unlock(&opp_table_lock); } -EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator); +EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper); /** * dev_pm_opp_add() - Add an OPP table from a table definitions diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c index ef1ae6b..95f433d 100644 --- a/drivers/base/power/opp/debugfs.c +++ b/drivers/base/power/opp/debugfs.c @@ -15,6 +15,7 @@ #include <linux/err.h> #include <linux/init.h> #include <linux/limits.h> +#include <linux/slab.h> #include "opp.h" @@ -34,6 +35,46 @@ void opp_debug_remove_one(struct dev_pm_opp *opp) debugfs_remove_recursive(opp->dentry); } +static bool opp_debug_create_supplies(struct dev_pm_opp *opp, + struct opp_table *opp_table, + struct dentry *pdentry) +{ + struct dentry *d; + int i = 0; + char *name; + + /* Always create at least supply-0 directory */ + do { + name = kasprintf(GFP_KERNEL, "supply-%d", i); + + /* Create per-opp directory */ + d = debugfs_create_dir(name, pdentry); + + kfree(name); + + if (!d) + return false; + + if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, + &opp->supplies[i].u_volt)) + return false; + + if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, + &opp->supplies[i].u_volt_min)) + return false; + + if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, + &opp->supplies[i].u_volt_max)) + return false; + + if (!debugfs_create_ulong("u_amp", S_IRUGO, d, + &opp->supplies[i].u_amp)) + return false; + } while (++i < opp_table->regulator_count); + + return true; +} + int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) { struct dentry *pdentry = opp_table->dentry; @@ -63,16 +104,7 @@ int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate)) return -ENOMEM; - if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt)) - return -ENOMEM; - - if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min)) - return -ENOMEM; - - if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max)) - return -ENOMEM; - - if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp)) + if (!opp_debug_create_supplies(opp, opp_table, d)) return -ENOMEM; if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d, diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c index 5552211..3f7d259 100644 --- a/drivers/base/power/opp/of.c +++ b/drivers/base/power/opp/of.c @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/device.h> #include <linux/of.h> +#include <linux/slab.h> #include <linux/export.h> #include "opp.h" @@ -101,16 +102,16 @@ static bool _opp_is_supported(struct device *dev, struct opp_table 
*opp_table, return true; } -/* TODO: Support multiple regulators */ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, struct opp_table *opp_table) { - u32 microvolt[3] = {0}; - u32 val; - int count, ret; + u32 *microvolt, *microamp = NULL; + int supplies, vcount, icount, ret, i, j; struct property *prop = NULL; char name[NAME_MAX]; + supplies = opp_table->regulator_count ? opp_table->regulator_count : 1; + /* Search for "opp-microvolt-<name>" */ if (opp_table->prop_name) { snprintf(name, sizeof(name), "opp-microvolt-%s", @@ -128,34 +129,29 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, return 0; } - count = of_property_count_u32_elems(opp->np, name); - if (count < 0) { + vcount = of_property_count_u32_elems(opp->np, name); + if (vcount < 0) { dev_err(dev, "%s: Invalid %s property (%d)\n", - __func__, name, count); - return count; + __func__, name, vcount); + return vcount; } - /* There can be one or three elements here */ - if (count != 1 && count != 3) { - dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n", - __func__, name, count); + /* There can be one or three elements per supply */ + if (vcount != supplies && vcount != supplies * 3) { + dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n", + __func__, name, vcount, supplies); return -EINVAL; } - ret = of_property_read_u32_array(opp->np, name, microvolt, count); + microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL); + if (!microvolt) + return -ENOMEM; + + ret = of_property_read_u32_array(opp->np, name, microvolt, vcount); if (ret) { dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret); - return -EINVAL; - } - - opp->u_volt = microvolt[0]; - - if (count == 1) { - opp->u_volt_min = opp->u_volt; - opp->u_volt_max = opp->u_volt; - } else { - opp->u_volt_min = microvolt[1]; - opp->u_volt_max = microvolt[2]; + ret = -EINVAL; + goto free_microvolt; } /* Search for "opp-microamp-<name>" */ @@ -172,10 +168,59 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, prop = of_find_property(opp->np, name, NULL); } - if (prop && !of_property_read_u32(opp->np, name, &val)) - opp->u_amp = val; + if (prop) { + icount = of_property_count_u32_elems(opp->np, name); + if (icount < 0) { + dev_err(dev, "%s: Invalid %s property (%d)\n", __func__, + name, icount); + ret = icount; + goto free_microvolt; + } - return 0; + if (icount != supplies) { + dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n", + __func__, name, icount, supplies); + ret = -EINVAL; + goto free_microvolt; + } + + microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL); + if (!microamp) { + ret = -EINVAL; + goto free_microvolt; + } + + ret = of_property_read_u32_array(opp->np, name, microamp, + icount); + if (ret) { + dev_err(dev, "%s: error parsing %s: %d\n", __func__, + name, ret); + ret = -EINVAL; + goto free_microamp; + } + } + + for (i = 0, j = 0; i < supplies; i++) { + opp->supplies[i].u_volt = microvolt[j++]; + + if (vcount == supplies) { + opp->supplies[i].u_volt_min = opp->supplies[i].u_volt; + opp->supplies[i].u_volt_max = opp->supplies[i].u_volt; + } else { + opp->supplies[i].u_volt_min = microvolt[j++]; + opp->supplies[i].u_volt_max = microvolt[j++]; + } + + if (microamp) + opp->supplies[i].u_amp = microamp[i]; + } + +free_microamp: + kfree(microamp); +free_microvolt: + kfree(microvolt); + + return ret; } /** @@ -198,7 +243,7 @@ void dev_pm_opp_of_remove_table(struct device *dev) 
EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); /* Returns opp descriptor node for a device, caller must do of_node_put() */ -struct device_node *_of_get_opp_desc_node(struct device *dev) +static struct device_node *_of_get_opp_desc_node(struct device *dev) { /* * TODO: Support for multiple OPP tables. @@ -303,9 +348,9 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np) mutex_unlock(&opp_table_lock); pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n", - __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt, - new_opp->u_volt_min, new_opp->u_volt_max, - new_opp->clock_latency_ns); + __func__, new_opp->turbo, new_opp->rate, + new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min, + new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns); /* * Notify the changes in the availability of the operable @@ -562,7 +607,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, /* Get OPP descriptor node */ np = _of_get_opp_desc_node(cpu_dev); if (!np) { - dev_dbg(cpu_dev, "%s: Couldn't find cpu_dev node.\n", __func__); + dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__); return -ENOENT; } @@ -587,7 +632,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, /* Get OPP descriptor node */ tmp_np = _of_get_opp_desc_node(tcpu_dev); if (!tmp_np) { - dev_err(tcpu_dev, "%s: Couldn't find tcpu_dev node.\n", + dev_err(tcpu_dev, "%s: Couldn't find opp node.\n", __func__); ret = -ENOENT; goto put_cpu_node; diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h index fabd5ca..af9f2b8 100644 --- a/drivers/base/power/opp/opp.h +++ b/drivers/base/power/opp/opp.h @@ -61,10 +61,7 @@ extern struct list_head opp_tables; * @turbo: true if turbo (boost) OPP * @suspend: true if suspend OPP * @rate: Frequency in hertz - * @u_volt: Target voltage in microvolts corresponding to this OPP - * @u_volt_min: Minimum voltage in microvolts corresponding to this OPP - * @u_volt_max: Maximum voltage in microvolts corresponding to this OPP - * @u_amp: Maximum current drawn by the device in microamperes + * @supplies: Power supplies voltage/current values * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's * frequency from any other OPP's frequency. * @opp_table: points back to the opp_table struct this opp belongs to @@ -83,10 +80,8 @@ struct dev_pm_opp { bool suspend; unsigned long rate; - unsigned long u_volt; - unsigned long u_volt_min; - unsigned long u_volt_max; - unsigned long u_amp; + struct dev_pm_opp_supply *supplies; + unsigned long clock_latency_ns; struct opp_table *opp_table; @@ -144,7 +139,10 @@ enum opp_table_access { * @supported_hw_count: Number of elements in supported_hw array. * @prop_name: A name to postfix to many DT properties, while parsing them. * @clk: Device's clock handle - * @regulator: Supply regulator + * @regulators: Supply regulators + * @regulator_count: Number of power supply regulators + * @set_opp: Platform specific set_opp callback + * @set_opp_data: Data to be passed to set_opp callback * @dentry: debugfs dentry pointer of the real device directory (not links). * @dentry_name: Name of the real dentry. 
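The reworked opp_parse_supplies() dictates a strict layout for opp-microvolt once a device has multiple supplies: either one target value per supply, or a <target min max> triplet per supply, concatenated in regulator order (and one opp-microamp entry per supply, if present). A small stand-alone sketch of that unpacking rule, with hypothetical values:

#include <stdio.h>

struct supply { unsigned long u_volt, u_volt_min, u_volt_max; };

/* Mirrors the index walk in opp_parse_supplies(): vcount is either
 * `supplies` (targets only) or `supplies * 3` (one triplet each). */
static void unpack(const unsigned long *microvolt, int vcount,
                   struct supply *out, int supplies)
{
    int i, j = 0;

    for (i = 0; i < supplies; i++) {
        out[i].u_volt = microvolt[j++];
        if (vcount == supplies) {
            out[i].u_volt_min = out[i].u_volt;
            out[i].u_volt_max = out[i].u_volt;
        } else {
            out[i].u_volt_min = microvolt[j++];
            out[i].u_volt_max = microvolt[j++];
        }
    }
}

int main(void)
{
    /* e.g. opp-microvolt = <975000 970000 985000>,
     *                      <1800000 1790000 1810000>; */
    const unsigned long mv[] = { 975000, 970000, 985000,
                                 1800000, 1790000, 1810000 };
    struct supply s[2];

    unpack(mv, 6, s, 2);
    printf("supply0 target %lu uV, supply1 max %lu uV\n",
           s[0].u_volt, s[1].u_volt_max);
    return 0;
}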
* @@ -179,7 +177,11 @@ struct opp_table { unsigned int supported_hw_count; const char *prop_name; struct clk *clk; - struct regulator *regulator; + struct regulator **regulators; + unsigned int regulator_count; + + int (*set_opp)(struct dev_pm_set_opp_data *data); + struct dev_pm_set_opp_data *set_opp_data; #ifdef CONFIG_DEBUG_FS struct dentry *dentry; @@ -190,7 +192,6 @@ struct opp_table { /* Routines internal to opp core */ struct opp_table *_find_opp_table(struct device *dev); struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table); -struct device_node *_of_get_opp_desc_node(struct device *dev); void _dev_pm_opp_remove_table(struct device *dev, bool remove_all); struct dev_pm_opp *_allocate_opp(struct device *dev, struct opp_table **opp_table); int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table); diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 50e30e7..a84332a 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -21,14 +21,22 @@ extern void pm_runtime_init(struct device *dev); extern void pm_runtime_reinit(struct device *dev); extern void pm_runtime_remove(struct device *dev); +#define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) +#define WAKE_IRQ_DEDICATED_MANAGED BIT(1) +#define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ + WAKE_IRQ_DEDICATED_MANAGED) + struct wake_irq { struct device *dev; + unsigned int status; int irq; - bool dedicated_irq:1; }; extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); +extern void dev_pm_enable_wake_irq_check(struct device *dev, + bool can_change_status); +extern void dev_pm_disable_wake_irq_check(struct device *dev); #ifdef CONFIG_PM_SLEEP @@ -104,6 +112,15 @@ static inline void dev_pm_disarm_wake_irq(struct wake_irq *wirq) { } +static inline void dev_pm_enable_wake_irq_check(struct device *dev, + bool can_change_status) +{ +} + +static inline void dev_pm_disable_wake_irq_check(struct device *dev) +{ +} + #endif #ifdef CONFIG_PM_SLEEP diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index 7f3646e..58fcc75 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -856,7 +856,10 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) struct dev_pm_qos_request *req; if (val < 0) { - ret = -EINVAL; + if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) + ret = 0; + else + ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); @@ -883,6 +886,7 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) mutex_unlock(&dev_pm_qos_mtx); return ret; } +EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 82a081e..26856d0 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -241,7 +241,8 @@ static int rpm_check_suspend_allowed(struct device *dev) retval = -EACCES; else if (atomic_read(&dev->power.usage_count) > 0) retval = -EAGAIN; - else if (!pm_children_suspended(dev)) + else if (!dev->power.ignore_children && + atomic_read(&dev->power.child_count)) retval = -EBUSY; /* Pending resume requests take precedence over suspends. 
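The rpm_check_suspend_allowed() change above (and the matching __pm_runtime_set_status() check below) makes the parent/child rule explicit: a device with an active child can neither runtime suspend nor be forced to RPM_SUSPENDED unless it ignores its children. The opt-out is a one-liner in the parent driver; pm_suspend_ignore_children() is an existing helper, not part of this diff:

    /* Let this device suspend independently of its children; without
     * this, an active child makes the checks above return -EBUSY. */
    pm_suspend_ignore_children(dev, true);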
*/ @@ -515,7 +516,7 @@ static int rpm_suspend(struct device *dev, int rpmflags) callback = RPM_GET_CALLBACK(dev, runtime_suspend); - dev_pm_enable_wake_irq(dev); + dev_pm_enable_wake_irq_check(dev, true); retval = rpm_callback(callback, dev); if (retval) goto fail; @@ -554,7 +555,7 @@ static int rpm_suspend(struct device *dev, int rpmflags) return retval; fail: - dev_pm_disable_wake_irq(dev); + dev_pm_disable_wake_irq_check(dev); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); @@ -712,8 +713,8 @@ static int rpm_resume(struct device *dev, int rpmflags) spin_lock(&parent->power.lock); /* - * We can resume if the parent's runtime PM is disabled or it - * is set to ignore children. + * Resume the parent if it has runtime PM enabled and not been + * set to ignore its children. */ if (!parent->power.disable_depth && !parent->power.ignore_children) { @@ -737,12 +738,12 @@ static int rpm_resume(struct device *dev, int rpmflags) callback = RPM_GET_CALLBACK(dev, runtime_resume); - dev_pm_disable_wake_irq(dev); + dev_pm_disable_wake_irq_check(dev); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); - dev_pm_enable_wake_irq(dev); + dev_pm_enable_wake_irq_check(dev, false); } else { no_callback: __update_runtime_status(dev, RPM_ACTIVE); @@ -1027,7 +1028,17 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) goto out_set; if (status == RPM_SUSPENDED) { - /* It always is possible to set the status to 'suspended'. */ + /* + * It is invalid to suspend a device with an active child, + * unless it has been set to ignore its children. + */ + if (!dev->power.ignore_children && + atomic_read(&dev->power.child_count)) { + dev_err(dev, "runtime PM trying to suspend device but active child\n"); + error = -EBUSY; + goto out; + } + if (parent) { atomic_add_unless(&parent->power.child_count, -1, 0); notify_parent = !parent->power.ignore_children; @@ -1478,6 +1489,16 @@ int pm_runtime_force_suspend(struct device *dev) if (ret) goto err; + /* + * Increase the runtime PM usage count for the device's parent, in case + * when we find the device being used when system suspend was invoked. + * This informs pm_runtime_force_resume() to resume the parent + * immediately, which is needed to be able to resume its children, + * when not deferring the resume to be managed via runtime PM. + */ + if (dev->parent && atomic_read(&dev->power.usage_count) > 1) + pm_runtime_get_noresume(dev->parent); + pm_runtime_set_suspended(dev); return 0; err: @@ -1487,16 +1508,20 @@ err: EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); /** - * pm_runtime_force_resume - Force a device into resume state. + * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * * Prior invoking this function we expect the user to have brought the device * into low power state by a call to pm_runtime_force_suspend(). Here we reverse - * those actions and brings the device into full power. We update the runtime PM - * status and re-enables runtime PM. + * those actions and brings the device into full power, if it is expected to be + * used on system resume. To distinguish that, we check whether the runtime PM + * usage count is greater than 1 (the PM core increases the usage count in the + * system PM prepare phase), as that indicates a real user (such as a subsystem, + * driver, userspace, etc.) is using it. 
If that is the case, the device is + * expected to be used on system resume as well, so we resume it. + * Otherwise, we defer the resume to be managed via runtime PM. * - * Typically this function may be invoked from a system resume callback to make - * sure the device is put into full power state. + * Typically this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { @@ -1513,6 +1538,17 @@ int pm_runtime_force_resume(struct device *dev) if (!pm_runtime_status_suspended(dev)) goto out; + /* + * Decrease the parent's runtime PM usage count, if we increased it + * during system suspend in pm_runtime_force_suspend(). + */ + if (atomic_read(&dev->power.usage_count) > 1) { + if (dev->parent) + pm_runtime_put_noidle(dev->parent); + } else { + goto out; + } + ret = pm_runtime_set_active(dev); if (ret) goto out; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index a7b4679..33b4b90 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -263,7 +263,11 @@ static ssize_t pm_qos_latency_tolerance_store(struct device *dev, s32 value; int ret; - if (kstrtos32(buf, 0, &value)) { + if (kstrtos32(buf, 0, &value) == 0) { + /* Users can't write negative values directly */ + if (value < 0) + return -EINVAL; + } else { if (!strcmp(buf, "auto") || !strcmp(buf, "auto\n")) value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; else if (!strcmp(buf, "any") || !strcmp(buf, "any\n")) diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index 0d77cd6..404d94c 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -110,8 +110,10 @@ void dev_pm_clear_wake_irq(struct device *dev) dev->power.wakeirq = NULL; spin_unlock_irqrestore(&dev->power.lock, flags); - if (wirq->dedicated_irq) + if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED) { free_irq(wirq->irq, wirq); + wirq->status &= ~WAKE_IRQ_DEDICATED_MASK; + } kfree(wirq); } EXPORT_SYMBOL_GPL(dev_pm_clear_wake_irq); @@ -179,7 +181,6 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) wirq->dev = dev; wirq->irq = irq; - wirq->dedicated_irq = true; irq_set_status_flags(irq, IRQ_NOAUTOEN); /* @@ -195,6 +196,8 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) if (err) goto err_free_irq; + wirq->status = WAKE_IRQ_DEDICATED_ALLOCATED; + return err; err_free_irq: @@ -210,9 +213,9 @@ EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq); * dev_pm_enable_wake_irq - Enable device wake-up interrupt * @dev: Device * - * Called from the bus code or the device driver for - * runtime_suspend() to enable the wake-up interrupt while - * the device is running. + * Optionally called from the bus code or the device driver for + * runtime_resume() to override the PM runtime core managed wake-up + * interrupt handling to enable the wake-up interrupt. * * Note that for runtime_suspend() the wake-up interrupts * should be unconditionally enabled unlike for suspend() @@ -222,7 +225,7 @@ void dev_pm_enable_wake_irq(struct device *dev) { struct wake_irq *wirq = dev->power.wakeirq; - if (wirq && wirq->dedicated_irq) + if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED)) enable_irq(wirq->irq); } EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq); @@ -231,20 +234,73 @@ EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq); * dev_pm_disable_wake_irq - Disable device wake-up interrupt * @dev: Device * - * Called from the bus code or the device driver for - * runtime_resume() to disable the wake-up interrupt while - * the device is running.
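The consumer side of the dedicated wake IRQ API is unchanged by this rework; for orientation, a typical setup looks roughly like this (hypothetical driver, names invented for illustration):

	static int foo_probe(struct platform_device *pdev)
	{
		struct device *dev = &pdev->dev;
		int irq, ret;

		/* The second interrupt is the dedicated wake-up interrupt. */
		irq = platform_get_irq(pdev, 1);
		if (irq < 0)
			return irq;

		device_init_wakeup(dev, true);

		/*
		 * The wake IRQ is requested with IRQ_NOAUTOEN, i.e. it starts
		 * disabled; with this patch the runtime PM core enables it
		 * lazily on the first rpm_suspend() via
		 * dev_pm_enable_wake_irq_check().
		 */
		ret = dev_pm_set_dedicated_wake_irq(dev, irq);
		if (ret)
			return ret;

		pm_runtime_enable(dev);
		return 0;
	}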
+ * Optionally called from the bus code or the device driver for + * runtime_suspend() to override the PM runtime core managed wake-up + * interrupt handling to disable the wake-up interrupt. */ void dev_pm_disable_wake_irq(struct device *dev) { struct wake_irq *wirq = dev->power.wakeirq; - if (wirq && wirq->dedicated_irq) + if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED)) disable_irq_nosync(wirq->irq); } EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq); /** + * dev_pm_enable_wake_irq_check - Checks and enables wake-up interrupt + * @dev: Device + * @can_change_status: Can change wake-up interrupt status + * + * Enables wakeirq conditionally. We need to enable wake-up interrupt + * lazily on the first rpm_suspend(). This is needed as the consumer device + * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would + * otherwise try to disable an already disabled wakeirq. The wake-up interrupt + * starts disabled with IRQ_NOAUTOEN set. + * + * Should only be called from the rpm_suspend() and rpm_resume() path. + * Caller must hold &dev->power.lock to change wirq->status. + */ +void dev_pm_enable_wake_irq_check(struct device *dev, + bool can_change_status) +{ + struct wake_irq *wirq = dev->power.wakeirq; + + if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK)) + return; + + if (likely(wirq->status & WAKE_IRQ_DEDICATED_MANAGED)) { + goto enable; + } else if (can_change_status) { + wirq->status |= WAKE_IRQ_DEDICATED_MANAGED; + goto enable; + } + + return; + +enable: + enable_irq(wirq->irq); +} + +/** + * dev_pm_disable_wake_irq_check - Checks and disables wake-up interrupt + * @dev: Device + * + * Disables wake-up interrupt conditionally based on status. + * Should only be called from the rpm_suspend() and rpm_resume() path. + */ +void dev_pm_disable_wake_irq_check(struct device *dev) +{ + struct wake_irq *wirq = dev->power.wakeirq; + + if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK)) + return; + + if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED) + disable_irq_nosync(wirq->irq); +} + +/** * dev_pm_arm_wake_irq - Arm device wake-up * @wirq: Device wake-up interrupt * diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 62e4de2..bf9ba26 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -811,7 +811,7 @@ void pm_print_active_wakeup_sources(void) rcu_read_lock(); list_for_each_entry_rcu(ws, &wakeup_sources, entry) { if (ws->active) { - pr_info("active wakeup source: %s\n", ws->name); + pr_debug("active wakeup source: %s\n", ws->name); active = 1; } else if (!active && (!last_activity_ws || @@ -822,7 +822,7 @@ void pm_print_active_wakeup_sources(void) } if (!active && last_activity_ws) - pr_info("last active wakeup source: %s\n", + pr_debug("last active wakeup source: %s\n", last_activity_ws->name); rcu_read_unlock(); } @@ -905,7 +905,7 @@ bool pm_get_wakeup_count(unsigned int *count, bool block) split_counters(&cnt, &inpr); if (inpr == 0 || signal_pending(current)) break; - + pm_print_active_wakeup_sources(); schedule(); } finish_wait(&wakeup_count_wait_queue, &wait); diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index d89b8af..920c469 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -12,6 +12,27 @@ config ARM_BIG_LITTLE_CPUFREQ help This enables the Generic CPUfreq driver for ARM big.LITTLE platforms.
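One way to summarize the intended lifecycle of the two status bits (informal sketch derived from the code above):

	/*
	 * dev_pm_set_dedicated_wake_irq()
	 *     status = WAKE_IRQ_DEDICATED_ALLOCATED  (IRQ requested, disabled)
	 *
	 * first rpm_suspend() -> dev_pm_enable_wake_irq_check(dev, true)
	 *     status |= WAKE_IRQ_DEDICATED_MANAGED, enable_irq()
	 *
	 * rpm_resume() -> dev_pm_disable_wake_irq_check(dev)
	 *     MANAGED set, so disable_irq_nosync()
	 *
	 * rpm_resume() error path -> dev_pm_enable_wake_irq_check(dev, false)
	 *     re-enables the IRQ only if MANAGED was already set
	 */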
+config ARM_BRCMSTB_AVS_CPUFREQ + tristate "Broadcom STB AVS CPUfreq driver" + depends on ARCH_BRCMSTB || COMPILE_TEST + default y + help + Some Broadcom STB SoCs use a co-processor running proprietary firmware + ("AVS") to handle voltage and frequency scaling. This driver provides + a standard CPUfreq interface to the firmware. + + Say Y if you have a Broadcom SoC with AVS support for DFS or DVFS. + +config ARM_BRCMSTB_AVS_CPUFREQ_DEBUG + bool "Broadcom STB AVS CPUfreq driver sysfs debug capability" + depends on ARM_BRCMSTB_AVS_CPUFREQ + help + Enabling this option turns on debug support via debugfs under + /sys/kernel/debug/brcmstb-avs-cpufreq. It is possible to read all and + write some AVS mailbox registers through debugfs entries. + + If in doubt, say N. + config ARM_DT_BL_CPUFREQ tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver" depends on ARM_BIG_LITTLE_CPUFREQ && OF @@ -60,14 +81,6 @@ config ARM_IMX6Q_CPUFREQ If in doubt, say N. -config ARM_INTEGRATOR - tristate "CPUfreq driver for ARM Integrator CPUs" - depends on ARCH_INTEGRATOR - default y - help - This enables the CPUfreq driver for ARM Integrator CPUs. - If in doubt, say Y. - config ARM_KIRKWOOD_CPUFREQ def_bool MACH_KIRKWOOD help diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 0a9b6a09..1e46c39 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -51,12 +51,12 @@ obj-$(CONFIG_ARM_BIG_LITTLE_CPUFREQ) += arm_big_little.o # LITTLE drivers, so that it is probed last. obj-$(CONFIG_ARM_DT_BL_CPUFREQ) += arm_big_little_dt.o +obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ) += brcmstb-avs-cpufreq.o obj-$(CONFIG_ARCH_DAVINCI) += davinci-cpufreq.o obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o obj-$(CONFIG_ARM_IMX6Q_CPUFREQ) += imx6q-cpufreq.o -obj-$(CONFIG_ARM_INTEGRATOR) += integrator-cpufreq.o obj-$(CONFIG_ARM_KIRKWOOD_CPUFREQ) += kirkwood-cpufreq.o obj-$(CONFIG_ARM_MT8173_CPUFREQ) += mt8173-cpufreq.o obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ) += omap-cpufreq.o diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 297e912..3a98702 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -84,7 +84,6 @@ static inline struct acpi_processor_performance *to_perf_data(struct acpi_cpufre static struct cpufreq_driver acpi_cpufreq_driver; static unsigned int acpi_pstate_strict; -static struct msr __percpu *msrs; static bool boost_state(unsigned int cpu) { @@ -104,11 +103,10 @@ static bool boost_state(unsigned int cpu) return false; } -static void boost_set_msrs(bool enable, const struct cpumask *cpumask) +static int boost_set_msr(bool enable) { - u32 cpu; u32 msr_addr; - u64 msr_mask; + u64 msr_mask, val; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: @@ -120,26 +118,31 @@ static void boost_set_msrs(bool enable, const struct cpumask *cpumask) msr_mask = MSR_K7_HWCR_CPB_DIS; break; default: - return; + return -EINVAL; } - rdmsr_on_cpus(cpumask, msr_addr, msrs); + rdmsrl(msr_addr, val); - for_each_cpu(cpu, cpumask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - if (enable) - reg->q &= ~msr_mask; - else - reg->q |= msr_mask; - } + if (enable) + val &= ~msr_mask; + else + val |= msr_mask; + + wrmsrl(msr_addr, val); + return 0; +} + +static void boost_set_msr_each(void *p_en) +{ + bool enable = (bool) p_en; - wrmsr_on_cpus(cpumask, msr_addr, msrs); + boost_set_msr(enable); } static int set_boost(int val) {
get_online_cpus(); - boost_set_msrs(val, cpu_online_mask); + on_each_cpu(boost_set_msr_each, (void *)(long)val, 1); put_online_cpus(); pr_debug("Core Boosting %sabled.\n", val ? "en" : "dis"); @@ -536,46 +539,24 @@ static void free_acpi_perf_data(void) free_percpu(acpi_perf_data); } -static int boost_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) +static int cpufreq_boost_online(unsigned int cpu) { - unsigned cpu = (long)hcpu; - const struct cpumask *cpumask; - - cpumask = get_cpu_mask(cpu); + /* + * On the CPU_UP path we simply keep the boost-disable flag + * in sync with the current global state. + */ + return boost_set_msr(acpi_cpufreq_driver.boost_enabled); +} +static int cpufreq_boost_down_prep(unsigned int cpu) +{ /* * Clear the boost-disable bit on the CPU_DOWN path so that - * this cpu cannot block the remaining ones from boosting. On - * the CPU_UP path we simply keep the boost-disable flag in - * sync with the current global state. + * this cpu cannot block the remaining ones from boosting. */ - - switch (action) { - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - boost_set_msrs(1, cpumask); - break; - - default: - break; - } - - return NOTIFY_OK; + return boost_set_msr(1); } - -static struct notifier_block boost_nb = { - .notifier_call = boost_notify, -}; - /* * acpi_cpufreq_early_init - initialize ACPI P-States library * @@ -922,37 +903,35 @@ static struct cpufreq_driver acpi_cpufreq_driver = { .attr = acpi_cpufreq_attr, }; +static enum cpuhp_state acpi_cpufreq_online; + static void __init acpi_cpufreq_boost_init(void) { - if (boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)) { - msrs = msrs_alloc(); - - if (!msrs) - return; - - acpi_cpufreq_driver.set_boost = set_boost; - acpi_cpufreq_driver.boost_enabled = boost_state(0); - - cpu_notifier_register_begin(); + int ret; - /* Force all MSRs to the same value */ - boost_set_msrs(acpi_cpufreq_driver.boost_enabled, - cpu_online_mask); + if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA))) + return; - __register_cpu_notifier(&boost_nb); + acpi_cpufreq_driver.set_boost = set_boost; + acpi_cpufreq_driver.boost_enabled = boost_state(0); - cpu_notifier_register_done(); + /* + * This calls the online callback on all online CPUs and forces all + * MSRs to the same value. + */ + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpufreq/acpi:online", + cpufreq_boost_online, cpufreq_boost_down_prep); + if (ret < 0) { + pr_err("acpi_cpufreq: failed to register hotplug callbacks\n"); + return; } + acpi_cpufreq_online = ret; } static void acpi_cpufreq_boost_exit(void) { - if (msrs) { - unregister_cpu_notifier(&boost_nb); - - msrs_free(msrs); - msrs = NULL; - } + if (acpi_cpufreq_online >= 0) + cpuhp_remove_state_nocalls(acpi_cpufreq_online); } static int __init acpi_cpufreq_init(void) diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c new file mode 100644 index 0000000..4fda623 --- /dev/null +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -0,0 +1,1057 @@ +/* + * CPU frequency scaling for Broadcom SoCs with AVS firmware that + * supports DVS or DVFS + * + * Copyright (c) 2016 Broadcom + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2.
+ * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* + * "AVS" is the name of a firmware developed at Broadcom. It derives + * its name from the technique called "Adaptive Voltage Scaling". + * Adaptive voltage scaling was the original purpose of this firmware. + * The AVS firmware still supports "AVS mode", where all it does is + * adaptive voltage scaling. However, on some newer Broadcom SoCs, the + * AVS Firmware, despite its unchanged name, also supports DFS mode and + * DVFS mode. + * + * In the context of this document and the related driver, "AVS" by + * itself always means the Broadcom firmware and never refers to the + * technique called "Adaptive Voltage Scaling". + * + * The Broadcom STB AVS CPUfreq driver provides voltage and frequency + * scaling on Broadcom SoCs using AVS firmware with support for DFS and + * DVFS. The AVS firmware is running on its own co-processor. The + * driver supports both uniprocessor (UP) and symmetric multiprocessor + * (SMP) systems which share clock and voltage across all CPUs. + * + * Actual voltage and frequency scaling is done solely by the AVS + * firmware. This driver does not change frequency or voltage itself. + * It provides a standard CPUfreq interface to the rest of the kernel + * and to userland. It interfaces with the AVS firmware to effect the + * requested changes and to report back the current system status in a + * way that is expected by existing tools. + */ + +#include <linux/cpufreq.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of_address.h> +#include <linux/platform_device.h> +#include <linux/semaphore.h> + +#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#endif + +/* Max number of arguments AVS calls take */ +#define AVS_MAX_CMD_ARGS 4 +/* + * This macro is used to generate AVS parameter register offsets. For + * x >= AVS_MAX_CMD_ARGS, it returns 0 to protect against accidental memory + * access outside of the parameter range. (Offset 0 is the first parameter.) + */ +#define AVS_PARAM_MULT(x) ((x) < AVS_MAX_CMD_ARGS ? 
(x) : 0) + +/* AVS Mailbox Register offsets */ +#define AVS_MBOX_COMMAND 0x00 +#define AVS_MBOX_STATUS 0x04 +#define AVS_MBOX_VOLTAGE0 0x08 +#define AVS_MBOX_TEMP0 0x0c +#define AVS_MBOX_PV0 0x10 +#define AVS_MBOX_MV0 0x14 +#define AVS_MBOX_PARAM(x) (0x18 + AVS_PARAM_MULT(x) * sizeof(u32)) +#define AVS_MBOX_REVISION 0x28 +#define AVS_MBOX_PSTATE 0x2c +#define AVS_MBOX_HEARTBEAT 0x30 +#define AVS_MBOX_MAGIC 0x34 +#define AVS_MBOX_SIGMA_HVT 0x38 +#define AVS_MBOX_SIGMA_SVT 0x3c +#define AVS_MBOX_VOLTAGE1 0x40 +#define AVS_MBOX_TEMP1 0x44 +#define AVS_MBOX_PV1 0x48 +#define AVS_MBOX_MV1 0x4c +#define AVS_MBOX_FREQUENCY 0x50 + +/* AVS Commands */ +#define AVS_CMD_AVAILABLE 0x00 +#define AVS_CMD_DISABLE 0x10 +#define AVS_CMD_ENABLE 0x11 +#define AVS_CMD_S2_ENTER 0x12 +#define AVS_CMD_S2_EXIT 0x13 +#define AVS_CMD_BBM_ENTER 0x14 +#define AVS_CMD_BBM_EXIT 0x15 +#define AVS_CMD_S3_ENTER 0x16 +#define AVS_CMD_S3_EXIT 0x17 +#define AVS_CMD_BALANCE 0x18 +/* PMAP and P-STATE commands */ +#define AVS_CMD_GET_PMAP 0x30 +#define AVS_CMD_SET_PMAP 0x31 +#define AVS_CMD_GET_PSTATE 0x40 +#define AVS_CMD_SET_PSTATE 0x41 + +/* Different modes AVS supports (for GET_PMAP/SET_PMAP) */ +#define AVS_MODE_AVS 0x0 +#define AVS_MODE_DFS 0x1 +#define AVS_MODE_DVS 0x2 +#define AVS_MODE_DVFS 0x3 + +/* + * PMAP parameter p1 + * unused:31-24, mdiv_p0:23-16, unused:15-14, pdiv:13-10, ndiv_int:9-0 + */ +#define NDIV_INT_SHIFT 0 +#define NDIV_INT_MASK 0x3ff +#define PDIV_SHIFT 10 +#define PDIV_MASK 0xf +#define MDIV_P0_SHIFT 16 +#define MDIV_P0_MASK 0xff +/* + * PMAP parameter p2 + * mdiv_p4:31-24, mdiv_p3:23-16, mdiv_p2:15-8, mdiv_p1:7-0 + */ +#define MDIV_P1_SHIFT 0 +#define MDIV_P1_MASK 0xff +#define MDIV_P2_SHIFT 8 +#define MDIV_P2_MASK 0xff +#define MDIV_P3_SHIFT 16 +#define MDIV_P3_MASK 0xff +#define MDIV_P4_SHIFT 24 +#define MDIV_P4_MASK 0xff + +/* Different P-STATES AVS supports (for GET_PSTATE/SET_PSTATE) */ +#define AVS_PSTATE_P0 0x0 +#define AVS_PSTATE_P1 0x1 +#define AVS_PSTATE_P2 0x2 +#define AVS_PSTATE_P3 0x3 +#define AVS_PSTATE_P4 0x4 +#define AVS_PSTATE_MAX AVS_PSTATE_P4 + +/* CPU L2 Interrupt Controller Registers */ +#define AVS_CPU_L2_SET0 0x04 +#define AVS_CPU_L2_INT_MASK BIT(31) + +/* AVS Command Status Values */ +#define AVS_STATUS_CLEAR 0x00 +/* Command/notification accepted */ +#define AVS_STATUS_SUCCESS 0xf0 +/* Command/notification rejected */ +#define AVS_STATUS_FAILURE 0xff +/* Invalid command/notification (unknown) */ +#define AVS_STATUS_INVALID 0xf1 +/* Non-AVS modes are not supported */ +#define AVS_STATUS_NO_SUPP 0xf2 +/* Cannot set P-State until P-Map supplied */ +#define AVS_STATUS_NO_MAP 0xf3 +/* Cannot change P-Map after initial P-Map set */ +#define AVS_STATUS_MAP_SET 0xf4 +/* Max AVS status; higher numbers are used for debugging */ +#define AVS_STATUS_MAX 0xff + +/* Other AVS related constants */ +#define AVS_LOOP_LIMIT 10000 +#define AVS_TIMEOUT 300 /* in ms; expected completion is < 10ms */ +#define AVS_FIRMWARE_MAGIC 0xa11600d1 + +#define BRCM_AVS_CPUFREQ_PREFIX "brcmstb-avs" +#define BRCM_AVS_CPUFREQ_NAME BRCM_AVS_CPUFREQ_PREFIX "-cpufreq" +#define BRCM_AVS_CPU_DATA "brcm,avs-cpu-data-mem" +#define BRCM_AVS_CPU_INTR "brcm,avs-cpu-l2-intr" +#define BRCM_AVS_HOST_INTR "sw_intr" + +struct pmap { + unsigned int mode; + unsigned int p1; + unsigned int p2; + unsigned int state; +}; + +struct private_data { + void __iomem *base; + void __iomem *avs_intr_base; + struct device *dev; +#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG + struct dentry *debugfs; +#endif + struct completion done; +
struct semaphore sem; + struct pmap pmap; +}; + +#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG + +enum debugfs_format { + DEBUGFS_NORMAL, + DEBUGFS_FLOAT, + DEBUGFS_REV, +}; + +struct debugfs_data { + struct debugfs_entry *entry; + struct private_data *priv; +}; + +struct debugfs_entry { + char *name; + u32 offset; + fmode_t mode; + enum debugfs_format format; +}; + +#define DEBUGFS_ENTRY(name, mode, format) { \ + #name, AVS_MBOX_##name, mode, format \ +} + +/* + * These are used for debugfs only. Otherwise we use AVS_MBOX_PARAM() directly. + */ +#define AVS_MBOX_PARAM1 AVS_MBOX_PARAM(0) +#define AVS_MBOX_PARAM2 AVS_MBOX_PARAM(1) +#define AVS_MBOX_PARAM3 AVS_MBOX_PARAM(2) +#define AVS_MBOX_PARAM4 AVS_MBOX_PARAM(3) + +/* + * This table stores the name, access permissions and offset for each hardware + * register and is used to generate debugfs entries. + */ +static struct debugfs_entry debugfs_entries[] = { + DEBUGFS_ENTRY(COMMAND, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(STATUS, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(VOLTAGE0, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(TEMP0, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(PV0, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(MV0, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(PARAM1, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(PARAM2, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(PARAM3, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(PARAM4, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(REVISION, 0, DEBUGFS_REV), + DEBUGFS_ENTRY(PSTATE, 0, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(HEARTBEAT, 0, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(MAGIC, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(SIGMA_HVT, 0, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(SIGMA_SVT, 0, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(VOLTAGE1, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(TEMP1, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(PV1, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(MV1, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(FREQUENCY, 0, DEBUGFS_NORMAL), +}; + +static int brcm_avs_target_index(struct cpufreq_policy *, unsigned int); + +static char *__strtolower(char *s) +{ + char *p; + + for (p = s; *p; p++) + *p = tolower(*p); + + return s; +} + +#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */ + +static void __iomem *__map_region(const char *name) +{ + struct device_node *np; + void __iomem *ptr; + + np = of_find_compatible_node(NULL, NULL, name); + if (!np) + return NULL; + + ptr = of_iomap(np, 0); + of_node_put(np); + + return ptr; +} + +static int __issue_avs_command(struct private_data *priv, int cmd, bool is_send, + u32 args[]) +{ + unsigned long time_left = msecs_to_jiffies(AVS_TIMEOUT); + void __iomem *base = priv->base; + unsigned int i; + int ret; + u32 val; + + ret = down_interruptible(&priv->sem); + if (ret) + return ret; + + /* + * Make sure no other command is currently running: cmd is 0 if AVS + * co-processor is idle. Due to the guard above, we should almost never + * have to wait here. + */ + for (i = 0, val = 1; val != 0 && i < AVS_LOOP_LIMIT; i++) + val = readl(base + AVS_MBOX_COMMAND); + + /* Give the caller a chance to retry if AVS is busy. */ + if (i == AVS_LOOP_LIMIT) { + ret = -EAGAIN; + goto out; + } + + /* Clear status before we begin. */ + writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS); + + /* We need to send arguments for this command. */ + if (args && is_send) { + for (i = 0; i < AVS_MAX_CMD_ARGS; i++) + writel(args[i], base + AVS_MBOX_PARAM(i)); + } + + /* Protect from spurious interrupts. */ + reinit_completion(&priv->done); + + /* Now issue the command & tell firmware to wake up to process it. 
*/ + writel(cmd, base + AVS_MBOX_COMMAND); + writel(AVS_CPU_L2_INT_MASK, priv->avs_intr_base + AVS_CPU_L2_SET0); + + /* Wait for AVS co-processor to finish processing the command. */ + time_left = wait_for_completion_timeout(&priv->done, time_left); + + /* + * If the AVS status is not in the expected range, it means AVS didn't + * complete our command in time, and we return an error. Also, if there + * is no "time left", we timed out waiting for the interrupt. + */ + val = readl(base + AVS_MBOX_STATUS); + if (time_left == 0 || val == 0 || val > AVS_STATUS_MAX) { + dev_err(priv->dev, "AVS command %#x didn't complete in time\n", + cmd); + dev_err(priv->dev, " Time left: %u ms, AVS status: %#x\n", + jiffies_to_msecs(time_left), val); + ret = -ETIMEDOUT; + goto out; + } + + /* This command returned arguments, so we read them back. */ + if (args && !is_send) { + for (i = 0; i < AVS_MAX_CMD_ARGS; i++) + args[i] = readl(base + AVS_MBOX_PARAM(i)); + } + + /* Clear status to tell AVS co-processor we are done. */ + writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS); + + /* Convert firmware errors to errno's as much as possible. */ + switch (val) { + case AVS_STATUS_INVALID: + ret = -EINVAL; + break; + case AVS_STATUS_NO_SUPP: + ret = -ENOTSUPP; + break; + case AVS_STATUS_NO_MAP: + ret = -ENOENT; + break; + case AVS_STATUS_MAP_SET: + ret = -EEXIST; + break; + case AVS_STATUS_FAILURE: + ret = -EIO; + break; + } + +out: + up(&priv->sem); + + return ret; +} + +static irqreturn_t irq_handler(int irq, void *data) +{ + struct private_data *priv = data; + + /* AVS command completed execution. Wake up __issue_avs_command(). */ + complete(&priv->done); + + return IRQ_HANDLED; +} + +static char *brcm_avs_mode_to_string(unsigned int mode) +{ + switch (mode) { + case AVS_MODE_AVS: + return "AVS"; + case AVS_MODE_DFS: + return "DFS"; + case AVS_MODE_DVS: + return "DVS"; + case AVS_MODE_DVFS: + return "DVFS"; + } + return NULL; +} + +static void brcm_avs_parse_p1(u32 p1, unsigned int *mdiv_p0, unsigned int *pdiv, + unsigned int *ndiv) +{ + *mdiv_p0 = (p1 >> MDIV_P0_SHIFT) & MDIV_P0_MASK; + *pdiv = (p1 >> PDIV_SHIFT) & PDIV_MASK; + *ndiv = (p1 >> NDIV_INT_SHIFT) & NDIV_INT_MASK; +} + +static void brcm_avs_parse_p2(u32 p2, unsigned int *mdiv_p1, + unsigned int *mdiv_p2, unsigned int *mdiv_p3, + unsigned int *mdiv_p4) +{ + *mdiv_p4 = (p2 >> MDIV_P4_SHIFT) & MDIV_P4_MASK; + *mdiv_p3 = (p2 >> MDIV_P3_SHIFT) & MDIV_P3_MASK; + *mdiv_p2 = (p2 >> MDIV_P2_SHIFT) & MDIV_P2_MASK; + *mdiv_p1 = (p2 >> MDIV_P1_SHIFT) & MDIV_P1_MASK; +} + +static int brcm_avs_get_pmap(struct private_data *priv, struct pmap *pmap) +{ + u32 args[AVS_MAX_CMD_ARGS]; + int ret; + + ret = __issue_avs_command(priv, AVS_CMD_GET_PMAP, false, args); + if (ret || !pmap) + return ret; + + pmap->mode = args[0]; + pmap->p1 = args[1]; + pmap->p2 = args[2]; + pmap->state = args[3]; + + return 0; +} + +static int brcm_avs_set_pmap(struct private_data *priv, struct pmap *pmap) +{ + u32 args[AVS_MAX_CMD_ARGS]; + + args[0] = pmap->mode; + args[1] = pmap->p1; + args[2] = pmap->p2; + args[3] = pmap->state; + + return __issue_avs_command(priv, AVS_CMD_SET_PMAP, true, args); +} + +static int brcm_avs_get_pstate(struct private_data *priv, unsigned int *pstate) +{ + u32 args[AVS_MAX_CMD_ARGS]; + int ret; + + ret = __issue_avs_command(priv, AVS_CMD_GET_PSTATE, false, args); + if (ret) + return ret; + *pstate = args[0]; + + return 0; +} + +static int brcm_avs_set_pstate(struct private_data *priv, unsigned int pstate) +{ + u32 args[AVS_MAX_CMD_ARGS]; + + args[0] = pstate; + + 
return __issue_avs_command(priv, AVS_CMD_SET_PSTATE, true, args); +} + +static unsigned long brcm_avs_get_voltage(void __iomem *base) +{ + return readl(base + AVS_MBOX_VOLTAGE1); +} + +static unsigned long brcm_avs_get_frequency(void __iomem *base) +{ + return readl(base + AVS_MBOX_FREQUENCY) * 1000; /* in kHz */ +} + +/* + * We determine which frequencies are supported by cycling through all P-states + * and reading back what frequency we are running at for each P-state. + */ +static struct cpufreq_frequency_table * +brcm_avs_get_freq_table(struct device *dev, struct private_data *priv) +{ + struct cpufreq_frequency_table *table; + unsigned int pstate; + int i, ret; + + /* Remember P-state for later */ + ret = brcm_avs_get_pstate(priv, &pstate); + if (ret) + return ERR_PTR(ret); + + /* One extra entry for the CPUFREQ_TABLE_END terminator */ + table = devm_kzalloc(dev, (AVS_PSTATE_MAX + 2) * sizeof(*table), + GFP_KERNEL); + if (!table) + return ERR_PTR(-ENOMEM); + + for (i = AVS_PSTATE_P0; i <= AVS_PSTATE_MAX; i++) { + ret = brcm_avs_set_pstate(priv, i); + if (ret) + return ERR_PTR(ret); + table[i].frequency = brcm_avs_get_frequency(priv->base); + table[i].driver_data = i; + } + table[i].frequency = CPUFREQ_TABLE_END; + + /* Restore P-state */ + ret = brcm_avs_set_pstate(priv, pstate); + if (ret) + return ERR_PTR(ret); + + return table; +} + +#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG + +#define MANT(x) (unsigned int)(abs((x)) / 1000) +#define FRAC(x) (unsigned int)(abs((x)) - abs((x)) / 1000 * 1000) + +static int brcm_avs_debug_show(struct seq_file *s, void *data) +{ + struct debugfs_data *dbgfs = s->private; + void __iomem *base; + u32 val, offset; + + if (!dbgfs) { + seq_puts(s, "No device pointer\n"); + return 0; + } + + base = dbgfs->priv->base; + offset = dbgfs->entry->offset; + val = readl(base + offset); + switch (dbgfs->entry->format) { + case DEBUGFS_NORMAL: + seq_printf(s, "%u\n", val); + break; + case DEBUGFS_FLOAT: + seq_printf(s, "%d.%03d\n", MANT(val), FRAC(val)); + break; + case DEBUGFS_REV: + seq_printf(s, "%c.%c.%c.%c\n", (val >> 24 & 0xff), + (val >> 16 & 0xff), (val >> 8 & 0xff), + val & 0xff); + break; + } + seq_printf(s, "0x%08x\n", val); + + return 0; +} + +#undef MANT +#undef FRAC + +static ssize_t brcm_avs_seq_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) +{ + struct seq_file *s = file->private_data; + struct debugfs_data *dbgfs = s->private; + struct private_data *priv = dbgfs->priv; + void __iomem *base, *avs_intr_base; + bool use_issue_command = false; + unsigned long val, offset; + char str[128]; + int ret; + char *str_ptr = str; + + if (size >= sizeof(str)) + return -E2BIG; + + memset(str, 0, sizeof(str)); + if (copy_from_user(str, buf, size)) + return -EFAULT; + + base = priv->base; + avs_intr_base = priv->avs_intr_base; + offset = dbgfs->entry->offset; + /* + * Special case writing to "command" entry only: if the string starts + * with a 'c', we use the driver's __issue_avs_command() function. + * Otherwise, we perform a raw write. This should allow testing of raw + * access as well as using the higher level function. (Raw access + * doesn't clear the firmware return status after issuing the command.) + */ + if (str_ptr[0] == 'c' && offset == AVS_MBOX_COMMAND) { + use_issue_command = true; + str_ptr++; + } + if (kstrtoul(str_ptr, 0, &val) != 0) + return -EINVAL; + + /* + * Setting the P-state is a special case. We need to update the CPU + * frequency we report.
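For the DEBUGFS_FLOAT case in brcm_avs_debug_show() above, MANT()/FRAC() split the raw register value into a pseudo-decimal. A quick worked example (illustrative value):

	/*
	 * val = 1234:
	 *     MANT(1234) = 1234 / 1000               = 1
	 *     FRAC(1234) = 1234 - 1234 / 1000 * 1000 = 234
	 * so the entry reads "1.234", followed by the raw "0x000004d2".
	 */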
+ */ + if (val == AVS_CMD_SET_PSTATE) { + struct cpufreq_policy *policy; + unsigned int pstate; + + policy = cpufreq_cpu_get(smp_processor_id()); + /* Read back the P-state we are about to set */ + pstate = readl(base + AVS_MBOX_PARAM(0)); + if (use_issue_command) { + ret = brcm_avs_target_index(policy, pstate); + cpufreq_cpu_put(policy); + return ret ? ret : size; + } + policy->cur = policy->freq_table[pstate].frequency; + /* Drop the reference taken by cpufreq_cpu_get() */ + cpufreq_cpu_put(policy); + } + + if (use_issue_command) { + ret = __issue_avs_command(priv, val, false, NULL); + } else { + /* Locking here is not perfect, but is only for debug. */ + ret = down_interruptible(&priv->sem); + if (ret) + return ret; + + writel(val, base + offset); + /* We have to wake up the firmware to process a command. */ + if (offset == AVS_MBOX_COMMAND) + writel(AVS_CPU_L2_INT_MASK, + avs_intr_base + AVS_CPU_L2_SET0); + up(&priv->sem); + } + + return ret ? ret : size; +} + +static struct debugfs_entry *__find_debugfs_entry(const char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++) + if (strcasecmp(debugfs_entries[i].name, name) == 0) + return &debugfs_entries[i]; + + return NULL; +} + +static int brcm_avs_debug_open(struct inode *inode, struct file *file) +{ + struct debugfs_data *data; + fmode_t fmode; + int ret; + + /* + * seq_open(), which is called by single_open(), clears "write" access. + * We need write access to some files, so we preserve our access mode + * and restore it. + */ + fmode = file->f_mode; + /* + * Check access permissions even for root. We don't want to be writing + * to read-only registers. Access for regular users has already been + * checked by the VFS layer. + */ + if ((fmode & FMODE_WRITER) && !(inode->i_mode & S_IWUSR)) + return -EACCES; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + /* + * We use the same file system operations for all our debug files. To + * produce specific output, we look up the file name upon opening a + * debugfs entry and map it to a memory offset. This offset is then used + * in the generic "show" function to read a specific register. + */ + data->entry = __find_debugfs_entry(file->f_path.dentry->d_iname); + data->priv = inode->i_private; + + ret = single_open(file, brcm_avs_debug_show, data); + if (ret) + kfree(data); + file->f_mode = fmode; + + return ret; +} + +static int brcm_avs_debug_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq_priv = file->private_data; + struct debugfs_data *data = seq_priv->private; + + kfree(data); + return single_release(inode, file); +} + +static const struct file_operations brcm_avs_debug_ops = { + .open = brcm_avs_debug_open, + .read = seq_read, + .write = brcm_avs_seq_write, + .llseek = seq_lseek, + .release = brcm_avs_debug_release, +}; + +static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev) +{ + struct private_data *priv = platform_get_drvdata(pdev); + struct dentry *dir; + int i; + + if (!priv) + return; + + dir = debugfs_create_dir(BRCM_AVS_CPUFREQ_NAME, NULL); + if (IS_ERR_OR_NULL(dir)) + return; + priv->debugfs = dir; + + for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++) { + /* + * The DEBUGFS_ENTRY macro generates uppercase strings. We + * convert them to lowercase before creating the debugfs + * entries.
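Putting the open/write plumbing together, the mailbox can then be exercised from userspace roughly as follows (illustrative transcript; assumes debugfs is mounted at /sys/kernel/debug and CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG=y):

	/*
	 * # raw register writes: stage P-state 2, then issue SET_PSTATE (0x41)
	 * echo 2 > /sys/kernel/debug/brcmstb-avs-cpufreq/param1
	 * echo 0x41 > /sys/kernel/debug/brcmstb-avs-cpufreq/command
	 *
	 * # the same command routed through __issue_avs_command(), using the
	 * # 'c' prefix handled in brcm_avs_seq_write()
	 * echo c0x41 > /sys/kernel/debug/brcmstb-avs-cpufreq/command
	 */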
+ */ + char *entry = __strtolower(debugfs_entries[i].name); + fmode_t mode = debugfs_entries[i].mode; + + if (!debugfs_create_file(entry, S_IFREG | S_IRUGO | mode, + dir, priv, &brcm_avs_debug_ops)) { + priv->debugfs = NULL; + debugfs_remove_recursive(dir); + break; + } + } +} + +static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev) +{ + struct private_data *priv = platform_get_drvdata(pdev); + + if (priv && priv->debugfs) { + debugfs_remove_recursive(priv->debugfs); + priv->debugfs = NULL; + } +} + +#else + +static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev) {} +static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev) {} + +#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */ + +/* + * To ensure the right firmware is running we need to + * - check the MAGIC matches what we expect + * - check that brcm_avs_get_pmap() doesn't return -ENOTSUPP or -EINVAL + * We need to set up our interrupt handling before calling brcm_avs_get_pmap()! + */ +static bool brcm_avs_is_firmware_loaded(struct private_data *priv) +{ + u32 magic; + int rc; + + rc = brcm_avs_get_pmap(priv, NULL); + magic = readl(priv->base + AVS_MBOX_MAGIC); + + return (magic == AVS_FIRMWARE_MAGIC) && (rc != -ENOTSUPP) && + (rc != -EINVAL); +} + +static unsigned int brcm_avs_cpufreq_get(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct private_data *priv = policy->driver_data; + unsigned int freq; + + freq = brcm_avs_get_frequency(priv->base); + /* Balance the reference taken by cpufreq_cpu_get() */ + cpufreq_cpu_put(policy); + + return freq; +} + +static int brcm_avs_target_index(struct cpufreq_policy *policy, + unsigned int index) +{ + return brcm_avs_set_pstate(policy->driver_data, + policy->freq_table[index].driver_data); +} + +static int brcm_avs_suspend(struct cpufreq_policy *policy) +{ + struct private_data *priv = policy->driver_data; + + return brcm_avs_get_pmap(priv, &priv->pmap); +} + +static int brcm_avs_resume(struct cpufreq_policy *policy) +{ + struct private_data *priv = policy->driver_data; + int ret; + + ret = brcm_avs_set_pmap(priv, &priv->pmap); + if (ret == -EEXIST) { + struct platform_device *pdev = cpufreq_get_driver_data(); + struct device *dev = &pdev->dev; + + dev_warn(dev, "PMAP was already set\n"); + ret = 0; + } + + return ret; +} + +/* + * All initialization code that we only want to execute once goes here. Setup + * code that can be re-tried on every core (if it failed before) can go into + * brcm_avs_cpufreq_init().
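The resulting split gives the following call flow (sketch of the code below):

	/*
	 * brcm_avs_cpufreq_probe()
	 *     brcm_avs_prepare_init()      one-time setup: map the register
	 *                                  regions, request the AVS interrupt,
	 *                                  verify the firmware magic
	 *     cpufreq_register_driver()
	 *         brcm_avs_cpufreq_init()  per-policy setup: build the
	 *                                  frequency table, AVS_CMD_ENABLE,
	 *                                  set policy->cur
	 */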
+ */ +static int brcm_avs_prepare_init(struct platform_device *pdev) +{ + struct private_data *priv; + struct device *dev; + int host_irq, ret; + + dev = &pdev->dev; + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + sema_init(&priv->sem, 1); + init_completion(&priv->done); + platform_set_drvdata(pdev, priv); + + priv->base = __map_region(BRCM_AVS_CPU_DATA); + if (!priv->base) { + dev_err(dev, "Couldn't find property %s in device tree.\n", + BRCM_AVS_CPU_DATA); + return -ENOENT; + } + + priv->avs_intr_base = __map_region(BRCM_AVS_CPU_INTR); + if (!priv->avs_intr_base) { + dev_err(dev, "Couldn't find property %s in device tree.\n", + BRCM_AVS_CPU_INTR); + ret = -ENOENT; + goto unmap_base; + } + + host_irq = platform_get_irq_byname(pdev, BRCM_AVS_HOST_INTR); + if (host_irq < 0) { + dev_err(dev, "Couldn't find interrupt %s -- %d\n", + BRCM_AVS_HOST_INTR, host_irq); + ret = host_irq; + goto unmap_intr_base; + } + + ret = devm_request_irq(dev, host_irq, irq_handler, IRQF_TRIGGER_RISING, + BRCM_AVS_HOST_INTR, priv); + if (ret) { + dev_err(dev, "IRQ request failed: %s (%d) -- %d\n", + BRCM_AVS_HOST_INTR, host_irq, ret); + goto unmap_intr_base; + } + + if (brcm_avs_is_firmware_loaded(priv)) + return 0; + + dev_err(dev, "AVS firmware is not loaded or doesn't support DVFS\n"); + ret = -ENODEV; + +unmap_intr_base: + iounmap(priv->avs_intr_base); +unmap_base: + iounmap(priv->base); + platform_set_drvdata(pdev, NULL); + + return ret; +} + +static int brcm_avs_cpufreq_init(struct cpufreq_policy *policy) +{ + struct cpufreq_frequency_table *freq_table; + struct platform_device *pdev; + struct private_data *priv; + struct device *dev; + int ret; + + pdev = cpufreq_get_driver_data(); + priv = platform_get_drvdata(pdev); + policy->driver_data = priv; + dev = &pdev->dev; + + freq_table = brcm_avs_get_freq_table(dev, priv); + if (IS_ERR(freq_table)) { + ret = PTR_ERR(freq_table); + dev_err(dev, "Couldn't determine frequency table (%d).\n", ret); + return ret; + } + + ret = cpufreq_table_validate_and_show(policy, freq_table); + if (ret) { + dev_err(dev, "invalid frequency table: %d\n", ret); + return ret; + } + + /* All cores share the same clock and thus the same policy. 
*/ + cpumask_setall(policy->cpus); + + ret = __issue_avs_command(priv, AVS_CMD_ENABLE, false, NULL); + if (!ret) { + unsigned int pstate; + + ret = brcm_avs_get_pstate(priv, &pstate); + if (!ret) { + policy->cur = freq_table[pstate].frequency; + dev_info(dev, "registered\n"); + return 0; + } + } + + dev_err(dev, "couldn't initialize driver (%d)\n", ret); + + return ret; +} + +static ssize_t show_brcm_avs_pstate(struct cpufreq_policy *policy, char *buf) +{ + struct private_data *priv = policy->driver_data; + unsigned int pstate; + + if (brcm_avs_get_pstate(priv, &pstate)) + return sprintf(buf, "<unknown>\n"); + + return sprintf(buf, "%u\n", pstate); +} + +static ssize_t show_brcm_avs_mode(struct cpufreq_policy *policy, char *buf) +{ + struct private_data *priv = policy->driver_data; + struct pmap pmap; + + if (brcm_avs_get_pmap(priv, &pmap)) + return sprintf(buf, "<unknown>\n"); + + return sprintf(buf, "%s %u\n", brcm_avs_mode_to_string(pmap.mode), + pmap.mode); +} + +static ssize_t show_brcm_avs_pmap(struct cpufreq_policy *policy, char *buf) +{ + unsigned int mdiv_p0, mdiv_p1, mdiv_p2, mdiv_p3, mdiv_p4; + struct private_data *priv = policy->driver_data; + unsigned int ndiv, pdiv; + struct pmap pmap; + + if (brcm_avs_get_pmap(priv, &pmap)) + return sprintf(buf, "<unknown>\n"); + + brcm_avs_parse_p1(pmap.p1, &mdiv_p0, &pdiv, &ndiv); + brcm_avs_parse_p2(pmap.p2, &mdiv_p1, &mdiv_p2, &mdiv_p3, &mdiv_p4); + + return sprintf(buf, "0x%08x 0x%08x %u %u %u %u %u %u %u\n", + pmap.p1, pmap.p2, ndiv, pdiv, mdiv_p0, mdiv_p1, mdiv_p2, + mdiv_p3, mdiv_p4); +} + +static ssize_t show_brcm_avs_voltage(struct cpufreq_policy *policy, char *buf) +{ + struct private_data *priv = policy->driver_data; + + return sprintf(buf, "0x%08lx\n", brcm_avs_get_voltage(priv->base)); +} + +static ssize_t show_brcm_avs_frequency(struct cpufreq_policy *policy, char *buf) +{ + struct private_data *priv = policy->driver_data; + + return sprintf(buf, "0x%08lx\n", brcm_avs_get_frequency(priv->base)); +} + +cpufreq_freq_attr_ro(brcm_avs_pstate); +cpufreq_freq_attr_ro(brcm_avs_mode); +cpufreq_freq_attr_ro(brcm_avs_pmap); +cpufreq_freq_attr_ro(brcm_avs_voltage); +cpufreq_freq_attr_ro(brcm_avs_frequency); + +static struct freq_attr *brcm_avs_cpufreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + &brcm_avs_pstate, + &brcm_avs_mode, + &brcm_avs_pmap, + &brcm_avs_voltage, + &brcm_avs_frequency, + NULL +}; + +static struct cpufreq_driver brcm_avs_driver = { + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .verify = cpufreq_generic_frequency_table_verify, + .target_index = brcm_avs_target_index, + .get = brcm_avs_cpufreq_get, + .suspend = brcm_avs_suspend, + .resume = brcm_avs_resume, + .init = brcm_avs_cpufreq_init, + .attr = brcm_avs_cpufreq_attr, + .name = BRCM_AVS_CPUFREQ_PREFIX, +}; + +static int brcm_avs_cpufreq_probe(struct platform_device *pdev) +{ + int ret; + + ret = brcm_avs_prepare_init(pdev); + if (ret) + return ret; + + brcm_avs_driver.driver_data = pdev; + ret = cpufreq_register_driver(&brcm_avs_driver); + if (!ret) + brcm_avs_cpufreq_debug_init(pdev); + + return ret; +} + +static int brcm_avs_cpufreq_remove(struct platform_device *pdev) +{ + struct private_data *priv; + int ret; + + ret = cpufreq_unregister_driver(&brcm_avs_driver); + if (ret) + return ret; + + brcm_avs_cpufreq_debug_exit(pdev); + + priv = platform_get_drvdata(pdev); + iounmap(priv->base); + iounmap(priv->avs_intr_base); + platform_set_drvdata(pdev, NULL); + + return 0; +} + +static const struct of_device_id brcm_avs_cpufreq_match[] = { + { 
.compatible = BRCM_AVS_CPU_DATA }, + { } +}; +MODULE_DEVICE_TABLE(of, brcm_avs_cpufreq_match); + +static struct platform_driver brcm_avs_cpufreq_platdrv = { + .driver = { + .name = BRCM_AVS_CPUFREQ_NAME, + .of_match_table = brcm_avs_cpufreq_match, + }, + .probe = brcm_avs_cpufreq_probe, + .remove = brcm_avs_cpufreq_remove, +}; +module_platform_driver(brcm_avs_cpufreq_platdrv); + +MODULE_AUTHOR("Markus Mayer <mmayer@broadcom.com>"); +MODULE_DESCRIPTION("CPUfreq driver for Broadcom STB AVS"); +MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 4852d9e..e82bb3c 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -247,3 +247,10 @@ MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+ spec"); MODULE_LICENSE("GPL"); late_initcall(cppc_cpufreq_init); + +static const struct acpi_device_id cppc_acpi_ids[] = { + {ACPI_PROCESSOR_DEVICE_HID, }, + {} +}; + +MODULE_DEVICE_TABLE(acpi, cppc_acpi_ids); diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 7126762..bc97b6a 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -26,6 +26,9 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "allwinner,sun8i-a83t", }, { .compatible = "allwinner,sun8i-h3", }, + { .compatible = "arm,integrator-ap", }, + { .compatible = "arm,integrator-cp", }, + { .compatible = "hisilicon,hi6220", }, { .compatible = "fsl,imx27", }, @@ -34,6 +37,8 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "fsl,imx7d", }, { .compatible = "marvell,berlin", }, + { .compatible = "marvell,pxa250", }, + { .compatible = "marvell,pxa270", }, { .compatible = "samsung,exynos3250", }, { .compatible = "samsung,exynos4210", }, @@ -50,6 +55,8 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "renesas,r7s72100", }, { .compatible = "renesas,r8a73a4", }, { .compatible = "renesas,r8a7740", }, + { .compatible = "renesas,r8a7743", }, + { .compatible = "renesas,r8a7745", }, { .compatible = "renesas,r8a7778", }, { .compatible = "renesas,r8a7779", }, { .compatible = "renesas,r8a7790", }, @@ -72,6 +79,12 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "sigma,tango4" }, + { .compatible = "socionext,uniphier-pro5", }, + { .compatible = "socionext,uniphier-pxs2", }, + { .compatible = "socionext,uniphier-ld6b", }, + { .compatible = "socionext,uniphier-ld11", }, + { .compatible = "socionext,uniphier-ld20", }, + { .compatible = "ti,am33xx", }, { .compatible = "ti,dra7", }, { .compatible = "ti,omap2", }, @@ -81,6 +94,8 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "xlnx,zynq-7000", }, + { .compatible = "zte,zx296718", }, + { } }; diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 5c07ae0..2690133 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -28,6 +28,7 @@ #include "cpufreq-dt.h" struct private_data { + struct opp_table *opp_table; struct device *cpu_dev; struct thermal_cooling_device *cdev; const char *reg_name; @@ -143,6 +144,7 @@ static int resources_available(void) static int cpufreq_init(struct cpufreq_policy *policy) { struct cpufreq_frequency_table *freq_table; + struct opp_table *opp_table = NULL; struct private_data *priv; struct device *cpu_dev; struct clk *cpu_clk; @@ -186,8 +188,9 @@ static int cpufreq_init(struct cpufreq_policy *policy) */ name = find_supply_name(cpu_dev); 
if (name) { - ret = dev_pm_opp_set_regulator(cpu_dev, name); - if (ret) { + opp_table = dev_pm_opp_set_regulators(cpu_dev, &name, 1); + if (IS_ERR(opp_table)) { + ret = PTR_ERR(opp_table); dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n", policy->cpu, ret); goto out_put_clk; @@ -237,6 +240,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) } priv->reg_name = name; + priv->opp_table = opp_table; ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); if (ret) { @@ -285,7 +289,7 @@ out_free_priv: out_free_opp: dev_pm_opp_of_cpumask_remove_table(policy->cpus); if (name) - dev_pm_opp_put_regulator(cpu_dev); + dev_pm_opp_put_regulators(opp_table); out_put_clk: clk_put(cpu_clk); @@ -300,7 +304,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy) dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table); dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); if (priv->reg_name) - dev_pm_opp_put_regulator(priv->cpu_dev); + dev_pm_opp_put_regulators(priv->opp_table); clk_put(policy->clk); kfree(priv); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6e6c1fb..cc475ef 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1526,7 +1526,10 @@ unsigned int cpufreq_get(unsigned int cpu) if (policy) { down_read(&policy->rwsem); - ret_freq = __cpufreq_get(policy); + + if (!policy_is_inactive(policy)) + ret_freq = __cpufreq_get(policy); + up_read(&policy->rwsem); cpufreq_cpu_put(policy); @@ -2254,17 +2257,19 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, * Useful for policy notifiers which have different necessities * at different times. */ -int cpufreq_update_policy(unsigned int cpu) +void cpufreq_update_policy(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct cpufreq_policy new_policy; - int ret; if (!policy) - return -ENODEV; + return; down_write(&policy->rwsem); + if (policy_is_inactive(policy)) + goto unlock; + pr_debug("updating policy for CPU %u\n", cpu); memcpy(&new_policy, policy, sizeof(*policy)); new_policy.min = policy->user_policy.min; @@ -2275,24 +2280,20 @@ int cpufreq_update_policy(unsigned int cpu) * -> ask driver for current freq and notify governors about a change */ if (cpufreq_driver->get && !cpufreq_driver->setpolicy) { - if (cpufreq_suspended) { - ret = -EAGAIN; + if (cpufreq_suspended) goto unlock; - } + new_policy.cur = cpufreq_update_current_freq(policy); - if (WARN_ON(!new_policy.cur)) { - ret = -EIO; + if (WARN_ON(!new_policy.cur)) goto unlock; - } } - ret = cpufreq_set_policy(policy, &new_policy); + cpufreq_set_policy(policy, &new_policy); unlock: up_write(&policy->rwsem); cpufreq_cpu_put(policy); - return ret; } EXPORT_SYMBOL(cpufreq_update_policy); diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 1347589..992f7c2 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -37,16 +37,16 @@ struct cs_dbs_tuners { #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (10) -static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, - struct cpufreq_policy *policy) +static inline unsigned int get_freq_step(struct cs_dbs_tuners *cs_tuners, + struct cpufreq_policy *policy) { - unsigned int freq_target = (cs_tuners->freq_step * policy->max) / 100; + unsigned int freq_step = (cs_tuners->freq_step * policy->max) / 100; /* max freq cannot be less than 100. But who knows... 
*/ - if (unlikely(freq_target == 0)) - freq_target = DEF_FREQUENCY_STEP; + if (unlikely(freq_step == 0)) + freq_step = DEF_FREQUENCY_STEP; - return freq_target; + return freq_step; } /* @@ -55,10 +55,10 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, * sampling_down_factor, we check whether the current idle time is more than * 80% (default) and, if so, try to decrease the frequency * - * Any frequency increase takes it to the maximum frequency. Frequency reduction - * happens at minimum steps of 5% (default) of maximum frequency + * Frequency updates happen at minimum steps of 5% (default) of maximum + * frequency */ -static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) +static unsigned int cs_dbs_update(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs); @@ -66,6 +66,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) struct dbs_data *dbs_data = policy_dbs->dbs_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int load = dbs_update(policy); + unsigned int freq_step; /* * break out if we 'cannot' reduce the speed as the user might @@ -82,6 +83,23 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) if (requested_freq > policy->max || requested_freq < policy->min) requested_freq = policy->cur; + freq_step = get_freq_step(cs_tuners, policy); + + /* + * Decrease requested_freq by one freq_step for each idle period + * during which we didn't update the frequency. + */ + if (policy_dbs->idle_periods < UINT_MAX) { + unsigned int freq_steps = policy_dbs->idle_periods * freq_step; + + if (requested_freq > freq_steps) + requested_freq -= freq_steps; + else + requested_freq = policy->min; + + policy_dbs->idle_periods = UINT_MAX; + } + /* Check for frequency increase */ if (load > dbs_data->up_threshold) { dbs_info->down_skip = 0; @@ -90,7 +108,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) if (requested_freq == policy->max) goto out; - requested_freq += get_freq_target(cs_tuners, policy); + requested_freq += freq_step; if (requested_freq > policy->max) requested_freq = policy->max; @@ -106,16 +124,14 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) /* Check for frequency decrease */ if (load < cs_tuners->down_threshold) { - unsigned int freq_target; /* * if we cannot reduce the frequency anymore, break out early */ if (requested_freq == policy->min) goto out; - freq_target = get_freq_target(cs_tuners, policy); - if (requested_freq > freq_target) - requested_freq -= freq_target; + if (requested_freq > freq_step) - requested_freq -= freq_step; else requested_freq = policy->min; @@ -305,7 +321,7 @@ static void cs_start(struct cpufreq_policy *policy) static struct dbs_governor cs_governor = { .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"), .kobj_type = { .default_attrs = cs_attributes }, - .gov_dbs_timer = cs_dbs_timer, + .gov_dbs_update = cs_dbs_update, .alloc = cs_alloc, .free = cs_free, .init = cs_init, diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 642dd0f..0196467 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -61,7 +61,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf, * entries can't be freed concurrently.
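A worked example of the new idle-period decay in cs_dbs_update() above (illustrative numbers):

	/*
	 * Assume policy->max = 1000000 kHz and the default freq_step of 5%:
	 *     freq_step  = (5 * 1000000) / 100 = 50000 kHz
	 * If dbs_update() observed 4 sampling periods without a frequency
	 * update (policy_dbs->idle_periods = 4):
	 *     freq_steps = 4 * 50000 = 200000 kHz
	 *     requested_freq -= 200000  (clamped at policy->min)
	 * before the usual up/down threshold checks run.
	 */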
*/ list_for_each_entry(policy_dbs, &attr_set->policy_list, list) { - mutex_lock(&policy_dbs->timer_mutex); + mutex_lock(&policy_dbs->update_mutex); /* * On 32-bit architectures this may race with the * sample_delay_ns read in dbs_update_util_handler(), but that @@ -76,7 +76,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf, * taken, so it shouldn't be significant. */ gov_update_sample_delay(policy_dbs, 0); - mutex_unlock(&policy_dbs->timer_mutex); + mutex_unlock(&policy_dbs->update_mutex); } return count; @@ -117,7 +117,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; unsigned int ignore_nice = dbs_data->ignore_nice_load; - unsigned int max_load = 0; + unsigned int max_load = 0, idle_periods = UINT_MAX; unsigned int sampling_rate, io_busy, j; /* @@ -215,9 +215,19 @@ unsigned int dbs_update(struct cpufreq_policy *policy) j_cdbs->prev_load = load; } + if (time_elapsed > 2 * sampling_rate) { + unsigned int periods = time_elapsed / sampling_rate; + + if (periods < idle_periods) + idle_periods = periods; + } + if (load > max_load) max_load = load; } + + policy_dbs->idle_periods = idle_periods; + return max_load; } EXPORT_SYMBOL_GPL(dbs_update); @@ -236,9 +246,9 @@ static void dbs_work_handler(struct work_struct *work) * Make sure cpufreq_governor_limits() isn't evaluating load or the * ondemand governor isn't updating the sampling rate in parallel. */ - mutex_lock(&policy_dbs->timer_mutex); - gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy)); - mutex_unlock(&policy_dbs->timer_mutex); + mutex_lock(&policy_dbs->update_mutex); + gov_update_sample_delay(policy_dbs, gov->gov_dbs_update(policy)); + mutex_unlock(&policy_dbs->update_mutex); /* Allow the utilization update handler to queue up more work. */ atomic_set(&policy_dbs->work_count, 0); @@ -348,7 +358,7 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli return NULL; policy_dbs->policy = policy; - mutex_init(&policy_dbs->timer_mutex); + mutex_init(&policy_dbs->update_mutex); atomic_set(&policy_dbs->work_count, 0); init_irq_work(&policy_dbs->irq_work, dbs_irq_work); INIT_WORK(&policy_dbs->work, dbs_work_handler); @@ -367,7 +377,7 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs, { int j; - mutex_destroy(&policy_dbs->timer_mutex); + mutex_destroy(&policy_dbs->update_mutex); for_each_cpu(j, policy_dbs->policy->related_cpus) { struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j); @@ -547,10 +557,10 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; - mutex_lock(&policy_dbs->timer_mutex); + mutex_lock(&policy_dbs->update_mutex); cpufreq_policy_apply_limits(policy); gov_update_sample_delay(policy_dbs, 0); - mutex_unlock(&policy_dbs->timer_mutex); + mutex_unlock(&policy_dbs->update_mutex); } EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index ef1037e..f5717ca 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -85,7 +85,7 @@ struct policy_dbs_info { * Per policy mutex that serializes load evaluation from limit-change * and work-handler. 
*/ - struct mutex timer_mutex; + struct mutex update_mutex; u64 last_sample_time; s64 sample_delay_ns; @@ -97,6 +97,7 @@ struct policy_dbs_info { struct list_head list; /* Multiplier for increasing sample delay temporarily. */ unsigned int rate_mult; + unsigned int idle_periods; /* For conservative */ /* Status indicators */ bool is_shared; /* This object is used by multiple CPUs */ bool work_in_progress; /* Work is being queued up or in progress */ @@ -135,7 +136,7 @@ struct dbs_governor { */ struct dbs_data *gdbs_data; - unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); + unsigned int (*gov_dbs_update)(struct cpufreq_policy *policy); struct policy_dbs_info *(*alloc)(void); void (*free)(struct policy_dbs_info *policy_dbs); int (*init)(struct dbs_data *dbs_data); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 3a1f49f..4a017e8 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -25,7 +25,7 @@ #define MAX_SAMPLING_DOWN_FACTOR (100000) #define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) -#define MIN_FREQUENCY_UP_THRESHOLD (11) +#define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) static struct od_ops od_ops; @@ -169,7 +169,7 @@ static void od_update(struct cpufreq_policy *policy) } } -static unsigned int od_dbs_timer(struct cpufreq_policy *policy) +static unsigned int od_dbs_update(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; @@ -191,7 +191,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) od_update(policy); if (dbs_info->freq_lo) { - /* Setup timer for SUB_SAMPLE */ + /* Setup SUB_SAMPLE */ dbs_info->sample_type = OD_SUB_SAMPLE; return dbs_info->freq_hi_delay_us; } @@ -255,11 +255,11 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set, list_for_each_entry(policy_dbs, &attr_set->policy_list, list) { /* * Doing this without locking might lead to using different - * rate_mult values in od_update() and od_dbs_timer(). + * rate_mult values in od_update() and od_dbs_update(). */ - mutex_lock(&policy_dbs->timer_mutex); + mutex_lock(&policy_dbs->update_mutex); policy_dbs->rate_mult = 1; - mutex_unlock(&policy_dbs->timer_mutex); + mutex_unlock(&policy_dbs->update_mutex); } return count; @@ -374,8 +374,7 @@ static int od_init(struct dbs_data *dbs_data) dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; /* * In nohz/micro accounting case we set the minimum frequency - * not depending on HZ, but fixed (very low). The deferred - * timer might skip some samples if idle/sleeping as needed. + * not depending on HZ, but fixed (very low). 
*/ dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; } else { @@ -415,7 +414,7 @@ static struct od_ops od_ops = { static struct dbs_governor od_dbs_gov = { .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"), .kobj_type = { .default_attrs = od_attributes }, - .gov_dbs_timer = od_dbs_timer, + .gov_dbs_update = od_dbs_update, .alloc = od_alloc, .free = od_free, .init = od_init, diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 06d3abd..ac284e6 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -41,6 +41,18 @@ static int cpufreq_stats_update(struct cpufreq_stats *stats) return 0; } +static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) +{ + unsigned int count = stats->max_state; + + memset(stats->time_in_state, 0, count * sizeof(u64)); +#ifdef CONFIG_CPU_FREQ_STAT_DETAILS + memset(stats->trans_table, 0, count * count * sizeof(int)); +#endif + stats->last_time = get_jiffies_64(); + stats->total_trans = 0; +} + static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { return sprintf(buf, "%d\n", policy->stats->total_trans); @@ -64,6 +76,14 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) return len; } +static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, + size_t count) +{ + /* We don't care what is written to the attribute. */ + cpufreq_stats_clear_table(policy->stats); + return count; +} + #ifdef CONFIG_CPU_FREQ_STAT_DETAILS static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { @@ -113,10 +133,12 @@ cpufreq_freq_attr_ro(trans_table); cpufreq_freq_attr_ro(total_trans); cpufreq_freq_attr_ro(time_in_state); +cpufreq_freq_attr_wo(reset); static struct attribute *default_attrs[] = { &total_trans.attr, &time_in_state.attr, + &reset.attr, #ifdef CONFIG_CPU_FREQ_STAT_DETAILS &trans_table.attr, #endif diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c deleted file mode 100644 index 79e3ff2..0000000 --- a/drivers/cpufreq/integrator-cpufreq.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (C) 2001-2002 Deep Blue Solutions Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * CPU support functions - */ -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/cpufreq.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/platform_device.h> -#include <linux/of.h> -#include <linux/of_address.h> - -#include <asm/mach-types.h> -#include <asm/hardware/icst.h> - -static void __iomem *cm_base; -/* The cpufreq driver only use the OSC register */ -#define INTEGRATOR_HDR_OSC_OFFSET 0x08 -#define INTEGRATOR_HDR_LOCK_OFFSET 0x14 - -static struct cpufreq_driver integrator_driver; - -static const struct icst_params lclk_params = { - .ref = 24000000, - .vco_max = ICST525_VCO_MAX_5V, - .vco_min = ICST525_VCO_MIN, - .vd_min = 8, - .vd_max = 132, - .rd_min = 24, - .rd_max = 24, - .s2div = icst525_s2div, - .idx2s = icst525_idx2s, -}; - -static const struct icst_params cclk_params = { - .ref = 24000000, - .vco_max = ICST525_VCO_MAX_5V, - .vco_min = ICST525_VCO_MIN, - .vd_min = 12, - .vd_max = 160, - .rd_min = 24, - .rd_max = 24, - .s2div = icst525_s2div, - .idx2s = icst525_idx2s, -}; - -/* - * Validate the speed policy. 
- */ -static int integrator_verify_policy(struct cpufreq_policy *policy) -{ - struct icst_vco vco; - - cpufreq_verify_within_cpu_limits(policy); - - vco = icst_hz_to_vco(&cclk_params, policy->max * 1000); - policy->max = icst_hz(&cclk_params, vco) / 1000; - - vco = icst_hz_to_vco(&cclk_params, policy->min * 1000); - policy->min = icst_hz(&cclk_params, vco) / 1000; - - cpufreq_verify_within_cpu_limits(policy); - return 0; -} - - -static int integrator_set_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - cpumask_t cpus_allowed; - int cpu = policy->cpu; - struct icst_vco vco; - struct cpufreq_freqs freqs; - u_int cm_osc; - - /* - * Save this threads cpus_allowed mask. - */ - cpus_allowed = current->cpus_allowed; - - /* - * Bind to the specified CPU. When this call returns, - * we should be running on the right CPU. - */ - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - BUG_ON(cpu != smp_processor_id()); - - /* get current setting */ - cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET); - - if (machine_is_integrator()) - vco.s = (cm_osc >> 8) & 7; - else if (machine_is_cintegrator()) - vco.s = 1; - vco.v = cm_osc & 255; - vco.r = 22; - freqs.old = icst_hz(&cclk_params, vco) / 1000; - - /* icst_hz_to_vco rounds down -- so we need the next - * larger freq in case of CPUFREQ_RELATION_L. - */ - if (relation == CPUFREQ_RELATION_L) - target_freq += 999; - if (target_freq > policy->max) - target_freq = policy->max; - vco = icst_hz_to_vco(&cclk_params, target_freq * 1000); - freqs.new = icst_hz(&cclk_params, vco) / 1000; - - if (freqs.old == freqs.new) { - set_cpus_allowed_ptr(current, &cpus_allowed); - return 0; - } - - cpufreq_freq_transition_begin(policy, &freqs); - - cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET); - - if (machine_is_integrator()) { - cm_osc &= 0xfffff800; - cm_osc |= vco.s << 8; - } else if (machine_is_cintegrator()) { - cm_osc &= 0xffffff00; - } - cm_osc |= vco.v; - - __raw_writel(0xa05f, cm_base + INTEGRATOR_HDR_LOCK_OFFSET); - __raw_writel(cm_osc, cm_base + INTEGRATOR_HDR_OSC_OFFSET); - __raw_writel(0, cm_base + INTEGRATOR_HDR_LOCK_OFFSET); - - /* - * Restore the CPUs allowed mask. - */ - set_cpus_allowed_ptr(current, &cpus_allowed); - - cpufreq_freq_transition_end(policy, &freqs, 0); - - return 0; -} - -static unsigned int integrator_get(unsigned int cpu) -{ - cpumask_t cpus_allowed; - unsigned int current_freq; - u_int cm_osc; - struct icst_vco vco; - - cpus_allowed = current->cpus_allowed; - - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - BUG_ON(cpu != smp_processor_id()); - - /* detect memory etc. 
*/ - cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET); - - if (machine_is_integrator()) - vco.s = (cm_osc >> 8) & 7; - else - vco.s = 1; - vco.v = cm_osc & 255; - vco.r = 22; - - current_freq = icst_hz(&cclk_params, vco) / 1000; /* current freq */ - - set_cpus_allowed_ptr(current, &cpus_allowed); - - return current_freq; -} - -static int integrator_cpufreq_init(struct cpufreq_policy *policy) -{ - - /* set default policy and cpuinfo */ - policy->max = policy->cpuinfo.max_freq = 160000; - policy->min = policy->cpuinfo.min_freq = 12000; - policy->cpuinfo.transition_latency = 1000000; /* 1 ms, assumed */ - - return 0; -} - -static struct cpufreq_driver integrator_driver = { - .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, - .verify = integrator_verify_policy, - .target = integrator_set_target, - .get = integrator_get, - .init = integrator_cpufreq_init, - .name = "integrator", -}; - -static int __init integrator_cpufreq_probe(struct platform_device *pdev) -{ - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - - cm_base = devm_ioremap(&pdev->dev, res->start, resource_size(res)); - if (!cm_base) - return -ENODEV; - - return cpufreq_register_driver(&integrator_driver); -} - -static int __exit integrator_cpufreq_remove(struct platform_device *pdev) -{ - return cpufreq_unregister_driver(&integrator_driver); -} - -static const struct of_device_id integrator_cpufreq_match[] = { - { .compatible = "arm,core-module-integrator"}, - { }, -}; - -MODULE_DEVICE_TABLE(of, integrator_cpufreq_match); - -static struct platform_driver integrator_cpufreq_driver = { - .driver = { - .name = "integrator-cpufreq", - .of_match_table = integrator_cpufreq_match, - }, - .remove = __exit_p(integrator_cpufreq_remove), -}; - -module_platform_driver_probe(integrator_cpufreq_driver, - integrator_cpufreq_probe); - -MODULE_AUTHOR("Russell M. King"); -MODULE_DESCRIPTION("cpufreq driver for ARM Integrator CPUs"); -MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index e8dc42f..6acbd4a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -37,6 +37,8 @@ #include <asm/cpufeature.h> #include <asm/intel-family.h> +#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 + #define ATOM_RATIOS 0x66a #define ATOM_VIDS 0x66b #define ATOM_TURBO_RATIOS 0x66c @@ -53,6 +55,8 @@ #define EXT_BITS 6 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS) +#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS) +#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS) static inline int32_t mul_fp(int32_t x, int32_t y) { @@ -123,6 +127,8 @@ struct sample { * @scaling: Scaling factor to convert frequency to cpufreq * frequency units * @turbo_pstate: Max Turbo P state possible for this platform + * @max_freq: @max_pstate frequency in cpufreq units + * @turbo_freq: @turbo_pstate frequency in cpufreq units * * Stores the per cpu model P state limits and current P state. 
*/ @@ -133,6 +139,8 @@ struct pstate_data { int max_pstate_physical; int scaling; int turbo_pstate; + unsigned int max_freq; + unsigned int turbo_freq; }; /** @@ -178,6 +186,48 @@ struct _pid { }; /** + * struct perf_limits - Store user and policy limits + * @no_turbo: User-requested turbo state from intel_pstate sysfs + * @turbo_disabled: Platform turbo status, either from the + * MSR_IA32_MISC_ENABLE MSR or set when the maximum available pstate + * matches the maximum turbo pstate + * @max_perf_pct: Effective maximum performance limit in percentage; this + * is the minimum of the limit enforced by the cpufreq policy + * and the user-set limit via intel_pstate sysfs + * @min_perf_pct: Effective minimum performance limit in percentage; this + * is the maximum of the limit enforced by the cpufreq policy + * and the user-set limit via intel_pstate sysfs + * @max_perf: This is a scaled value between 0 and 255 for max_perf_pct + * This value is used to limit the max pstate + * @min_perf: This is a scaled value between 0 and 255 for min_perf_pct + * This value is used to limit the min pstate + * @max_policy_pct: The maximum performance in percentage enforced by + * the cpufreq setpolicy interface + * @max_sysfs_pct: The maximum performance in percentage enforced by + * the intel_pstate sysfs interface, unused when per-CPU + * controls are enforced + * @min_policy_pct: The minimum performance in percentage enforced by + * the cpufreq setpolicy interface + * @min_sysfs_pct: The minimum performance in percentage enforced by + * the intel_pstate sysfs interface, unused when per-CPU + * controls are enforced + * + * Storage for user and policy defined limits. + */ +struct perf_limits { + int no_turbo; + int turbo_disabled; + int max_perf_pct; + int min_perf_pct; + int32_t max_perf; + int32_t min_perf; + int max_policy_pct; + int max_sysfs_pct; + int min_policy_pct; + int min_sysfs_pct; +}; + +/** * struct cpudata - Per CPU instance data storage * @cpu: CPU number for this instance data * @policy: CPUFreq policy value @@ -195,8 +245,19 @@ struct _pid { * @prev_cummulative_iowait: IO Wait time difference from last and * current sample * @sample: Storage for storing last Sample data + * @perf_limits: Pointer to the perf_limits unique to this CPU; + * not all fields in the structure are applicable + * when per-CPU controls are enforced * @acpi_perf_data: Stores ACPI perf information read from _PSS * @valid_pss_table: Set to true for valid ACPI _PSS entries found + * @epp_powersave: Last saved HWP energy performance preference + * (EPP) or energy performance bias (EPB), + * saved when the policy is switched to performance + * @epp_policy: Last saved policy used to set EPP/EPB + * @epp_default: Power-on default HWP energy performance + * preference/bias + * @epp_saved: Saved EPP/EPB during system suspend or CPU offline + * operation * * This structure stores per CPU instance data for all CPUs. */ @@ -218,11 +279,16 @@ struct cpudata { u64 prev_tsc; u64 prev_cummulative_iowait; struct sample sample; + struct perf_limits *perf_limits; #ifdef CONFIG_ACPI struct acpi_processor_performance acpi_perf_data; bool valid_pss_table; #endif unsigned int iowait_boost; + s16 epp_powersave; + s16 epp_policy; + s16 epp_default; + s16 epp_saved; }; static struct cpudata **all_cpu_data; @@ -236,7 +302,6 @@ static struct cpudata **all_cpu_data; * @p_gain_pct: PID proportional gain * @i_gain_pct: PID integral gain * @d_gain_pct: PID derivative gain - * @boost_iowait: Whether or not to use iowait boosting. * * Stores per CPU model static PID configuration data. 
*/ @@ -248,7 +313,6 @@ struct pstate_adjust_policy { int p_gain_pct; int d_gain_pct; int i_gain_pct; - bool boost_iowait; }; /** @@ -292,58 +356,19 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu); static struct pstate_adjust_policy pid_params __read_mostly; static struct pstate_funcs pstate_funcs __read_mostly; static int hwp_active __read_mostly; +static bool per_cpu_limits __read_mostly; #ifdef CONFIG_ACPI static bool acpi_ppc; #endif -/** - * struct perf_limits - Store user and policy limits - * @no_turbo: User requested turbo state from intel_pstate sysfs - * @turbo_disabled: Platform turbo status either from msr - * MSR_IA32_MISC_ENABLE or when maximum available pstate - * matches the maximum turbo pstate - * @max_perf_pct: Effective maximum performance limit in percentage, this - * is minimum of either limits enforced by cpufreq policy - * or limits from user set limits via intel_pstate sysfs - * @min_perf_pct: Effective minimum performance limit in percentage, this - * is maximum of either limits enforced by cpufreq policy - * or limits from user set limits via intel_pstate sysfs - * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct - * This value is used to limit max pstate - * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct - * This value is used to limit min pstate - * @max_policy_pct: The maximum performance in percentage enforced by - * cpufreq setpolicy interface - * @max_sysfs_pct: The maximum performance in percentage enforced by - * intel pstate sysfs interface - * @min_policy_pct: The minimum performance in percentage enforced by - * cpufreq setpolicy interface - * @min_sysfs_pct: The minimum performance in percentage enforced by - * intel pstate sysfs interface - * - * Storage for user and policy defined limits. 
- */ -struct perf_limits { - int no_turbo; - int turbo_disabled; - int max_perf_pct; - int min_perf_pct; - int32_t max_perf; - int32_t min_perf; - int max_policy_pct; - int max_sysfs_pct; - int min_policy_pct; - int min_sysfs_pct; -}; - static struct perf_limits performance_limits = { .no_turbo = 0, .turbo_disabled = 0, .max_perf_pct = 100, - .max_perf = int_tofp(1), + .max_perf = int_ext_tofp(1), .min_perf_pct = 100, - .min_perf = int_tofp(1), + .min_perf = int_ext_tofp(1), .max_policy_pct = 100, .max_sysfs_pct = 100, .min_policy_pct = 0, @@ -354,7 +379,7 @@ static struct perf_limits powersave_limits = { .no_turbo = 0, .turbo_disabled = 0, .max_perf_pct = 100, - .max_perf = int_tofp(1), + .max_perf = int_ext_tofp(1), .min_perf_pct = 0, .min_perf = 0, .max_policy_pct = 100, @@ -369,6 +394,8 @@ static struct perf_limits *limits = &performance_limits; static struct perf_limits *limits = &powersave_limits; #endif +static DEFINE_MUTEX(intel_pstate_limits_lock); + #ifdef CONFIG_ACPI static bool intel_pstate_get_ppc_enable_status(void) @@ -513,11 +540,11 @@ static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) } #else -static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) +static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) { } -static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) +static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) { } #endif @@ -613,24 +640,252 @@ static inline void update_turbo_state(void) cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); } +static s16 intel_pstate_get_epb(struct cpudata *cpu_data) +{ + u64 epb; + int ret; + + if (!static_cpu_has(X86_FEATURE_EPB)) + return -ENXIO; + + ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret) + return (s16)ret; + + return (s16)(epb & 0x0f); +} + +static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data) +{ + s16 epp; + + if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + /* + * When hwp_req_data is 0, it means the caller didn't read + * MSR_HWP_REQUEST, so read it here to get the EPP. 
+ */ + if (!hwp_req_data) { + epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, + &hwp_req_data); + if (epp) + return epp; + } + epp = (hwp_req_data >> 24) & 0xff; + } else { + /* When there is no EPP present, HWP uses EPB settings */ + epp = intel_pstate_get_epb(cpu_data); + } + + return epp; +} + +static int intel_pstate_set_epb(int cpu, s16 pref) +{ + u64 epb; + int ret; + + if (!static_cpu_has(X86_FEATURE_EPB)) + return -ENXIO; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret) + return ret; + + epb = (epb & ~0x0f) | pref; + wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb); + + return 0; +} + +/* + * EPP/EPB display strings corresponding to EPP index in the + * energy_perf_strings[] + * index String + *------------------------------------- + * 0 default + * 1 performance + * 2 balance_performance + * 3 balance_power + * 4 power + */ +static const char * const energy_perf_strings[] = { + "default", + "performance", + "balance_performance", + "balance_power", + "power", + NULL +}; + +static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) +{ + s16 epp; + int index = -EINVAL; + + epp = intel_pstate_get_epp(cpu_data, 0); + if (epp < 0) + return epp; + + if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + /* + * Range: + * 0x00-0x3F : Performance + * 0x40-0x7F : Balance performance + * 0x80-0xBF : Balance power + * 0xC0-0xFF : Power + * The EPP is an 8-bit value, but our ranges restrict the + * value which can be set. Effectively only the top two + * bits are used here. + */ + index = (epp >> 6) + 1; + } else if (static_cpu_has(X86_FEATURE_EPB)) { + /* + * Range: + * 0x00-0x03 : Performance + * 0x04-0x07 : Balance performance + * 0x08-0x0B : Balance power + * 0x0C-0x0F : Power + * The EPB is a 4-bit value, but our ranges restrict the + * value which can be set. Effectively only the top two + * bits are used here. + */ + index = (epp >> 2) + 1; + } + + return index; +} + +static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, + int pref_index) +{ + int epp = -EINVAL; + int ret; + + if (!pref_index) + epp = cpu_data->epp_default; + + mutex_lock(&intel_pstate_limits_lock); + + if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + u64 value; + + ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value); + if (ret) + goto return_pref; + + value &= ~GENMASK_ULL(31, 24); + + /* + * If epp is not the default, convert the index into + * energy_perf_strings to an epp value by shifting it + * left by 6 bits so that only the top two bits of epp + * are used. The resulting epp then needs to be shifted + * left by 24 bits into the EPP position in MSR_HWP_REQUEST. 
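For reference, the shifts used here define a simple mapping between the energy_perf_strings[] index and the hardware EPP/EPB fields. A standalone sketch of that mapping (hypothetical helper names mirroring the patch, not kernel code):

/* Index 0 is "default" and is never encoded directly. */
static inline int epp_from_index(int pref_index)	/* 1..4 */
{
	return (pref_index - 1) << 6;	/* 0x00, 0x40, 0x80, 0xC0 */
}

static inline int epb_from_index(int pref_index)	/* 1..4 */
{
	return (pref_index - 1) << 2;	/* 0x0, 0x4, 0x8, 0xC */
}

static inline int index_from_epp(int epp)		/* 8-bit EPP */
{
	return (epp >> 6) + 1;		/* top two bits pick the range */
}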
+ */ + if (epp == -EINVAL) + epp = (pref_index - 1) << 6; + + value |= (u64)epp << 24; + ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value); + } else { + if (epp == -EINVAL) + epp = (pref_index - 1) << 2; + ret = intel_pstate_set_epb(cpu_data->cpu, epp); + } +return_pref: + mutex_unlock(&intel_pstate_limits_lock); + + return ret; +} + +static ssize_t show_energy_performance_available_preferences( + struct cpufreq_policy *policy, char *buf) +{ + int i = 0; + int ret = 0; + + while (energy_perf_strings[i] != NULL) + ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]); + + ret += sprintf(&buf[ret], "\n"); + + return ret; +} + +cpufreq_freq_attr_ro(energy_performance_available_preferences); + +static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) +{ + struct cpudata *cpu_data = all_cpu_data[policy->cpu]; + char str_preference[21]; + int ret, i = 0; + + ret = sscanf(buf, "%20s", str_preference); + if (ret != 1) + return -EINVAL; + + while (energy_perf_strings[i] != NULL) { + if (!strcmp(str_preference, energy_perf_strings[i])) { + intel_pstate_set_energy_pref_index(cpu_data, i); + return count; + } + ++i; + } + + return -EINVAL; +} + +static ssize_t show_energy_performance_preference( + struct cpufreq_policy *policy, char *buf) +{ + struct cpudata *cpu_data = all_cpu_data[policy->cpu]; + int preference; + + preference = intel_pstate_get_energy_pref_index(cpu_data); + if (preference < 0) + return preference; + + return sprintf(buf, "%s\n", energy_perf_strings[preference]); +} + +cpufreq_freq_attr_rw(energy_performance_preference); + +static struct freq_attr *hwp_cpufreq_attrs[] = { + &energy_performance_preference, + &energy_performance_available_preferences, + NULL, +}; + static void intel_pstate_hwp_set(const struct cpumask *cpumask) { int min, hw_min, max, hw_max, cpu, range, adj_range; + struct perf_limits *perf_limits = limits; u64 value, cap; for_each_cpu(cpu, cpumask) { + int max_perf_pct, min_perf_pct; + struct cpudata *cpu_data = all_cpu_data[cpu]; + s16 epp; + + if (per_cpu_limits) + perf_limits = all_cpu_data[cpu]->perf_limits; + rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); hw_min = HWP_LOWEST_PERF(cap); hw_max = HWP_HIGHEST_PERF(cap); range = hw_max - hw_min; + max_perf_pct = perf_limits->max_perf_pct; + min_perf_pct = perf_limits->min_perf_pct; + rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); - adj_range = limits->min_perf_pct * range / 100; + adj_range = min_perf_pct * range / 100; min = hw_min + adj_range; value &= ~HWP_MIN_PERF(~0L); value |= HWP_MIN_PERF(min); - adj_range = limits->max_perf_pct * range / 100; + adj_range = max_perf_pct * range / 100; max = hw_min + adj_range; if (limits->no_turbo) { hw_max = HWP_GUARANTEED_PERF(cap); @@ -640,6 +895,53 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask) value &= ~HWP_MAX_PERF(~0L); value |= HWP_MAX_PERF(max); + + if (cpu_data->epp_policy == cpu_data->policy) + goto skip_epp; + + cpu_data->epp_policy = cpu_data->policy; + + if (cpu_data->epp_saved >= 0) { + epp = cpu_data->epp_saved; + cpu_data->epp_saved = -EINVAL; + goto update_epp; + } + + if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) { + epp = intel_pstate_get_epp(cpu_data, value); + cpu_data->epp_powersave = epp; + /* If the EPP read failed, then don't try to write */ + if (epp < 0) + goto skip_epp; + + + epp = 0; + } else { + /* Skip setting the EPP when the saved value is invalid */ + if (cpu_data->epp_powersave < 0) + goto skip_epp; + + /* + * No need to restore EPP when it is not 
zero, which + * means one of the following: + * - the policy was not changed, + * - the user changed it manually, or + * - reading the EPB failed. + */ + epp = intel_pstate_get_epp(cpu_data, value); + if (epp) + goto skip_epp; + + epp = cpu_data->epp_powersave; + } +update_epp: + if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + } else { + intel_pstate_set_epb(cpu, epp); + } +skip_epp: wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } } @@ -652,6 +954,28 @@ static int intel_pstate_hwp_set_policy(struct cpufreq_policy *policy) return 0; } +static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy) +{ + struct cpudata *cpu_data = all_cpu_data[policy->cpu]; + + if (!hwp_active) + return 0; + + cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0); + + return 0; +} + +static int intel_pstate_resume(struct cpufreq_policy *policy) +{ + if (!hwp_active) + return 0; + + all_cpu_data[policy->cpu]->epp_policy = 0; + + return intel_pstate_hwp_set_policy(policy); +} + static void intel_pstate_hwp_set_online_cpus(void) { get_online_cpus(); @@ -694,8 +1018,10 @@ static void __init intel_pstate_debug_expose_params(void) struct dentry *debugfs_parent; int i = 0; - if (hwp_active) + if (hwp_active || + pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load) return; + debugfs_parent = debugfs_create_dir("pstate_snb", NULL); if (IS_ERR_OR_NULL(debugfs_parent)) return; @@ -768,9 +1094,12 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, if (ret != 1) return -EINVAL; + mutex_lock(&intel_pstate_limits_lock); + update_turbo_state(); if (limits->turbo_disabled) { pr_warn("Turbo disabled by BIOS or unavailable on processor\n"); + mutex_unlock(&intel_pstate_limits_lock); return -EPERM; } @@ -779,6 +1108,8 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, if (hwp_active) intel_pstate_hwp_set_online_cpus(); + mutex_unlock(&intel_pstate_limits_lock); + return count; } @@ -792,6 +1123,8 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, if (ret != 1) return -EINVAL; + mutex_lock(&intel_pstate_limits_lock); + limits->max_sysfs_pct = clamp_t(int, input, 0 , 100); limits->max_perf_pct = min(limits->max_policy_pct, limits->max_sysfs_pct); @@ -799,10 +1132,13 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, limits->max_perf_pct); limits->max_perf_pct = max(limits->min_perf_pct, limits->max_perf_pct); - limits->max_perf = div_fp(limits->max_perf_pct, 100); + limits->max_perf = div_ext_fp(limits->max_perf_pct, 100); if (hwp_active) intel_pstate_hwp_set_online_cpus(); + + mutex_unlock(&intel_pstate_limits_lock); + return count; } @@ -816,6 +1152,8 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, if (ret != 1) return -EINVAL; + mutex_lock(&intel_pstate_limits_lock); + limits->min_sysfs_pct = clamp_t(int, input, 0 , 100); limits->min_perf_pct = max(limits->min_policy_pct, limits->min_sysfs_pct); @@ -823,10 +1161,13 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, limits->min_perf_pct); limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); - limits->min_perf = div_fp(limits->min_perf_pct, 100); + limits->min_perf = div_ext_fp(limits->min_perf_pct, 100); if (hwp_active) intel_pstate_hwp_set_online_cpus(); + + mutex_unlock(&intel_pstate_limits_lock); + return count; } @@ -841,8 +1182,6 @@ define_one_global_ro(num_pstates); static struct attribute *intel_pstate_attributes[] = { &no_turbo.attr, - &max_perf_pct.attr, - 
&min_perf_pct.attr, &turbo_pct.attr, &num_pstates.attr, NULL @@ -859,9 +1198,26 @@ static void __init intel_pstate_sysfs_expose_params(void) intel_pstate_kobject = kobject_create_and_add("intel_pstate", &cpu_subsys.dev_root->kobj); - BUG_ON(!intel_pstate_kobject); + if (WARN_ON(!intel_pstate_kobject)) + return; + rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group); - BUG_ON(rc); + if (WARN_ON(rc)) + return; + + /* + * If per cpu limits are enforced there are no global limits, so + * return without creating max/min_perf_pct attributes + */ + if (per_cpu_limits) + return; + + rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr); + WARN_ON(rc); + + rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); + WARN_ON(rc); + } /************************** sysfs end ************************/ @@ -872,6 +1228,9 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata) wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); + cpudata->epp_policy = 0; + if (cpudata->epp_default == -EINVAL) + cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); } static int atom_get_min_pstate(void) @@ -1099,7 +1458,6 @@ static const struct cpu_defaults silvermont_params = { .p_gain_pct = 14, .d_gain_pct = 0, .i_gain_pct = 4, - .boost_iowait = true, }, .funcs = { .get_max = atom_get_max_pstate, @@ -1121,7 +1479,6 @@ static const struct cpu_defaults airmont_params = { .p_gain_pct = 14, .d_gain_pct = 0, .i_gain_pct = 4, - .boost_iowait = true, }, .funcs = { .get_max = atom_get_max_pstate, @@ -1163,7 +1520,6 @@ static const struct cpu_defaults bxt_params = { .p_gain_pct = 14, .d_gain_pct = 0, .i_gain_pct = 4, - .boost_iowait = true, }, .funcs = { .get_max = core_get_max_pstate, @@ -1181,20 +1537,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) int max_perf = cpu->pstate.turbo_pstate; int max_perf_adj; int min_perf; + struct perf_limits *perf_limits = limits; if (limits->no_turbo || limits->turbo_disabled) max_perf = cpu->pstate.max_pstate; + if (per_cpu_limits) + perf_limits = cpu->perf_limits; + /* * performance can be limited by user through sysfs, by cpufreq * policy, or by cpu specific default values determined through * experimentation. 
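The *_ext_* helpers used for these limits are plain fixed-point arithmetic with EXT_BITS (6) extra fraction bits, giving finer resolution for the percentage limits. A self-contained sketch, assuming FRAC_BITS is 8 as elsewhere in this driver and a simplified div_ext_fp():

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS	8		/* assumed driver value */
#define EXT_BITS	6
#define EXT_FRAC_BITS	(EXT_BITS + FRAC_BITS)

#define fp_ext_toint(X)	((X) >> EXT_FRAC_BITS)
#define int_ext_tofp(X)	((int64_t)(X) << EXT_FRAC_BITS)

static int64_t div_ext_fp(int64_t x, int64_t y)
{
	return (x << EXT_FRAC_BITS) / y;
}

int main(void)
{
	int turbo_pstate = 36;			/* example value */
	int64_t max_perf = div_ext_fp(75, 100);	/* a 75% limit */

	/* Same arithmetic as intel_pstate_get_min_max() above: prints 27. */
	printf("max_perf_adj = %d\n",
	       (int)fp_ext_toint(turbo_pstate * max_perf));
	return 0;
}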
*/ - max_perf_adj = fp_toint(max_perf * limits->max_perf); + max_perf_adj = fp_ext_toint(max_perf * perf_limits->max_perf); *max = clamp_t(int, max_perf_adj, cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); - min_perf = fp_toint(max_perf * limits->min_perf); + min_perf = fp_ext_toint(max_perf * perf_limits->min_perf); *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); } @@ -1232,6 +1592,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(); cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); cpu->pstate.scaling = pstate_funcs.get_scaling(); + cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; + cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; if (pstate_funcs.get_vid) pstate_funcs.get_vid(cpu); @@ -1370,15 +1732,19 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled); } -static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) +static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) { int max_perf, min_perf; - update_turbo_state(); - intel_pstate_get_min_max(cpu, &min_perf, &max_perf); pstate = clamp_t(int, pstate, min_perf, max_perf); trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); + return pstate; +} + +static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) +{ + pstate = intel_pstate_prepare_request(cpu, pstate); if (pstate == cpu->pstate.current_pstate) return; @@ -1396,6 +1762,8 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ? cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu); + update_turbo_state(); + intel_pstate_update_pstate(cpu, target_pstate); sample = &cpu->sample; @@ -1416,7 +1784,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time, struct cpudata *cpu = container_of(data, struct cpudata, update_util); u64 delta_ns; - if (pid_params.boost_iowait) { + if (pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load) { if (flags & SCHED_CPUFREQ_IOWAIT) { cpu->iowait_boost = int_tofp(1); } else if (cpu->iowait_boost) { @@ -1462,6 +1830,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = { ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_params), ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params), ICPU(INTEL_FAM6_XEON_PHI_KNL, knl_params), + ICPU(INTEL_FAM6_XEON_PHI_KNM, knl_params), ICPU(INTEL_FAM6_ATOM_GOLDMONT, bxt_params), {} }; @@ -1478,11 +1847,26 @@ static int intel_pstate_init_cpu(unsigned int cpunum) { struct cpudata *cpu; - if (!all_cpu_data[cpunum]) - all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), - GFP_KERNEL); - if (!all_cpu_data[cpunum]) - return -ENOMEM; + cpu = all_cpu_data[cpunum]; + + if (!cpu) { + unsigned int size = sizeof(struct cpudata); + + if (per_cpu_limits) + size += sizeof(struct perf_limits); + + cpu = kzalloc(size, GFP_KERNEL); + if (!cpu) + return -ENOMEM; + + all_cpu_data[cpunum] = cpu; + if (per_cpu_limits) + cpu->perf_limits = (struct perf_limits *)(cpu + 1); + + cpu->epp_default = -EINVAL; + cpu->epp_powersave = -EINVAL; + cpu->epp_saved = -EINVAL; + } cpu = all_cpu_data[cpunum]; @@ -1541,18 +1925,57 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits) limits->no_turbo = 0; limits->turbo_disabled = 0; limits->max_perf_pct = 100; - limits->max_perf = int_tofp(1); + limits->max_perf = 
int_ext_tofp(1); limits->min_perf_pct = 100; - limits->min_perf = int_tofp(1); + limits->min_perf = int_ext_tofp(1); limits->max_policy_pct = 100; limits->max_sysfs_pct = 100; limits->min_policy_pct = 0; limits->min_sysfs_pct = 0; } +static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy, + struct perf_limits *limits) +{ + + limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, + policy->cpuinfo.max_freq); + limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100); + if (policy->max == policy->min) { + limits->min_policy_pct = limits->max_policy_pct; + } else { + limits->min_policy_pct = DIV_ROUND_UP(policy->min * 100, + policy->cpuinfo.max_freq); + limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, + 0, 100); + } + + /* Normalize user input to [min_policy_pct, max_policy_pct] */ + limits->min_perf_pct = max(limits->min_policy_pct, + limits->min_sysfs_pct); + limits->min_perf_pct = min(limits->max_policy_pct, + limits->min_perf_pct); + limits->max_perf_pct = min(limits->max_policy_pct, + limits->max_sysfs_pct); + limits->max_perf_pct = max(limits->min_policy_pct, + limits->max_perf_pct); + + /* Make sure min_perf_pct <= max_perf_pct */ + limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); + + limits->min_perf = div_ext_fp(limits->min_perf_pct, 100); + limits->max_perf = div_ext_fp(limits->max_perf_pct, 100); + limits->max_perf = round_up(limits->max_perf, EXT_FRAC_BITS); + limits->min_perf = round_up(limits->min_perf, EXT_FRAC_BITS); + + pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu, + limits->max_perf_pct, limits->min_perf_pct); +} + static int intel_pstate_set_policy(struct cpufreq_policy *policy) { struct cpudata *cpu; + struct perf_limits *perf_limits = NULL; if (!policy->cpuinfo.max_freq) return -ENODEV; @@ -1570,41 +1993,31 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) policy->max = policy->cpuinfo.max_freq; } - if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { - limits = &performance_limits; + if (per_cpu_limits) + perf_limits = cpu->perf_limits; + + mutex_lock(&intel_pstate_limits_lock); + + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + if (!perf_limits) { + limits = &performance_limits; + perf_limits = limits; + } if (policy->max >= policy->cpuinfo.max_freq) { pr_debug("set performance\n"); - intel_pstate_set_performance_limits(limits); + intel_pstate_set_performance_limits(perf_limits); goto out; } } else { pr_debug("set powersave\n"); - limits = &powersave_limits; - } - - limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq; - limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100); - limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, - policy->cpuinfo.max_freq); - limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0 , 100); - - /* Normalize user input to [min_policy_pct, max_policy_pct] */ - limits->min_perf_pct = max(limits->min_policy_pct, - limits->min_sysfs_pct); - limits->min_perf_pct = min(limits->max_policy_pct, - limits->min_perf_pct); - limits->max_perf_pct = min(limits->max_policy_pct, - limits->max_sysfs_pct); - limits->max_perf_pct = max(limits->min_policy_pct, - limits->max_perf_pct); - - /* Make sure min_perf_pct <= max_perf_pct */ - limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); + if (!perf_limits) { + limits = &powersave_limits; + perf_limits = limits; + } - limits->min_perf = div_fp(limits->min_perf_pct, 100); - limits->max_perf = div_fp(limits->max_perf_pct, 100); - 
limits->max_perf = round_up(limits->max_perf, FRAC_BITS); + } + intel_pstate_update_perf_limits(policy, perf_limits); out: if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { /* @@ -1619,6 +2032,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) intel_pstate_hwp_set_policy(policy); + mutex_unlock(&intel_pstate_limits_lock); + return 0; } @@ -1633,22 +2048,32 @@ static int intel_pstate_verify_policy(struct cpufreq_policy *policy) return 0; } +static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) +{ + intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); +} + static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) { - int cpu_num = policy->cpu; - struct cpudata *cpu = all_cpu_data[cpu_num]; + pr_debug("CPU %d exiting\n", policy->cpu); - pr_debug("CPU %d exiting\n", cpu_num); + intel_pstate_clear_update_util_hook(policy->cpu); + if (hwp_active) + intel_pstate_hwp_save_state(policy); + else + intel_cpufreq_stop_cpu(policy); +} - intel_pstate_clear_update_util_hook(cpu_num); +static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) +{ + intel_pstate_exit_perf_limits(policy); - if (hwp_active) - return; + policy->fast_switch_possible = false; - intel_pstate_set_min_pstate(cpu); + return 0; } -static int intel_pstate_cpu_init(struct cpufreq_policy *policy) +static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) { struct cpudata *cpu; int rc; @@ -1659,10 +2084,13 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) cpu = all_cpu_data[policy->cpu]; - if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100) - policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else - policy->policy = CPUFREQ_POLICY_POWERSAVE; + /* + * We need sane values in cpu->perf_limits, so inherit from the global + * perf_limits, which are seeded during boot-up with values based on + * CONFIG_CPU_FREQ_DEFAULT_GOV_*. 
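Two details of the per-CPU limits plumbing are worth spelling out: intel_pstate_init_cpu() (earlier in this patch) carves the per-CPU perf_limits out of the same allocation as struct cpudata, and the hunk here then seeds it with a copy of the global limits. A user-space sketch of that pattern, with simplified structures:

#include <stdlib.h>
#include <string.h>

struct perf_limits { int max_perf_pct; int min_perf_pct; /* ... */ };
struct cpudata { int cpu; struct perf_limits *perf_limits; /* ... */ };

static struct cpudata *alloc_cpudata(int per_cpu_limits,
				     const struct perf_limits *seed)
{
	size_t size = sizeof(struct cpudata);
	struct cpudata *cpu;

	if (per_cpu_limits)
		size += sizeof(struct perf_limits);

	cpu = calloc(1, size);		/* kzalloc() analogue */
	if (!cpu)
		return NULL;

	if (per_cpu_limits) {
		/*
		 * The limits live right behind struct cpudata, so a
		 * single free(cpu) releases both.
		 */
		cpu->perf_limits = (struct perf_limits *)(cpu + 1);
		/* Seed with a copy of the global limits, as above. */
		memcpy(cpu->perf_limits, seed, sizeof(*seed));
	}
	return cpu;
}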
+ */ + if (per_cpu_limits) + memcpy(cpu->perf_limits, limits, sizeof(struct perf_limits)); policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; @@ -1675,24 +2103,35 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.max_freq *= cpu->pstate.scaling; intel_pstate_init_acpi_perf_limits(policy); - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; cpumask_set_cpu(policy->cpu, policy->cpus); + policy->fast_switch_possible = true; + return 0; } -static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) +static int intel_pstate_cpu_init(struct cpufreq_policy *policy) { - intel_pstate_exit_perf_limits(policy); + int ret = __intel_pstate_cpu_init(policy); + + if (ret) + return ret; + + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100) + policy->policy = CPUFREQ_POLICY_PERFORMANCE; + else + policy->policy = CPUFREQ_POLICY_POWERSAVE; return 0; } -static struct cpufreq_driver intel_pstate_driver = { +static struct cpufreq_driver intel_pstate = { .flags = CPUFREQ_CONST_LOOPS, .verify = intel_pstate_verify_policy, .setpolicy = intel_pstate_set_policy, - .resume = intel_pstate_hwp_set_policy, + .suspend = intel_pstate_hwp_save_state, + .resume = intel_pstate_resume, .get = intel_pstate_get, .init = intel_pstate_cpu_init, .exit = intel_pstate_cpu_exit, @@ -1700,6 +2139,118 @@ static struct cpufreq_driver intel_pstate_driver = { .name = "intel_pstate", }; +static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + struct perf_limits *perf_limits = limits; + + update_turbo_state(); + policy->cpuinfo.max_freq = limits->turbo_disabled ? + cpu->pstate.max_freq : cpu->pstate.turbo_freq; + + cpufreq_verify_within_cpu_limits(policy); + + if (per_cpu_limits) + perf_limits = cpu->perf_limits; + + intel_pstate_update_perf_limits(policy, perf_limits); + + return 0; +} + +static unsigned int intel_cpufreq_turbo_update(struct cpudata *cpu, + struct cpufreq_policy *policy, + unsigned int target_freq) +{ + unsigned int max_freq; + + update_turbo_state(); + + max_freq = limits->no_turbo || limits->turbo_disabled ? 
+ cpu->pstate.max_freq : cpu->pstate.turbo_freq; + policy->cpuinfo.max_freq = max_freq; + if (policy->max > max_freq) + policy->max = max_freq; + + if (target_freq > max_freq) + target_freq = max_freq; + + return target_freq; +} + +static int intel_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + struct cpufreq_freqs freqs; + int target_pstate; + + freqs.old = policy->cur; + freqs.new = intel_cpufreq_turbo_update(cpu, policy, target_freq); + + cpufreq_freq_transition_begin(policy, &freqs); + switch (relation) { + case CPUFREQ_RELATION_L: + target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling); + break; + case CPUFREQ_RELATION_H: + target_pstate = freqs.new / cpu->pstate.scaling; + break; + default: + target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling); + break; + } + target_pstate = intel_pstate_prepare_request(cpu, target_pstate); + if (target_pstate != cpu->pstate.current_pstate) { + cpu->pstate.current_pstate = target_pstate; + wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, + pstate_funcs.get_val(cpu, target_pstate)); + } + cpufreq_freq_transition_end(policy, &freqs, false); + + return 0; +} + +static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + int target_pstate; + + target_freq = intel_cpufreq_turbo_update(cpu, policy, target_freq); + target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); + intel_pstate_update_pstate(cpu, target_pstate); + return target_freq; +} + +static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + int ret = __intel_pstate_cpu_init(policy); + + if (ret) + return ret; + + policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; + /* This reflects the intel_pstate_get_cpu_pstates() setting. */ + policy->cur = policy->cpuinfo.min_freq; + + return 0; +} + +static struct cpufreq_driver intel_cpufreq = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = intel_cpufreq_verify_policy, + .target = intel_cpufreq_target, + .fast_switch = intel_cpufreq_fast_switch, + .init = intel_cpufreq_cpu_init, + .exit = intel_pstate_cpu_exit, + .stop_cpu = intel_cpufreq_stop_cpu, + .name = "intel_cpufreq", +}; + +static struct cpufreq_driver *intel_pstate_driver = &intel_pstate; + static int no_load __initdata; static int no_hwp __initdata; static int hwp_only __initdata; @@ -1726,6 +2277,19 @@ static void __init copy_pid_params(struct pstate_adjust_policy *policy) pid_params.setpoint = policy->setpoint; } +#ifdef CONFIG_ACPI +static void intel_pstate_use_acpi_profile(void) +{ + if (acpi_gbl_FADT.preferred_profile == PM_MOBILE) + pstate_funcs.get_target_pstate = + get_target_pstate_use_cpu_load; +} +#else +static void intel_pstate_use_acpi_profile(void) +{ +} +#endif + static void __init copy_cpu_funcs(struct pstate_funcs *funcs) { pstate_funcs.get_max = funcs->get_max; @@ -1737,6 +2301,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_vid = funcs->get_vid; pstate_funcs.get_target_pstate = funcs->get_target_pstate; + intel_pstate_use_acpi_profile(); } #ifdef CONFIG_ACPI @@ -1850,9 +2415,20 @@ static bool __init intel_pstate_platform_pwr_mgmt_exists(void) return false; } + +static void intel_pstate_request_control_from_smm(void) +{ + /* + * It may be unsafe to request P-states control from SMM if _PPC support + * has not been enabled. 
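In the passive-mode intel_cpufreq_target() shown above, the cpufreq relation decides how a target frequency is rounded to a P-state. A standalone sketch of that rounding (illustrative names; scaling is the kHz-per-P-state factor):

enum relation { RELATION_L, RELATION_H, RELATION_C };

static int freq_to_pstate(unsigned int freq, unsigned int scaling,
			  enum relation rel)
{
	switch (rel) {
	case RELATION_L:	/* lowest frequency >= target */
		return (freq + scaling - 1) / scaling;	/* DIV_ROUND_UP */
	case RELATION_H:	/* highest frequency <= target */
		return freq / scaling;
	default:		/* closest frequency */
		return (freq + scaling / 2) / scaling;
	}
}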
+ */ + if (acpi_ppc) + acpi_processor_pstate_control(); +} #else /* CONFIG_ACPI not enabled */ static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } static inline bool intel_pstate_has_acpi_ppc(void) { return false; } +static inline void intel_pstate_request_control_from_smm(void) {} #endif /* CONFIG_ACPI */ static const struct x86_cpu_id hwp_support_ids[] __initconst = { @@ -1872,6 +2448,7 @@ static int __init intel_pstate_init(void) if (x86_match_cpu(hwp_support_ids) && !no_hwp) { copy_cpu_funcs(&core_params.funcs); hwp_active++; + intel_pstate.attr = hwp_cpufreq_attrs; goto hwp_cpu_matched; } @@ -1904,7 +2481,9 @@ hwp_cpu_matched: if (!hwp_active && hwp_only) goto out; - rc = cpufreq_register_driver(&intel_pstate_driver); + intel_pstate_request_control_from_smm(); + + rc = cpufreq_register_driver(intel_pstate_driver); if (rc) goto out; @@ -1919,7 +2498,9 @@ out: get_online_cpus(); for_each_online_cpu(cpu) { if (all_cpu_data[cpu]) { - intel_pstate_clear_update_util_hook(cpu); + if (intel_pstate_driver == &intel_pstate) + intel_pstate_clear_update_util_hook(cpu); + kfree(all_cpu_data[cpu]); } } @@ -1935,8 +2516,13 @@ static int __init intel_pstate_setup(char *str) if (!str) return -EINVAL; - if (!strcmp(str, "disable")) + if (!strcmp(str, "disable")) { no_load = 1; + } else if (!strcmp(str, "passive")) { + pr_info("Passive mode enabled\n"); + intel_pstate_driver = &intel_cpufreq; + no_hwp = 1; + } if (!strcmp(str, "no_hwp")) { pr_info("HWP disabled\n"); no_hwp = 1; @@ -1945,6 +2531,8 @@ force_load = 1; if (!strcmp(str, "hwp_only")) hwp_only = 1; + if (!strcmp(str, "per_cpu_perf_limits")) + per_cpu_limits = true; #ifdef CONFIG_ACPI if (!strcmp(str, "support_acpi_ppc")) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index d3ffde8..37671b5 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -42,6 +42,10 @@ #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE (1UL << 31) #define PMSR_MAX(x) ((x >> 32) & 0xFF) +#define LPSTATE_SHIFT 48 +#define GPSTATE_SHIFT 56 +#define GET_LPSTATE(x) (((x) >> LPSTATE_SHIFT) & 0xFF) +#define GET_GPSTATE(x) (((x) >> GPSTATE_SHIFT) & 0xFF) #define MAX_RAMP_DOWN_TIME 5120 /* @@ -592,7 +596,8 @@ void gpstate_timer_handler(unsigned long data) { struct cpufreq_policy *policy = (struct cpufreq_policy *)data; struct global_pstate_info *gpstates = policy->driver_data; - int gpstate_idx; + int gpstate_idx, lpstate_idx; + unsigned long val; unsigned int time_diff = jiffies_to_msecs(jiffies) - gpstates->last_sampled_time; struct powernv_smp_call_data freq_data; @@ -600,21 +605,37 @@ void gpstate_timer_handler(unsigned long data) if (!spin_trylock(&gpstates->gpstate_lock)) return; + /* + * If the PMCR was last updated via fast_switch(), the value + * cached in gpstates->last_lpstate_idx may be wrong. Hence, + * read from the PMCR to get correct data. 
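The new GET_LPSTATE()/GET_GPSTATE() macros decode the PMCR layout used by this handler: the local and global P-state ids sit in bits 48-55 and 56-63 and are signed 8-bit values, hence the (s8) casts below. A standalone sketch of the decoding:

#include <stdint.h>

#define LPSTATE_SHIFT	48
#define GPSTATE_SHIFT	56
#define GET_LPSTATE(x)	(((x) >> LPSTATE_SHIFT) & 0xFF)
#define GET_GPSTATE(x)	(((x) >> GPSTATE_SHIFT) & 0xFF)

static void decode_pmcr(uint64_t pmcr, int8_t *lpstate, int8_t *gpstate)
{
	*lpstate = (int8_t)GET_LPSTATE(pmcr);	/* last local P-state */
	*gpstate = (int8_t)GET_GPSTATE(pmcr);	/* current global P-state */
}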
+ */ + val = get_pmspr(SPRN_PMCR); + freq_data.gpstate_id = (s8)GET_GPSTATE(val); + freq_data.pstate_id = (s8)GET_LPSTATE(val); + if (freq_data.gpstate_id == freq_data.pstate_id) { + reset_gpstates(policy); + spin_unlock(&gpstates->gpstate_lock); + return; + } + gpstates->last_sampled_time += time_diff; gpstates->elapsed_time += time_diff; - freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx); - if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) || - (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) { + if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) { gpstate_idx = pstate_to_idx(freq_data.pstate_id); + lpstate_idx = gpstate_idx; reset_gpstates(policy); gpstates->highest_lpstate_idx = gpstate_idx; } else { + lpstate_idx = pstate_to_idx(freq_data.pstate_id); gpstate_idx = calc_global_pstate(gpstates->elapsed_time, gpstates->highest_lpstate_idx, - gpstates->last_lpstate_idx); + lpstate_idx); } - + freq_data.gpstate_id = idx_to_pstate(gpstate_idx); + gpstates->last_gpstate_idx = gpstate_idx; + gpstates->last_lpstate_idx = lpstate_idx; /* * If local pstate is equal to global pstate, rampdown is over * So timer is not required to be queued. @@ -622,10 +643,6 @@ void gpstate_timer_handler(unsigned long data) if (gpstate_idx != gpstates->last_lpstate_idx) queue_gpstate_timer(gpstates); - freq_data.gpstate_id = idx_to_pstate(gpstate_idx); - gpstates->last_gpstate_idx = pstate_to_idx(freq_data.gpstate_id); - gpstates->last_lpstate_idx = pstate_to_idx(freq_data.pstate_id); - spin_unlock(&gpstates->gpstate_lock); /* Timer may get migrated to a different cpu on cpu hot unplug */ @@ -647,8 +664,14 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, if (unlikely(rebooting) && new_index != get_nominal_index()) return 0; - if (!throttled) + if (!throttled) { + /* we don't want to be preempted while + * checking if the CPU frequency has been throttled + */ + preempt_disable(); powernv_cpufreq_throttle_check(NULL); + preempt_enable(); + } cur_msec = jiffies_to_msecs(get_jiffies_64()); @@ -752,9 +775,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) spin_lock_init(&gpstates->gpstate_lock); ret = cpufreq_table_validate_and_show(policy, powernv_freqs); - if (ret < 0) + if (ret < 0) { kfree(policy->driver_data); + return ret; + } + policy->fast_switch_possible = true; return ret; } @@ -897,6 +923,20 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) del_timer_sync(&gpstates->timer); } +static unsigned int powernv_fast_switch(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + int index; + struct powernv_smp_call_data freq_data; + + index = cpufreq_table_find_index_dl(policy, target_freq); + freq_data.pstate_id = powernv_freqs[index].driver_data; + freq_data.gpstate_id = powernv_freqs[index].driver_data; + set_pstate(&freq_data); + + return powernv_freqs[index].frequency; +} + static struct cpufreq_driver powernv_cpufreq_driver = { .name = "powernv-cpufreq", .flags = CPUFREQ_CONST_LOOPS, @@ -904,6 +944,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .exit = powernv_cpufreq_cpu_exit, .verify = cpufreq_generic_frequency_table_verify, .target_index = powernv_cpufreq_target_index, + .fast_switch = powernv_fast_switch, .get = powernv_cpufreq_get, .stop_cpu = powernv_cpufreq_stop_cpu, .attr = powernv_cpu_freq_attr, diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 7fe442c..0835a37 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ 
-22,7 +22,7 @@ #define POWERNV_THRESHOLD_LATENCY_NS 200000 -struct cpuidle_driver powernv_idle_driver = { +static struct cpuidle_driver powernv_idle_driver = { .name = "powernv_idle", .owner = THIS_MODULE, }; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207a..62810ff 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,23 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/** + * cpuidle_use_deepest_state - Set/clear governor override flag. + * @enable: New value of the flag. + * + * Set/unset the current CPU to use the deepest idle state (override governors + * going forward if set). + */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. @@ -109,6 +125,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c index a5c111b..ffca4fc 100644 --- a/drivers/cpuidle/dt_idle_states.c +++ b/drivers/cpuidle/dt_idle_states.c @@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state, * state enter function. */ idle_state->enter = match_id->data; + /* + * Since this is not a "coupled" state, it's safe to assume interrupts + * won't be enabled when it exits, allowing the tick to be frozen + * safely. So enter() can also serve as the enter_freeze() callback. 
+ */ + idle_state->enter_freeze = match_id->data; err = of_property_read_u32(state_node, "wakeup-latency-us", &idle_state->exit_latency); diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index fb9f511..4e78263 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -9,7 +9,6 @@ */ #include <linux/mutex.h> -#include <linux/module.h> #include <linux/cpuidle.h> #include "cpuidle.h" @@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov) if (cpuidle_curr_governor) { list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_disable_device(dev); - module_put(cpuidle_curr_governor->owner); } cpuidle_curr_governor = gov; if (gov) { - if (!try_module_get(cpuidle_curr_governor->owner)) - return -EINVAL; list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_enable_device(dev); cpuidle_install_idle_handler(); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 63bd5a4..fe8f089 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -15,7 +15,6 @@ #include <linux/kernel.h> #include <linux/cpuidle.h> #include <linux/pm_qos.h> -#include <linux/module.h> #include <linux/jiffies.h> #include <linux/tick.h> @@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = { .enable = ladder_enable_device, .select = ladder_select_state, .reflect = ladder_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 03d38c2..d9b5b93 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,7 +19,6 @@ #include <linux/tick.h> #include <linux/sched.h> #include <linux/math64.h> -#include <linux/module.h> /* * Please note when changing the tuning values: @@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = { .enable = menu_enable_device, .select = menu_select, .reflect = menu_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 832a2c3..c5adc8c 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -403,8 +403,10 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) /* state statistics */ for (i = 0; i < drv->state_count; i++) { kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); - if (!kobj) + if (!kobj) { + ret = -ENOMEM; goto error_state; + } kobj->state = &drv->states[i]; kobj->state_usage = &device->states_usage[i]; init_completion(&kobj->kobj_unregister); diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index bf3ea76..a324801 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -850,7 +850,7 @@ err_out: EXPORT_SYMBOL(devfreq_add_governor); /** - * devfreq_remove_device() - Remove devfreq feature from a device. + * devfreq_remove_governor() - Remove a devfreq governor. 
* @governor: the devfreq governor to be removed */ int devfreq_remove_governor(struct devfreq_governor *governor) diff --git a/drivers/devfreq/event/exynos-nocp.c b/drivers/devfreq/event/exynos-nocp.c index 49e712a..5c3e7b1 100644 --- a/drivers/devfreq/event/exynos-nocp.c +++ b/drivers/devfreq/event/exynos-nocp.c @@ -190,6 +190,7 @@ static const struct of_device_id exynos_nocp_id_match[] = { { .compatible = "samsung,exynos5420-nocp", }, { /* sentinel */ }, }; +MODULE_DEVICE_TABLE(of, exynos_nocp_id_match); static struct regmap_config exynos_nocp_regmap_config = { .reg_bits = 32, diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c index f55cf0e..107eb91 100644 --- a/drivers/devfreq/event/exynos-ppmu.c +++ b/drivers/devfreq/event/exynos-ppmu.c @@ -15,7 +15,6 @@ #include <linux/io.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/mutex.h> #include <linux/of_address.h> #include <linux/platform_device.h> #include <linux/suspend.h> @@ -34,7 +33,6 @@ struct exynos_ppmu { unsigned int num_events; struct device *dev; - struct mutex lock; struct exynos_ppmu_data ppmu; }; @@ -90,8 +88,6 @@ struct __exynos_ppmu_events { PPMU_EVENT(d1-cpu), PPMU_EVENT(d1-general), PPMU_EVENT(d1-rt), - - { /* sentinel */ }, }; static int exynos_ppmu_find_ppmu_id(struct devfreq_event_dev *edev) @@ -351,6 +347,7 @@ static const struct of_device_id exynos_ppmu_id_match[] = { }, { /* sentinel */ }, }; +MODULE_DEVICE_TABLE(of, exynos_ppmu_id_match); static struct devfreq_event_ops *exynos_bus_get_ops(struct device_node *np) { @@ -463,7 +460,6 @@ static int exynos_ppmu_probe(struct platform_device *pdev) if (!info) return -ENOMEM; - mutex_init(&info->lock); info->dev = &pdev->dev; /* Parse dt data to get resource */ diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c index 43fcc5a..22b1133 100644 --- a/drivers/devfreq/event/rockchip-dfi.c +++ b/drivers/devfreq/event/rockchip-dfi.c @@ -188,6 +188,7 @@ static const struct of_device_id rockchip_dfi_id_match[] = { { .compatible = "rockchip,rk3399-dfi" }, { }, }; +MODULE_DEVICE_TABLE(of, rockchip_dfi_id_match); static int rockchip_dfi_probe(struct platform_device *pdev) { diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c index 29866f7..a8ed779 100644 --- a/drivers/devfreq/exynos-bus.c +++ b/drivers/devfreq/exynos-bus.c @@ -35,7 +35,7 @@ struct exynos_bus { unsigned int edev_count; struct mutex lock; - struct dev_pm_opp *curr_opp; + unsigned long curr_freq; struct regulator *regulator; struct clk *clk; @@ -99,7 +99,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags) { struct exynos_bus *bus = dev_get_drvdata(dev); struct dev_pm_opp *new_opp; - unsigned long old_freq, new_freq, old_volt, new_volt, tol; + unsigned long old_freq, new_freq, new_volt, tol; int ret = 0; /* Get new opp-bus instance according to new bus clock */ @@ -113,8 +113,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags) new_freq = dev_pm_opp_get_freq(new_opp); new_volt = dev_pm_opp_get_voltage(new_opp); - old_freq = dev_pm_opp_get_freq(bus->curr_opp); - old_volt = dev_pm_opp_get_voltage(bus->curr_opp); + old_freq = bus->curr_freq; rcu_read_unlock(); if (old_freq == new_freq) @@ -146,7 +145,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags) goto out; } } - bus->curr_opp = new_opp; + bus->curr_freq = new_freq; dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n", old_freq/1000, new_freq/1000); 
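The exynos-bus and rk3399_dmc changes in this patch converge on the same rule: do not cache a struct dev_pm_opp pointer, since it is only valid under rcu_read_lock() and may be freed when the OPP table changes; copy the scalars out and remember only the frequency. A kernel-style sketch of the pattern (a sketch only; field names follow exynos-bus.c above):

static int bus_set_rate(struct device *dev, struct exynos_bus *bus,
			unsigned long *freq, u32 flags)
{
	struct dev_pm_opp *opp;
	unsigned long new_freq, new_volt;

	rcu_read_lock();
	opp = devfreq_recommended_opp(dev, freq, flags);
	if (IS_ERR(opp)) {
		rcu_read_unlock();
		return PTR_ERR(opp);
	}
	new_freq = dev_pm_opp_get_freq(opp);	/* copy out what we need */
	new_volt = dev_pm_opp_get_voltage(opp);
	rcu_read_unlock();			/* opp must not be used after this */

	if (bus->curr_freq == new_freq)
		return 0;

	dev_dbg(dev, "switching to %lu kHz, %lu uV\n",
		new_freq / 1000, new_volt);
	/* ... program the clock and regulator here ... */

	bus->curr_freq = new_freq;	/* cache the value, not the pointer */
	return 0;
}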
@@ -163,9 +162,7 @@ static int exynos_bus_get_dev_status(struct device *dev, struct devfreq_event_data edata; int ret; - rcu_read_lock(); - stat->current_frequency = dev_pm_opp_get_freq(bus->curr_opp); - rcu_read_unlock(); + stat->current_frequency = bus->curr_freq; ret = exynos_bus_get_event(bus, &edata); if (ret < 0) { @@ -226,7 +223,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq, } new_freq = dev_pm_opp_get_freq(new_opp); - old_freq = dev_pm_opp_get_freq(bus->curr_opp); + old_freq = bus->curr_freq; rcu_read_unlock(); if (old_freq == new_freq) @@ -242,7 +239,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq, } *freq = new_freq; - bus->curr_opp = new_opp; + bus->curr_freq = new_freq; dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n", old_freq/1000, new_freq/1000); @@ -335,6 +332,7 @@ static int exynos_bus_parse_of(struct device_node *np, struct exynos_bus *bus) { struct device *dev = bus->dev; + struct dev_pm_opp *opp; unsigned long rate; int ret; @@ -352,22 +350,23 @@ static int exynos_bus_parse_of(struct device_node *np, } /* Get the freq and voltage from OPP table to scale the bus freq */ - rcu_read_lock(); ret = dev_pm_opp_of_add_table(dev); if (ret < 0) { dev_err(dev, "failed to get OPP table\n"); - rcu_read_unlock(); goto err_clk; } rate = clk_get_rate(bus->clk); - bus->curr_opp = devfreq_recommended_opp(dev, &rate, 0); - if (IS_ERR(bus->curr_opp)) { + + rcu_read_lock(); + opp = devfreq_recommended_opp(dev, &rate, 0); + if (IS_ERR(opp)) { dev_err(dev, "failed to find dev_pm_opp\n"); rcu_read_unlock(); - ret = PTR_ERR(bus->curr_opp); + ret = PTR_ERR(opp); goto err_opp; } + bus->curr_freq = dev_pm_opp_get_freq(opp); rcu_read_unlock(); return 0; diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c index e24b73d..27d2f34 100644 --- a/drivers/devfreq/rk3399_dmc.c +++ b/drivers/devfreq/rk3399_dmc.c @@ -80,7 +80,6 @@ struct rk3399_dmcfreq { struct regulator *vdd_center; unsigned long rate, target_rate; unsigned long volt, target_volt; - struct dev_pm_opp *curr_opp; }; static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, @@ -102,9 +101,6 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, target_rate = dev_pm_opp_get_freq(opp); target_volt = dev_pm_opp_get_voltage(opp); - dmcfreq->rate = dev_pm_opp_get_freq(dmcfreq->curr_opp); - dmcfreq->volt = dev_pm_opp_get_voltage(dmcfreq->curr_opp); - rcu_read_unlock(); if (dmcfreq->rate == target_rate) @@ -165,7 +161,9 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, if (err) dev_err(dev, "Cannot to set vol %lu uV\n", target_volt); - dmcfreq->curr_opp = opp; + dmcfreq->rate = target_rate; + dmcfreq->volt = target_volt; + out: mutex_unlock(&dmcfreq->lock); return err; @@ -414,7 +412,6 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) */ if (dev_pm_opp_of_add_table(dev)) { dev_err(dev, "Invalid operating-points in device tree.\n"); - rcu_read_unlock(); return -EINVAL; } @@ -431,12 +428,13 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) rcu_read_unlock(); return PTR_ERR(opp); } + data->rate = dev_pm_opp_get_freq(opp); + data->volt = dev_pm_opp_get_voltage(opp); rcu_read_unlock(); - data->curr_opp = opp; rk3399_devfreq_dmc_profile.initial_freq = data->rate; - data->devfreq = devfreq_add_device(dev, + data->devfreq = devm_devfreq_add_device(dev, &rk3399_devfreq_dmc_profile, "simple_ondemand", &data->ondemand_data); @@ -454,6 +452,7 @@ static const struct 
of_device_id rk3399dmc_devfreq_of_match[] = { { .compatible = "rockchip,rk3399-dmc" }, { }, }; +MODULE_DEVICE_TABLE(of, rk3399dmc_devfreq_of_match); static struct platform_driver rk3399_dmcfreq_driver = { .probe = rk3399_dmcfreq_probe, diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4466a2f..7d8ea3d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -98,8 +98,6 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); static void intel_idle_freeze(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); -static int intel_idle_cpu_init(int cpu); - static struct cpuidle_state *cpuidle_state_table; /* @@ -724,6 +722,50 @@ static struct cpuidle_state atom_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state tangier_cstates[] = { + { + .name = "C1-TNG", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 4, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C4-TNG", + .desc = "MWAIT 0x30", + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 100, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-TNG", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 560, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-TNG", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 1200, + .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C9-TNG", + .desc = "MWAIT 0x64", + .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 20000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; static struct cpuidle_state avn_cstates[] = { { .name = "C1-AVN", @@ -907,51 +949,15 @@ static void intel_idle_freeze(struct cpuidle_device *dev, mwait_idle_with_hints(eax, ecx); } -static void __setup_broadcast_timer(void *arg) +static void __setup_broadcast_timer(bool on) { - unsigned long on = (unsigned long)arg; - if (on) tick_broadcast_enable(); else tick_broadcast_disable(); } -static int cpu_hotplug_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct cpuidle_device *dev; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - - if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) - smp_call_function_single(hotcpu, __setup_broadcast_timer, - (void *)true, 1); - - /* - * Some systems can hotplug a cpu at runtime after - * the kernel has booted, we have to initialize the - * driver in this case - */ - dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu); - if (dev->registered) - break; - - if (intel_idle_cpu_init(hotcpu)) - return NOTIFY_BAD; - - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_hotplug_notifier = { - .notifier_call = cpu_hotplug_notify, -}; - -static void auto_demotion_disable(void *dummy) +static void auto_demotion_disable(void) { unsigned long long msr_bits; @@ -959,7 +965,7 @@ static void auto_demotion_disable(void *dummy) msr_bits &= ~(icpu->auto_demotion_disable_flags); wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); } -static void c1e_promotion_disable(void *dummy) +static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -978,6 +984,10 @@ static const 
struct idle_cpu idle_cpu_atom = { .state_table = atom_cstates, }; +static const struct idle_cpu idle_cpu_tangier = { + .state_table = tangier_cstates, +}; + static const struct idle_cpu idle_cpu_lincroft = { .state_table = atom_cstates, .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, @@ -1066,6 +1076,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), + ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), @@ -1084,6 +1095,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl), ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), + ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl), ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), {} @@ -1373,12 +1385,11 @@ static void __init intel_idle_cpuidle_driver_init(void) * allocate, initialize, register cpuidle_devices * @cpu: cpu/core to initialize */ -static int intel_idle_cpu_init(int cpu) +static int intel_idle_cpu_init(unsigned int cpu) { struct cpuidle_device *dev; dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); - dev->cpu = cpu; if (cpuidle_register_device(dev)) { @@ -1387,17 +1398,36 @@ static int intel_idle_cpu_init(int cpu) } if (icpu->auto_demotion_disable_flags) - smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); + auto_demotion_disable(); if (icpu->disable_promotion_to_c1e) - smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1); + c1e_promotion_disable(); + + return 0; +} + +static int intel_idle_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev; + + if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) + __setup_broadcast_timer(true); + + /* + * Some systems can hotplug a cpu at runtime after + * the kernel has booted, we have to initialize the + * driver in this case + */ + dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); + if (!dev->registered) + return intel_idle_cpu_init(cpu); return 0; } static int __init intel_idle_init(void) { - int retval, i; + int retval; /* Do not load intel_idle at all for now if idle= is passed */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) @@ -1417,35 +1447,29 @@ static int __init intel_idle_init(void) struct cpuidle_driver *drv = cpuidle_get_driver(); printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", drv ? 
drv->name : "none"); - free_percpu(intel_idle_cpuidle_devices); - return retval; + goto init_driver_fail; } - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - retval = intel_idle_cpu_init(i); - if (retval) { - intel_idle_cpuidle_devices_uninit(); - cpu_notifier_register_done(); - cpuidle_unregister_driver(&intel_idle_driver); - free_percpu(intel_idle_cpuidle_devices); - return retval; - } - } - __register_cpu_notifier(&cpu_hotplug_notifier); - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; - else - on_each_cpu(__setup_broadcast_timer, (void *)true, 1); - cpu_notifier_register_done(); + retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", + intel_idle_cpu_online, NULL); + if (retval < 0) + goto hp_setup_fail; pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", lapic_timer_reliable_states); return 0; + +hp_setup_fail: + intel_idle_cpuidle_devices_uninit(); + cpuidle_unregister_driver(&intel_idle_driver); +init_driver_fail: + free_percpu(intel_idle_cpuidle_devices); + return retval; + } device_initcall(intel_idle_init); diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index c48fc0c..fa5ca09 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -2585,6 +2585,9 @@ static int smsc911x_suspend(struct device *dev) PMT_CTRL_PM_MODE_D1_ | PMT_CTRL_WOL_EN_ | PMT_CTRL_ED_EN_ | PMT_CTRL_PME_EN_); + pm_runtime_disable(dev); + pm_runtime_set_suspended(dev); + return 0; } @@ -2594,6 +2597,9 @@ static int smsc911x_resume(struct device *dev) struct smsc911x_data *pdata = netdev_priv(ndev); unsigned int to = 100; + pm_runtime_enable(dev); + pm_runtime_resume(dev); + /* Note 3.11 from the datasheet: * "When the LAN9220 is in a power saving state, a write of any * data to the BYTE_TEST register will wake-up the device." diff --git a/drivers/power/avs/rockchip-io-domain.c b/drivers/power/avs/rockchip-io-domain.c index 01b6d3f..56bce19 100644 --- a/drivers/power/avs/rockchip-io-domain.c +++ b/drivers/power/avs/rockchip-io-domain.c @@ -143,7 +143,7 @@ static int rockchip_iodomain_notify(struct notifier_block *nb, if (ret && event == REGULATOR_EVENT_PRE_VOLTAGE_CHANGE) return NOTIFY_BAD; - dev_info(supply->iod->dev, "Setting to %d done\n", uV); + dev_dbg(supply->iod->dev, "Setting to %d done\n", uV); return NOTIFY_OK; } diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 243b233..9a25110 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -189,14 +189,13 @@ struct rapl_package { unsigned int time_unit; struct rapl_domain *domains; /* array of domains, sized at runtime */ struct powercap_zone *power_zone; /* keep track of parent zone */ - int nr_cpus; /* active cpus on the package, topology info is lost during - * cpu hotplug. so we have to track ourselves. - */ unsigned long power_limit_irq; /* keep track of package power limit * notify interrupt enable status. 
*/ struct list_head plist; int lead_cpu; /* one active cpu per package for access */ + /* Track active cpus */ + struct cpumask cpumask; }; struct rapl_defaults { @@ -275,18 +274,6 @@ static struct rapl_package *find_package_by_id(int id) return NULL; } -/* caller must hold cpu hotplug lock */ -static void rapl_cleanup_data(void) -{ - struct rapl_package *p, *tmp; - - list_for_each_entry_safe(p, tmp, &rapl_packages, plist) { - kfree(p->domains); - list_del(&p->plist); - kfree(p); - } -} - static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw) { struct rapl_domain *rd; @@ -442,6 +429,7 @@ static int contraint_to_pl(struct rapl_domain *rd, int cid) return i; } } + pr_err("Cannot find matching power limit for constraint %d\n", cid); return -EINVAL; } @@ -457,6 +445,10 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto set_exit; + } rp = rd->rp; @@ -496,6 +488,11 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto get_exit; + } + switch (rd->rpl[id].prim_id) { case PL1_ENABLE: prim = POWER_LIMIT1; @@ -512,6 +509,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid, else *data = val; +get_exit: put_online_cpus(); return ret; @@ -527,6 +525,10 @@ static int set_time_window(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto set_time_exit; + } switch (rd->rpl[id].prim_id) { case PL1_ENABLE: @@ -538,6 +540,8 @@ static int set_time_window(struct powercap_zone *power_zone, int cid, default: ret = -EINVAL; } + +set_time_exit: put_online_cpus(); return ret; } @@ -552,6 +556,10 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data) get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto get_time_exit; + } switch (rd->rpl[id].prim_id) { case PL1_ENABLE: @@ -566,6 +574,8 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data) } if (!ret) *data = val; + +get_time_exit: put_online_cpus(); return ret; @@ -707,7 +717,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, case ENERGY_UNIT: scale = ENERGY_UNIT_SCALE; /* per domain unit takes precedence */ - if (rd && rd->domain_energy_unit) + if (rd->domain_energy_unit) units = rd->domain_energy_unit; else units = rp->energy_unit; @@ -976,10 +986,20 @@ static void package_power_limit_irq_save(struct rapl_package *rp) smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); } -static void power_limit_irq_restore_cpu(void *info) +/* + * Restore per package power limit interrupt enable state. Called from cpu + * hotplug code on package removal. 
+ */ +static void package_power_limit_irq_restore(struct rapl_package *rp) { - u32 l, h = 0; - struct rapl_package *rp = (struct rapl_package *)info; + u32 l, h; + + if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) + return; + + /* irq enable state not saved, nothing to restore */ + if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) + return; rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); @@ -991,19 +1011,6 @@ static void power_limit_irq_restore_cpu(void *info) wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } -/* restore per package power limit interrupt enable state */ -static void package_power_limit_irq_restore(struct rapl_package *rp) -{ - if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) - return; - - /* irq enable state not saved, nothing to restore */ - if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) - return; - - smp_call_function_single(rp->lead_cpu, power_limit_irq_restore_cpu, rp, 1); -} - static void set_floor_freq_default(struct rapl_domain *rd, bool mode) { int nr_powerlimit = find_nr_power_limit(rd); @@ -1160,84 +1167,49 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server), + RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM, rapl_defaults_hsw_server), {} }; MODULE_DEVICE_TABLE(x86cpu, rapl_ids); -/* read once for all raw primitive data for all packages, domains */ -static void rapl_update_domain_data(void) +/* Read once for all raw primitive data for domains */ +static void rapl_update_domain_data(struct rapl_package *rp) { int dmn, prim; u64 val; - struct rapl_package *rp; - list_for_each_entry(rp, &rapl_packages, plist) { - for (dmn = 0; dmn < rp->nr_domains; dmn++) { - pr_debug("update package %d domain %s data\n", rp->id, - rp->domains[dmn].name); - /* exclude non-raw primitives */ - for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) - if (!rapl_read_data_raw(&rp->domains[dmn], prim, - rpi[prim].unit, - &val)) - rp->domains[dmn].rdd.primitives[prim] = - val; + for (dmn = 0; dmn < rp->nr_domains; dmn++) { + pr_debug("update package %d domain %s data\n", rp->id, + rp->domains[dmn].name); + /* exclude non-raw primitives */ + for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) { + if (!rapl_read_data_raw(&rp->domains[dmn], prim, + rpi[prim].unit, &val)) + rp->domains[dmn].rdd.primitives[prim] = val; } } } -static int rapl_unregister_powercap(void) +static void rapl_unregister_powercap(void) { - struct rapl_package *rp; - struct rapl_domain *rd, *rd_package = NULL; - - /* unregister all active rapl packages from the powercap layer, - * hotplug lock held - */ - list_for_each_entry(rp, &rapl_packages, plist) { - package_power_limit_irq_restore(rp); - - for (rd = rp->domains; rd < rp->domains + rp->nr_domains; - rd++) { - pr_debug("remove package, undo power limit on %d: %s\n", - rp->id, rd->name); - rapl_write_data_raw(rd, PL1_ENABLE, 0); - rapl_write_data_raw(rd, PL1_CLAMP, 0); - if (find_nr_power_limit(rd) > 1) { - rapl_write_data_raw(rd, PL2_ENABLE, 0); - rapl_write_data_raw(rd, PL2_CLAMP, 0); - } - if (rd->id == RAPL_DOMAIN_PACKAGE) { - rd_package = rd; - continue; - } - powercap_unregister_zone(control_type, &rd->power_zone); - } - /* do the package zone last */ - if (rd_package) - powercap_unregister_zone(control_type, - &rd_package->power_zone); - } - if (platform_rapl_domain) { powercap_unregister_zone(control_type, &platform_rapl_domain->power_zone); kfree(platform_rapl_domain); } - 
powercap_unregister_control_type(control_type); - - return 0; } static int rapl_package_register_powercap(struct rapl_package *rp) { struct rapl_domain *rd; - int ret = 0; char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/ struct powercap_zone *power_zone = NULL; - int nr_pl; + int nr_pl, ret; + + /* Update the domain data of the new package */ + rapl_update_domain_data(rp); /* first we register package domain as the parent zone*/ for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { @@ -1257,8 +1229,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp) if (IS_ERR(power_zone)) { pr_debug("failed to register package, %d\n", rp->id); - ret = PTR_ERR(power_zone); - goto exit_package; + return PTR_ERR(power_zone); } /* track parent zone in per package/socket data */ rp->power_zone = power_zone; @@ -1268,8 +1239,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp) } if (!power_zone) { pr_err("no package domain found, unknown topology!\n"); - ret = -ENODEV; - goto exit_package; + return -ENODEV; } /* now register domains as children of the socket/package*/ for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { @@ -1290,11 +1260,11 @@ static int rapl_package_register_powercap(struct rapl_package *rp) goto err_cleanup; } } + return 0; -exit_package: - return ret; err_cleanup: - /* clean up previously initialized domains within the package if we + /* + * Clean up previously initialized domains within the package if we * failed after the first domain setup. */ while (--rd >= rp->domains) { @@ -1305,7 +1275,7 @@ err_cleanup: return ret; } -static int rapl_register_psys(void) +static int __init rapl_register_psys(void) { struct rapl_domain *rd; struct powercap_zone *power_zone; @@ -1346,40 +1316,14 @@ static int rapl_register_psys(void) return 0; } -static int rapl_register_powercap(void) +static int __init rapl_register_powercap(void) { - struct rapl_domain *rd; - struct rapl_package *rp; - int ret = 0; - control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); if (IS_ERR(control_type)) { pr_debug("failed to register powercap control_type.\n"); return PTR_ERR(control_type); } - /* read the initial data */ - rapl_update_domain_data(); - list_for_each_entry(rp, &rapl_packages, plist) if (rapl_package_register_powercap(rp)) - goto err_cleanup_package; - - /* Don't bail out if PSys is not supported */ - rapl_register_psys(); - - return ret; - -err_cleanup_package: - /* clean up previously initialized packages */ - list_for_each_entry_continue_reverse(rp, &rapl_packages, plist) { - for (rd = rp->domains; rd < rp->domains + rp->nr_domains; - rd++) { - pr_debug("unregister zone/package %d, %s domain\n", - rp->id, rd->name); - powercap_unregister_zone(control_type, &rd->power_zone); - } - } - - return ret; + return 0; } static int rapl_check_domain(int cpu, int domain) @@ -1452,9 +1396,8 @@ static void rapl_detect_powerlimit(struct rapl_domain *rd) */ static int rapl_detect_domains(struct rapl_package *rp, int cpu) { - int i; - int ret = 0; struct rapl_domain *rd; + int i; for (i = 0; i < RAPL_DOMAIN_MAX; i++) { /* use physical package id to read counters */ @@ -1466,84 +1409,20 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu) rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX); if (!rp->nr_domains) { pr_debug("no valid rapl domains found in package %d\n", rp->id); - ret = -ENODEV; - goto done; + return -ENODEV; } pr_debug("found %d domains on package %d\n", rp->nr_domains,
rp->id); rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), GFP_KERNEL); - if (!rp->domains) { - ret = -ENOMEM; - goto done; - } + if (!rp->domains) + return -ENOMEM; + rapl_init_domains(rp); for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) rapl_detect_powerlimit(rd); - - -done: - return ret; -} - -static bool is_package_new(int package) -{ - struct rapl_package *rp; - - /* caller prevents cpu hotplug, there will be no new packages added - * or deleted while traversing the package list, no need for locking. - */ - list_for_each_entry(rp, &rapl_packages, plist) - if (package == rp->id) - return false; - - return true; -} - -/* RAPL interface can be made of a two-level hierarchy: package level and domain - * level. We first detect the number of packages then domains of each package. - * We have to consider the possiblity of CPU online/offline due to hotplug and - * other scenarios. - */ -static int rapl_detect_topology(void) -{ - int i; - int phy_package_id; - struct rapl_package *new_package, *rp; - - for_each_online_cpu(i) { - phy_package_id = topology_physical_package_id(i); - if (is_package_new(phy_package_id)) { - new_package = kzalloc(sizeof(*rp), GFP_KERNEL); - if (!new_package) { - rapl_cleanup_data(); - return -ENOMEM; - } - /* add the new package to the list */ - new_package->id = phy_package_id; - new_package->nr_cpus = 1; - /* use the first active cpu of the package to access */ - new_package->lead_cpu = i; - /* check if the package contains valid domains */ - if (rapl_detect_domains(new_package, i) || - rapl_defaults->check_unit(new_package, i)) { - kfree(new_package->domains); - kfree(new_package); - /* free up the packages already initialized */ - rapl_cleanup_data(); - return -ENODEV; - } - INIT_LIST_HEAD(&new_package->plist); - list_add(&new_package->plist, &rapl_packages); - } else { - rp = find_package_by_id(phy_package_id); - if (rp) - ++rp->nr_cpus; - } - } - return 0; } @@ -1552,12 +1431,21 @@ static void rapl_remove_package(struct rapl_package *rp) { struct rapl_domain *rd, *rd_package = NULL; + package_power_limit_irq_restore(rp); + for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { + rapl_write_data_raw(rd, PL1_ENABLE, 0); + rapl_write_data_raw(rd, PL1_CLAMP, 0); + if (find_nr_power_limit(rd) > 1) { + rapl_write_data_raw(rd, PL2_ENABLE, 0); + rapl_write_data_raw(rd, PL2_CLAMP, 0); + } if (rd->id == RAPL_DOMAIN_PACKAGE) { rd_package = rd; continue; } - pr_debug("remove package %d, %s domain\n", rp->id, rd->name); + pr_debug("remove package, undo power limit on %d: %s\n", + rp->id, rd->name); powercap_unregister_zone(control_type, &rd->power_zone); } /* do parent zone last */ @@ -1567,20 +1455,17 @@ static void rapl_remove_package(struct rapl_package *rp) } /* called from CPU hotplug notifier, hotplug lock held */ -static int rapl_add_package(int cpu) +static struct rapl_package *rapl_add_package(int cpu, int pkgid) { - int ret = 0; - int phy_package_id; struct rapl_package *rp; + int ret; - phy_package_id = topology_physical_package_id(cpu); rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); if (!rp) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* add the new package to the list */ - rp->id = phy_package_id; - rp->nr_cpus = 1; + rp->id = pkgid; rp->lead_cpu = cpu; /* check if the package contains valid domains */ @@ -1589,17 +1474,17 @@ static int rapl_add_package(int cpu) ret = -ENODEV; goto err_free_package; } - if (!rapl_package_register_powercap(rp)) { + ret = rapl_package_register_powercap(rp); + if (!ret) { 
INIT_LIST_HEAD(&rp->plist); list_add(&rp->plist, &rapl_packages); - return ret; + return rp; } err_free_package: kfree(rp->domains); kfree(rp); - - return ret; + return ERR_PTR(ret); } /* Handles CPU hotplug on multi-socket systems. @@ -1609,55 +1494,46 @@ err_free_package: * associated domains. Cooling devices are handled accordingly at * per-domain level. */ -static int rapl_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int rapl_cpu_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; - int phy_package_id; + int pkgid = topology_physical_package_id(cpu); struct rapl_package *rp; - int lead_cpu; - phy_package_id = topology_physical_package_id(cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - rp = find_package_by_id(phy_package_id); - if (rp) - ++rp->nr_cpus; - else - rapl_add_package(cpu); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - rp = find_package_by_id(phy_package_id); - if (!rp) - break; - if (--rp->nr_cpus == 0) - rapl_remove_package(rp); - else if (cpu == rp->lead_cpu) { - /* choose another active cpu in the package */ - lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); - if (lead_cpu < nr_cpu_ids) - rp->lead_cpu = lead_cpu; - else /* should never go here */ - pr_err("no active cpu available for package %d\n", - phy_package_id); - } + rp = find_package_by_id(pkgid); + if (!rp) { + rp = rapl_add_package(cpu, pkgid); + if (IS_ERR(rp)) + return PTR_ERR(rp); } + cpumask_set_cpu(cpu, &rp->cpumask); + return 0; +} + +static int rapl_cpu_down_prep(unsigned int cpu) +{ + int pkgid = topology_physical_package_id(cpu); + struct rapl_package *rp; + int lead_cpu; + + rp = find_package_by_id(pkgid); + if (!rp) + return 0; - return NOTIFY_OK; + cpumask_clear_cpu(cpu, &rp->cpumask); + lead_cpu = cpumask_first(&rp->cpumask); + if (lead_cpu >= nr_cpu_ids) + rapl_remove_package(rp); + else if (rp->lead_cpu == cpu) + rp->lead_cpu = lead_cpu; + return 0; } -static struct notifier_block rapl_cpu_notifier = { - .notifier_call = rapl_cpu_callback, -}; +static enum cpuhp_state pcap_rapl_online; static int __init rapl_init(void) { - int ret = 0; const struct x86_cpu_id *id; + int ret; id = x86_match_cpu(rapl_ids); if (!id) { @@ -1669,36 +1545,29 @@ static int __init rapl_init(void) rapl_defaults = (struct rapl_defaults *)id->driver_data; - cpu_notifier_register_begin(); - - /* prevent CPU hotplug during detection */ - get_online_cpus(); - ret = rapl_detect_topology(); + ret = rapl_register_powercap(); if (ret) - goto done; + return ret; - if (rapl_register_powercap()) { - rapl_cleanup_data(); - ret = -ENODEV; - goto done; - } - __register_hotcpu_notifier(&rapl_cpu_notifier); -done: - put_online_cpus(); - cpu_notifier_register_done(); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online", + rapl_cpu_online, rapl_cpu_down_prep); + if (ret < 0) + goto err_unreg; + pcap_rapl_online = ret; + + /* Don't bail out if PSys is not supported */ + rapl_register_psys(); + return 0; +err_unreg: + rapl_unregister_powercap(); return ret; } static void __exit rapl_exit(void) { - cpu_notifier_register_begin(); - get_online_cpus(); - __unregister_hotcpu_notifier(&rapl_cpu_notifier); + cpuhp_remove_state(pcap_rapl_online); rapl_unregister_powercap(); - rapl_cleanup_data(); - put_online_cpus(); - cpu_notifier_register_done(); } module_init(rapl_init); diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index 
350cb5e..df64692 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -43,7 +43,6 @@ #include <linux/kernel.h> #include <linux/delay.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/cpu.h> #include <linux/thermal.h> #include <linux/slab.h> @@ -85,11 +84,26 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update */ static bool clamping; +static const struct sched_param sparam = { + .sched_priority = MAX_USER_RT_PRIO / 2, +}; +struct powerclamp_worker_data { + struct kthread_worker *worker; + struct kthread_work balancing_work; + struct kthread_delayed_work idle_injection_work; + unsigned int cpu; + unsigned int count; + unsigned int guard; + unsigned int window_size_now; + unsigned int target_ratio; + unsigned int duration_jiffies; + bool clamping; +}; -static struct task_struct * __percpu *powerclamp_thread; +static struct powerclamp_worker_data * __percpu worker_data; static struct thermal_cooling_device *cooling_dev; static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu - * clamping thread + * clamping kthread worker */ static unsigned int duration; @@ -261,11 +275,6 @@ static u64 pkg_state_counter(void) return count; } -static void noop_timer(unsigned long foo) -{ - /* empty... just the fact that we get the interrupt wakes us up */ -} - static unsigned int get_compensation(int ratio) { unsigned int comp = 0; @@ -367,103 +376,79 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, return set_target_ratio + guard <= current_ratio; } -static int clamp_thread(void *arg) +static void clamp_balancing_func(struct kthread_work *work) { - int cpunr = (unsigned long)arg; - DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0); - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; - unsigned int count = 0; - unsigned int target_ratio; + struct powerclamp_worker_data *w_data; + int sleeptime; + unsigned long target_jiffies; + unsigned int compensated_ratio; + int interval; /* jiffies to sleep for each attempt */ - set_bit(cpunr, cpu_clamping_mask); - set_freezable(); - init_timer_on_stack(&wakeup_timer); - sched_setscheduler(current, SCHED_FIFO, ¶m); - - while (true == clamping && !kthread_should_stop() && - cpu_online(cpunr)) { - int sleeptime; - unsigned long target_jiffies; - unsigned int guard; - unsigned int compensated_ratio; - int interval; /* jiffies to sleep for each attempt */ - unsigned int duration_jiffies = msecs_to_jiffies(duration); - unsigned int window_size_now; - - try_to_freeze(); - /* - * make sure user selected ratio does not take effect until - * the next round. adjust target_ratio if user has changed - * target such that we can converge quickly. - */ - target_ratio = set_target_ratio; - guard = 1 + target_ratio/20; - window_size_now = window_size; - count++; - - /* - * systems may have different ability to enter package level - * c-states, thus we need to compensate the injected idle ratio - * to achieve the actual target reported by the HW. - */ - compensated_ratio = target_ratio + - get_compensation(target_ratio); - if (compensated_ratio <= 0) - compensated_ratio = 1; - interval = duration_jiffies * 100 / compensated_ratio; - - /* align idle time */ - target_jiffies = roundup(jiffies, interval); - sleeptime = target_jiffies - jiffies; - if (sleeptime <= 0) - sleeptime = 1; - schedule_timeout_interruptible(sleeptime); - /* - * only elected controlling cpu can collect stats and update - * control parameters. 
- */ - if (cpunr == control_cpu && !(count%window_size_now)) { - should_skip = - powerclamp_adjust_controls(target_ratio, - guard, window_size_now); - smp_mb(); - } + w_data = container_of(work, struct powerclamp_worker_data, + balancing_work); - if (should_skip) - continue; - - target_jiffies = jiffies + duration_jiffies; - mod_timer(&wakeup_timer, target_jiffies); - if (unlikely(local_softirq_pending())) - continue; - /* - * stop tick sched during idle time, interrupts are still - * allowed. thus jiffies are updated properly. - */ - preempt_disable(); - /* mwait until target jiffies is reached */ - while (time_before(jiffies, target_jiffies)) { - unsigned long ecx = 1; - unsigned long eax = target_mwait; - - /* - * REVISIT: may call enter_idle() to notify drivers who - * can save power during cpu idle. same for exit_idle() - */ - local_touch_nmi(); - stop_critical_timings(); - mwait_idle_with_hints(eax, ecx); - start_critical_timings(); - atomic_inc(&idle_wakeup_counter); - } - preempt_enable(); + /* + * make sure user selected ratio does not take effect until + * the next round. adjust target_ratio if user has changed + * target such that we can converge quickly. + */ + w_data->target_ratio = READ_ONCE(set_target_ratio); + w_data->guard = 1 + w_data->target_ratio / 20; + w_data->window_size_now = window_size; + w_data->duration_jiffies = msecs_to_jiffies(duration); + w_data->count++; + + /* + * systems may have different ability to enter package level + * c-states, thus we need to compensate the injected idle ratio + * to achieve the actual target reported by the HW. + */ + compensated_ratio = w_data->target_ratio + + get_compensation(w_data->target_ratio); + if (compensated_ratio <= 0) + compensated_ratio = 1; + interval = w_data->duration_jiffies * 100 / compensated_ratio; + + /* align idle time */ + target_jiffies = roundup(jiffies, interval); + sleeptime = target_jiffies - jiffies; + if (sleeptime <= 0) + sleeptime = 1; + + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_delayed_work(w_data->worker, + &w_data->idle_injection_work, + sleeptime); +} + +static void clamp_idle_injection_func(struct kthread_work *work) +{ + struct powerclamp_worker_data *w_data; + + w_data = container_of(work, struct powerclamp_worker_data, + idle_injection_work.work); + + /* + * only elected controlling cpu can collect stats and update + * control parameters. 
+ */ + if (w_data->cpu == control_cpu && + !(w_data->count % w_data->window_size_now)) { + should_skip = + powerclamp_adjust_controls(w_data->target_ratio, + w_data->guard, + w_data->window_size_now); + smp_mb(); } - del_timer_sync(&wakeup_timer); - clear_bit(cpunr, cpu_clamping_mask); - return 0; + if (should_skip) + goto balance; + + play_idle(jiffies_to_msecs(w_data->duration_jiffies)); + +balance: + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_work(w_data->worker, &w_data->balancing_work); } /* @@ -507,10 +492,60 @@ static void poll_pkg_cstate(struct work_struct *dummy) schedule_delayed_work(&poll_pkg_cstate_work, HZ); } +static void start_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + struct kthread_worker *worker; + + worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu); + if (IS_ERR(worker)) + return; + + w_data->worker = worker; + w_data->count = 0; + w_data->cpu = cpu; + w_data->clamping = true; + set_bit(cpu, cpu_clamping_mask); + sched_setscheduler(worker->task, SCHED_FIFO, &sparam); + kthread_init_work(&w_data->balancing_work, clamp_balancing_func); + kthread_init_delayed_work(&w_data->idle_injection_work, + clamp_idle_injection_func); + kthread_queue_work(w_data->worker, &w_data->balancing_work); +} + +static void stop_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + + if (!w_data->worker) + return; + + w_data->clamping = false; + /* + * Make sure that all work items queued after this point see + * clamping disabled. The counterpart is not needed because + * there is an implicit memory barrier when the queued work + * is processed. + */ + smp_wmb(); + kthread_cancel_work_sync(&w_data->balancing_work); + kthread_cancel_delayed_work_sync(&w_data->idle_injection_work); + /* + * The balancing work still might be queued here because + * the handling of the "clamping" variable, cancel, and queue + * operations are not synchronized via a lock. But it is not + * a big deal. The balancing work is fast and + * kthread_destroy_worker() will wait for it. + */ + clear_bit(w_data->cpu, cpu_clamping_mask); + kthread_destroy_worker(w_data->worker); + + w_data->worker = NULL; +} + static int start_power_clamp(void) { unsigned long cpu; - struct task_struct *thread; set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); /* prevent cpu hotplug */ @@ -524,22 +559,9 @@ static int start_power_clamp(void) clamping = true; schedule_delayed_work(&poll_pkg_cstate_work, 0); - /* start one thread per online cpu */ + /* start one kthread worker per online cpu */ for_each_online_cpu(cpu) { - struct task_struct **p = - per_cpu_ptr(powerclamp_thread, cpu); - - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%ld", cpu); - /* bind to cpu here */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *p = thread; - } - + start_power_clamp_worker(cpu); } put_online_cpus(); @@ -549,71 +571,49 @@ static int start_power_clamp(void) static void end_power_clamp(void) { int i; - struct task_struct *thread; - clamping = false; /* - * make clamping visible to other cpus and give per cpu clamping threads - * sometime to exit, or gets killed later. + * Block requeuing in all the kthread workers. They will flush and + * stop faster.
*/ - smp_mb(); - msleep(20); + clamping = false; if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) { for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) { - pr_debug("clamping thread for cpu %d alive, kill\n", i); - thread = *per_cpu_ptr(powerclamp_thread, i); - kthread_stop(thread); + pr_debug("clamping worker for cpu %d alive, destroy\n", + i); + stop_power_clamp_worker(i); } } } -static int powerclamp_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int powerclamp_cpu_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; - struct task_struct *thread; - struct task_struct **percpu_thread = - per_cpu_ptr(powerclamp_thread, cpu); - - if (false == clamping) - goto exit_ok; - - switch (action) { - case CPU_ONLINE: - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%lu", cpu); - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *percpu_thread = thread; - } - /* prefer BSP as controlling CPU */ - if (cpu == 0) { - control_cpu = 0; - smp_mb(); - } - break; - case CPU_DEAD: - if (test_bit(cpu, cpu_clamping_mask)) { - pr_err("cpu %lu dead but powerclamping thread is not\n", - cpu); - kthread_stop(*percpu_thread); - } - if (cpu == control_cpu) { - control_cpu = smp_processor_id(); - smp_mb(); - } + if (clamping == false) + return 0; + start_power_clamp_worker(cpu); + /* prefer BSP as controlling CPU */ + if (cpu == 0) { + control_cpu = 0; + smp_mb(); } - -exit_ok: - return NOTIFY_OK; + return 0; } -static struct notifier_block powerclamp_cpu_notifier = { - .notifier_call = powerclamp_cpu_callback, -}; +static int powerclamp_cpu_predown(unsigned int cpu) +{ + if (clamping == false) + return 0; + + stop_power_clamp_worker(cpu); + if (cpu != control_cpu) + return 0; + + control_cpu = cpumask_first(cpu_online_mask); + if (control_cpu == cpu) + control_cpu = cpumask_next(cpu, cpu_online_mask); + smp_mb(); + return 0; +} static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) @@ -741,6 +741,8 @@ file_error: debugfs_remove_recursive(debug_dir); } +static enum cpuhp_state hp_state; + static int __init powerclamp_init(void) { int retval; @@ -758,10 +760,17 @@ static int __init powerclamp_init(void) /* set default limit, maybe adjusted during runtime based on feedback */ window_size = 2; - register_hotcpu_notifier(&powerclamp_cpu_notifier); + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "thermal/intel_powerclamp:online", + powerclamp_cpu_online, + powerclamp_cpu_predown); + if (retval < 0) + goto exit_free; + + hp_state = retval; - powerclamp_thread = alloc_percpu(struct task_struct *); - if (!powerclamp_thread) { + worker_data = alloc_percpu(struct powerclamp_worker_data); + if (!worker_data) { retval = -ENOMEM; goto exit_unregister; } @@ -781,9 +790,9 @@ static int __init powerclamp_init(void) return 0; exit_free_thread: - free_percpu(powerclamp_thread); + free_percpu(worker_data); exit_unregister: - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); + cpuhp_remove_state_nocalls(hp_state); exit_free: kfree(cpu_clamping_mask); return retval; @@ -792,9 +801,9 @@ module_init(powerclamp_init); static void __exit powerclamp_exit(void) { - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); end_power_clamp(); - free_percpu(powerclamp_thread); + cpuhp_remove_state_nocalls(hp_state); + free_percpu(worker_data); thermal_cooling_device_unregister(cooling_dev); kfree(cpu_clamping_mask);
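The intel_powerclamp conversion above replaces the open-coded per-CPU clamping threads (kthread_create_on_node() plus kthread_bind()) with dedicated kthread workers, turning idle injection into queued work items. A hedged sketch of that worker lifecycle, assuming the kthread worker API available since v4.9; every my_* name is illustrative, not taken from the patch:

#include <linux/err.h>
#include <linux/kthread.h>

static struct kthread_worker *my_worker;
static struct kthread_work my_work;

static void my_work_fn(struct kthread_work *work)
{
        /* runs in the worker's kthread, pinned to the chosen CPU */
}

static int my_start_worker(unsigned int cpu)
{
        /* creates and wakes a kthread already bound to @cpu */
        my_worker = kthread_create_worker_on_cpu(cpu, 0, "my_worker/%u", cpu);
        if (IS_ERR(my_worker))
                return PTR_ERR(my_worker);

        kthread_init_work(&my_work, my_work_fn);
        kthread_queue_work(my_worker, &my_work);
        return 0;
}

static void my_stop_worker(void)
{
        kthread_cancel_work_sync(&my_work);
        /* flushes anything still queued, then stops the kthread */
        kthread_destroy_worker(my_worker);
        my_worker = NULL;
}

Compared with a raw kthread loop, having work items requeue each other, as clamp_balancing_func() and clamp_idle_injection_func() do above, keeps scheduling policy and CPU affinity in one place: on the worker.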
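All three conversions in this diff (intel_idle, intel_rapl, intel_powerclamp) share one shape: the old CPU notifier is replaced by a dynamically allocated hotplug state, and intel_rapl additionally keeps a per-package cpumask so a lead CPU can be re-elected when the current one goes offline. A sketch of that combined pattern under the cpuhp API; everything prefixed my_ is illustrative:

#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
#include <linux/module.h>

static enum cpuhp_state my_hp_state;
static struct cpumask my_cpus;          /* CPUs we actively manage */
static unsigned int my_lead_cpu;        /* one CPU used for MSR access etc. */

static int my_online(unsigned int cpu)
{
        if (cpumask_empty(&my_cpus))
                my_lead_cpu = cpu;
        cpumask_set_cpu(cpu, &my_cpus);
        return 0;
}

static int my_prep_down(unsigned int cpu)
{
        cpumask_clear_cpu(cpu, &my_cpus);
        /* re-elect a lead CPU, mirroring rapl_cpu_down_prep() above */
        if (cpu == my_lead_cpu && !cpumask_empty(&my_cpus))
                my_lead_cpu = cpumask_first(&my_cpus);
        return 0;
}

static int __init my_init(void)
{
        int ret;

        /*
         * cpuhp_setup_state() invokes my_online() for every CPU that is
         * already up, so the old for_each_online_cpu() loop disappears.
         */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example/my:online",
                                my_online, my_prep_down);
        if (ret < 0)
                return ret;
        my_hp_state = ret;      /* dynamic states hand back the slot used */
        return 0;
}

static void __exit my_exit(void)
{
        cpuhp_remove_state(my_hp_state); /* runs my_prep_down() on online CPUs */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

Because cpuhp_setup_state() returns the allocated state number (or a negative errno) for CPUHP_AP_ONLINE_DYN, storing it is required for the later cpuhp_remove_state(); the *_nocalls() variants used by powerclamp behave the same but skip invoking the callbacks for already-online CPUs.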