diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index eacb2e0397ae..10b8cc1bda8d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1446,6 +1446,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. disable Do not enable intel_pstate as the default scaling driver for the supported processors + force + Enable intel_pstate on systems that prohibit it by default + in favor of acpi-cpufreq. Forcing the intel_pstate driver + instead of acpi-cpufreq may disable platform features, such + as thermal controls and power capping, that rely on ACPI + P-States information being indicated to OSPM and therefore + should be used with caution. This option does not work with + processors that aren't supported by the intel_pstate driver + or on platforms that use pcc-cpufreq instead of acpi-cpufreq. no_hwp Do not enable hardware P state control (HWP) if available. diff --git a/MAINTAINERS b/MAINTAINERS index c8927bc7748e..1c63cfecc6bf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4869,6 +4869,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git S: Supported F: drivers/idle/intel_idle.c +INTEL PSTATE DRIVER +M: Kristen Carlson Accardi +L: linux-pm@vger.kernel.org +S: Supported +F: drivers/cpufreq/intel_pstate.c + INTEL FRAMEBUFFER DRIVER (excluding 810 and 815) M: Maik Broemme L: linux-fbdev@vger.kernel.org diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 1bbef8e838e7..d24dd614a0bd 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -84,7 +84,11 @@ struct dev_pm_opp { * * This is an internal data structure maintaining the link to opps attached to * a device. This structure is not meant to be shared to users as it is - * meant for book keeping and private to OPP library + * meant for book keeping and private to OPP library. + * + * Because the opp structures can be used from both rcu and srcu readers, we + * need to wait for the grace period of both of them before freeing any + * resources. And so we have used kfree_rcu() from within call_srcu() handlers. */ struct device_opp { struct list_head node; @@ -382,12 +386,34 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor); +static struct device_opp *add_device_opp(struct device *dev) +{ + struct device_opp *dev_opp; + + /* + * Allocate a new device OPP table. In the infrequent case where a new + * device is needed to be added, we pay this penalty. + */ + dev_opp = kzalloc(sizeof(*dev_opp), GFP_KERNEL); + if (!dev_opp) + return NULL; + + dev_opp->dev = dev; + srcu_init_notifier_head(&dev_opp->srcu_head); + INIT_LIST_HEAD(&dev_opp->opp_list); + + /* Secure the device list modification */ + list_add_rcu(&dev_opp->node, &dev_opp_list); + return dev_opp; +} + static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, unsigned long u_volt, bool dynamic) { struct device_opp *dev_opp = NULL; struct dev_pm_opp *opp, *new_opp; struct list_head *head; + int ret; /* allocate new OPP node */ new_opp = kzalloc(sizeof(*new_opp), GFP_KERNEL); @@ -408,27 +434,12 @@ static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, /* Check for existing list for 'dev' */ dev_opp = find_device_opp(dev); if (IS_ERR(dev_opp)) { - /* - * Allocate a new device OPP table. In the infrequent case - * where a new device is needed to be added, we pay this - * penalty. - */ - dev_opp = kzalloc(sizeof(struct device_opp), GFP_KERNEL); + dev_opp = add_device_opp(dev); if (!dev_opp) { - mutex_unlock(&dev_opp_list_lock); - kfree(new_opp); - dev_warn(dev, - "%s: Unable to create device OPP structure\n", - __func__); - return -ENOMEM; + ret = -ENOMEM; + goto free_opp; } - dev_opp->dev = dev; - srcu_init_notifier_head(&dev_opp->srcu_head); - INIT_LIST_HEAD(&dev_opp->opp_list); - - /* Secure the device list modification */ - list_add_rcu(&dev_opp->node, &dev_opp_list); head = &dev_opp->opp_list; goto list_add; } @@ -447,15 +458,13 @@ static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, /* Duplicate OPPs ? */ if (new_opp->rate == opp->rate) { - int ret = opp->available && new_opp->u_volt == opp->u_volt ? + ret = opp->available && new_opp->u_volt == opp->u_volt ? 0 : -EEXIST; dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n", __func__, opp->rate, opp->u_volt, opp->available, new_opp->rate, new_opp->u_volt, new_opp->available); - mutex_unlock(&dev_opp_list_lock); - kfree(new_opp); - return ret; + goto free_opp; } list_add: @@ -469,6 +478,11 @@ list_add: */ srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp); return 0; + +free_opp: + mutex_unlock(&dev_opp_list_lock); + kfree(new_opp); + return ret; } /** @@ -511,10 +525,11 @@ static void kfree_device_rcu(struct rcu_head *head) { struct device_opp *device_opp = container_of(head, struct device_opp, rcu_head); - kfree(device_opp); + kfree_rcu(device_opp, rcu_head); } -void __dev_pm_opp_remove(struct device_opp *dev_opp, struct dev_pm_opp *opp) +static void __dev_pm_opp_remove(struct device_opp *dev_opp, + struct dev_pm_opp *opp) { /* * Notify the changes in the availability of the operable @@ -592,7 +607,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_remove); static int opp_set_availability(struct device *dev, unsigned long freq, bool availability_req) { - struct device_opp *tmp_dev_opp, *dev_opp = ERR_PTR(-ENODEV); + struct device_opp *dev_opp; struct dev_pm_opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV); int r = 0; @@ -606,12 +621,7 @@ static int opp_set_availability(struct device *dev, unsigned long freq, mutex_lock(&dev_opp_list_lock); /* Find the device_opp */ - list_for_each_entry(tmp_dev_opp, &dev_opp_list, node) { - if (dev == tmp_dev_opp->dev) { - dev_opp = tmp_dev_opp; - break; - } - } + dev_opp = find_device_opp(dev); if (IS_ERR(dev_opp)) { r = PTR_ERR(dev_opp); dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 1405b393c93d..742eefba12c2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -199,7 +199,14 @@ static signed int pid_calc(struct _pid *pid, int32_t busy) pid->integral += fp_error; - /* limit the integral term */ + /* + * We limit the integral here so that it will never + * get higher than 30. This prevents it from becoming + * too large an input over long periods of time and allows + * it to get factored out sooner. + * + * The value of 30 was chosen through experimentation. + */ integral_limit = int_tofp(30); if (pid->integral > integral_limit) pid->integral = integral_limit; @@ -616,6 +623,11 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) if (limits.no_turbo || limits.turbo_disabled) max_perf = cpu->pstate.max_pstate; + /* + * performance can be limited by user through sysfs, by cpufreq + * policy, or by cpu specific default values determined through + * experimentation. + */ max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf)); *max = clamp_t(int, max_perf_adj, cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); @@ -717,11 +729,29 @@ static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu) u32 duration_us; u32 sample_time; + /* + * core_busy is the ratio of actual performance to max + * max_pstate is the max non turbo pstate available + * current_pstate was the pstate that was requested during + * the last sample period. + * + * We normalize core_busy, which was our actual percent + * performance to what we requested during the last sample + * period. The result will be a percentage of busy at a + * specified pstate. + */ core_busy = cpu->sample.core_pct_busy; max_pstate = int_tofp(cpu->pstate.max_pstate); current_pstate = int_tofp(cpu->pstate.current_pstate); core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); + /* + * Since we have a deferred timer, it will not fire unless + * we are in C0. So, determine if the actual elapsed time + * is significantly greater (3x) than our sample interval. If it + * is, then we were idle for a long enough period of time + * to adjust our busyness. + */ sample_time = pid_params.sample_rate_ms * USEC_PER_MSEC; duration_us = (u32) ktime_us_delta(cpu->sample.time, cpu->last_sample_time); @@ -948,6 +978,7 @@ static struct cpufreq_driver intel_pstate_driver = { static int __initdata no_load; static int __initdata no_hwp; +static unsigned int force_load; static int intel_pstate_msrs_not_valid(void) { @@ -1094,7 +1125,8 @@ static bool intel_pstate_platform_pwr_mgmt_exists(void) case PSS: return intel_pstate_no_acpi_pss(); case PPC: - return intel_pstate_has_acpi_ppc(); + return intel_pstate_has_acpi_ppc() && + (!force_load); } } @@ -1175,6 +1207,8 @@ static int __init intel_pstate_setup(char *str) no_load = 1; if (!strcmp(str, "no_hwp")) no_hwp = 1; + if (!strcmp(str, "force")) + force_load = 1; return 0; } early_param("intel_pstate", intel_pstate_setup); diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c index 458d69b444ad..75e66de7e7a7 100644 --- a/tools/power/cpupower/utils/cpuidle-info.c +++ b/tools/power/cpupower/utils/cpuidle-info.c @@ -22,13 +22,13 @@ static void cpuidle_cpu_output(unsigned int cpu, int verbose) { - int idlestates, idlestate; + unsigned int idlestates, idlestate; char *tmp; printf(_ ("Analyzing CPU %d:\n"), cpu); idlestates = sysfs_get_idlestate_count(cpu); - if (idlestates < 1) { + if (idlestates == 0) { printf(_("CPU %u: No idle states\n"), cpu); return; } @@ -100,10 +100,10 @@ static void cpuidle_general_output(void) static void proc_cpuidle_cpu_output(unsigned int cpu) { long max_allowed_cstate = 2000000000; - int cstate, cstates; + unsigned int cstate, cstates; cstates = sysfs_get_idlestate_count(cpu); - if (cstates < 1) { + if (cstates == 0) { printf(_("CPU %u: No C-states info\n"), cpu); return; }