From 6e34e1f23d780978da65968327cbba6d7013a73f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 13 Jul 2017 15:03:51 -0700 Subject: [PATCH 1/2] cpufreq: intel_pstate: Correct the busy calculation for KNL The busy percent calculated for the Knights Landing (KNL) platform is 1024 times smaller than the correct busy value. This causes performance to get stuck at the lowest ratio. The scaling algorithm used for KNL is performance-based, but it still looks at the CPU load to set the scaled busy factor to 0 when the load is less than 1 percent. In this case, since the computed load is 1024x smaller than it should be, the scaled busy factor will always be 0, irrespective of CPU business. This needs a fix similar to the turbostat one in commit b2b34dfe4d9a (tools/power turbostat: KNL workaround for %Busy and Avg_MHz). For this reason, add one more callback to processor-specific callbacks to specify an MPERF multiplier represented by a number of bit positions to shift the value of that register to the left to copmensate for its rate difference with respect to the TSC. This shift value is used during CPU busy calculations. Fixes: ffb810563c (intel_pstate: Avoid getting stuck in high P-states when idle) Reported-and-tested-by: Artem Bityutskiy Signed-off-by: Srinivas Pandruvada Cc: 4.6+ # 4.6+ [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 2386d7036e90..0ddc0d045831 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -225,6 +225,9 @@ struct global_params { * @vid: Stores VID limits for this CPU * @pid: Stores PID parameters for this CPU * @last_sample_time: Last Sample time + * @aperf_mperf_shift: Number of clock cycles after aperf, merf is incremented + * This shift is a multiplier to mperf delta to + * calculate CPU busy. * @prev_aperf: Last APERF value read from APERF MSR * @prev_mperf: Last MPERF value read from MPERF MSR * @prev_tsc: Last timestamp counter (TSC) value @@ -259,6 +262,7 @@ struct cpudata { u64 last_update; u64 last_sample_time; + u64 aperf_mperf_shift; u64 prev_aperf; u64 prev_mperf; u64 prev_tsc; @@ -321,6 +325,7 @@ struct pstate_funcs { int (*get_min)(void); int (*get_turbo)(void); int (*get_scaling)(void); + int (*get_aperf_mperf_shift)(void); u64 (*get_val)(struct cpudata*, int pstate); void (*get_vid)(struct cpudata *); void (*update_util)(struct update_util_data *data, u64 time, @@ -1490,6 +1495,11 @@ static u64 core_get_val(struct cpudata *cpudata, int pstate) return val; } +static int knl_get_aperf_mperf_shift(void) +{ + return 10; +} + static int knl_get_turbo_pstate(void) { u64 value; @@ -1547,6 +1557,9 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + if (pstate_funcs.get_aperf_mperf_shift) + cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift(); + if (pstate_funcs.get_vid) pstate_funcs.get_vid(cpu); @@ -1620,7 +1633,8 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) int32_t busy_frac, boost; int target, avg_pstate; - busy_frac = div_fp(sample->mperf, sample->tsc); + busy_frac = div_fp(sample->mperf << cpu->aperf_mperf_shift, + sample->tsc); boost = cpu->iowait_boost; cpu->iowait_boost >>= 1; @@ -1679,7 +1693,8 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) sample_ratio = div_fp(pid_params.sample_rate_ns, duration_ns); perf_scaled = mul_fp(perf_scaled, sample_ratio); } else { - sample_ratio = div_fp(100 * cpu->sample.mperf, cpu->sample.tsc); + sample_ratio = div_fp(100 * (cpu->sample.mperf << cpu->aperf_mperf_shift), + cpu->sample.tsc); if (sample_ratio < int_tofp(1)) perf_scaled = 0; } @@ -1811,6 +1826,7 @@ static const struct pstate_funcs knl_funcs = { .get_max_physical = core_get_max_pstate_physical, .get_min = core_get_min_pstate, .get_turbo = knl_get_turbo_pstate, + .get_aperf_mperf_shift = knl_get_aperf_mperf_shift, .get_scaling = core_get_scaling, .get_val = core_get_val, .update_util = intel_pstate_update_util_pid, @@ -2407,6 +2423,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_val = funcs->get_val; pstate_funcs.get_vid = funcs->get_vid; pstate_funcs.update_util = funcs->update_util; + pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift; intel_pstate_use_acpi_profile(); } From 975e83cfb8dc16e7a2fdc58188c77c0c605876c2 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 14 Jul 2017 11:51:48 +0100 Subject: [PATCH 2/2] PM / Domains: defer dev_pm_domain_set() until genpd->attach_dev succeeds if present If the genpd->attach_dev or genpd->power_on fails, genpd_dev_pm_attach may return -EPROBE_DEFER initially. However genpd_alloc_dev_data sets the PM domain for the device unconditionally. When subsequent attempts are made to call genpd_dev_pm_attach, it may return -EEXISTS checking dev->pm_domain without re-attempting to call attach_dev or power_on. platform_drv_probe then attempts to call drv->probe as the return value -EEXIST != -EPROBE_DEFER, which may end up in a situation where the device is accessed without it's power domain switched on. Fixes: f104e1e5ef57 (PM / Domains: Re-order initialization of generic_pm_domain_data) Cc: 4.4+ # v4.4+ Signed-off-by: Sudeep Holla Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 2ac906288a6f..91ba82688f4c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1222,8 +1222,6 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev, spin_unlock_irq(&dev->power.lock); - dev_pm_domain_set(dev, &genpd->domain); - return gpd_data; err_free: @@ -1237,8 +1235,6 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev, static void genpd_free_dev_data(struct device *dev, struct generic_pm_domain_data *gpd_data) { - dev_pm_domain_set(dev, NULL); - spin_lock_irq(&dev->power.lock); dev->power.subsys_data->domain_data = NULL; @@ -1275,6 +1271,8 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, if (ret) goto out; + dev_pm_domain_set(dev, &genpd->domain); + genpd->device_count++; genpd->max_off_time_changed = true; @@ -1336,6 +1334,8 @@ static int genpd_remove_device(struct generic_pm_domain *genpd, if (genpd->detach_dev) genpd->detach_dev(genpd, dev); + dev_pm_domain_set(dev, NULL); + list_del_init(&pdd->list_node); genpd_unlock(genpd);