From 22baebd489c78cf7d02030bf50b0c99e4fcb6748 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Jul 2017 00:42:54 +0200 Subject: [PATCH 1/3] cpufreq: intel_pstate: Drop ->get from intel_pstate structure The ->get callback in the intel_pstate structure was mostly there for the scaling_cur_freq sysfs attribute to work, but after commit f8475cef9008 (x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF) that attribute uses arch_freq_get_on_cpu() provided by the x86 arch code on all processors supported by intel_pstate, so it doesn't need the ->get callback from the driver any more. Moreover, the very presence of the ->get callback in the intel_pstate structure causes the cpuinfo_cur_freq attribute to be present when intel_pstate operates in the active mode, which is bogus, because the role of that attribute is to return the current CPU frequency as seen by the hardware. For intel_pstate, though, this is just an average frequency and not really current, but computed for the previous sampling interval (the actual current frequency may be way different at the point this value is obtained by reading from cpuinfo_cur_freq), and after commit 82b4e03e01bc (intel_pstate: skip scheduler hook when in "performance" mode) the value in cpuinfo_cur_freq may be stale or just 0, depending on the driver's operation mode. In fact, however, on the hardware supported by intel_pstate there is no way to read the current CPU frequency from it, so the cpuinfo_cur_freq attribute should not be present at all when this driver is in use. For this reason, drop intel_pstate_get() and clear the ->get callback pointer pointing to it, so that the cpuinfo_cur_freq is not present for intel_pstate in the active mode any more. Fixes: 82b4e03e01bc (intel_pstate: skip scheduler hook when in "performance" mode) Reported-by: Huaisheng Ye Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/intel_pstate.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 0ddc0d045831..bf6af77cd945 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1926,13 +1926,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum) return 0; } -static unsigned int intel_pstate_get(unsigned int cpu_num) -{ - struct cpudata *cpu = all_cpu_data[cpu_num]; - - return cpu ? get_avg_frequency(cpu) : 0; -} - static void intel_pstate_set_update_util_hook(unsigned int cpu_num) { struct cpudata *cpu = all_cpu_data[cpu_num]; @@ -2173,7 +2166,6 @@ static struct cpufreq_driver intel_pstate = { .setpolicy = intel_pstate_set_policy, .suspend = intel_pstate_hwp_save_state, .resume = intel_pstate_resume, - .get = intel_pstate_get, .init = intel_pstate_cpu_init, .exit = intel_pstate_cpu_exit, .stop_cpu = intel_pstate_stop_cpu, From c2e3af11a979688bc9b9229867688e89e343a465 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 27 Jul 2017 02:05:56 +0200 Subject: [PATCH 2/3] cpufreq: docs: Add missing cpuinfo_cur_freq description Add a description of the cpuinfo_cur_freq policy attribute in sysfs to the cpufreq documentation under Documentation/admin-guide/pm/ as it is missing after commit 2a0e49279850 (cpufreq: User/admin documentation update and consolidation) that overlooked it. Fixes: 2a0e49279850 (cpufreq: User/admin documentation update and consolidation) Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- Documentation/admin-guide/pm/cpufreq.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index 463cf7e73db8..7af83a92d2d6 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -237,6 +237,14 @@ are the following: This attribute is not present if the scaling driver in use does not support it. +``cpuinfo_cur_freq`` + Current frequency of the CPUs belonging to this policy as obtained from + the hardware (in KHz). + + This is expected to be the frequency the hardware actually runs at. + If that frequency cannot be determined, this attribute should not + be present. + ``cpuinfo_max_freq`` Maximum possible operating frequency the CPUs belonging to this policy can run at (in kHz). From 4815d3c56d1e10449a44089a47544d9ba84fad0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 28 Jul 2017 14:45:03 +0200 Subject: [PATCH 3/3] cpufreq: x86: Make scaling_cur_freq behave more as expected After commit f8475cef9008 "x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF" the scaling_cur_freq policy attribute in sysfs only behaves as expected on x86 with APERF/MPERF registers available when it is read from at least twice in a row. The value returned by the first read may not be meaningful, because the computations in there use cached values from the previous iteration of aperfmperf_snapshot_khz() which may be stale. To prevent that from happening, modify arch_freq_get_on_cpu() to call aperfmperf_snapshot_khz() twice, with a short delay between these calls, if the previous invocation of aperfmperf_snapshot_khz() was too far back in the past (specifically, more that 1s ago). Also, as pointed out by Doug Smythies, aperf_delta is limited now and the multiplication of it by cpu_khz won't overflow, so simplify the s->khz computations too. Fixes: f8475cef9008 "x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF" Reported-by: Doug Smythies Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/cpu/aperfmperf.c | 40 +++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index d869c8671e36..7cf7c70b6ef2 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -8,20 +8,25 @@ * This file is licensed under GPLv2. */ -#include +#include +#include #include #include #include struct aperfmperf_sample { unsigned int khz; - unsigned long jiffies; + ktime_t time; u64 aperf; u64 mperf; }; static DEFINE_PER_CPU(struct aperfmperf_sample, samples); +#define APERFMPERF_CACHE_THRESHOLD_MS 10 +#define APERFMPERF_REFRESH_DELAY_MS 20 +#define APERFMPERF_STALE_THRESHOLD_MS 1000 + /* * aperfmperf_snapshot_khz() * On the current CPU, snapshot APERF, MPERF, and jiffies @@ -33,9 +38,11 @@ static void aperfmperf_snapshot_khz(void *dummy) u64 aperf, aperf_delta; u64 mperf, mperf_delta; struct aperfmperf_sample *s = this_cpu_ptr(&samples); + ktime_t now = ktime_get(); + s64 time_delta = ktime_ms_delta(now, s->time); - /* Don't bother re-computing within 10 ms */ - if (time_before(jiffies, s->jiffies + HZ/100)) + /* Don't bother re-computing within the cache threshold time. */ + if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) return; rdmsrl(MSR_IA32_APERF, aperf); @@ -51,22 +58,21 @@ static void aperfmperf_snapshot_khz(void *dummy) if (mperf_delta == 0) return; - /* - * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then - * khz = (cpu_khz * aperf_delta) / mperf_delta - */ - if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta) - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); - else /* khz = aperf_delta / (mperf_delta / cpu_khz) */ - s->khz = div64_u64(aperf_delta, - div64_u64(mperf_delta, cpu_khz)); - s->jiffies = jiffies; + s->time = now; s->aperf = aperf; s->mperf = mperf; + + /* If the previous iteration was too long ago, discard it. */ + if (time_delta > APERFMPERF_STALE_THRESHOLD_MS) + s->khz = 0; + else + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); } unsigned int arch_freq_get_on_cpu(int cpu) { + unsigned int khz; + if (!cpu_khz) return 0; @@ -74,6 +80,12 @@ unsigned int arch_freq_get_on_cpu(int cpu) return 0; smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + khz = per_cpu(samples.khz, cpu); + if (khz) + return khz; + + msleep(APERFMPERF_REFRESH_DELAY_MS); + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); return per_cpu(samples.khz, cpu); }