From 2194324d8bbbad1b179c08b6095649b06abd62d5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 14 Feb 2014 01:14:13 -0500 Subject: [PATCH 1/7] ACPI idle: permit sparse C-state sub-state numbers Linux uses CPUID.MWAIT.EDX to validate the C-states reported by ACPI, silently discarding states which are not supported by the HW. This test is too restrictive, as some HW now uses sparse sub-state numbering, so the sub-state number may be higher than the number of sub-states... Also, rather than silently ignoring an invalid state, we should complain about a firmware bug. In practice... Bay Trail systems originally supported C6-no-shrink as MWAIT sub-state 0x58, and in CPUID.MWAIT.EDX 0x03000000 indicated that there were 3 MWAIT-C6 sub-states. So acpi_idle would discard that C-state because 8 >= 3. Upon discovering this issue, the ucode was updated so that C6-no-shrink was also exported as 0x51, and the BIOS was updated to match. However, systems shipped with 0x58, will never get a BIOS update, and this patch allows Linux to see C6-no-shrink on early Bay Trail. Signed-off-by: Len Brown --- arch/x86/kernel/acpi/cstate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index e69182fd01cf..4b28159e0421 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -87,7 +87,9 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; retval = 0; - if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) { + /* If the HW does not support any sub-states in this C-state */ + if (num_cstate_subtype == 0) { + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part); retval = -1; goto out; } From 24bfa950f114d5d770e482de73cf673a3b017f65 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 14 Feb 2014 00:50:34 -0500 Subject: [PATCH 2/7] intel_idle: allow sparse sub-state numbering, for Bay Trail Like acpi_idle, intel_idle compared sub-state numbers to the number of supported sub-states -- discarding sub-states numbers that were numbered >= the number of states. But some Bay Trail SOCs use sparse sub-state numbers, so we can't make such a comparison if we are going to access those states. So now we simply check that _some_ sub-states are supported for the given state, and assume that the sub-state number in our driver is valid. In practice, the driver is correct, and even if it were not, the hardware clips invalid sub-state requests to valid ones. No entries in the driver require this change, but Bay Trail will need it. Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 8e1939f564f4..057ffef37b42 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -584,7 +584,7 @@ static int __init intel_idle_cpuidle_driver_init(void) drv->state_count = 1; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { - int num_substates, mwait_hint, mwait_cstate, mwait_substate; + int num_substates, mwait_hint, mwait_cstate; if (cpuidle_state_table[cstate].enter == NULL) break; @@ -597,14 +597,13 @@ static int __init intel_idle_cpuidle_driver_init(void) mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint); - mwait_substate = MWAIT_HINT2SUBSTATE(mwait_hint); - /* does the state exist in CPUID.MWAIT? */ + /* number of sub-states for this state in CPUID.MWAIT */ num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4)) & MWAIT_SUBSTATE_MASK; - /* if sub-state in table is not enumerated by CPUID */ - if ((mwait_substate + 1) > num_substates) + /* if NO sub-states for this state in CPUID, skip it */ + if (num_substates == 0) continue; if (((mwait_cstate + 1) > 2) && From 718987d695adc991eb94501209fe5353136c8c16 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 14 Feb 2014 02:30:00 -0500 Subject: [PATCH 3/7] intel_idle: support Bay Trail Bay Trail (BYT) is a family of Silvermont-core Atom Processor SOCs, including the Intel Atom Processor Z36xxx and Z37xxx Series. Although it shares the Silvermont core with Avoton, BYT is optimized for mobile, and thus it supports different power saving CPU idle states. Note that not all versions of Bay Trail HW support all of the states listed in the driver. Signed-off-by: Len Brown Tested-by: Aubrey Li --- drivers/idle/intel_idle.c | 53 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 057ffef37b42..aeddfc46c831 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -196,6 +196,53 @@ static struct cpuidle_state snb_cstates[] = { .enter = NULL } }; +static struct cpuidle_state byt_cstates[] = { + { + .name = "C1-BYT", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle }, + { + .name = "C1E-BYT", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 15, + .target_residency = 30, + .enter = &intel_idle }, + { + .name = "C6N-BYT", + .desc = "MWAIT 0x58", + .flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, + .target_residency = 275, + .enter = &intel_idle }, + { + .name = "C6S-BYT", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 560, + .enter = &intel_idle }, + { + .name = "C7-BYT", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 1200, + .target_residency = 1500, + .enter = &intel_idle }, + { + .name = "C7S-BYT", + .desc = "MWAIT 0x64", + .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 20000, + .enter = &intel_idle }, + { + .enter = NULL } +}; + static struct cpuidle_state ivb_cstates[] = { { .name = "C1-IVB", @@ -464,6 +511,11 @@ static const struct idle_cpu idle_cpu_snb = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_byt = { + .state_table = byt_cstates, + .disable_promotion_to_c1e = true, +}; + static const struct idle_cpu idle_cpu_ivb = { .state_table = ivb_cstates, .disable_promotion_to_c1e = true, @@ -494,6 +546,7 @@ static const struct x86_cpu_id intel_idle_ids[] = { ICPU(0x2f, idle_cpu_nehalem), ICPU(0x2a, idle_cpu_snb), ICPU(0x2d, idle_cpu_snb), + ICPU(0x37, idle_cpu_byt), ICPU(0x3a, idle_cpu_ivb), ICPU(0x3e, idle_cpu_ivb), ICPU(0x3c, idle_cpu_hsw), From acead1b0fac5b10d0ae3f1cc5f7820b9f9f924f5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 25 Jan 2014 22:24:22 +0100 Subject: [PATCH 4/7] intel_idle: Add CPU model 54 (Atom N2000 series) Add CPU ID for Atom N2600/N2800 processors. Datasheets indicate support for this, detailed information about potential quirks or limitations are missing, though. So we just reuse the definition for the previous ATOM series. Tests on N2800 systems showed that this addition is fine an can reduce power consumption by about 0.25 W (personally confirmed on Intel DN2800MT). Signed-off-by: Jan Kiszka Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index aeddfc46c831..710ab54ab04b 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -546,6 +546,7 @@ static const struct x86_cpu_id intel_idle_ids[] = { ICPU(0x2f, idle_cpu_nehalem), ICPU(0x2a, idle_cpu_snb), ICPU(0x2d, idle_cpu_snb), + ICPU(0x36, idle_cpu_atom), ICPU(0x37, idle_cpu_byt), ICPU(0x3a, idle_cpu_ivb), ICPU(0x3e, idle_cpu_ivb), From fc04cc67ea8f44124f048832a745a24bc2fa12fa Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 6 Feb 2014 00:55:19 -0500 Subject: [PATCH 5/7] tools/power turbostat: simplify output, add Avg_MHz Use 8 columns for each number ouput. We don't fit into 80 columns on most machines, so keep the format simple. Print frequency in MHz instead of GHz. We've got 8 columns now, so use them to show low frequency in a more natural unit. Many users didn't understand what %c0 meant, so re-name it to be %Busy. Add Avg_MHz column, which is the frequency that many users expect to see -- the total number of cycles executed over the measurement interval. People found the previous GHz to be confusing, since it was the speed only over the non-idle interval. That measurement has been re-named Bzy_MHz. Suggested-by: Dirk J. Brandewie Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 127 ++++++-------- tools/power/x86/turbostat/turbostat.c | 230 ++++++++++++-------------- 2 files changed, 151 insertions(+), 206 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index b4ddb748356c..56bfb523c5bb 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -47,21 +47,22 @@ displays the statistics gathered since it was forked. .PP .SH FIELD DESCRIPTIONS .nf -\fBpk\fP processor package number. -\fBcor\fP processor core number. +\fBPackage\fP processor package number. +\fBCore\fP processor core number. \fBCPU\fP Linux CPU (logical processor) number. Note that multiple CPUs per core indicate support for Intel(R) Hyper-Threading Technology. -\fB%c0\fP percent of the interval that the CPU retired instructions. -\fBGHz\fP average clock rate while the CPU was in c0 state. -\fBTSC\fP average GHz that the TSC ran during the entire interval. -\fB%c1, %c3, %c6, %c7\fP show the percentage residency in hardware core idle states. -\fBCTMP\fP Degrees Celsius reported by the per-core Digital Thermal Sensor. -\fBPTMP\fP Degrees Celsius reported by the per-package Package Thermal Monitor. -\fB%pc2, %pc3, %pc6, %pc7\fP percentage residency in hardware package idle states. -\fBPkg_W\fP Watts consumed by the whole package. -\fBCor_W\fP Watts consumed by the core part of the package. -\fBGFX_W\fP Watts consumed by the Graphics part of the package -- available only on client processors. -\fBRAM_W\fP Watts consumed by the DRAM DIMMS -- available only on server processors. +\fBAVG_MHz\fP number of cycles executed divided by time elapsed. +\fB%Buzy\fP percent of the interval that the CPU retired instructions, aka. % of time in "C0" state. +\fBBzy_MHz\fP average clock rate while the CPU was busy (in "c0" state). +\fBTSC_MHz\fP average MHz that the TSC ran during the entire interval. +\fBCPU%c1, CPU%c3, CPU%c6, CPU%c7\fP show the percentage residency in hardware core idle states. +\fBCoreTmp\fP Degrees Celsius reported by the per-core Digital Thermal Sensor. +\fBPkgTtmp\fP Degrees Celsius reported by the per-package Package Thermal Monitor. +\fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states. +\fBPkgWatt\fP Watts consumed by the whole package. +\fBCorWatt\fP Watts consumed by the core part of the package. +\fBGFXWatt\fP Watts consumed by the Graphics part of the package -- available only on client processors. +\fBRAMWatt\fP Watts consumed by the DRAM DIMMS -- available only on server processors. \fBPKG_%\fP percent of the interval that RAPL throttling was active on the Package. \fBRAM_%\fP percent of the interval that RAPL throttling was active on DRAM. .fi @@ -78,29 +79,17 @@ For Watts columns, the summary is a system total. Subsequent rows show per-CPU statistics. .nf -[root@sandy]# ./turbostat -cor CPU %c0 GHz TSC %c1 %c3 %c6 %c7 CTMP PTMP %pc2 %pc3 %pc6 %pc7 Pkg_W Cor_W GFX_W - 0.06 0.80 2.29 0.11 0.00 0.00 99.83 47 40 0.26 0.01 0.44 98.78 3.49 0.12 0.14 - 0 0 0.07 0.80 2.29 0.07 0.00 0.00 99.86 40 40 0.26 0.01 0.44 98.78 3.49 0.12 0.14 - 0 4 0.03 0.80 2.29 0.12 - 1 1 0.04 0.80 2.29 0.25 0.01 0.00 99.71 40 - 1 5 0.16 0.80 2.29 0.13 - 2 2 0.05 0.80 2.29 0.06 0.01 0.00 99.88 40 - 2 6 0.03 0.80 2.29 0.08 - 3 3 0.05 0.80 2.29 0.08 0.00 0.00 99.87 47 - 3 7 0.04 0.84 2.29 0.09 -.fi -.SH SUMMARY EXAMPLE -The "-s" option prints the column headers just once, -and then the one line system summary for each sample interval. - -.nf -[root@wsm]# turbostat -S - %c0 GHz TSC %c1 %c3 %c6 CTMP %pc3 %pc6 - 1.40 2.81 3.38 10.78 43.47 44.35 42 13.67 2.09 - 1.34 2.90 3.38 11.48 58.96 28.23 41 19.89 0.15 - 1.55 2.72 3.38 26.73 37.66 34.07 42 2.53 2.80 - 1.37 2.83 3.38 16.95 60.05 21.63 42 5.76 0.20 +[root@ivy]# ./turbostat + Core CPU Avg_MHz %Busy Bzy_MHz TSC_MHz SMI CPU%c1 CPU%c3 CPU%c6 CPU%c7 CoreTmp PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt + - - 6 0.36 1596 3492 0 0.59 0.01 99.04 0.00 23 24 23.82 0.01 72.47 0.00 6.40 1.01 0.00 + 0 0 9 0.58 1596 3492 0 0.28 0.01 99.13 0.00 23 24 23.82 0.01 72.47 0.00 6.40 1.01 0.00 + 0 4 1 0.07 1596 3492 0 0.79 + 1 1 10 0.65 1596 3492 0 0.59 0.00 98.76 0.00 23 + 1 5 5 0.28 1596 3492 0 0.95 + 2 2 10 0.66 1596 3492 0 0.41 0.01 98.92 0.00 23 + 2 6 2 0.10 1597 3492 0 0.97 + 3 3 3 0.20 1596 3492 0 0.44 0.00 99.37 0.00 23 + 3 7 5 0.31 1596 3492 0 0.33 .fi .SH VERBOSE EXAMPLE The "-v" option adds verbosity to the output: @@ -154,55 +143,35 @@ eg. Here a cycle soaker is run on 1 CPU (see %c0) for a few seconds until ^C while the other CPUs are mostly idle: .nf -[root@x980 lenb]# ./turbostat cat /dev/zero > /dev/null +root@ivy: turbostat cat /dev/zero > /dev/null ^C -cor CPU %c0 GHz TSC %c1 %c3 %c6 %pc3 %pc6 - 8.86 3.61 3.38 15.06 31.19 44.89 0.00 0.00 - 0 0 1.46 3.22 3.38 16.84 29.48 52.22 0.00 0.00 - 0 6 0.21 3.06 3.38 18.09 - 1 2 0.53 3.33 3.38 2.80 46.40 50.27 - 1 8 0.89 3.47 3.38 2.44 - 2 4 1.36 3.43 3.38 9.04 23.71 65.89 - 2 10 0.18 2.86 3.38 10.22 - 8 1 0.04 2.87 3.38 99.96 0.01 0.00 - 8 7 99.72 3.63 3.38 0.27 - 9 3 0.31 3.21 3.38 7.64 56.55 35.50 - 9 9 0.08 2.95 3.38 7.88 - 10 5 1.42 3.43 3.38 2.14 30.99 65.44 - 10 11 0.16 2.88 3.38 3.40 + Core CPU Avg_MHz %Busy Bzy_MHz TSC_MHz SMI CPU%c1 CPU%c3 CPU%c6 CPU%c7 CoreTmp PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt + - - 496 12.75 3886 3492 0 13.16 0.04 74.04 0.00 36 36 0.00 0.00 0.00 0.00 23.15 17.65 0.00 + 0 0 22 0.57 3830 3492 0 0.83 0.02 98.59 0.00 27 36 0.00 0.00 0.00 0.00 23.15 17.65 0.00 + 0 4 9 0.24 3829 3492 0 1.15 + 1 1 4 0.09 3783 3492 0 99.91 0.00 0.00 0.00 36 + 1 5 3880 99.82 3888 3492 0 0.18 + 2 2 17 0.44 3813 3492 0 0.77 0.04 98.75 0.00 28 + 2 6 12 0.32 3823 3492 0 0.89 + 3 3 16 0.43 3844 3492 0 0.63 0.11 98.84 0.00 30 + 3 7 4 0.11 3827 3492 0 0.94 +30.372243 sec + .fi -Above the cycle soaker drives cpu7 up its 3.6 GHz turbo limit +Above the cycle soaker drives cpu5 up its 3.8 GHz turbo limit while the other processors are generally in various states of idle. -Note that cpu1 and cpu7 are HT siblings within core8. -As cpu7 is very busy, it prevents its sibling, cpu1, +Note that cpu1 and cpu5 are HT siblings within core1. +As cpu5 is very busy, it prevents its sibling, cpu1, from entering a c-state deeper than c1. -Note that turbostat reports average GHz of 3.63, while -the arithmetic average of the GHz column above is lower. -This is a weighted average, where the weight is %c0. ie. it is the total number of -un-halted cycles elapsed per time divided by the number of CPUs. -.SH SMI COUNTING EXAMPLE -On Intel Nehalem and newer processors, MSR 0x34 is a System Management Mode Interrupt (SMI) counter. -This counter is shown by default under the "SMI" column. -.nf -[root@x980 ~]# turbostat -cor CPU %c0 GHz TSC SMI %c1 %c3 %c6 CTMP %pc3 %pc6 - 0.11 1.91 3.38 0 1.84 0.26 97.79 29 0.82 83.87 - 0 0 0.40 1.63 3.38 0 10.27 0.12 89.20 20 0.82 83.88 - 0 6 0.06 1.63 3.38 0 10.61 - 1 2 0.37 2.63 3.38 0 0.02 0.10 99.51 22 - 1 8 0.01 1.62 3.38 0 0.39 - 2 4 0.07 1.62 3.38 0 0.04 0.07 99.82 23 - 2 10 0.02 1.62 3.38 0 0.09 - 8 1 0.23 1.64 3.38 0 0.10 1.07 98.60 24 - 8 7 0.02 1.64 3.38 0 0.31 - 9 3 0.03 1.62 3.38 0 0.03 0.05 99.89 29 - 9 9 0.02 1.62 3.38 0 0.05 - 10 5 0.07 1.62 3.38 0 0.08 0.12 99.73 27 - 10 11 0.03 1.62 3.38 0 0.13 -^C -.fi +Note that the Avg_MHz column reflects the total number of cycles executed +divided by the measurement interval. If the %Busy column is 100%, +then the processor was running at that speed the entire interval. +The Avg_MHz multiplied by the %Busy results in the Bzy_MHz -- +which is the average frequency while the processor was executing -- +not including any non-busy idle time. + .SH NOTES .B "turbostat " diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 77eb130168da..18dab6ecc132 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -56,7 +56,7 @@ unsigned int do_slm_cstates; unsigned int use_c1_residency_msr; unsigned int has_aperf; unsigned int has_epb; -unsigned int units = 1000000000; /* Ghz etc */ +unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int has_invariant_tsc; unsigned int do_nehalem_platform_info; @@ -264,88 +264,93 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) return 0; } +/* + * Example Format w/ field column widths: + * + * Package Core CPU Avg_MHz Bzy_MHz TSC_MHz SMI %Busy CPU_%c1 CPU_%c3 CPU_%c6 CPU_%c7 CoreTmp PkgTmp Pkg%pc2 Pkg%pc3 Pkg%pc6 Pkg%pc7 PkgWatt CorWatt GFXWatt + * 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 1234567 + */ + void print_header(void) { if (show_pkg) - outp += sprintf(outp, "pk"); - if (show_pkg) - outp += sprintf(outp, " "); + outp += sprintf(outp, "Package "); if (show_core) - outp += sprintf(outp, "cor"); + outp += sprintf(outp, " Core "); if (show_cpu) - outp += sprintf(outp, " CPU"); - if (show_pkg || show_core || show_cpu) - outp += sprintf(outp, " "); - if (do_nhm_cstates) - outp += sprintf(outp, " %%c0"); + outp += sprintf(outp, " CPU "); if (has_aperf) - outp += sprintf(outp, " GHz"); - outp += sprintf(outp, " TSC"); + outp += sprintf(outp, "Avg_MHz "); + if (do_nhm_cstates) + outp += sprintf(outp, " %%Busy "); + if (has_aperf) + outp += sprintf(outp, "Bzy_MHz "); + outp += sprintf(outp, "TSC_MHz "); if (do_smi) - outp += sprintf(outp, " SMI"); + outp += sprintf(outp, " SMI "); if (extra_delta_offset32) - outp += sprintf(outp, " count 0x%03X", extra_delta_offset32); + outp += sprintf(outp, " count 0x%03X ", extra_delta_offset32); if (extra_delta_offset64) - outp += sprintf(outp, " COUNT 0x%03X", extra_delta_offset64); + outp += sprintf(outp, " COUNT 0x%03X ", extra_delta_offset64); if (extra_msr_offset32) - outp += sprintf(outp, " MSR 0x%03X", extra_msr_offset32); + outp += sprintf(outp, " MSR 0x%03X ", extra_msr_offset32); if (extra_msr_offset64) - outp += sprintf(outp, " MSR 0x%03X", extra_msr_offset64); + outp += sprintf(outp, " MSR 0x%03X ", extra_msr_offset64); if (do_nhm_cstates) - outp += sprintf(outp, " %%c1"); + outp += sprintf(outp, " CPU%%c1 "); if (do_nhm_cstates && !do_slm_cstates) - outp += sprintf(outp, " %%c3"); + outp += sprintf(outp, " CPU%%c3 "); if (do_nhm_cstates) - outp += sprintf(outp, " %%c6"); + outp += sprintf(outp, " CPU%%c6 "); if (do_snb_cstates) - outp += sprintf(outp, " %%c7"); + outp += sprintf(outp, " CPU%%c7 "); if (do_dts) - outp += sprintf(outp, " CTMP"); + outp += sprintf(outp, "CoreTmp "); if (do_ptm) - outp += sprintf(outp, " PTMP"); + outp += sprintf(outp, " PkgTmp "); if (do_snb_cstates) - outp += sprintf(outp, " %%pc2"); + outp += sprintf(outp, "Pkg%%pc2 "); if (do_nhm_cstates && !do_slm_cstates) - outp += sprintf(outp, " %%pc3"); + outp += sprintf(outp, "Pkg%%pc3 "); if (do_nhm_cstates && !do_slm_cstates) - outp += sprintf(outp, " %%pc6"); + outp += sprintf(outp, "Pkg%%pc6 "); if (do_snb_cstates) - outp += sprintf(outp, " %%pc7"); + outp += sprintf(outp, "Pkg%%pc7 "); if (do_c8_c9_c10) { - outp += sprintf(outp, " %%pc8"); - outp += sprintf(outp, " %%pc9"); - outp += sprintf(outp, " %%pc10"); + outp += sprintf(outp, "Pkg%%pc8 "); + outp += sprintf(outp, "Pkg%%pc9 "); + outp += sprintf(outp, "Pk%%pc10 "); } if (do_rapl && !rapl_joules) { if (do_rapl & RAPL_PKG) - outp += sprintf(outp, " Pkg_W"); + outp += sprintf(outp, "PkgWatt "); if (do_rapl & RAPL_CORES) - outp += sprintf(outp, " Cor_W"); + outp += sprintf(outp, "CorWatt "); if (do_rapl & RAPL_GFX) - outp += sprintf(outp, " GFX_W"); + outp += sprintf(outp, "GFXWatt "); if (do_rapl & RAPL_DRAM) - outp += sprintf(outp, " RAM_W"); + outp += sprintf(outp, "RAMWatt "); if (do_rapl & RAPL_PKG_PERF_STATUS) - outp += sprintf(outp, " PKG_%%"); + outp += sprintf(outp, " PKG_%% "); if (do_rapl & RAPL_DRAM_PERF_STATUS) - outp += sprintf(outp, " RAM_%%"); + outp += sprintf(outp, " RAM_%% "); } else { if (do_rapl & RAPL_PKG) - outp += sprintf(outp, " Pkg_J"); + outp += sprintf(outp, " Pkg_J "); if (do_rapl & RAPL_CORES) - outp += sprintf(outp, " Cor_J"); + outp += sprintf(outp, " Cor_J "); if (do_rapl & RAPL_GFX) - outp += sprintf(outp, " GFX_J"); + outp += sprintf(outp, " GFX_J "); if (do_rapl & RAPL_DRAM) - outp += sprintf(outp, " RAM_W"); + outp += sprintf(outp, " RAM_W "); if (do_rapl & RAPL_PKG_PERF_STATUS) - outp += sprintf(outp, " PKG_%%"); + outp += sprintf(outp, " PKG_%% "); if (do_rapl & RAPL_DRAM_PERF_STATUS) - outp += sprintf(outp, " RAM_%%"); - outp += sprintf(outp, " time"); + outp += sprintf(outp, " RAM_%% "); + outp += sprintf(outp, " time "); } outp += sprintf(outp, "\n"); @@ -410,25 +415,12 @@ int dump_counters(struct thread_data *t, struct core_data *c, /* * column formatting convention & formats - * package: "pk" 2 columns %2d - * core: "cor" 3 columns %3d - * CPU: "CPU" 3 columns %3d - * Pkg_W: %6.2 - * Cor_W: %6.2 - * GFX_W: %5.2 - * RAM_W: %5.2 - * GHz: "GHz" 3 columns %3.2 - * TSC: "TSC" 3 columns %3.2 - * SMI: "SMI" 4 columns %4d - * percentage " %pc3" %6.2 - * Perf Status percentage: %5.2 - * "CTMP" 4 columns %4d */ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { double interval_float; - char *fmt5, *fmt6; + char *fmt8; /* if showing only 1st thread in core and this isn't one, bail out */ if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) @@ -443,65 +435,52 @@ int format_counters(struct thread_data *t, struct core_data *c, /* topo columns, print blanks on 1st (average) line */ if (t == &average.threads) { if (show_pkg) - outp += sprintf(outp, " "); - if (show_pkg && show_core) - outp += sprintf(outp, " "); + outp += sprintf(outp, " -"); if (show_core) - outp += sprintf(outp, " "); + outp += sprintf(outp, " -"); if (show_cpu) - outp += sprintf(outp, " " " "); + outp += sprintf(outp, " -"); } else { if (show_pkg) { if (p) - outp += sprintf(outp, "%2d", p->package_id); + outp += sprintf(outp, "%8d", p->package_id); else - outp += sprintf(outp, " "); + outp += sprintf(outp, " -"); } - if (show_pkg && show_core) - outp += sprintf(outp, " "); if (show_core) { if (c) - outp += sprintf(outp, "%3d", c->core_id); + outp += sprintf(outp, "%8d", c->core_id); else - outp += sprintf(outp, " "); + outp += sprintf(outp, " -"); } if (show_cpu) - outp += sprintf(outp, " %3d", t->cpu_id); + outp += sprintf(outp, "%8d", t->cpu_id); } + + /* AvgMHz */ + if (has_aperf) + outp += sprintf(outp, "%8.0f", + 1.0 / units * t->aperf / interval_float); + /* %c0 */ if (do_nhm_cstates) { - if (show_pkg || show_core || show_cpu) - outp += sprintf(outp, " "); if (!skip_c0) - outp += sprintf(outp, "%6.2f", 100.0 * t->mperf/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * t->mperf/t->tsc); else - outp += sprintf(outp, " ****"); + outp += sprintf(outp, "********"); } - /* GHz */ - if (has_aperf) { - if (!aperf_mperf_unstable) { - outp += sprintf(outp, " %3.2f", - 1.0 * t->tsc / units * t->aperf / - t->mperf / interval_float); - } else { - if (t->aperf > t->tsc || t->mperf > t->tsc) { - outp += sprintf(outp, " ***"); - } else { - outp += sprintf(outp, "%3.1f*", - 1.0 * t->tsc / - units * t->aperf / - t->mperf / interval_float); - } - } - } + /* BzyMHz */ + if (has_aperf) + outp += sprintf(outp, "%8.0f", + 1.0 * t->tsc / units * t->aperf / t->mperf / interval_float); /* TSC */ - outp += sprintf(outp, "%5.2f", 1.0 * t->tsc/units/interval_float); + outp += sprintf(outp, "%8.0f", 1.0 * t->tsc/units/interval_float); /* SMI */ if (do_smi) - outp += sprintf(outp, "%4d", t->smi_count); + outp += sprintf(outp, "%8d", t->smi_count); /* delta */ if (extra_delta_offset32) @@ -520,9 +499,9 @@ int format_counters(struct thread_data *t, struct core_data *c, if (do_nhm_cstates) { if (!skip_c1) - outp += sprintf(outp, " %6.2f", 100.0 * t->c1/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * t->c1/t->tsc); else - outp += sprintf(outp, " ****"); + outp += sprintf(outp, "********"); } /* print per-core data only for 1st thread in core */ @@ -530,79 +509,76 @@ int format_counters(struct thread_data *t, struct core_data *c, goto done; if (do_nhm_cstates && !do_slm_cstates) - outp += sprintf(outp, " %6.2f", 100.0 * c->c3/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * c->c3/t->tsc); if (do_nhm_cstates) - outp += sprintf(outp, " %6.2f", 100.0 * c->c6/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * c->c6/t->tsc); if (do_snb_cstates) - outp += sprintf(outp, " %6.2f", 100.0 * c->c7/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * c->c7/t->tsc); if (do_dts) - outp += sprintf(outp, " %4d", c->core_temp_c); + outp += sprintf(outp, "%8d", c->core_temp_c); /* print per-package data only for 1st core in package */ if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) goto done; if (do_ptm) - outp += sprintf(outp, " %4d", p->pkg_temp_c); + outp += sprintf(outp, "%8d", p->pkg_temp_c); if (do_snb_cstates) - outp += sprintf(outp, " %6.2f", 100.0 * p->pc2/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * p->pc2/t->tsc); if (do_nhm_cstates && !do_slm_cstates) - outp += sprintf(outp, " %6.2f", 100.0 * p->pc3/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * p->pc3/t->tsc); if (do_nhm_cstates && !do_slm_cstates) - outp += sprintf(outp, " %6.2f", 100.0 * p->pc6/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * p->pc6/t->tsc); if (do_snb_cstates) - outp += sprintf(outp, " %6.2f", 100.0 * p->pc7/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * p->pc7/t->tsc); if (do_c8_c9_c10) { - outp += sprintf(outp, " %6.2f", 100.0 * p->pc8/t->tsc); - outp += sprintf(outp, " %6.2f", 100.0 * p->pc9/t->tsc); - outp += sprintf(outp, " %6.2f", 100.0 * p->pc10/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * p->pc8/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * p->pc9/t->tsc); + outp += sprintf(outp, "%8.2f", 100.0 * p->pc10/t->tsc); } /* * If measurement interval exceeds minimum RAPL Joule Counter range, * indicate that results are suspect by printing "**" in fraction place. */ - if (interval_float < rapl_joule_counter_range) { - fmt5 = " %5.2f"; - fmt6 = " %6.2f"; - } else { - fmt5 = " %3.0f**"; - fmt6 = " %4.0f**"; - } + if (interval_float < rapl_joule_counter_range) + fmt8 = "%8.2f"; + else + fmt8 = " %6.0f**"; if (do_rapl && !rapl_joules) { if (do_rapl & RAPL_PKG) - outp += sprintf(outp, fmt6, p->energy_pkg * rapl_energy_units / interval_float); + outp += sprintf(outp, fmt8, p->energy_pkg * rapl_energy_units / interval_float); if (do_rapl & RAPL_CORES) - outp += sprintf(outp, fmt6, p->energy_cores * rapl_energy_units / interval_float); + outp += sprintf(outp, fmt8, p->energy_cores * rapl_energy_units / interval_float); if (do_rapl & RAPL_GFX) - outp += sprintf(outp, fmt5, p->energy_gfx * rapl_energy_units / interval_float); + outp += sprintf(outp, fmt8, p->energy_gfx * rapl_energy_units / interval_float); if (do_rapl & RAPL_DRAM) - outp += sprintf(outp, fmt5, p->energy_dram * rapl_energy_units / interval_float); + outp += sprintf(outp, fmt8, p->energy_dram * rapl_energy_units / interval_float); if (do_rapl & RAPL_PKG_PERF_STATUS) - outp += sprintf(outp, fmt5, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + outp += sprintf(outp, fmt8, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); if (do_rapl & RAPL_DRAM_PERF_STATUS) - outp += sprintf(outp, fmt5, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + outp += sprintf(outp, fmt8, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); } else { if (do_rapl & RAPL_PKG) - outp += sprintf(outp, fmt6, + outp += sprintf(outp, fmt8, p->energy_pkg * rapl_energy_units); if (do_rapl & RAPL_CORES) - outp += sprintf(outp, fmt6, + outp += sprintf(outp, fmt8, p->energy_cores * rapl_energy_units); if (do_rapl & RAPL_GFX) - outp += sprintf(outp, fmt5, + outp += sprintf(outp, fmt8, p->energy_gfx * rapl_energy_units); if (do_rapl & RAPL_DRAM) - outp += sprintf(outp, fmt5, + outp += sprintf(outp, fmt8, p->energy_dram * rapl_energy_units); if (do_rapl & RAPL_PKG_PERF_STATUS) - outp += sprintf(outp, fmt5, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + outp += sprintf(outp, fmt8, 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); if (do_rapl & RAPL_DRAM_PERF_STATUS) - outp += sprintf(outp, fmt5, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); - outp += sprintf(outp, fmt5, interval_float); + outp += sprintf(outp, fmt8, 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + outp += sprintf(outp, fmt8, interval_float); } done: @@ -2455,7 +2431,7 @@ int main(int argc, char **argv) cmdline(argc, argv); if (verbose) - fprintf(stderr, "turbostat v3.6 Dec 2, 2013" + fprintf(stderr, "turbostat v3.7 Feb 6, 2014" " - Len Brown \n"); turbostat_init(); From 4e8e863fed2e82278d29c6357de8251adb73acb9 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 27 Feb 2014 23:28:53 -0500 Subject: [PATCH 6/7] tools/power turbostat: Run on Broadwell Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 18dab6ecc132..7c9d8e71eb9e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1492,6 +1492,9 @@ int has_nehalem_turbo_ratio_limit(unsigned int family, unsigned int model) case 0x46: /* HSW */ case 0x37: /* BYT */ case 0x4D: /* AVN */ + case 0x3D: /* BDW */ + case 0x4F: /* BDX */ + case 0x56: /* BDX-DE */ return 1; case 0x2E: /* Nehalem-EX Xeon - Beckton */ case 0x2F: /* Westmere-EX Xeon - Eagleton */ @@ -1605,9 +1608,12 @@ void rapl_probe(unsigned int family, unsigned int model) case 0x3C: /* HSW */ case 0x45: /* HSW */ case 0x46: /* HSW */ + case 0x3D: /* BDW */ do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO; break; case 0x3F: /* HSX */ + case 0x4F: /* BDX */ + case 0x56: /* BDX-DE */ do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; break; case 0x2D: @@ -1851,6 +1857,9 @@ int is_snb(unsigned int family, unsigned int model) case 0x3F: /* HSW */ case 0x45: /* HSW */ case 0x46: /* HSW */ + case 0x3D: /* BDW */ + case 0x4F: /* BDX */ + case 0x56: /* BDX-DE */ return 1; } return 0; @@ -1862,7 +1871,8 @@ int has_c8_c9_c10(unsigned int family, unsigned int model) return 0; switch (model) { - case 0x45: + case 0x45: /* HSW */ + case 0x3D: /* BDW */ return 1; } return 0; From 0138d8f0755b5b28d0acdb0a758bcfcaf441fc58 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 4 Apr 2014 01:21:07 -0400 Subject: [PATCH 7/7] intel_idle: fine-tune IVT residency targets Ivy Town processors have slightly different properties than Ivy Bridge processors, particuarly as socket count grows. Here we add dedicated tables covering 1-2 socket, 3-4 socket, and > 4 socket IVT configurations. This reduces the frequency of deep transitions on those systems, which can impact throughput. Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 141 +++++++++++++++++++++++++++++++++++++- 1 file changed, 140 insertions(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 710ab54ab04b..9ddbf550bafa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -283,6 +283,105 @@ static struct cpuidle_state ivb_cstates[] = { .enter = NULL } }; +static struct cpuidle_state ivt_cstates[] = { + { + .name = "C1-IVT", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle }, + { + .name = "C1E-IVT", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 80, + .enter = &intel_idle }, + { + .name = "C3-IVT", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 156, + .enter = &intel_idle }, + { + .name = "C6-IVT", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 82, + .target_residency = 300, + .enter = &intel_idle }, + { + .enter = NULL } +}; + +static struct cpuidle_state ivt_cstates_4s[] = { + { + .name = "C1-IVT-4S", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle }, + { + .name = "C1E-IVT-4S", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 250, + .enter = &intel_idle }, + { + .name = "C3-IVT-4S", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 300, + .enter = &intel_idle }, + { + .name = "C6-IVT-4S", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 84, + .target_residency = 400, + .enter = &intel_idle }, + { + .enter = NULL } +}; + +static struct cpuidle_state ivt_cstates_8s[] = { + { + .name = "C1-IVT-8S", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle }, + { + .name = "C1E-IVT-8S", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 10, + .target_residency = 500, + .enter = &intel_idle }, + { + .name = "C3-IVT-8S", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 600, + .enter = &intel_idle }, + { + .name = "C6-IVT-8S", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 88, + .target_residency = 700, + .enter = &intel_idle }, + { + .enter = NULL } +}; + static struct cpuidle_state hsw_cstates[] = { { .name = "C1-HSW", @@ -521,6 +620,11 @@ static const struct idle_cpu idle_cpu_ivb = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_ivt = { + .state_table = ivt_cstates, + .disable_promotion_to_c1e = true, +}; + static const struct idle_cpu idle_cpu_hsw = { .state_table = hsw_cstates, .disable_promotion_to_c1e = true, @@ -549,7 +653,7 @@ static const struct x86_cpu_id intel_idle_ids[] = { ICPU(0x36, idle_cpu_atom), ICPU(0x37, idle_cpu_byt), ICPU(0x3a, idle_cpu_ivb), - ICPU(0x3e, idle_cpu_ivb), + ICPU(0x3e, idle_cpu_ivt), ICPU(0x3c, idle_cpu_hsw), ICPU(0x3f, idle_cpu_hsw), ICPU(0x45, idle_cpu_hsw), @@ -626,6 +730,39 @@ static void intel_idle_cpuidle_devices_uninit(void) free_percpu(intel_idle_cpuidle_devices); return; } + +/* + * intel_idle_state_table_update() + * + * Update the default state_table for this CPU-id + * + * Currently used to access tuned IVT multi-socket targets + * Assumption: num_sockets == (max_package_num + 1) + */ +void intel_idle_state_table_update(void) +{ + /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ + if (boot_cpu_data.x86_model == 0x3e) { /* IVT */ + int cpu, package_num, num_sockets = 1; + + for_each_online_cpu(cpu) { + package_num = topology_physical_package_id(cpu); + if (package_num + 1 > num_sockets) { + num_sockets = package_num + 1; + + if (num_sockets > 4) + cpuidle_state_table = ivt_cstates_8s; + return; + } + } + + if (num_sockets > 2) + cpuidle_state_table = ivt_cstates_4s; + /* else, 1 and 2 socket systems use default ivt_cstates */ + } + return; +} + /* * intel_idle_cpuidle_driver_init() * allocate, initialize cpuidle_states @@ -635,6 +772,8 @@ static int __init intel_idle_cpuidle_driver_init(void) int cstate; struct cpuidle_driver *drv = &intel_idle_driver; + intel_idle_state_table_update(); + drv->state_count = 1; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {