From be1fcde604e429691771ce70230668af8097e29b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 May 2019 12:56:07 +0200 Subject: [PATCH 1/3] x86: intel_epb: Do not build when CONFIG_PM is unset Commit 9ed0985332a6 ("x86: intel_epb: Take CONFIG_PM into account") prevented the majority of the Performance and Energy Bias Hint (EPB) handling code from being built when CONFIG_PM is unset to fix a regression introduced by commit b9c273babce7 ("PM / arch: x86: MSR_IA32_ENERGY_PERF_BIAS sysfs interface"). In hindsight, however, it would be better to skip all of the EPB handling code for CONFIG_PM unset as there really is no reason for it to be there in that case. Namely, if the EPB is not touched by the kernel at all with CONFIG_PM unset, there is no need to worry about modifying the EPB inadvertently on CPU online and since the system will not suspend or hibernate then, there is no need to worry about possible modifications of the EPB by the platform firmware during system-wide PM transitions. For this reason, revert the changes made by commit 9ed0985332a6 and only allow intel_epb.o to be built when CONFIG_PM is set. Note that this changes the behavior of the kernels built with CONFIG_PM unset as they will not modify the EPB on boot if it is zero initially any more, so it is not a fix strictly speaking, but users building their kernels with CONFIG_PM unset really should not expect them to take energy efficiency into account. Moreover, if CONFIG_PM is unset for performance reasons, leaving EPB as set initially by the platform firmware will actually be consistent with the user's expectations. Signed-off-by: Rafael J. Wysocki Acked-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 5 ++++- arch/x86/kernel/cpu/intel_epb.c | 22 +--------------------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1796d2bdcaaa..5102bf7c8192 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,10 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o intel_epb.o +ifdef CONFIG_CPU_SUP_INTEL +obj-y += intel.o intel_pconfig.o +obj-$(CONFIG_PM) += intel_epb.o +endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index ebb14a26f117..f4dd73396f28 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -97,7 +97,6 @@ static void intel_epb_restore(void) wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); } -#ifdef CONFIG_PM static struct syscore_ops intel_epb_syscore_ops = { .suspend = intel_epb_save, .resume = intel_epb_restore, @@ -194,25 +193,6 @@ static int intel_epb_offline(unsigned int cpu) return 0; } -static inline void register_intel_ebp_syscore_ops(void) -{ - register_syscore_ops(&intel_epb_syscore_ops); -} -#else /* !CONFIG_PM */ -static int intel_epb_online(unsigned int cpu) -{ - intel_epb_restore(); - return 0; -} - -static int intel_epb_offline(unsigned int cpu) -{ - return intel_epb_save(); -} - -static inline void register_intel_ebp_syscore_ops(void) {} -#endif - static __init int intel_epb_init(void) { int ret; @@ -226,7 +206,7 @@ static __init int intel_epb_init(void) if (ret < 0) goto err_out_online; - register_intel_ebp_syscore_ops(); + register_syscore_ops(&intel_epb_syscore_ops); return 0; err_out_online: From a61373476127edac8bcc5ee9d68a74dc1b864f53 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 May 2019 12:45:18 +0200 Subject: [PATCH 2/3] PM: sleep: Add kerneldoc comments to some functions Add kerneldoc comments to pm_suspend_via_firmware(), pm_resume_via_firmware() and pm_suspend_via_s2idle() to explain what they do. Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 31 +++++++++++++++++++++++++++++++ kernel/power/suspend.c | 6 ++++++ 2 files changed, 37 insertions(+) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 4a2ffd678887..8594001e8be8 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -227,11 +227,42 @@ static inline void pm_set_resume_via_firmware(void) pm_suspend_global_flags |= PM_SUSPEND_FLAG_FW_RESUME; } +/** + * pm_suspend_via_firmware - Check if platform firmware will suspend the system. + * + * To be called during system-wide power management transitions to sleep states + * or during the subsequent system-wide transitions back to the working state. + * + * Return 'true' if the platform firmware is going to be invoked at the end of + * the system-wide power management transition (to a sleep state) in progress in + * order to complete it, or if the platform firmware has been invoked in order + * to complete the last (or preceding) transition of the system to a sleep + * state. + * + * This matters if the caller needs or wants to carry out some special actions + * depending on whether or not control will be passed to the platform firmware + * subsequently (for example, the device may need to be reset before letting the + * platform firmware manipulate it, which is not necessary when the platform + * firmware is not going to be invoked) or when such special actions may have + * been carried out during the preceding transition of the system to a sleep + * state (as they may need to be taken into account). + */ static inline bool pm_suspend_via_firmware(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_SUSPEND); } +/** + * pm_resume_via_firmware - Check if platform firmware has woken up the system. + * + * To be called during system-wide power management transitions from sleep + * states. + * + * Return 'true' if the platform firmware has passed control to the kernel at + * the beginning of the system-wide power management transition in progress, so + * the event that woke up the system from sleep has been handled by the platform + * firmware. + */ static inline bool pm_resume_via_firmware(void) { return !!(pm_suspend_global_flags & PM_SUSPEND_FLAG_FW_RESUME); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ef908c134b34..43d869db6c07 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -62,6 +62,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); +/** + * pm_suspend_via_s2idle - Check if suspend-to-idle is the default suspend. + * + * Return 'true' if suspend-to-idle has been selected as the default system + * suspend method. + */ bool pm_suspend_via_s2idle(void) { return mem_sleep_current == PM_SUSPEND_TO_IDLE; From ec527c318036a65a083ef68d8ba95789d2212246 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 30 May 2019 00:09:39 +0200 Subject: [PATCH 3/3] x86/power: Fix 'nosmt' vs hibernation triple fault during resume As explained in 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") we always, no matter what, have to bring up x86 HT siblings during boot at least once in order to avoid first MCE bringing the system to its knees. That means that whenever 'nosmt' is supplied on the kernel command-line, all the HT siblings are as a result sitting in mwait or cpudile after going through the online-offline cycle at least once. This causes a serious issue though when a kernel, which saw 'nosmt' on its commandline, is going to perform resume from hibernation: if the resume from the hibernated image is successful, cr3 is flipped in order to point to the address space of the kernel that is being resumed, which in turn means that all the HT siblings are all of a sudden mwaiting on address which is no longer valid. That results in triple fault shortly after cr3 is switched, and machine reboots. Fix this by always waking up all the SMT siblings before initiating the 'restore from hibernation' process; this guarantees that all the HT siblings will be properly carried over to the resumed kernel waiting in resume_play_dead(), and acted upon accordingly afterwards, based on the target kernel configuration. Symmetricaly, the resumed kernel has to push the SMT siblings to mwait again in case it has SMT disabled; this means it has to online all the siblings when resuming (so that they come out of hlt) and offline them again to let them reach mwait. Cc: 4.19+ # v4.19+ Debugged-by: Thomas Gleixner Fixes: 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") Signed-off-by: Jiri Kosina Acked-by: Pavel Machek Reviewed-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Signed-off-by: Rafael J. Wysocki --- arch/x86/power/cpu.c | 10 ++++++++++ arch/x86/power/hibernate.c | 33 +++++++++++++++++++++++++++++++++ include/linux/cpu.h | 4 ++++ kernel/cpu.c | 4 ++-- kernel/power/hibernate.c | 9 +++++++++ 5 files changed, 58 insertions(+), 2 deletions(-) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index a7d966964c6f..513ce09e9950 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -299,7 +299,17 @@ int hibernate_resume_nonboot_cpu_disable(void) * address in its instruction pointer may not be possible to resolve * any more at that point (the page tables used by it previously may * have been overwritten by hibernate image data). + * + * First, make sure that we wake up all the potentially disabled SMT + * threads which have been initially brought up and then put into + * mwait/cpuidle sleep. + * Those will be put to proper (not interfering with hibernation + * resume) sleep afterwards, and the resumed kernel will decide itself + * what to do with them. */ + ret = cpuhp_smt_enable(); + if (ret) + return ret; smp_ops.play_dead = resume_play_dead; ret = disable_nonboot_cpus(); smp_ops.play_dead = play_dead; diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index 4845b8c7be7f..fc413717a45f 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -245,3 +246,35 @@ out: __flush_tlb_all(); return 0; } + +int arch_resume_nosmt(void) +{ + int ret = 0; + /* + * We reached this while coming out of hibernation. This means + * that SMT siblings are sleeping in hlt, as mwait is not safe + * against control transition during resume (see comment in + * hibernate_resume_nonboot_cpu_disable()). + * + * If the resumed kernel has SMT disabled, we have to take all the + * SMT siblings out of hlt, and offline them again so that they + * end up in mwait proper. + * + * Called with hotplug disabled. + */ + cpu_hotplug_enable(); + if (cpu_smt_control == CPU_SMT_DISABLED || + cpu_smt_control == CPU_SMT_FORCE_DISABLED) { + enum cpuhp_smt_control old = cpu_smt_control; + + ret = cpuhp_smt_enable(); + if (ret) + goto out; + ret = cpuhp_smt_disable(old); + if (ret) + goto out; + } +out: + cpu_hotplug_disable(); + return ret; +} diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 3813fe45effd..fcb1386bb0d4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -201,10 +201,14 @@ enum cpuhp_smt_control { extern enum cpuhp_smt_control cpu_smt_control; extern void cpu_smt_disable(bool force); extern void cpu_smt_check_topology(void); +extern int cpuhp_smt_enable(void); +extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); #else # define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) static inline void cpu_smt_disable(bool force) { } static inline void cpu_smt_check_topology(void) { } +static inline int cpuhp_smt_enable(void) { return 0; } +static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } #endif /* diff --git a/kernel/cpu.c b/kernel/cpu.c index f2ef10460698..077fde6fb953 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2061,7 +2061,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2093,7 +2093,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -static int cpuhp_smt_enable(void) +int cpuhp_smt_enable(void) { int cpu, ret = 0; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c8c272df7154..b65635753e8e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -257,6 +257,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, (kps % 1000) / 10); } +__weak int arch_resume_nosmt(void) +{ + return 0; +} + /** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -324,6 +329,10 @@ static int create_image(int platform_mode) Enable_cpus: suspend_enable_secondary_cpus(); + /* Allow architectures to do nosmt-specific post-resume dances */ + if (!in_suspend) + error = arch_resume_nosmt(); + Platform_finish: platform_finish(platform_mode);