From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: [PATCH] perf_counter, x86: Make NMI lockups more robust We have a debug check that detects stuck NMIs and returns with the PMU disabled in the global ctrl MSR - but i managed to trigger a situation where this was not enough to deassert the NMI. So clear/reset the full PMU and keep the disable count balanced when exiting from here. This way the box produces a debug warning but stays up and is more debuggable. [ Impact: in case of PMU related bugs, recover more gracefully ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ece3813c7a3c..2eeaa99add1c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } +static void intel_pmu_reset(void) +{ + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + + local_irq_restore(flags); +} + + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -750,6 +774,8 @@ again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); perf_counter_print_debug(); + intel_pmu_reset(); + perf_enable(); return 1; }