From 6441402b1f173fa38e561d3cee7c01c32e5281ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:46:37 +0200 Subject: [PATCH 1/8] clockevents: prevent cpu online to interfere with nohz Impact: rare hang which can be triggered on CPU online. tick_do_timer_cpu keeps track of the CPU which updates jiffies via do_timer. The value -1 is used to signal, that currently no CPU is doing this. There are two cases, where the variable can have this state: boot: necessary for systems where the boot cpu id can be != 0 nohz long idle sleep: When the CPU which did the jiffies update last goes into a long idle sleep it drops the update jiffies duty so another CPU which is not idle can pick it up and keep jiffies going. Using the same value for both situations is wrong, as the CPU online code can see the -1 state when the timer of the newly onlined CPU is setup. The setup for a newly onlined CPU goes through periodic mode and can pick up the do_timer duty without being aware of the nohz / highres mode of the already running system. Use two separate states and make them constants to avoid magic numbers confusion. Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 7 ++++--- kernel/time/tick-internal.h | 4 ++++ kernel/time/tick-sched.c | 8 ++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 019315ebf9de..b523d095decf 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = -1; +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; DEFINE_SPINLOCK(tick_device_lock); /* @@ -148,7 +148,7 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == -1) { + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); @@ -300,7 +300,8 @@ static void tick_shutdown(unsigned int *cpup) if (*cpup == tick_do_timer_cpu) { int cpu = first_cpu(cpu_online_map); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6e9db9734aa6..e18014fadf95 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ + +#define TICK_DO_TIMER_NONE -1 +#define TICK_DO_TIMER_BOOT -2 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a87b0468568b..31a14e8caac1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -221,7 +221,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) @@ -303,7 +303,7 @@ void tick_nohz_stop_sched_tick(int inidle) * invoked. */ if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->idle_sleeps++; @@ -468,7 +468,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; /* Check, if the jiffies need an update */ @@ -570,7 +570,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; #endif From 4faac97d44ac27bdbb010a9c3597401a8f89341f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:54:29 +0200 Subject: [PATCH 2/8] x86: prevent stale state of c1e_mask across CPU offline/online Impact: hang which happens across CPU offline/online on AMD C1E systems. When a CPU goes offline then the corresponding bit in the broadcast mask is cleared. For AMD C1E enabled CPUs we do not reenable the broadcast when the CPU comes online again as we do not clear the corresponding bit in the c1e_mask, which keeps track which CPUs have been switched to broadcast already. So on those !$@#& machines we never switch back to broadcasting after a CPU offline/online cycle. Clear the bit when the CPU plays dead. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 11 ++++++++--- arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 2 ++ include/asm-x86/idle.h | 2 ++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7fc4d5b0a6a0..2e2247117f6e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) return 1; } +static cpumask_t c1e_mask = CPU_MASK_NONE; +static int c1e_detected; + +void c1e_remove_cpu(int cpu) +{ + cpu_clear(cpu, c1e_mask); +} + /* * C1E aware idle routine. We check for C1E active in the interrupt * pending message MSR. If we detect C1E, then we handle it the same @@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) */ static void c1e_idle(void) { - static cpumask_t c1e_mask = CPU_MASK_NONE; - static int c1e_detected; - if (need_resched()) return; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 3b7a1ddcc0bc..4b3cfdf54216 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -88,6 +88,7 @@ static void cpu_exit_clear(void) cpu_clear(cpu, cpu_callin_map); numa_remove_cpu(cpu); + c1e_remove_cpu(cpu); } /* We don't actually take CPU down, just spin without interrupts. */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71553b664e2a..e12e0e4dd256 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,6 +93,8 @@ DECLARE_PER_CPU(int, cpu_state); static inline void play_dead(void) { idle_task_exit(); + c1e_remove_cpu(raw_smp_processor_id()); + mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; diff --git a/include/asm-x86/idle.h b/include/asm-x86/idle.h index d240e5b30a45..cbb649123612 100644 --- a/include/asm-x86/idle.h +++ b/include/asm-x86/idle.h @@ -10,4 +10,6 @@ void idle_notifier_register(struct notifier_block *n); void enter_idle(void); void exit_idle(void); +void c1e_remove_cpu(int cpu); + #endif From 49d670fb8dd62d3ed4e3ed2513538ea65b051aed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:56:01 +0200 Subject: [PATCH 3/8] clockevents: prevent stale tick_next_period for onlining CPUs Impact: possible hang on CPU onlining in timer one shot mode. The tick_next_period variable is only used during boot on nohz/highres enabled systems, but for CPU onlining it needs to be maintained when the per cpu clock events device operates in one shot mode. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 31a14e8caac1..39019b3f7621 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -75,6 +75,9 @@ static void tick_do_update_jiffies64(ktime_t now) incr * ticks); } do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&xtime_lock); } From 302745699c1b675b5d2a1af87271de10e4d96b6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:02:25 +0200 Subject: [PATCH 4/8] clockevents: check broadcast device not tick device Impact: Possible hang on CPU online observed on AMD C1E machines. The broadcast setup code looks at the mode of the tick device to determine whether it needs to be shut down or setup. This is wrong when the broadcast mode is set to one shot already. This can happen when a CPU is brought online as it goes through the periodic setup first. The problem went unnoticed as sane systems do not call into that code before the switch to one shot for the clock event device happens. The AMD C1E idle routine switches over immediately and thereby shuts down the just setup device before the first interrupt happens. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f1f3eee28113..e2b66d1c8ca5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -235,7 +235,7 @@ static void tick_do_broadcast_on_off(void *why) case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (bc->mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) @@ -245,7 +245,7 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (bc->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; From 27ce4cb4a0c7cf59b9a9952266883862f2e4c99f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:04:02 +0200 Subject: [PATCH 5/8] clockevents: prevent mode mismatch on cpu online Impact: timer hang on CPU online observed on AMD C1E systems When a CPU is brought online then the broadcast machinery can be in the one shot state already. Check this and setup the timer device of the new CPU in one shot mode so the broadcast code can pick up the next_event value correctly. Another AMD C1E oddity, as we switch to broadcast immediately and not after the full bring up via the ACPI cpu idle code. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 8 ++++++++ kernel/time/tick-common.c | 3 ++- kernel/time/tick-internal.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e2b66d1c8ca5..bd7034542399 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -575,4 +575,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b523d095decf..df12434b43ca 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if (!tick_device_is_functional(dev)) return; - if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); } else { unsigned long seq; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e18014fadf95..55c3f4be6077 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -35,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -43,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } # endif /* !BROADCAST */ #else /* !ONESHOT */ From a8d6829044901a67732904be5f1eacdf8539604f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:02:25 +0200 Subject: [PATCH 6/8] x86: prevent C-states hang on AMD C1E enabled machines Impact: System hang when AMD C1E machines switch into C2/C3 AMD C1E enabled systems do not work with normal ACPI C-states even if the BIOS is advertising them. Limit the C-states to C1 for the ACPI processor idle code. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 1 + include/asm-x86/acpi.h | 2 ++ include/asm-x86/cpufeature.h | 1 + 3 files changed, 4 insertions(+) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2e2247117f6e..d8c2a299bfe5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -272,6 +272,7 @@ static void c1e_idle(void) c1e_detected = 1; mark_tsc_unstable("TSC halt in C1E"); printk(KERN_INFO "System has C1E enabled\n"); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } diff --git a/include/asm-x86/acpi.h b/include/asm-x86/acpi.h index 635d764dc13e..35d1743b57ac 100644 --- a/include/asm-x86/acpi.h +++ b/include/asm-x86/acpi.h @@ -140,6 +140,8 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) boot_cpu_data.x86_model <= 0x05 && boot_cpu_data.x86_mask < 0x0A) return 1; + else if (boot_cpu_has(X86_FEATURE_AMDC1E)) + return 1; else return max_cstate; } diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 9489283a4bcf..cfcfb0a806ba 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -81,6 +81,7 @@ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */ #define X86_FEATURE_11AP (3*32+19) /* Bad local APIC aka 11AP */ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ From 09bfeea13cea843fb03eaa96b5d891fa0abdcc90 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 18 Sep 2008 21:12:10 +0200 Subject: [PATCH 7/8] x86: c1e_idle: don't mark TSC unstable if CPU has invariant TSC Impact: Functional TSC is marked unstable on AMD family 0x10 and 0x11 CPUs. This would be wrong because for those CPUs "invariant TSC" means: "The TSC counts at the same rate in all P-states, all C states, S0, or S1" (See "Processor BIOS and Kernel Developer's Guides" for those CPUs.) [ tglx: Changed C1E to AMD C1E in the printks to avoid confusion with Intel C1E ] Signed-off-by: Andreas Herrmann Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d8c2a299bfe5..876e91890777 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -270,8 +270,9 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - mark_tsc_unstable("TSC halt in C1E"); - printk(KERN_INFO "System has C1E enabled\n"); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + printk(KERN_INFO "System has AMD C1E enabled\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } From f8e256c687eb53850685747757c8d75e58756e15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Sep 2008 13:00:57 +0200 Subject: [PATCH 8/8] timers: fix build error in !oneshot case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/time/tick-common.c: In function ‘tick_setup_periodic’: kernel/time/tick-common.c:113: error: implicit declaration of function ‘tick_broadcast_oneshot_active’ Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 55c3f4be6077..469248782c23 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -74,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } +static inline int tick_broadcast_oneshot_active(void) { return 0; } #endif /* !TICK_ONESHOT */ /*