rcu: add support for dynamic ticks and preempt rcu

The PREEMPT-RCU can get stuck if a CPU goes idle and NO_HZ is set. The
idle CPU will not progress the RCU through its grace period and a
synchronize_rcu my get stuck. Without this patch I have a box that will
not boot when PREEMPT_RCU and NO_HZ are set. That same box boots fine
with this patch.

This patch comes from the -rt kernel where it has been tested for
several months.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Steven Rostedt 2008-02-29 18:46:50 +01:00 committed by Ingo Molnar
parent c0f4133b8f
commit 2232c2d8e0
6 changed files with 259 additions and 4 deletions

View File

@ -109,6 +109,14 @@ static inline void account_system_vtime(struct task_struct *tsk)
}
#endif
#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
extern void rcu_irq_enter(void);
extern void rcu_irq_exit(void);
#else
# define rcu_irq_enter() do { } while (0)
# define rcu_irq_exit() do { } while (0)
#endif /* CONFIG_PREEMPT_RCU */
/*
* It is safe to do non-atomic ops on ->hardirq_context,
* because NMI handlers may not preempt and the ops are
@ -117,6 +125,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
*/
#define __irq_enter() \
do { \
rcu_irq_enter(); \
account_system_vtime(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
@ -135,6 +144,7 @@ extern void irq_enter(void);
trace_hardirq_exit(); \
account_system_vtime(current); \
sub_preempt_count(HARDIRQ_OFFSET); \
rcu_irq_exit(); \
} while (0)
/*

View File

@ -160,5 +160,8 @@ extern void rcu_restart_cpu(int cpu);
extern long rcu_batches_completed(void);
extern long rcu_batches_completed_bh(void);
#define rcu_enter_nohz() do { } while (0)
#define rcu_exit_nohz() do { } while (0)
#endif /* __KERNEL__ */
#endif /* __LINUX_RCUCLASSIC_H */

View File

@ -82,5 +82,27 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
struct softirq_action;
#ifdef CONFIG_NO_HZ
DECLARE_PER_CPU(long, dynticks_progress_counter);
static inline void rcu_enter_nohz(void)
{
__get_cpu_var(dynticks_progress_counter)++;
WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
mb();
}
static inline void rcu_exit_nohz(void)
{
mb();
__get_cpu_var(dynticks_progress_counter)++;
WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
}
#else /* CONFIG_NO_HZ */
#define rcu_enter_nohz() do { } while (0)
#define rcu_exit_nohz() do { } while (0)
#endif /* CONFIG_NO_HZ */
#endif /* __KERNEL__ */
#endif /* __LINUX_RCUPREEMPT_H */

View File

@ -23,6 +23,10 @@
* to Suparna Bhattacharya for pushing me completely away
* from atomic instructions on the read side.
*
* - Added handling of Dynamic Ticks
* Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
* - Steven Rostedt <srostedt@redhat.com>
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* Design Document: http://lwn.net/Articles/253651/
@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
}
}
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
static DEFINE_PER_CPU(int, rcu_update_flag);
/**
* rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
*
* If the CPU was idle with dynamic ticks active, this updates the
* dynticks_progress_counter to let the RCU handling know that the
* CPU is active.
*/
void rcu_irq_enter(void)
{
int cpu = smp_processor_id();
if (per_cpu(rcu_update_flag, cpu))
per_cpu(rcu_update_flag, cpu)++;
/*
* Only update if we are coming from a stopped ticks mode
* (dynticks_progress_counter is even).
*/
if (!in_interrupt() &&
(per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
/*
* The following might seem like we could have a race
* with NMI/SMIs. But this really isn't a problem.
* Here we do a read/modify/write, and the race happens
* when an NMI/SMI comes in after the read and before
* the write. But NMI/SMIs will increment this counter
* twice before returning, so the zero bit will not
* be corrupted by the NMI/SMI which is the most important
* part.
*
* The only thing is that we would bring back the counter
* to a postion that it was in during the NMI/SMI.
* But the zero bit would be set, so the rest of the
* counter would again be ignored.
*
* On return from the IRQ, the counter may have the zero
* bit be 0 and the counter the same as the return from
* the NMI/SMI. If the state machine was so unlucky to
* see that, it still doesn't matter, since all
* RCU read-side critical sections on this CPU would
* have already completed.
*/
per_cpu(dynticks_progress_counter, cpu)++;
/*
* The following memory barrier ensures that any
* rcu_read_lock() primitives in the irq handler
* are seen by other CPUs to follow the above
* increment to dynticks_progress_counter. This is
* required in order for other CPUs to correctly
* determine when it is safe to advance the RCU
* grace-period state machine.
*/
smp_mb(); /* see above block comment. */
/*
* Since we can't determine the dynamic tick mode from
* the dynticks_progress_counter after this routine,
* we use a second flag to acknowledge that we came
* from an idle state with ticks stopped.
*/
per_cpu(rcu_update_flag, cpu)++;
/*
* If we take an NMI/SMI now, they will also increment
* the rcu_update_flag, and will not update the
* dynticks_progress_counter on exit. That is for
* this IRQ to do.
*/
}
}
/**
* rcu_irq_exit - Called from exiting Hard irq context.
*
* If the CPU was idle with dynamic ticks active, update the
* dynticks_progress_counter to put let the RCU handling be
* aware that the CPU is going back to idle with no ticks.
*/
void rcu_irq_exit(void)
{
int cpu = smp_processor_id();
/*
* rcu_update_flag is set if we interrupted the CPU
* when it was idle with ticks stopped.
* Once this occurs, we keep track of interrupt nesting
* because a NMI/SMI could also come in, and we still
* only want the IRQ that started the increment of the
* dynticks_progress_counter to be the one that modifies
* it on exit.
*/
if (per_cpu(rcu_update_flag, cpu)) {
if (--per_cpu(rcu_update_flag, cpu))
return;
/* This must match the interrupt nesting */
WARN_ON(in_interrupt());
/*
* If an NMI/SMI happens now we are still
* protected by the dynticks_progress_counter being odd.
*/
/*
* The following memory barrier ensures that any
* rcu_read_unlock() primitives in the irq handler
* are seen by other CPUs to preceed the following
* increment to dynticks_progress_counter. This
* is required in order for other CPUs to determine
* when it is safe to advance the RCU grace-period
* state machine.
*/
smp_mb(); /* see above block comment. */
per_cpu(dynticks_progress_counter, cpu)++;
WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
}
}
static void dyntick_save_progress_counter(int cpu)
{
per_cpu(rcu_dyntick_snapshot, cpu) =
per_cpu(dynticks_progress_counter, cpu);
}
static inline int
rcu_try_flip_waitack_needed(int cpu)
{
long curr;
long snap;
curr = per_cpu(dynticks_progress_counter, cpu);
snap = per_cpu(rcu_dyntick_snapshot, cpu);
smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
/*
* If the CPU remained in dynticks mode for the entire time
* and didn't take any interrupts, NMIs, SMIs, or whatever,
* then it cannot be in the middle of an rcu_read_lock(), so
* the next rcu_read_lock() it executes must use the new value
* of the counter. So we can safely pretend that this CPU
* already acknowledged the counter.
*/
if ((curr == snap) && ((curr & 0x1) == 0))
return 0;
/*
* If the CPU passed through or entered a dynticks idle phase with
* no active irq handlers, then, as above, we can safely pretend
* that this CPU already acknowledged the counter.
*/
if ((curr - snap) > 2 || (snap & 0x1) == 0)
return 0;
/* We need this CPU to explicitly acknowledge the counter flip. */
return 1;
}
static inline int
rcu_try_flip_waitmb_needed(int cpu)
{
long curr;
long snap;
curr = per_cpu(dynticks_progress_counter, cpu);
snap = per_cpu(rcu_dyntick_snapshot, cpu);
smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
/*
* If the CPU remained in dynticks mode for the entire time
* and didn't take any interrupts, NMIs, SMIs, or whatever,
* then it cannot have executed an RCU read-side critical section
* during that time, so there is no need for it to execute a
* memory barrier.
*/
if ((curr == snap) && ((curr & 0x1) == 0))
return 0;
/*
* If the CPU either entered or exited an outermost interrupt,
* SMI, NMI, or whatever handler, then we know that it executed
* a memory barrier when doing so. So we don't need another one.
*/
if (curr != snap)
return 0;
/* We need the CPU to execute a memory barrier. */
return 1;
}
#else /* !CONFIG_NO_HZ */
# define dyntick_save_progress_counter(cpu) do { } while (0)
# define rcu_try_flip_waitack_needed(cpu) (1)
# define rcu_try_flip_waitmb_needed(cpu) (1)
#endif /* CONFIG_NO_HZ */
/*
* Get here when RCU is idle. Decide whether we need to
* move out of idle state, and return non-zero if so.
@ -447,8 +657,10 @@ rcu_try_flip_idle(void)
/* Now ask each CPU for acknowledgement of the flip. */
for_each_cpu_mask(cpu, rcu_cpu_online_map)
for_each_cpu_mask(cpu, rcu_cpu_online_map) {
per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
dyntick_save_progress_counter(cpu);
}
return 1;
}
@ -464,7 +676,8 @@ rcu_try_flip_waitack(void)
RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
if (rcu_try_flip_waitack_needed(cpu) &&
per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
return 0;
}
@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void)
smp_mb(); /* ^^^^^^^^^^^^ */
/* Call for a memory barrier from each CPU. */
for_each_cpu_mask(cpu, rcu_cpu_online_map)
for_each_cpu_mask(cpu, rcu_cpu_online_map) {
per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
dyntick_save_progress_counter(cpu);
}
RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
return 1;
@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void)
RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
if (rcu_try_flip_waitmb_needed(cpu) &&
per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
return 0;
}

View File

@ -313,6 +313,7 @@ void irq_exit(void)
/* Make sure that timer wheel updates are propagated */
if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
tick_nohz_stop_sched_tick();
rcu_irq_exit();
#endif
preempt_enable_no_resched();
}

View File

@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void)
ts->idle_tick = ts->sched_timer.expires;
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
rcu_enter_nohz();
}
/*
@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void)
return;
}
rcu_exit_nohz();
/* Update jiffies first */
select_nohz_load_balancer(0);
now = ktime_get();