From cd4ae6adf8b1c21d88e83ed56afeeef97b28f356 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng
Date: Fri, 22 Apr 2011 18:53:54 +0800
Subject: [PATCH 1/4] sched: More sched_domain iterations fixes

sched_domain iterations need to be protected by rcu_read_lock() now;
this patch adds two more places which need the rcu lock, spotted via
the following suspicious rcu_dereference_check() usage warnings:

  kernel/sched_rt.c:1244 invoked rcu_dereference_check() without protection!
  kernel/sched_stats.h:41 invoked rcu_dereference_check() without protection!

Signed-off-by: Xiaotian Feng
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1303469634-11678-1-git-send-email-dfeng@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/sched_rt.c    | 10 ++++++++--
 kernel/sched_stats.h |  4 ++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 64b2a37c07d0..88725c939e0b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1263,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!cpumask_test_cpu(this_cpu, lowest_mask))
 		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
 
+	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		if (sd->flags & SD_WAKE_AFFINE) {
 			int best_cpu;
@@ -1272,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task)
 			 * remote processor.
 			 */
 			if (this_cpu != -1 &&
-			    cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+				rcu_read_unlock();
 				return this_cpu;
+			}
 
 			best_cpu = cpumask_first_and(lowest_mask,
 						     sched_domain_span(sd));
-			if (best_cpu < nr_cpu_ids)
+			if (best_cpu < nr_cpu_ids) {
+				rcu_read_unlock();
 				return best_cpu;
+			}
 		}
 	}
+	rcu_read_unlock();
 
 	/*
 	 * And finally, if there were no matches within the domains
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0e..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
 #ifdef CONFIG_SMP
 		/* domain-specific stats */
-		preempt_disable();
+		rcu_read_lock();
 		for_each_domain(cpu, sd) {
 			enum cpu_idle_type itype;
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
 			    sd->ttwu_move_balance);
 		}
-		preempt_enable();
+		rcu_read_unlock();
 #endif
 	}
 	kfree(mask_str);
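
The shape of the fix above, reduced to a schematic (illustrative only, not a
compilable excerpt; find_lowest_rq() contains more logic than shown): the
whole for_each_domain() walk now sits inside an RCU read-side critical
section, and every early-return path inside the loop has to drop the lock
before returning, otherwise the read-side section would be leaked.

    rcu_read_lock();
    for_each_domain(cpu, sd) {
            /* ... examine this domain ... */
            if (found) {                    /* early exit from the walk */
                    rcu_read_unlock();      /* must unlock before the return */
                    return best_cpu;
            }
    }
    rcu_read_unlock();                      /* fall-through path */

The sched_stats.h hunk is the simpler case: the domain list is now protected
by RCU rather than by disabling preemption, so the preempt_disable() /
preempt_enable() pair around the iteration becomes rcu_read_lock() /
rcu_read_unlock().
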
From d6aa8f85f16379d42c147b22b59e33b67f9ff466 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 26 May 2011 14:21:33 +0200
Subject: [PATCH 2/4] sched: Fix ttwu() for __ARCH_WANT_INTERRUPTS_ON_CTXSW

Marc reported that e4a52bcb9 (sched: Remove rq->lock from the first half
of ttwu()) broke his ARM-SMP machine. Now ARM is one of the few
__ARCH_WANT_INTERRUPTS_ON_CTXSW users, so that exception in the ttwu()
code was suspect.

Yong found that the interrupt could hit after context_switch() changes
current but before it clears p->on_cpu; if that interrupt were to
attempt a wake-up of p, we would indeed find ourselves spinning in IRQ
context.

Fix this by reverting to the old behaviour for this situation and
performing a full remote wake-up.

Cc: Frank Rowand
Cc: Yong Zhang
Cc: Oleg Nesterov
Reported-by: Marc Zyngier
Tested-by: Marc Zyngier
Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5e43e9dc65d1..a80ee911900e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2573,7 +2573,26 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
 	if (!next)
 		smp_send_reschedule(cpu);
 }
-#endif
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	rq = __task_rq_lock(p);
+	if (p->on_cpu) {
+		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
 
 static void ttwu_queue(struct task_struct *p, int cpu)
 {
@@ -2631,17 +2650,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	while (p->on_cpu) {
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 		/*
-		 * If called from interrupt context we could have landed in the
-		 * middle of schedule(), in this case we should take care not
-		 * to spin on ->on_cpu if p is current, since that would
-		 * deadlock.
+		 * In case the architecture enables interrupts in
+		 * context_switch(), we cannot busy wait, since that
+		 * would lead to deadlocks when an interrupt hits and
+		 * tries to wake up @prev. So bail and do a complete
+		 * remote wakeup.
 		 */
-		if (p == current) {
-			ttwu_queue(p, cpu);
+		if (ttwu_activate_remote(p, wake_flags))
 			goto stat;
-		}
-#endif
+#else
 		cpu_relax();
+#endif
 	}
 	/*
 	 * Pairs with the smp_wmb() in finish_lock_switch().
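
The window Yong identified, as a rough single-CPU timeline (schematic; the
exact code locations are simplified):

    /* one CPU, __ARCH_WANT_INTERRUPTS_ON_CTXSW, interrupts enabled */
    context_switch()
        current = next;                 /* prev->on_cpu is still 1 */
            <IRQ>  try_to_wake_up(prev)
                       while (prev->on_cpu)
                               cpu_relax();     /* spins forever: the code that
                                                   clears on_cpu is further down
                                                   this same CPU's call chain */
    finish_lock_switch()
        prev->on_cpu = 0;               /* never reached */

With the fix, the __ARCH_WANT_INTERRUPTS_ON_CTXSW branch no longer busy-waits
at all: ttwu_activate_remote() takes the task's rq lock, and if the task is
still marked on_cpu it is activated and woken up under that lock, after which
try_to_wake_up() jumps to the stat label instead of spinning.
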
From 1e876231785d82443a5ac8b6c660e9f51bc5dede Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 17 May 2011 16:21:10 -0700
Subject: [PATCH 3/4] sched: Fix ->min_vruntime calculation in dequeue_entity()

Dima Zavin reported:

 "After pulling the thread off the run-queue during a cgroup change,
  the cfs_rq.min_vruntime gets recalculated. The dequeued thread's
  vruntime then gets normalized to this new value. This can then lead
  to the thread getting an unfair boost in the new group if the
  vruntime of the next task in the old run-queue was way further
  ahead."

Reported-by: Dima Zavin
Signed-off-by: John Stultz
Recalls-having-tested-once-upon-a-time-by: Mike Galbraith
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1305674470-23727-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e32a9b70ee9c..433491c2dc8f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	se->on_rq = 0;
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
-	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
+
+	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq);
 }
 
 /*
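
A rough numeric illustration of the report (the values are invented for the
example): say the task being moved has vruntime 100, the old cfs_rq's
min_vruntime is also 100, and the next-leftmost task on that queue sits at
150. With the old ordering, update_min_vruntime() runs before the
normalization and advances min_vruntime to 150, so the dequeue computes

    se->vruntime -= cfs_rq->min_vruntime;   /* 100 - 150 = -50 */

and the later enqueue in the new group adds that group's min_vruntime back,
leaving the task 50 behind everyone else there, which CFS rewards with extra
CPU time. With the normalization moved in front of update_min_vruntime(),
the task is normalized against 100 and re-enters the new group with no
artificial lead.
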
From 1e1b6c511d1b23cb7c3b619d82fc7bd9f620565d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Thu, 19 May 2011 15:08:58 +0900
Subject: [PATCH 4/4] cpuset: Fix cpuset_cpus_allowed_fallback(), don't update tsk->rt.nr_cpus_allowed

The rule is that we have to update tsk->rt.nr_cpus_allowed whenever we
change tsk->cpus_allowed. Otherwise the RT scheduler may get confused.

Signed-off-by: KOSAKI Motohiro
Cc: Oleg Nesterov
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/4DD4B3FA.5060901@jp.fujitsu.com
Signed-off-by: Ingo Molnar
---
 include/linux/cpuset.h |  2 +-
 include/linux/sched.h  |  7 +++++++
 kernel/cpuset.c        |  4 ++--
 kernel/kthread.c       |  4 ++--
 kernel/sched.c         | 19 ++++++++++++-------
 5 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index f20eb8f16025..e9eaec522655 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -146,7 +146,7 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 
 static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
 {
-	cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+	do_set_cpus_allowed(p, cpu_possible_mask);
 	return cpumask_any(cpu_active_mask);
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dc8871295a5a..8da84b7bc1b8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1841,9 +1841,16 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif
 
 #ifdef CONFIG_SMP
+extern void do_set_cpus_allowed(struct task_struct *p,
+				const struct cpumask *new_mask);
+
 extern int set_cpus_allowed_ptr(struct task_struct *p,
 				const struct cpumask *new_mask);
 #else
+static inline void do_set_cpus_allowed(struct task_struct *p,
+				       const struct cpumask *new_mask)
+{
+}
 static inline int set_cpus_allowed_ptr(struct task_struct *p,
 				       const struct cpumask *new_mask)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1ceeb049c827..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2190,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	rcu_read_lock();
 	cs = task_cs(tsk);
 	if (cs)
-		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+		do_set_cpus_allowed(tsk, cs->cpus_allowed);
 	rcu_read_unlock();
 
 	/*
@@ -2217,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	 * Like above we can temporary set any mask and rely on
 	 * set_cpus_allowed_ptr() as synchronization point.
 	 */
-	cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+	do_set_cpus_allowed(tsk, cpu_possible_mask);
 	cpu = cpumask_any(cpu_active_mask);
 	}
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bce..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
 		return;
 	}
 
-	p->cpus_allowed = cpumask_of_cpu(cpu);
-	p->rt.nr_cpus_allowed = 1;
+	/* It's safe because the task is inactive. */
+	do_set_cpus_allowed(p, cpumask_of(cpu));
 	p->flags |= PF_THREAD_BOUND;
 }
 EXPORT_SYMBOL(kthread_bind);
diff --git a/kernel/sched.c b/kernel/sched.c
index a80ee911900e..cbb3a0eee58e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5860,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
-	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+	do_set_cpus_allowed(idle, cpumask_of(cpu));
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5948,6 +5948,16 @@ static inline void sched_init_granularity(void)
 }
 
 #ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class && p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, new_mask);
+	else {
+		cpumask_copy(&p->cpus_allowed, new_mask);
+		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+	}
+}
+
 /*
  * This is how migration works:
  *
@@ -5993,12 +6003,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		goto out;
 	}
 
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-	else {
-		cpumask_copy(&p->cpus_allowed, new_mask);
-		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
-	}
+	do_set_cpus_allowed(p, new_mask);
 
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
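
The invariant behind this last patch, restated once (the helper body is the
one added in the kernel/sched.c hunk above): p->rt.nr_cpus_allowed must
always reflect the weight of p->cpus_allowed, because the RT scheduler
consults nr_cpus_allowed, for instance when deciding whether a task is
migratable. Every place that used to open-code the cpumask copy is therefore
switched to the new helper:

    /* before: the copy alone leaves rt.nr_cpus_allowed stale */
    cpumask_copy(&p->cpus_allowed, new_mask);

    /* after: one choke point keeps both fields consistent */
    do_set_cpus_allowed(p, new_mask);

kthread_bind() is the same change in disguise: it previously wrote both
fields by hand (and via the older cpumask_of_cpu() API); routing it through
do_set_cpus_allowed() keeps the pairing in one place.
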