From e9dd685ce81815811fb4da72e6ab10a694ac8468 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2014 17:02:04 -0400 Subject: [PATCH 1/4] sched/numa: Fix use of spin_{un}lock_irq() when interrupts are disabled As Peter Zijlstra told me, we have the following path: do_exit() exit_itimers() itimer_delete() spin_lock_irqsave(&timer->it_lock, &flags); timer_delete_hook(timer); kc->timer_del(timer) := posix_cpu_timer_del() put_task_struct() __put_task_struct() task_numa_free() spin_lock(&grp->lock); Which means that task_numa_free() can be called with interrupts disabled, which means that we should not be using spin_lock_irq() but spin_lock_irqsave() instead. Otherwise we are enabling interrupts while holding an interrupt unsafe lock! Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Mike Galbraith Cc: Eric Dumazet Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140527182541.GH11096@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fdb96de81a5..b4768c069392 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1707,18 +1707,19 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - int i; void *numa_faults = p->numa_faults_memory; + unsigned long flags; + int i; if (grp) { - spin_lock_irq(&grp->lock); + spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); grp->nr_tasks--; - spin_unlock_irq(&grp->lock); + spin_unlock_irqrestore(&grp->lock, flags); rcu_assign_pointer(p->numa_group, NULL); put_numa_group(grp); } From b14ed2c273f8ab872ae4e6735fe5ab09cb14b8c3 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 2 Jun 2014 22:38:34 +0200 Subject: [PATCH 2/4] sched: Fix sched_policy < 0 comparison attr.sched_policy is u32, therefore a comparison against < 0 is never true. Fix this by casting sched_policy to int. This issue was reported by coverity CID 1219934. Fixes: dbdb22754fde ("sched: Disallow sched_attr::sched_policy < 0") Signed-off-by: Richard Weinberger Signed-off-by: Peter Zijlstra Cc: Michael Kerrisk Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1401741514-7045-1-git-send-email-richard@nod.at Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 86f3890c3d08..540542b93f9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3685,7 +3685,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) return retval; - if (attr.sched_policy < 0) + if ((int)attr.sched_policy < 0) return -EINVAL; rcu_read_lock(); From 0f397f2c90ce68821ee864c2c53baafe78de765d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 20 May 2014 13:33:42 +0400 Subject: [PATCH 3/4] sched/dl: Fix race in dl_task_timer() Throttled task is still on rq, and it may be moved to other cpu if user is playing with sched_setaffinity(). Therefore, unlocked task_rq() access makes the race. Juri Lelli reports he got this race when dl_bandwidth_enabled() was not set. Other thing, pointed by Peter Zijlstra: "Now I suppose the problem can still actually happen when you change the root domain and trigger a effective affinity change that way". To fix that we do the same as made in __task_rq_lock(). We do not use __task_rq_lock() itself, because it has a useful lockdep check, which is not correct in case of dl_task_timer(). We do not need pi_lock locked here. This case is an exception (PeterZ): "The only reason we don't strictly need ->pi_lock now is because we're guaranteed to have p->state == TASK_RUNNING here and are thus free of ttwu races". Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: # v3.14+ Cc: Linus Torvalds Link: http://lkml.kernel.org/r/3056991400578422@web14g.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 800e99b99075..14bc348ba3b4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -513,9 +513,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq; +again: + rq = task_rq(p); raw_spin_lock(&rq->lock); + if (rq != task_rq(p)) { + /* Task was moved, retrying. */ + raw_spin_unlock(&rq->lock); + goto again; + } + /* * We need to take care of a possible races here. In fact, the * task might have changed its scheduling policy to something From 09dc4ab03936df5c5aa711d27c81283c6d09f495 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 19 May 2014 15:10:09 +0400 Subject: [PATCH 4/4] sched/fair: Fix tg_set_cfs_bandwidth() deadlock on rq->lock tg_set_cfs_bandwidth() sets cfs_b->timer_active to 0 to force the period timer restart. It's not safe, because can lead to deadlock, described in commit 927b54fccbf0: "__start_cfs_bandwidth calls hrtimer_cancel while holding rq->lock, waiting for the hrtimer to finish. However, if sched_cfs_period_timer runs for another loop iteration, the hrtimer can attempt to take rq->lock, resulting in deadlock." Three CPUs must be involved: CPU0 CPU1 CPU2 take rq->lock period timer fired ... take cfs_b lock ... ... tg_set_cfs_bandwidth() throttle_cfs_rq() release cfs_b lock take cfs_b lock ... distribute_cfs_runtime() timer_active = 0 take cfs_b->lock wait for rq->lock ... __start_cfs_bandwidth() {wait for timer callback break if timer_active == 1} So, CPU0 and CPU1 are deadlocked. Instead of resetting cfs_b->timer_active, tg_set_cfs_bandwidth can wait for period timer callbacks (ignoring cfs_b->timer_active) and restart the timer explicitly. Signed-off-by: Roman Gushchin Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/87wqdi9g8e.wl\%klamm@yandex-team.ru Cc: pjt@google.com Cc: chris.j.arges@canonical.com Cc: gregkh@linuxfoundation.org Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 8 ++++---- kernel/sched/sched.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 540542b93f9f..f3f48e7a2dda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7751,8 +7751,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ - cfs_b->timer_active = 0; - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, true); } raw_spin_unlock_irq(&cfs_b->lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b4768c069392..8cbe2d2c16de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3130,7 +3130,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if (!cfs_b->timer_active) { __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); } if (cfs_b->runtime > 0) { @@ -3309,7 +3309,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); } @@ -3691,7 +3691,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) } /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) { /* * The timer may be active because we're trying to set a new bandwidth @@ -3706,7 +3706,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) + if (!force && cfs_b->timer_active) return; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 456e492a3dca..369b4d663c42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg);