commit 43768f7ce0

Merge tag 'sched-urgent-2020-07-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into master

Pull scheduler fixes from Thomas Gleixner:
 "A set of scheduler fixes:

   - Plug a load average accounting race which was introduced with a
     recent optimization, causing load average to show bogus numbers.

   - Fix the rseq CPU id initialization for new tasks. sched_fork() does
     not update the rseq CPU id, so the id is the stale id of the parent
     task, which can cause user space data corruption.

   - Handle a 0 return value of task_h_load() correctly in the load
     balancer, which otherwise does not decrease the imbalance and
     therefore pulls until the maximum number of loops is reached, which
     might be all tasks just created by a fork bomb"

* tag 'sched-urgent-2020-07-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: handle case of task_h_load() returning 0
  sched: Fix unreliable rseq cpu_id for new tasks
  sched: Fix loadavg accounting race
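As a minimal illustration of the third fix, the sketch below is stand-alone C with invented toy_* names, not the kernel code: the balancer's detach loop subtracts each detached task's load from the remaining imbalance, so a per-task load of 0 never makes progress and the loop only ends at loop_max; clamping the contribution to at least 1, as the detach_tasks() hunk further down does with max_t(), restores forward progress.

#include <stdio.h>

/*
 * Hypothetical stand-in for task_h_load(): with many CPUs and a deep
 * cgroup hierarchy the hierarchical load of a task can round down to 0.
 */
static unsigned long toy_task_h_load(int deep_hierarchy)
{
        return deep_hierarchy ? 0 : 512;
}

/* Sketch of the detach loop: stop when the imbalance is gone or loop_max is hit. */
static int toy_detach_tasks(long imbalance, int deep_hierarchy, int loop_max,
                            int clamp)
{
        int detached = 0;

        for (int loop = 0; loop < loop_max && imbalance > 0; loop++) {
                unsigned long load = toy_task_h_load(deep_hierarchy);

                if (clamp && load < 1)
                        load = 1;       /* the fix: guarantee forward progress */

                imbalance -= load;
                detached++;
        }
        return detached;
}

int main(void)
{
        /* Unclamped: a zero load never reduces the imbalance, so the loop runs to loop_max. */
        printf("unclamped: %d tasks detached\n", toy_detach_tasks(8, 1, 32, 0));
        /* Clamped: each task counts for at least 1, so 8 tasks settle an imbalance of 8. */
        printf("clamped:   %d tasks detached\n", toy_detach_tasks(8, 1, 32, 1));
        return 0;
}

Run as-is, the unclamped call detaches all loop_max tasks without ever reducing the imbalance, while the clamped call stops after eight.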
@@ -114,10 +114,6 @@ struct task_group;
 
 #define task_is_stopped_or_traced(task)        ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 
-#define task_contributes_to_load(task)        ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
-                                         (task->flags & PF_FROZEN) == 0 && \
-                                         (task->state & TASK_NOLOAD) == 0)
-
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
 /*
@@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible--;
-
         enqueue_task(rq, p, flags);
 
         p->on_rq = TASK_ON_RQ_QUEUED;
@@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
         p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
 
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible++;
-
         dequeue_task(rq, p, flags);
 }
 
@@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 
         lockdep_assert_held(&rq->lock);
 
-#ifdef CONFIG_SMP
         if (p->sched_contributes_to_load)
                 rq->nr_uninterruptible--;
 
+#ifdef CONFIG_SMP
         if (wake_flags & WF_MIGRATED)
                 en_flags |= ENQUEUE_MIGRATED;
 #endif
@@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
          * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
          */
         smp_rmb();
-        if (p->on_rq && ttwu_remote(p, wake_flags))
+        if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
                 goto unlock;
 
         if (p->in_iowait) {
@@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         }
 
 #ifdef CONFIG_SMP
-        p->sched_contributes_to_load = !!task_contributes_to_load(p);
-        p->state = TASK_WAKING;
-
         /*
          * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
          * possible to, falsely, observe p->on_cpu == 0.
@@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
          *
          * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
          * __schedule(). See the comment for smp_mb__after_spinlock().
+         *
+         * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+         * schedule()'s deactivate_task() has 'happened' and p will no longer
+         * care about it's own p->state. See the comment in __schedule().
          */
-        smp_rmb();
+        smp_acquire__after_ctrl_dep();
+
+        /*
+         * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+         * == 0), which means we need to do an enqueue, change p->state to
+         * TASK_WAKING such that we can unlock p->pi_lock before doing the
+         * enqueue, such as ttwu_queue_wakelist().
+         */
+        p->state = TASK_WAKING;
 
         /*
          * If the owning (remote) CPU is still in the middle of schedule() with
@@ -2962,6 +2965,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
          * Silence PROVE_RCU.
          */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
+        rseq_migrate(p);
         /*
          * We're setting the CPU for the first time, we don't migrate,
          * so use __set_task_cpu().
@@ -3026,6 +3030,7 @@ void wake_up_new_task(struct task_struct *p)
          * as we're not fully set-up yet.
          */
         p->recent_used_cpu = task_cpu(p);
+        rseq_migrate(p);
         __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
         rq = __task_rq_lock(p, &rf);
@@ -4097,6 +4102,7 @@ static void __sched notrace __schedule(bool preempt)
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
+        unsigned long prev_state;
         struct rq_flags rf;
         struct rq *rq;
         int cpu;
@@ -4113,12 +4119,22 @@ static void __sched notrace __schedule(bool preempt)
         local_irq_disable();
         rcu_note_context_switch(preempt);
 
+        /* See deactivate_task() below. */
+        prev_state = prev->state;
+
         /*
          * Make sure that signal_pending_state()->signal_pending() below
          * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
-         * done by the caller to avoid the race with signal_wake_up().
+         * done by the caller to avoid the race with signal_wake_up():
          *
-         * The membarrier system call requires a full memory barrier
+         * __set_current_state(@state)          signal_wake_up()
+         * schedule()                             set_tsk_thread_flag(p, TIF_SIGPENDING)
+         *                                        wake_up_state(p, state)
+         *   LOCK rq->lock                          LOCK p->pi_state
+         *   smp_mb__after_spinlock()               smp_mb__after_spinlock()
+         *     if (signal_pending_state())          if (p->state & @state)
+         *
+         * Also, the membarrier system call requires a full memory barrier
          * after coming from user-space, before storing to rq->curr.
          */
         rq_lock(rq, &rf);
@@ -4129,10 +4145,31 @@ static void __sched notrace __schedule(bool preempt)
         update_rq_clock(rq);
 
         switch_count = &prev->nivcsw;
-        if (!preempt && prev->state) {
-                if (signal_pending_state(prev->state, prev)) {
+
+        /*
+         * We must re-load prev->state in case ttwu_remote() changed it
+         * before we acquired rq->lock.
+         */
+        if (!preempt && prev_state && prev_state == prev->state) {
+                if (signal_pending_state(prev_state, prev)) {
                         prev->state = TASK_RUNNING;
                 } else {
+                        prev->sched_contributes_to_load =
+                                (prev_state & TASK_UNINTERRUPTIBLE) &&
+                                !(prev_state & TASK_NOLOAD) &&
+                                !(prev->flags & PF_FROZEN);
+
+                        if (prev->sched_contributes_to_load)
+                                rq->nr_uninterruptible++;
+
+                        /*
+                         * __schedule()                 ttwu()
+                         *   prev_state = prev->state;    if (READ_ONCE(p->on_rq) && ...)
+                         *   LOCK rq->lock                  goto out;
+                         *   smp_mb__after_spinlock();      smp_acquire__after_ctrl_dep();
+                         *   p->on_rq = 0;                  p->state = TASK_WAKING;
+                         *
+                         * After this, schedule() must not care about p->state any more.
+                         */
                         deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
 
                         if (prev->in_iowait) {
@@ -4039,7 +4039,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
                 return;
         }
 
-        rq->misfit_task_load = task_h_load(p);
+        /*
+         * Make sure that misfit_task_load will not be null even if
+         * task_h_load() returns 0.
+         */
+        rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
 }
 
 #else /* CONFIG_SMP */
@@ -7638,7 +7642,14 @@ static int detach_tasks(struct lb_env *env)
 
                 switch (env->migration_type) {
                 case migrate_load:
-                        load = task_h_load(p);
+                        /*
+                         * Depending of the number of CPUs and tasks and the
+                         * cgroup hierarchy, task_h_load() can return a null
+                         * value. Make sure that env->imbalance decreases
+                         * otherwise detach_tasks() will stop only after
+                         * detaching up to loop_max tasks.
+                         */
+                        load = max_t(unsigned long, task_h_load(p), 1);
 
                         if (sched_feat(LB_MIN) &&
                             load < 16 && !env->sd->nr_balance_failed)
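The rseq hunks above add only two rseq_migrate() calls; the underlying problem is that the child's rseq area is copied from the parent at fork, so the cached CPU id stays the parent's until something refreshes it. The following is a hypothetical stand-alone model of that staleness (plain C, invented toy_* names, not the kernel implementation): refreshing the cached id at fork time, as the added calls in sched_fork() and wake_up_new_task() do, keeps user space from reading the parent's stale CPU id.

#include <stdio.h>
#include <string.h>

/* Toy model of a task whose rseq area caches the CPU id seen by user space. */
struct toy_task {
        int cpu;                /* CPU the scheduler placed the task on */
        int rseq_cpu_id;        /* what user space reads from the rseq area */
};

/* Stand-in for rseq_migrate(): refresh the cached CPU id after placement. */
static void toy_rseq_migrate(struct toy_task *t)
{
        t->rseq_cpu_id = t->cpu;
}

/* Fork model: the child starts as a byte-for-byte copy of the parent. */
static struct toy_task toy_fork(const struct toy_task *parent, int child_cpu,
                                int refresh)
{
        struct toy_task child;

        memcpy(&child, parent, sizeof(child));  /* inherits the parent's rseq id */
        child.cpu = child_cpu;                  /* but is placed on another CPU  */
        if (refresh)
                toy_rseq_migrate(&child);       /* the fix: refresh at fork time */
        return child;
}

int main(void)
{
        struct toy_task parent = { .cpu = 3, .rseq_cpu_id = 3 };
        struct toy_task stale = toy_fork(&parent, 7, 0);
        struct toy_task fresh = toy_fork(&parent, 7, 1);

        printf("without refresh: cpu=%d rseq_cpu_id=%d (stale)\n",
               stale.cpu, stale.rseq_cpu_id);
        printf("with refresh:    cpu=%d rseq_cpu_id=%d\n",
               fresh.cpu, fresh.rseq_cpu_id);
        return 0;
}

Without the refresh the child reports the parent's CPU id (3) even though it was placed on CPU 7; with the refresh it reports 7.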