diff --git a/include/linux/sched.h b/include/linux/sched.h
index 354583117f46..3615ef489138 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2022,6 +2022,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
 
 extern int set_cpus_allowed_ptr(struct task_struct *p,
 				const struct cpumask *new_mask);
+int migrate_me(void);
+void tell_sched_cpu_down_begin(int cpu);
+void tell_sched_cpu_down_done(int cpu);
+
 #else
 static inline void do_set_cpus_allowed(struct task_struct *p,
 				      const struct cpumask *new_mask)
@@ -2034,6 +2038,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 		return -EINVAL;
 	return 0;
 }
+static inline int migrate_me(void) { return 0; }
+static inline void tell_sched_cpu_down_begin(int cpu) { }
+static inline void tell_sched_cpu_down_done(int cpu) { }
 #endif
 
 #ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 49f920d31d57..139345fc2c9b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -51,12 +51,7 @@ static int cpu_hotplug_disabled;
 
 static struct {
 	struct task_struct *active_writer;
-#ifdef CONFIG_PREEMPT_RT_FULL
-	/* Makes the lock keep the task's state */
-	spinlock_t lock;
-#else
 	struct mutex lock; /* Synchronizes accesses to refcount, */
-#endif
 	/*
 	 * Also blocks the new readers during
 	 * an ongoing cpu hotplug operation.
@@ -64,28 +59,46 @@ static struct {
 	int refcount;
 } cpu_hotplug = {
 	.active_writer = NULL,
-#ifdef CONFIG_PREEMPT_RT_FULL
-	.lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock),
-#else
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#endif
 	.refcount = 0,
 };
 
-#ifdef CONFIG_PREEMPT_RT_FULL
-# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock)
-# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock)
-#else
-# define hotplug_lock() mutex_lock(&cpu_hotplug.lock)
-# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock)
-#endif
-
+/**
+ * hotplug_pcp - per cpu hotplug descriptor
+ * @unplug:	set when pin_current_cpu() needs to sync tasks
+ * @sync_tsk:	the task that waits for tasks to finish pinned sections
+ * @refcount:	counter of tasks in pinned sections
+ * @grab_lock:	set when the tasks entering pinned sections should wait
+ * @synced:	notifier for @sync_tsk to tell cpu_down it's finished
+ * @mutex:	the mutex to make tasks wait (used when @grab_lock is true)
+ * @mutex_init:	zero if the mutex hasn't been initialized yet.
+ *
+ * Although @unplug and @sync_tsk may point to the same task, the @unplug
+ * is used as a flag and still exists after @sync_tsk has exited and
+ * @sync_tsk set to NULL.
+ */
 struct hotplug_pcp {
 	struct task_struct *unplug;
+	struct task_struct *sync_tsk;
 	int refcount;
+	int grab_lock;
 	struct completion synced;
+#ifdef CONFIG_PREEMPT_RT_FULL
+	spinlock_t lock;
+#else
+	struct mutex mutex;
+#endif
+	int mutex_init;
 };
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock)
+# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock)
+#else
+# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
+# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
+#endif
+
 static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
 
 /**
@@ -99,18 +112,39 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
 void pin_current_cpu(void)
 {
 	struct hotplug_pcp *hp;
+	int force = 0;
 
 retry:
 	hp = &__get_cpu_var(hotplug_pcp);
 
-	if (!hp->unplug || hp->refcount || preempt_count() > 1 ||
+	if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
 	    hp->unplug == current) {
 		hp->refcount++;
 		return;
 	}
-	preempt_enable();
-	hotplug_lock();
-	hotplug_unlock();
+	if (hp->grab_lock) {
+		preempt_enable();
+		hotplug_lock(hp);
+		hotplug_unlock(hp);
+	} else {
+		preempt_enable();
+		/*
+		 * Try to push this task off of this CPU.
+		 */
+		if (!migrate_me()) {
+			preempt_disable();
+			hp = &__get_cpu_var(hotplug_pcp);
+			if (!hp->grab_lock) {
+				/*
+				 * Just let it continue; it's already pinned
+				 * or about to sleep.
+				 */
+				force = 1;
+				goto retry;
+			}
+			preempt_enable();
+		}
+	}
 	preempt_disable();
 	goto retry;
 }
@@ -131,26 +165,84 @@ void unpin_current_cpu(void)
 		wake_up_process(hp->unplug);
 }
 
-/*
- * FIXME: Is this really correct under all circumstances ?
- */
+static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
+{
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	while (hp->refcount) {
+		schedule_preempt_disabled();
+		set_current_state(TASK_UNINTERRUPTIBLE);
+	}
+}
+
 static int sync_unplug_thread(void *data)
 {
 	struct hotplug_pcp *hp = data;
 
 	preempt_disable();
 	hp->unplug = current;
+	wait_for_pinned_cpus(hp);
+
+	/*
+	 * This thread will synchronize the cpu_down() with threads
+	 * that have pinned the CPU. When the pinned CPU count reaches
+	 * zero, we inform the cpu_down code to continue to the next step.
+	 */
 	set_current_state(TASK_UNINTERRUPTIBLE);
-	while (hp->refcount) {
-		schedule_preempt_disabled();
+	preempt_enable();
+	complete(&hp->synced);
+
+	/*
+	 * If all succeeds, the next step will need tasks to wait till
+	 * the CPU is offline before continuing. To do this, the grab_lock
+	 * is set and tasks going into pin_current_cpu() will block on the
+	 * mutex. But we still need to wait for those that are already in
+	 * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
+	 * will kick this thread out.
+	 */
+	while (!hp->grab_lock && !kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_UNINTERRUPTIBLE);
+	}
+
+	/* Make sure grab_lock is seen before we see a stale completion */
+	smp_mb();
+
+	/*
+	 * Now just before cpu_down() enters stop machine, we need to make
+	 * sure all tasks that are in pinned CPU sections are out, and new
+	 * tasks will now grab the lock, keeping them from entering pinned
+	 * CPU sections.
+	 */
+	if (!kthread_should_stop()) {
+		preempt_disable();
+		wait_for_pinned_cpus(hp);
+		preempt_enable();
+		complete(&hp->synced);
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
 		set_current_state(TASK_UNINTERRUPTIBLE);
 	}
 	set_current_state(TASK_RUNNING);
-	preempt_enable();
-	complete(&hp->synced);
+
+	/*
+	 * Force this thread off this CPU as it's going down and
+	 * we don't want any more work on this CPU.
+	 */
+	current->flags &= ~PF_NO_SETAFFINITY;
+	do_set_cpus_allowed(current, cpu_present_mask);
+	migrate_me();
 	return 0;
 }
 
+static void __cpu_unplug_sync(struct hotplug_pcp *hp)
+{
+	wake_up_process(hp->sync_tsk);
+	wait_for_completion(&hp->synced);
+}
+
 /*
  * Start the sync_unplug_thread on the target cpu and wait for it to
  * complete.
@@ -158,23 +250,83 @@ static int sync_unplug_thread(void *data)
 static int cpu_unplug_begin(unsigned int cpu)
 {
 	struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
-	struct task_struct *tsk;
+	int err;
+
+	/* Protected by cpu_hotplug.lock */
+	if (!hp->mutex_init) {
+#ifdef CONFIG_PREEMPT_RT_FULL
+		spin_lock_init(&hp->lock);
+#else
+		mutex_init(&hp->mutex);
+#endif
+		hp->mutex_init = 1;
+	}
+
+	/* Inform the scheduler to migrate tasks off this CPU */
+	tell_sched_cpu_down_begin(cpu);
 
 	init_completion(&hp->synced);
-	tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
-	if (IS_ERR(tsk))
-		return (PTR_ERR(tsk));
-	kthread_bind(tsk, cpu);
-	wake_up_process(tsk);
-	wait_for_completion(&hp->synced);
+
+	hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
+	if (IS_ERR(hp->sync_tsk)) {
+		err = PTR_ERR(hp->sync_tsk);
+		hp->sync_tsk = NULL;
+		return err;
+	}
+	kthread_bind(hp->sync_tsk, cpu);
+
+	/*
+	 * Wait for tasks to get out of the pinned sections;
+	 * it's still OK if new tasks enter. Some CPU notifiers will
+	 * wait for tasks that are going to enter these sections and
+	 * we must not have them block.
+	 */
+	__cpu_unplug_sync(hp);
+
 	return 0;
 }
 
+static void cpu_unplug_sync(unsigned int cpu)
+{
+	struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
+
+	init_completion(&hp->synced);
+	/* The completion needs to be initialized before setting grab_lock */
+	smp_wmb();
+
+	/* Grab the mutex before setting grab_lock */
+	hotplug_lock(hp);
+	hp->grab_lock = 1;
+
+	/*
+	 * The CPU notifiers have been completed.
+	 * Wait for tasks to get out of pinned CPU sections and have new
+	 * tasks block until the CPU is completely down.
+	 */
+	__cpu_unplug_sync(hp);
+
+	/* All done with the sync thread */
+	kthread_stop(hp->sync_tsk);
+	hp->sync_tsk = NULL;
+}
+
 static void cpu_unplug_done(unsigned int cpu)
 {
 	struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
 
 	hp->unplug = NULL;
+	/* Let all tasks know cpu unplug is finished before cleaning up */
+	smp_wmb();
+
+	if (hp->sync_tsk)
+		kthread_stop(hp->sync_tsk);
+
+	if (hp->grab_lock) {
+		hotplug_unlock(hp);
+		/* protected by cpu_hotplug.lock */
+		hp->grab_lock = 0;
+	}
+	tell_sched_cpu_down_done(cpu);
 }
 
 void get_online_cpus(void)
@@ -182,9 +334,9 @@ void get_online_cpus(void)
 	might_sleep();
 	if (cpu_hotplug.active_writer == current)
 		return;
-	hotplug_lock();
+	mutex_lock(&cpu_hotplug.lock);
 	cpu_hotplug.refcount++;
-	hotplug_unlock();
+	mutex_unlock(&cpu_hotplug.lock);
 
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -194,14 +346,13 @@ void put_online_cpus(void)
 
 	if (cpu_hotplug.active_writer == current)
 		return;
-	hotplug_lock();
+	mutex_lock(&cpu_hotplug.lock);
 
 	if (WARN_ON(!cpu_hotplug.refcount))
 		cpu_hotplug.refcount++; /* try to fix things up */
 
 	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
 		wake_up_process(cpu_hotplug.active_writer);
-	hotplug_unlock();
-
+	mutex_unlock(&cpu_hotplug.lock);
 }
 EXPORT_SYMBOL_GPL(put_online_cpus);
 
@@ -232,11 +383,11 @@ void cpu_hotplug_begin(void)
 	cpu_hotplug.active_writer = current;
 
 	for (;;) {
-		hotplug_lock();
+		mutex_lock(&cpu_hotplug.lock);
 		if (likely(!cpu_hotplug.refcount))
 			break;
 		__set_current_state(TASK_UNINTERRUPTIBLE);
-		hotplug_unlock();
+		mutex_unlock(&cpu_hotplug.lock);
 		schedule();
 	}
 }
@@ -244,7 +395,7 @@ void cpu_hotplug_begin(void)
 void cpu_hotplug_done(void)
 {
 	cpu_hotplug.active_writer = NULL;
-	hotplug_unlock();
+	mutex_unlock(&cpu_hotplug.lock);
 }
 
 /*
@@ -458,6 +609,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	smpboot_park_threads(cpu);
 
+	/* Notifiers are done. Don't let any more tasks pin this CPU. */
+	cpu_unplug_sync(cpu);
+
 	/*
 	 * So now all preempt/rcu users must observe !cpu_active().
 	 */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2fbc0d299bb8..d25b47d2b4a2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2651,7 +2651,7 @@ void migrate_disable(void)
 {
 	struct task_struct *p = current;
 
-	if (in_atomic() || p->flags & PF_NO_SETAFFINITY) {
+	if (in_atomic()) {
 #ifdef CONFIG_SCHED_DEBUG
 		p->migrate_disable_atomic++;
 #endif
@@ -2681,7 +2681,7 @@ void migrate_enable(void)
 	unsigned long flags;
 	struct rq *rq;
 
-	if (in_atomic() || p->flags & PF_NO_SETAFFINITY) {
+	if (in_atomic()) {
 #ifdef CONFIG_SCHED_DEBUG
 		p->migrate_disable_atomic--;
 #endif
@@ -4749,6 +4749,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	cpumask_copy(&p->cpus_allowed, new_mask);
 }
 
+static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
+static DEFINE_MUTEX(sched_down_mutex);
+static cpumask_t sched_down_cpumask;
+
+void tell_sched_cpu_down_begin(int cpu)
+{
+	mutex_lock(&sched_down_mutex);
+	cpumask_set_cpu(cpu, &sched_down_cpumask);
+	mutex_unlock(&sched_down_mutex);
+}
+
+void tell_sched_cpu_down_done(int cpu)
+{
+	mutex_lock(&sched_down_mutex);
+	cpumask_clear_cpu(cpu, &sched_down_cpumask);
+	mutex_unlock(&sched_down_mutex);
+}
+
+/**
+ * migrate_me - try to move the current task off this cpu
+ *
+ * Used by the pin_current_cpu() code to try to get tasks
+ * to move off the current CPU as it is going down.
+ * It will only move the task if the task isn't pinned to
+ * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
+ * and the task is in a RUNNING state. Otherwise the
+ * movement of the task will wake it up (change its state
+ * to running) when the task did not expect it.
+ *
+ * Returns 1 if it succeeded in moving the current task,
+ *         0 otherwise.
+ */
+int migrate_me(void)
+{
+	struct task_struct *p = current;
+	struct migration_arg arg;
+	struct cpumask *cpumask;
+	struct cpumask *mask;
+	unsigned long flags;
+	unsigned int dest_cpu;
+	struct rq *rq;
+
+	/*
+	 * We cannot migrate tasks bound to a CPU or tasks that are not
+	 * running. The movement of the task will wake it up.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY || p->state)
+		return 0;
+
+	mutex_lock(&sched_down_mutex);
+	rq = task_rq_lock(p, &flags);
+
+	cpumask = &__get_cpu_var(sched_cpumasks);
+	mask = &p->cpus_allowed;
+
+	cpumask_andnot(cpumask, mask, &sched_down_cpumask);
+
+	if (!cpumask_weight(cpumask)) {
+		/* It's only on this CPU? */
+		task_rq_unlock(rq, p, &flags);
+		mutex_unlock(&sched_down_mutex);
+		return 0;
+	}
+
+	dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
+
+	arg.task = p;
+	arg.dest_cpu = dest_cpu;
+
+	task_rq_unlock(rq, p, &flags);
+
+	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+	tlb_migrate_finish(p->mm);
+	mutex_unlock(&sched_down_mutex);
+
+	return 1;
+}
+
 /*
  * This is how migration works:
  *