sched/fair: Remove double_lock_balance() from load_balance()

Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for
load_balance(). The advantage is (obviously) not holding two
rq->lock's at the same time and thereby increasing parallelism.

Further note that if there was no task to migrate we will not
have acquired the second rq->lock at all.

The important point to note is that because we acquire dst->lock
immediately after releasing src->lock the potential wait time of
task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer
than it would have been in the double rq lock scenario.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1408528109.23412.94.camel@tkhai
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Kirill Tkhai 2014-08-20 13:48:29 +04:00 committed by Ingo Molnar
parent e5673f2805
commit 163122b7fc
1 changed files with 99 additions and 52 deletions

View File

@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/*
* This is possible from callers such as move_task(), in which we
* This is possible from callers such as attach_tasks(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@ -5117,20 +5117,9 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
struct list_head tasks;
};
/*
* move_task - move a task from one runqueue to another runqueue.
* Both runqueues must be locked.
*/
static void move_task(struct task_struct *p, struct lb_env *env)
{
deactivate_task(env->src_rq, p, 0);
set_task_cpu(p, env->dst_cpu);
activate_task(env->dst_rq, p, 0);
check_preempt_curr(env->dst_rq, p, 0);
}
/*
* Is this task likely cache-hot:
*/
@ -5345,6 +5334,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
}
/*
* detach_task() -- detach the task for the migration specified in env
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, env->dst_cpu);
}
/*
* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env)
if (!can_migrate_task(p, env))
continue;
deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, env->dst_cpu);
detach_task(p, env);
/*
* Right now, this is only the second place where
* lb_gained[env->idle] is updated (other is move_tasks)
* lb_gained[env->idle] is updated (other is detach_tasks)
* so we can safely collect stats here rather than
* inside move_tasks().
* inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
return p;
@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env)
return NULL;
}
/*
* attach_one_task() -- attaches the task returned from detach_one_task() to
* its new rq.
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
BUG_ON(task_rq(p) != rq);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
raw_spin_unlock(&rq->lock);
}
static const unsigned int sched_nr_migrate_break = 32;
/*
* move_tasks tries to move up to imbalance weighted load from busiest to
* this_rq, as part of a balancing operation within domain "sd".
* Returns 1 if successful and 0 otherwise.
* detach_tasks() -- tries to detach up to imbalance weighted load from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Called with both runqueues locked.
* Returns number of detached tasks if successful and 0 otherwise.
*/
static int move_tasks(struct lb_env *env)
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
int pulled = 0;
int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
move_task(p, env);
pulled++;
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is pulled to minimize
* kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
@ -5463,13 +5451,58 @@ next:
}
/*
* Right now, this is one of only two places move_task() is called,
* so we can safely collect move_task() stats here rather than
* inside move_task().
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task().
*/
schedstat_add(env->sd, lb_gained[env->idle], pulled);
schedstat_add(env->sd, lb_gained[env->idle], detached);
return pulled;
return detached;
}
/*
* attach_task() -- attach the task detached by detach_task() to its new rq.
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
}
/*
* attach_one_task() -- attaches the task returned from detach_one_task() to
* its new rq.
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
raw_spin_unlock(&rq->lock);
}
/*
* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
* new rq.
*/
static void attach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->tasks;
struct task_struct *p;
raw_spin_lock(&env->dst_rq->lock);
while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node);
list_del_init(&p->se.group_node);
attach_task(env->dst_rq, p);
}
raw_spin_unlock(&env->dst_rq->lock);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
/*
@ -6652,16 +6686,29 @@ redo:
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
local_irq_save(flags);
double_rq_lock(env.dst_rq, busiest);
raw_spin_lock_irqsave(&busiest->lock, flags);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = move_tasks(&env);
ld_moved += cur_ld_moved;
double_rq_unlock(env.dst_rq, busiest);
cur_ld_moved = detach_tasks(&env);
/*
* We've detached some tasks from busiest_rq. Every
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
* unlock busiest->lock, and we are able to be sure
* that nobody can manipulate the tasks in parallel.
* See task_rq_lock() family for the details.
*/
raw_spin_unlock(&busiest->lock);
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved;
}
local_irq_restore(flags);
/*
@ -6797,7 +6844,7 @@ more_balance:
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* move_tasks).
* detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;