diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7e6b6f9c5f6..66b878e94554 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4348,6 +4348,25 @@ fail: return ret; } +#ifdef CONFIG_NUMA_BALANCING +/* Migrate current task p to target_cpu */ +int migrate_task_to(struct task_struct *p, int target_cpu) +{ + struct migration_arg arg = { p, target_cpu }; + int curr_cpu = task_cpu(p); + + if (curr_cpu == target_cpu) + return 0; + + if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) + return -EINVAL; + + /* TODO: This is not properly updating schedstats */ + + return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); +} +#endif + /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 89431248d33d..8b15e9e1d1b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -886,6 +886,31 @@ static unsigned int task_scan_max(struct task_struct *p) */ unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3; +static unsigned long weighted_cpuload(const int cpu); + + +static int +find_idlest_cpu_node(int this_cpu, int nid) +{ + unsigned long load, min_load = ULONG_MAX; + int i, idlest_cpu = this_cpu; + + BUG_ON(cpu_to_node(this_cpu) == nid); + + rcu_read_lock(); + for_each_cpu(i, cpumask_of_node(nid)) { + load = weighted_cpuload(i); + + if (load < min_load) { + min_load = load; + idlest_cpu = i; + } + } + rcu_read_unlock(); + + return idlest_cpu; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1; @@ -916,10 +941,29 @@ static void task_numa_placement(struct task_struct *p) } } - /* Update the tasks preferred node if necessary */ + /* + * Record the preferred node as the node with the most faults, + * requeue the task to be running on the idlest CPU on the + * preferred node and reset the scanning rate to recheck + * the working set placement. + */ if (max_faults && max_nid != p->numa_preferred_nid) { + int preferred_cpu; + + /* + * If the task is not on the preferred node then find the most + * idle CPU to migrate to. + */ + preferred_cpu = task_cpu(p); + if (cpu_to_node(preferred_cpu) != max_nid) { + preferred_cpu = find_idlest_cpu_node(preferred_cpu, + max_nid); + } + + /* Update the preferred nid and migrate task if possible */ p->numa_preferred_nid = max_nid; p->numa_migrate_seq = 0; + migrate_task_to(p, preferred_cpu); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 199099c7aa22..66458c902d84 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -557,6 +557,7 @@ static inline u64 rq_clock_task(struct rq *rq) } #ifdef CONFIG_NUMA_BALANCING +extern int migrate_task_to(struct task_struct *p, int cpu); static inline void task_numa_free(struct task_struct *p) { kfree(p->numa_faults);