diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ab3b89fc33e..ef92953764f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1459,6 +1459,8 @@ struct task_struct {
 	int numa_preferred_nid;
 	unsigned long numa_migrate_retry;
 	u64 node_stamp;			/* migration stamp */
+	u64 last_task_numa_placement;
+	u64 last_sum_exec_runtime;
 	struct callback_head numa_work;
 
 	struct list_head numa_entry;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc708c53bf03..a561c9e8e382 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1746,6 +1746,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_work.next = &p->numa_work;
 	p->numa_faults_memory = NULL;
 	p->numa_faults_buffer_memory = NULL;
+	p->last_task_numa_placement = 0;
+	p->last_sum_exec_runtime = 0;
 
 	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eeabb33f349e..8fc3a8234817 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -887,6 +887,11 @@ struct numa_group {
 	struct rcu_head rcu;
 	nodemask_t active_nodes;
 	unsigned long total_faults;
+	/*
+	 * Faults_cpu is used to decide whether memory should move
+	 * towards the CPU. As a consequence, these stats are weighted
+	 * more by CPU use than by memory faults.
+	 */
 	unsigned long *faults_cpu;
 	unsigned long faults[0];
 };
@@ -1446,11 +1451,41 @@ static void update_task_scan_period(struct task_struct *p,
 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 }
 
+/*
+ * Return the task's runtime since the last NUMA placement cycle, store the
+ * cycle length in *period (their ratio is the fraction of time the task ran)
+ * and record the new baseline in @p. The scheduler's own stats decay on a
+ * 32ms period, orders of magnitude off from the dozens-of-seconds NUMA
+ * balancing period, so use them only if there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+	u64 runtime, delta, now;
+	/* Use the start of this time slice to avoid calculations. */
+	now = p->se.exec_start;
+	runtime = p->se.sum_exec_runtime;
+
+	if (p->last_task_numa_placement) {
+		delta = runtime - p->last_sum_exec_runtime;
+		*period = now - p->last_task_numa_placement;
+	} else {
+		delta = p->se.avg.runnable_avg_sum;
+		*period = p->se.avg.runnable_avg_period;
+	}
+
+	p->last_sum_exec_runtime = runtime;
+	p->last_task_numa_placement = now;
+
+	return delta;
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = -1, max_group_nid = -1;
 	unsigned long max_faults = 0, max_group_faults = 0;
 	unsigned long fault_types[2] = { 0, 0 };
+	unsigned long total_faults;
+	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1459,6 +1494,10 @@ static void task_numa_placement(struct task_struct *p)
 	p->numa_scan_seq = seq;
 	p->numa_scan_period_max = task_scan_max(p);
 
+	total_faults = p->numa_faults_locality[0] +
+		       p->numa_faults_locality[1];
+	runtime = numa_get_avg_runtime(p, &period);
+
 	/* If the task is part of a group prevent parallel updates to group stats */
 	if (p->numa_group) {
 		group_lock = &p->numa_group->lock;
@@ -1471,7 +1510,7 @@ static void task_numa_placement(struct task_struct *p)
 		int priv, i;
 
 		for (priv = 0; priv < 2; priv++) {
-			long diff, f_diff;
+			long diff, f_diff, f_weight;
 
 			i = task_faults_idx(nid, priv);
 			diff = -p->numa_faults_memory[i];
@@ -1483,8 +1522,18 @@ static void task_numa_placement(struct task_struct *p)
 			fault_types[priv] += p->numa_faults_buffer_memory[i];
 			p->numa_faults_buffer_memory[i] = 0;
 
+			/*
+			 * Normalize the faults_cpu, so all tasks in a group
+			 * count according to CPU use, instead of by the raw
+			 * number of faults. Tasks with little runtime have
+			 * little over-all impact on throughput, and thus their
+			 * faults are less important.
+			 */
+			f_weight = div64_u64(runtime << 16, period + 1);
+			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+				   (total_faults + 1);
 			p->numa_faults_cpu[i] >>= 1;
-			p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i];
+			p->numa_faults_cpu[i] += f_weight;
 			p->numa_faults_buffer_cpu[i] = 0;
 
 			faults += p->numa_faults_memory[i];