diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c357561db5d..0da79d8a6a2c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9121,6 +9121,124 @@ out_unlock: return 0; } +static DEFINE_SPINLOCK(balancing); + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + +/* + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in init_sched_domains. + */ +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +{ + int continue_balancing = 1; + int cpu = rq->cpu; + unsigned long interval; + struct sched_domain *sd; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; + int need_serialize, need_decay = 0; + u64 max_cost = 0; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + /* + * Decay the newidle max times here because this is a regular + * visit to all the domains. Decay ~1% per second. + */ + if (time_after(jiffies, sd->next_decay_max_lb_cost)) { + sd->max_newidle_lb_cost = + (sd->max_newidle_lb_cost * 253) / 256; + sd->next_decay_max_lb_cost = jiffies + HZ; + need_decay = 1; + } + max_cost += sd->max_newidle_lb_cost; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!continue_balancing) { + if (need_decay) + continue; + break; + } + + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + + need_serialize = sd->flags & SD_SERIALIZE; + if (need_serialize) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + /* + * The LBF_DST_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. + */ + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + } + sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + } + if (need_serialize) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) { + next_balance = sd->last_balance + interval; + update_next_balance = 1; + } + } + if (need_decay) { + /* + * Ensure the rq-wide value also decays but keep it at a + * reasonable floor to avoid funnies with rq->avg_idle. + */ + rq->max_idle_balance_cost = + max((u64)sysctl_sched_migration_cost, max_cost); + } + rcu_read_unlock(); + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) { + rq->next_balance = next_balance; + +#ifdef CONFIG_NO_HZ_COMMON + /* + * If this CPU has been elected to perform the nohz idle + * balance. Other idle CPUs have already rebalanced with + * nohz_idle_balance() and nohz.next_balance has been + * updated accordingly. This CPU is now running the idle load + * balance for itself and we need to update the + * nohz.next_balance accordingly. + */ + if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) + nohz.next_balance = rq->next_balance; +#endif + } +} + static inline int on_null_domain(struct rq *rq) { return unlikely(!rcu_dereference_sched(rq->sd)); @@ -9373,124 +9491,6 @@ out: static inline void nohz_balancer_kick(struct rq *rq) { } #endif -static DEFINE_SPINLOCK(balancing); - -/* - * Scale the max load_balance interval with the number of CPUs in the system. - * This trades load-balance latency on larger machines for less cross talk. - */ -void update_max_interval(void) -{ - max_load_balance_interval = HZ*num_online_cpus()/10; -} - -/* - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in init_sched_domains. - */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) -{ - int continue_balancing = 1; - int cpu = rq->cpu; - unsigned long interval; - struct sched_domain *sd; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - int need_serialize, need_decay = 0; - u64 max_cost = 0; - - rcu_read_lock(); - for_each_domain(cpu, sd) { - /* - * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. - */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } - max_cost += sd->max_newidle_lb_cost; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. - */ - if (!continue_balancing) { - if (need_decay) - continue; - break; - } - - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (!spin_trylock(&balancing)) - goto out; - } - - if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { - /* - * The LBF_DST_PINNED logic could have changed - * env->dst_cpu, so we can't know our idle - * state even if we migrated tasks. Update it. - */ - idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; - } - sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); - } - if (need_serialize) - spin_unlock(&balancing); -out: - if (time_after(next_balance, sd->last_balance + interval)) { - next_balance = sd->last_balance + interval; - update_next_balance = 1; - } - } - if (need_decay) { - /* - * Ensure the rq-wide value also decays but keep it at a - * reasonable floor to avoid funnies with rq->avg_idle. - */ - rq->max_idle_balance_cost = - max((u64)sysctl_sched_migration_cost, max_cost); - } - rcu_read_unlock(); - - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) { - rq->next_balance = next_balance; - -#ifdef CONFIG_NO_HZ_COMMON - /* - * If this CPU has been elected to perform the nohz idle - * balance. Other idle CPUs have already rebalanced with - * nohz_idle_balance() and nohz.next_balance has been - * updated accordingly. This CPU is now running the idle load - * balance for itself and we need to update the - * nohz.next_balance accordingly. - */ - if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) - nohz.next_balance = rq->next_balance; -#endif - } -} - #ifdef CONFIG_NO_HZ_COMMON /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the