diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 471193bdd4b7..7e13dd0fd4b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5936,11 +5936,10 @@ struct sg_lb_stats { unsigned long group_capacity; unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_has_free_capacity; + int group_no_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6156,28 +6155,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } /* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true is the capacity is reduced */ static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { - /* - * Only siblings can have significantly less than SCHED_CAPACITY_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUCAPACITY)) - return 0; - - /* - * If ~90% of the cpu_capacity is still there, we're good. - */ - if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) - return 1; - - return 0; + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); } /* @@ -6215,37 +6201,56 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity factor. - * - * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit unit capacity with that. + * group_has_capacity returns true if the group has spare capacity that could + * be used by some tasks. + * We consider that a group has spare capacity if the * number of task is + * smaller than the number of CPUs or if the usage is lower than the available + * capacity for CFS tasks. + * For the latter, we use a threshold to stabilize the state, to take into + * account the variance of the tasks' load and to return true if the available + * capacity in meaningful for the load balancer. + * As an example, an available capacity of 1% can appear but it doesn't make + * any benefit for the load balance. */ -static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +static inline bool +group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - unsigned int capacity_factor, smt, cpus; - unsigned int capacity, capacity_orig; + if (sgs->sum_nr_running < sgs->group_weight) + return true; - capacity = group->sgc->capacity; - capacity_orig = group->sgc->capacity_orig; - cpus = group->group_weight; + if ((sgs->group_capacity * 100) > + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); - capacity_factor = cpus / smt; /* cores */ - - capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); - - return capacity_factor; + return false; } -static enum group_type -group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +/* + * group_is_overloaded returns true if the group has more tasks than it can + * handle. + * group_is_overloaded is not equals to !group_has_capacity because a group + * with the exact right number of tasks, has no more spare capacity but is not + * overloaded so both group_has_capacity and group_is_overloaded return + * false. + */ +static inline bool +group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->sum_nr_running <= sgs->group_weight) + return false; + + if ((sgs->group_capacity * 100) < + (sgs->group_usage * env->sd->imbalance_pct)) + return true; + + return false; +} + +static enum group_type group_classify(struct lb_env *env, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_no_capacity) return group_overloaded; if (sg_imbalanced(group)) @@ -6306,11 +6311,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - sgs->group_capacity_factor = sg_capacity_factor(env, group); - sgs->group_type = group_classify(group, sgs); - if (sgs->group_capacity_factor > sgs->sum_nr_running) - sgs->group_has_free_capacity = 1; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(env, group, sgs); } /** @@ -6432,18 +6435,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity factor to one so that we'll try + * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity_factor. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). + * these excess tasks. The extra check prevents the case where + * you always pull from the heaviest group when it is already + * under-utilized (possible with a large weight task outweighs + * the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) { - sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); - sgs->group_type = group_classify(sg, sgs); + group_has_capacity(env, &sds->local_stat) && + (sgs->sum_nr_running > 1)) { + sgs->group_no_capacity = 1; + sgs->group_type = group_overloaded; } if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -6623,11 +6627,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity_factor); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); - load_above_capacity /= busiest->group_capacity; + load_above_capacity = busiest->sum_nr_running * + SCHED_LOAD_SCALE; + if (load_above_capacity > busiest->group_capacity) + load_above_capacity -= busiest->group_capacity; + else + load_above_capacity = ~0UL; } /* @@ -6690,6 +6695,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; + /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; @@ -6710,8 +6716,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && - !busiest->group_has_free_capacity) + if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) goto force_balance; /* @@ -6770,7 +6776,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long capacity, capacity_factor, wl; + unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6799,9 +6805,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6809,7 +6812,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ - if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) + + if (rq->nr_running == 1 && wl > env->imbalance && + !check_cpu_capacity(rq, env->sd)) continue; /*