diff --git a/kernel/sched.c b/kernel/sched.c index 8ec1e7ac2894..5db05f6fb470 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -402,7 +402,7 @@ struct cfs_rq { u64 runtime_expires; s64 runtime_remaining; - int throttled; + int throttled, throttle_count; struct list_head throttled_list; #endif #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 76411950ff3b..5a2089492a98 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -706,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED +/* we need this in update_cfs_load and load-balance functions below */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) @@ -728,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) u64 now, delta; unsigned long load = cfs_rq->load.weight; - if (cfs_rq->tg == &root_task_group) + if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) return; now = rq_of(cfs_rq)->clock_task; @@ -837,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) tg = cfs_rq->tg; se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se) + if (!se || throttled_hierarchy(cfs_rq)) return; #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) @@ -1403,6 +1405,65 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_rq->throttled; } +/* check whether cfs_rq, or any parent, is throttled */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return cfs_rq->throttle_count; +} + +/* + * Ensure that neither of the group entities corresponding to src_cpu or + * dest_cpu are members of a throttled hierarchy when performing group + * load-balance operations. + */ +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + + src_cfs_rq = tg->cfs_rq[src_cpu]; + dest_cfs_rq = tg->cfs_rq[dest_cpu]; + + return throttled_hierarchy(src_cfs_rq) || + throttled_hierarchy(dest_cfs_rq); +} + +/* updated child weight may affect parent so we have to do this bottom up */ +static int tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; +#ifdef CONFIG_SMP + if (!cfs_rq->throttle_count) { + u64 delta = rq->clock_task - cfs_rq->load_stamp; + + /* leaving throttled state, advance shares averaging windows */ + cfs_rq->load_stamp += delta; + cfs_rq->load_last += delta; + + /* update entity weight now that we are on_rq again */ + update_cfs_shares(cfs_rq); + } +#endif + + return 0; +} + +static int tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + /* group is entering throttled state, record last load */ + if (!cfs_rq->throttle_count) + update_cfs_load(cfs_rq, 0); + cfs_rq->throttle_count++; + + return 0; +} + static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -1413,7 +1474,9 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq) se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; /* account load preceding throttle */ - update_cfs_load(cfs_rq, 0); + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { @@ -1454,6 +1517,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); + update_rq_clock(rq); + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + if (!cfs_rq->load.weight) return; @@ -1598,6 +1665,17 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; } + +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + return 0; +} #endif /************************************************** @@ -2493,6 +2571,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, for_each_leaf_cfs_rq(busiest, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), + busiest->cpu, this_cpu)) + break; if (!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) @@ -2608,8 +2689,13 @@ static void update_shares(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) + for_each_leaf_cfs_rq(rq, cfs_rq) { + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + update_shares_cpu(cfs_rq->tg, cpu); + } rcu_read_unlock(); } @@ -2659,9 +2745,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, u64 rem_load, moved_load; /* - * empty group + * empty group or part of a throttled hierarchy */ - if (!busiest_cfs_rq->task_weight) + if (!busiest_cfs_rq->task_weight || + throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight;