diff --git a/kernel/sched.c b/kernel/sched.c index 34bf8e6db9af..a2d55144bd9c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -256,6 +256,7 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchal_quota; + u64 runtime_expires; int idle, timer_active; struct hrtimer period_timer; @@ -396,6 +397,7 @@ struct cfs_rq { #endif #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; + u64 runtime_expires; s64 runtime_remaining; #endif #endif @@ -9166,8 +9168,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; - cfs_b->runtime = quota; + __refill_cfs_bandwidth_runtime(cfs_b); /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index af73a8a85eef..9d1adbd0b615 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1272,11 +1272,30 @@ static inline u64 sched_cfs_bandwidth_slice(void) return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; } +/* + * Replenish runtime according to assigned quota and update expiration time. + * We use sched_clock_cpu directly instead of rq->clock to avoid adding + * additional synchronization around rq->lock. + * + * requires cfs_b->lock + */ +static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) +{ + u64 now; + + if (cfs_b->quota == RUNTIME_INF) + return; + + now = sched_clock_cpu(smp_processor_id()); + cfs_b->runtime = cfs_b->quota; + cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +} + static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 amount = 0, min_amount, expires; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -1285,9 +1304,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { - /* ensure bandwidth timer remains active under consumption */ - if (!cfs_b->timer_active) + /* + * If the bandwidth pool has become inactive, then at least one + * period must have elapsed since the last consumption. + * Refresh the global state and ensure bandwidth timer becomes + * active. + */ + if (!cfs_b->timer_active) { + __refill_cfs_bandwidth_runtime(cfs_b); __start_cfs_bandwidth(cfs_b); + } if (cfs_b->runtime > 0) { amount = min(cfs_b->runtime, min_amount); @@ -1295,19 +1321,61 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; + /* + * we may have advanced our local expiration to account for allowed + * spread between our sched_clock and the one on which runtime was + * issued. + */ + if ((s64)(expires - cfs_rq->runtime_expires) > 0) + cfs_rq->runtime_expires = expires; +} + +/* + * Note: This depends on the synchronization provided by sched_clock and the + * fact that rq->clock snapshots this value. + */ +static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct rq *rq = rq_of(cfs_rq); + + /* if the deadline is ahead of our clock, nothing to do */ + if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + return; + + if (cfs_rq->runtime_remaining < 0) + return; + + /* + * If the local deadline has passed we have to consider the + * possibility that our sched_clock is 'fast' and the global deadline + * has not truly expired. + * + * Fortunately we can check determine whether this the case by checking + * whether the global deadline has advanced. + */ + + if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + /* extend local deadline, drift is bounded above by 2 ticks */ + cfs_rq->runtime_expires += TICK_NSEC; + } else { + /* global deadline is ahead, expiration has passed */ + cfs_rq->runtime_remaining = 0; + } } static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_rq->runtime_enabled) - return; - + /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - if (cfs_rq->runtime_remaining > 0) + expire_cfs_rq_runtime(cfs_rq); + + if (likely(cfs_rq->runtime_remaining > 0)) return; assign_cfs_rq_runtime(cfs_rq); @@ -1338,7 +1406,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) goto out_unlock; idle = cfs_b->idle; - cfs_b->runtime = cfs_b->quota; + /* if we're going inactive then everything else can be deferred */ + if (idle) + goto out_unlock; + + __refill_cfs_bandwidth_runtime(cfs_b); + /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; @@ -1557,7 +1630,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; } - #else static inline unsigned long effective_load(struct task_group *tg, int cpu,