diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f440221b70..c34a718e20dd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -788,9 +788,28 @@ enum cpu_idle_type { }; /* - * Increase resolution of nice-level calculations: + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. */ -#define SCHED_LOAD_SHIFT 10 +#if BITS_PER_LONG > 32 +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) /* diff --git a/kernel/sched.c b/kernel/sched.c index 375e9c677d58..bb504e1839e5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -293,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock); * limitation from this.) */ #define MIN_SHARES 2 -#define MAX_SHARES (1UL << 18) +#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif @@ -1330,13 +1330,25 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - tmp = (u64)delta_exec * weight; + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. + */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; if (!lw->inv_weight) { - if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; else - lw->inv_weight = WMULT_CONST / lw->weight; + lw->inv_weight = WMULT_CONST / w; } /* @@ -1785,12 +1797,12 @@ static void set_load_weight(struct task_struct *p) * SCHED_IDLE tasks get minimal weight: */ if (p->policy == SCHED_IDLE) { - load->weight = WEIGHT_IDLEPRIO; + load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; } - load->weight = prio_to_weight[prio]; + load->weight = scale_load(prio_to_weight[prio]); load->inv_weight = prio_to_wmult[prio]; } @@ -8809,14 +8821,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), shareval); + return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); } static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) { struct task_group *tg = cgroup_tg(cgrp); - return (u64) tg->shares; + return (u64) scale_load_down(tg->shares); } #endif /* CONFIG_FAIR_GROUP_SCHED */