From e9532e69b8d1d1284e8ecf8d2586de34aec61244 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Mar 2016 15:59:42 +0100 Subject: [PATCH 1/6] sched/cputime: Fix steal time accounting vs. CPU hotplug On CPU hotplug the steal time accounting can keep a stale rq->prev_steal_time value over CPU down and up. So after the CPU comes up again the delta calculation in steal_account_process_tick() wreckages itself due to the unsigned math: u64 steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; So if steal is smaller than rq->prev_steal_time we end up with an insane large value which then gets added to rq->prev_steal_time, resulting in a permanent wreckage of the accounting. As a consequence the per CPU stats in /proc/stat become stale. Nice trick to tell the world how idle the system is (100%) while the CPU is 100% busy running tasks. Though we prefer realistic numbers. None of the accounting values which use a previous value to account for fractions is reset at CPU hotplug time. update_rq_clock_task() has a sanity check for prev_irq_time and prev_steal_time_rq, but that sanity check solely deals with clock warps and limits the /proc/stat visible wreckage. The prev_time values are still wrong. Solution is simple: Reset rq->prev_*_time when the CPU is plugged in again. Signed-off-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Cc: Frederic Weisbecker Cc: Glauber Costa Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: commit 095c0aa83e52 "sched: adjust scheduler cpu power for stolen time" Fixes: commit aa483808516c "sched: Remove irq time from available CPU power" Fixes: commit e6e6685accfa "KVM guest: Steal time accounting" Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603041539490.3686@nanos Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/sched.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ab814bf100e1..406182af99ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5627,6 +5627,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: rq->calc_load_update = calc_load_update; + account_reset_rq(rq); break; case CPU_ONLINE: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 30ea2d871ba7..4f6598ae4c31 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1738,3 +1738,16 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +} From 2f5177f0fd7e531b26d54633be62d1d4cb94621c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Mar 2016 16:22:45 +0100 Subject: [PATCH 2/6] sched/cgroup: Fix/cleanup cgroup teardown/init The CPU controller hasn't kept up with the various changes in the whole cgroup initialization / destruction sequence, and commit: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") caused it to explode. The reason for this is that zombies do not inhibit css_offline() from being called, but do stall css_released(). Now we tear down the cfs_rq structures on css_offline() but zombies can run after that, leading to use-after-free issues. The solution is to move the tear-down to css_released(), which guarantees nobody (including no zombies) is still using our cgroup. Furthermore, a few simple cleanups are possible too. There doesn't appear to be any point to us using css_online() (anymore?) so fold that in css_alloc(). And since cgroup code guarantees an RCU grace period between css_released() and css_free() we can forgo using call_rcu() and free the stuff immediately. Suggested-by: Tejun Heo Reported-by: Kazuki Yamaguchi Reported-by: Niklas Cassel Tested-by: Niklas Cassel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") Link: http://lkml.kernel.org/r/20160316152245.GY6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d872e19244b..2a87bdde8d4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7536,7 +7536,7 @@ void set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); -static void free_sched_group(struct task_group *tg) +static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); @@ -7562,7 +7562,7 @@ struct task_group *sched_create_group(struct task_group *parent) return tg; err: - free_sched_group(tg); + sched_free_group(tg); return ERR_PTR(-ENOMEM); } @@ -7582,17 +7582,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) +static void sched_free_group_rcu(struct rcu_head *rhp) { /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); + sched_free_group(container_of(rhp, struct task_group, rcu)); } -/* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); + call_rcu(&tg->rcu, sched_free_group_rcu); } void sched_offline_group(struct task_group *tg) @@ -8051,31 +8050,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - if (parent) - sched_online_group(tg, parent); - return 0; + sched_offline_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_destroy_group(tg); -} - -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); + /* + * Relies on the RCU grace period between css_released() and this. + */ + sched_free_group(tg); } static void cpu_cgroup_fork(struct task_struct *task) @@ -8435,9 +8429,8 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, - .css_online = cpu_cgroup_css_online, - .css_offline = cpu_cgroup_css_offline, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, From 3a47d5124a957358274e9ca7b115b2f3a914f56d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Mar 2016 13:04:03 +0100 Subject: [PATCH 3/6] sched/fair: Fix fairness issue on migration Pavan reported that in the presence of very light tasks (or cgroups) the placement of migrated tasks can cause severe fairness issues. The problem is that enqueue_entity() places the task before it updates time, thereby it can place the task far in the past (remember that light tasks will shoot virtual time forward at a high speed, so in relation to the pre-existing light task, we can land far in the past). This is done because update_curr() needs the current task, and we might be placing the current task. The obvious solution is to differentiate between the current and any other task; placing the current before we update time, and placing any other task after, such that !curr tasks end up at the current moment in time, and not in the past. Reported-by: Pavan Kondeti Tested-by: Pavan Kondeti Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: byungchul.park@lge.com Link: http://lkml.kernel.org/r/20160309120403.GK6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33130529e9b5..3c114d971d84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3157,17 +3157,25 @@ static inline void check_schedstat_required(void) static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - /* - * Update the normalized vruntime before updating min_vruntime - * through calling update_curr(). - */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) - se->vruntime += cfs_rq->min_vruntime; + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); + bool curr = cfs_rq->curr == se; /* - * Update run-time statistics of the 'current'. + * If we're the current task, we must renormalise before calling + * update_curr(). */ + if (renorm && curr) + se->vruntime += cfs_rq->min_vruntime; + update_curr(cfs_rq); + + /* + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. + */ + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3183,7 +3191,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); } - if (se != cfs_rq->curr) + if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; From d4335581dc30ec6545999c7443bb9fead274a980 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 9 Mar 2016 14:59:08 +0000 Subject: [PATCH 4/6] sched/fair: Add comments to explain select_idle_sibling() It's not entirely obvious how the main loop in select_idle_sibling() works on first glance. Sprinkle a few comments to explain the design and intention behind the loop based on some conversations with Mike and Peter. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457535548-15329-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c114d971d84..303d6392b389 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5055,7 +5055,19 @@ static int select_idle_sibling(struct task_struct *p, int target) return i; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, iterate the domains and find an eligible idle cpu. + * + * A completely idle sched group at higher domains is more + * desirable than an idle group at a lower level, because lower + * domains have smaller groups and usually share hardware + * resources which causes tasks to contend on them, e.g. x86 + * hyperthread siblings in the lowest domain (SMT) can contend + * on the shared cpu pipeline. + * + * However, while we prefer idle groups at higher domains + * finding an idle cpu at the lowest domain is still better than + * returning 'target', which we've already established, isn't + * idle. */ sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { @@ -5065,11 +5077,16 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; + /* Ensure the entire group is idle */ for_each_cpu(i, sched_group_cpus(sg)) { if (i == target || !idle_cpu(i)) goto next; } + /* + * It doesn't matter which cpu we pick, the + * whole group is idle. + */ target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); goto done; From 1a736b77a3f50910843d076623204ba6e5057dc1 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 21 Dec 2015 19:14:42 +0800 Subject: [PATCH 5/6] sched/cpuacct: Rename parameter in cpuusage_write() for readability The name of the 'reset' parameter to cpuusage_write() is quite confusing, because the only valid value we allow is '0', so !reset is actually the case that resets ... Rename it to 'val' and explain it in a comment that we only allow 0. Signed-off-by: Dongsheng Yang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cgroups@vger.kernel.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1450696483-2864-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd7cbb55bbf2..9c2bbf7efa1a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 reset) + u64 val) { struct cpuacct *ca = css_ca(css); int err = 0; int i; - if (reset) { + /* + * Only allow '0' here to do a reset. + */ + if (val) { err = -EINVAL; goto out; } From 73e6aafd9ea81498d31361f01db84a0118da2d1c Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 17 Mar 2016 12:19:43 +0800 Subject: [PATCH 6/6] sched/cpuacct: Simplify the cpuacct code - Use for() instead of while() loop in some functions to make the code simpler. - Use this_cpu_ptr() instead of per_cpu_ptr() to make the code cleaner and a bit faster. Suggested-by: Peter Zijlstra Signed-off-by: Zhao Lei Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d8a7ef9592f55224630cb26dea239f05b6398a4e.1458187654.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 28 +++++----------------------- kernel/sched/cpuacct.h | 4 ++-- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9c2bbf7efa1a..434c2fa41352 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -238,23 +238,10 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int cpu; - - cpu = task_cpu(tsk); rcu_read_lock(); - - ca = task_ca(tsk); - - while (true) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - - ca = parent_ca(ca); - if (!ca) - break; - } - + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) + *this_cpu_ptr(ca->cpuusage) += cputime; rcu_read_unlock(); } @@ -263,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) * * Note: it's the caller that updates the account of the root cgroup. */ -void cpuacct_account_field(struct task_struct *p, int index, u64 val) +void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { - struct kernel_cpustat *kcpustat; struct cpuacct *ca; rcu_read_lock(); - ca = task_ca(p); - while (ca != &root_cpuacct) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += val; - ca = parent_ca(ca); - } + for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) + this_cpu_ptr(ca->cpustat)->cpustat[index] += val; rcu_read_unlock(); } diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index ed605624a5e7..ba72807c73d4 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,7 +1,7 @@ #ifdef CONFIG_CGROUP_CPUACCT extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); +extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else @@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) } static inline void -cpuacct_account_field(struct task_struct *p, int index, u64 val) +cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { }