From 3b9c08ae3dd44201b3a188aef34d6ddf73434015 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 4 Jul 2017 11:53:40 +0200
Subject: [PATCH 1/8] Revert "sched/cputime: Refactor the cputime_adjust()
 code"

This reverts commit 72298e5c92c50edd8cb7cfda4519483ce65fa166.

As Peter explains:

> Argh, no... That code was perfectly fine. The new code otoh is
> convoluted.
>
> The old code had the following form:
>
>         if (exception1)
>           deal with exception1
>
>         if (execption2)
>           deal with exception2
>
>         do normal stuff
>
> Which is as simple and straight forward as it gets.
>
> The new code otoh reads like:
>
>         if (!exception1) {
>                 if (exception2)
>                   deal with exception 2
>                 else
>                   do normal stuff
>         }

So restore the old form.

Also fix the comment describing the logic, as it was confusing.

Requested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Gustavo A. R. Silva <garsilva@embeddedor.com>
Cc: Frans Klaver <fransklaver@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 67c70e287647..84a419bdf5aa 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
 	utime = curr->utime;
 
 	/*
-	 * If either stime or both stime and utime are 0, assume all runtime is
-	 * userspace. Once a task gets some ticks, the monotonicy code at
-	 * 'update' will ensure things converge to the observed ratio.
+	 * If either stime or utime are 0, assume all runtime is userspace.
+	 * Once a task gets some ticks, the monotonicy code at 'update:'
+	 * will ensure things converge to the observed ratio.
 	 */
-	if (stime != 0) {
-		if (utime == 0)
-			stime = rtime;
-		else
-			stime = scale_stime(stime, rtime, stime + utime);
+	if (stime == 0) {
+		utime = rtime;
+		goto update;
 	}
 
+	if (utime == 0) {
+		stime = rtime;
+		goto update;
+	}
+
+	stime = scale_stime(stime, rtime, stime + utime);
+
+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.

From 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 29 Jun 2017 19:15:07 +0200
Subject: [PATCH 2/8] vtime, sched/cputime: Remove vtime_account_user()

It's an unnecessary function between vtime_user_exit() and
account_user_time().

Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/vtime.h  |  9 +--------
 kernel/sched/cputime.c | 18 +++++++++---------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 0681fe25abeb..18b405e3cd93 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -67,19 +67,12 @@ static inline void vtime_account_system(struct task_struct *tsk) { }
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_account_user(struct task_struct *tsk);
 extern void vtime_user_enter(struct task_struct *tsk);
-
-static inline void vtime_user_exit(struct task_struct *tsk)
-{
-	vtime_account_user(tsk);
-}
-
+extern void vtime_user_exit(struct task_struct *tsk);
 extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
-static inline void vtime_account_user(struct task_struct *tsk) { }
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 84a419bdf5aa..5adc896d0f64 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -724,15 +724,6 @@ void vtime_account_system(struct task_struct *tsk)
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
-void vtime_account_user(struct task_struct *tsk)
-{
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	tsk->vtime_snap_whence = VTIME_SYS;
-	if (vtime_delta(tsk))
-		account_user_time(tsk, get_vtime_delta(tsk));
-	write_seqcount_end(&tsk->vtime_seqcount);
-}
-
 void vtime_user_enter(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
@@ -742,6 +733,15 @@ void vtime_user_enter(struct task_struct *tsk)
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
+void vtime_user_exit(struct task_struct *tsk)
+{
+	write_seqcount_begin(&tsk->vtime_seqcount);
+	tsk->vtime_snap_whence = VTIME_SYS;
+	if (vtime_delta(tsk))
+		account_user_time(tsk, get_vtime_delta(tsk));
+	write_seqcount_end(&tsk->vtime_seqcount);
+}
+
 void vtime_guest_enter(struct task_struct *tsk)
 {
 	/*

From 9fa57cf5a5c4aed1e45879b335fe433048709327 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 29 Jun 2017 19:15:08 +0200
Subject: [PATCH 3/8] sched/cputime: Always set tsk->vtime_snap_whence after
 accounting vtime

Even though it doesn't have functional consequences, setting
the task's new context state after we actually accounted the pending
vtime from the old context state makes more sense from a review
perspective.

vtime_user_exit() is the only function that doesn't follow that rule
and that can bug the reviewer for a little while until he realizes there
is no reason for this special case.

Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5adc896d0f64..ab68927e8e94 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -736,9 +736,9 @@ void vtime_user_enter(struct task_struct *tsk)
 void vtime_user_exit(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	tsk->vtime_snap_whence = VTIME_SYS;
 	if (vtime_delta(tsk))
 		account_user_time(tsk, get_vtime_delta(tsk));
+	tsk->vtime_snap_whence = VTIME_SYS;
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 

From 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 29 Jun 2017 19:15:09 +0200
Subject: [PATCH 4/8] sched/cputime: Rename vtime fields

The current "snapshot" based naming on vtime fields suggests we record
some past event but that's a low level picture of their actual purpose
which comes out blurry. The real point of these fields is to run a basic
state machine that tracks down cputime entry while switching between
contexts.

So lets reflect that with more meaningful names.

Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h |  4 ++--
 include/linux/sched.h     |  4 ++--
 kernel/fork.c             |  4 ++--
 kernel/sched/cputime.c    | 30 +++++++++++++++---------------
 4 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e049526bc188..3d537331cd4e 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -171,8 +171,8 @@ extern struct cred init_cred;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)						\
 	.vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),	\
-	.vtime_snap = 0,				\
-	.vtime_snap_whence = VTIME_SYS,
+	.vtime_starttime = 0,				\
+	.vtime_state = VTIME_SYS,
 #else
 # define INIT_VTIME(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c4ca7433d9d..ff001646549e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -689,7 +689,7 @@ struct task_struct {
 	struct prev_cputime		prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqcount_t			vtime_seqcount;
-	unsigned long long		vtime_snap;
+	unsigned long long		vtime_starttime;
 	enum {
 		/* Task is sleeping or running in a CPU with VTIME inactive: */
 		VTIME_INACTIVE = 0,
@@ -697,7 +697,7 @@ struct task_struct {
 		VTIME_USER,
 		/* Task runs in kernelspace in a CPU with VTIME active: */
 		VTIME_SYS,
-	} vtime_snap_whence;
+	} vtime_state;
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..83c4f9bf3e14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1638,8 +1638,8 @@ static __latent_entropy struct task_struct *copy_process(
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqcount_init(&p->vtime_seqcount);
-	p->vtime_snap = 0;
-	p->vtime_snap_whence = VTIME_INACTIVE;
+	p->vtime_starttime = 0;
+	p->vtime_state = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ab68927e8e94..8c64753067c5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -683,10 +683,10 @@ static u64 vtime_delta(struct task_struct *tsk)
 {
 	unsigned long now = READ_ONCE(jiffies);
 
-	if (time_before(now, (unsigned long)tsk->vtime_snap))
+	if (time_before(now, (unsigned long)tsk->vtime_starttime))
 		return 0;
 
-	return jiffies_to_nsecs(now - tsk->vtime_snap);
+	return jiffies_to_nsecs(now - tsk->vtime_starttime);
 }
 
 static u64 get_vtime_delta(struct task_struct *tsk)
@@ -701,10 +701,10 @@ static u64 get_vtime_delta(struct task_struct *tsk)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - tsk->vtime_snap);
+	delta = jiffies_to_nsecs(now - tsk->vtime_starttime);
 	other = account_other_time(delta);
-	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-	tsk->vtime_snap = now;
+	WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE);
+	tsk->vtime_starttime = now;
 
 	return delta - other;
 }
@@ -746,7 +746,7 @@ void vtime_guest_enter(struct task_struct *tsk)
 {
 	/*
 	 * The flags must be updated under the lock with
-	 * the vtime_snap flush and update.
+	 * the vtime_starttime flush and update.
 	 * That enforces a right ordering and update sequence
 	 * synchronization against the reader (task_gtime())
 	 * that can thus safely catch up with a tickless delta.
@@ -776,12 +776,12 @@ void vtime_account_idle(struct task_struct *tsk)
 void arch_vtime_task_switch(struct task_struct *prev)
 {
 	write_seqcount_begin(&prev->vtime_seqcount);
-	prev->vtime_snap_whence = VTIME_INACTIVE;
+	prev->vtime_state = VTIME_INACTIVE;
 	write_seqcount_end(&prev->vtime_seqcount);
 
 	write_seqcount_begin(&current->vtime_seqcount);
-	current->vtime_snap_whence = VTIME_SYS;
-	current->vtime_snap = jiffies;
+	current->vtime_state = VTIME_SYS;
+	current->vtime_starttime = jiffies;
 	write_seqcount_end(&current->vtime_seqcount);
 }
 
@@ -791,8 +791,8 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 
 	local_irq_save(flags);
 	write_seqcount_begin(&t->vtime_seqcount);
-	t->vtime_snap_whence = VTIME_SYS;
-	t->vtime_snap = jiffies;
+	t->vtime_state = VTIME_SYS;
+	t->vtime_starttime = jiffies;
 	write_seqcount_end(&t->vtime_seqcount);
 	local_irq_restore(flags);
 }
@@ -809,7 +809,7 @@ u64 task_gtime(struct task_struct *t)
 		seq = read_seqcount_begin(&t->vtime_seqcount);
 
 		gtime = t->gtime;
-		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
+		if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU)
 			gtime += vtime_delta(t);
 
 	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
@@ -840,7 +840,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		*stime = t->stime;
 
 		/* Task is sleeping, nothing to add */
-		if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+		if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t))
 			continue;
 
 		delta = vtime_delta(t);
@@ -849,9 +849,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		 * Task runs either in user or kernel space, add pending nohz time to
 		 * the right place.
 		 */
-		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
+		if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU)
 			*utime += delta;
-		else if (t->vtime_snap_whence == VTIME_SYS)
+		else if (t->vtime_state == VTIME_SYS)
 			*stime += delta;
 	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
 }

From bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 29 Jun 2017 19:15:10 +0200
Subject: [PATCH 5/8] sched/cputime: Move the vtime task fields to their own
 struct

We are about to add vtime accumulation fields to the task struct. Let's
avoid more bloatification and gather vtime information to their own
struct.

Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-5-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h |   6 +-
 include/linux/sched.h     |  26 +++++----
 kernel/fork.c             |   6 +-
 kernel/sched/cputime.c    | 112 ++++++++++++++++++++++----------------
 4 files changed, 86 insertions(+), 64 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3d537331cd4e..a2f6707e9fc0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -170,9 +170,9 @@ extern struct cred init_cred;
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)						\
-	.vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),	\
-	.vtime_starttime = 0,				\
-	.vtime_state = VTIME_SYS,
+	.vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount),		\
+	.vtime.starttime = 0,						\
+	.vtime.state = VTIME_SYS,
 #else
 # define INIT_VTIME(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ff001646549e..eeff8a024f0c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,6 +223,21 @@ struct task_cputime {
 #define prof_exp			stime
 #define sched_exp			sum_exec_runtime
 
+enum vtime_state {
+	/* Task is sleeping or running in a CPU with VTIME inactive: */
+	VTIME_INACTIVE = 0,
+	/* Task runs in userspace in a CPU with VTIME active: */
+	VTIME_USER,
+	/* Task runs in kernelspace in a CPU with VTIME active: */
+	VTIME_SYS,
+};
+
+struct vtime {
+	seqcount_t		seqcount;
+	unsigned long long	starttime;
+	enum vtime_state	state;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -688,16 +703,7 @@ struct task_struct {
 	u64				gtime;
 	struct prev_cputime		prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_t			vtime_seqcount;
-	unsigned long long		vtime_starttime;
-	enum {
-		/* Task is sleeping or running in a CPU with VTIME inactive: */
-		VTIME_INACTIVE = 0,
-		/* Task runs in userspace in a CPU with VTIME active: */
-		VTIME_USER,
-		/* Task runs in kernelspace in a CPU with VTIME active: */
-		VTIME_SYS,
-	} vtime_state;
+	struct vtime			vtime;
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/fork.c b/kernel/fork.c
index 83c4f9bf3e14..d927ec11aa7a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process(
 	prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_init(&p->vtime_seqcount);
-	p->vtime_starttime = 0;
-	p->vtime_state = VTIME_INACTIVE;
+	seqcount_init(&p->vtime.seqcount);
+	p->vtime.starttime = 0;
+	p->vtime.state = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8c64753067c5..9ee725edcbe0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -679,17 +679,17 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
 {
 	unsigned long now = READ_ONCE(jiffies);
 
-	if (time_before(now, (unsigned long)tsk->vtime_starttime))
+	if (time_before(now, (unsigned long)vtime->starttime))
 		return 0;
 
-	return jiffies_to_nsecs(now - tsk->vtime_starttime);
+	return jiffies_to_nsecs(now - vtime->starttime);
 }
 
-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
 {
 	unsigned long now = READ_ONCE(jiffies);
 	u64 delta, other;
@@ -701,49 +701,56 @@ static u64 get_vtime_delta(struct task_struct *tsk)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - tsk->vtime_starttime);
+	delta = jiffies_to_nsecs(now - vtime->starttime);
 	other = account_other_time(delta);
-	WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE);
-	tsk->vtime_starttime = now;
+	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+	vtime->starttime = now;
 
 	return delta - other;
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
 {
-	account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+	account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime));
 }
 
 void vtime_account_system(struct task_struct *tsk)
 {
-	if (!vtime_delta(tsk))
+	struct vtime *vtime = &tsk->vtime;
+
+	if (!vtime_delta(vtime))
 		return;
 
-	write_seqcount_begin(&tsk->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
 	__vtime_account_system(tsk);
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	if (vtime_delta(vtime))
 		__vtime_account_system(tsk);
-	tsk->vtime_snap_whence = VTIME_USER;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	vtime->state = VTIME_USER;
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_user_exit(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
-		account_user_time(tsk, get_vtime_delta(tsk));
-	tsk->vtime_snap_whence = VTIME_SYS;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	if (vtime_delta(vtime))
+		account_user_time(tsk, get_vtime_delta(vtime));
+	vtime->state = VTIME_SYS;
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+	struct vtime *vtime = &tsk->vtime;
 	/*
 	 * The flags must be updated under the lock with
 	 * the vtime_starttime flush and update.
@@ -751,54 +758,62 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * synchronization against the reader (task_gtime())
 	 * that can thus safely catch up with a tickless delta.
 	 */
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
+	write_seqcount_begin(&vtime->seqcount);
+	if (vtime_delta(vtime))
 		__vtime_account_system(tsk);
 	current->flags |= PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
 	__vtime_account_system(tsk);
 	current->flags &= ~PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
-	account_idle_time(get_vtime_delta(tsk));
+	account_idle_time(get_vtime_delta(&tsk->vtime));
 }
 
 void arch_vtime_task_switch(struct task_struct *prev)
 {
-	write_seqcount_begin(&prev->vtime_seqcount);
-	prev->vtime_state = VTIME_INACTIVE;
-	write_seqcount_end(&prev->vtime_seqcount);
+	struct vtime *vtime = &prev->vtime;
 
-	write_seqcount_begin(&current->vtime_seqcount);
-	current->vtime_state = VTIME_SYS;
-	current->vtime_starttime = jiffies;
-	write_seqcount_end(&current->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_INACTIVE;
+	write_seqcount_end(&vtime->seqcount);
+
+	vtime = &current->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_SYS;
+	vtime->starttime = jiffies;
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_init_idle(struct task_struct *t, int cpu)
 {
+	struct vtime *vtime = &t->vtime;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	write_seqcount_begin(&t->vtime_seqcount);
-	t->vtime_state = VTIME_SYS;
-	t->vtime_starttime = jiffies;
-	write_seqcount_end(&t->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_SYS;
+	vtime->starttime = jiffies;
+	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
 
 u64 task_gtime(struct task_struct *t)
 {
+	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
 	u64 gtime;
 
@@ -806,13 +821,13 @@ u64 task_gtime(struct task_struct *t)
 		return t->gtime;
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seqcount);
+		seq = read_seqcount_begin(&vtime->seqcount);
 
 		gtime = t->gtime;
-		if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU)
-			gtime += vtime_delta(t);
+		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+			gtime += vtime_delta(vtime);
 
-	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
 
 	return gtime;
 }
@@ -824,8 +839,9 @@ u64 task_gtime(struct task_struct *t)
  */
 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 {
-	u64 delta;
+	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
+	u64 delta;
 
 	if (!vtime_accounting_enabled()) {
 		*utime = t->utime;
@@ -834,25 +850,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 	}
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seqcount);
+		seq = read_seqcount_begin(&vtime->seqcount);
 
 		*utime = t->utime;
 		*stime = t->stime;
 
 		/* Task is sleeping, nothing to add */
-		if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t))
+		if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
 			continue;
 
-		delta = vtime_delta(t);
+		delta = vtime_delta(vtime);
 
 		/*
 		 * Task runs either in user or kernel space, add pending nohz time to
 		 * the right place.
 		 */
-		if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU)
+		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
 			*utime += delta;
-		else if (t->vtime_state == VTIME_SYS)
+		else if (vtime->state == VTIME_SYS)
 			*stime += delta;
-	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

From 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 29 Jun 2017 19:15:11 +0200
Subject: [PATCH 6/8] sched/cputime: Accumulate vtime on top of nsec
 clocksource

Currently the cputime source used by vtime is jiffies. When we cross
a context boundary and jiffies have changed since the last snapshot, the
pending cputime is accounted to the switching out context.

This system works ok if the ticks are not aligned across CPUs. If they
instead are aligned (ie: all fire at the same time) and the CPUs run in
userspace, the jiffies change is only observed on tick exit and therefore
the user cputime is accounted as system cputime. This is because the
CPU that maintains timekeeping fires its tick at the same time as the
others. It updates jiffies in the middle of the tick and the other CPUs
see that update on IRQ exit:

    CPU 0 (timekeeper)                  CPU 1
    -------------------              -------------
                      jiffies = N
    ...                              run in userspace for a jiffy
    tick entry                       tick entry (sees jiffies = N)
    set jiffies = N + 1
    tick exit                        tick exit (sees jiffies = N + 1)
                                                account 1 jiffy as stime

Fix this with using a nanosec clock source instead of jiffies. The
cputime is then accumulated and flushed everytime the pending delta
reaches a jiffy in order to mitigate the accounting overhead.

[ fweisbec: changelog, rebase on struct vtime, field renames, add delta
  on cputime readers, keep idle vtime as-is (low overhead accounting),
  harmonize clock sources. ]

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h  |  3 ++
 kernel/sched/cputime.c | 64 +++++++++++++++++++++++++++---------------
 2 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eeff8a024f0c..4818126c5153 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -236,6 +236,9 @@ struct vtime {
 	seqcount_t		seqcount;
 	unsigned long long	starttime;
 	enum vtime_state	state;
+	u64			utime;
+	u64			stime;
+	u64			gtime;
 };
 
 struct sched_info {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9ee725edcbe0..6e3ea4ac1bda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 static u64 vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
+	unsigned long long clock;
 
-	if (time_before(now, (unsigned long)vtime->starttime))
+	clock = sched_clock_cpu(smp_processor_id());
+	if (clock < vtime->starttime)
 		return 0;
 
-	return jiffies_to_nsecs(now - vtime->starttime);
+	return clock - vtime->starttime;
 }
 
 static u64 get_vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
-	u64 delta, other;
+	u64 delta = vtime_delta(vtime);
+	u64 other;
 
 	/*
 	 * Unlike tick based timing, vtime based timing never has lost
@@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - vtime->starttime);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
-	vtime->starttime = now;
+	vtime->starttime += delta;
 
 	return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+				   struct vtime *vtime)
 {
-	account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime));
+	vtime->stime += get_vtime_delta(vtime);
+	if (vtime->stime >= TICK_NSEC) {
+		account_system_time(tsk, irq_count(), vtime->stime);
+		vtime->stime = 0;
+	}
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+				struct vtime *vtime)
+{
+	vtime->gtime += get_vtime_delta(vtime);
+	if (vtime->gtime >= TICK_NSEC) {
+		account_guest_time(tsk, vtime->gtime);
+		vtime->gtime = 0;
+	}
 }
 
 void vtime_account_system(struct task_struct *tsk)
@@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk)
 		return;
 
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk);
+	/* We might have scheduled out from guest path */
+	if (current->flags & PF_VCPU)
+		vtime_account_guest(tsk, vtime);
+	else
+		__vtime_account_system(tsk, vtime);
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		__vtime_account_system(tsk);
+	__vtime_account_system(tsk, vtime);
 	vtime->state = VTIME_USER;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		account_user_time(tsk, get_vtime_delta(vtime));
+	vtime->utime += get_vtime_delta(vtime);
+	if (vtime->utime >= TICK_NSEC) {
+		account_user_time(tsk, vtime->utime);
+		vtime->utime = 0;
+	}
 	vtime->state = VTIME_SYS;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * that can thus safely catch up with a tickless delta.
 	 */
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		__vtime_account_system(tsk);
+	__vtime_account_system(tsk, vtime);
 	current->flags |= PF_VCPU;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk);
+	vtime_account_guest(tsk, vtime);
 	current->flags &= ~PF_VCPU;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = jiffies;
+	vtime->starttime = sched_clock_cpu(smp_processor_id());
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	local_irq_save(flags);
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = jiffies;
+	vtime->starttime = sched_clock_cpu(cpu);
 	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
@@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t)
 
 		gtime = t->gtime;
 		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
-			gtime += vtime_delta(vtime);
+			gtime += vtime->gtime + vtime_delta(vtime);
 
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 
@@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		 * the right place.
 		 */
 		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
-			*utime += delta;
+			*utime += vtime->utime + delta;
 		else if (vtime->state == VTIME_SYS)
-			*stime += delta;
+			*stime += vtime->stime + delta;
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

From 65a4433aebe36c8c6abeb69b99ef00274b971c6c Mon Sep 17 00:00:00 2001
From: Jeffrey Hugo <jhugo@codeaurora.org>
Date: Wed, 7 Jun 2017 13:18:57 -0600
Subject: [PATCH 7/8] sched/fair: Fix load_balance() affinity redo path

If load_balance() fails to migrate any tasks because all tasks were
affined, load_balance() removes the source CPU from consideration and
attempts to redo and balance among the new subset of CPUs.

There is a bug in this code path where the algorithm considers all active
CPUs in the system (minus the source that was just masked out).  This is
not valid for two reasons: some active CPUs may not be in the current
scheduling domain and one of the active CPUs is dst_cpu. These CPUs should
not be considered, as we cannot pull load from them.

Instead of failing out of load_balance(), we may end up redoing the search
with no valid CPUs and incorrectly concluding the domain is balanced.
Additionally, if the group_imbalance flag was just set, it may also be
incorrectly unset, thus the flag will not be seen by other CPUs in future
load_balance() runs as that algorithm intends.

Fix the check by removing CPUs not in the current domain and the dst_cpu
from considertation, thus limiting the evaluation to valid remaining CPUs
from which load might be migrated.

Co-authored-by: Austin Christ <austinwc@codeaurora.org>
Co-authored-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Tyler Baicar <tbaicar@codeaurora.org>
Signed-off-by: Jeffrey Hugo <jhugo@codeaurora.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Austin Christ <austinwc@codeaurora.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Timur Tabi <timur@codeaurora.org>
Link: http://lkml.kernel.org/r/1496863138-11322-2-git-send-email-jhugo@codeaurora.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..c95880e216f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		 * our sched_group. We may want to revisit it if we couldn't
 		 * meet load balance goals by pulling other tasks on src_cpu.
 		 *
-		 * Also avoid computing new_dst_cpu if we have already computed
-		 * one in current iteration.
+		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+		 * already computed one in current iteration.
 		 */
-		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
 			return 0;
 
 		/* Prevent to re-select dst_cpu via env's cpus */
@@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
-	/*
-	 * For NEWLY_IDLE load_balancing, we don't need to consider
-	 * other cpus in our group
-	 */
-	if (idle == CPU_NEWLY_IDLE)
-		env.dst_grpmask = NULL;
-
-	cpumask_copy(cpus, cpu_active_mask);
+	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 
 	schedstat_inc(sd->lb_count[idle]);
 
@@ -8151,7 +8144,15 @@ more_balance:
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
-			if (!cpumask_empty(cpus)) {
+			/*
+			 * Attempting to continue load balancing at the current
+			 * sched_domain level only makes sense if there are
+			 * active CPUs remaining as possible busiest CPUs to
+			 * pull load from which are not contained within the
+			 * destination group that is receiving any migrated
+			 * load.
+			 */
+			if (!cpumask_subset(cpus, env.dst_grpmask)) {
 				env.loop = 0;
 				env.loop_break = sched_nr_migrate_break;
 				goto redo;
@@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
 			.src_cpu	= busiest_rq->cpu,
 			.src_rq		= busiest_rq,
 			.idle		= CPU_IDLE,
+			/*
+			 * can_migrate_task() doesn't need to compute new_dst_cpu
+			 * for active balancing. Since we have CPU_IDLE, but no
+			 * @dst_grpmask we need to make that test go away with lying
+			 * about DST_PINNED.
+			 */
+			.flags		= LBF_DST_PINNED,
 		};
 
 		schedstat_inc(sd->alb_count);

From 242fc35290bd8cf0effc6e3474e3a417985de2f3 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Wed, 5 Jul 2017 19:23:28 +0300
Subject: [PATCH 8/8] sched/headers/uapi: Fix linux/sched/types.h userspace
 compilation errors

Consistently use types provided by <linux/types.h> to fix the following
linux/sched/types.h userspace compilation errors:

  /usr/include/linux/sched/types.h:57:2: error: unknown type name 'u32'
    u32 size;
  ...
  u64 sched_period;

Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org # v4.12
Fixes: e2d1e2aec572 ("sched/headers: Move various ABI definitions to <uapi/linux/sched/types.h>")
Link: http://lkml.kernel.org/r/20170705162328.GA11026@altlinux.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/sched/types.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index 307acbc82d80..34b81aa1a2f7 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -54,21 +54,21 @@ struct sched_param {
  * available in the scheduling class file or in Documentation/.
  */
 struct sched_attr {
-	u32 size;
+	__u32 size;
 
-	u32 sched_policy;
-	u64 sched_flags;
+	__u32 sched_policy;
+	__u64 sched_flags;
 
 	/* SCHED_NORMAL, SCHED_BATCH */
-	s32 sched_nice;
+	__s32 sched_nice;
 
 	/* SCHED_FIFO, SCHED_RR */
-	u32 sched_priority;
+	__u32 sched_priority;
 
 	/* SCHED_DEADLINE */
-	u64 sched_runtime;
-	u64 sched_deadline;
-	u64 sched_period;
+	__u64 sched_runtime;
+	__u64 sched_deadline;
+	__u64 sched_period;
 };
 
 #endif /* _UAPI_LINUX_SCHED_TYPES_H */