sched/cputime, s390: Implement delayed accounting of system time

The account_system_time() function is called with a cputime that
occurred while running in the kernel. The function detects which
context the CPU is currently running in and accounts the time to
the correct bucket. This forces the arch code to account the
cputime for hardirq and softirq immediately.

Such an accounting function can be costly, performing unwelcome
divisions and multiplications, among other operations.

The arch code can delay the accounting for system time. For s390
the accounting is done once per timer tick and for each task switch.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
[ Rebase against latest linus tree and move account_system_index_scaled(). ]
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1483636310-6557-10-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Martin Schwidefsky 2017-01-05 18:11:49 +01:00 committed by Ingo Molnar
parent 7dd582305d
commit b7394a5f4c
3 changed files with 123 additions and 77 deletions

View File

@ -85,53 +85,56 @@ struct lowcore {
__u64 mcck_enter_timer; /* 0x02c0 */
__u64 exit_timer; /* 0x02c8 */
__u64 user_timer; /* 0x02d0 */
__u64 system_timer; /* 0x02d8 */
__u64 steal_timer; /* 0x02e0 */
__u64 last_update_timer; /* 0x02e8 */
__u64 last_update_clock; /* 0x02f0 */
__u64 int_clock; /* 0x02f8 */
__u64 mcck_clock; /* 0x0300 */
__u64 clock_comparator; /* 0x0308 */
__u64 guest_timer; /* 0x02d8 */
__u64 system_timer; /* 0x02e0 */
__u64 hardirq_timer; /* 0x02e8 */
__u64 softirq_timer; /* 0x02f0 */
__u64 steal_timer; /* 0x02f8 */
__u64 last_update_timer; /* 0x0300 */
__u64 last_update_clock; /* 0x0308 */
__u64 int_clock; /* 0x0310 */
__u64 mcck_clock; /* 0x0318 */
__u64 clock_comparator; /* 0x0320 */
/* Current process. */
__u64 current_task; /* 0x0310 */
__u8 pad_0x318[0x320-0x318]; /* 0x0318 */
__u64 kernel_stack; /* 0x0320 */
__u64 current_task; /* 0x0328 */
__u8 pad_0x318[0x320-0x318]; /* 0x0330 */
__u64 kernel_stack; /* 0x0338 */
/* Interrupt, panic and restart stack. */
__u64 async_stack; /* 0x0328 */
__u64 panic_stack; /* 0x0330 */
__u64 restart_stack; /* 0x0338 */
__u64 async_stack; /* 0x0340 */
__u64 panic_stack; /* 0x0348 */
__u64 restart_stack; /* 0x0350 */
/* Restart function and parameter. */
__u64 restart_fn; /* 0x0340 */
__u64 restart_data; /* 0x0348 */
__u64 restart_source; /* 0x0350 */
__u64 restart_fn; /* 0x0358 */
__u64 restart_data; /* 0x0360 */
__u64 restart_source; /* 0x0368 */
/* Address space pointer. */
__u64 kernel_asce; /* 0x0358 */
__u64 user_asce; /* 0x0360 */
__u64 kernel_asce; /* 0x0370 */
__u64 user_asce; /* 0x0378 */
/*
* The lpp and current_pid fields form a
* 64-bit value that is set as program
* parameter with the LPP instruction.
*/
__u32 lpp; /* 0x0368 */
__u32 current_pid; /* 0x036c */
__u32 lpp; /* 0x0380 */
__u32 current_pid; /* 0x0384 */
/* SMP info area */
__u32 cpu_nr; /* 0x0370 */
__u32 softirq_pending; /* 0x0374 */
__u64 percpu_offset; /* 0x0378 */
__u64 vdso_per_cpu_data; /* 0x0380 */
__u64 machine_flags; /* 0x0388 */
__u32 preempt_count; /* 0x0390 */
__u8 pad_0x0394[0x0398-0x0394]; /* 0x0394 */
__u64 gmap; /* 0x0398 */
__u32 spinlock_lockval; /* 0x03a0 */
__u32 fpu_flags; /* 0x03a4 */
__u8 pad_0x03a8[0x0400-0x03a8]; /* 0x03a8 */
__u32 cpu_nr; /* 0x0388 */
__u32 softirq_pending; /* 0x038c */
__u64 percpu_offset; /* 0x0390 */
__u64 vdso_per_cpu_data; /* 0x0398 */
__u64 machine_flags; /* 0x03a0 */
__u32 preempt_count; /* 0x03a8 */
__u8 pad_0x03ac[0x03b0-0x03ac]; /* 0x03ac */
__u64 gmap; /* 0x03b0 */
__u32 spinlock_lockval; /* 0x03b8 */
__u32 fpu_flags; /* 0x03bc */
__u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */
/* Per cpu primary space access list */
__u32 paste[16]; /* 0x0400 */

View File

@ -111,7 +111,10 @@ struct thread_struct {
unsigned int acrs[NUM_ACRS];
unsigned long ksp; /* kernel stack pointer */
unsigned long user_timer; /* task cputime in user space */
unsigned long guest_timer; /* task cputime in kvm guest */
unsigned long system_timer; /* task cputime in kernel space */
unsigned long hardirq_timer; /* task cputime in hardirq context */
unsigned long softirq_timer; /* task cputime in softirq context */
unsigned long sys_call_table; /* system call table address */
mm_segment_t mm_segment;
unsigned long gmap_addr; /* address of last gmap fault. */

View File

@ -90,14 +90,41 @@ static void update_mt_scaling(void)
__this_cpu_write(mt_scaling_jiffies, jiffies_64);
}
/*
 * Replace the timer value stored at @tsk_vtime with @new and return
 * the difference between @new and the value that was stored before.
 */
static inline u64 update_tsk_timer(unsigned long *tsk_vtime, u64 new)
{
	u64 previous = *tsk_vtime;

	*tsk_vtime = new;
	return new - previous;
}
/*
 * Apply the MT utilization scaling to @vtime: multiply by this CPU's
 * mt_scaling_mult and divide by its mt_scaling_div. When smp_cpu_mtid
 * is zero the value is returned unchanged.
 */
static inline u64 scale_vtime(u64 vtime)
{
	u64 mult, div;

	/* No scaling requested: pass the value through untouched. */
	if (!smp_cpu_mtid)
		return vtime;

	mult = __this_cpu_read(mt_scaling_mult);
	div = __this_cpu_read(mt_scaling_div);
	return (vtime * mult) / div;
}
/*
 * Account system-side cputime for task @p: add @scaled to
 * p->stimescaled, then pass the unscaled @cputime together with the
 * cpu usage @index on to account_system_index_time().
 */
static void account_system_index_scaled(struct task_struct *p,
					cputime_t cputime,
					cputime_t scaled,
					enum cpu_usage_stat index)
{
	/* Scaled time is tracked per task only. */
	p->stimescaled += scaled;

	account_system_index_time(p, cputime, index);
}
/*
* Update process times based on virtual cpu times stored by entry.S
* to the lowcore fields user_timer, system_timer & steal_timer.
*/
static int do_account_vtime(struct task_struct *tsk)
{
u64 timer, clock, user, system, steal;
u64 user_scaled, system_scaled;
u64 timer, clock, user, guest, system, hardirq, softirq, steal;
timer = S390_lowcore.last_update_timer;
clock = S390_lowcore.last_update_clock;
@ -110,36 +137,53 @@ static int do_account_vtime(struct task_struct *tsk)
#endif
: "=m" (S390_lowcore.last_update_timer),
"=m" (S390_lowcore.last_update_clock));
S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock;
clock = S390_lowcore.last_update_clock - clock;
timer -= S390_lowcore.last_update_timer;
if (hardirq_count())
S390_lowcore.hardirq_timer += timer;
else
S390_lowcore.system_timer += timer;
/* Update MT utilization calculation */
if (smp_cpu_mtid &&
time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies)))
update_mt_scaling();
user = S390_lowcore.user_timer - tsk->thread.user_timer;
S390_lowcore.steal_timer -= user;
tsk->thread.user_timer = S390_lowcore.user_timer;
/* Calculate cputime delta */
user = update_tsk_timer(&tsk->thread.user_timer,
READ_ONCE(S390_lowcore.user_timer));
guest = update_tsk_timer(&tsk->thread.guest_timer,
READ_ONCE(S390_lowcore.guest_timer));
system = update_tsk_timer(&tsk->thread.system_timer,
READ_ONCE(S390_lowcore.system_timer));
hardirq = update_tsk_timer(&tsk->thread.hardirq_timer,
READ_ONCE(S390_lowcore.hardirq_timer));
softirq = update_tsk_timer(&tsk->thread.softirq_timer,
READ_ONCE(S390_lowcore.softirq_timer));
S390_lowcore.steal_timer +=
clock - user - guest - system - hardirq - softirq;
system = S390_lowcore.system_timer - tsk->thread.system_timer;
S390_lowcore.steal_timer -= system;
tsk->thread.system_timer = S390_lowcore.system_timer;
user_scaled = user;
system_scaled = system;
/* Do MT utilization scaling */
if (smp_cpu_mtid) {
u64 mult = __this_cpu_read(mt_scaling_mult);
u64 div = __this_cpu_read(mt_scaling_div);
user_scaled = (user_scaled * mult) / div;
system_scaled = (system_scaled * mult) / div;
/* Push account value */
if (user) {
account_user_time(tsk, user);
tsk->utimescaled += scale_vtime(user);
}
account_user_time(tsk, user);
tsk->utimescaled += user_scaled;
account_system_time(tsk, 0, system);
tsk->stimescaled += system_scaled;
if (guest) {
account_guest_time(tsk, guest);
tsk->utimescaled += scale_vtime(guest);
}
if (system)
account_system_index_scaled(tsk, system, scale_vtime(system),
CPUTIME_SYSTEM);
if (hardirq)
account_system_index_scaled(tsk, hardirq, scale_vtime(hardirq),
CPUTIME_IRQ);
if (softirq)
account_system_index_scaled(tsk, softirq, scale_vtime(softirq),
CPUTIME_SOFTIRQ);
steal = S390_lowcore.steal_timer;
if ((s64) steal > 0) {
@ -147,16 +191,22 @@ static int do_account_vtime(struct task_struct *tsk)
account_steal_time(steal);
}
return virt_timer_forward(user + system);
return virt_timer_forward(user + guest + system + hardirq + softirq);
}
void vtime_task_switch(struct task_struct *prev)
{
do_account_vtime(prev);
prev->thread.user_timer = S390_lowcore.user_timer;
prev->thread.guest_timer = S390_lowcore.guest_timer;
prev->thread.system_timer = S390_lowcore.system_timer;
prev->thread.hardirq_timer = S390_lowcore.hardirq_timer;
prev->thread.softirq_timer = S390_lowcore.softirq_timer;
S390_lowcore.user_timer = current->thread.user_timer;
S390_lowcore.guest_timer = current->thread.guest_timer;
S390_lowcore.system_timer = current->thread.system_timer;
S390_lowcore.hardirq_timer = current->thread.hardirq_timer;
S390_lowcore.softirq_timer = current->thread.softirq_timer;
}
/*
@ -176,32 +226,22 @@ void vtime_account_user(struct task_struct *tsk)
*/
void vtime_account_irq_enter(struct task_struct *tsk)
{
u64 timer, system, system_scaled;
u64 timer;
timer = S390_lowcore.last_update_timer;
S390_lowcore.last_update_timer = get_vtimer();
S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
timer -= S390_lowcore.last_update_timer;
/* Update MT utilization calculation */
if (smp_cpu_mtid &&
time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies)))
update_mt_scaling();
if ((tsk->flags & PF_VCPU) && (irq_count() == 0))
S390_lowcore.guest_timer += timer;
else if (hardirq_count())
S390_lowcore.hardirq_timer += timer;
else if (in_serving_softirq())
S390_lowcore.softirq_timer += timer;
else
S390_lowcore.system_timer += timer;
system = S390_lowcore.system_timer - tsk->thread.system_timer;
S390_lowcore.steal_timer -= system;
tsk->thread.system_timer = S390_lowcore.system_timer;
system_scaled = system;
/* Do MT utilization scaling */
if (smp_cpu_mtid) {
u64 mult = __this_cpu_read(mt_scaling_mult);
u64 div = __this_cpu_read(mt_scaling_div);
system_scaled = (system_scaled * mult) / div;
}
account_system_time(tsk, 0, system);
tsk->stimescaled += system_scaled;
virt_timer_forward(system);
virt_timer_forward(timer);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);