From a332d86d3c262cddd3de0bfa90e1910de60b4f95 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 10 Feb 2008 09:04:12 +0100
Subject: [PATCH 01/10] hrtimer: add nanosleep specific restart_block member

The back and forth typecasting of restart_block->args is horrible. We
added a separate union member for futex already. Do the same for
nanosleep.

Signed-off-by: Thomas Gleixner
---
 include/linux/thread_info.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 421323e5a2d6..accd7bad35b0 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -9,6 +9,9 @@
 
 #include
 
+struct timespec;
+struct compat_timespec;
+
 /*
  * System call restart block.
  */
@@ -26,6 +29,15 @@ struct restart_block {
 			u32 bitset;
 			u64 time;
 		} futex;
+		/* For nanosleep */
+		struct {
+			clockid_t index;
+			struct timespec __user *rmtp;
+#ifdef CONFIG_COMPAT
+			struct compat_timespec __user *compat_rmtp;
+#endif
+			u64 expires;
+		} nanosleep;
 	};
 };
 

From 029a07e0311c7fef968d44b50beca53969cee40b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 10 Feb 2008 09:17:43 +0100
Subject: [PATCH 02/10] hrtimer: use nanosleep specific restart_block fields

Convert all the nanosleep related users of restart_block to the new
nanosleep specific restart_block fields.

Signed-off-by: Thomas Gleixner
---
 kernel/compat.c  | 15 +++++++--------
 kernel/hrtimer.c | 13 ++++++-------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/kernel/compat.c b/kernel/compat.c
index 5f0e201bcfd3..9c48abfcd4a5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -47,15 +47,14 @@ static long compat_nanosleep_restart(struct restart_block *restart)
 	mm_segment_t oldfs;
 	long ret;
 
-	rmtp = (struct compat_timespec __user *)(restart->arg1);
-	restart->arg1 = (unsigned long)&rmt;
+	restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
 	oldfs = get_fs();
 	set_fs(KERNEL_DS);
 	ret = hrtimer_nanosleep_restart(restart);
 	set_fs(oldfs);
 
 	if (ret) {
-		restart->arg1 = (unsigned long)rmtp;
+		rmtp = restart->nanosleep.compat_rmtp;
 
 		if (rmtp && put_compat_timespec(&rmt, rmtp))
 			return -EFAULT;
@@ -89,7 +88,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 			= &current_thread_info()->restart_block;
 
 		restart->fn = compat_nanosleep_restart;
-		restart->arg1 = (unsigned long)rmtp;
+		restart->nanosleep.compat_rmtp = rmtp;
 
 		if (rmtp && put_compat_timespec(&rmt, rmtp))
 			return -EFAULT;
@@ -607,9 +606,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
 	long err;
 	mm_segment_t oldfs;
 	struct timespec tu;
-	struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
+	struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
 
-	restart->arg1 = (unsigned long) &tu;
+	restart->nanosleep.rmtp = (struct timespec __user *) &tu;
 	oldfs = get_fs();
 	set_fs(KERNEL_DS);
 	err = clock_nanosleep_restart(restart);
@@ -621,7 +620,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
 
 	if (err == -ERESTART_RESTARTBLOCK) {
 		restart->fn = compat_clock_nanosleep_restart;
-		restart->arg1 = (unsigned long) rmtp;
+		restart->nanosleep.compat_rmtp = rmtp;
 	}
 	return err;
 }
@@ -652,7 +651,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 	if (err == -ERESTART_RESTARTBLOCK) {
 		restart = &current_thread_info()->restart_block;
 		restart->fn = compat_clock_nanosleep_restart;
-		restart->arg1 = (unsigned long) rmtp;
+		restart->nanosleep.compat_rmtp = rmtp;
 	}
 	return err;
 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 98bee013f71f..911e87d0440d 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1354,13 +1354,13 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 	struct hrtimer_sleeper t;
 	struct timespec __user *rmtp;
 
-	hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
-	t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
+	hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS);
+	t.timer.expires.tv64 = restart->nanosleep.expires;
 
 	if (do_nanosleep(&t, HRTIMER_MODE_ABS))
 		return 0;
 
-	rmtp = (struct timespec __user *)restart->arg1;
+	rmtp = restart->nanosleep.rmtp;
 	if (rmtp) {
 		int ret = update_rmtp(&t.timer, rmtp);
 		if (ret <= 0)
@@ -1394,10 +1394,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 
 	restart = &current_thread_info()->restart_block;
 	restart->fn = hrtimer_nanosleep_restart;
-	restart->arg0 = (unsigned long) t.timer.base->index;
-	restart->arg1 = (unsigned long) rmtp;
-	restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
-	restart->arg3 = t.timer.expires.tv64 >> 32;
+	restart->nanosleep.index = t.timer.base->index;
+	restart->nanosleep.rmtp = rmtp;
+	restart->nanosleep.expires = t.timer.expires.tv64;
 
 	return -ERESTART_RESTARTBLOCK;
 }
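
The net effect of patches 01 and 02: the nanosleep expiry and rmtp
pointer are no longer packed into the untyped arg0..arg3 words of
struct restart_block. As a condensed before/after sketch (taken from
the hunks above, kernel context, surrounding code abridged):

	/* before: clock id, pointer and 64bit expiry packed into longs */
	restart->arg0 = (unsigned long) t.timer.base->index;
	restart->arg1 = (unsigned long) rmtp;
	restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
	restart->arg3 = t.timer.expires.tv64 >> 32;

	/* after: typed fields of the nanosleep union member */
	restart->nanosleep.index = t.timer.base->index;
	restart->nanosleep.rmtp = rmtp;
	restart->nanosleep.expires = t.timer.expires.tv64;

The restart handler can then read the fields back without any casts or
32/64 bit reassembly.
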
From d59b949f771eb3cbe50865c72e13e2a0a8d4d781 Mon Sep 17 00:00:00 2001
From: Pavel Machek
Date: Tue, 5 Feb 2008 00:48:13 +0100
Subject: [PATCH 03/10] timer_list: add annotations to workqueue.c

Add timer list annotations to workqueue.c so we can see the call site
in the timer stats.

Signed-off-by: Pavel Machek
Signed-off-by: Thomas Gleixner
---
 kernel/workqueue.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ff06611655af..00ff4d08e370 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -219,6 +219,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	struct timer_list *timer = &dwork->timer;
 	struct work_struct *work = &dwork->work;
 
+	timer_stats_timer_set_start_info(&dwork->timer);
 	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
 		BUG_ON(timer_pending(timer));
 		BUG_ON(!list_empty(&work->entry));
@@ -580,6 +581,7 @@ EXPORT_SYMBOL(schedule_delayed_work);
 int schedule_delayed_work_on(int cpu,
 			struct delayed_work *dwork, unsigned long delay)
 {
+	timer_stats_timer_set_start_info(&dwork->timer);
 	return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work_on);

From ee7dd205b5cdbc3231d48e38641efd05f572c52a Mon Sep 17 00:00:00 2001
From: WANG Cong
Date: Fri, 4 Apr 2008 20:54:10 +0200
Subject: [PATCH 04/10] posix-timers: fix shadowed variables

Fix sparse warnings like this:

kernel/posix-cpu-timers.c:1090:25: warning: symbol 't' shadows an earlier one
kernel/posix-cpu-timers.c:1058:21: originally declared here

Signed-off-by: WANG Cong
Signed-off-by: Andrew Morton
Signed-off-by: Thomas Gleixner
---
 kernel/posix-cpu-timers.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 2eae91f954ca..ae5c6c147c4b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1087,45 +1087,45 @@ static void check_process_timers(struct task_struct *tsk,
 	maxfire = 20;
 	prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
+		struct cpu_timer_list *tl = list_first_entry(timers,
						      struct cpu_timer_list, entry);
-		if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
-			prof_expires = t->expires.cpu;
+		if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
+			prof_expires = tl->expires.cpu;
 			break;
 		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
+		tl->firing = 1;
+		list_move_tail(&tl->entry, firing);
 	}
 
 	++timers;
 	maxfire = 20;
 	virt_expires = cputime_zero;
 	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
+		struct cpu_timer_list *tl = list_first_entry(timers,
						      struct cpu_timer_list, entry);
-		if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
-			virt_expires = t->expires.cpu;
+		if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
+			virt_expires = tl->expires.cpu;
 			break;
 		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
+		tl->firing = 1;
+		list_move_tail(&tl->entry, firing);
 	}
 
 	++timers;
 	maxfire = 20;
 	sched_expires = 0;
 	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
+		struct cpu_timer_list *tl = list_first_entry(timers,
						      struct cpu_timer_list, entry);
-		if (!--maxfire || sum_sched_runtime < t->expires.sched) {
-			sched_expires = t->expires.sched;
+		if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
+			sched_expires = tl->expires.sched;
 			break;
 		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
+		tl->firing = 1;
+		list_move_tail(&tl->entry, firing);
 	}
 
 	/*
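
The warning fixed above is ordinary C shadowing; a minimal stand-alone
illustration (not kernel code, hypothetical variable names) of what
sparse and gcc -Wshadow complain about:

	#include <stdio.h>

	int main(void)
	{
		int i, t = 1;			/* outer t */

		for (i = 0; i < 3; i++) {
			int t = i;		/* shadows the outer t */
			printf("inner t = %d\n", t);
		}
		printf("outer t = %d\n", t);	/* still 1 */
		return 0;
	}

Renaming the inner variable, as the patch does with tl, silences the
warning without changing behaviour.
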
From 0d180406f2914aea3a78ddb880e2fe9ac78a9372 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 4 Apr 2008 20:54:10 +0200
Subject: [PATCH 05/10] timers: simplify lockdep handling

In order to avoid the false positive from lockdep, each per-cpu
base->lock has the separate lock class and migrate_timers() uses
double_spin_lock().

This all is overcomplicated: except for migrate_timers() we never take
2 locks at once, and migrate_timers() can use spin_lock_nested().

Signed-off-by: Oleg Nesterov
Cc: Arjan van de Ven
Cc: Heiko Carstens
Cc: Ingo Molnar
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
Signed-off-by: Thomas Gleixner
---
 kernel/timer.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/kernel/timer.c b/kernel/timer.c
index b024106daa70..f3d35d4ea42e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1228,13 +1228,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
 	return 0;
 }
 
-/*
- * lockdep: we want to track each per-CPU base as a separate lock-class,
- * but timer-bases are kmalloc()-ed, so we need to attach separate
- * keys to them:
- */
-static struct lock_class_key base_lock_keys[NR_CPUS];
-
 static int __cpuinit init_timers_cpu(int cpu)
 {
 	int j;
@@ -1277,7 +1270,6 @@ static int __cpuinit init_timers_cpu(int cpu)
 	}
 
 	spin_lock_init(&base->lock);
-	lockdep_set_class(&base->lock, base_lock_keys + cpu);
 
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1316,8 +1308,8 @@ static void __cpuinit migrate_timers(int cpu)
 	new_base = get_cpu_var(tvec_bases);
 
 	local_irq_disable();
-	double_spin_lock(&new_base->lock, &old_base->lock,
-			 smp_processor_id() < cpu);
+	spin_lock(&new_base->lock);
+	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	BUG_ON(old_base->running_timer);
 
@@ -1330,8 +1322,8 @@ static void __cpuinit migrate_timers(int cpu)
 		migrate_timer_list(new_base, old_base->tv5.vec + i);
 	}
 
-	double_spin_unlock(&new_base->lock, &old_base->lock,
-			   smp_processor_id() < cpu);
+	spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
 	local_irq_enable();
 	put_cpu_var(tvec_bases);
 }

From 8e60e05fdc7344415fa69a3883b11f65db967b47 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 4 Apr 2008 20:54:10 +0200
Subject: [PATCH 06/10] hrtimers: simplify lockdep handling

In order to avoid the false positive from lockdep, each per-cpu
base->lock has the separate lock class and migrate_hrtimers() uses
double_spin_lock().

This is overcomplicated: except for migrate_hrtimers() we never take
2 locks at once, and migrate_hrtimers() can use spin_lock_nested().

Signed-off-by: Oleg Nesterov
Cc: Arjan van de Ven
Cc: Heiko Carstens
Cc: Ingo Molnar
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
Signed-off-by: Thomas Gleixner
---
 include/linux/hrtimer.h | 2 --
 kernel/hrtimer.c        | 9 ++++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 1ad56a7b2f74..56f3236da829 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -173,7 +173,6 @@ struct hrtimer_clock_base {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
- * @lock_key:		the lock_class_key for use with lockdep
  * @clock_base:		array of clock bases for this cpu
  * @curr_timer:		the timer which is executing a callback right now
  * @expires_next:	absolute time of the next event which was scheduled
@@ -189,7 +188,6 @@ struct hrtimer_clock_base {
  */
 struct hrtimer_cpu_base {
 	spinlock_t			lock;
-	struct lock_class_key		lock_key;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 	struct list_head		cb_pending;
 #ifdef CONFIG_HIGH_RES_TIMERS
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 911e87d0440d..c642ef75069f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1424,7 +1424,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	int i;
 
 	spin_lock_init(&cpu_base->lock);
-	lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
 		cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1465,16 +1464,16 @@ static void migrate_hrtimers(int cpu)
 	tick_cancel_sched_timer(cpu);
 
 	local_irq_disable();
-	double_spin_lock(&new_base->lock, &old_base->lock,
-			 smp_processor_id() < cpu);
+	spin_lock(&new_base->lock);
+	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],
 				     &new_base->clock_base[i]);
 	}
 
-	double_spin_unlock(&new_base->lock, &old_base->lock,
-			   smp_processor_id() < cpu);
+	spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
 	local_irq_enable();
 	put_cpu_var(hrtimer_bases);
 }
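
Both conversions above rely on the same pattern: take the destination
base lock first, then take the source base lock with
SINGLE_DEPTH_NESTING so that lockdep accepts a second acquisition
within the same lock class. A condensed sketch of the pattern (kernel
context, names as in the patches, the actual timer moving elided):

	local_irq_disable();
	spin_lock(&new_base->lock);
	/* old_base->lock is in the same class, hence the nested annotation */
	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

	/* ... move all timers from old_base to new_base ... */

	spin_unlock(&old_base->lock);
	spin_unlock(&new_base->lock);
	local_irq_enable();

Since this is the only place where two base locks are held at once
(the double_spin_lock() helpers removed in the next patch have no
other callers), a fixed order plus the nesting annotation is enough.
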
From 3f3eafc921e2378954c28cfd0eb10910449f4c11 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 4 Apr 2008 20:54:10 +0200
Subject: [PATCH 07/10] locking: remove unused double_spin_lock()

double_spin_lock() has no callers, and it can't be used without
additional lockdep annotations, remove it.

Signed-off-by: Oleg Nesterov
Cc: Arjan van de Ven
Cc: Heiko Carstens
Cc: Ingo Molnar
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
Signed-off-by: Thomas Gleixner
---
 include/linux/spinlock.h | 37 -------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 1129ee0a7180..d311a090fae7 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -295,43 +295,6 @@ do { \
 	 1 : ({ local_irq_restore(flags); 0; }); \
 })
 
-/*
- * Locks two spinlocks l1 and l2.
- * l1_first indicates if spinlock l1 should be taken first.
- */
-static inline void double_spin_lock(spinlock_t *l1, spinlock_t *l2,
-				    bool l1_first)
-	__acquires(l1)
-	__acquires(l2)
-{
-	if (l1_first) {
-		spin_lock(l1);
-		spin_lock(l2);
-	} else {
-		spin_lock(l2);
-		spin_lock(l1);
-	}
-}
-
-/*
- * Unlocks two spinlocks l1 and l2.
- * l1_taken_first indicates if spinlock l1 was taken first and therefore
- * should be released after spinlock l2.
- */
-static inline void double_spin_unlock(spinlock_t *l1, spinlock_t *l2,
-				      bool l1_taken_first)
-	__releases(l1)
-	__releases(l2)
-{
-	if (l1_taken_first) {
-		spin_unlock(l2);
-		spin_unlock(l1);
-	} else {
-		spin_unlock(l1);
-		spin_unlock(l2);
-	}
-}
-
 /*
  * Pull the atomic_t declaration:
  * (asm-mips/atomic.h needs above definitions)
 

From 903b8a8d4835a796f582033802c83283886f4a3d Mon Sep 17 00:00:00 2001
From: Karsten Wiese
Date: Thu, 28 Feb 2008 15:10:50 +0100
Subject: [PATCH 08/10] clockevents: optimise tick_nohz_stop_sched_tick() a bit

Call ts = &per_cpu(tick_cpu_sched, cpu); and cpu = smp_processor_id();
once instead of twice.

No functional change done, as changed code runs with local irq off.
Reduces source lines and text size (20 bytes on x86_64).

[ akpm@linux-foundation.org: Build fix ]

Signed-off-by: Karsten Wiese
Cc: Andrew Morton
Signed-off-by: Thomas Gleixner
---
 kernel/time/tick-sched.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 686da821d376..69dba0c71727 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -158,9 +158,8 @@ void tick_nohz_stop_idle(int cpu)
 	}
 }
 
-static ktime_t tick_nohz_start_idle(int cpu)
+static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 {
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t now, delta;
 
 	now = ktime_get();
@@ -201,8 +200,8 @@ void tick_nohz_stop_sched_tick(void)
 	local_irq_save(flags);
 
 	cpu = smp_processor_id();
-	now = tick_nohz_start_idle(cpu);
 	ts = &per_cpu(tick_cpu_sched, cpu);
+	now = tick_nohz_start_idle(ts);
 
 	/*
 	 * If this cpu is offline and it is the one which updates
@@ -222,7 +221,6 @@ void tick_nohz_stop_sched_tick(void)
 	if (need_resched())
 		goto end;
 
-	cpu = smp_processor_id();
 	if (unlikely(local_softirq_pending())) {
 		static int ratelimit;
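
Functionally the tick-sched change is just hoisting: the caller already
knows both cpu and ts, so tick_nohz_start_idle() takes the tick_sched
pointer instead of re-deriving it. Condensed from the hunks above
(kernel context, abridged):

	cpu = smp_processor_id();
	ts = &per_cpu(tick_cpu_sched, cpu);
	now = tick_nohz_start_idle(ts);	/* no second per_cpu()/smp_processor_id() */
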
From 3833eecc183ce052e9ac96b39b45121a2d11ac16 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 5 Mar 2008 18:28:15 +0100
Subject: [PATCH 09/10] Documentation: move timer related documentation to a
 single place

We have two directories with timer related information in
Documentation/: hrtimers/ and hrtimer/. timer_stats are not restricted
to hrtimers. Move all those files into Documentation/timers where we
can pile up other timer related docs as well.

Pointed-out-by: Randy Dunlap
Signed-off-by: Thomas Gleixner
---
 Documentation/00-INDEX                            | 6 ++----
 Documentation/{hrtimers => timers}/highres.txt    | 0
 Documentation/{hrtimers => timers}/hrtimers.txt   | 0
 Documentation/{hrtimer => timers}/timer_stats.txt | 0
 4 files changed, 2 insertions(+), 4 deletions(-)
 rename Documentation/{hrtimers => timers}/highres.txt (100%)
 rename Documentation/{hrtimers => timers}/hrtimers.txt (100%)
 rename Documentation/{hrtimer => timers}/timer_stats.txt (100%)

diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index e8fb24671967..f7923a42e769 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -167,10 +167,8 @@ highuid.txt
 	- notes on the change from 16 bit to 32 bit user/group IDs.
 hpet.txt
 	- High Precision Event Timer Driver for Linux.
-hrtimer/
-	- info on the timer_stats debugging facility for timer (ab)use.
-hrtimers/
-	- info on the hrtimers subsystem for high-resolution kernel timers.
+timers/
+	- info on the timer related topics
 hw_random.txt
 	- info on Linux support for random number generator in i8xx chipsets.
 hwmon/
diff --git a/Documentation/hrtimers/highres.txt b/Documentation/timers/highres.txt
similarity index 100%
rename from Documentation/hrtimers/highres.txt
rename to Documentation/timers/highres.txt
diff --git a/Documentation/hrtimers/hrtimers.txt b/Documentation/timers/hrtimers.txt
similarity index 100%
rename from Documentation/hrtimers/hrtimers.txt
rename to Documentation/timers/hrtimers.txt
diff --git a/Documentation/hrtimer/timer_stats.txt b/Documentation/timers/timer_stats.txt
similarity index 100%
rename from Documentation/hrtimer/timer_stats.txt
rename to Documentation/timers/timer_stats.txt

From 6993fc5bbc5d63ccd55985b39c34417e430e75e9 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 30 Jan 2008 13:30:02 +0100
Subject: [PATCH 10/10] clocksource: make clocksource watchdog cycle through
 online CPUs

This way it checks if the clocks are synchronized between CPUs too.
This might be able to detect slowly drifting TSCs which only go wrong
over longer time.

Signed-off-by: Andi Kleen
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 kernel/time/clocksource.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7f60097d443a..912156dd6005 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,8 +141,16 @@ static void clocksource_watchdog(unsigned long data)
 	}
 
 	if (!list_empty(&watchdog_list)) {
-		__mod_timer(&watchdog_timer,
-			    watchdog_timer.expires + WATCHDOG_INTERVAL);
+		/*
+		 * Cycle through CPUs to check if the CPUs stay
+		 * synchronized to each other.
+		 */
+		int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
+
+		if (next_cpu >= NR_CPUS)
+			next_cpu = first_cpu(cpu_online_map);
+		watchdog_timer.expires += WATCHDOG_INTERVAL;
+		add_timer_on(&watchdog_timer, next_cpu);
 	}
 	spin_unlock(&watchdog_lock);
 }
@@ -164,7 +172,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 		if (!started && watchdog) {
 			watchdog_last = watchdog->read();
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
-			add_timer(&watchdog_timer);
+			add_timer_on(&watchdog_timer,
+				     first_cpu(cpu_online_map));
 		}
 	} else {
 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -185,7 +194,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 
 			watchdog_last = watchdog->read();
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
-			add_timer(&watchdog_timer);
+			add_timer_on(&watchdog_timer,
+				     first_cpu(cpu_online_map));
 		}
 	}
 }
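
After this change the watchdog timer no longer re-arms on whatever CPU
it happened to expire on; each expiry moves it to the next online CPU,
wrapping back to the first one, so consecutive comparisons against the
watchdog clocksource come from different CPUs. Condensed re-arm logic
from the hunk above (kernel context, locking and list handling elided):

	int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);

	if (next_cpu >= NR_CPUS)
		next_cpu = first_cpu(cpu_online_map);	/* wrap around */

	watchdog_timer.expires += WATCHDOG_INTERVAL;
	add_timer_on(&watchdog_timer, next_cpu);	/* next check runs there */

A TSC that drifts on only one CPU can then eventually be observed from
that CPU and flagged, which a watchdog pinned to a single CPU could
miss.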