From e5f1b245870d59be0e6cc3b33edf5406a3b59648 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 5 Oct 2016 09:33:12 +0200 Subject: [PATCH 1/7] cpuidle: governors: Remove remaining old module code The governor's code use try_module_get() and put_module() to refcount the governor's module. But the governors are not compiled as module. The refcount does not prevent to switch the governor or unload a module as they aren't compiled as modules. The code is pointless, so remove it. Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governor.c | 4 ---- drivers/cpuidle/governors/ladder.c | 2 -- drivers/cpuidle/governors/menu.c | 2 -- include/linux/cpuidle.h | 2 -- 4 files changed, 10 deletions(-) diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index fb9f511cca23..4e78263e34a4 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -9,7 +9,6 @@ */ #include -#include #include #include "cpuidle.h" @@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov) if (cpuidle_curr_governor) { list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_disable_device(dev); - module_put(cpuidle_curr_governor->owner); } cpuidle_curr_governor = gov; if (gov) { - if (!try_module_get(cpuidle_curr_governor->owner)) - return -EINVAL; list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_enable_device(dev); cpuidle_install_idle_handler(); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 63bd5a403e22..fe8f08948fcb 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = { .enable = ladder_enable_device, .select = ladder_select_state, .reflect = ladder_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 03d38c291de6..d9b5b9398a0f 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,7 +19,6 @@ #include #include #include -#include /* * Please note when changing the tuning values: @@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = { .enable = menu_enable_device, .select = menu_select, .reflect = menu_reflect, - .owner = THIS_MODULE, }; /** diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb31373c3478..15deea449edc 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -235,8 +235,6 @@ struct cpuidle_governor { int (*select) (struct cpuidle_driver *drv, struct cpuidle_device *dev); void (*reflect) (struct cpuidle_device *dev, int index); - - struct module *owner; }; #ifdef CONFIG_CPU_IDLE From a94e502c22b61072f808f53392a8433bc948b03d Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 10 Nov 2016 14:24:33 +0000 Subject: [PATCH 2/7] cpuidle: dt: assign ->enter_freeze to same as ->enter callback function enter_freeze() callback is expected atleast to do the same as enter() but it has to guarantee that interrupts aren't enabled at any point in its execution, as the tick is frozen. CPUs execute ->enter_freeze with the local tick or entire timekeeping suspended, so it must not re-enable interrupts at any point (even temporarily) or attempt to change states of clock event devices. It will be called when the system goes to suspend-to-idle and will reduce power usage because CPUs won't be awaken for unnecessary IRQs (i.e. woken up only on IRQs from "wakeup sources") We can reuse the same code for both the enter() and enter_freeze() callbacks as along as they don't re-enable interrupts. Only "coupled" cpuidle mechanism enables interrupts and doing that with timekeeping suspended is generally not safe. Since this generic DT based idle driver doesn't support "coupled" states, it is safe to assume that the interrupts are not re-enabled. This patch assign enter_freeze to same as enter callback function which helps to save power without any intermittent spurious wakeups from suspend-to-idle. Signed-off-by: Sudeep Holla Tested-by: Andy Gross Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/dt_idle_states.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c index a5c111b67f37..ffca4fc0061d 100644 --- a/drivers/cpuidle/dt_idle_states.c +++ b/drivers/cpuidle/dt_idle_states.c @@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state, * state enter function. */ idle_state->enter = match_id->data; + /* + * Since this is not a "coupled" state, it's safe to assume interrupts + * won't be enabled when it exits allowing the tick to be frozen + * safely. So enter() can be also enter_freeze() callback. + */ + idle_state->enter_freeze = match_id->data; err = of_property_read_u32(state_node, "wakeup-latency-us", &idle_state->exit_latency); From ed61390bff3ad43166a2552651b09ebd95dd1da5 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Tue, 22 Nov 2016 16:08:06 +1100 Subject: [PATCH 3/7] cpuidle/powernv: staticise powernv_idle_driver powernv_idle_driver isn't exported, it can be made static. Found by sparse. Signed-off-by: Andrew Donnellan Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-powernv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 7fe442ca38f4..0835a37a5f3a 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -22,7 +22,7 @@ #define POWERNV_THRESHOLD_LATENCY_NS 200000 -struct cpuidle_driver powernv_idle_driver = { +static struct cpuidle_driver powernv_idle_driver = { .name = "powernv_idle", .owner = THIS_MODULE, }; From bb8313b603eb8fd52de48a079bfcd72dcab2ef1e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 28 Nov 2016 23:03:04 -0800 Subject: [PATCH 4/7] cpuidle: Allow enforcing deepest idle state selection When idle injection is used to cap power, we need to override the governor's choice of idle states. For this reason, make it possible the deepest idle state selection to be enforced by setting a flag on a given CPU to achieve the maximum potential power draw reduction. Signed-off-by: Jacob Pan [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 13 ++++++++++++- include/linux/cpuidle.h | 7 ++++++- kernel/sched/idle.c | 13 ++++++++----- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207abb5a4..afc005b917fe 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,17 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/* Set the current cpu to use the deepest idle state, override governors */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. @@ -109,6 +119,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 15deea449edc..da346f2817a8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -74,6 +74,7 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; + unsigned int use_deepest_state:1; unsigned int cpu; int last_residency; @@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif -#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) +#ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev); +extern void cpuidle_use_deepest_state(bool enable); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) +{ +} #endif /* kernel/sched/idle.c */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1d8718d5300d..513e4dfeeae7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -164,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); From c1de45ca831acee9b72c9320dde447edafadb43f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Nov 2016 23:03:05 -0800 Subject: [PATCH 5/7] sched/idle: Add support for tasks that inject idle Idle injection drivers such as Intel powerclamp and ACPI PAD drivers use realtime tasks to take control of CPU then inject idle. There are two issues with this approach: 1. Low efficiency: injected idle task is treated as busy so sched ticks do not stop during injected idle period, the result of these unwanted wakeups can be ~20% loss in power savings. 2. Idle accounting: injected idle time is presented to user as busy. This patch addresses the issues by introducing a new PF_IDLE flag which allows any given task to be treated as idle task while the flag is set. Therefore, idle injection tasks can run through the normal flow of NOHZ idle enter/exit to get the correct accounting as well as tick stop when possible. The implication is that idle task is then no longer limited to PID == 0. Acked-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- include/linux/cpu.h | 2 + include/linux/sched.h | 3 +- kernel/fork.c | 2 +- kernel/sched/core.c | 1 + kernel/sched/idle.c | 162 ++++++++++++++++++++++++++---------------- 5 files changed, 107 insertions(+), 63 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b886dc17f2f3..ac0efae38072 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -245,6 +245,8 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); +void play_idle(unsigned long duration_ms); + #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..114c7fcb6af6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2254,6 +2254,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ @@ -2609,7 +2610,7 @@ extern struct task_struct *idle_task(int cpu); */ static inline bool is_idle_task(const struct task_struct *p) { - return p->pid == 0; + return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..5074b2f0827b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1537,7 +1537,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..63b3a8a49884 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5285,6 +5285,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 513e4dfeeae7..6a4bae0a649d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -205,76 +205,65 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. + */ - while (1) { - /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. - */ + __current_set_polling(); + tick_nohz_idle_enter(); - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + while (!need_resched()) { + check_pgt_cache(); + rmb(); - while (!need_resched()) { - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } - - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); } - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); + local_irq_disable(); + arch_cpu_idle_enter(); /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ - smp_mb__after_atomic(); - - sched_ttwu_pending(); - schedule_preempt_disabled(); + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); } bool cpu_in_idle(unsigned long pc) @@ -283,6 +272,56 @@ bool cpu_in_idle(unsigned long pc) pc < (unsigned long)__cpuidle_text_end; } +struct idle_timer { + struct hrtimer timer; + int done; +}; + +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); + + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. + */ + WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(play_idle); + void cpu_startup_entry(enum cpuhp_state state) { /* @@ -302,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } From 2ed38cbe05e7ea27022e29d75e7b93c20ccc5eaa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Nov 2016 02:32:17 +0100 Subject: [PATCH 6/7] MAINTAINERS: Add bug tracking system location entries for cpuidle The kernel Bugzilla is used for tracking bugs in the cpuidle core and intel_idle, so document that. Signed-off-by: Rafael J. Wysocki Acked-by: Len Brown --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1cd38a7e0064..08753a5a23be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3367,6 +3367,7 @@ M: Daniel Lezcano L: linux-pm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git +B: https://bugzilla.kernel.org F: drivers/cpuidle/* F: include/linux/cpuidle.h @@ -6271,6 +6272,7 @@ INTEL IDLE DRIVER M: Len Brown L: linux-pm@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git +B: https://bugzilla.kernel.org S: Supported F: drivers/idle/intel_idle.c From 6af33995318fdfb4914fb1c5e67450fdb3d32084 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Nov 2016 02:33:07 +0100 Subject: [PATCH 7/7] MAINTAINERS: Add Jacob Pan as a new intel_idle maintainer The intel_idle driver is going to be maintained by Jacob Pan now, so update MAINTAINERS accordingly. Signed-off-by: Rafael J. Wysocki Acked-by: Len Brown --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 08753a5a23be..0248f13c8adf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6269,6 +6269,7 @@ S: Maintained F: drivers/platform/x86/intel-vbtn.c INTEL IDLE DRIVER +M: Jacob Pan M: Len Brown L: linux-pm@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git