From 228b30234f258a193317874854eee1ca7807186e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Sat, 27 Jul 2013 01:13:26 +0200
Subject: [PATCH 1/5] Revert "cpuidle: Quickly notice prediction failure in general case"

Revert commit e11538d1 (cpuidle: Quickly notice prediction failure in
general case), since it depends on commit 69a37be (cpuidle: Quickly
notice prediction failure for repeat mode) that has been identified as
the source of a significant performance regression in v3.8 and later.

Requested-by: Jeremy Eder
Tested-by: Len Brown
Cc: 3.8+
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpuidle/governors/menu.c | 35 +-------------------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index fe343a06b7da..b69a87e22155 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -34,7 +34,7 @@
 static DEFINE_PER_CPU(struct hrtimer, menu_hrtimer);
 static DEFINE_PER_CPU(int, hrtimer_status);
 /* menu hrtimer mode */
-enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT, MENU_HRTIMER_GENERAL};
+enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT};

 /*
  * Concepts and ideas behind the menu governor
@@ -116,13 +116,6 @@ enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT, MENU_HRTIMER_GENERAL};
  *
  */

-/*
- * The C-state residency is so long that is is worthwhile to exit
- * from the shallow C-state and re-enter into a deeper C-state.
- */
-static unsigned int perfect_cstate_ms __read_mostly = 30;
-module_param(perfect_cstate_ms, uint, 0000);
-
 struct menu_device {
        int last_state_idx;
        int needs_update;
@@ -223,16 +216,6 @@ EXPORT_SYMBOL_GPL(menu_hrtimer_cancel);
 static enum hrtimer_restart menu_hrtimer_notify(struct hrtimer *hrtimer)
 {
        int cpu = smp_processor_id();
-       struct menu_device *data = &per_cpu(menu_devices, cpu);
-
-       /* In general case, the expected residency is much larger than
-        * deepest C-state target residency, but prediction logic still
-        * predicts a small predicted residency, so the prediction
-        * history is totally broken if the timer is triggered.
-        * So reset the correction factor.
-        */
-       if (per_cpu(hrtimer_status, cpu) == MENU_HRTIMER_GENERAL)
-               data->correction_factor[data->bucket] = RESOLUTION * DECAY;

        per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;

@@ -389,7 +372,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
        /* not deepest C-state chosen for low predicted residency */
        if (low_predicted) {
                unsigned int timer_us = 0;
-               unsigned int perfect_us = 0;

                /*
                 * Set a timer to detect whether this sleep is much
@@ -400,28 +382,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
                 */
                timer_us = 2 * (data->predicted_us + MAX_DEVIATION);

-               perfect_us = perfect_cstate_ms * 1000;
-
                if (repeat && (4 * timer_us < data->expected_us)) {
                        RCU_NONIDLE(hrtimer_start(hrtmr,
                                ns_to_ktime(1000 * timer_us),
                                HRTIMER_MODE_REL_PINNED));
                        /* In repeat case, menu hrtimer is started */
                        per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT;
-               } else if (perfect_us < data->expected_us) {
-                       /*
-                        * The next timer is long. This could be because
-                        * we did not make a useful prediction.
-                        * In that case, it makes sense to re-enter
-                        * into a deeper C-state after some time.
-                        */
-                       RCU_NONIDLE(hrtimer_start(hrtmr,
-                               ns_to_ktime(1000 * timer_us),
-                               HRTIMER_MODE_REL_PINNED));
-                       /* In general case, menu hrtimer is started */
-                       per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_GENERAL;
                }
-
        }

        return data->last_state_idx;

From 148519120c6d1f19ad53349683aeae9f228b0b8d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Sat, 27 Jul 2013 01:41:34 +0200
Subject: [PATCH 2/5] Revert "cpuidle: Quickly notice prediction failure for repeat mode"

Revert commit 69a37bea (cpuidle: Quickly notice prediction failure for
repeat mode), because it has been identified as the source of a
significant performance regression in v3.8 and later as explained by
Jeremy Eder:

We believe we've identified a particular commit to the cpuidle code
that seems to be impacting performance of variety of workloads.
The simplest way to reproduce is using netperf TCP_RR test, so
we're using that, on a pair of Sandy Bridge based servers.  We also
have data from a large database setup where performance is also
measurably/positively impacted, though that test data isn't easily
share-able.

Included below are test results from 3 test kernels:

kernel       reverts
-----------------------------------------------------------
1) vanilla   upstream (no reverts)
2) perfteam2 reverts e11538d1f03914eb92af5a1a378375c05ae8520c
3) test      reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4
                     e11538d1f03914eb92af5a1a378375c05ae8520c

In summary, netperf TCP_RR numbers improve by approximately 4% after
reverting 69a37beabf1f0a6705c08e879bdd5d82ff6486c4.  When
69a37beabf1f0a6705c08e879bdd5d82ff6486c4 is included, C0 residency
never seems to get above 40%.  Taking that patch out gets C0 near
100% quite often, and performance increases.

The below data are histograms representing the %c0 residency @
1-second sample rates (using turbostat), while under netperf test.

- If you look at the first 4 histograms, you can see %c0 residency
  almost entirely in the 30,40% bin.
- The last pair, which reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4,
  shows %c0 in the 80,90,100% bins.

Below each kernel name are netperf TCP_RR trans/s numbers for the
particular kernel that can be disclosed publicly, comparing the 3
test kernels.

We ran a 4th test with the vanilla kernel where we've also set
/dev/cpu_dma_latency=0 to show overall impact boosting single-threaded
TCP_RR performance over 11% above baseline.

3.10-rc2 vanilla RX + c0 lock (/dev/cpu_dma_latency=0):
TCP_RR trans/s 54323.78

-----------------------------------------------------------
3.10-rc2 vanilla RX (no reverts)
TCP_RR trans/s 48192.47

Receiver %c0
 0.0000 -  10.0000 [  1]: *
10.0000 -  20.0000 [  0]:
20.0000 -  30.0000 [  0]:
30.0000 -  40.0000 [ 59]: ***********************************************************
40.0000 -  50.0000 [  1]: *
50.0000 -  60.0000 [  0]:
60.0000 -  70.0000 [  0]:
70.0000 -  80.0000 [  0]:
80.0000 -  90.0000 [  0]:
90.0000 - 100.0000 [  0]:

Sender %c0
 0.0000 -  10.0000 [  1]: *
10.0000 -  20.0000 [  0]:
20.0000 -  30.0000 [  0]:
30.0000 -  40.0000 [ 11]: ***********
40.0000 -  50.0000 [ 49]: *************************************************
50.0000 -  60.0000 [  0]:
60.0000 -  70.0000 [  0]:
70.0000 -  80.0000 [  0]:
80.0000 -  90.0000 [  0]:
90.0000 - 100.0000 [  0]:

-----------------------------------------------------------
3.10-rc2 perfteam2 RX (reverts commit e11538d1f03914eb92af5a1a378375c05ae8520c)
TCP_RR trans/s 49698.69

Receiver %c0
 0.0000 -  10.0000 [  1]: *
10.0000 -  20.0000 [  1]: *
20.0000 -  30.0000 [  0]:
30.0000 -  40.0000 [ 59]: ***********************************************************
40.0000 -  50.0000 [  0]:
50.0000 -  60.0000 [  0]:
60.0000 -  70.0000 [  0]:
70.0000 -  80.0000 [  0]:
80.0000 -  90.0000 [  0]:
90.0000 - 100.0000 [  0]:

Sender %c0
 0.0000 -  10.0000 [  1]: *
10.0000 -  20.0000 [  0]:
20.0000 -  30.0000 [  0]:
30.0000 -  40.0000 [  2]: **
40.0000 -  50.0000 [ 58]: **********************************************************
50.0000 -  60.0000 [  0]:
60.0000 -  70.0000 [  0]:
70.0000 -  80.0000 [  0]:
80.0000 -  90.0000 [  0]:
90.0000 - 100.0000 [  0]:

-----------------------------------------------------------
3.10-rc2 test RX (reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 and
e11538d1f03914eb92af5a1a378375c05ae8520c)
TCP_RR trans/s 47766.95

Receiver %c0
 0.0000 -  10.0000 [  1]: *
10.0000 -  20.0000 [  1]: *
20.0000 -  30.0000 [  0]:
30.0000 -  40.0000 [ 27]: ***************************
40.0000 -  50.0000 [  2]: **
50.0000 -  60.0000 [  0]:
60.0000 -  70.0000 [  2]: **
70.0000 -  80.0000 [  0]:
80.0000 -  90.0000 [  0]:
90.0000 - 100.0000 [ 28]: ****************************

Sender:
 0.0000 -  10.0000 [  1]: *
10.0000 -  20.0000 [  0]:
20.0000 -  30.0000 [  0]:
30.0000 -  40.0000 [ 11]: ***********
40.0000 -  50.0000 [  0]:
50.0000 -  60.0000 [  1]: *
60.0000 -  70.0000 [  0]:
70.0000 -  80.0000 [  3]: ***
80.0000 -  90.0000 [  7]: *******
90.0000 - 100.0000 [ 38]: **************************************

These results demonstrate gaining back the tendency of the CPU to stay
in more responsive, performant C-states (and thus yield measurably
better performance), by reverting commit 69a37beabf1f0a6705c08e879bdd5d82ff6486c4.

Requested-by: Jeremy Eder
Tested-by: Len Brown
Cc: 3.8+
Signed-off-by: Rafael J. Wysocki
---
 drivers/cpuidle/governors/menu.c | 73 ++-------------------------------
 include/linux/tick.h             |  6 ---
 kernel/time/tick-sched.c         |  9 +---
 3 files changed, 6 insertions(+), 82 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b69a87e22155..bc580b67a652 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -28,13 +28,6 @@
 #define MAX_INTERESTING 50000
 #define STDDEV_THRESH 400

-/* 60 * 60 > STDDEV_THRESH * INTERVALS = 400 * 8 */
-#define MAX_DEVIATION 60
-
-static DEFINE_PER_CPU(struct hrtimer, menu_hrtimer);
-static DEFINE_PER_CPU(int, hrtimer_status);
-/* menu hrtimer mode */
-enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT};

 /*
  * Concepts and ideas behind the menu governor
@@ -198,42 +191,17 @@ static u64 div_round64(u64 dividend, u32 divisor)
        return div_u64(dividend + (divisor / 2), divisor);
 }

-/* Cancel the hrtimer if it is not triggered yet */
-void menu_hrtimer_cancel(void)
-{
-       int cpu = smp_processor_id();
-       struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
-
-       /* The timer is still not time out*/
-       if (per_cpu(hrtimer_status, cpu)) {
-               hrtimer_cancel(hrtmr);
-               per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
-       }
-}
-EXPORT_SYMBOL_GPL(menu_hrtimer_cancel);
-
-/* Call back for hrtimer is triggered */
-static enum hrtimer_restart menu_hrtimer_notify(struct hrtimer *hrtimer)
-{
-       int cpu = smp_processor_id();
-
-       per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
-
-       return HRTIMER_NORESTART;
-}
-
 /*
  * Try detecting repeating patterns by keeping track of the last 8
  * intervals, and checking if the standard deviation of that set
  * of points is below a threshold. If it is... then use the
  * average of these 8 points as the estimated value.
  */
-static u32 get_typical_interval(struct menu_device *data)
+static void get_typical_interval(struct menu_device *data)
 {
        int i = 0, divisor = 0;
        uint64_t max = 0, avg = 0, stddev = 0;
        int64_t thresh = LLONG_MAX; /* Discard outliers above this value. */
-       unsigned int ret = 0;

 again:

@@ -274,16 +242,13 @@ again:
        if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
                                                        || stddev <= 20) {
                data->predicted_us = avg;
-               ret = 1;
-               return ret;
+               return;

        } else if ((divisor * 4) > INTERVALS * 3) {
                /* Exclude the max interval */
                thresh = max - 1;
                goto again;
        }
-
-       return ret;
 }

 /**
@@ -298,9 +263,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
        int i;
        int multiplier;
        struct timespec t;
-       int repeat = 0, low_predicted = 0;
-       int cpu = smp_processor_id();
-       struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);

        if (data->needs_update) {
                menu_update(drv, dev);
@@ -335,7 +297,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
        data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket],
                                         RESOLUTION * DECAY);

-       repeat = get_typical_interval(data);
+       get_typical_interval(data);

        /*
         * We want to default to C1 (hlt), not to busy polling
@@ -356,10 +318,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
                if (s->disabled || su->disable)
                        continue;
-               if (s->target_residency > data->predicted_us) {
-                       low_predicted = 1;
+               if (s->target_residency > data->predicted_us)
                        continue;
-               }
                if (s->exit_latency > latency_req)
                        continue;
                if (s->exit_latency * multiplier > data->predicted_us)
@@ -369,28 +329,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
                data->exit_us = s->exit_latency;
        }

-       /* not deepest C-state chosen for low predicted residency */
-       if (low_predicted) {
-               unsigned int timer_us = 0;
-
-               /*
-                * Set a timer to detect whether this sleep is much
-                * longer than repeat mode predicted. If the timer
-                * triggers, the code will evaluate whether to put
-                * the CPU into a deeper C-state.
-                * The timer is cancelled on CPU wakeup.
-                */
-               timer_us = 2 * (data->predicted_us + MAX_DEVIATION);
-
-               if (repeat && (4 * timer_us < data->expected_us)) {
-                       RCU_NONIDLE(hrtimer_start(hrtmr,
-                               ns_to_ktime(1000 * timer_us),
-                               HRTIMER_MODE_REL_PINNED));
-                       /* In repeat case, menu hrtimer is started */
-                       per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT;
-               }
-       }
-
        return data->last_state_idx;
 }

@@ -481,9 +419,6 @@ static int menu_enable_device(struct cpuidle_driver *drv,
                                struct cpuidle_device *dev)
 {
        struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
-       struct hrtimer *t = &per_cpu(menu_hrtimer, dev->cpu);
-       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       t->function = menu_hrtimer_notify;

        memset(data, 0, sizeof(struct menu_device));

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 9180f4b85e6d..62bd8b72873c 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -174,10 +174,4 @@ static inline void tick_nohz_task_switch(struct task_struct *tsk) { }
 #endif

-# ifdef CONFIG_CPU_IDLE_GOV_MENU
-extern void menu_hrtimer_cancel(void);
-# else
-static inline void menu_hrtimer_cancel(void) {}
-# endif /* CONFIG_CPU_IDLE_GOV_MENU */
-
 #endif

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e80183f4a6c4..e77edc97e036 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -827,13 +827,10 @@ void tick_nohz_irq_exit(void)
 {
        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

-       if (ts->inidle) {
-               /* Cancel the timer because CPU already waken up from the C-states*/
-               menu_hrtimer_cancel();
+       if (ts->inidle)
                __tick_nohz_idle_enter(ts);
-       } else {
+       else
                tick_nohz_full_stop_tick(ts);
-       }
 }

 /**
@@ -931,8 +928,6 @@ void tick_nohz_idle_exit(void)

        ts->inidle = 0;

-       /* Cancel the timer because CPU already waken up from the C-states*/
-       menu_hrtimer_cancel();
        if (ts->idle_active || ts->tick_stopped)
                now = ktime_get();

From 2a99859932281ed6c2ecdd988855f8f6838f6743 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Tue, 30 Jul 2013 00:32:00 +0200
Subject: [PATCH 3/5] cpufreq: Fix cpufreq driver module refcount balance after suspend/resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since cpufreq_cpu_put() called by __cpufreq_remove_dev() drops the
driver module refcount, __cpufreq_remove_dev() causes that refcount
to become negative for the cpufreq driver after a suspend/resume
cycle.

This is not the only bad thing that happens there, however, because
kobject_put() should only be called for the policy kobject at this
point if the CPU is not the last one for that policy.

Namely, if the given CPU is the last one for that policy, the policy
kobject's refcount should be 1 at this point, as set by
cpufreq_add_dev_interface(), and only needs to be dropped once for the
kobject to go away.  This actually happens under the cpus == 1 check,
so it need not be done before by cpufreq_cpu_put().

On the other hand, if the given CPU is not the last one for that
policy, this means that cpufreq_add_policy_cpu() has been called at
least once for that policy and cpufreq_cpu_get() has been called for
it too.  To balance that cpufreq_cpu_get(), we need to call
cpufreq_cpu_put() in that case.

Thus, to fix the described problem and keep the reference counters
balanced in both cases, move the cpufreq_cpu_put() call in
__cpufreq_remove_dev() to the code path executed only for CPUs that
share the policy with other CPUs.

Reported-and-tested-by: Toralf Förster
Signed-off-by: Rafael J. Wysocki
Reviewed-by: Srivatsa S. Bhat
Cc: 3.10+
---
 drivers/cpufreq/cpufreq.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index a4ad7339588d..f0a5e2b0eb8a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1177,14 +1177,11 @@ static int __cpufreq_remove_dev(struct device *dev,
                        __func__, cpu_dev->id, cpu);
        }

-       if ((cpus == 1) && (cpufreq_driver->target))
-               __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
-
-       pr_debug("%s: removing link, cpu: %d\n", __func__, cpu);
-       cpufreq_cpu_put(data);
-
        /* If cpu is last user of policy, free policy */
        if (cpus == 1) {
+               if (cpufreq_driver->target)
+                       __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
+
                lock_policy_rwsem_read(cpu);
                kobj = &data->kobj;
                cmp = &data->kobj_unregister;
@@ -1205,9 +1202,13 @@ static int __cpufreq_remove_dev(struct device *dev,
                free_cpumask_var(data->related_cpus);
                free_cpumask_var(data->cpus);
                kfree(data);
-       } else if (cpufreq_driver->target) {
-               __cpufreq_governor(data, CPUFREQ_GOV_START);
-               __cpufreq_governor(data, CPUFREQ_GOV_LIMITS);
+       } else {
+               pr_debug("%s: removing link, cpu: %d\n", __func__, cpu);
+               cpufreq_cpu_put(data);
+               if (cpufreq_driver->target) {
+                       __cpufreq_governor(data, CPUFREQ_GOV_START);
+                       __cpufreq_governor(data, CPUFREQ_GOV_LIMITS);
+               }
        }

        per_cpu(cpufreq_policy_cpu, cpu) = -1;

From 016d5baad04269e8559332df05f89bd95b52d6ad Mon Sep 17 00:00:00 2001
From: Lan Tianyu
Date: Tue, 30 Jul 2013 14:00:42 +0200
Subject: [PATCH 4/5] ACPI / battery: Fix parsing _BIX return value

The _BIX method returns extended battery info as a package.
According to the ACPI spec (ACPI 5, Section 10.2.2.2), the first
member of that package should be "Revision".  However, the current
ACPI battery driver treats the first member as "Power Unit", which
should be the second member.  This causes the result of _BIX return
data parsing to be incorrect.

Fix this by adding a new member called 'revision' to struct
acpi_battery and adding the offsetof() information on it to
extended_info_offsets[] as the first row.

[rjw: Changelog]
Reported-and-tested-by: Jan Hoffmann
References: http://bugzilla.kernel.org/show_bug.cgi?id=60519
Signed-off-by: Lan Tianyu
Cc: 2.6.34+
Signed-off-by: Rafael J. Wysocki
---
 drivers/acpi/battery.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index 082b4dd252a8..d405fbad406a 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -117,6 +117,7 @@ struct acpi_battery {
        struct acpi_device *device;
        struct notifier_block pm_nb;
        unsigned long update_time;
+       int revision;
        int rate_now;
        int capacity_now;
        int voltage_now;
@@ -359,6 +360,7 @@ static struct acpi_offsets info_offsets[] = {
 };

 static struct acpi_offsets extended_info_offsets[] = {
+       {offsetof(struct acpi_battery, revision), 0},
        {offsetof(struct acpi_battery, power_unit), 0},
        {offsetof(struct acpi_battery, design_capacity), 0},
        {offsetof(struct acpi_battery, full_charge_capacity), 0},

From 2b44c4db2e2f1765d35163a861d301038e0c8a75 Mon Sep 17 00:00:00 2001
From: Colin Cross
Date: Wed, 24 Jul 2013 17:41:33 -0700
Subject: [PATCH 5/5] freezer: set PF_SUSPEND_TASK flag on tasks that call freeze_processes

Calling freeze_processes sets a global flag that will cause any
process that calls try_to_freeze to enter the refrigerator.  It skips
sending a signal to the current task, but if the current task ever
hits try_to_freeze, all threads will be frozen and the system will
deadlock.

Set a new flag, PF_SUSPEND_TASK, on the task that calls
freeze_processes.  The flag notifies the freezer that the thread is
involved in suspend and should not be frozen.

Also add a WARN_ON in thaw_processes if the caller does not have the
PF_SUSPEND_TASK flag set, to catch if a different task calls
thaw_processes than the one that called freeze_processes, leaving a
task with PF_SUSPEND_TASK permanently set on it.

Threads that spawn off a task with PF_SUSPEND_TASK set (which swsusp
does) will also have PF_SUSPEND_TASK set, preventing them from
freezing while they are helping with suspend, but they need to be dead
by the time suspend is triggered, otherwise they may run when
userspace is expected to be frozen.  Add a WARN_ON in thaw_processes
if more than one thread has the PF_SUSPEND_TASK flag set.

Reported-and-tested-by: Michael Leun
Signed-off-by: Colin Cross
Signed-off-by: Rafael J. Wysocki
---
 include/linux/sched.h  |  1 +
 kernel/freezer.c       |  2 +-
 kernel/power/process.c | 11 +++++++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50d04b92ceda..d722490da030 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1628,6 +1628,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_MEMPOLICY    0x10000000      /* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER 0x20000000      /* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP 0x40000000      /* Freezer should not count it as freezable */
+#define PF_SUSPEND_TASK 0x80000000      /* this thread called freeze_processes and should not be frozen */

 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 8b2afc1c9df0..b462fa197517 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -33,7 +33,7 @@ static DEFINE_SPINLOCK(freezer_lock);
  */
 bool freezing_slow_path(struct task_struct *p)
 {
-       if (p->flags & PF_NOFREEZE)
+       if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
                return false;

        if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/power/process.c b/kernel/power/process.c
index fc0df8486449..06ec8869dbf1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -109,6 +109,8 @@ static int try_to_freeze_tasks(bool user_only)

 /**
  * freeze_processes - Signal user space processes to enter the refrigerator.
+ * The current thread will not be frozen. The same process that calls
+ * freeze_processes must later call thaw_processes.
  *
  * On success, returns 0. On failure, -errno and system is fully thawed.
  */
@@ -120,6 +122,9 @@ int freeze_processes(void)
        if (error)
                return error;

+       /* Make sure this task doesn't get frozen */
+       current->flags |= PF_SUSPEND_TASK;
+
        if (!pm_freezing)
                atomic_inc(&system_freezing_cnt);

@@ -168,6 +173,7 @@ int freeze_kernel_threads(void)
 void thaw_processes(void)
 {
        struct task_struct *g, *p;
+       struct task_struct *curr = current;

        if (pm_freezing)
                atomic_dec(&system_freezing_cnt);
@@ -182,10 +188,15 @@ void thaw_processes(void)

        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
+               /* No other threads should have PF_SUSPEND_TASK set */
+               WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
                __thaw_task(p);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);

+       WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
+       curr->flags &= ~PF_SUSPEND_TASK;
+
        usermodehelper_enable();

        schedule();