Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu

Pull RCU updates from Paul E. McKenney:

  - Miscellaneous fixes. (Paul E. McKenney, Boqun Feng, Oleg Nesterov, Patrick Marlier)

  - Improvements to expedited grace periods. (Paul E. McKenney)

  - Performance improvements to, and locktorture tests for, percpu-rwsem.
    (Oleg Nesterov, Paul E. McKenney)

  - Torture-test changes. (Paul E. McKenney, Davidlohr Bueso)

  - Documentation updates. (Paul E. McKenney)

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar 2015-10-19 10:09:54 +02:00
commit c13dc31adb
41 changed files with 1331 additions and 543 deletions

View File

@ -205,6 +205,13 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
	behavior, you might need to replace some of the cond_resched()
	calls with calls to cond_resched_rcu_qs().
o Booting Linux using a console connection that is too slow to
keep up with the boot-time console-message rate. For example,
a 115Kbaud serial console can be -way- too slow to keep up
with boot-time message rates, and will frequently result in
RCU CPU stall warning messages. Especially if you have added
debug printk()s.
o	Anything that prevents RCU's grace-period kthreads from running.
	This can result in the "All QSes seen" console-log message.
	This message will include information on when the kthread last

View File

@ -166,40 +166,27 @@ test_no_idle_hz Whether or not to test the ability of RCU to operate in
torture_type	The type of RCU to test, with string values as follows:
-		"rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu().
-		"rcu_sync": rcu_read_lock(), rcu_read_unlock(), and
-			synchronize_rcu().
-		"rcu_expedited": rcu_read_lock(), rcu_read_unlock(), and
-			synchronize_rcu_expedited().
+		"rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(),
+			along with expedited, synchronous, and polling
+			variants.
		"rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and
-			call_rcu_bh().
-		"rcu_bh_sync": rcu_read_lock_bh(), rcu_read_unlock_bh(),
-			and synchronize_rcu_bh().
-		"rcu_bh_expedited": rcu_read_lock_bh(), rcu_read_unlock_bh(),
-			and synchronize_rcu_bh_expedited().
+			call_rcu_bh(), along with expedited and synchronous
+			variants.
+		"rcu_busted": This tests an intentionally incorrect version
+			of RCU in order to help test rcutorture itself.
		"srcu": srcu_read_lock(), srcu_read_unlock() and
-			call_srcu().
-		"srcu_sync": srcu_read_lock(), srcu_read_unlock() and
-			synchronize_srcu().
-		"srcu_expedited": srcu_read_lock(), srcu_read_unlock() and
-			synchronize_srcu_expedited().
+			call_srcu(), along with expedited and
+			synchronous variants.
		"sched": preempt_disable(), preempt_enable(), and
-			call_rcu_sched().
-		"sched_sync": preempt_disable(), preempt_enable(), and
-			synchronize_sched().
-		"sched_expedited": preempt_disable(), preempt_enable(), and
-			synchronize_sched_expedited().
+			call_rcu_sched(), along with expedited,
+			synchronous, and polling variants.
+		"tasks": voluntary context switch and call_rcu_tasks(),
+			along with expedited and synchronous variants.
		Defaults to "rcu".

View File

@ -56,14 +56,14 @@ rcuboost:
The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
-  0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
-  1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
-  2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
-  3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
-  4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
-  5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
-  6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
-  7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
+  0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
+  1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
+  2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
+  3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
+  4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
+  5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
+  6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
+  7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
This file has one line per CPU, or eight for this 8-CPU system.
The fields are as follows:
@ -188,14 +188,14 @@ o "ca" is the number of RCU callbacks that have been adopted by this
Kernels compiled with CONFIG_RCU_BOOST=y display the following from
/debug/rcu/rcu_preempt/rcudata:
-  0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
-  1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
-  2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
-  3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
-  4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
-  5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
-  6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
-  7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
+  0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
+  1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
+  2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
+  3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
+  4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
+  5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
+  6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
+  7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
This is similar to the output discussed above, but contains the following
additional fields:

View File

@ -364,7 +364,7 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
	};
	DEFINE_SPINLOCK(foo_mutex);
-	struct foo *gbl_foo;
+	struct foo __rcu *gbl_foo;
	/*
	 * Create a new struct foo that is the same as the one currently
@ -386,7 +386,7 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
		new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
		spin_lock(&foo_mutex);
-		old_fp = gbl_foo;
+		old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
		*new_fp = *old_fp;
		new_fp->a = new_a;
		rcu_assign_pointer(gbl_foo, new_fp);
@ -487,7 +487,7 @@ The foo_update_a() function might then be written as follows:
		new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
		spin_lock(&foo_mutex);
-		old_fp = gbl_foo;
+		old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
		*new_fp = *old_fp;
		new_fp->a = new_a;
		rcu_assign_pointer(gbl_foo, new_fp);
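For orientation, the matching reader side (essentially whatisRCU.txt's foo_get_a(); shown here only as a sketch, it is not part of this hunk) dereferences the same pointer under rcu_read_lock():

	int foo_get_a(void)
	{
		int retval;

		rcu_read_lock();
		retval = rcu_dereference(gbl_foo)->a;
		rcu_read_unlock();
		return retval;
	}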

View File

@ -3074,9 +3074,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
			cache-to-cache transfer latencies.
	rcutree.rcu_fanout_leaf= [KNL]
-			Increase the number of CPUs assigned to each
-			leaf rcu_node structure.  Useful for very large
-			systems.
+			Change the number of CPUs assigned to each
+			leaf rcu_node structure.  Useful for very
+			large systems, which will choose the value 64,
+			and for NUMA systems with large remote-access
+			latencies, which will choose a value aligned
+			with the appropriate hardware boundaries.
	rcutree.jiffies_till_sched_qs= [KNL]
			Set required age in jiffies for a

View File

@ -52,6 +52,9 @@ torture_type Type of lock to torture. By default, only spinlocks will
o "mutex_lock": mutex_lock() and mutex_unlock() pairs. o "mutex_lock": mutex_lock() and mutex_unlock() pairs.
o "rtmutex_lock": rtmutex_lock() and rtmutex_unlock()
pairs. Kernel must have CONFIG_RT_MUTEX=y.
o "rwsem_lock": read/write down() and up() semaphore pairs. o "rwsem_lock": read/write down() and up() semaphore pairs.
torture_runnable Start locktorture at boot time in the case where the torture_runnable Start locktorture at boot time in the case where the

View File

@ -1710,6 +1710,17 @@ There are some more advanced barrier functions:
operations" subsection for information on where to use these. operations" subsection for information on where to use these.
(*) lockless_dereference();
This can be thought of as a pointer-fetch wrapper around the
smp_read_barrier_depends() data-dependency barrier.
This is also similar to rcu_dereference(), but in cases where
object lifetime is handled by some mechanism other than RCU, for
     example, when the objects are removed only when the system goes down.
In addition, lockless_dereference() is used in some data structures
that can be used both with and without RCU.
 (*) dma_wmb();
 (*) dma_rmb();
@ -1789,7 +1800,6 @@ The Linux kernel has a number of locking constructs:
 (*) mutexes
 (*) semaphores
 (*) R/W semaphores
-(*) RCU
In all cases there are variants on "ACQUIRE" operations and "RELEASE" operations
for each construct. These operations all imply certain barriers:
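A minimal sketch of how lockless_dereference() is meant to be used; the config structure and helper names below are invented for illustration, only lockless_dereference() and smp_store_release() are real kernel primitives:

	struct cfg { int threshold; };
	struct cfg *global_cfg;		/* written rarely, read locklessly */

	void publish_cfg(struct cfg *new)
	{
		/* Pairs with the data-dependency barrier in the reader. */
		smp_store_release(&global_cfg, new);
	}

	int read_threshold(void)
	{
		/* Dependency ordering: the reader sees the initialized *cfg. */
		struct cfg *cfg = lockless_dereference(global_cfg);

		return cfg->threshold;
	}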

View File

@ -228,7 +228,6 @@ extern struct bus_type cpu_subsys;
extern void cpu_hotplug_begin(void);
extern void cpu_hotplug_done(void);
extern void get_online_cpus(void);
-extern bool try_get_online_cpus(void);
extern void put_online_cpus(void);
extern void cpu_hotplug_disable(void);
extern void cpu_hotplug_enable(void);
@ -246,7 +245,6 @@ int cpu_down(unsigned int cpu);
static inline void cpu_hotplug_begin(void) {}
static inline void cpu_hotplug_done(void) {}
#define get_online_cpus()	do { } while (0)
-#define try_get_online_cpus()	true
#define put_online_cpus()	do { } while (0)
#define cpu_hotplug_disable()	do { } while (0)
#define cpu_hotplug_enable()	do { } while (0)

View File

@ -87,7 +87,7 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head)
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
	next->prev = prev;
-	prev->next = next;
+	WRITE_ONCE(prev->next, next);
}
/**
@ -615,7 +615,8 @@ static inline void __hlist_del(struct hlist_node *n)
{
	struct hlist_node *next = n->next;
	struct hlist_node **pprev = n->pprev;
-	*pprev = next;
+	WRITE_ONCE(*pprev, next);
	if (next)
		next->pprev = pprev;
}
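The point of switching these stores to WRITE_ONCE() is that list deletion can race with lockless readers (RCU list walkers, list_empty()-style checks); WRITE_ONCE()/READ_ONCE() keep the compiler from tearing or re-issuing the pointer update. A toy illustration (the helper name is made up here):

	/* Lockless peek at a list that a concurrent __list_del() may modify. */
	static inline bool sketch_list_has_entries(struct list_head *head)
	{
		return READ_ONCE(head->next) != head;
	}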

View File

@ -93,9 +93,10 @@ static inline void __hlist_bl_del(struct hlist_bl_node *n)
	LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
	/* pprev may be `first`, so be careful not to lose the lock bit */
-	*pprev = (struct hlist_bl_node *)
-			((unsigned long)next |
-			 ((unsigned long)*pprev & LIST_BL_LOCKMASK));
+	WRITE_ONCE(*pprev,
+		   (struct hlist_bl_node *)
+			((unsigned long)next |
+			 ((unsigned long)*pprev & LIST_BL_LOCKMASK)));
	if (next)
		next->pprev = pprev;
}

View File

@ -76,7 +76,8 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
{
	struct hlist_nulls_node *next = n->next;
	struct hlist_nulls_node **pprev = n->pprev;
-	*pprev = next;
+	WRITE_ONCE(*pprev, next);
	if (!is_a_nulls(next))
		next->pprev = pprev;
}

View File

@ -5,11 +5,12 @@
#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/wait.h>
+#include <linux/rcu_sync.h>
#include <linux/lockdep.h>
struct percpu_rw_semaphore {
+	struct rcu_sync		rss;
	unsigned int __percpu	*fast_read_ctr;
-	atomic_t		write_ctr;
	struct rw_semaphore	rw_sem;
	atomic_t		slow_read_ctr;
	wait_queue_head_t	write_waitq;
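For context, a sketch of how a percpu_rw_semaphore is used (the caller code is assumed for illustration; the init/down/up functions are the real API): readers normally take only a cheap per-CPU fast path, and percpu_down_write() now uses the rcu_sync machinery added in this series to push them onto ->rw_sem first.

	static struct percpu_rw_semaphore my_sem;	/* percpu_init_rwsem(&my_sem) at init time */

	static void reader(void)
	{
		percpu_down_read(&my_sem);
		/* read-side critical section */
		percpu_up_read(&my_sem);
	}

	static void writer(void)
	{
		percpu_down_write(&my_sem);	/* waits for readers, including a grace period */
		/* write-side critical section */
		percpu_up_write(&my_sem);
	}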

include/linux/rcu_sync.h (new file, 86 lines)
View File

@ -0,0 +1,86 @@
/*
* RCU-based infrastructure for lightweight reader-writer locking
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, you can access it online at
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright (c) 2015, Red Hat, Inc.
*
* Author: Oleg Nesterov <oleg@redhat.com>
*/
#ifndef _LINUX_RCU_SYNC_H_
#define _LINUX_RCU_SYNC_H_
#include <linux/wait.h>
#include <linux/rcupdate.h>
enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
/* Structure to mediate between updaters and fastpath-using readers. */
struct rcu_sync {
int gp_state;
int gp_count;
wait_queue_head_t gp_wait;
int cb_state;
struct rcu_head cb_head;
enum rcu_sync_type gp_type;
};
extern void rcu_sync_lockdep_assert(struct rcu_sync *);
/**
* rcu_sync_is_idle() - Are readers permitted to use their fastpaths?
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
* Returns true if readers are permitted to use their fastpaths.
* Must be invoked within an RCU read-side critical section whose
 * flavor matches that of the rcu_sync structure.
*/
static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
{
#ifdef CONFIG_PROVE_RCU
rcu_sync_lockdep_assert(rsp);
#endif
return !rsp->gp_state; /* GP_IDLE */
}
extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
extern void rcu_sync_enter(struct rcu_sync *);
extern void rcu_sync_exit(struct rcu_sync *);
extern void rcu_sync_dtor(struct rcu_sync *);
#define __RCU_SYNC_INITIALIZER(name, type) { \
.gp_state = 0, \
.gp_count = 0, \
.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
.cb_state = 0, \
.gp_type = type, \
}
#define __DEFINE_RCU_SYNC(name, type) \
	struct rcu_sync name = __RCU_SYNC_INITIALIZER(name, type)
#define DEFINE_RCU_SYNC(name) \
__DEFINE_RCU_SYNC(name, RCU_SYNC)
#define DEFINE_RCU_SCHED_SYNC(name) \
__DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
#define DEFINE_RCU_BH_SYNC(name) \
__DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
#endif /* _LINUX_RCU_SYNC_H_ */
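Roughly how this API is intended to be used (a sketch with made-up caller code; percpu-rwsem further down is the real first user). The reader's RCU flavor must match the one passed to rcu_sync_init(), here RCU-sched:

	static struct rcu_sync my_rss;	/* rcu_sync_init(&my_rss, RCU_SCHED_SYNC) at init time */

	static void reader(void)
	{
		rcu_read_lock_sched();
		if (rcu_sync_is_idle(&my_rss)) {
			/* fast path: no writer is active or pending */
		} else {
			/* slow path: coordinate with the writer explicitly */
		}
		rcu_read_unlock_sched();
	}

	static void writer(void)
	{
		rcu_sync_enter(&my_rss);	/* forces readers onto the slow path */
		/* ... perform the update ... */
		rcu_sync_exit(&my_rss);		/* fast path re-enabled after a later grace period */
	}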

View File

@ -247,10 +247,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_entry_rcu(ptr, type, member) \
-({ \
-	typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \
-	container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \
-})
+	container_of(lockless_dereference(ptr), type, member)
/**
 * Where are list_empty_rcu() and list_first_entry_rcu()?

View File

@ -160,7 +160,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
 * more than one CPU).
 */
void call_rcu(struct rcu_head *head,
-	      void (*func)(struct rcu_head *head));
+	      rcu_callback_t func);
#else /* #ifdef CONFIG_PREEMPT_RCU */
@ -191,7 +191,7 @@ void call_rcu(struct rcu_head *head,
 * memory ordering guarantees.
 */
void call_rcu_bh(struct rcu_head *head,
-		 void (*func)(struct rcu_head *head));
+		 rcu_callback_t func);
/**
 * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
@ -213,7 +213,7 @@ void call_rcu_bh(struct rcu_head *head,
 * memory ordering guarantees.
 */
void call_rcu_sched(struct rcu_head *head,
-		    void (*func)(struct rcu_head *rcu));
+		    rcu_callback_t func);
void synchronize_sched(void);
@ -274,7 +274,7 @@ do { \
 * See the description of call_rcu() for more detailed information on
 * memory ordering guarantees.
 */
-void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head));
+void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
void rcu_barrier_tasks(void);
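For reference, the rcu_callback_t used in these prototypes (and the call_rcu_func_t that rcutorture switches to further down) are plain function-pointer typedefs; as far as I can tell they read as follows, in include/linux/types.h:

	typedef void (*rcu_callback_t)(struct rcu_head *head);
	typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);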
@ -297,12 +297,14 @@ void synchronize_rcu(void);
static inline void __rcu_read_lock(void)
{
-	preempt_disable();
+	if (IS_ENABLED(CONFIG_PREEMPT_COUNT))
+		preempt_disable();
}
static inline void __rcu_read_unlock(void)
{
-	preempt_enable();
+	if (IS_ENABLED(CONFIG_PREEMPT_COUNT))
+		preempt_enable();
}
static inline void synchronize_rcu(void)
@ -535,28 +537,8 @@ static inline int rcu_read_lock_sched_held(void)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/* Deprecate rcu_lockdep_assert(): Use RCU_LOCKDEP_WARN() instead. */
static inline void __attribute((deprecated)) deprecate_rcu_lockdep_assert(void)
{
}
#ifdef CONFIG_PROVE_RCU
/**
* rcu_lockdep_assert - emit lockdep splat if specified condition not met
* @c: condition to check
* @s: informative message
*/
#define rcu_lockdep_assert(c, s) \
do { \
static bool __section(.data.unlikely) __warned; \
deprecate_rcu_lockdep_assert(); \
if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \
__warned = true; \
lockdep_rcu_suspicious(__FILE__, __LINE__, s); \
} \
} while (0)
/**
 * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
 * @c: condition to check
@ -594,7 +576,6 @@ static inline void rcu_preempt_sleep_check(void)
#else /* #ifdef CONFIG_PROVE_RCU */
-#define rcu_lockdep_assert(c, s) deprecate_rcu_lockdep_assert()
#define RCU_LOCKDEP_WARN(c, s) do { } while (0)
#define rcu_sleep_check() do { } while (0)
@ -810,6 +791,28 @@ static inline void rcu_preempt_sleep_check(void)
 */
#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)
/**
* rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism
* @p: The pointer to hand off
*
* This is simply an identity function, but it documents where a pointer
* is handed off from RCU to some other synchronization mechanism, for
* example, reference counting or locking. In C11, it would map to
* kill_dependency(). It could be used as follows:
*
* rcu_read_lock();
* p = rcu_dereference(gp);
* long_lived = is_long_lived(p);
* if (long_lived) {
* if (!atomic_inc_not_zero(p->refcnt))
* long_lived = false;
* else
* p = rcu_pointer_handoff(p);
* }
* rcu_read_unlock();
*/
#define rcu_pointer_handoff(p) (p)
/**
 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
 *
@ -1065,7 +1068,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
#define __kfree_rcu(head, offset) \
	do { \
		BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \
-		kfree_call_rcu(head, (void (*)(struct rcu_head *))(unsigned long)(offset)); \
+		kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
	} while (0)
/**

View File

@ -83,7 +83,7 @@ static inline void synchronize_sched_expedited(void)
}
static inline void kfree_call_rcu(struct rcu_head *head,
-				  void (*func)(struct rcu_head *rcu))
+				  rcu_callback_t func)
{
	call_rcu(head, func);
}
@ -216,6 +216,7 @@ static inline bool rcu_is_watching(void)
static inline void rcu_all_qs(void)
{
+	barrier(); /* Avoid RCU read-side critical sections leaking across. */
}
#endif /* __LINUX_RCUTINY_H */ #endif /* __LINUX_RCUTINY_H */

View File

@ -48,7 +48,7 @@ void synchronize_rcu_bh(void);
void synchronize_sched_expedited(void);
void synchronize_rcu_expedited(void);
-void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
/**
 * synchronize_rcu_bh_expedited - Brute-force RCU-bh grace period

View File

@ -1342,10 +1342,12 @@ struct sched_dl_entity {
union rcu_special {
	struct {
-		bool blocked;
-		bool need_qs;
-	} b;
-	short s;
+		u8 blocked;
+		u8 need_qs;
+		u8 exp_need_qs;
+		u8 pad;	/* Otherwise the compiler can store garbage here. */
+	} b;	/* Bits. */
+	u32 s;	/* Set of bits. */
};
struct rcu_node;
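A sketch of why the union is laid out this way (illustrative only, not part of the diff): each flag is its own byte, so RCU can set one without a read-modify-write of its neighbours, while the unlock path can test all of them with a single load of ->s:

	static bool rcu_special_sketch(void)
	{
		union rcu_special special = { .s = 0 };

		special.b.need_qs = 1;		/* byte-granular store, no RMW of siblings */
		special.b.exp_need_qs = 1;	/* new expedited-GP flag from this series */

		return special.s != 0;		/* a single load/test covers every flag */
	}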

View File

@ -215,8 +215,11 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
 */
static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
{
-	int retval = __srcu_read_lock(sp);
+	int retval;
+
+	preempt_disable();
+	retval = __srcu_read_lock(sp);
+	preempt_enable();
	rcu_lock_acquire(&(sp)->dep_map);
	return retval;
}

View File

@ -102,19 +102,6 @@ void get_online_cpus(void)
}
EXPORT_SYMBOL_GPL(get_online_cpus);
bool try_get_online_cpus(void)
{
if (cpu_hotplug.active_writer == current)
return true;
if (!mutex_trylock(&cpu_hotplug.lock))
return false;
cpuhp_lock_acquire_tryread();
atomic_inc(&cpu_hotplug.refcount);
mutex_unlock(&cpu_hotplug.lock);
return true;
}
EXPORT_SYMBOL_GPL(try_get_online_cpus);
void put_online_cpus(void)
{
	int refcount;

View File

@ -761,7 +761,9 @@ void do_exit(long code)
	 */
	flush_ptrace_hw_breakpoint(tsk);
+	TASKS_RCU(preempt_disable());
	TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
+	TASKS_RCU(preempt_enable());
	exit_notify(tsk, group_dead);
	proc_exit_connector(tsk);
#ifdef CONFIG_NUMA

View File

@ -17,12 +17,14 @@
* *
* Copyright (C) IBM Corporation, 2014 * Copyright (C) IBM Corporation, 2014
* *
- * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ *          Davidlohr Bueso <dave@stgolabs.net>
 *	Based on kernel/rcu/torture.c.
*/ */
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/sched/rt.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/rwlock.h> #include <linux/rwlock.h>
#include <linux/mutex.h> #include <linux/mutex.h>
@ -34,6 +36,7 @@
#include <linux/moduleparam.h> #include <linux/moduleparam.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/percpu-rwsem.h>
#include <linux/torture.h> #include <linux/torture.h>
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
@ -91,11 +94,13 @@ struct lock_torture_ops {
void (*init)(void); void (*init)(void);
int (*writelock)(void); int (*writelock)(void);
void (*write_delay)(struct torture_random_state *trsp); void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
void (*writeunlock)(void); void (*writeunlock)(void);
int (*readlock)(void); int (*readlock)(void);
void (*read_delay)(struct torture_random_state *trsp); void (*read_delay)(struct torture_random_state *trsp);
void (*readunlock)(void); void (*readunlock)(void);
-	unsigned long flags;
+	unsigned long flags; /* for irq spinlocks */
const char *name; const char *name;
}; };
@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void)
/* BUGGY, do not use in real life!!! */ /* BUGGY, do not use in real life!!! */
} }
static void torture_boost_dummy(struct torture_random_state *trsp)
{
/* Only rtmutexes care about priority */
}
static struct lock_torture_ops lock_busted_ops = { static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock, .writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay, .write_delay = torture_lock_busted_write_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_lock_busted_write_unlock, .writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL, .readlock = NULL,
.read_delay = NULL, .read_delay = NULL,
@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = { static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock, .writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay, .write_delay = torture_spin_lock_write_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_spin_lock_write_unlock, .writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL, .readlock = NULL,
.read_delay = NULL, .read_delay = NULL,
@ -211,6 +223,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = { static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq, .writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay, .write_delay = torture_spin_lock_write_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_lock_spin_write_unlock_irq, .writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL, .readlock = NULL,
.read_delay = NULL, .read_delay = NULL,
@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = { static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock, .writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay, .write_delay = torture_rwlock_write_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_rwlock_write_unlock, .writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock, .readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay, .read_delay = torture_rwlock_read_delay,
@ -315,6 +329,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = { static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq, .writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay, .write_delay = torture_rwlock_write_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_rwlock_write_unlock_irq, .writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq, .readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay, .read_delay = torture_rwlock_read_delay,
@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex)
static struct lock_torture_ops mutex_lock_ops = { static struct lock_torture_ops mutex_lock_ops = {
.writelock = torture_mutex_lock, .writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay, .write_delay = torture_mutex_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_mutex_unlock, .writeunlock = torture_mutex_unlock,
.readlock = NULL, .readlock = NULL,
.read_delay = NULL, .read_delay = NULL,
@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = {
.name = "mutex_lock" .name = "mutex_lock"
}; };
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
{
rt_mutex_lock(&torture_rtmutex);
return 0;
}
static void torture_rtmutex_boost(struct torture_random_state *trsp)
{
int policy;
struct sched_param param;
const unsigned int factor = 50000; /* yes, quite arbitrary */
if (!rt_task(current)) {
/*
* (1) Boost priority once every ~50k operations. When the
		 * task tries to take the lock, the rtmutex will account
* for the new priority, and do any corresponding pi-dance.
*/
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * factor))) {
policy = SCHED_FIFO;
param.sched_priority = MAX_RT_PRIO - 1;
} else /* common case, do nothing */
return;
} else {
/*
* The task will remain boosted for another ~500k operations,
* then restored back to its original prio, and so forth.
*
* When @trsp is nil, we want to force-reset the task for
* stopping the kthread.
*/
if (!trsp || !(torture_random(trsp) %
(cxt.nrealwriters_stress * factor * 2))) {
policy = SCHED_NORMAL;
param.sched_priority = 0;
} else /* common case, do nothing */
return;
}
sched_setscheduler_nocheck(current, policy, &param);
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
const unsigned long longdelay_ms = 100;
/*
* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
preempt_schedule(); /* Allow test to be preempted. */
#endif
}
static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
{
rt_mutex_unlock(&torture_rtmutex);
}
static struct lock_torture_ops rtmutex_lock_ops = {
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
.task_boost = torture_rtmutex_boost,
.writeunlock = torture_rtmutex_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
.name = "rtmutex_lock"
};
#endif
static DECLARE_RWSEM(torture_rwsem); static DECLARE_RWSEM(torture_rwsem);
static int torture_rwsem_down_write(void) __acquires(torture_rwsem) static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
{ {
@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = { static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write, .writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay, .write_delay = torture_rwsem_write_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_rwsem_up_write, .writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read, .readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay, .read_delay = torture_rwsem_read_delay,
@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = {
.name = "rwsem_lock" .name = "rwsem_lock"
}; };
#include <linux/percpu-rwsem.h>
static struct percpu_rw_semaphore pcpu_rwsem;
void torture_percpu_rwsem_init(void)
{
BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
}
static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
{
percpu_down_write(&pcpu_rwsem);
return 0;
}
static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
{
percpu_up_write(&pcpu_rwsem);
}
static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
{
percpu_down_read(&pcpu_rwsem);
return 0;
}
static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
{
percpu_up_read(&pcpu_rwsem);
}
static struct lock_torture_ops percpu_rwsem_lock_ops = {
.init = torture_percpu_rwsem_init,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
.task_boost = torture_boost_dummy,
.writeunlock = torture_percpu_rwsem_up_write,
.readlock = torture_percpu_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
.readunlock = torture_percpu_rwsem_up_read,
.name = "percpu_rwsem_lock"
};
/* /*
* Lock torture writer kthread. Repeatedly acquires and releases * Lock torture writer kthread. Repeatedly acquires and releases
* the lock, checking for duplicate acquisitions. * the lock, checking for duplicate acquisitions.
@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0) if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1); schedule_timeout_uninterruptible(1);
cxt.cur_ops->task_boost(&rand);
cxt.cur_ops->writelock(); cxt.cur_ops->writelock();
if (WARN_ON_ONCE(lock_is_write_held)) if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_lock_fail++; lwsp->n_lock_fail++;
@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg)
stutter_wait("lock_torture_writer"); stutter_wait("lock_torture_writer");
} while (!torture_must_stop()); } while (!torture_must_stop());
cxt.cur_ops->task_boost(NULL); /* reset prio */
torture_kthread_stopping("lock_torture_writer"); torture_kthread_stopping("lock_torture_writer");
return 0; return 0;
} }
@ -642,7 +788,11 @@ static int __init lock_torture_init(void)
&spin_lock_ops, &spin_lock_irq_ops, &spin_lock_ops, &spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops, &mutex_lock_ops,
#ifdef CONFIG_RT_MUTEXES
&rtmutex_lock_ops,
#endif
&rwsem_lock_ops, &rwsem_lock_ops,
&percpu_rwsem_lock_ops,
}; };
if (!torture_init_begin(torture_type, verbose, &torture_runnable)) if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@ -661,11 +811,11 @@ static int __init lock_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_alert(" %s", torture_ops[i]->name); pr_alert(" %s", torture_ops[i]->name);
pr_alert("\n"); pr_alert("\n");
-		torture_init_end();
-		return -EINVAL;
+		firsterr = -EINVAL;
+		goto unwind;
	}
	if (cxt.cur_ops->init)
-		cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+		cxt.cur_ops->init();
if (nwriters_stress >= 0) if (nwriters_stress >= 0)
cxt.nrealwriters_stress = nwriters_stress; cxt.nrealwriters_stress = nwriters_stress;
@ -676,6 +826,10 @@ static int __init lock_torture_init(void)
if (strncmp(torture_type, "mutex", 5) == 0) if (strncmp(torture_type, "mutex", 5) == 0)
cxt.debug_lock = true; cxt.debug_lock = true;
#endif #endif
#ifdef CONFIG_DEBUG_RT_MUTEXES
if (strncmp(torture_type, "rtmutex", 7) == 0)
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK #ifdef CONFIG_DEBUG_SPINLOCK
if ((strncmp(torture_type, "spin", 4) == 0) || if ((strncmp(torture_type, "spin", 4) == 0) ||
(strncmp(torture_type, "rw_lock", 7) == 0)) (strncmp(torture_type, "rw_lock", 7) == 0))

View File

@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
__init_rwsem(&brw->rw_sem, name, rwsem_key); __init_rwsem(&brw->rw_sem, name, rwsem_key);
atomic_set(&brw->write_ctr, 0); rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
atomic_set(&brw->slow_read_ctr, 0); atomic_set(&brw->slow_read_ctr, 0);
init_waitqueue_head(&brw->write_waitq); init_waitqueue_head(&brw->write_waitq);
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
void percpu_free_rwsem(struct percpu_rw_semaphore *brw) void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
{ {
/*
* XXX: temporary kludge. The error path in alloc_super()
* assumes that percpu_free_rwsem() is safe after kzalloc().
*/
if (!brw->fast_read_ctr)
return;
rcu_sync_dtor(&brw->rss);
free_percpu(brw->fast_read_ctr); free_percpu(brw->fast_read_ctr);
brw->fast_read_ctr = NULL; /* catch use after free bugs */ brw->fast_read_ctr = NULL; /* catch use after free bugs */
} }
/* /*
* This is the fast-path for down_read/up_read, it only needs to ensure * This is the fast-path for down_read/up_read. If it succeeds we rely
* there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the * on the barriers provided by rcu_sync_enter/exit; see the comments in
* fast per-cpu counter. The writer uses synchronize_sched_expedited() to * percpu_down_write() and percpu_up_write().
* serialize with the preempt-disabled section below.
*
* The nontrivial part is that we should guarantee acquire/release semantics
* in case when
*
* R_W: down_write() comes after up_read(), the writer should see all
* changes done by the reader
* or
* W_R: down_read() comes after up_write(), the reader should see all
* changes done by the writer
* *
* If this helper fails the callers rely on the normal rw_semaphore and * If this helper fails the callers rely on the normal rw_semaphore and
* atomic_dec_and_test(), so in this case we have the necessary barriers. * atomic_dec_and_test(), so in this case we have the necessary barriers.
*
* But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
* __this_cpu_add() below can be reordered with any LOAD/STORE done by the
* reader inside the critical section. See the comments in down_write and
* up_write below.
*/ */
static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
{
-	bool success = false;
+	bool success;

	preempt_disable();
-	if (likely(!atomic_read(&brw->write_ctr))) {
-		__this_cpu_add(*brw->fast_read_ctr, val);
-		success = true;
-	}
+	success = rcu_sync_is_idle(&brw->rss);
+	if (likely(success))
+		__this_cpu_add(*brw->fast_read_ctr, val);
	preempt_enable();

	return success;
@ -77,16 +70,17 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
void percpu_down_read(struct percpu_rw_semaphore *brw) void percpu_down_read(struct percpu_rw_semaphore *brw)
{ {
might_sleep(); might_sleep();
if (likely(update_fast_ctr(brw, +1))) { rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
return;
}
down_read(&brw->rw_sem); if (likely(update_fast_ctr(brw, +1)))
return;
/* Avoid rwsem_acquire_read() and rwsem_release() */
__down_read(&brw->rw_sem);
atomic_inc(&brw->slow_read_ctr); atomic_inc(&brw->slow_read_ctr);
/* avoid up_read()->rwsem_release() */
__up_read(&brw->rw_sem); __up_read(&brw->rw_sem);
} }
EXPORT_SYMBOL_GPL(percpu_down_read);
int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
{ {
@ -112,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw)
if (atomic_dec_and_test(&brw->slow_read_ctr)) if (atomic_dec_and_test(&brw->slow_read_ctr))
wake_up_all(&brw->write_waitq); wake_up_all(&brw->write_waitq);
} }
EXPORT_SYMBOL_GPL(percpu_up_read);
static int clear_fast_ctr(struct percpu_rw_semaphore *brw) static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
{ {
@ -126,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
return sum; return sum;
} }
/*
* A writer increments ->write_ctr to force the readers to switch to the
* slow mode, note the atomic_read() check in update_fast_ctr().
*
* After that the readers can only inc/dec the slow ->slow_read_ctr counter,
* ->fast_read_ctr is stable. Once the writer moves its sum into the slow
* counter it represents the number of active readers.
*
* Finally the writer takes ->rw_sem for writing and blocks the new readers,
* then waits until the slow counter becomes zero.
*/
void percpu_down_write(struct percpu_rw_semaphore *brw) void percpu_down_write(struct percpu_rw_semaphore *brw)
{ {
/* tell update_fast_ctr() there is a pending writer */
atomic_inc(&brw->write_ctr);
	/*
-	 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
-	 *    so that update_fast_ctr() can't succeed.
-	 *
-	 * 2. Ensures we see the result of every previous this_cpu_add() in
-	 *    update_fast_ctr().
-	 *
-	 * 3. Ensures that if any reader has exited its critical section via
-	 *    fast-path, it executes a full memory barrier before we return.
-	 *    See R_W case in the comment above update_fast_ctr().
+	 * Make rcu_sync_is_idle() == F and thus disable the fast-path in
+	 * percpu_down_read() and percpu_up_read(), and wait for gp pass.
+	 *
+	 * The latter synchronises us with the preceding readers which used
+	 * the fast-path, so we can not miss the result of __this_cpu_add()
+	 * or anything else inside their critical sections.
	 */
-	synchronize_sched_expedited();
+	rcu_sync_enter(&brw->rss);
/* exclude other writers, and block the new readers completely */ /* exclude other writers, and block the new readers completely */
down_write(&brw->rw_sem); down_write(&brw->rw_sem);
@ -163,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw)
/* wait for all readers to complete their percpu_up_read() */ /* wait for all readers to complete their percpu_up_read() */
wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
} }
EXPORT_SYMBOL_GPL(percpu_down_write);
void percpu_up_write(struct percpu_rw_semaphore *brw)
{
	/* release the lock, but the readers can't use the fast-path */
	up_write(&brw->rw_sem);
	/*
-	 * Insert the barrier before the next fast-path in down_read,
-	 * see W_R case in the comment above update_fast_ctr().
+	 * Enable the fast-path in percpu_down_read() and percpu_up_read()
+	 * but only after another gp pass; this adds the necessary barrier
+	 * to ensure the reader can't miss the changes done by us.
	 */
-	synchronize_sched_expedited();
-	/* the last writer unblocks update_fast_ctr() */
-	atomic_dec(&brw->write_ctr);
+	rcu_sync_exit(&brw->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);

View File

@ -1,4 +1,4 @@
-obj-y += update.o
+obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o

View File

@ -252,7 +252,7 @@ struct rcu_torture_ops {
	void (*exp_sync)(void);
	unsigned long (*get_state)(void);
	void (*cond_sync)(unsigned long oldstate);
-	void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+	call_rcu_func_t call;
void (*cb_barrier)(void); void (*cb_barrier)(void);
void (*fqs)(void); void (*fqs)(void);
void (*stats)(void); void (*stats)(void);
@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void)
} }
static void
-call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+call_rcu_busted(struct rcu_head *head, rcu_callback_t func)
{ {
/* This is a deliberate bug for testing purposes only! */ /* This is a deliberate bug for testing purposes only! */
func(head); func(head);
@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void)
} }
static void srcu_torture_call(struct rcu_head *head,
-			      void (*func)(struct rcu_head *head))
+			      rcu_callback_t func)
{ {
call_srcu(srcu_ctlp, head, func); call_srcu(srcu_ctlp, head, func);
} }
@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void)
#define RCUTORTURE_TASKS_OPS
-static bool torturing_tasks(void)
+static bool __maybe_unused torturing_tasks(void)
{
return false; return false;
} }
@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg)
			}
			call_rcu_time = jiffies;
		}
-		cond_resched_rcu_qs();
		stutter_wait("rcu_torture_boost");
		if (torture_must_stop())
			goto checkwait;
@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg)
		__this_cpu_inc(rcu_torture_batch[completed]);
		preempt_enable();
		cur_ops->readunlock(idx);
-		cond_resched_rcu_qs();
		stutter_wait("rcu_torture_reader");
	} while (!torture_must_stop());
	if (irqreader && cur_ops->irq_capable) {
@ -1742,15 +1740,15 @@ rcu_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_alert(" %s", torture_ops[i]->name); pr_alert(" %s", torture_ops[i]->name);
pr_alert("\n"); pr_alert("\n");
-		torture_init_end();
-		return -EINVAL;
+		firsterr = -EINVAL;
+		goto unwind;
	}
	if (cur_ops->fqs == NULL && fqs_duration != 0) {
		pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
		fqs_duration = 0;
	}
	if (cur_ops->init)
-		cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+		cur_ops->init();
if (nreaders >= 0) { if (nreaders >= 0) {
nrealreaders = nreaders; nrealreaders = nreaders;

View File

@ -298,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
	int idx;

	idx = READ_ONCE(sp->completed) & 0x1;
-	preempt_disable();
	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
	smp_mb(); /* B */  /* Avoid leaking the critical section. */
	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
-	preempt_enable();
	return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@ -387,7 +385,7 @@ static void srcu_flip(struct srcu_struct *sp)
 * srcu_struct structure.
 */
void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
-	       void (*func)(struct rcu_head *head))
+	       rcu_callback_t func)
{
	unsigned long flags;

kernel/rcu/sync.c (new file, 223 lines)
View File

@ -0,0 +1,223 @@
/*
* RCU-based infrastructure for lightweight reader-writer locking
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, you can access it online at
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright (c) 2015, Red Hat, Inc.
*
* Author: Oleg Nesterov <oleg@redhat.com>
*/
#include <linux/rcu_sync.h>
#include <linux/sched.h>
#ifdef CONFIG_PROVE_RCU
#define __INIT_HELD(func) .held = func,
#else
#define __INIT_HELD(func)
#endif
static const struct {
void (*sync)(void);
void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
void (*wait)(void);
#ifdef CONFIG_PROVE_RCU
int (*held)(void);
#endif
} gp_ops[] = {
[RCU_SYNC] = {
.sync = synchronize_rcu,
.call = call_rcu,
.wait = rcu_barrier,
__INIT_HELD(rcu_read_lock_held)
},
[RCU_SCHED_SYNC] = {
.sync = synchronize_sched,
.call = call_rcu_sched,
.wait = rcu_barrier_sched,
__INIT_HELD(rcu_read_lock_sched_held)
},
[RCU_BH_SYNC] = {
.sync = synchronize_rcu_bh,
.call = call_rcu_bh,
.wait = rcu_barrier_bh,
__INIT_HELD(rcu_read_lock_bh_held)
},
};
enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
#define rss_lock gp_wait.lock
#ifdef CONFIG_PROVE_RCU
void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
{
RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
"suspicious rcu_sync_is_idle() usage");
}
#endif
/**
* rcu_sync_init() - Initialize an rcu_sync structure
* @rsp: Pointer to rcu_sync structure to be initialized
* @type: Flavor of RCU with which to synchronize rcu_sync structure
*/
void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
{
memset(rsp, 0, sizeof(*rsp));
init_waitqueue_head(&rsp->gp_wait);
rsp->gp_type = type;
}
/**
* rcu_sync_enter() - Force readers onto slowpath
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
* This function is used by updaters who need readers to make use of
* a slowpath during the update. After this function returns, all
* subsequent calls to rcu_sync_is_idle() will return false, which
* tells readers to stay off their fastpaths. A later call to
* rcu_sync_exit() re-enables reader slowpaths.
*
* When called in isolation, rcu_sync_enter() must wait for a grace
* period, however, closely spaced calls to rcu_sync_enter() can
* optimize away the grace-period wait via a state machine implemented
* by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
*/
void rcu_sync_enter(struct rcu_sync *rsp)
{
bool need_wait, need_sync;
spin_lock_irq(&rsp->rss_lock);
need_wait = rsp->gp_count++;
need_sync = rsp->gp_state == GP_IDLE;
if (need_sync)
rsp->gp_state = GP_PENDING;
spin_unlock_irq(&rsp->rss_lock);
BUG_ON(need_wait && need_sync);
if (need_sync) {
gp_ops[rsp->gp_type].sync();
rsp->gp_state = GP_PASSED;
wake_up_all(&rsp->gp_wait);
} else if (need_wait) {
wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
} else {
/*
* Possible when there's a pending CB from a rcu_sync_exit().
* Nobody has yet been allowed the 'fast' path and thus we can
* avoid doing any sync(). The callback will get 'dropped'.
*/
BUG_ON(rsp->gp_state != GP_PASSED);
}
}
/**
* rcu_sync_func() - Callback function managing reader access to fastpath
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
* This function is passed to one of the call_rcu() functions by
* rcu_sync_exit(), so that it is invoked after a grace period following the
* that invocation of rcu_sync_exit(). It takes action based on events that
* have taken place in the meantime, so that closely spaced rcu_sync_enter()
* and rcu_sync_exit() pairs need not wait for a grace period.
*
* If another rcu_sync_enter() is invoked before the grace period
* ended, reset state to allow the next rcu_sync_exit() to let the
* readers back onto their fastpaths (after a grace period). If both
* another rcu_sync_enter() and its matching rcu_sync_exit() are invoked
* before the grace period ended, re-invoke call_rcu() on behalf of that
* rcu_sync_exit(). Otherwise, set all state back to idle so that readers
* can again use their fastpaths.
*/
static void rcu_sync_func(struct rcu_head *rcu)
{
struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
unsigned long flags;
BUG_ON(rsp->gp_state != GP_PASSED);
BUG_ON(rsp->cb_state == CB_IDLE);
spin_lock_irqsave(&rsp->rss_lock, flags);
if (rsp->gp_count) {
/*
		 * A new rcu_sync_enter() has happened; drop the callback.
*/
rsp->cb_state = CB_IDLE;
} else if (rsp->cb_state == CB_REPLAY) {
/*
* A new rcu_sync_exit() has happened; requeue the callback
* to catch a later GP.
*/
rsp->cb_state = CB_PENDING;
gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
} else {
/*
		 * We're at least a GP after rcu_sync_exit(); everybody will now
		 * have observed the write side critical section. Let 'em rip!
*/
rsp->cb_state = CB_IDLE;
rsp->gp_state = GP_IDLE;
}
spin_unlock_irqrestore(&rsp->rss_lock, flags);
}
/**
* rcu_sync_exit() - Allow readers back onto fast patch after grace period
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
* This function is used by updaters who have completed, and can therefore
* now allow readers to make use of their fastpaths after a grace period
* has elapsed. After this grace period has completed, all subsequent
* calls to rcu_sync_is_idle() will return true, which tells readers that
* they can once again use their fastpaths.
*/
void rcu_sync_exit(struct rcu_sync *rsp)
{
spin_lock_irq(&rsp->rss_lock);
if (!--rsp->gp_count) {
if (rsp->cb_state == CB_IDLE) {
rsp->cb_state = CB_PENDING;
gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
} else if (rsp->cb_state == CB_PENDING) {
rsp->cb_state = CB_REPLAY;
}
}
spin_unlock_irq(&rsp->rss_lock);
}
/**
* rcu_sync_dtor() - Clean up an rcu_sync structure
* @rsp: Pointer to rcu_sync structure to be cleaned up
*/
void rcu_sync_dtor(struct rcu_sync *rsp)
{
int cb_state;
BUG_ON(rsp->gp_count);
spin_lock_irq(&rsp->rss_lock);
if (rsp->cb_state == CB_REPLAY)
rsp->cb_state = CB_PENDING;
cb_state = rsp->cb_state;
spin_unlock_irq(&rsp->rss_lock);
if (cb_state != CB_IDLE) {
gp_ops[rsp->gp_type].wait();
BUG_ON(rsp->cb_state != CB_IDLE);
}
}

View File

@ -44,7 +44,7 @@ struct rcu_ctrlblk;
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
static void rcu_process_callbacks(struct softirq_action *unused);
static void __call_rcu(struct rcu_head *head,
-		       void (*func)(struct rcu_head *rcu),
+		       rcu_callback_t func,
		       struct rcu_ctrlblk *rcp);
#include "tiny_plugin.h"
@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
* Helper function for call_rcu() and call_rcu_bh(). * Helper function for call_rcu() and call_rcu_bh().
*/ */
static void __call_rcu(struct rcu_head *head, static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu), rcu_callback_t func,
struct rcu_ctrlblk *rcp) struct rcu_ctrlblk *rcp)
{ {
unsigned long flags; unsigned long flags;
@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head,
* period. But since we have but one CPU, that would be after any * period. But since we have but one CPU, that would be after any
* quiescent state. * quiescent state.
*/ */
void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{ {
__call_rcu(head, func, &rcu_sched_ctrlblk); __call_rcu(head, func, &rcu_sched_ctrlblk);
} }
@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
* Post an RCU bottom-half callback to be invoked after any subsequent * Post an RCU bottom-half callback to be invoked after any subsequent
* quiescent state. * quiescent state.
*/ */
void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{ {
__call_rcu(head, func, &rcu_bh_ctrlblk); __call_rcu(head, func, &rcu_bh_ctrlblk);
} }


@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree");
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
/* /*
* In order to export the rcu_state name to the tracing tools, it * In order to export the rcu_state name to the tracing tools, it
@ -98,7 +97,7 @@ struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \ .level = { &sname##_state.node[0] }, \
.rda = &sname##_data, \ .rda = &sname##_data, \
.call = cr, \ .call = cr, \
.fqs_state = RCU_GP_IDLE, \ .gp_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \ .gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \ .completed = 0UL - 300UL, \
.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
@ -161,6 +160,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void); static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
static void rcu_report_exp_rdp(struct rcu_state *rsp,
struct rcu_data *rdp, bool wake);
/* rcuc/rcub kthread realtime priority */ /* rcuc/rcub kthread realtime priority */
#ifdef CONFIG_RCU_KTHREAD_PRIO #ifdef CONFIG_RCU_KTHREAD_PRIO
@ -245,21 +246,33 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/ */
void rcu_sched_qs(void) void rcu_sched_qs(void)
{ {
if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { unsigned long flags;
if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_sched"), trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_sched_data.gpnum), __this_cpu_read(rcu_sched_data.gpnum),
TPS("cpuqs")); TPS("cpuqs"));
__this_cpu_write(rcu_sched_data.passed_quiesce, 1); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
return;
local_irq_save(flags);
if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
rcu_report_exp_rdp(&rcu_sched_state,
this_cpu_ptr(&rcu_sched_data),
true);
}
local_irq_restore(flags);
} }
} }
void rcu_bh_qs(void) void rcu_bh_qs(void)
{ {
if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_bh"), trace_rcu_grace_period(TPS("rcu_bh"),
__this_cpu_read(rcu_bh_data.gpnum), __this_cpu_read(rcu_bh_data.gpnum),
TPS("cpuqs")); TPS("cpuqs"));
__this_cpu_write(rcu_bh_data.passed_quiesce, 1); __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
} }
} }
@ -337,12 +350,14 @@ static void rcu_momentary_dyntick_idle(void)
*/ */
void rcu_note_context_switch(void) void rcu_note_context_switch(void)
{ {
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch")); trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs(); rcu_sched_qs();
rcu_preempt_note_context_switch(); rcu_preempt_note_context_switch();
if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
rcu_momentary_dyntick_idle(); rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch")); trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
} }
EXPORT_SYMBOL_GPL(rcu_note_context_switch); EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@ -353,12 +368,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
* RCU flavors in desperate need of a quiescent state, which will normally * RCU flavors in desperate need of a quiescent state, which will normally
* be none of them). Either way, do a lightweight quiescent state for * be none of them). Either way, do a lightweight quiescent state for
* all RCU flavors. * all RCU flavors.
*
* The barrier() calls are redundant in the common case when this is
* called externally, but are needed in case this function is called
* from within this file.
*
*/ */
void rcu_all_qs(void) void rcu_all_qs(void)
{ {
barrier(); /* Avoid RCU read-side critical sections leaking down. */
if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
rcu_momentary_dyntick_idle(); rcu_momentary_dyntick_idle();
this_cpu_inc(rcu_qs_ctr); this_cpu_inc(rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
} }
EXPORT_SYMBOL_GPL(rcu_all_qs); EXPORT_SYMBOL_GPL(rcu_all_qs);
@ -1744,9 +1766,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
*/ */
rdp->gpnum = rnp->gpnum; rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
rdp->passed_quiesce = 0; rdp->cpu_no_qs.b.norm = true;
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp); zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false); WRITE_ONCE(rdp->gpwrap, false);
} }
@ -1927,16 +1949,15 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
/* /*
* Do one round of quiescent-state forcing. * Do one round of quiescent-state forcing.
*/ */
static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
{ {
int fqs_state = fqs_state_in;
bool isidle = false; bool isidle = false;
unsigned long maxj; unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp); struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies); WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++; rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) { if (first_time) {
/* Collect dyntick-idle snapshots. */ /* Collect dyntick-idle snapshots. */
if (is_sysidle_rcu_state(rsp)) { if (is_sysidle_rcu_state(rsp)) {
isidle = true; isidle = true;
@ -1945,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
force_qs_rnp(rsp, dyntick_save_progress_counter, force_qs_rnp(rsp, dyntick_save_progress_counter,
&isidle, &maxj); &isidle, &maxj);
rcu_sysidle_report_gp(rsp, isidle, maxj); rcu_sysidle_report_gp(rsp, isidle, maxj);
fqs_state = RCU_FORCE_QS;
} else { } else {
/* Handle dyntick-idle and offline CPUs. */ /* Handle dyntick-idle and offline CPUs. */
isidle = true; isidle = true;
@ -1959,7 +1979,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq(&rnp->lock); raw_spin_unlock_irq(&rnp->lock);
} }
return fqs_state;
} }
/* /*
@ -2023,7 +2042,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
/* Declare grace period done. */ /* Declare grace period done. */
WRITE_ONCE(rsp->completed, rsp->gpnum); WRITE_ONCE(rsp->completed, rsp->gpnum);
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE; rsp->gp_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda); rdp = this_cpu_ptr(rsp->rda);
/* Advance CBs to reduce false positives below. */ /* Advance CBs to reduce false positives below. */
needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
@ -2041,7 +2060,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
*/ */
static int __noreturn rcu_gp_kthread(void *arg) static int __noreturn rcu_gp_kthread(void *arg)
{ {
int fqs_state; bool first_gp_fqs;
int gf; int gf;
unsigned long j; unsigned long j;
int ret; int ret;
@ -2073,7 +2092,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
} }
/* Handle quiescent-state forcing. */ /* Handle quiescent-state forcing. */
fqs_state = RCU_SAVE_DYNTICK; first_gp_fqs = true;
j = jiffies_till_first_fqs; j = jiffies_till_first_fqs;
if (j > HZ) { if (j > HZ) {
j = HZ; j = HZ;
@ -2101,7 +2120,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name, trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum), READ_ONCE(rsp->gpnum),
TPS("fqsstart")); TPS("fqsstart"));
fqs_state = rcu_gp_fqs(rsp, fqs_state); rcu_gp_fqs(rsp, first_gp_fqs);
first_gp_fqs = false;
trace_rcu_grace_period(rsp->name, trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum), READ_ONCE(rsp->gpnum),
TPS("fqsend")); TPS("fqsend"));
@ -2337,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
rnp = rdp->mynode; rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags); raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock(); smp_mb__after_unlock_lock();
if ((rdp->passed_quiesce == 0 && if ((rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
rdp->gpwrap) { rdp->gpwrap) {
@ -2348,7 +2368,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* We will instead need a new quiescent state that lies * We will instead need a new quiescent state that lies
* within the current grace period. * within the current grace period.
*/ */
rdp->passed_quiesce = 0; /* need qs for new gp. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
raw_spin_unlock_irqrestore(&rnp->lock, flags); raw_spin_unlock_irqrestore(&rnp->lock, flags);
return; return;
@ -2357,7 +2377,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
if ((rnp->qsmask & mask) == 0) { if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore(&rnp->lock, flags); raw_spin_unlock_irqrestore(&rnp->lock, flags);
} else { } else {
rdp->qs_pending = 0; rdp->core_needs_qs = 0;
/* /*
* This GP can't end until cpu checks in, so all of our * This GP can't end until cpu checks in, so all of our
@ -2388,14 +2408,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Does this CPU still need to do its part for current grace period? * Does this CPU still need to do its part for current grace period?
* If no, return and let the other CPUs do their part as well. * If no, return and let the other CPUs do their part as well.
*/ */
if (!rdp->qs_pending) if (!rdp->core_needs_qs)
return; return;
/* /*
* Was there a quiescent state since the beginning of the grace * Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call. * period? If no, then exit and wait for the next call.
*/ */
if (!rdp->passed_quiesce && if (rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
return; return;
@ -3017,7 +3037,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
* is expected to specify a CPU. * is expected to specify a CPU.
*/ */
static void static void
__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), __call_rcu(struct rcu_head *head, rcu_callback_t func,
struct rcu_state *rsp, int cpu, bool lazy) struct rcu_state *rsp, int cpu, bool lazy)
{ {
unsigned long flags; unsigned long flags;
@ -3088,7 +3108,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
/* /*
* Queue an RCU-sched callback for invocation after a grace period. * Queue an RCU-sched callback for invocation after a grace period.
*/ */
void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{ {
__call_rcu(head, func, &rcu_sched_state, -1, 0); __call_rcu(head, func, &rcu_sched_state, -1, 0);
} }
@ -3097,7 +3117,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
/* /*
* Queue an RCU callback for invocation after a quicker grace period. * Queue an RCU callback for invocation after a quicker grace period.
*/ */
void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{ {
__call_rcu(head, func, &rcu_bh_state, -1, 0); __call_rcu(head, func, &rcu_bh_state, -1, 0);
} }
@ -3111,7 +3131,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
* function may only be called from __kfree_rcu(). * function may only be called from __kfree_rcu().
*/ */
void kfree_call_rcu(struct rcu_head *head, void kfree_call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu)) rcu_callback_t func)
{ {
__call_rcu(head, func, rcu_state_p, -1, 1); __call_rcu(head, func, rcu_state_p, -1, 1);
} }
@ -3379,6 +3399,191 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
return rcu_seq_done(&rsp->expedited_sequence, s); return rcu_seq_done(&rsp->expedited_sequence, s);
} }
/*
* Reset the ->expmaskinit values in the rcu_node tree to reflect any
* recent CPU-online activity. Note that these masks are not cleared
* when CPUs go offline, so they reflect the union of all CPUs that have
* ever been online. This means that this function normally takes its
* no-work-to-do fastpath.
*/
static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
{
bool done;
unsigned long flags;
unsigned long mask;
unsigned long oldmask;
int ncpus = READ_ONCE(rsp->ncpus);
struct rcu_node *rnp;
struct rcu_node *rnp_up;
/* If no new CPUs onlined since last time, nothing to do. */
if (likely(ncpus == rsp->ncpus_snap))
return;
rsp->ncpus_snap = ncpus;
/*
* Each pass through the following loop propagates newly onlined
* CPUs for the current rcu_node structure up the rcu_node tree.
*/
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
if (rnp->expmaskinit == rnp->expmaskinitnext) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
continue; /* No new CPUs, nothing to do. */
}
/* Update this node's mask, track old value for propagation. */
oldmask = rnp->expmaskinit;
rnp->expmaskinit = rnp->expmaskinitnext;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/* If was already nonzero, nothing to propagate. */
if (oldmask)
continue;
/* Propagate the new CPU up the tree. */
mask = rnp->grpmask;
rnp_up = rnp->parent;
done = false;
while (rnp_up) {
raw_spin_lock_irqsave(&rnp_up->lock, flags);
smp_mb__after_unlock_lock();
if (rnp_up->expmaskinit)
done = true;
rnp_up->expmaskinit |= mask;
raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
if (done)
break;
mask = rnp_up->grpmask;
rnp_up = rnp_up->parent;
}
}
}
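/*
 * For clarity (summary, not code from this commit): because ->expmaskinit
 * is never cleared when a CPU goes offline, the loop above does real work
 * only the first time a given leaf rcu_node gains an online CPU, and the
 * upward walk stops right after updating the first ancestor that already
 * had a nonzero ->expmaskinit, since everything above that ancestor was
 * propagated by an earlier online event.
 */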
/*
* Reset the ->expmask values in the rcu_node tree in preparation for
* a new expedited grace period.
*/
static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
{
unsigned long flags;
struct rcu_node *rnp;
sync_exp_reset_tree_hotplug(rsp);
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
WARN_ON_ONCE(rnp->expmask);
rnp->expmask = rnp->expmaskinit;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
}
/*
* Return non-zero if there is no RCU expedited grace period in progress
* for the specified rcu_node structure, in other words, if all CPUs and
* tasks covered by the specified rcu_node structure have done their bit
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementations use other means.
*
* Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
return rnp->exp_tasks == NULL &&
READ_ONCE(rnp->expmask) == 0;
}
/*
* Report the exit from an RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
* grace period. This event is reported either to the rcu_node structure on
* which the task was queued or to one of that rcu_node structure's ancestors,
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
* Caller must hold the root rcu_node's exp_funnel_mutex and the
* specified rcu_node structure's ->lock.
*/
static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake, unsigned long flags)
__releases(rnp->lock)
{
unsigned long mask;
for (;;) {
if (!sync_rcu_preempt_exp_done(rnp)) {
if (!rnp->expmask)
rcu_initiate_boost(rnp, flags);
else
raw_spin_unlock_irqrestore(&rnp->lock, flags);
break;
}
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
wake_up(&rsp->expedited_wq);
}
break;
}
mask = rnp->grpmask;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
rnp = rnp->parent;
raw_spin_lock(&rnp->lock); /* irqs already disabled */
smp_mb__after_unlock_lock();
WARN_ON_ONCE(!(rnp->expmask & mask));
rnp->expmask &= ~mask;
}
}
/*
* Report expedited quiescent state for specified node. This is a
* lock-acquisition wrapper function for __rcu_report_exp_rnp().
*
* Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
struct rcu_node *rnp, bool wake)
{
unsigned long flags;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
__rcu_report_exp_rnp(rsp, rnp, wake, flags);
}
/*
* Report expedited quiescent state for multiple CPUs, all covered by the
* specified leaf rcu_node structure. Caller must hold the root
* rcu_node's exp_funnel_mutex.
*/
static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
unsigned long mask, bool wake)
{
unsigned long flags;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
if (!(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
rnp->expmask &= ~mask;
__rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
}
/*
* Report expedited quiescent state for specified rcu_data (CPU).
* Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
bool wake)
{
rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
}
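/*
 * Propagation sketch (illustrative summary, not code from this commit):
 * when the last CPU covered by a leaf rcu_node checks in and that leaf
 * has no ->exp_tasks, the functions above clear the leaf's ->grpmask bit
 * in its parent's ->expmask and repeat the test one level up, and so on.
 * Only when the root rcu_node itself satisfies sync_rcu_preempt_exp_done()
 * is the waiter on ->expedited_wq awakened (if @wake was set).
 */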
/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp, struct rcu_data *rdp,
@ -3455,16 +3660,111 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
} }
/* Invoked on each online non-idle CPU for expedited quiescent state. */ /* Invoked on each online non-idle CPU for expedited quiescent state. */
static int synchronize_sched_expedited_cpu_stop(void *data) static void sync_sched_exp_handler(void *data)
{ {
struct rcu_data *rdp = data; struct rcu_data *rdp;
struct rcu_state *rsp = rdp->rsp; struct rcu_node *rnp;
struct rcu_state *rsp = data;
/* We are here: If we are last, do the wakeup. */ rdp = this_cpu_ptr(rsp->rda);
rdp->exp_done = true; rnp = rdp->mynode;
if (atomic_dec_and_test(&rsp->expedited_need_qs)) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
wake_up(&rsp->expedited_wq); __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
return 0; return;
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
resched_cpu(smp_processor_id());
}
/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
static void sync_sched_exp_online_cleanup(int cpu)
{
struct rcu_data *rdp;
int ret;
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
rdp = per_cpu_ptr(rsp->rda, cpu);
rnp = rdp->mynode;
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
return;
ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
WARN_ON_ONCE(ret);
}
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
*/
static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
smp_call_func_t func)
{
int cpu;
unsigned long flags;
unsigned long mask;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
struct rcu_node *rnp;
sync_exp_reset_tree(rsp);
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
/* Each pass checks a CPU for identity, offline, and idle. */
mask_ofl_test = 0;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
if (raw_smp_processor_id() == cpu ||
!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
mask_ofl_test |= rdp->grpmask;
}
mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
/*
* Need to wait for any blocked tasks as well. Note that
* additional blocking tasks will also block the expedited
* GP until such time as the ->expmask bits are cleared.
*/
if (rcu_preempt_has_tasks(rnp))
rnp->exp_tasks = rnp->blkd_tasks.next;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
if (!(mask_ofl_ipi & mask))
continue;
retry_ipi:
ret = smp_call_function_single(cpu, func, rsp, 0);
if (!ret) {
mask_ofl_ipi &= ~mask;
} else {
/* Failed, raced with offline. */
raw_spin_lock_irqsave(&rnp->lock, flags);
if (cpu_online(cpu) &&
(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore(&rnp->lock,
flags);
schedule_timeout_uninterruptible(1);
if (cpu_online(cpu) &&
(rnp->expmask & mask))
goto retry_ipi;
raw_spin_lock_irqsave(&rnp->lock,
flags);
}
if (!(rnp->expmask & mask))
mask_ofl_ipi &= ~mask;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
}
/* Report quiescent states for those that went offline. */
mask_ofl_test |= mask_ofl_ipi;
if (mask_ofl_test)
rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
}
} }
static void synchronize_sched_expedited_wait(struct rcu_state *rsp) static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@ -3472,7 +3772,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
int cpu; int cpu;
unsigned long jiffies_stall; unsigned long jiffies_stall;
unsigned long jiffies_start; unsigned long jiffies_start;
struct rcu_data *rdp; unsigned long mask;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root(rsp);
int ret; int ret;
jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_stall = rcu_jiffies_till_stall_check();
@ -3481,33 +3783,43 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
for (;;) { for (;;) {
ret = wait_event_interruptible_timeout( ret = wait_event_interruptible_timeout(
rsp->expedited_wq, rsp->expedited_wq,
!atomic_read(&rsp->expedited_need_qs), sync_rcu_preempt_exp_done(rnp_root),
jiffies_stall); jiffies_stall);
if (ret > 0) if (ret > 0)
return; return;
if (ret < 0) { if (ret < 0) {
/* Hit a signal, disable CPU stall warnings. */ /* Hit a signal, disable CPU stall warnings. */
wait_event(rsp->expedited_wq, wait_event(rsp->expedited_wq,
!atomic_read(&rsp->expedited_need_qs)); sync_rcu_preempt_exp_done(rnp_root));
return; return;
} }
pr_err("INFO: %s detected expedited stalls on CPUs: {", pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rsp->name); rsp->name);
for_each_online_cpu(cpu) { rcu_for_each_leaf_node(rsp, rnp) {
rdp = per_cpu_ptr(rsp->rda, cpu); (void)rcu_print_task_exp_stall(rnp);
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
struct rcu_data *rdp;
if (rdp->exp_done) if (!(rnp->expmask & mask))
continue; continue;
pr_cont(" %d", cpu); rdp = per_cpu_ptr(rsp->rda, cpu);
pr_cont(" %d-%c%c%c", cpu,
"O."[cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
}
mask <<= 1;
} }
pr_cont(" } %lu jiffies s: %lu\n", pr_cont(" } %lu jiffies s: %lu\n",
jiffies - jiffies_start, rsp->expedited_sequence); jiffies - jiffies_start, rsp->expedited_sequence);
for_each_online_cpu(cpu) { rcu_for_each_leaf_node(rsp, rnp) {
rdp = per_cpu_ptr(rsp->rda, cpu); mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
if (rdp->exp_done) if (!(rnp->expmask & mask))
continue; continue;
dump_cpu_task(cpu); dump_cpu_task(cpu);
}
} }
jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
} }
@ -3531,7 +3843,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
*/ */
void synchronize_sched_expedited(void) void synchronize_sched_expedited(void)
{ {
int cpu;
unsigned long s; unsigned long s;
struct rcu_node *rnp; struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state; struct rcu_state *rsp = &rcu_sched_state;
@ -3539,48 +3850,16 @@ void synchronize_sched_expedited(void)
/* Take a snapshot of the sequence number. */ /* Take a snapshot of the sequence number. */
s = rcu_exp_gp_seq_snap(rsp); s = rcu_exp_gp_seq_snap(rsp);
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
return;
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
rnp = exp_funnel_lock(rsp, s); rnp = exp_funnel_lock(rsp, s);
if (rnp == NULL) { if (rnp == NULL)
put_online_cpus();
return; /* Someone else did our work for us. */ return; /* Someone else did our work for us. */
}
rcu_exp_gp_seq_start(rsp); rcu_exp_gp_seq_start(rsp);
sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
/* Stop each CPU that is online, non-idle, and not us. */ synchronize_sched_expedited_wait(rsp);
init_waitqueue_head(&rsp->expedited_wq);
atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
for_each_online_cpu(cpu) {
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
rdp->exp_done = false;
/* Skip our CPU and any idle CPUs. */
if (raw_smp_processor_id() == cpu ||
!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
continue;
atomic_inc(&rsp->expedited_need_qs);
stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
rdp, &rdp->exp_stop_work);
}
/* Remove extra count and, if necessary, wait for CPUs to stop. */
if (!atomic_dec_and_test(&rsp->expedited_need_qs))
synchronize_sched_expedited_wait(rsp);
rcu_exp_gp_seq_end(rsp); rcu_exp_gp_seq_end(rsp);
mutex_unlock(&rnp->exp_funnel_mutex); mutex_unlock(&rnp->exp_funnel_mutex);
put_online_cpus();
} }
EXPORT_SYMBOL_GPL(synchronize_sched_expedited); EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
@ -3606,11 +3885,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */ /* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active && if (rcu_scheduler_fully_active &&
rdp->qs_pending && !rdp->passed_quiesce && rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
rdp->n_rp_qs_pending++; rdp->n_rp_core_needs_qs++;
} else if (rdp->qs_pending && } else if (rdp->core_needs_qs &&
(rdp->passed_quiesce || (!rdp->cpu_no_qs.b.norm ||
rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
rdp->n_rp_report_qs++; rdp->n_rp_report_qs++;
return 1; return 1;
@ -3868,7 +4147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
static void __init static void __init
rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{ {
static struct lock_class_key rcu_exp_sched_rdp_class;
unsigned long flags; unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rcu_get_root(rsp); struct rcu_node *rnp = rcu_get_root(rsp);
@ -3884,10 +4162,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
mutex_init(&rdp->exp_funnel_mutex); mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp); rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags); raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (rsp == &rcu_sched_state)
lockdep_set_class_and_name(&rdp->exp_funnel_mutex,
&rcu_exp_sched_rdp_class,
"rcu_data_exp_sched");
} }
/* /*
@ -3906,7 +4180,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
/* Set up local state, ensuring consistent view of global state. */ /* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags); raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
rdp->qlen_last_fqs_check = 0; rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs; rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit; rdp->blimit = blimit;
@ -3928,11 +4201,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
raw_spin_lock(&rnp->lock); /* irqs already disabled. */ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
smp_mb__after_unlock_lock(); smp_mb__after_unlock_lock();
rnp->qsmaskinitnext |= mask; rnp->qsmaskinitnext |= mask;
rnp->expmaskinitnext |= mask;
if (!rdp->beenonline)
WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed; rdp->completed = rnp->completed;
rdp->passed_quiesce = false; rdp->cpu_no_qs.b.norm = true;
rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
rdp->qs_pending = false; rdp->core_needs_qs = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore(&rnp->lock, flags); raw_spin_unlock_irqrestore(&rnp->lock, flags);
} }
@ -3965,6 +4242,7 @@ int rcu_cpu_notify(struct notifier_block *self,
break; break;
case CPU_ONLINE: case CPU_ONLINE:
case CPU_DOWN_FAILED: case CPU_DOWN_FAILED:
sync_sched_exp_online_cleanup(cpu);
rcu_boost_kthread_setaffinity(rnp, -1); rcu_boost_kthread_setaffinity(rnp, -1);
break; break;
case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE:
@ -3976,6 +4254,12 @@ int rcu_cpu_notify(struct notifier_block *self,
rcu_cleanup_dying_cpu(rsp); rcu_cleanup_dying_cpu(rsp);
break; break;
case CPU_DYING_IDLE: case CPU_DYING_IDLE:
/* QS for any half-done expedited RCU-sched GP. */
preempt_disable();
rcu_report_exp_rdp(&rcu_sched_state,
this_cpu_ptr(rcu_sched_state.rda), true);
preempt_enable();
for_each_rcu_flavor(rsp) { for_each_rcu_flavor(rsp) {
rcu_cleanup_dying_idle_cpu(cpu, rsp); rcu_cleanup_dying_idle_cpu(cpu, rsp);
} }
@ -4107,7 +4391,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT;
static const char * const exp[] = RCU_EXP_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT;
static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
static u8 fl_mask = 0x1; static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@ -4167,18 +4450,13 @@ static void __init rcu_init_one(struct rcu_state *rsp,
INIT_LIST_HEAD(&rnp->blkd_tasks); INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp); rcu_init_one_nocb(rnp);
mutex_init(&rnp->exp_funnel_mutex); mutex_init(&rnp->exp_funnel_mutex);
if (rsp == &rcu_sched_state) lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
lockdep_set_class_and_name( &rcu_exp_class[i], exp[i]);
&rnp->exp_funnel_mutex,
&rcu_exp_sched_class[i], exp_sched[i]);
else
lockdep_set_class_and_name(
&rnp->exp_funnel_mutex,
&rcu_exp_class[i], exp[i]);
} }
} }
init_waitqueue_head(&rsp->gp_wq); init_waitqueue_head(&rsp->gp_wq);
init_waitqueue_head(&rsp->expedited_wq);
rnp = rsp->level[rcu_num_lvls - 1]; rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
while (i > rnp->grphi) while (i > rnp->grphi)
@ -4221,13 +4499,12 @@ static void __init rcu_init_geometry(void)
rcu_fanout_leaf, nr_cpu_ids); rcu_fanout_leaf, nr_cpu_ids);
/* /*
* The boot-time rcu_fanout_leaf parameter is only permitted * The boot-time rcu_fanout_leaf parameter must be at least two
* to increase the leaf-level fanout, not decrease it. Of course, * and cannot exceed the number of bits in the rcu_node masks.
* the leaf-level fanout cannot exceed the number of bits in * Complain and fall back to the compile-time values if this
* the rcu_node masks. Complain and fall back to the compile- * limit is exceeded.
* time values if these limits are exceeded.
*/ */
if (rcu_fanout_leaf < RCU_FANOUT_LEAF || if (rcu_fanout_leaf < 2 ||
rcu_fanout_leaf > sizeof(unsigned long) * 8) { rcu_fanout_leaf > sizeof(unsigned long) * 8) {
rcu_fanout_leaf = RCU_FANOUT_LEAF; rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1); WARN_ON(1);
@ -4244,10 +4521,13 @@ static void __init rcu_init_geometry(void)
/* /*
* The tree must be able to accommodate the configured number of CPUs. * The tree must be able to accommodate the configured number of CPUs.
* If this limit is exceeded than we have a serious problem elsewhere. * If this limit is exceeded, fall back to the compile-time values.
*/ */
if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
panic("rcu_init_geometry: rcu_capacity[] is too small"); rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1);
return;
}
/* Calculate the number of levels in the tree. */ /* Calculate the number of levels in the tree. */
for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {


@ -70,8 +70,6 @@
# define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_NODE_NAME_INIT { "rcu_node_0" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
# define RCU_EXP_SCHED_NAME_INIT \
{ "rcu_node_exp_sched_0" }
#elif NR_CPUS <= RCU_FANOUT_2 #elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2 # define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_0 1
@ -81,8 +79,6 @@
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
# define RCU_EXP_SCHED_NAME_INIT \
{ "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
#elif NR_CPUS <= RCU_FANOUT_3 #elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3 # define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_0 1
@ -93,8 +89,6 @@
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
# define RCU_EXP_SCHED_NAME_INIT \
{ "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
#elif NR_CPUS <= RCU_FANOUT_4 #elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4 # define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_0 1
@ -106,8 +100,6 @@
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
# define RCU_EXP_SCHED_NAME_INIT \
{ "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
#else #else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@ -171,16 +163,21 @@ struct rcu_node {
/* an rcu_data structure, otherwise, each */ /* an rcu_data structure, otherwise, each */
/* bit corresponds to a child rcu_node */ /* bit corresponds to a child rcu_node */
/* structure. */ /* structure. */
unsigned long expmask; /* Groups that have ->blkd_tasks */
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for PREEMPT_RCU). */
unsigned long qsmaskinit; unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */ /* Per-GP initial value for qsmask. */
/* Initialized from ->qsmaskinitnext at the */ /* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */ /* beginning of each grace period. */
unsigned long qsmaskinitnext; unsigned long qsmaskinitnext;
/* Online CPUs for next grace period. */ /* Online CPUs for next grace period. */
unsigned long expmask; /* CPUs or groups that need to check in */
/* to allow the current expedited GP */
/* to complete. */
unsigned long expmaskinit;
/* Per-GP initial values for expmask. */
/* Initialized from ->expmaskinitnext at the */
/* beginning of each expedited GP. */
unsigned long expmaskinitnext;
/* Online CPUs for next expedited GP. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */ /* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */ int grplo; /* lowest-numbered CPU or group here. */
@ -281,6 +278,18 @@ struct rcu_node {
for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
* Union to allow "aggregate OR" operation on the need for a quiescent
* state by the normal and expedited grace periods.
*/
union rcu_noqs {
struct {
u8 norm;
u8 exp;
} b; /* Bits. */
u16 s; /* Set of bits, aggregate OR here. */
};
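/*
 * Illustrative helper, not part of this patch: a single load of the
 * aggregate ->s field tests both flavors at once, which is how
 * rcu_sched_qs() and rcu_bh_qs() above consult ->cpu_no_qs.
 */
static inline bool rcu_noqs_any(union rcu_noqs *nqp)
{
	return READ_ONCE(nqp->s) != 0;	/* nonzero iff .b.norm or .b.exp is set */
}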
/* Index values for nxttail array in struct rcu_data. */ /* Index values for nxttail array in struct rcu_data. */
#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
@ -297,8 +306,8 @@ struct rcu_data {
/* is aware of having started. */ /* is aware of having started. */
unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
/* for rcu_all_qs() invocations. */ /* for rcu_all_qs() invocations. */
bool passed_quiesce; /* User-mode/idle loop etc. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
bool qs_pending; /* Core waits for quiesc state. */ bool core_needs_qs; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */ bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible gpnum/completed wrap. */ bool gpwrap; /* Possible gpnum/completed wrap. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
@ -307,9 +316,6 @@ struct rcu_data {
/* ticks this CPU has handled */ /* ticks this CPU has handled */
/* during and after the last grace */ /* during and after the last grace */
/* period it is aware of. */ /* period it is aware of. */
struct cpu_stop_work exp_stop_work;
/* Expedited grace-period control */
/* for CPU stopping. */
/* 2) batch handling */ /* 2) batch handling */
/* /*
@ -363,7 +369,7 @@ struct rcu_data {
/* 5) __rcu_pending() statistics. */ /* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
unsigned long n_rp_qs_pending; unsigned long n_rp_core_needs_qs;
unsigned long n_rp_report_qs; unsigned long n_rp_report_qs;
unsigned long n_rp_cb_ready; unsigned long n_rp_cb_ready;
unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_cpu_needs_gp;
@ -378,7 +384,6 @@ struct rcu_data {
struct rcu_head oom_head; struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
struct mutex exp_funnel_mutex; struct mutex exp_funnel_mutex;
bool exp_done; /* Expedited QS for this CPU? */
/* 7) Callback offloading. */ /* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU #ifdef CONFIG_RCU_NOCB_CPU
@ -412,13 +417,6 @@ struct rcu_data {
struct rcu_state *rsp; struct rcu_state *rsp;
}; };
/* Values for fqs_state field in struct rcu_state. */
#define RCU_GP_IDLE 0 /* No grace period in progress. */
#define RCU_GP_INIT 1 /* Grace period being initialized. */
#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
/* Values for nocb_defer_wakeup field in struct rcu_data. */ /* Values for nocb_defer_wakeup field in struct rcu_data. */
#define RCU_NOGP_WAKE_NOT 0 #define RCU_NOGP_WAKE_NOT 0
#define RCU_NOGP_WAKE 1 #define RCU_NOGP_WAKE 1
@ -464,14 +462,13 @@ struct rcu_state {
/* shut bogus gcc warning) */ /* shut bogus gcc warning) */
u8 flavor_mask; /* bit in flavor mask. */ u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ call_rcu_func_t call; /* call_rcu() flavor. */
void (*func)(struct rcu_head *head)); int ncpus; /* # CPUs seen so far. */
/* The following fields are guarded by the root rcu_node's lock. */ /* The following fields are guarded by the root rcu_node's lock. */
u8 fqs_state ____cacheline_internodealigned_in_smp; u8 boost ____cacheline_internodealigned_in_smp;
/* Force QS state. */ /* Subject to priority boost. */
u8 boost; /* Subject to priority boost. */
unsigned long gpnum; /* Current gp number. */ unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */ unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */ struct task_struct *gp_kthread; /* Task for grace periods. */
@ -508,6 +505,7 @@ struct rcu_state {
atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */
wait_queue_head_t expedited_wq; /* Wait for check-ins. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
unsigned long jiffies_force_qs; /* Time at which to invoke */ unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */ /* force_quiescent_state(). */
@ -538,8 +536,8 @@ struct rcu_state {
#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
/* Values for rcu_state structure's gp_flags field. */ /* Values for rcu_state structure's gp_state field. */
#define RCU_GP_WAIT_INIT 0 /* Initial state. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */
#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
@ -582,9 +580,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */ #endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_stall(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_preempt_check_callbacks(void); static void rcu_preempt_check_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void __init __rcu_init_preempt(void); static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);


@ -101,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *const rcu_state_p = &rcu_preempt_state; static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake); bool wake);
@ -114,6 +113,147 @@ static void __init rcu_bootup_announce(void)
rcu_bootup_announce_oddness(); rcu_bootup_announce_oddness();
} }
/* Flags for rcu_preempt_ctxt_queue() decision table. */
#define RCU_GP_TASKS 0x8
#define RCU_EXP_TASKS 0x4
#define RCU_GP_BLKD 0x2
#define RCU_EXP_BLKD 0x1
/*
* Queues a task preempted within an RCU-preempt read-side critical
* section into the appropriate location within the ->blkd_tasks list,
* depending on the states of any ongoing normal and expedited grace
* periods. The ->gp_tasks pointer indicates which element the normal
* grace period is waiting on (NULL if none), and the ->exp_tasks pointer
* indicates which element the expedited grace period is waiting on (again,
* NULL if none). If a grace period is waiting on a given element in the
* ->blkd_tasks list, it also waits on all subsequent elements. Thus,
* adding a task to the tail of the list blocks any grace period that is
* already waiting on one of the elements. In contrast, adding a task
* to the head of the list won't block any grace period that is already
* waiting on one of the elements.
*
* This queuing is imprecise, and can sometimes make an ongoing grace
* period wait for a task that is not strictly speaking blocking it.
* Given the choice, we needlessly block a normal grace period rather than
* blocking an expedited grace period.
*
* Note that an endless sequence of expedited grace periods still cannot
* indefinitely postpone a normal grace period. Eventually, all of the
* fixed number of preempted tasks blocking the normal grace period that are
* not also blocking the expedited grace period will resume and complete
* their RCU read-side critical sections. At that point, the ->gp_tasks
* pointer will equal the ->exp_tasks pointer, at which point the end of
* the corresponding expedited grace period will also be the end of the
* normal grace period.
*/
static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long flags) __releases(rnp->lock)
{
int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
(rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
(rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
(rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
struct task_struct *t = current;
/*
* Decide where to queue the newly blocked task. In theory,
* this could be an if-statement. In practice, when I tried
* that, it was quite messy.
*/
switch (blkd_state) {
case 0:
case RCU_EXP_TASKS:
case RCU_EXP_TASKS + RCU_GP_BLKD:
case RCU_GP_TASKS:
case RCU_GP_TASKS + RCU_EXP_TASKS:
/*
* Blocking neither GP, or first task blocking the normal
* GP but not blocking the already-waiting expedited GP.
* Queue at the head of the list to avoid unnecessarily
* blocking the already-waiting GPs.
*/
list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
break;
case RCU_EXP_BLKD:
case RCU_GP_BLKD:
case RCU_GP_BLKD + RCU_EXP_BLKD:
case RCU_GP_TASKS + RCU_EXP_BLKD:
case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
/*
* First task arriving that blocks either GP, or first task
* arriving that blocks the expedited GP (with the normal
* GP already waiting), or a task arriving that blocks
* both GPs with both GPs already waiting. Queue at the
* tail of the list to avoid any GP waiting on any of the
* already queued tasks that are not blocking it.
*/
list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
break;
case RCU_EXP_TASKS + RCU_EXP_BLKD:
case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
/*
* Second or subsequent task blocking the expedited GP.
* The task either does not block the normal GP, or is the
* first task blocking the normal GP. Queue just after
* the first task blocking the expedited GP.
*/
list_add(&t->rcu_node_entry, rnp->exp_tasks);
break;
case RCU_GP_TASKS + RCU_GP_BLKD:
case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
/*
* Second or subsequent task blocking the normal GP.
* The task does not block the expedited GP. Queue just
* after the first task blocking the normal GP.
*/
list_add(&t->rcu_node_entry, rnp->gp_tasks);
break;
default:
/* Yet another exercise in excessive paranoia. */
WARN_ON_ONCE(1);
break;
}
/*
* We have now queued the task. If it was the first one to
* block either grace period, update the ->gp_tasks and/or
* ->exp_tasks pointers, respectively, to reference the newly
* blocked tasks.
*/
if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD))
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
raw_spin_unlock(&rnp->lock);
/*
* Report the quiescent state for the expedited GP. This expedited
* GP should not be able to end until we report, so there should be
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
*/
if (blkd_state & RCU_EXP_BLKD &&
t->rcu_read_unlock_special.b.exp_need_qs) {
t->rcu_read_unlock_special.b.exp_need_qs = false;
rcu_report_exp_rdp(rdp->rsp, rdp, true);
} else {
WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
}
local_irq_restore(flags);
}
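/*
 * Worked example (illustrative, not from this commit): a task preempted
 * while the normal GP is waiting on this CPU (->qsmask bit set) and while
 * an expedited GP already has blocked tasks here (->exp_tasks != NULL),
 * but with ->gp_tasks == NULL and this CPU's ->expmask bit clear, yields
 * blkd_state = RCU_EXP_TASKS + RCU_GP_BLKD = 0x6.  That case queues the
 * task at the head of ->blkd_tasks, so the already-waiting expedited GP
 * need not wait on it, and ->gp_tasks is then pointed at the new entry
 * because the task does block the normal GP.
 */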
/* /*
* Record a preemptible-RCU quiescent state for the specified CPU. Note * Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is * that this just means that the task currently running on the CPU is
@ -125,11 +265,11 @@ static void __init rcu_bootup_announce(void)
*/ */
static void rcu_preempt_qs(void) static void rcu_preempt_qs(void)
{ {
if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_preempt"), trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data_p->gpnum), __this_cpu_read(rcu_data_p->gpnum),
TPS("cpuqs")); TPS("cpuqs"));
__this_cpu_write(rcu_data_p->passed_quiesce, 1); __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false);
barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
current->rcu_read_unlock_special.b.need_qs = false; current->rcu_read_unlock_special.b.need_qs = false;
} }
@ -167,42 +307,18 @@ static void rcu_preempt_note_context_switch(void)
t->rcu_blocked_node = rnp; t->rcu_blocked_node = rnp;
/* /*
* If this CPU has already checked in, then this task * Verify the CPU's sanity, trace the preemption, and
* will hold up the next grace period rather than the * then queue the task as required based on the states
* current grace period. Queue the task accordingly. * of any ongoing and expedited grace periods.
* If the task is queued for the current grace period
* (i.e., this CPU has not yet passed through a quiescent
* state for the current grace period), then as long
* as that task remains queued, the current grace period
* cannot end. Note that there is some uncertainty as
* to exactly when the current grace period started.
* We take a conservative approach, which can result
* in unnecessarily waiting on tasks that started very
* slightly after the current grace period began. C'est
* la vie!!!
*
* But first, note that the current CPU must still be
* on line!
*/ */
WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
rnp->gp_tasks = &t->rcu_node_entry;
if (IS_ENABLED(CONFIG_RCU_BOOST) &&
rnp->boost_tasks != NULL)
rnp->boost_tasks = rnp->gp_tasks;
} else {
list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
if (rnp->qsmask & rdp->grpmask)
rnp->gp_tasks = &t->rcu_node_entry;
}
trace_rcu_preempt_task(rdp->rsp->name, trace_rcu_preempt_task(rdp->rsp->name,
t->pid, t->pid,
(rnp->qsmask & rdp->grpmask) (rnp->qsmask & rdp->grpmask)
? rnp->gpnum ? rnp->gpnum
: rnp->gpnum + 1); : rnp->gpnum + 1);
raw_spin_unlock_irqrestore(&rnp->lock, flags); rcu_preempt_ctxt_queue(rnp, rdp, flags);
} else if (t->rcu_read_lock_nesting < 0 && } else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special.s) { t->rcu_read_unlock_special.s) {
@ -272,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t)
unsigned long flags; unsigned long flags;
struct list_head *np; struct list_head *np;
bool drop_boost_mutex = false; bool drop_boost_mutex = false;
struct rcu_data *rdp;
struct rcu_node *rnp; struct rcu_node *rnp;
union rcu_special special; union rcu_special special;
@ -282,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags); local_irq_save(flags);
/* /*
* If RCU core is waiting for this CPU to exit critical section, * If RCU core is waiting for this CPU to exit its critical section,
* let it know that we have done so. Because irqs are disabled, * report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change. * t->rcu_read_unlock_special cannot change.
*/ */
special = t->rcu_read_unlock_special; special = t->rcu_read_unlock_special;
@ -296,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t)
} }
} }
/*
* Respond to a request for an expedited grace period, but only if
* we were not preempted, meaning that we were running on the same
* CPU throughout. If we were preempted, the exp_need_qs flag
* would have been cleared at the time of the first preemption,
* and the quiescent state would be reported when we were dequeued.
*/
if (special.b.exp_need_qs) {
WARN_ON_ONCE(special.b.blocked);
t->rcu_read_unlock_special.b.exp_need_qs = false;
rdp = this_cpu_ptr(rcu_state_p->rda);
rcu_report_exp_rdp(rcu_state_p, rdp, true);
if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
return;
}
}
/* Hardware IRQ handlers cannot block, complain if they get here. */ /* Hardware IRQ handlers cannot block, complain if they get here. */
if (in_irq() || in_serving_softirq()) { if (in_irq() || in_serving_softirq()) {
lockdep_rcu_suspicious(__FILE__, __LINE__, lockdep_rcu_suspicious(__FILE__, __LINE__,
"rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
t->rcu_read_unlock_special.s, t->rcu_read_unlock_special.s,
t->rcu_read_unlock_special.b.blocked, t->rcu_read_unlock_special.b.blocked,
t->rcu_read_unlock_special.b.exp_need_qs,
t->rcu_read_unlock_special.b.need_qs); t->rcu_read_unlock_special.b.need_qs);
local_irq_restore(flags); local_irq_restore(flags);
return; return;
@ -329,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t)
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
} }
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = !rcu_preempted_readers_exp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp); np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry); list_del_init(&t->rcu_node_entry);
@ -353,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t)
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
* so we must take a snapshot of the expedited state. * so we must take a snapshot of the expedited state.
*/ */
empty_exp_now = !rcu_preempted_readers_exp(rnp); empty_exp_now = sync_rcu_preempt_exp_done(rnp);
if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"), trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gpnum, rnp->gpnum,
@ -449,6 +585,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
return ndetected; return ndetected;
} }
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each that is blocking the current
* expedited grace period.
*/
static int rcu_print_task_exp_stall(struct rcu_node *rnp)
{
struct task_struct *t;
int ndetected = 0;
if (!rnp->exp_tasks)
return 0;
t = list_entry(rnp->exp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
pr_cont(" P%d", t->pid);
ndetected++;
}
return ndetected;
}
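The new rcu_print_task_exp_stall() handles only a single leaf rcu_node structure; whoever emits the expedited-stall message is expected to walk all the leaves and sum the per-node counts. A minimal sketch of such a walk is shown below, assuming the rcu_for_each_leaf_node() iterator from kernel/rcu/tree.h; the helper name rcu_exp_stall_task_count() is hypothetical and not part of this commit.

	/* Sketch only: total the blocked-task reports across every leaf rcu_node. */
	static int rcu_exp_stall_task_count(struct rcu_state *rsp)
	{
		struct rcu_node *rnp;
		int ndetected = 0;

		rcu_for_each_leaf_node(rsp, rnp)
			ndetected += rcu_print_task_exp_stall(rnp);
		return ndetected;
	}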
/* /*
* Check that the list of blocked tasks for the newly completed grace * Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace * period is in fact empty. It is a serious bug to complete a grace
@ -483,8 +640,8 @@ static void rcu_preempt_check_callbacks(void)
return; return;
} }
if (t->rcu_read_lock_nesting > 0 && if (t->rcu_read_lock_nesting > 0 &&
__this_cpu_read(rcu_data_p->qs_pending) && __this_cpu_read(rcu_data_p->core_needs_qs) &&
!__this_cpu_read(rcu_data_p->passed_quiesce)) __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm))
t->rcu_read_unlock_special.b.need_qs = true; t->rcu_read_unlock_special.b.need_qs = true;
} }
@ -500,7 +657,7 @@ static void rcu_preempt_do_callbacks(void)
/* /*
* Queue a preemptible-RCU callback for invocation after a grace period. * Queue a preemptible-RCU callback for invocation after a grace period.
*/ */
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) void call_rcu(struct rcu_head *head, rcu_callback_t func)
{ {
__call_rcu(head, func, rcu_state_p, -1, 0); __call_rcu(head, func, rcu_state_p, -1, 0);
} }
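The prototype change relies on the rcu_callback_t typedef introduced elsewhere in this series, which is simply void (*)(struct rcu_head *). Callers are unaffected; as a reminder, posting a callback still looks like the sketch below, where struct foo and foo_rcu_free() are made-up names used only for illustration (needs <linux/rcupdate.h> and <linux/slab.h>).

	struct foo {
		struct rcu_head rh;
		int data;
	};

	/* Reclaim a struct foo once a grace period has elapsed. */
	static void foo_rcu_free(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rh));
	}

	static void foo_release(struct foo *p)
	{
		/* foo_rcu_free() matches rcu_callback_t: void (*)(struct rcu_head *). */
		call_rcu(&p->rh, foo_rcu_free);
	}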
@ -535,155 +692,41 @@ void synchronize_rcu(void)
} }
EXPORT_SYMBOL_GPL(synchronize_rcu); EXPORT_SYMBOL_GPL(synchronize_rcu);
static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
/* /*
* Return non-zero if there are any tasks in RCU read-side critical * Remote handler for smp_call_function_single(). If there is an
* sections blocking the current preemptible-RCU expedited grace period. * RCU read-side critical section in effect, request that the
* If there is no preemptible-RCU expedited grace period currently in * next rcu_read_unlock() record the quiescent state up the
* progress, returns zero unconditionally. * ->expmask fields in the rcu_node tree. Otherwise, immediately
* report the quiescent state.
*/ */
static int rcu_preempted_readers_exp(struct rcu_node *rnp) static void sync_rcu_exp_handler(void *info)
{ {
return rnp->exp_tasks != NULL; struct rcu_data *rdp;
} struct rcu_state *rsp = info;
struct task_struct *t = current;
/*
 * Return non-zero if there is no RCU expedited grace period in progress
 * for the specified rcu_node structure, in other words, if all CPUs and
 * tasks covered by the specified rcu_node structure have done their bit
 * for the current expedited grace period. Works only for preemptible
 * RCU -- other RCU implementations use other means.
*
* Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
return !rcu_preempted_readers_exp(rnp) &&
READ_ONCE(rnp->expmask) == 0;
}
/*
* Report the exit from RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
* grace period. This event is reported either to the rcu_node structure on
* which the task was queued or to one of that rcu_node structure's ancestors,
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
* Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake)
{
unsigned long flags;
unsigned long mask;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
for (;;) {
if (!sync_rcu_preempt_exp_done(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
break;
}
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
wake_up(&sync_rcu_preempt_exp_wq);
}
break;
}
mask = rnp->grpmask;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
rnp = rnp->parent;
raw_spin_lock(&rnp->lock); /* irqs already disabled */
smp_mb__after_unlock_lock();
rnp->expmask &= ~mask;
}
}
/*
* Snapshot the tasks blocking the newly started preemptible-RCU expedited
* grace period for the specified rcu_node structure, phase 1. If there
* are such tasks, set the ->expmask bits up the rcu_node tree and also
* set the ->expmask bits on the leaf rcu_node structures to tell phase 2
* that work is needed here.
*
* Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void
sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
{
unsigned long flags;
unsigned long mask;
struct rcu_node *rnp_up;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
WARN_ON_ONCE(rnp->expmask);
WARN_ON_ONCE(rnp->exp_tasks);
if (!rcu_preempt_has_tasks(rnp)) {
/* No blocked tasks, nothing to do. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
/* Call for Phase 2 and propagate ->expmask bits up the tree. */
rnp->expmask = 1;
rnp_up = rnp;
while (rnp_up->parent) {
mask = rnp_up->grpmask;
rnp_up = rnp_up->parent;
if (rnp_up->expmask & mask)
break;
raw_spin_lock(&rnp_up->lock); /* irqs already off */
smp_mb__after_unlock_lock();
rnp_up->expmask |= mask;
raw_spin_unlock(&rnp_up->lock); /* irqs still off */
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
* Snapshot the tasks blocking the newly started preemptible-RCU expedited
* grace period for the specified rcu_node structure, phase 2. If the
* leaf rcu_node structure has its ->expmask field set, check for tasks.
* If there are some, clear ->expmask and set ->exp_tasks accordingly,
* then initiate RCU priority boosting. Otherwise, clear ->expmask and
* invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
* enabling rcu_read_unlock_special() to do the bit-clearing.
*
* Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void
sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
{
unsigned long flags;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
if (!rnp->expmask) {
/* Phase 1 didn't do anything, so Phase 2 doesn't either. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
/* Phase 1 is over. */
rnp->expmask = 0;
/* /*
* If there are still blocked tasks, set up ->exp_tasks so that * Within an RCU read-side critical section, request that the next
* rcu_read_unlock_special() will wake us and then boost them. * rcu_read_unlock() report. Unless this RCU read-side critical
* section has already blocked, in which case it is already set
* up for the expedited grace period to wait on it.
*/ */
if (rcu_preempt_has_tasks(rnp)) { if (t->rcu_read_lock_nesting > 0 &&
rnp->exp_tasks = rnp->blkd_tasks.next; !t->rcu_read_unlock_special.b.blocked) {
rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ t->rcu_read_unlock_special.b.exp_need_qs = true;
return; return;
} }
/* No longer any blocked tasks, so undo bit setting. */ /*
raw_spin_unlock_irqrestore(&rnp->lock, flags); * We are either exiting an RCU read-side critical section (negative
rcu_report_exp_rnp(rsp, rnp, false); * values of t->rcu_read_lock_nesting) or are not in one at all
* (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
* read-side critical section that blocked before this expedited
* grace period started. Either way, we can immediately report
* the quiescent state.
*/
rdp = this_cpu_ptr(rsp->rda);
rcu_report_exp_rdp(rsp, rdp, true);
} }
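As its header comment says, sync_rcu_exp_handler() is meant to run as a remote smp_call_function_single() handler. A rough sketch of how the expedited machinery might IPI CPUs with it is shown below; the real dispatch lives in sync_rcu_exp_select_cpus() and also copes with idle and offline CPUs, so the helper name and the simple online-CPU loop here are illustrative assumptions only.

	/* Sketch only: IPI every online CPU with the expedited handler. */
	static void rcu_exp_ipi_online_cpus(struct rcu_state *rsp)
	{
		int cpu;

		/* Must be called from a context in which sending IPIs is legal. */
		for_each_online_cpu(cpu)
			smp_call_function_single(cpu, sync_rcu_exp_handler, rsp, 0);
	}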
/** /**
@ -713,24 +756,12 @@ void synchronize_rcu_expedited(void)
rcu_exp_gp_seq_start(rsp); rcu_exp_gp_seq_start(rsp);
/* force all RCU readers onto ->blkd_tasks lists. */ /* Initialize the rcu_node tree in preparation for the wait. */
synchronize_sched_expedited(); sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
/*
* Snapshot current state of ->blkd_tasks lists into ->expmask.
* Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
* to start clearing them. Doing this in one phase leads to
* strange races between setting and clearing bits, so just say "no"!
*/
rcu_for_each_leaf_node(rsp, rnp)
sync_rcu_preempt_exp_init1(rsp, rnp);
rcu_for_each_leaf_node(rsp, rnp)
sync_rcu_preempt_exp_init2(rsp, rnp);
/* Wait for snapshotted ->blkd_tasks lists to drain. */ /* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp); rnp = rcu_get_root(rsp);
wait_event(sync_rcu_preempt_exp_wq, synchronize_sched_expedited_wait(rsp);
sync_rcu_preempt_exp_done(rnp));
/* Clean up and exit. */ /* Clean up and exit. */
rcu_exp_gp_seq_end(rsp); rcu_exp_gp_seq_end(rsp);
@ -834,6 +865,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
return 0; return 0;
} }
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections that are
* blocking the current expedited grace period.
*/
static int rcu_print_task_exp_stall(struct rcu_node *rnp)
{
return 0;
}
/* /*
* Because there is no preemptible RCU, there can be no readers blocked, * Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for * so there is no need to check for blocked tasks. So check only for
@ -1702,8 +1743,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum; ticks_value = rsp->gpnum - rdp->gpnum;
} }
print_cpu_stall_fast_no_hz(fast_no_hz, cpu); print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu, ticks_value, ticks_title, cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
"N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff, atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
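The three new per-CPU flag characters use the C idiom "X."[cond]: the condition indexes a two-character string literal, selecting '.' when it is true and the flag letter when it is false. In the stall report, 'O' therefore marks a CPU that cpu_online() considers offline, 'o' marks one whose bit is clear in its rcu_node's ->qsmaskinit, and 'N' marks one whose bit is clear in ->qsmaskinitnext. A standalone (userspace, not kernel) illustration of the idiom:

	#include <stdio.h>

	int main(void)
	{
		int online = 0;	/* pretend cpu_online(cpu) returned false */

		/* !!online is 0, so "O."[0] yields 'O' -- the offline flag. */
		printf("cpu flag: %c\n", "O."[!!online]);

		online = 1;	/* now pretend the CPU is online */
		printf("cpu flag: %c\n", "O."[!!online]);	/* prints '.' */
		return 0;
	}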

View File

@ -117,13 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
if (!rdp->beenonline) if (!rdp->beenonline)
return; return;
seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
rdp->cpu, rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ', cpu_is_offline(rdp->cpu) ? '!' : ' ',
ulong2long(rdp->completed), ulong2long(rdp->gpnum), ulong2long(rdp->completed), ulong2long(rdp->gpnum),
rdp->passed_quiesce, rdp->cpu_no_qs.b.norm,
rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
rdp->qs_pending); rdp->core_needs_qs);
seq_printf(m, " dt=%d/%llx/%d df=%lu", seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks), atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nesting,
@ -268,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum; gpnum = rsp->gpnum;
seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
ulong2long(rsp->completed), ulong2long(gpnum), ulong2long(rsp->completed), ulong2long(gpnum),
rsp->fqs_state, rsp->gp_state,
(long)(rsp->jiffies_force_qs - jiffies), (long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff)); (int)(jiffies & 0xffff));
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@ -361,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
cpu_is_offline(rdp->cpu) ? '!' : ' ', cpu_is_offline(rdp->cpu) ? '!' : ' ',
rdp->n_rcu_pending); rdp->n_rcu_pending);
seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
rdp->n_rp_qs_pending, rdp->n_rp_core_needs_qs,
rdp->n_rp_report_qs, rdp->n_rp_report_qs,
rdp->n_rp_cb_ready, rdp->n_rp_cb_ready,
rdp->n_rp_cpu_needs_gp); rdp->n_rp_cpu_needs_gp);

View File

@ -534,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void);
* Post an RCU-tasks callback. First call must be from process context * Post an RCU-tasks callback. First call must be from process context
 * after the scheduler is fully operational. * after the scheduler is fully operational.
*/ */
void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{ {
unsigned long flags; unsigned long flags;
bool needwake; bool needwake;

View File

@ -4022,6 +4022,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{ {
return _sched_setscheduler(p, policy, param, false); return _sched_setscheduler(p, policy, param, false);
} }
EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
static int static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
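Exporting sched_setscheduler_nocheck() lets GPL modules adjust a kthread's scheduling policy without the capability checks that sched_setscheduler() applies on behalf of user requests. A hedged sketch of the kind of caller this enables follows; the function and variable names are hypothetical and require <linux/sched.h>.

	/* Sketch only: raise a kthread to real-time priority from module code. */
	static void boost_my_kthread(struct task_struct *tsk)
	{
		struct sched_param sp = { .sched_priority = 1 };

		/* No CAP_SYS_NICE-style check here, unlike sched_setscheduler(). */
		sched_setscheduler_nocheck(tsk, SCHED_FIFO, &sp);
	}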

View File

@ -523,6 +523,7 @@ static int stutter;
*/ */
void stutter_wait(const char *title) void stutter_wait(const char *title)
{ {
cond_resched_rcu_qs();
while (READ_ONCE(stutter_pause_test) || while (READ_ONCE(stutter_pause_test) ||
(torture_runnable && !READ_ONCE(*torture_runnable))) { (torture_runnable && !READ_ONCE(*torture_runnable))) {
if (stutter_pause_test) if (stutter_pause_test)

View File

@ -75,7 +75,7 @@ usage () {
while test $# -gt 0 while test $# -gt 0
do do
case "$1" in case "$1" in
--bootargs) --bootargs|--bootarg)
checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--' checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--'
TORTURE_BOOTARGS="$2" TORTURE_BOOTARGS="$2"
shift shift
@ -88,7 +88,7 @@ do
--buildonly) --buildonly)
TORTURE_BUILDONLY=1 TORTURE_BUILDONLY=1
;; ;;
--configs) --configs|--config)
checkarg --configs "(list of config files)" "$#" "$2" '^[^/]*$' '^--' checkarg --configs "(list of config files)" "$#" "$2" '^[^/]*$' '^--'
configs="$2" configs="$2"
shift shift
@ -134,7 +134,7 @@ do
--no-initrd) --no-initrd)
TORTURE_INITRD=""; export TORTURE_INITRD TORTURE_INITRD=""; export TORTURE_INITRD
;; ;;
--qemu-args) --qemu-args|--qemu-arg)
checkarg --qemu-args "-qemu args" $# "$2" '^-' '^error' checkarg --qemu-args "-qemu args" $# "$2" '^-' '^error'
TORTURE_QEMU_ARG="$2" TORTURE_QEMU_ARG="$2"
shift shift

View File

@ -2,3 +2,5 @@ LOCK01
LOCK02 LOCK02
LOCK03 LOCK03
LOCK04 LOCK04
LOCK05
LOCK06

View File

@ -0,0 +1,6 @@
CONFIG_SMP=y
CONFIG_NR_CPUS=4
CONFIG_HOTPLUG_CPU=y
CONFIG_PREEMPT_NONE=n
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=y

View File

@ -0,0 +1 @@
locktorture.torture_type=rtmutex_lock

View File

@ -0,0 +1,6 @@
CONFIG_SMP=y
CONFIG_NR_CPUS=4
CONFIG_HOTPLUG_CPU=y
CONFIG_PREEMPT_NONE=n
CONFIG_PREEMPT_VOLUNTARY=n
CONFIG_PREEMPT=y

View File

@ -0,0 +1 @@
locktorture.torture_type=percpu_rwsem_lock