From f4ecea309d3e17ba5e90082308125ad23bd5701b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jul 2015 17:28:11 -0700 Subject: [PATCH 01/53] rcu: Use rsp->expedited_wq instead of sync_rcu_preempt_exp_wq Now that there is an ->expedited_wq waitqueue in each rcu_state structure, there is no need for the sync_rcu_preempt_exp_wq global variable. This commit therefore substitutes ->expedited_wq for sync_rcu_preempt_exp_wq. It also initializes ->expedited_wq only once at boot instead of at the start of each expedited grace period. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 775d36cc0050..53d66ebb4811 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3556,7 +3556,6 @@ void synchronize_sched_expedited(void) rcu_exp_gp_seq_start(rsp); /* Stop each CPU that is online, non-idle, and not us. */ - init_waitqueue_head(&rsp->expedited_wq); atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ for_each_online_cpu(cpu) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); @@ -4179,6 +4178,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } init_waitqueue_head(&rsp->gp_wq); + init_waitqueue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b2bf3963a0ae..72df006de798 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -535,8 +535,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); - /* * Return non-zero if there are any tasks in RCU read-side critical * sections blocking the current preemptible-RCU expedited grace period. @@ -590,7 +588,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore(&rnp->lock, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - wake_up(&sync_rcu_preempt_exp_wq); + wake_up(&rsp->expedited_wq); } break; } @@ -729,7 +727,7 @@ void synchronize_rcu_expedited(void) /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); - wait_event(sync_rcu_preempt_exp_wq, + wait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp)); /* Clean up and exit. */ From 7922cd0e562cb2b8da2c8a0afda2e1c9bb4a94e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jul 2015 13:34:32 -0700 Subject: [PATCH 02/53] rcu: Move rcu_report_exp_rnp() to allow consolidation This is a nearly pure code-movement commit, moving rcu_report_exp_rnp(), sync_rcu_preempt_exp_done(), and rcu_preempted_readers_exp() so that later commits can make synchronize_sched_expedited() use them. The non-code-movement portion of this commit tags rcu_report_exp_rnp() as __maybe_unused to avoid build errors when CONFIG_PREEMPT=n. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 66 ++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 66 ---------------------------------------- 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 53d66ebb4811..59af27d8bc6a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3379,6 +3379,72 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) return rcu_seq_done(&rsp->expedited_sequence, s); } +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(struct rcu_node *rnp) +{ + return rnp->exp_tasks != NULL; +} + +/* + * return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return !rcu_preempted_readers_exp(rnp) && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + unsigned long mask; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + wake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); + rnp->expmask &= ~mask; + } +} + /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 72df006de798..e73be8539978 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -535,72 +535,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. 
- */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return !rcu_preempted_readers_exp(rnp) && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ - unsigned long flags; - unsigned long mask; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - wake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - rnp->expmask &= ~mask; - } -} - /* * Snapshot the tasks blocking the newly started preemptible-RCU expedited * grace period for the specified rcu_node structure, phase 1. If there From b9585e940a0d78770cda8f9aebf81b17b4d19e6d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jul 2015 16:04:45 -0700 Subject: [PATCH 03/53] rcu: Consolidate tree setup for synchronize_rcu_expedited() This commit replaces sync_rcu_preempt_exp_init1(() and sync_rcu_preempt_exp_init2() with sync_exp_reset_tree_hotplug() and sync_exp_reset_tree(), which will also be used by synchronize_sched_expedited(), and sync_rcu_exp_select_nodes(), which contains code specific to synchronize_rcu_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 86 ++++++++++++++++++++++++++++++++- kernel/rcu/tree.h | 17 +++++-- kernel/rcu/tree_plugin.h | 102 +++++++-------------------------------- 3 files changed, 115 insertions(+), 90 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 59af27d8bc6a..8526896afea7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3379,6 +3379,87 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) return rcu_seq_done(&rsp->expedited_sequence, s); } +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. 
+ */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave(&rnp_up->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + /* * Return non-zero if there are any tasks in RCU read-side critical * sections blocking the current preemptible-RCU expedited grace period. @@ -3971,7 +4052,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->beenonline = 1; /* We have now been online. */ rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -3993,6 +4073,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + if (!rdp->beenonline) + WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); + rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->passed_quiesce = false; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2e991f8361e4..a57f25ecca58 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -171,16 +171,21 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ - unsigned long expmask; /* Groups that have ->blkd_tasks */ - /* elements that need to drain to allow the */ - /* current expedited grace period to */ - /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; - /* Per-GP initial value for qsmask & expmask. */ + /* Per-GP initial value for qsmask. 
*/ /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; /* Online CPUs for next grace period. */ + unsigned long expmask; /* CPUs or groups that need to check in */ + /* to allow the current expedited GP */ + /* to complete. */ + unsigned long expmaskinit; + /* Per-GP initial values for expmask. */ + /* Initialized from ->expmaskinitnext at the */ + /* beginning of each expedited GP. */ + unsigned long expmaskinitnext; + /* Online CPUs for next expedited GP. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -466,6 +471,7 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); + int ncpus; /* # CPUs seen so far. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -508,6 +514,7 @@ struct rcu_state { atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e73be8539978..62d05413b7ba 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -536,86 +536,28 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); /* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 1. If there - * are such tasks, set the ->expmask bits up the rcu_node tree and also - * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 - * that work is needed here. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Select the nodes that the upcoming expedited grace period needs + * to wait for. */ -static void -sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) +static void sync_rcu_exp_select_nodes(struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; - struct rcu_node *rnp_up; + struct rcu_node *rnp; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp->expmask); - WARN_ON_ONCE(rnp->exp_tasks); - if (!rcu_preempt_has_tasks(rnp)) { - /* No blocked tasks, nothing to do. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - /* Call for Phase 2 and propagate ->expmask bits up the tree. */ - rnp->expmask = 1; - rnp_up = rnp; - while (rnp_up->parent) { - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - if (rnp_up->expmask & mask) - break; - raw_spin_lock(&rnp_up->lock); /* irqs already off */ + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - rnp_up->expmask |= mask; - raw_spin_unlock(&rnp_up->lock); /* irqs still off */ + rnp->expmask = 0; /* No per-CPU component yet. */ + if (!rcu_preempt_has_tasks(rnp)) { + /* FIXME: Want __rcu_report_exp_rnp() here. 
*/ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else { + rnp->exp_tasks = rnp->blkd_tasks.next; + rcu_initiate_boost(rnp, flags); + } + rcu_report_exp_rnp(rsp, rnp, false); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 2. If the - * leaf rcu_node structure has its ->expmask field set, check for tasks. - * If there are some, clear ->expmask and set ->exp_tasks accordingly, - * then initiate RCU priority boosting. Otherwise, clear ->expmask and - * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, - * enabling rcu_read_unlock_special() to do the bit-clearing. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static void -sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - if (!rnp->expmask) { - /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - /* Phase 1 is over. */ - rnp->expmask = 0; - - /* - * If there are still blocked tasks, set up ->exp_tasks so that - * rcu_read_unlock_special() will wake us and then boost them. - */ - if (rcu_preempt_has_tasks(rnp)) { - rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - return; - } - - /* No longer any blocked tasks, so undo bit setting. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_report_exp_rnp(rsp, rnp, false); } /** @@ -648,16 +590,8 @@ void synchronize_rcu_expedited(void) /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); - /* - * Snapshot current state of ->blkd_tasks lists into ->expmask. - * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() - * to start clearing them. Doing this in one phase leads to - * strange races between setting and clearing bits, so just say "no"! - */ - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init1(rsp, rnp); - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init2(rsp, rnp); + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_nodes(rsp); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); From 8203d6d0ee784cfb2ebf89053f7fe399abc867d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Aug 2015 13:53:17 -0700 Subject: [PATCH 04/53] rcu: Use single-stage IPI algorithm for RCU expedited grace period The current preemptible-RCU expedited grace-period algorithm invokes synchronize_sched_expedited() to enqueue all tasks currently running in a preemptible-RCU read-side critical section, then waits for all the ->blkd_tasks lists to drain. This works, but results in both an IPI and a double context switch even on CPUs that do not happen to be running in a preemptible RCU read-side critical section. This commit implements a new algorithm that causes less OS jitter. This new algorithm IPIs all online CPUs that are not idle (from an RCU perspective), but refrains from self-IPIs. If a CPU receiving this IPI is not in a preemptible RCU read-side critical section (or is just now exiting one), it pushes quiescence up the rcu_node tree, otherwise, it sets a flag that will be handled by the upcoming outermost rcu_read_unlock(), which will then push quiescence up the tree. 
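As a rough, stand-alone sketch of the decision made on each IPIed CPU (the real handler added later in this patch is sync_rcu_exp_handler()): if the current task is inside a preemptible-RCU read-side critical section and has not already blocked, the handler only sets a per-task flag and lets the outermost rcu_read_unlock() report the quiescent state; otherwise the CPU reports immediately. The struct task_state and report_exp_qs() below are invented stand-ins for illustration, not code from this series.

#include <stdbool.h>
#include <stdio.h>

struct task_state {                        /* stand-in for the relevant task_struct state */
	int  rcu_read_lock_nesting;        /* > 0: inside an rcu_read_lock() section */
	bool blocked;                      /* already preempted within that section */
	bool exp_need_qs;                  /* outermost rcu_read_unlock() must report */
};

static void report_exp_qs(const char *who) /* stand-in for reporting up the rcu_node tree */
{
	printf("expedited QS reported by %s\n", who);
}

static void exp_handler_model(struct task_state *t)
{
	if (t->rcu_read_lock_nesting > 0 && !t->blocked) {
		/* Running reader: defer to its outermost rcu_read_unlock(). */
		t->exp_need_qs = true;
		return;
	}
	/*
	 * Not in a reader, exiting one, or blocked before this expedited
	 * GP started (in which case the GP waits on ->blkd_tasks instead):
	 * the CPU itself is quiescent, so report right away.
	 */
	report_exp_qs("IPI handler");
}

int main(void)
{
	struct task_state idle_task = { 0, false, false };
	struct task_state reader = { 1, false, false };

	exp_handler_model(&idle_task);     /* reports immediately */
	exp_handler_model(&reader);        /* defers by setting exp_need_qs */
	if (reader.exp_need_qs)
		report_exp_qs("outermost rcu_read_unlock()");
	return 0;
}

The point of the new algorithm is that a CPU not running a reader answers the IPI with nothing more than this check, rather than taking the IPI plus double context switch imposed by the old synchronize_sched_expedited()-based approach.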
The expedited grace period must of course wait on any pre-existing blocked readers, and newly blocked readers must be queued carefully based on the state of both the normal and the expedited grace periods. This new queueing approach also avoids the need to update boost state, courtesy of the fact that blocked tasks are no longer ever migrated to the root rcu_node structure. Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 10 +- kernel/rcu/tree.c | 75 +++++++--- kernel/rcu/tree_plugin.h | 298 +++++++++++++++++++++++++++++++++------ 3 files changed, 312 insertions(+), 71 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a4ab9daa387c..7fa8c4d372e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1330,10 +1330,12 @@ struct sched_dl_entity { union rcu_special { struct { - bool blocked; - bool need_qs; - } b; - short s; + u8 blocked; + u8 need_qs; + u8 exp_need_qs; + u8 pad; /* Otherwise the compiler can store garbage here. */ + } b; /* Bits. */ + u32 s; /* Set of bits. */ }; struct rcu_node; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8526896afea7..6e92bf4337bd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3461,18 +3461,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) } /* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress + * Return non-zero if there is no RCU expedited grace period in progress * for the specified rcu_node structure, in other words, if all CPUs and * tasks covered by the specified rcu_node structure have done their bit * for the current expedited grace period. Works only for preemptible @@ -3482,7 +3471,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp) */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { - return !rcu_preempted_readers_exp(rnp) && + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -3494,19 +3483,21 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex and the + * specified rcu_node structure's ->lock. */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) { - unsigned long flags; unsigned long mask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; } if (rnp->parent == NULL) { @@ -3522,10 +3513,54 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, rnp = rnp->parent; raw_spin_lock(&rnp->lock); /* irqs already disabled */ smp_mb__after_unlock_lock(); + WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } } +/* + * Report expedited quiescent state for specified node. 
This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the root + * rcu_node's exp_funnel_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + WARN_ON_ONCE((rnp->expmask & mask) != mask); + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rdp(struct rcu_state *rsp, + struct rcu_data *rdp, bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 62d05413b7ba..6f7500f9387c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -101,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *const rcu_state_p = &rcu_preempt_state; static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; -static int rcu_preempted_readers_exp(struct rcu_node *rnp); static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); @@ -114,6 +113,147 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } +/* Flags for rcu_preempt_ctxt_queue() decision table. */ +#define RCU_GP_TASKS 0x8 +#define RCU_EXP_TASKS 0x4 +#define RCU_GP_BLKD 0x2 +#define RCU_EXP_BLKD 0x1 + +/* + * Queues a task preempted within an RCU-preempt read-side critical + * section into the appropriate location within the ->blkd_tasks list, + * depending on the states of any ongoing normal and expedited grace + * periods. The ->gp_tasks pointer indicates which element the normal + * grace period is waiting on (NULL if none), and the ->exp_tasks pointer + * indicates which element the expedited grace period is waiting on (again, + * NULL if none). If a grace period is waiting on a given element in the + * ->blkd_tasks list, it also waits on all subsequent elements. Thus, + * adding a task to the tail of the list blocks any grace period that is + * already waiting on one of the elements. In contrast, adding a task + * to the head of the list won't block any grace period that is already + * waiting on one of the elements. + * + * This queuing is imprecise, and can sometimes make an ongoing grace + * period wait for a task that is not strictly speaking blocking it. + * Given the choice, we needlessly block a normal grace period rather than + * blocking an expedited grace period. + * + * Note that an endless sequence of expedited grace periods still cannot + * indefinitely postpone a normal grace period. 
Eventually, all of the + * fixed number of preempted tasks blocking the normal grace period that are + * not also blocking the expedited grace period will resume and complete + * their RCU read-side critical sections. At that point, the ->gp_tasks + * pointer will equal the ->exp_tasks pointer, at which point the end of + * the corresponding expedited grace period will also be the end of the + * normal grace period. + */ +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long flags) __releases(rnp->lock) +{ + int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + + (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) + + (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); + struct task_struct *t = current; + + /* + * Decide where to queue the newly blocked task. In theory, + * this could be an if-statement. In practice, when I tried + * that, it was quite messy. + */ + switch (blkd_state) { + case 0: + case RCU_EXP_TASKS: + case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS: + case RCU_GP_TASKS + RCU_EXP_TASKS: + + /* + * Blocking neither GP, or first task blocking the normal + * GP but not blocking the already-waiting expedited GP. + * Queue at the head of the list to avoid unnecessarily + * blocking the already-waiting GPs. + */ + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_BLKD: + case RCU_GP_BLKD: + case RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + + /* + * First task arriving that blocks either GP, or first task + * arriving that blocks the expedited GP (with the normal + * GP already waiting), or a task arriving that blocks + * both GPs with both GPs already waiting. Queue at the + * tail of the list to avoid any GP waiting on any of the + * already queued tasks that are not blocking it. + */ + list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + + /* + * Second or subsequent task blocking the expedited GP. + * The task either does not block the normal GP, or is the + * first task blocking the normal GP. Queue just after + * the first task blocking the expedited GP. + */ + list_add(&t->rcu_node_entry, rnp->exp_tasks); + break; + + case RCU_GP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + + /* + * Second or subsequent task blocking the normal GP. + * The task does not block the expedited GP. Queue just + * after the first task blocking the normal GP. + */ + list_add(&t->rcu_node_entry, rnp->gp_tasks); + break; + + default: + + /* Yet another exercise in excessive paranoia. */ + WARN_ON_ONCE(1); + break; + } + + /* + * We have now queued the task. If it was the first one to + * block either grace period, update the ->gp_tasks and/or + * ->exp_tasks pointers, respectively, to reference the newly + * blocked tasks. + */ + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + rnp->gp_tasks = &t->rcu_node_entry; + if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) + rnp->exp_tasks = &t->rcu_node_entry; + raw_spin_unlock(&rnp->lock); + + /* + * Report the quiescent state for the expedited GP. This expedited + * GP should not be able to end until we report, so there should be + * no need to check for a subsequent expedited GP. 
(Though we are + * still in a quiescent state in any case.) + */ + if (blkd_state & RCU_EXP_BLKD && + t->rcu_read_unlock_special.b.exp_need_qs) { + t->rcu_read_unlock_special.b.exp_need_qs = false; + rcu_report_exp_rdp(rdp->rsp, rdp, true); + } else { + WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); + } + local_irq_restore(flags); +} + /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -167,42 +307,18 @@ static void rcu_preempt_note_context_switch(void) t->rcu_blocked_node = rnp; /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. Note that there is some uncertainty as - * to exactly when the current grace period started. - * We take a conservative approach, which can result - * in unnecessarily waiting on tasks that started very - * slightly after the current grace period began. C'est - * la vie!!! - * - * But first, note that the current CPU must still be - * on line! + * Verify the CPU's sanity, trace the preemption, and + * then queue the task as required based on the states + * of any ongoing and expedited grace periods. */ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { - list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); - rnp->gp_tasks = &t->rcu_node_entry; - if (IS_ENABLED(CONFIG_RCU_BOOST) && - rnp->boost_tasks != NULL) - rnp->boost_tasks = rnp->gp_tasks; - } else { - list_add(&t->rcu_node_entry, &rnp->blkd_tasks); - if (rnp->qsmask & rdp->grpmask) - rnp->gp_tasks = &t->rcu_node_entry; - } trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_preempt_ctxt_queue(rnp, rdp, flags); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -272,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t) unsigned long flags; struct list_head *np; bool drop_boost_mutex = false; + struct rcu_data *rdp; struct rcu_node *rnp; union rcu_special special; @@ -282,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. Because irqs are disabled, + * If RCU core is waiting for this CPU to exit its critical section, + * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; @@ -296,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t) } } + /* + * Respond to a request for an expedited grace period, but only if + * we were not preempted, meaning that we were running on the same + * CPU throughout. If we were preempted, the exp_need_qs flag + * would have been cleared at the time of the first preemption, + * and the quiescent state would be reported when we were dequeued. 
+ */ + if (special.b.exp_need_qs) { + WARN_ON_ONCE(special.b.blocked); + t->rcu_read_unlock_special.b.exp_need_qs = false; + rdp = this_cpu_ptr(rcu_state_p->rda); + rcu_report_exp_rdp(rcu_state_p, rdp, true); + if (!t->rcu_read_unlock_special.s) { + local_irq_restore(flags); + return; + } + } + /* Hardware IRQ handlers cannot block, complain if they get here. */ if (in_irq() || in_serving_softirq()) { lockdep_rcu_suspicious(__FILE__, __LINE__, "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); - pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", + pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", t->rcu_read_unlock_special.s, t->rcu_read_unlock_special.b.blocked, + t->rcu_read_unlock_special.b.exp_need_qs, t->rcu_read_unlock_special.b.need_qs); local_irq_restore(flags); return; @@ -329,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); - empty_exp = !rcu_preempted_readers_exp(rnp); + empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -353,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = !rcu_preempted_readers_exp(rnp); + empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, @@ -535,28 +671,99 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +/* + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; + + /* + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. + */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; + return; + } + + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); +} + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. 
*/ -static void sync_rcu_exp_select_nodes(struct rcu_state *rsp) +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp) { + int cpu; unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; struct rcu_node *rnp; sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - rnp->expmask = 0; /* No per-CPU component yet. */ - if (!rcu_preempt_has_tasks(rnp)) { - /* FIXME: Want __rcu_report_exp_rnp() here. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } else { - rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp, flags); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + cpu_is_offline(cpu) || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; } - rcu_report_exp_rnp(rsp, rnp, false); + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; + ret = smp_call_function_single(cpu, + sync_rcu_exp_handler, + rsp, 0); + if (!ret) + mask_ofl_ipi &= ~mask; + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); } } @@ -587,11 +794,8 @@ void synchronize_rcu_expedited(void) rcu_exp_gp_seq_start(rsp); - /* force all RCU readers onto ->blkd_tasks lists. */ - synchronize_sched_expedited(); - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_nodes(rsp); + sync_rcu_exp_select_cpus(rsp); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); From bce5fa12aad148e15efd9bc0015dc4898b6e723b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Aug 2015 16:03:54 -0700 Subject: [PATCH 05/53] rcu: Move synchronize_sched_expedited() to combining tree Currently, synchronize_sched_expedited() uses a single global counter to track the number of remaining context switches that the current expedited grace period must wait on. This is problematic on large systems, where the resulting memory contention can be pathological. This commit therefore makes synchronize_sched_expedited() instead use the combining tree in the same manner as synchronize_rcu_expedited(), keeping memory contention down to a dull roar. This commit creates a temporary function sync_sched_exp_select_cpus() that is very similar to sync_rcu_exp_select_cpus(). A later commit will consolidate these two functions, which becomes possible when synchronize_sched_expedited() switches from stop_one_cpu_nowait() to smp_call_function_single(). Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 123 ++++++++++++++++++++++++++++++---------------- kernel/rcu/tree.h | 1 - 2 files changed, 82 insertions(+), 42 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6e92bf4337bd..d2cdcada6fe0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3642,19 +3642,77 @@ static int synchronize_sched_expedited_cpu_stop(void *data) struct rcu_data *rdp = data; struct rcu_state *rsp = rdp->rsp; - /* We are here: If we are last, do the wakeup. */ - rdp->exp_done = true; - if (atomic_dec_and_test(&rsp->expedited_need_qs)) - wake_up(&rsp->expedited_wq); + /* Report the quiescent state. */ + rcu_report_exp_rdp(rsp, rdp, true); return 0; } +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_sched_exp_select_cpus(struct rcu_state *rsp) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + struct rcu_data *rdp; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + cpu_is_offline(cpu) || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; + rdp = per_cpu_ptr(rsp->rda, cpu); + stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, + rdp, &rdp->exp_stop_work); + mask_ofl_ipi &= ~mask; + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + static void synchronize_sched_expedited_wait(struct rcu_state *rsp) { int cpu; unsigned long jiffies_stall; unsigned long jiffies_start; - struct rcu_data *rdp; + unsigned long mask; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; jiffies_stall = rcu_jiffies_till_stall_check(); @@ -3663,33 +3721,36 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = wait_event_interruptible_timeout( rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs), + sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); if (ret > 0) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. 
*/ wait_event(rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs)); + sync_rcu_preempt_exp_done(rnp_root)); return; } pr_err("INFO: %s detected expedited stalls on CPUs: {", rsp->name); - for_each_online_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - - if (rdp->exp_done) - continue; - pr_cont(" %d", cpu); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + pr_cont(" %d", cpu); + } + mask <<= 1; } pr_cont(" } %lu jiffies s: %lu\n", jiffies - jiffies_start, rsp->expedited_sequence); - for_each_online_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - - if (rdp->exp_done) - continue; - dump_cpu_task(cpu); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } } jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; } @@ -3713,7 +3774,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) */ void synchronize_sched_expedited(void) { - int cpu; unsigned long s; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; @@ -3736,27 +3796,8 @@ void synchronize_sched_expedited(void) } rcu_exp_gp_seq_start(rsp); - - /* Stop each CPU that is online, non-idle, and not us. */ - atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ - for_each_online_cpu(cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdp->exp_done = false; - - /* Skip our CPU and any idle CPUs. */ - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - continue; - atomic_inc(&rsp->expedited_need_qs); - stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, - rdp, &rdp->exp_stop_work); - } - - /* Remove extra count and, if necessary, wait for CPUs to stop. */ - if (!atomic_dec_and_test(&rsp->expedited_need_qs)) - synchronize_sched_expedited_wait(rsp); + sync_sched_exp_select_cpus(rsp); + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a57f25ecca58..efe361c764ab 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -383,7 +383,6 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; - bool exp_done; /* Expedited QS for this CPU? */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU From 97c668b8e983b722e2ed765b98b05f644aff1b13 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 11:31:51 -0700 Subject: [PATCH 06/53] rcu: Rename qs_pending to core_needs_qs An upcoming commit needs to invert the sense of the ->passed_quiesce rcu_data structure field, so this commit is taking this opportunity to clarify things a bit by renaming ->qs_pending to ->core_needs_qs. So if !rdp->core_needs_qs, then this CPU need not concern itself with quiescent states, in particular, it need not acquire its leaf rcu_node structure's ->lock to check. Otherwise, it needs to report the next quiescent state. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 14 +++++++------- kernel/rcu/tree.h | 4 ++-- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_trace.c | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d2cdcada6fe0..7c158ffc7769 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1746,7 +1746,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); } @@ -2357,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { - rdp->qs_pending = 0; + rdp->core_needs_qs = 0; /* * This GP can't end until cpu checks in, so all of our @@ -2388,7 +2388,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Does this CPU still need to do its part for current grace period? * If no, return and let the other CPUs do their part as well. */ - if (!rdp->qs_pending) + if (!rdp->core_needs_qs) return; /* @@ -3828,10 +3828,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce && + rdp->core_needs_qs && !rdp->passed_quiesce && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { - rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && + rdp->n_rp_core_needs_qs++; + } else if (rdp->core_needs_qs && (rdp->passed_quiesce || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; @@ -4157,7 +4157,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->completed = rnp->completed; rdp->passed_quiesce = false; rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); - rdp->qs_pending = false; + rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index efe361c764ab..4a0f30676ba8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -303,7 +303,7 @@ struct rcu_data { unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ bool passed_quiesce; /* User-mode/idle loop etc. */ - bool qs_pending; /* Core waits for quiesc state. */ + bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ @@ -368,7 +368,7 @@ struct rcu_data { /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. 
*/ - unsigned long n_rp_qs_pending; + unsigned long n_rp_core_needs_qs; unsigned long n_rp_report_qs; unsigned long n_rp_cb_ready; unsigned long n_rp_cpu_needs_gp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6f7500f9387c..e33b4f3b8e0a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -619,7 +619,7 @@ static void rcu_preempt_check_callbacks(void) return; } if (t->rcu_read_lock_nesting > 0 && - __this_cpu_read(rcu_data_p->qs_pending) && + __this_cpu_read(rcu_data_p->core_needs_qs) && !__this_cpu_read(rcu_data_p->passed_quiesce)) t->rcu_read_unlock_special.b.need_qs = true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 6fc4c5ff3bb5..4ac25f8520d6 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -123,7 +123,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->passed_quiesce, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), - rdp->qs_pending); + rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -361,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->n_rcu_pending); seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_qs_pending, + rdp->n_rp_core_needs_qs, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); From 0d43eb34f9aabcddf41c99b7af2d0ced33e9a3cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 14:17:29 -0700 Subject: [PATCH 07/53] rcu: Invert passed_quiesce and rename to cpu_no_qs This commit inverts the sense of the rcu_data structure's ->passed_quiesce field and renames it to ->cpu_no_qs. This will allow a later commit to use an "aggregate OR" operation to test expedited as well as normal grace periods without added overhead. Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 32 ++++++++++++++++---------------- kernel/rcu/tree.c | 22 +++++++++++----------- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- kernel/rcu/tree_trace.c | 4 ++-- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 97f17e9decda..ec6998b1b6d0 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -56,14 +56,14 @@ rcuboost: The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - 0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 + 0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... 
b=10 ci=74572 nci=0 co=1131 ca=716 + 1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 + 2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 + 3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 + 4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 + 5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 + 6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 + 7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 This file has one line per CPU, or eight for this 8-CPU system. The fields are as follows: @@ -188,14 +188,14 @@ o "ca" is the number of RCU callbacks that have been adopted by this Kernels compiled with CONFIG_RCU_BOOST=y display the following from /debug/rcu/rcu_preempt/rcudata: - 0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 + 0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 + 1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 + 2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 + 3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 + 4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 + 5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 + 6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 + 7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... 
kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 This is similar to the output discussed above, but contains the following additional fields: diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c158ffc7769..31e7021ced4d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -245,21 +245,21 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + if (__this_cpu_read(rcu_sched_data.cpu_no_qs)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + __this_cpu_write(rcu_sched_data.cpu_no_qs, false); } } void rcu_bh_qs(void) { - if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + if (__this_cpu_read(rcu_bh_data.cpu_no_qs)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + __this_cpu_write(rcu_bh_data.cpu_no_qs, false); } } @@ -1744,7 +1744,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->passed_quiesce = 0; + rdp->cpu_no_qs = true; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); @@ -2337,7 +2337,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if ((rdp->passed_quiesce == 0 && + if ((rdp->cpu_no_qs && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || rdp->gpwrap) { @@ -2348,7 +2348,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * We will instead need a new quiescent state that lies * within the current grace period. */ - rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->cpu_no_qs = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2395,7 +2395,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce && + if (rdp->cpu_no_qs && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; @@ -3828,11 +3828,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->core_needs_qs && !rdp->passed_quiesce && + rdp->core_needs_qs && rdp->cpu_no_qs && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; } else if (rdp->core_needs_qs && - (rdp->passed_quiesce || + (!rdp->cpu_no_qs || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; @@ -4155,7 +4155,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. 
*/ rdp->completed = rnp->completed; - rdp->passed_quiesce = false; + rdp->cpu_no_qs = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4a0f30676ba8..ded4ceebed76 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -302,7 +302,7 @@ struct rcu_data { /* is aware of having started. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ - bool passed_quiesce; /* User-mode/idle loop etc. */ + bool cpu_no_qs; /* No QS yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e33b4f3b8e0a..6977ff0dccb9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -265,11 +265,11 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, */ static void rcu_preempt_qs(void) { - if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { + if (__this_cpu_read(rcu_data_p->cpu_no_qs)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_data_p->passed_quiesce, 1); + __this_cpu_write(rcu_data_p->cpu_no_qs, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } @@ -620,7 +620,7 @@ static void rcu_preempt_check_callbacks(void) } if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && - !__this_cpu_read(rcu_data_p->passed_quiesce)) + __this_cpu_read(rcu_data_p->cpu_no_qs)) t->rcu_read_unlock_special.b.need_qs = true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 4ac25f8520d6..d373e57109b8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -117,11 +117,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, + rdp->cpu_no_qs, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", From 5b74c458906fc4a62f932ee8bb801d29baf15fec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 15:16:57 -0700 Subject: [PATCH 08/53] rcu: Make ->cpu_no_qs be a union for aggregate OR This commit converts the rcu_data structure's ->cpu_no_qs field to a union. The bytewise side of this union allows individual access to indications as to whether this CPU needs to find a quiescent state for a normal (.norm) and/or expedited (.exp) grace period. The setwise side of the union allows testing whether or not a quiescent state is needed at all, for either type of grace period. For now, only .norm is used. A later commit will introduce the expedited usage. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 22 +++++++++++----------- kernel/rcu/tree.h | 14 +++++++++++++- kernel/rcu/tree_plugin.h | 6 +++--- kernel/rcu/tree_trace.c | 2 +- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 31e7021ced4d..3e2875b38eae 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -245,21 +245,21 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - if (__this_cpu_read(rcu_sched_data.cpu_no_qs)) { + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs, false); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); } } void rcu_bh_qs(void) { - if (__this_cpu_read(rcu_bh_data.cpu_no_qs)) { + if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.cpu_no_qs, false); + __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } } @@ -1744,7 +1744,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->cpu_no_qs = true; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); @@ -2337,7 +2337,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if ((rdp->cpu_no_qs && + if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || rdp->gpwrap) { @@ -2348,7 +2348,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * We will instead need a new quiescent state that lies * within the current grace period. */ - rdp->cpu_no_qs = true; /* need qs for new gp. */ + rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2395,7 +2395,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (rdp->cpu_no_qs && + if (rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; @@ -3828,11 +3828,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->core_needs_qs && rdp->cpu_no_qs && + rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; } else if (rdp->core_needs_qs && - (!rdp->cpu_no_qs || + (!rdp->cpu_no_qs.b.norm || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; @@ -4155,7 +4155,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. 
*/ rdp->completed = rnp->completed; - rdp->cpu_no_qs = true; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ded4ceebed76..3eee48bcf52b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -286,6 +286,18 @@ struct rcu_node { for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +/* + * Union to allow "aggregate OR" operation on the need for a quiescent + * state by the normal and expedited grace periods. + */ +union rcu_noqs { + struct { + u8 norm; + u8 exp; + } b; /* Bits. */ + u16 s; /* Set of bits, aggregate OR here. */ +}; + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ @@ -302,7 +314,7 @@ struct rcu_data { /* is aware of having started. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ - bool cpu_no_qs; /* No QS yet for this CPU. */ + union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6977ff0dccb9..7880202f1e38 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -265,11 +265,11 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, */ static void rcu_preempt_qs(void) { - if (__this_cpu_read(rcu_data_p->cpu_no_qs)) { + if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_data_p->cpu_no_qs, false); + __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } @@ -620,7 +620,7 @@ static void rcu_preempt_check_callbacks(void) } if (t->rcu_read_lock_nesting > 0 && __this_cpu_read(rcu_data_p->core_needs_qs) && - __this_cpu_read(rcu_data_p->cpu_no_qs)) + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) t->rcu_read_unlock_special.b.need_qs = true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d373e57109b8..999c3672f990 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->cpu_no_qs, + rdp->cpu_no_qs.b.norm, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", From 84778472e1b6a27a8931712c40e8cf31143c8f6c Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 2 Sep 2015 01:28:44 -0700 Subject: [PATCH 09/53] sched: Export sched_setscheduler_nocheck The new locktorture rtmutex_lock tests exercise priority boosting, which means that they need to set some tasks to real-time priority. To do this, they use sched_setscheduler_nocheck(). However, this is not exported to modules, which results in the following error when building locktorture as a module: ERROR: "sched_setscheduler_nocheck" [kernel/locking/locktorture.ko] undefined! 
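For context, sched_setscheduler_nocheck() is the kernel-internal variant of sched_setscheduler() that skips the capability checks applied to user-supplied parameters. The module-side usage that trips the link error is essentially the following sketch (illustrative only, mirroring the torture_rtmutex_boost() code added later in this series):

	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

	/* In-kernel caller, so skip the capability checks. */
	sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m);

This compiles cleanly, but without an export the module link cannot resolve the symbol.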
This commit therefore adds an EXPORT_SYMBOL_GPL() to allow this function to be invoked from locktorture when built as a module. Reported-by: Stephen Rothwell Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Acked-by: Ingo Molnar Reviewed-by: Josh Triplett --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f9c92884817..c4e607873d6f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4022,6 +4022,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) From b6a4ae766e3133a4db73fabc81e522d1601cb623 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 29 Jul 2015 13:29:38 +0800 Subject: [PATCH 10/53] rcu: Use rcu_callback_t in call_rcu*() and friends As we now have rcu_callback_t typedefs as the type of rcu callbacks, we should use it in call_rcu*() and friends as the type of parameters. This could save us a few lines of code and make it clear which function requires an rcu callbacks rather than other callbacks as its argument. Besides, this can also help cscope to generate a better database for code reading. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 10 +++++----- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 2 +- kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/srcu.c | 2 +- kernel/rcu/tiny.c | 8 ++++---- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 2 +- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 581abf848566..d63bb77dab35 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -160,7 +160,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, * more than one CPU). */ void call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *head)); + rcu_callback_t func); #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -191,7 +191,7 @@ void call_rcu(struct rcu_head *head, * memory ordering guarantees. */ void call_rcu_bh(struct rcu_head *head, - void (*func)(struct rcu_head *head)); + rcu_callback_t func); /** * call_rcu_sched() - Queue an RCU for invocation after sched grace period. @@ -213,7 +213,7 @@ void call_rcu_bh(struct rcu_head *head, * memory ordering guarantees. */ void call_rcu_sched(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)); + rcu_callback_t func); void synchronize_sched(void); @@ -274,7 +274,7 @@ do { \ * See the description of call_rcu() for more detailed information on * memory ordering guarantees. 
*/ -void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head)); +void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void rcu_barrier_tasks(void); @@ -1065,7 +1065,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define __kfree_rcu(head, offset) \ do { \ BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \ - kfree_call_rcu(head, (void (*)(struct rcu_head *))(unsigned long)(offset)); \ + kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ } while (0) /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ff968b7af3a4..c8a0722f77ea 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -83,7 +83,7 @@ static inline void synchronize_sched_expedited(void) } static inline void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) + rcu_callback_t func) { call_rcu(head, func); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5abec82f325e..60d15a080d7c 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -48,7 +48,7 @@ void synchronize_rcu_bh(void); void synchronize_sched_expedited(void); void synchronize_rcu_expedited(void); -void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); /** * synchronize_rcu_bh_expedited - Brute-force RCU-bh grace period diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 77192953dee5..51c8e7f02f48 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void) } static void -call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +call_rcu_busted(struct rcu_head *head, rcu_callback_t func) { /* This is a deliberate bug for testing purposes only! */ func(head); @@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void) } static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { call_srcu(srcu_ctlp, head, func); } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index d3fcb2ec8536..9e6122540d28 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -387,7 +387,7 @@ static void srcu_flip(struct srcu_struct *sp) * srcu_struct structure. */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { unsigned long flags; diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d0471056d0af..944b1b491ed8 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -44,7 +44,7 @@ struct rcu_ctrlblk; static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp); #include "tiny_plugin.h" @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched); * Helper function for call_rcu() and call_rcu_bh(). */ static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp) { unsigned long flags; @@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head, * period. But since we have but one CPU, that would be after any * quiescent state. 
*/ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_ctrlblk); } @@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_ctrlblk); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 775d36cc0050..b9d9e0249e2f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3017,7 +3017,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), +__call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_state *rsp, int cpu, bool lazy) { unsigned long flags; @@ -3088,7 +3088,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), /* * Queue an RCU-sched callback for invocation after a grace period. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_state, -1, 0); } @@ -3097,7 +3097,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Queue an RCU callback for invocation after a quicker grace period. */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_state, -1, 0); } @@ -3111,7 +3111,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * function may only be called from __kfree_rcu(). */ void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) + rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 1); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2e991f8361e4..ad11529375cc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -584,7 +584,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b2bf3963a0ae..06116ae6dfd7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -500,7 +500,7 @@ static void rcu_preempt_do_callbacks(void) /* * Queue a preemptible-RCU callback for invocation after a grace period. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 0); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 7a0b3bc7c5ed..5f748c5a40f0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -534,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void); * Post an RCU-tasks callback. First call must be from process context * after the scheduler if fully operational. 
*/ -void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; From db3e8db45e1cbf8cc22f1083a559d78eb91ccd63 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 29 Jul 2015 13:29:39 +0800 Subject: [PATCH 11/53] rcu: Use call_rcu_func_t to replace explicit type equivalents We have had the call_rcu_func_t typedef for a quite awhile, but we still use explicit function pointer types in some places. These types can confuse cscope and can be hard to read. This patch therefore replaces these types with the call_rcu_func_t typedef. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 51c8e7f02f48..f9ec6cbe77d3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -252,7 +252,7 @@ struct rcu_torture_ops { void (*exp_sync)(void); unsigned long (*get_state)(void); void (*cond_sync)(unsigned long oldstate); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ad11529375cc..0c33c82cec64 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -464,8 +464,7 @@ struct rcu_state { /* shut bogus gcc warning) */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ - void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ - void (*func)(struct rcu_head *head)); + call_rcu_func_t call; /* call_rcu() flavor. */ /* The following fields are guarded by the root rcu_node's lock. */ From bb73c52bad3666997ed2ec83c0c80c3f8ef55008 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 30 Jul 2015 16:55:38 -0700 Subject: [PATCH 12/53] rcu: Don't disable preemption for Tiny and Tree RCU readers Because preempt_disable() maps to barrier() for non-debug builds, it forces the compiler to spill and reload registers. Because Tree RCU and Tiny RCU now only appear in CONFIG_PREEMPT=n builds, these barrier() instances generate needless extra code for each instance of rcu_read_lock() and rcu_read_unlock(). This extra code slows down Tree RCU and bloats Tiny RCU. This commit therefore removes the preempt_disable() and preempt_enable() from the non-preemptible implementations of __rcu_read_lock() and __rcu_read_unlock(), respectively. However, for debug purposes, preempt_disable() and preempt_enable() are still invoked if CONFIG_PREEMPT_COUNT=y, because this allows detection of sleeping inside atomic sections in non-preemptible kernels. However, Tiny and Tree RCU operates by coalescing all RCU read-side critical sections on a given CPU that lie between successive quiescent states. It is therefore necessary to compensate for removing barriers from __rcu_read_lock() and __rcu_read_unlock() by adding them to a couple of the RCU functions invoked during quiescent states, namely to rcu_all_qs() and rcu_note_context_switch(). However, note that the latter is more paranoia than necessity, at least until link-time optimizations become more aggressive. This is based on an earlier patch by Paul E. McKenney, fixing a bug encountered in kernels built with CONFIG_PREEMPT=n and CONFIG_PREEMPT_COUNT=y. Signed-off-by: Boqun Feng Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 6 ++++-- include/linux/rcutiny.h | 1 + kernel/rcu/tree.c | 9 +++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d63bb77dab35..6c3ceceb6148 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -297,12 +297,14 @@ void synchronize_rcu(void); static inline void __rcu_read_lock(void) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_COUNT)) + preempt_disable(); } static inline void __rcu_read_unlock(void) { - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_COUNT)) + preempt_enable(); } static inline void synchronize_rcu(void) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index c8a0722f77ea..4c1aaf9cce7b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -216,6 +216,7 @@ static inline bool rcu_is_watching(void) static inline void rcu_all_qs(void) { + barrier(); /* Avoid RCU read-side critical sections leaking across. */ } #endif /* __LINUX_RCUTINY_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b9d9e0249e2f..93c0f23c3e45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -337,12 +337,14 @@ static void rcu_momentary_dyntick_idle(void) */ void rcu_note_context_switch(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -353,12 +355,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); * RCU flavors in desperate need of a quiescent state, which will normally * be none of them). Either way, do a lightweight quiescent state for * all RCU flavors. + * + * The barrier() calls are redundant in the common case when this is + * called externally, but just in case this is called from within this + * file. + * */ void rcu_all_qs(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_qs_ctr); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_all_qs); From ee968ac61d5a3440b931375d81113e0eedfcb249 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jul 2015 08:28:35 -0700 Subject: [PATCH 13/53] rcu: Eliminate panic when silly boot-time fanout specified This commit loosens rcutree.rcu_fanout_leaf range checks and replaces a panic() with a fallback to compile-time values. This fallback is accompanied by a WARN_ON(), and both occur when the rcutree.rcu_fanout_leaf value is too small to accommodate the number of CPUs. For example, given the current four-level limit for the rcu_node tree, a system with more than 16 CPUs built with CONFIG_FANOUT=2 must have rcutree.rcu_fanout_leaf larger than 2. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/kernel-parameters.txt | 9 ++++++--- kernel/rcu/tree.c | 20 +++++++++++--------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 22a4b687ea5b..23ec96877311 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3074,9 +3074,12 @@ bytes respectively. 
Such letter suffixes can also be entirely omitted. cache-to-cache transfer latencies. rcutree.rcu_fanout_leaf= [KNL] - Increase the number of CPUs assigned to each - leaf rcu_node structure. Useful for very large - systems. + Change the number of CPUs assigned to each + leaf rcu_node structure. Useful for very + large systems, which will choose the value 64, + and for NUMA systems with large remote-access + latencies, which will choose a value aligned + with the appropriate hardware boundaries. rcutree.jiffies_till_sched_qs= [KNL] Set required age in jiffies for a diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93c0f23c3e45..713eb92314b4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4230,13 +4230,12 @@ static void __init rcu_init_geometry(void) rcu_fanout_leaf, nr_cpu_ids); /* - * The boot-time rcu_fanout_leaf parameter is only permitted - * to increase the leaf-level fanout, not decrease it. Of course, - * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Complain and fall back to the compile- - * time values if these limits are exceeded. + * The boot-time rcu_fanout_leaf parameter must be at least two + * and cannot exceed the number of bits in the rcu_node masks. + * Complain and fall back to the compile-time values if this + * limit is exceeded. */ - if (rcu_fanout_leaf < RCU_FANOUT_LEAF || + if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > sizeof(unsigned long) * 8) { rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); @@ -4253,10 +4252,13 @@ static void __init rcu_init_geometry(void) /* * The tree must be able to accommodate the configured number of CPUs. - * If this limit is exceeded than we have a serious problem elsewhere. + * If this limit is exceeded, fall back to the compile-time values. */ - if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) - panic("rcu_init_geometry: rcu_capacity[] is too small"); + if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; + WARN_ON(1); + return; + } /* Calculate the number of levels in the tree. */ for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { From 7f21aeef722d598ba350d1834f6e4134c7b5e8de Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2015 14:49:50 -0700 Subject: [PATCH 14/53] rcu: Add online/offline info to stall warning message This commit makes the RCU CPU stall warning message print online/offline indications immediately after a hyphen following the CPU number. A "O" indicates that the global CPU-hotplug system believes that the CPU is online, a "o" that RCU perceived the CPU to be online at the beginning of the current expedited grace period, and an "N" that RCU currently believes that it will perceive the CPU as being online at the beginning of the next expedited grace period, with "." otherwise for all three indications. So for CPU 10, you would normally see "10-OoN:" indicating that everything believes that the CPU is online. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 06116ae6dfd7..57ed9c13ae5a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1702,8 +1702,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", - cpu, ticks_value, ticks_title, + pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), From 49f5903b473c5f63f3b57856d1bd4593db0a2eef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Sep 2015 00:42:57 -0700 Subject: [PATCH 15/53] rcu: Move preemption disabling out of __srcu_read_lock() Currently, __srcu_read_lock() cannot be invoked from restricted environments because it contains calls to preempt_disable() and preempt_enable(), both of which can invoke lockdep, which is a bad idea in some restricted execution modes. This commit therefore moves the preempt_disable() and preempt_enable() from __srcu_read_lock() to srcu_read_lock(). It also inserts the preempt_disable() and preempt_enable() around the call to __srcu_read_lock() in do_exit(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/srcu.h | 5 ++++- kernel/exit.c | 2 ++ kernel/rcu/srcu.c | 2 -- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index bdeb4567b71e..f5f80c5643ac 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -215,8 +215,11 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp) */ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) { - int retval = __srcu_read_lock(sp); + int retval; + preempt_disable(); + retval = __srcu_read_lock(sp); + preempt_enable(); rcu_lock_acquire(&(sp)->dep_map); return retval; } diff --git a/kernel/exit.c b/kernel/exit.c index ea95ee1b5ef7..0e93b63bbc59 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -761,7 +761,9 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(preempt_disable()); TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); + TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 9e6122540d28..a63a1ea5a41b 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -298,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - preempt_disable(); __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. 
*/ __this_cpu_inc(sp->per_cpu_ref->seq[idx]); - preempt_enable(); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); From 77f81fe08ebd99d7e0eefde42ddac06a675bc4ad Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 9 Sep 2015 12:09:49 -0700 Subject: [PATCH 16/53] rcu: Finish folding ->fqs_state into ->gp_state Commit commit 4cdfc175c25c89ee ("rcu: Move quiescent-state forcing into kthread") started the process of folding the old ->fqs_state into ->gp_state, but did not complete it. This situation does not cause any malfunction, but can result in extremely confusing trace output. This commit completes this task of eliminating ->fqs_state in favor of ->gp_state. The old ->fqs_state was also used to decide when to collect dyntick-idle snapshots. For this purpose, we add a boolean variable into the kthread, which is set on the first call to rcu_gp_fqs() for a given grace period and clear otherwise. Signed-off-by: Petr Mladek Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 18 ++++++++---------- kernel/rcu/tree.h | 14 +++----------- kernel/rcu/tree_trace.c | 2 +- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 713eb92314b4..4d296b0fb987 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -98,7 +98,7 @@ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .rda = &sname##_data, \ .call = cr, \ - .fqs_state = RCU_GP_IDLE, \ + .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ @@ -1936,16 +1936,15 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) /* * Do one round of quiescent-state forcing. */ -static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - int fqs_state = fqs_state_in; bool isidle = false; unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; - if (fqs_state == RCU_SAVE_DYNTICK) { + if (first_time) { /* Collect dyntick-idle snapshots. */ if (is_sysidle_rcu_state(rsp)) { isidle = true; @@ -1954,7 +1953,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj); rcu_sysidle_report_gp(rsp, isidle, maxj); - fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ isidle = true; @@ -1968,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); } - return fqs_state; } /* @@ -2032,7 +2029,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); - rsp->fqs_state = RCU_GP_IDLE; + rsp->gp_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); /* Advance CBs to reduce false positives below. */ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; @@ -2050,7 +2047,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ static int __noreturn rcu_gp_kthread(void *arg) { - int fqs_state; + bool first_gp_fqs; int gf; unsigned long j; int ret; @@ -2082,7 +2079,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle quiescent-state forcing. 
*/ - fqs_state = RCU_SAVE_DYNTICK; + first_gp_fqs = true; j = jiffies_till_first_fqs; if (j > HZ) { j = HZ; @@ -2110,7 +2107,8 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsstart")); - fqs_state = rcu_gp_fqs(rsp, fqs_state); + rcu_gp_fqs(rsp, first_gp_fqs); + first_gp_fqs = false; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0c33c82cec64..be6d1e8eeb79 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -412,13 +412,6 @@ struct rcu_data { struct rcu_state *rsp; }; -/* Values for fqs_state field in struct rcu_state. */ -#define RCU_GP_IDLE 0 /* No grace period in progress. */ -#define RCU_GP_INIT 1 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK - /* Values for nocb_defer_wakeup field in struct rcu_data. */ #define RCU_NOGP_WAKE_NOT 0 #define RCU_NOGP_WAKE 1 @@ -468,9 +461,8 @@ struct rcu_state { /* The following fields are guarded by the root rcu_node's lock. */ - u8 fqs_state ____cacheline_internodealigned_in_smp; - /* Force QS state. */ - u8 boost; /* Subject to priority boost. */ + u8 boost ____cacheline_internodealigned_in_smp; + /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ @@ -538,7 +530,7 @@ struct rcu_state { #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ /* Values for rcu_state structure's gp_flags field. */ -#define RCU_GP_WAIT_INIT 0 /* Initial state. */ +#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 6fc4c5ff3bb5..1d61f5ba4641 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -268,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", ulong2long(rsp->completed), ulong2long(gpnum), - rsp->fqs_state, + rsp->gp_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", From c34d2f418485a5d710bc002e490148b8ea53f456 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Sep 2015 11:21:28 -0700 Subject: [PATCH 17/53] rcu: Correct comment for values of ->gp_state field This commit corrects the comment for the values of the ->gp_state field, which previously incorrectly said that these were for the ->gp_flags field. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index be6d1e8eeb79..674ebbc3e406 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -529,7 +529,7 @@ struct rcu_state { #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ -/* Values for rcu_state structure's gp_flags field. */ +/* Values for rcu_state structure's gp_state field. 
*/ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ From c3ac7cf1847a4e68c909984f60d36adef2088e35 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Sep 2015 16:29:02 -0700 Subject: [PATCH 18/53] rcu: Add rcu_pointer_handoff() This commit adds an rcu_pointer_handoff() that is intended to mark situations where a structure's protection transitions from RCU to some other mechanism (locking, reference counting, whatever). These markings should allow external tools to more easily spot bugs involving leaking pointers out of RCU read-side critical sections. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6c3ceceb6148..587eb057e2fa 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -812,6 +812,28 @@ static inline void rcu_preempt_sleep_check(void) */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) +/** + * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism + * @p: The pointer to hand off + * + * This is simply an identity function, but it documents where a pointer + * is handed off from RCU to some other synchronization mechanism, for + * example, reference counting or locking. In C11, it would map to + * kill_dependency(). It could be used as follows: + * + * rcu_read_lock(); + * p = rcu_dereference(gp); + * long_lived = is_long_lived(p); + * if (long_lived) { + * if (!atomic_inc_not_zero(p->refcnt)) + * long_lived = false; + * else + * p = rcu_pointer_handoff(p); + * } + * rcu_read_unlock(); + */ +#define rcu_pointer_handoff(p) (p) + /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * From 8db70b132dd57696cfc7560203a72e90c51bfdda Mon Sep 17 00:00:00 2001 From: Patrick Marlier Date: Fri, 11 Sep 2015 15:50:35 -0700 Subject: [PATCH 19/53] rculist: Make list_entry_rcu() use lockless_dereference() The current list_entry_rcu() implementation copies the pointer to a stack variable, then invokes rcu_dereference_raw() on it. This results in an additional store-load pair. Now, most compilers will emit normal store and load instructions, which might seem to be of negligible overhead, but this results in a load-hit-store situation that can cause surprisingly long pipeline stalls, even on modern microprocessors. The problem is that it takes time for the store to get the store buffer updated, which can delay the subsequent load, which immediately follows. This commit therefore switches to the lockless_dereference() primitive, which does not expect the __rcu annotations (that are anyway not present in the list_head structure) and which, like rcu_dereference_raw(), does not check for an enclosing RCU read-side critical section. Most importantly, it does not copy the pointer, thus avoiding the load-hit-store overhead. Signed-off-by: Patrick Marlier [ paulmck: Switched to lockless_dereference() to suppress sparse warnings. ] Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rculist.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 17c6b1f84a77..5ed540986019 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -247,10 +247,7 @@ static inline void list_splice_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ -({ \ - typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \ - container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \ -}) + container_of(lockless_dereference(ptr), type, member) /** * Where are list_empty_rcu() and list_first_entry_rcu()? From e62e3f620ba8d437f4998441fc11cf3dc9d466d1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 Sep 2015 12:23:01 -0700 Subject: [PATCH 20/53] rcu: Remove deprecated rcu_lockdep_assert() The old rcu_lockdep_assert() was retained to ease handling of incoming patches, but any use will result in deprecated warnings. However, its replacement, RCU_LOCKDEP_WARN(), is now upstream. It is therefore time to remove rcu_lockdep_assert(), which this commit does. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 587eb057e2fa..a0189ba67fde 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -537,28 +537,8 @@ static inline int rcu_read_lock_sched_held(void) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -/* Deprecate rcu_lockdep_assert(): Use RCU_LOCKDEP_WARN() instead. */ -static inline void __attribute((deprecated)) deprecate_rcu_lockdep_assert(void) -{ -} - #ifdef CONFIG_PROVE_RCU -/** - * rcu_lockdep_assert - emit lockdep splat if specified condition not met - * @c: condition to check - * @s: informative message - */ -#define rcu_lockdep_assert(c, s) \ - do { \ - static bool __section(.data.unlikely) __warned; \ - deprecate_rcu_lockdep_assert(); \ - if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ - __warned = true; \ - lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ - } \ - } while (0) - /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check @@ -596,7 +576,6 @@ static inline void rcu_preempt_sleep_check(void) #else /* #ifdef CONFIG_PROVE_RCU */ -#define rcu_lockdep_assert(c, s) deprecate_rcu_lockdep_assert() #define RCU_LOCKDEP_WARN(c, s) do { } while (0) #define rcu_sleep_check() do { } while (0) From 7f5f873c6a0772970d5fee1f364231207051ecd8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Sep 2015 08:45:22 -0700 Subject: [PATCH 21/53] rculist: Use WRITE_ONCE() when deleting from reader-visible list The various RCU list-deletion macros (list_del_rcu(), hlist_del_init_rcu(), hlist_del_rcu(), hlist_bl_del_init_rcu(), hlist_bl_del_rcu(), hlist_nulls_del_init_rcu(), and hlist_nulls_del_rcu()) do plain stores into the ->next pointer of the preceding list elemment. Unfortunately, the compiler is within its rights to (for example) use byte-at-a-time writes to update the pointer, which would fatally confuse concurrent readers. This patch therefore adds the needed WRITE_ONCE() macros. KernelThreadSanitizer (KTSAN) reported the __hlist_del() issue, which is a problem when __hlist_del() is invoked by hlist_del_rcu(). Reported-by: Dmitry Vyukov Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/list.h | 5 +++-- include/linux/list_bl.h | 5 +++-- include/linux/list_nulls.h | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 3e3e64a61002..993395a2e55c 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -87,7 +87,7 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; - prev->next = next; + WRITE_ONCE(prev->next, next); } /** @@ -615,7 +615,8 @@ static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; - *pprev = next; + + WRITE_ONCE(*pprev, next); if (next) next->pprev = pprev; } diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 2eb88556c5c5..8132214e8efd 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -93,9 +93,10 @@ static inline void __hlist_bl_del(struct hlist_bl_node *n) LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); /* pprev may be `first`, so be careful not to lose the lock bit */ - *pprev = (struct hlist_bl_node *) + WRITE_ONCE(*pprev, + (struct hlist_bl_node *) ((unsigned long)next | - ((unsigned long)*pprev & LIST_BL_LOCKMASK)); + ((unsigned long)*pprev & LIST_BL_LOCKMASK))); if (next) next->pprev = pprev; } diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index f266661d2666..444d2b1313bd 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -76,7 +76,8 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n) { struct hlist_nulls_node *next = n->next; struct hlist_nulls_node **pprev = n->pprev; - *pprev = next; + + WRITE_ONCE(*pprev, next); if (!is_a_nulls(next)) next->pprev = pprev; } From 27566139b6e2f6cfe273e8bb0e538d7616c2ea00 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 1 Aug 2015 10:45:26 -0700 Subject: [PATCH 22/53] documentation: No acquire/release for RCU readers Documentation/memory-barriers.txt calls out RCU as one of the sets of primitives associated with ACQUIRE and RELEASE. There really is an association in that rcu_assign_pointer() includes a RELEASE operation, but a quick read can convince people that rcu_read_lock() and rcu_read_unlock() have ACQUIRE and RELEASE semantics, which they do not. This commit therefore removes RCU from this list in order to avoid this confusion. Reported-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/memory-barriers.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 2ba8461b0631..d336e4d42029 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1789,7 +1789,6 @@ The Linux kernel has a number of locking constructs: (*) mutexes (*) semaphores (*) R/W semaphores - (*) RCU In all cases there are variants on "ACQUIRE" operations and "RELEASE" operations for each construct. These operations all imply certain barriers: From da873def8da5883a6c04d11f73dcd836c216cf4f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Aug 2015 11:54:04 -0700 Subject: [PATCH 23/53] documentation: Call out slow consoles as cause of stall warnings The Linux kernel outputs copious text during boot, and a slow serial console can result in stall warnings, particularly when messages are printed with interrupts disabled. 
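As a rough back-of-the-envelope check (illustrative figures, not part of the patch): at 115200 baud with 8N1 framing, a UART moves about 115200 / 10 = 11520 characters per second, so the default 21-second stall timeout corresponds to only about 11520 * 21, or roughly 240 KB, of console output. A debug-heavy boot that prints more than that from a single CPU with interrupts disabled will trip the stall detector even though nothing is actually wrong.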
This commit adds this to the list of causes of RCU CPU stall warning messages. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/stallwarn.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index efb9454875ab..0f7fb4298e7e 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -205,6 +205,13 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the behavior, you might need to replace some of the cond_resched() calls with calls to cond_resched_rcu_qs(). +o Booting Linux using a console connection that is too slow to + keep up with the boot-time console-message rate. For example, + a 115Kbaud serial console can be -way- too slow to keep up + with boot-time message rates, and will frequently result in + RCU CPU stall warning messages. Especially if you have added + debug printk()s. + o Anything that prevents RCU's grace-period kthreads from running. This can result in the "All QSes seen" console-log message. This message will include information on when the kthread last From 2c4ac34bc2d97f056ed3c43fa03c0737fae46fb6 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 11 Aug 2015 14:26:33 +0200 Subject: [PATCH 24/53] documentation: Correct doc to use rcu_dereference_protected As there is lots of misinformation and outdated information on the Internet about nearly all topics related to the kernel, I thought it would be best if I based my RCU code on the guidelines of the examples in the Documentation/ tree of the latest kernel. One thing that stuck out when reading the whatisRCU.txt document was, "interesting how we don't need any function to dereference rcu protected pointers when doing updates if a lock is held. I wonder how static analyzers will work with that." Then, a few weeks later, upon discovering sparse's __rcu support, I ran it over my code, and lo and behold, things weren't done right. Examining other RCU usages in the kernel reveal consistent usage of rcu_dereference_protected, passing in lockdep_is_held as the conditional. So, this patch adds that idiom to the documentation, so that others ahead of me won't endure the same exercise. Signed-off-by: Jason A. Donenfeld Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/whatisRCU.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index adc2184009c5..dc49c6712b17 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -364,7 +364,7 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. }; DEFINE_SPINLOCK(foo_mutex); - struct foo *gbl_foo; + struct foo __rcu *gbl_foo; /* * Create a new struct foo that is the same as the one currently @@ -386,7 +386,7 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. 
new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); spin_lock(&foo_mutex); - old_fp = gbl_foo; + old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); *new_fp = *old_fp; new_fp->a = new_a; rcu_assign_pointer(gbl_foo, new_fp); @@ -487,7 +487,7 @@ The foo_update_a() function might then be written as follows: new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); spin_lock(&foo_mutex); - old_fp = gbl_foo; + old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); *new_fp = *old_fp; new_fp->a = new_a; rcu_assign_pointer(gbl_foo, new_fp); From b672adf8cfb822781ab904343e5de0297ee117ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Aug 2015 11:46:00 -0700 Subject: [PATCH 25/53] documentation: Catch up list of torture_type options This commit rids the documentation of long-obsolete torture_type options such as rcu_sync and adds new ones such as tasks. Also add verbiage noting the fact that rcutorture now concurrently tests the asynchrounous, synchronous, expedited synchronous, and polling grace-period primitives. Reported-by: David Miller Signed-off-by: Paul E. McKenney Acked-by: David Miller Reviewed-by: Josh Triplett --- Documentation/RCU/torture.txt | 39 ++++++++++++----------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index dac02a6219b1..118e7c176ce7 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -166,40 +166,27 @@ test_no_idle_hz Whether or not to test the ability of RCU to operate in torture_type The type of RCU to test, with string values as follows: - "rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(). - - "rcu_sync": rcu_read_lock(), rcu_read_unlock(), and - synchronize_rcu(). - - "rcu_expedited": rcu_read_lock(), rcu_read_unlock(), and - synchronize_rcu_expedited(). + "rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(), + along with expedited, synchronous, and polling + variants. "rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and - call_rcu_bh(). + call_rcu_bh(), along with expedited and synchronous + variants. - "rcu_bh_sync": rcu_read_lock_bh(), rcu_read_unlock_bh(), - and synchronize_rcu_bh(). - - "rcu_bh_expedited": rcu_read_lock_bh(), rcu_read_unlock_bh(), - and synchronize_rcu_bh_expedited(). + "rcu_busted": This tests an intentionally incorrect version + of RCU in order to help test rcutorture itself. "srcu": srcu_read_lock(), srcu_read_unlock() and - call_srcu(). - - "srcu_sync": srcu_read_lock(), srcu_read_unlock() and - synchronize_srcu(). - - "srcu_expedited": srcu_read_lock(), srcu_read_unlock() and - synchronize_srcu_expedited(). + call_srcu(), along with expedited and + synchronous variants. "sched": preempt_disable(), preempt_enable(), and - call_rcu_sched(). + call_rcu_sched(), along with expedited, + synchronous, and polling variants. - "sched_sync": preempt_disable(), preempt_enable(), and - synchronize_sched(). - - "sched_expedited": preempt_disable(), preempt_enable(), and - synchronize_sched_expedited(). + "tasks": voluntary context switch and call_rcu_tasks(), + along with expedited and synchronous variants. Defaults to "rcu". From ad2ad5d31f90fc6fb269e9be244224cecfc8b400 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Sep 2015 08:18:32 -0700 Subject: [PATCH 26/53] documentation: Add lockless_dereference() The recently added lockless_dereference() macro is not present in the Documentation/ directory, so this commit fixes that. 
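For readers following along, lockless_dereference() was defined in include/linux/compiler.h at the time along roughly these lines (reproduced here only for context; the kernel tree is authoritative):

	#define lockless_dereference(p) \
	({ \
		typeof(p) _________p1 = READ_ONCE(p); \
		smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
		(_________p1); \
	})

Because it takes a plain (non-__rcu) pointer and does no lockdep checking, it works both for RCU-protected pointers and for pointers whose lifetime is managed by some other mechanism, which is what the list_entry_rcu() change earlier in this series relies on.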
Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/memory-barriers.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index d336e4d42029..8e7cf9ad3db1 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1710,6 +1710,17 @@ There are some more advanced barrier functions: operations" subsection for information on where to use these. + (*) lockless_dereference(); + This can be thought of as a pointer-fetch wrapper around the + smp_read_barrier_depends() data-dependency barrier. + + This is also similar to rcu_dereference(), but in cases where + object lifetime is handled by some mechanism other than RCU, for + example, when the objects removed only when the system goes down. + In addition, lockless_dereference() is used in some data structures + that can be used both with and without RCU. + + (*) dma_wmb(); (*) dma_rmb(); From 095777c417db142970adeb776fa0cb10810b8122 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 22 Jul 2015 14:07:27 -0700 Subject: [PATCH 27/53] locktorture: Support rtmutex torturing Real time mutexes is one of the few general primitives that we do not have in locktorture. Address this -- a few considerations: o To spice things up, enable competing thread(s) to become rt, such that we can stress different prio boosting paths in the rtmutex code. Introduce a ->task_boost callback, only used by rtmutex-torturer. Tasks will boost/deboost around every 50k (arbitrarily) lock/unlock operations. o Hold times are similar to what we have for other locks: only occasionally having longer hold times (per ~200k ops). So we roughly do two full rt boost+deboosting ops with short hold times. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/locking/locktorture.txt | 3 + kernel/locking/locktorture.c | 114 +++++++++++++++++- .../selftests/rcutorture/configs/lock/CFLIST | 3 +- .../selftests/rcutorture/configs/lock/LOCK05 | 6 + .../rcutorture/configs/lock/LOCK05.boot | 1 + 5 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK05 create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt index 619f2bb136a5..a2ef3a929bf1 100644 --- a/Documentation/locking/locktorture.txt +++ b/Documentation/locking/locktorture.txt @@ -52,6 +52,9 @@ torture_type Type of lock to torture. By default, only spinlocks will o "mutex_lock": mutex_lock() and mutex_unlock() pairs. + o "rtmutex_lock": rtmutex_lock() and rtmutex_unlock() + pairs. Kernel must have CONFIG_RT_MUTEX=y. + o "rwsem_lock": read/write down() and up() semaphore pairs. torture_runnable Start locktorture at boot time in the case where the diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 32244186f1f2..e1ca7a2fae91 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -17,12 +17,14 @@ * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney + * Authors: Paul E. McKenney + * Davidlohr Bueso * Based on kernel/rcu/torture.c. 
*/ #include #include #include +#include #include #include #include @@ -91,11 +93,13 @@ struct lock_torture_ops { void (*init)(void); int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); + void (*task_boost)(struct torture_random_state *trsp); void (*writeunlock)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *trsp); void (*readunlock)(void); - unsigned long flags; + + unsigned long flags; /* for irq spinlocks */ const char *name; }; @@ -139,9 +143,15 @@ static void torture_lock_busted_write_unlock(void) /* BUGGY, do not use in real life!!! */ } +static void torture_boost_dummy(struct torture_random_state *trsp) +{ + /* Only rtmutexes care about priority */ +} + static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -185,6 +195,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -211,6 +222,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -275,6 +287,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -315,6 +328,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -354,6 +368,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex) static struct lock_torture_ops mutex_lock_ops = { .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -361,6 +376,90 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#ifdef CONFIG_RT_MUTEXES +static DEFINE_RT_MUTEX(torture_rtmutex); + +static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +{ + rt_mutex_lock(&torture_rtmutex); + return 0; +} + +static void torture_rtmutex_boost(struct torture_random_state *trsp) +{ + int policy; + struct sched_param param; + const unsigned int factor = 50000; /* yes, quite arbitrary */ + + if (!rt_task(current)) { + /* + * (1) Boost priority once every ~50k operations. When the + * task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. 
+ */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + policy = SCHED_FIFO; + param.sched_priority = MAX_RT_PRIO - 1; + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another ~500k operations, + * then restored back to its original prio, and so forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. + */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + policy = SCHED_NORMAL; + param.sched_priority = 0; + } else /* common case, do nothing */ + return; + } + + sched_setscheduler_nocheck(current, policy, ¶m); +} + +static void torture_rtmutex_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* + * We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +{ + rt_mutex_unlock(&torture_rtmutex); +} + +static struct lock_torture_ops rtmutex_lock_ops = { + .writelock = torture_rtmutex_lock, + .write_delay = torture_rtmutex_delay, + .task_boost = torture_rtmutex_boost, + .writeunlock = torture_rtmutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "rtmutex_lock" +}; +#endif + static DECLARE_RWSEM(torture_rwsem); static int torture_rwsem_down_write(void) __acquires(torture_rwsem) { @@ -419,6 +518,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -442,6 +542,7 @@ static int lock_torture_writer(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + cxt.cur_ops->task_boost(&rand); cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; @@ -456,6 +557,8 @@ static int lock_torture_writer(void *arg) stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); + + cxt.cur_ops->task_boost(NULL); /* reset prio */ torture_kthread_stopping("lock_torture_writer"); return 0; } @@ -642,6 +745,9 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, +#ifdef CONFIG_RT_MUTEXES + &rtmutex_lock_ops, +#endif &rwsem_lock_ops, }; @@ -676,6 +782,10 @@ static int __init lock_torture_init(void) if (strncmp(torture_type, "mutex", 5) == 0) cxt.debug_lock = true; #endif +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (strncmp(torture_type, "rtmutex", 7) == 0) + cxt.debug_lock = true; +#endif #ifdef CONFIG_DEBUG_SPINLOCK if ((strncmp(torture_type, "spin", 4) == 0) || (strncmp(torture_type, "rw_lock", 7) == 0)) diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST index 6910b7370761..6ed32794eaa1 100644 --- 
a/tools/testing/selftests/rcutorture/configs/lock/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST @@ -1,4 +1,5 @@ LOCK01 LOCK02 LOCK03 -LOCK04 \ No newline at end of file +LOCK04 +LOCK05 diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK05 b/tools/testing/selftests/rcutorture/configs/lock/LOCK05 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK05 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot new file mode 100644 index 000000000000..8ac37307c987 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot @@ -0,0 +1 @@ +locktorture.torture_type=rtmutex_lock From 302707fd7cd341a23dbc448a335d432ad0069c20 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 31 Aug 2015 20:21:59 -0700 Subject: [PATCH 28/53] locking/percpu-rwsem: Export symbols for locktorture This commit exports percpu_down_read(), percpu_down_write(), __percpu_init_rwsem(), percpu_up_read(), and percpu_up_write() to allow locktorture to test them when built as a module. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f32567254867..e2621fbbcbd1 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -22,6 +22,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, init_waitqueue_head(&brw->write_waitq); return 0; } +EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) { @@ -87,6 +88,7 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); } +EXPORT_SYMBOL_GPL(percpu_down_read); int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) { @@ -112,6 +114,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw) if (atomic_dec_and_test(&brw->slow_read_ctr)) wake_up_all(&brw->write_waitq); } +EXPORT_SYMBOL_GPL(percpu_up_read); static int clear_fast_ctr(struct percpu_rw_semaphore *brw) { @@ -163,6 +166,7 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) /* wait for all readers to complete their percpu_up_read() */ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); } +EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *brw) { @@ -176,3 +180,4 @@ void percpu_up_write(struct percpu_rw_semaphore *brw) /* the last writer unblocks update_fast_ctr() */ atomic_dec(&brw->write_ctr); } +EXPORT_SYMBOL_GPL(percpu_up_write); From 617783dd99704331e22636388c932450e02ee636 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 29 Aug 2015 14:46:29 -0700 Subject: [PATCH 29/53] locktorture: Add torture tests for percpu_rwsem This commit adds percpu_rwsem tests based on the earlier rwsem tests. Signed-off-by: Paul E. 
McKenney Cc: Oleg Nesterov Cc: Davidlohr Bueso Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 44 +++++++++++++++++++ .../selftests/rcutorture/configs/lock/CFLIST | 1 + .../selftests/rcutorture/configs/lock/LOCK06 | 6 +++ .../rcutorture/configs/lock/LOCK06.boot | 1 + 4 files changed, 52 insertions(+) create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK06 create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index e1ca7a2fae91..8545e12598ce 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -36,6 +36,7 @@ #include #include #include +#include #include MODULE_LICENSE("GPL"); @@ -526,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = { .name = "rwsem_lock" }; +#include +static struct percpu_rw_semaphore pcpu_rwsem; + +void torture_percpu_rwsem_init(void) +{ + BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); +} + +static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +{ + percpu_down_write(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +{ + percpu_up_write(&pcpu_rwsem); +} + +static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +{ + percpu_down_read(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +{ + percpu_up_read(&pcpu_rwsem); +} + +static struct lock_torture_ops percpu_rwsem_lock_ops = { + .init = torture_percpu_rwsem_init, + .writelock = torture_percpu_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_percpu_rwsem_up_write, + .readlock = torture_percpu_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_percpu_rwsem_up_read, + .name = "percpu_rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. @@ -749,6 +792,7 @@ static int __init lock_torture_init(void) &rtmutex_lock_ops, #endif &rwsem_lock_ops, + &percpu_rwsem_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST index 6ed32794eaa1..b9611c523723 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST @@ -3,3 +3,4 @@ LOCK02 LOCK03 LOCK04 LOCK05 +LOCK06 diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK06 b/tools/testing/selftests/rcutorture/configs/lock/LOCK06 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK06 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot new file mode 100644 index 000000000000..f92219cd4ad9 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot @@ -0,0 +1 @@ +locktorture.torture_type=percpu_rwsem_lock From 3836f5337f74fedc15981688c3c31dbf4293ae84 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 30 Aug 2015 03:29:58 -0700 Subject: [PATCH 30/53] torture: Consolidate cond_resched_rcu_qs() into stutter_wait() This commit moves cond_resched_rcu_qs() into stutter_wait(), saving a line and also avoiding RCU CPU stall warnings from all torture loops containing a stutter_wait(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 2 -- kernel/torture.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 77192953dee5..8a65b7d471a0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { diff --git a/kernel/torture.c b/kernel/torture.c index 3e4840633d3e..44aa462d033f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -523,6 +523,7 @@ static int stutter; */ void stutter_wait(const char *title) { + cond_resched_rcu_qs(); while (READ_ONCE(stutter_pause_test) || (torture_runnable && !READ_ONCE(*torture_runnable))) { if (stutter_pause_test) From cc44ca848f5e517aeca9f5eabbe13609a3f71450 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:44 +0200 Subject: [PATCH 31/53] rcu: Create rcu_sync infrastructure The rcu_sync infrastructure can be thought of as infrastructure to be used to implement reader-writer primitives having extremely lightweight readers during times when there are no writers. The first use is in the percpu_rwsem used by the VFS subsystem. This infrastructure is functionally equivalent to struct rcu_sync_struct { atomic_t counter; }; /* Check possibility of fast-path read-side operations. */ static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss) { return atomic_read(&rss->counter) == 0; } /* Tell readers to use slowpaths. */ static inline void rcu_sync_enter(struct rcu_sync_struct *rss) { atomic_inc(&rss->counter); synchronize_sched(); } /* Allow readers to once again use fastpaths. */ static inline void rcu_sync_exit(struct rcu_sync_struct *rss) { synchronize_sched(); atomic_dec(&rss->counter); } The main difference is that it records the state and only calls synchronize_sched() if required. At least some of the calls to synchronize_sched() will be optimized away when rcu_sync_enter() and rcu_sync_exit() are invoked repeatedly in quick succession. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcu_sync.h | 94 +++++++++++++++++++++ kernel/rcu/Makefile | 2 +- kernel/rcu/sync.c | 175 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 include/linux/rcu_sync.h create mode 100644 kernel/rcu/sync.c diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h new file mode 100644 index 000000000000..cb044df2e21c --- /dev/null +++ b/include/linux/rcu_sync.h @@ -0,0 +1,94 @@ +/* + * RCU-based infrastructure for lightweight reader-writer locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (c) 2015, Red Hat, Inc. + * + * Author: Oleg Nesterov + */ + +#ifndef _LINUX_RCU_SYNC_H_ +#define _LINUX_RCU_SYNC_H_ + +#include +#include + +/* Structure to mediate between updaters and fastpath-using readers. */ +struct rcu_sync { + int gp_state; + int gp_count; + wait_queue_head_t gp_wait; + + int cb_state; + struct rcu_head cb_head; + + void (*sync)(void); + void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); +}; + +#define ___RCU_SYNC_INIT(name) \ + .gp_state = 0, \ + .gp_count = 0, \ + .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ + .cb_state = 0 + +#define __RCU_SCHED_SYNC_INIT(name) { \ + ___RCU_SYNC_INIT(name), \ + .sync = synchronize_sched, \ + .call = call_rcu_sched, \ +} + +#define __RCU_BH_SYNC_INIT(name) { \ + ___RCU_SYNC_INIT(name), \ + .sync = synchronize_rcu_bh, \ + .call = call_rcu_bh, \ +} + +#define __RCU_SYNC_INIT(name) { \ + ___RCU_SYNC_INIT(name), \ + .sync = synchronize_rcu, \ + .call = call_rcu, \ +} + +#define DEFINE_RCU_SCHED_SYNC(name) \ + struct rcu_sync name = __RCU_SCHED_SYNC_INIT(name) + +#define DEFINE_RCU_BH_SYNC(name) \ + struct rcu_sync name = __RCU_BH_SYNC_INIT(name) + +#define DEFINE_RCU_SYNC(name) \ + struct rcu_sync name = __RCU_SYNC_INIT(name) + +/** + * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * Returns true if readers are permitted to use their fastpaths. + * Must be invoked within an RCU read-side critical section whose + * flavor matches that of the rcu_sync struture. 
+ */ +static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) +{ + return !rsp->gp_state; /* GP_IDLE */ +} + +enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC }; + +extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_enter(struct rcu_sync *); +extern void rcu_sync_exit(struct rcu_sync *); + +#endif /* _LINUX_RCU_SYNC_H_ */ diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 50a808424b06..61a16569ffbf 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,4 @@ -obj-y += update.o +obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c new file mode 100644 index 000000000000..0a11df43be23 --- /dev/null +++ b/kernel/rcu/sync.c @@ -0,0 +1,175 @@ +/* + * RCU-based infrastructure for lightweight reader-writer locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (c) 2015, Red Hat, Inc. + * + * Author: Oleg Nesterov + */ + +#include +#include + +enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; +enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; + +#define rss_lock gp_wait.lock + +/** + * rcu_sync_init() - Initialize an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be initialized + * @type: Flavor of RCU with which to synchronize rcu_sync structure + */ +void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +{ + memset(rsp, 0, sizeof(*rsp)); + init_waitqueue_head(&rsp->gp_wait); + + switch (type) { + case RCU_SYNC: + rsp->sync = synchronize_rcu; + rsp->call = call_rcu; + break; + + case RCU_SCHED_SYNC: + rsp->sync = synchronize_sched; + rsp->call = call_rcu_sched; + break; + + case RCU_BH_SYNC: + rsp->sync = synchronize_rcu_bh; + rsp->call = call_rcu_bh; + break; + } +} + +/** + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). 
+ */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + bool need_wait, need_sync; + + spin_lock_irq(&rsp->rss_lock); + need_wait = rsp->gp_count++; + need_sync = rsp->gp_state == GP_IDLE; + if (need_sync) + rsp->gp_state = GP_PENDING; + spin_unlock_irq(&rsp->rss_lock); + + BUG_ON(need_wait && need_sync); + + if (need_sync) { + rsp->sync(); + rsp->gp_state = GP_PASSED; + wake_up_all(&rsp->gp_wait); + } else if (need_wait) { + wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); + } else { + /* + * Possible when there's a pending CB from a rcu_sync_exit(). + * Nobody has yet been allowed the 'fast' path and thus we can + * avoid doing any sync(). The callback will get 'dropped'. + */ + BUG_ON(rsp->gp_state != GP_PASSED); + } +} + +/** + * rcu_sync_func() - Callback function managing reader access to fastpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is passed to one of the call_rcu() functions by + * rcu_sync_exit(), so that it is invoked after a grace period following + * that invocation of rcu_sync_exit(). It takes action based on events that + * have taken place in the meantime, so that closely spaced rcu_sync_enter() + * and rcu_sync_exit() pairs need not wait for a grace period. + * + * If another rcu_sync_enter() is invoked before the grace period + * ended, reset state to allow the next rcu_sync_exit() to let the + * readers back onto their fastpaths (after a grace period). If both + * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked + * before the grace period ended, re-invoke call_rcu() on behalf of that + * rcu_sync_exit(). Otherwise, set all state back to idle so that readers + * can again use their fastpaths. + */ +static void rcu_sync_func(struct rcu_head *rcu) +{ + struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + unsigned long flags; + + BUG_ON(rsp->gp_state != GP_PASSED); + BUG_ON(rsp->cb_state == CB_IDLE); + + spin_lock_irqsave(&rsp->rss_lock, flags); + if (rsp->gp_count) { + /* + * A new rcu_sync_enter() has happened; drop the callback. + */ + rsp->cb_state = CB_IDLE; + } else if (rsp->cb_state == CB_REPLAY) { + /* + * A new rcu_sync_exit() has happened; requeue the callback + * to catch a later GP. + */ + rsp->cb_state = CB_PENDING; + rsp->call(&rsp->cb_head, rcu_sync_func); + } else { + /* + * We're at least a GP after rcu_sync_exit(); everybody will now + * have observed the write side critical section. Let 'em rip! + */ + rsp->cb_state = CB_IDLE; + rsp->gp_state = GP_IDLE; + } + spin_unlock_irqrestore(&rsp->rss_lock, flags); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast path after grace period + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who have completed, and can therefore + * now allow readers to make use of their fastpaths after a grace period + * has elapsed. After this grace period has completed, all subsequent + * calls to rcu_sync_is_idle() will return true, which tells readers that + * they can once again use their fastpaths.
+ */ +void rcu_sync_exit(struct rcu_sync *rsp) +{ + spin_lock_irq(&rsp->rss_lock); + if (!--rsp->gp_count) { + if (rsp->cb_state == CB_IDLE) { + rsp->cb_state = CB_PENDING; + rsp->call(&rsp->cb_head, rcu_sync_func); + } else if (rsp->cb_state == CB_PENDING) { + rsp->cb_state = CB_REPLAY; + } + } + spin_unlock_irq(&rsp->rss_lock); +} From 82e8c565be8a72957570d7da8dd9b441db7bb648 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:47 +0200 Subject: [PATCH 32/53] rcu_sync: Simplify rcu_sync using new rcu_sync_ops structure This commit adds the new struct rcu_sync_ops which holds sync/call methods, and turns the function pointers in rcu_sync_struct into an array of struct rcu_sync_ops. This simplifies the "init" helpers by collapsing a switch statement and explicit multiple definitions into a simple assignment and a helper macro, respectively. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcu_sync.h | 60 +++++++++++++++------------------------- kernel/rcu/sync.c | 42 ++++++++++++++-------------- 2 files changed, 45 insertions(+), 57 deletions(-) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index cb044df2e21c..c6d2272c4459 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -26,6 +26,8 @@ #include #include +enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC }; + /* Structure to mediate between updaters and fastpath-using readers. */ struct rcu_sync { int gp_state; @@ -35,43 +37,9 @@ struct rcu_sync { int cb_state; struct rcu_head cb_head; - void (*sync)(void); - void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); + enum rcu_sync_type gp_type; }; -#define ___RCU_SYNC_INIT(name) \ - .gp_state = 0, \ - .gp_count = 0, \ - .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ - .cb_state = 0 - -#define __RCU_SCHED_SYNC_INIT(name) { \ - ___RCU_SYNC_INIT(name), \ - .sync = synchronize_sched, \ - .call = call_rcu_sched, \ -} - -#define __RCU_BH_SYNC_INIT(name) { \ - ___RCU_SYNC_INIT(name), \ - .sync = synchronize_rcu_bh, \ - .call = call_rcu_bh, \ -} - -#define __RCU_SYNC_INIT(name) { \ - ___RCU_SYNC_INIT(name), \ - .sync = synchronize_rcu, \ - .call = call_rcu, \ -} - -#define DEFINE_RCU_SCHED_SYNC(name) \ - struct rcu_sync name = __RCU_SCHED_SYNC_INIT(name) - -#define DEFINE_RCU_BH_SYNC(name) \ - struct rcu_sync name = __RCU_BH_SYNC_INIT(name) - -#define DEFINE_RCU_SYNC(name) \ - struct rcu_sync name = __RCU_SYNC_INIT(name) - /** * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? 
* @rsp: Pointer to rcu_sync structure to use for synchronization @@ -85,10 +53,28 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) return !rsp->gp_state; /* GP_IDLE */ } -enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC }; - extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); +#define __RCU_SYNC_INITIALIZER(name, type) { \ + .gp_state = 0, \ + .gp_count = 0, \ + .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ + .cb_state = 0, \ + .gp_type = type, \ + } + +#define __DEFINE_RCU_SYNC(name, type) \ + struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type) + +#define DEFINE_RCU_SYNC(name) \ + __DEFINE_RCU_SYNC(name, RCU_SYNC) + +#define DEFINE_RCU_SCHED_SYNC(name) \ + __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC) + +#define DEFINE_RCU_BH_SYNC(name) \ + __DEFINE_RCU_SYNC(name, RCU_BH_SYNC) + #endif /* _LINUX_RCU_SYNC_H_ */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 0a11df43be23..5a9aa4c394f1 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -23,6 +23,24 @@ #include #include +static const struct { + void (*sync)(void); + void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); +} gp_ops[] = { + [RCU_SYNC] = { + .sync = synchronize_rcu, + .call = call_rcu, + }, + [RCU_SCHED_SYNC] = { + .sync = synchronize_sched, + .call = call_rcu_sched, + }, + [RCU_BH_SYNC] = { + .sync = synchronize_rcu_bh, + .call = call_rcu_bh, + }, +}; + enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; @@ -37,23 +55,7 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) { memset(rsp, 0, sizeof(*rsp)); init_waitqueue_head(&rsp->gp_wait); - - switch (type) { - case RCU_SYNC: - rsp->sync = synchronize_rcu; - rsp->call = call_rcu; - break; - - case RCU_SCHED_SYNC: - rsp->sync = synchronize_sched; - rsp->call = call_rcu_sched; - break; - - case RCU_BH_SYNC: - rsp->sync = synchronize_rcu_bh; - rsp->call = call_rcu_bh; - break; - } + rsp->gp_type = type; } /** @@ -85,7 +87,7 @@ void rcu_sync_enter(struct rcu_sync *rsp) BUG_ON(need_wait && need_sync); if (need_sync) { - rsp->sync(); + gp_ops[rsp->gp_type].sync(); rsp->gp_state = GP_PASSED; wake_up_all(&rsp->gp_wait); } else if (need_wait) { @@ -138,7 +140,7 @@ static void rcu_sync_func(struct rcu_head *rcu) * to catch a later GP. */ rsp->cb_state = CB_PENDING; - rsp->call(&rsp->cb_head, rcu_sync_func); + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); } else { /* * We're at least a GP after rcu_sync_exit(); eveybody will now @@ -166,7 +168,7 @@ void rcu_sync_exit(struct rcu_sync *rsp) if (!--rsp->gp_count) { if (rsp->cb_state == CB_IDLE) { rsp->cb_state = CB_PENDING; - rsp->call(&rsp->cb_head, rcu_sync_func); + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); } else if (rsp->cb_state == CB_PENDING) { rsp->cb_state = CB_REPLAY; } From 3a518b76af7bb411efe6dd090fbf098e29accb2e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:50 +0200 Subject: [PATCH 33/53] rcu_sync: Add CONFIG_PROVE_RCU checks This commit validates that the caller of rcu_sync_is_idle() holds the corresponding type of RCU read-side lock, but only in kernels built with CONFIG_PROVE_RCU=y. This validation is carried out via a new rcu_sync_ops->held() method that is checked within rcu_sync_is_idle(). Note that although this does add code to the fast path, it only does so in kernels built with CONFIG_PROVE_RCU=y. Suggested-by: "Paul E. 
McKenney" Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcu_sync.h | 6 ++++++ kernel/rcu/sync.c | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index c6d2272c4459..1f2d4fc30b04 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -40,6 +40,8 @@ struct rcu_sync { enum rcu_sync_type gp_type; }; +extern bool __rcu_sync_is_idle(struct rcu_sync *); + /** * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? * @rsp: Pointer to rcu_sync structure to use for synchronization @@ -50,7 +52,11 @@ struct rcu_sync { */ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) { +#ifdef CONFIG_PROVE_RCU + return __rcu_sync_is_idle(rsp); +#else return !rsp->gp_state; /* GP_IDLE */ +#endif } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 5a9aa4c394f1..01c9807a7f73 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -23,21 +23,33 @@ #include #include +#ifdef CONFIG_PROVE_RCU +#define __INIT_HELD(func) .held = func, +#else +#define __INIT_HELD(func) +#endif + static const struct { void (*sync)(void); void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); +#ifdef CONFIG_PROVE_RCU + int (*held)(void); +#endif } gp_ops[] = { [RCU_SYNC] = { .sync = synchronize_rcu, .call = call_rcu, + __INIT_HELD(rcu_read_lock_held) }, [RCU_SCHED_SYNC] = { .sync = synchronize_sched, .call = call_rcu_sched, + __INIT_HELD(rcu_read_lock_sched_held) }, [RCU_BH_SYNC] = { .sync = synchronize_rcu_bh, .call = call_rcu_bh, + __INIT_HELD(rcu_read_lock_bh_held) }, }; @@ -46,6 +58,14 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; #define rss_lock gp_wait.lock +#ifdef CONFIG_PROVE_RCU +bool __rcu_sync_is_idle(struct rcu_sync *rsp) +{ + WARN_ON(!gp_ops[rsp->gp_type].held()); + return rsp->gp_state == GP_IDLE; +} +#endif + /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized From 07899a6e5f56136028c44a57ad0451e797365ac3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:52 +0200 Subject: [PATCH 34/53] rcu_sync: Introduce rcu_sync_dtor() This commit allows rcu_sync structures to be safely deallocated, The trick is to add a new ->wait field to the gp_ops array. This field is a pointer to the rcu_barrier() function corresponding to the flavor of RCU in question. This allows a new rcu_sync_dtor() to wait for any outstanding callbacks before freeing the rcu_sync structure. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcu_sync.h | 1 + kernel/rcu/sync.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 1f2d4fc30b04..8069d6468bc4 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -62,6 +62,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); +extern void rcu_sync_dtor(struct rcu_sync *); #define __RCU_SYNC_INITIALIZER(name, type) { \ .gp_state = 0, \ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 01c9807a7f73..1e353f0a2b66 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -32,6 +32,7 @@ static const struct { void (*sync)(void); void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); + void (*wait)(void); #ifdef CONFIG_PROVE_RCU int (*held)(void); #endif @@ -39,16 +40,19 @@ static const struct { [RCU_SYNC] = { .sync = synchronize_rcu, .call = call_rcu, + .wait = rcu_barrier, __INIT_HELD(rcu_read_lock_held) }, [RCU_SCHED_SYNC] = { .sync = synchronize_sched, .call = call_rcu_sched, + .wait = rcu_barrier_sched, __INIT_HELD(rcu_read_lock_sched_held) }, [RCU_BH_SYNC] = { .sync = synchronize_rcu_bh, .call = call_rcu_bh, + .wait = rcu_barrier_bh, __INIT_HELD(rcu_read_lock_bh_held) }, }; @@ -195,3 +199,25 @@ void rcu_sync_exit(struct rcu_sync *rsp) } spin_unlock_irq(&rsp->rss_lock); } + +/** + * rcu_sync_dtor() - Clean up an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be cleaned up + */ +void rcu_sync_dtor(struct rcu_sync *rsp) +{ + int cb_state; + + BUG_ON(rsp->gp_count); + + spin_lock_irq(&rsp->rss_lock); + if (rsp->cb_state == CB_REPLAY) + rsp->cb_state = CB_PENDING; + cb_state = rsp->cb_state; + spin_unlock_irq(&rsp->rss_lock); + + if (cb_state != CB_IDLE) { + gp_ops[rsp->gp_type].wait(); + BUG_ON(rsp->cb_state != CB_IDLE); + } +} From 95b19f684c61ffc9b039e02c5d1113c2d8cd7105 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:55 +0200 Subject: [PATCH 35/53] locking/percpu-rwsem: Make percpu_free_rwsem() after kzalloc() safe This is the temporary ugly hack which will be reverted later. We only need it to ensure that the next patch will not break "change sb_writers to use percpu_rw_semaphore" patches routed via the VFS tree. The alloc_super()->destroy_super() error path assumes that it is safe to call percpu_free_rwsem() after kzalloc() without percpu_init_rwsem(), so let's not disappoint it. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index e2621fbbcbd1..9529a30ec57b 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -26,6 +26,13 @@ EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) { + /* + * XXX: temporary kludge. The error path in alloc_super() + * assumes that percpu_free_rwsem() is safe after kzalloc(). 
+ */ + if (!brw->fast_read_ctr) + return; + free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } From 001dac627ff37433d5528ffb0d897cd19c2b1e43 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:42:57 +0200 Subject: [PATCH 36/53] locking/percpu-rwsem: Make use of the rcu_sync infrastructure Currently down_write/up_write calls synchronize_sched_expedited() twice, which is evil. Change this code to rely on rcu-sync primitives. This avoids the _expedited "big hammer", and this can be faster in the contended case or even in the case when a single thread does down_write/up_write in a loop. Of course, a single down_write() will take more time, but otoh it will be much more friendly to the whole system. To simplify the review this patch doesn't update the comments, fixed by the next change. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/percpu-rwsem.h | 3 ++- kernel/locking/percpu-rwsem.c | 18 +++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 834c4e52cb2d..c2fa3ecb0dce 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -5,11 +5,12 @@ #include #include #include +#include #include struct percpu_rw_semaphore { + struct rcu_sync rss; unsigned int __percpu *fast_read_ctr; - atomic_t write_ctr; struct rw_semaphore rw_sem; atomic_t slow_read_ctr; wait_queue_head_t write_waitq; diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 9529a30ec57b..183a71151ac0 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -17,7 +17,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); + rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); atomic_set(&brw->slow_read_ctr, 0); init_waitqueue_head(&brw->write_waitq); return 0; @@ -33,6 +33,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) if (!brw->fast_read_ctr) return; + rcu_sync_dtor(&brw->rss); free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } @@ -62,13 +63,12 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { - bool success = false; + bool success; preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { + success = rcu_sync_is_idle(&brw->rss); + if (likely(success)) __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } preempt_enable(); return success; @@ -149,8 +149,6 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) */ void percpu_down_write(struct percpu_rw_semaphore *brw) { - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); /* * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read * so that update_fast_ctr() can't succeed. @@ -162,7 +160,7 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) * fast-path, it executes a full memory barrier before we return. * See R_W case in the comment above update_fast_ctr(). 
*/ - synchronize_sched_expedited(); + rcu_sync_enter(&brw->rss); /* exclude other writers, and block the new readers completely */ down_write(&brw->rw_sem); @@ -183,8 +181,6 @@ void percpu_up_write(struct percpu_rw_semaphore *brw) * Insert the barrier before the next fast-path in down_read, * see W_R case in the comment above update_fast_ctr(). */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); + rcu_sync_exit(&brw->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); From f324a76324c97e81a6ba66a8efac20cdbffd759e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:43:00 +0200 Subject: [PATCH 37/53] locking/percpu-rwsem: Fix the comments outdated by rcu_sync Update the comments broken by the previous change. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 50 ++++++++--------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 183a71151ac0..02a726dd9adc 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -39,27 +39,12 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) } /* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. - * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer + * This is the fast-path for down_read/up_read. If it succeeds we rely + * on the barriers provided by rcu_sync_enter/exit; see the comments in + * percpu_down_write() and percpu_up_write(). * * If this helper fails the callers rely on the normal rw_semaphore and * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { @@ -136,29 +121,15 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) return sum; } -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ void percpu_down_write(struct percpu_rw_semaphore *brw) { /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. + * Make rcu_sync_is_idle() == F and thus disable the fast-path in + * percpu_down_read() and percpu_up_read(), and wait for gp pass. * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). - * - * 3. 
Ensures that if any reader has exited its critical section via - fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). + * The latter synchronises us with the preceding readers which used + * the fast-path, so we can not miss the result of __this_cpu_add() + * or anything else inside their critical sections. */ rcu_sync_enter(&brw->rss); @@ -178,8 +149,9 @@ void percpu_up_write(struct percpu_rw_semaphore *brw) /* release the lock, but the readers can't use the fast-path */ up_write(&brw->rw_sem); /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). + * Enable the fast-path in percpu_down_read() and percpu_up_read() + * but only after another gp pass; this adds the necessary barrier + * to ensure the reader can't miss the changes done by us. */ rcu_sync_exit(&brw->rss); } From cc5f730b41506d37a5c2826b2e801d0a59853d11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Aug 2015 19:43:03 +0200 Subject: [PATCH 38/53] locking/percpu-rwsem: Clean up the lockdep annotations in percpu_down_read() Based on Peter Zijlstra's earlier patch. Change percpu_down_read() to use __down_read(), this way we can do rwsem_acquire_read() unconditionally at the start to make this code more symmetric and clean. Originally-From: Peter Zijlstra Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/percpu-rwsem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 02a726dd9adc..f231e0bb311c 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -70,14 +70,14 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) void percpu_down_read(struct percpu_rw_semaphore *brw) { might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - return; - } + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); - down_read(&brw->rw_sem); + if (likely(update_fast_ctr(brw, +1))) + return; + + /* Avoid rwsem_acquire_read() and rwsem_release() */ + __down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ + __up_read(&brw->rw_sem); } EXPORT_SYMBOL_GPL(percpu_down_read); From 4bace7344d6dbd7a1b0b801abf24ea9878064317 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 Sep 2015 17:59:18 +0200 Subject: [PATCH 39/53] rcu_sync: Cleanup the CONFIG_PROVE_RCU checks 1. Rename __rcu_sync_is_idle() to rcu_sync_lockdep_assert() and change it to use rcu_lockdep_assert(). 2. Change rcu_sync_is_idle() to return rsp->gp_state == GP_IDLE unconditionally; this way we can remove the same check from rcu_sync_lockdep_assert() and clearly isolate the debugging code. Note: rcu_sync_enter()->wait_event(gp_state == GP_PASSED) needs another CONFIG_PROVE_RCU check, the same as is done in ->sync(); but this needs some simple preparations in the core RCU code to avoid the code duplication. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- include/linux/rcu_sync.h | 7 +++---- kernel/rcu/sync.c | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 8069d6468bc4..a63a33e6196e 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -40,7 +40,7 @@ struct rcu_sync { enum rcu_sync_type gp_type; }; -extern bool __rcu_sync_is_idle(struct rcu_sync *); +extern void rcu_sync_lockdep_assert(struct rcu_sync *); /** * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? @@ -53,10 +53,9 @@ extern bool __rcu_sync_is_idle(struct rcu_sync *); static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) { #ifdef CONFIG_PROVE_RCU - return __rcu_sync_is_idle(rsp); -#else - return !rsp->gp_state; /* GP_IDLE */ + rcu_sync_lockdep_assert(rsp); #endif + return !rsp->gp_state; /* GP_IDLE */ } extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 1e353f0a2b66..be922c9f3d37 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -63,10 +63,10 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; #define rss_lock gp_wait.lock #ifdef CONFIG_PROVE_RCU -bool __rcu_sync_is_idle(struct rcu_sync *rsp) +void rcu_sync_lockdep_assert(struct rcu_sync *rsp) { - WARN_ON(!gp_ops[rsp->gp_type].held()); - return rsp->gp_state == GP_IDLE; + RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), + "suspicious rcu_sync_is_idle() usage"); } #endif From 889d487a26de4bcd1a0a668754bcbce893969edf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Aug 2015 11:37:58 -0700 Subject: [PATCH 40/53] rcutorture: Fix module unwind when bad torture_type specified The rcutorture module has a list of torture types, and specifying a type not on this list is supposed to cleanly fail the module load. Unfortunately, the "fail" happens without the "cleanly". This commit therefore adds the needed clean-up after an incorrect torture_type. Reported-by: David Miller Signed-off-by: Paul E. McKenney Acked-by: David Miller Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 77192953dee5..b74b56474e17 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1742,15 +1742,15 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cur_ops->init(); if (nreaders >= 0) { nrealreaders = nreaders; From 4f441a258f7badf752b3d9b04b675869ca4e751c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Aug 2015 13:13:51 -0700 Subject: [PATCH 41/53] rcutorture: Fix unused-function warning for torturing_tasks() The torturing_tasks() function is used only in kernels built with CONFIG_PROVE_RCU=y, so the second definition can result in unused-function compiler warnings. This commit adds __maybe_unused to suppress these warnings. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b74b56474e17..009b62c76dfa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void) #define RCUTORTURE_TASKS_OPS -static bool torturing_tasks(void) +static bool __maybe_unused torturing_tasks(void) { return false; } From a8c06024d0b557b465ad40b87ef0b51a428f99fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Aug 2015 17:12:09 -0700 Subject: [PATCH 42/53] torture: Forgive non-plural arguments This commit allows --bootarg instead of --bootargs, --config instead of --configs, and --qemu-arg instead of --qemu-args. For those cases where a native English speaker might auto-correct the argument to be incorrect. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/bin/kvm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index fbe2dbff1e21..f6483609ebc2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -75,7 +75,7 @@ usage () { while test $# -gt 0 do case "$1" in - --bootargs) + --bootargs|--bootarg) checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--' TORTURE_BOOTARGS="$2" shift @@ -88,7 +88,7 @@ do --buildonly) TORTURE_BUILDONLY=1 ;; - --configs) + --configs|--config) checkarg --configs "(list of config files)" "$#" "$2" '^[^/]*$' '^--' configs="$2" shift @@ -134,7 +134,7 @@ do --no-initrd) TORTURE_INITRD=""; export TORTURE_INITRD ;; - --qemu-args) + --qemu-args|--qemu-arg) checkarg --qemu-args "-qemu args" $# "$2" '^-' '^error' TORTURE_QEMU_ARG="$2" shift From a36a99618b1adb2d6ca0b7e08e3a656a04e477fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Aug 2015 20:01:48 -0700 Subject: [PATCH 43/53] locktorture: Fix module unwind when bad torture_type specified The locktorture module has a list of torture types, and specifying a type not on this list is supposed to cleanly fail the module load. Unfortunately, the "fail" happens without the "cleanly". This commit therefore adds the needed clean-up after an incorrect torture_type. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 32244186f1f2..820852f69858 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -661,11 +661,11 @@ static int __init lock_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cxt.cur_ops->init) - cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cxt.cur_ops->init(); if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; From 6587a23b6b9bdb47205ec96c703e5bf8a2d39701 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 6 Aug 2015 16:50:39 -0700 Subject: [PATCH 44/53] rcu: Switch synchronize_sched_expedited() to IPI This commit switches synchronize_sched_expedited() from stop_one_cpu_nowait() to smp_call_function_single(), thus moving from an IPI and a pair of context switches to an IPI and a single pass through the scheduler. Of course, if the scheduler actually does decide to switch to a different task, there will still be a pair of context switches, but there would likely have been a pair of context switches anyway, just a bit later. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 32 ++++++++++++++++++++------------ kernel/rcu/tree.h | 3 --- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3e2875b38eae..a42168911941 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -161,6 +161,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +static void rcu_report_exp_rdp(struct rcu_state *rsp, + struct rcu_data *rdp, bool wake); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -250,6 +252,12 @@ void rcu_sched_qs(void) __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), + true); + } } } @@ -3555,8 +3563,8 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified rcu_data (CPU). * Caller must hold the root rcu_node's exp_funnel_mutex. */ -static void __maybe_unused rcu_report_exp_rdp(struct rcu_state *rsp, - struct rcu_data *rdp, bool wake) +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) { rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); } @@ -3637,14 +3645,10 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static int synchronize_sched_expedited_cpu_stop(void *data) +static void synchronize_sched_expedited_cpu_stop(void *data) { - struct rcu_data *rdp = data; - struct rcu_state *rsp = rdp->rsp; - - /* Report the quiescent state. */ - rcu_report_exp_rdp(rsp, rdp, true); - return 0; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); } /* @@ -3659,6 +3663,7 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; struct rcu_data *rdp; + int ret; struct rcu_node *rnp; sync_exp_reset_tree(rsp); @@ -3694,9 +3699,9 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) if (!(mask_ofl_ipi & mask)) continue; rdp = per_cpu_ptr(rsp->rda, cpu); - stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, - rdp, &rdp->exp_stop_work); - mask_ofl_ipi &= ~mask; + ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0); + if (!ret) + mask_ofl_ipi &= ~mask; } /* Report quiescent states for those that went offline. 
*/ mask_ofl_test |= mask_ofl_ipi; @@ -4201,6 +4206,9 @@ int rcu_cpu_notify(struct notifier_block *self, rcu_cleanup_dying_cpu(rsp); break; case CPU_DYING_IDLE: + /* QS for any half-done expedited RCU-sched GP. */ + rcu_sched_qs(); + for_each_rcu_flavor(rsp) { rcu_cleanup_dying_idle_cpu(cpu, rsp); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3eee48bcf52b..1b969cef8fe4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -324,9 +324,6 @@ struct rcu_data { /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ - struct cpu_stop_work exp_stop_work; - /* Expedited grace-period control */ - /* for CPU stopping. */ /* 2) batch handling */ /* From 83c2c735e78da1a0d994911f730f6e1d36c88d7a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Aug 2015 20:43:02 -0700 Subject: [PATCH 45/53] rcu: Stop silencing lockdep false positive for expedited grace periods This reverts commit af859beaaba4 (rcu: Silence lockdep false positive for expedited grace periods). Because synchronize_rcu_expedited() no longer invokes synchronize_sched_expedited(), ->exp_funnel_mutex acquisition is no longer nested, so the false positive no longer happens. This commit therefore removes the extra lockdep data structures, as they are no longer needed. --- kernel/rcu/tree.c | 17 ++--------------- kernel/rcu/tree.h | 8 -------- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a42168911941..15b19baaade5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -4095,7 +4094,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { - static struct lock_class_key rcu_exp_sched_rdp_class; unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -4111,10 +4109,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (rsp == &rcu_sched_state) - lockdep_set_class_and_name(&rdp->exp_funnel_mutex, - &rcu_exp_sched_rdp_class, - "rcu_data_exp_sched"); } /* @@ -4340,7 +4334,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; - static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. 
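The tree.h blocks being trimmed here are selected by NR_CPUS against powers of the fanout: one rcu_node level suffices up to RCU_FANOUT CPUs, two up to RCU_FANOUT squared, and so on. The following stand-alone calculation shows how many levels a given CPU count needs; the fanout value is an assumed example, since CONFIG_RCU_FANOUT is configurable.

#include <stdio.h>

/* How many rcu_node levels are needed so that fanout^levels >= ncpus,
 * mirroring the RCU_FANOUT_1..RCU_FANOUT_4 thresholds in tree.h. */
static int rcu_num_levels(long ncpus, long fanout)
{
        long capacity = fanout;
        int levels = 1;

        while (capacity < ncpus) {
                capacity *= fanout;
                levels++;
        }
        return levels;
}

int main(void)
{
        long fanout = 64;       /* assumed value for illustration */
        long cpus[] = { 16, 64, 256, 4096, 262144 };

        for (unsigned i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
                printf("%6ld CPUs -> %d level(s)\n",
                       cpus[i], rcu_num_levels(cpus[i], fanout));
        return 0;
}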
*/ @@ -4400,14 +4393,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); mutex_init(&rnp->exp_funnel_mutex); - if (rsp == &rcu_sched_state) - lockdep_set_class_and_name( - &rnp->exp_funnel_mutex, - &rcu_exp_sched_class[i], exp_sched[i]); - else - lockdep_set_class_and_name( - &rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + lockdep_set_class_and_name(&rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 1b969cef8fe4..6f3b63b68886 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,8 +70,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -81,8 +79,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -93,8 +89,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -106,8 +100,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ From 807226e2fbb504d82cd504b7b6114896db41ef63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Aug 2015 12:03:45 -0700 Subject: [PATCH 46/53] rcu: Stop excluding CPU hotplug in synchronize_sched_expedited() Now that synchronize_sched_expedited() uses IPIs, a hook in rcu_sched_qs(), and the ->expmask field in the rcu_node combining tree, it is no longer necessary to exclude CPU hotplug. Any races with CPU hotplug will be detected when attempting to send the IPI. This commit therefore removes the code excluding CPU hotplug operations. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 15b19baaade5..7e7df5deadf9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3785,19 +3785,9 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, fall back to normal GP. 
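Before this change, synchronize_sched_expedited() had a classic trylock-with-fallback shape: if the hotplug lock could not be taken immediately, it fell back to a normal grace period rather than blocking. Below is a user-space rendering of that now-removed pattern, with pthread_mutex_trylock standing in for try_get_online_cpus(); the function names are made up and this is only an analogy, not the kernel path.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

static void normal_grace_period(void)
{
        puts("falling back to a normal grace period");
}

static void expedited_grace_period(void)
{
        puts("running the expedited fast path");
}

/* The shape synchronize_sched_expedited() had before this patch:
 * try the lock, and if that fails, do the slower but safe thing. */
static void old_style_expedited(void)
{
        if (pthread_mutex_trylock(&hotplug_lock) != 0) {
                normal_grace_period();  /* hotplug in flight: fall back */
                return;
        }
        expedited_grace_period();
        pthread_mutex_unlock(&hotplug_lock);
}

int main(void)
{
        old_style_expedited();                  /* lock free: fast path */

        pthread_mutex_lock(&hotplug_lock);      /* simulate hotplug in flight */
        old_style_expedited();                  /* trylock fails: fallback path */
        pthread_mutex_unlock(&hotplug_lock);
        return 0;
}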
*/ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - return; - } - WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) { - put_online_cpus(); + if (rnp == NULL) return; /* Someone else did our work for us. */ - } rcu_exp_gp_seq_start(rsp); sync_sched_exp_select_cpus(rsp); @@ -3805,8 +3795,6 @@ void synchronize_sched_expedited(void) rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); - - put_online_cpus(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); From 02ef3c4a2aae65a1632b27770bfea3f83ca06772 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Aug 2015 15:14:30 -0700 Subject: [PATCH 47/53] cpu: Remove try_get_online_cpus() Now that synchronize_sched_expedited() no longer uses it, there are no users of try_get_online_cpus() in mainline. This commit therefore removes it. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner --- include/linux/cpu.h | 2 -- kernel/cpu.c | 13 ------------- 2 files changed, 15 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 23c30bdcca86..d2ca8c38f9c4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -228,7 +228,6 @@ extern struct bus_type cpu_subsys; extern void cpu_hotplug_begin(void); extern void cpu_hotplug_done(void); extern void get_online_cpus(void); -extern bool try_get_online_cpus(void); extern void put_online_cpus(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); @@ -246,7 +245,6 @@ int cpu_down(unsigned int cpu); static inline void cpu_hotplug_begin(void) {} static inline void cpu_hotplug_done(void) {} #define get_online_cpus() do { } while (0) -#define try_get_online_cpus() true #define put_online_cpus() do { } while (0) #define cpu_hotplug_disable() do { } while (0) #define cpu_hotplug_enable() do { } while (0) diff --git a/kernel/cpu.c b/kernel/cpu.c index 82cf9dff4295..14a9cdf8abe9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -102,19 +102,6 @@ void get_online_cpus(void) } EXPORT_SYMBOL_GPL(get_online_cpus); -bool try_get_online_cpus(void) -{ - if (cpu_hotplug.active_writer == current) - return true; - if (!mutex_trylock(&cpu_hotplug.lock)) - return false; - cpuhp_lock_acquire_tryread(); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); - return true; -} -EXPORT_SYMBOL_GPL(try_get_online_cpus); - void put_online_cpus(void) { int refcount; From 66fe6cbee44e3f78d05882195a6a38e30b97b936 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2015 17:20:58 -0700 Subject: [PATCH 48/53] rcu: Prepare for consolidating expedited CPU selection This commit brings sync_sched_exp_select_cpus() into alignment with sync_rcu_exp_select_cpus(), as a first step towards consolidating them into one function. Signed-off-by: Paul E. 
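The function removed here is a small trylock wrapper around the hotplug reader count: take cpu_hotplug.lock without blocking, bump the refcount, and drop the lock. The sketch below is a simplified user-space model of that get/try_get/put trio, using a pthread mutex plus a plain counter; it deliberately omits the active_writer check and lockdep annotations and is not the kernel implementation.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy version of the hotplug reader count that try_get_online_cpus()
 * used to bump: a mutex protecting a refcount of active readers. */
static struct {
        pthread_mutex_t lock;
        int refcount;
} hotplug = { PTHREAD_MUTEX_INITIALIZER, 0 };

static void get_online(void)            /* blocking reader entry */
{
        pthread_mutex_lock(&hotplug.lock);
        hotplug.refcount++;
        pthread_mutex_unlock(&hotplug.lock);
}

static bool try_get_online(void)        /* non-blocking variant (now gone) */
{
        if (pthread_mutex_trylock(&hotplug.lock) != 0)
                return false;           /* a hotplug operation holds the lock */
        hotplug.refcount++;
        pthread_mutex_unlock(&hotplug.lock);
        return true;
}

static void put_online(void)            /* reader exit */
{
        pthread_mutex_lock(&hotplug.lock);
        hotplug.refcount--;
        pthread_mutex_unlock(&hotplug.lock);
}

int main(void)
{
        get_online();
        if (try_get_online()) {
                printf("readers: %d\n", hotplug.refcount);      /* prints 2 */
                put_online();
        }
        put_online();
        return 0;
}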
McKenney --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7e7df5deadf9..ae582e3d4136 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3661,7 +3661,6 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) unsigned long mask; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; - struct rcu_data *rdp; int ret; struct rcu_node *rnp; @@ -3697,7 +3696,6 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { if (!(mask_ofl_ipi & mask)) continue; - rdp = per_cpu_ptr(rsp->rda, cpu); ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0); if (!ret) mask_ofl_ipi &= ~mask; From dcdb8807ba0f5127d4dc67daeeb154edf50a93d1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 15 Aug 2015 19:00:31 -0700 Subject: [PATCH 49/53] rcu: Consolidate expedited CPU selection Now that sync_sched_exp_select_cpus() and sync_rcu_exp_select_cpus() are identical aside from the the argument to smp_call_function_single(), this commit consolidates them with a functional argument. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++-- kernel/rcu/tree_plugin.h | 61 +--------------------------------------- 2 files changed, 5 insertions(+), 63 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ae582e3d4136..f44f4b30c68a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3654,7 +3654,8 @@ static void synchronize_sched_expedited_cpu_stop(void *data) * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_sched_exp_select_cpus(struct rcu_state *rsp) +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) { int cpu; unsigned long flags; @@ -3696,7 +3697,7 @@ static void sync_sched_exp_select_cpus(struct rcu_state *rsp) for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { if (!(mask_ofl_ipi & mask)) continue; - ret = smp_call_function_single(cpu, synchronize_sched_expedited_cpu_stop, NULL, 0); + ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) mask_ofl_ipi &= ~mask; } @@ -3788,7 +3789,7 @@ void synchronize_sched_expedited(void) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); - sync_sched_exp_select_cpus(rsp); + sync_rcu_exp_select_cpus(rsp, synchronize_sched_expedited_cpu_stop); synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7880202f1e38..6cbfbfc58656 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -708,65 +708,6 @@ static void sync_rcu_exp_handler(void *info) rcu_report_exp_rdp(rsp, rdp, true); } -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - - /* Each pass checks a CPU for identity, offline, and idle. 
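The consolidation works because the only difference between the two selection loops is which handler the IPI carries, so the handler becomes a parameter. Here is a compact user-space sketch of that refactoring, with an ordinary function pointer standing in for smp_call_func_t; the state structures, handler names, and printed messages are invented for illustration.

#include <stdio.h>

struct fake_state { const char *name; };

typedef void (*exp_handler_t)(void *info);

/* One selection loop shared by both RCU flavors; only 'func' differs. */
static void select_cpus(struct fake_state *sp, int ncpus, exp_handler_t func)
{
        for (int cpu = 0; cpu < ncpus; cpu++) {
                printf("cpu %d: ", cpu);
                func(sp);       /* stands in for smp_call_function_single() */
        }
}

static void sched_exp_handler(void *info)
{
        struct fake_state *sp = info;
        printf("%s: flag a deferred quiescent state\n", sp->name);
}

static void preempt_exp_handler(void *info)
{
        struct fake_state *sp = info;
        printf("%s: report the quiescent state directly\n", sp->name);
}

int main(void)
{
        struct fake_state rcu_sched   = { "rcu_sched" };
        struct fake_state rcu_preempt = { "rcu_preempt" };

        select_cpus(&rcu_sched, 2, sched_exp_handler);
        select_cpus(&rcu_preempt, 2, preempt_exp_handler);
        return 0;
}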
*/ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - cpu_is_offline(cpu) || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; - ret = smp_call_function_single(cpu, - sync_rcu_exp_handler, - rsp, 0); - if (!ret) - mask_ofl_ipi &= ~mask; - } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - /** * synchronize_rcu_expedited - Brute-force RCU grace period * @@ -795,7 +736,7 @@ void synchronize_rcu_expedited(void) rcu_exp_gp_seq_start(rsp); /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp); + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); From 74611ecb0fc4c850a8f89a744ce99cbf0dd43cb2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Aug 2015 10:20:43 -0700 Subject: [PATCH 50/53] rcu: Add online/offline info to expedited stall warning message This commit makes the RCU CPU stall warning message print online/offline indications immediately after the CPU number. A "O" indicates global offline, a "." global online, and a "o" indicates RCU believes that the CPU is offline for the current grace period and "." otherwise, and an "N" indicates that RCU believes that the CPU will be offline for the next grace period, and "." otherwise, all right after the CPU number. So for CPU 10, you would normally see "10-...:" indicating that everything believes that the CPU is online. Signed-off-by: Paul E. 
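The selection loop decides whether a CPU even needs an IPI by sampling its ->dynticks counter: an even value means the CPU is idle and therefore already in an extended quiescent state, so only CPUs with an odd (non-idle) value are IPIed. The following stripped-down model shows that even/odd convention with a single C11 atomic counter; it is illustrative only, the real counter lives in struct rcu_dynticks and is manipulated with full memory ordering.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Even value: CPU is idle (extended quiescent state); odd: CPU is busy.
 * Each idle entry/exit increments the counter, flipping its parity. */
static atomic_long dynticks = 1;        /* start "busy", as a running CPU would */

static void idle_enter(void) { atomic_fetch_add(&dynticks, 1); }        /* -> even */
static void idle_exit(void)  { atomic_fetch_add(&dynticks, 1); }        /* -> odd  */

/* Mirrors the "& 0x1" test in the selection loop: no IPI needed if even. */
static bool cpu_needs_ipi(void)
{
        return atomic_load(&dynticks) & 0x1;
}

int main(void)
{
        printf("busy cpu needs IPI: %d\n", cpu_needs_ipi());            /* 1 */
        idle_enter();
        printf("idle cpu needs IPI: %d\n", cpu_needs_ipi());            /* 0 */
        idle_exit();
        printf("busy again, needs IPI: %d\n", cpu_needs_ipi());         /* 1 */
        return 0;
}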
McKenney --- kernel/rcu/tree.c | 9 ++++++++- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 31 +++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f44f4b30c68a..3d033fee0dcb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3737,11 +3737,18 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_err("INFO: %s detected expedited stalls on CPUs: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { + (void)rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + struct rcu_data *rdp; + if (!(rnp->expmask & mask)) continue; - pr_cont(" %d", cpu); + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } mask <<= 1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6f3b63b68886..191aa3678575 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -589,6 +589,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6cbfbfc58656..7b61cece80c0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -585,6 +585,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; +} + /* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace @@ -845,6 +866,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return 0; } +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + return 0; +} + /* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for From c58656382e5f2919b05913584f2c54b4f841bc9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Aug 2015 11:25:48 -0700 Subject: [PATCH 51/53] rcu: Add tasks to expedited stall-warning messages This commit adds task-print ability to the expedited RCU CPU stall warning messages in preparation for adding stall warnings to synchornize_rcu_expedited(). Signed-off-by: Paul E. 
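The per-CPU annotations in the new stall message come from a terse C idiom: indexing a two-character string literal with a boolean, so "O."[cpu_online(cpu)] yields 'O' when the predicate is false and '.' when it is true. A tiny stand-alone demonstration follows; the flag values passed in are made up.

#include <stdbool.h>
#include <stdio.h>

/* Print one stall-warning style entry: "<cpu>-XYZ", where each letter
 * collapses a boolean into a single character, as in the patch above. */
static void print_cpu_flags(int cpu, bool online, bool exp_init, bool exp_next)
{
        printf(" %d-%c%c%c", cpu,
               "O."[online],    /* 'O' = offline, '.' = online */
               "o."[exp_init],  /* 'o' = offline for the current exp GP */
               "N."[exp_next]); /* 'N' = offline for the next GP */
}

int main(void)
{
        print_cpu_flags(10, true, true, true);          /* prints " 10-..." */
        print_cpu_flags(11, false, true, false);        /* prints " 11-O.N" */
        putchar('\n');
        return 0;
}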
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3d033fee0dcb..b4f71ffa299c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3734,7 +3734,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) sync_rcu_preempt_exp_done(rnp_root)); return; } - pr_err("INFO: %s detected expedited stalls on CPUs: {", + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { (void)rcu_print_task_exp_stall(rnp); From b08517c76d764c373c232cd309ed058c98705219 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Aug 2015 12:17:29 -0700 Subject: [PATCH 52/53] rcu: Enable stall warnings for synchronize_rcu_expedited() This commit redirects synchronize_rcu_expedited()'s wait to synchronize_sched_expedited_wait(), thus enabling RCU CPU stall warnings. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7b61cece80c0..ffeb99e550e8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -761,8 +761,7 @@ void synchronize_rcu_expedited(void) /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); - wait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp)); + synchronize_sched_expedited_wait(rsp); /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); From 338b0f760e84676130c6e4d8268cb8c923b38c8c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Sep 2015 00:45:02 -0700 Subject: [PATCH 53/53] rcu: Better hotplug handling for synchronize_sched_expedited() Earlier versions of synchronize_sched_expedited() can prematurely end grace periods due to the fact that a CPU marked as cpu_is_offline() can still be using RCU read-side critical sections during the time that CPU makes its last pass through the scheduler and into the idle loop and during the time that a given CPU is in the process of coming online. This commit therefore eliminates this window by adding additional interaction with the CPU-hotplug operations. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 68 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b4f71ffa299c..42b317e13776 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -246,17 +246,23 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { + unsigned long flags; + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + local_irq_save(flags); if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); rcu_report_exp_rdp(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data), true); } + local_irq_restore(flags); } } @@ -3553,7 +3559,10 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - WARN_ON_ONCE((rnp->expmask & mask) != mask); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } rnp->expmask &= ~mask; __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. 
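With both the IPI handler and the hotplug path now able to report the same quiescent state, the report function must tolerate a bit that has already been cleared: check under the lock and return quietly instead of warning. Below is a small user-space sketch of that idempotent clear-and-propagate step, with a pthread mutex in place of rnp->lock and the tree collapsed to a single node; it is a model, not the tree.c code.

#include <pthread.h>
#include <stdio.h>

/* A single "rcu_node"-like record: a lock and a mask of CPUs that the
 * current expedited grace period is still waiting on. */
static struct {
        pthread_mutex_t lock;
        unsigned long expmask;
} node = { PTHREAD_MUTEX_INITIALIZER, 0x3UL };  /* waiting on CPUs 0 and 1 */

/* Clear 'mask' from the node, tolerating duplicate reports: if the bit
 * is already clear, some other path beat us to it, so just return. */
static void report_exp_cpu(unsigned long mask)
{
        pthread_mutex_lock(&node.lock);
        if (!(node.expmask & mask)) {
                pthread_mutex_unlock(&node.lock);
                return;                 /* already reported: nothing to do */
        }
        node.expmask &= ~mask;
        if (!node.expmask)
                printf("all quiescent states reported\n");
        pthread_mutex_unlock(&node.lock);
}

int main(void)
{
        report_exp_cpu(0x1);    /* CPU 0 reports */
        report_exp_cpu(0x1);    /* duplicate report from another path: harmless */
        report_exp_cpu(0x2);    /* CPU 1 reports: prints the completion message */
        return 0;
}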
*/ } @@ -3644,12 +3653,37 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void synchronize_sched_expedited_cpu_stop(void *data) +static void sync_sched_exp_handler(void *data) { + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. @@ -3677,7 +3711,6 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (raw_smp_processor_id() == cpu || - cpu_is_offline(cpu) || !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) mask_ofl_test |= rdp->grpmask; } @@ -3697,9 +3730,28 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { if (!(mask_ofl_ipi & mask)) continue; +retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) + if (!ret) { mask_ofl_ipi &= ~mask; + } else { + /* Failed, raced with offline. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, + flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave(&rnp->lock, + flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3796,7 +3848,7 @@ void synchronize_sched_expedited(void) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); - sync_rcu_exp_select_cpus(rsp, synchronize_sched_expedited_cpu_stop); + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); @@ -4183,6 +4235,7 @@ int rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: + sync_sched_exp_online_cleanup(cpu); rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: @@ -4195,7 +4248,10 @@ int rcu_cpu_notify(struct notifier_block *self, break; case CPU_DYING_IDLE: /* QS for any half-done expedited RCU-sched GP. */ - rcu_sched_qs(); + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); for_each_rcu_flavor(rsp) { rcu_cleanup_dying_idle_cpu(cpu, rsp);
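The retry loop added here handles a CPU that is going offline just as the IPI is sent: on failure, recheck whether the CPU still matters and is still (or again) online, wait a moment, and try again, giving up only once the CPU's expmask bit has been cleared by someone else. The single-threaded sketch below shows only that retry-with-recheck shape; the helpers, the one simulated CPU, and the lack of real locking are all simplifications.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Simulated state for one target CPU. */
static bool cpu_online_now;             /* flips as the "hotplug op" completes */
static bool still_needed = true;        /* analogue of the rnp->expmask bit */
static int deliveries;

/* Pretend IPI: fails while the CPU is offline. */
static bool try_send_ipi(void)
{
        if (!cpu_online_now)
                return false;
        deliveries++;
        return true;
}

static void send_ipi_with_retry(void)
{
        while (still_needed) {
                if (try_send_ipi())
                        return;         /* delivered: the handler will report */
                /* Failed: raced with offline. If the CPU no longer matters,
                 * its quiescent state was reported for us; otherwise wait
                 * briefly for the hotplug operation to settle and retry. */
                if (!still_needed)
                        return;
                usleep(1000);
                cpu_online_now = true;  /* simulate the CPU coming back online */
        }
}

int main(void)
{
        send_ipi_with_retry();
        printf("IPIs delivered: %d\n", deliveries);     /* prints 1 */
        return 0;
}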