From 28728dd310d48834cd486dac3cac9ae96b9deb96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 08:33:37 -0800 Subject: rcu: Make expedited RCU-sched grace period immediately detect idle Currently, sync_sched_exp_handler() will force a reschedule unless this CPU has already checked in or unless a reschedule has already been called for. This is clearly wasteful if sync_sched_exp_handler() interrupted an idle CPU, so this commit immediately reports the quiescent state in that case. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328..5f4336f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3649,6 +3649,11 @@ static void sync_sched_exp_handler(void *data) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } -- cgit v1.1 From 251c617c75f48e03523c43c4ce1dff44bc3ae2bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Jan 2016 10:52:35 -0800 Subject: rcu: Make expedited RCU-preempt stall warnings count accurately Currently, synchronize_sched_expedited_wait() simply sets the ndetected variable to the rcu_print_task_exp_stall() return value. This means that if the last rcu_node structure has no stalled tasks, record of any stalled tasks in previous rcu_node structures is lost, which can in turn result in failure to dump out the blocking rcu_node structures. Or could, had the test been correct. This commit therefore adds the return value of rcu_print_task_exp_stall() to ndetected and corrects the later test for ndetected. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f4336f..687d8a5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3778,7 +3778,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->name); ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); + ndetected += rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; @@ -3797,7 +3797,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { + if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) -- cgit v1.1 From a1e1224849d9610b50fd1dd7d6f44308a59e46af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Jan 2016 13:57:54 -0800 Subject: rcu: Make cond_resched_rcu_qs() supply RCU-sched expedited QS Although cond_resched_rcu_qs() supplies quiescent states to all flavors of normal RCU grace periods, it does nothing for expedited RCU-sched grace periods. This commit therefore adds a check for a need for a quiescent state from the current CPU by an expedited RCU-sched grace period, and invokes rcu_sched_qs() to supply that quiescent state if so. 
Note that the check is racy in that we might be migrated to some other CPU just after checking the per-CPU variable. This is OK because the act of migration will do a context switch, which will supply the needed quiescent state. The only downside is that we might do an unnecessary call to rcu_sched_qs(), but the probability is low and the overhead is small. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 687d8a5..178575c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -370,6 +370,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } -- cgit v1.1 From bea2de44ae647698dc848a671fdee6e53c192423 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 20:30:06 -0800 Subject: rcu: Add funnel-locking tracing for expedited grace periods Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 178575c..79e9206 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3584,10 +3584,18 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) + if (rnp) { mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("rel")); + } else if (rdp) { mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, + rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, + TPS("rel")); + } /* Ensure test happens before caller kfree(). 
*/ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3619,6 +3627,9 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (sync_exp_work_done(rsp, rnp0, NULL, &rdp->expedited_workdone0, s)) return NULL; + trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, + rnp0->grplo, rnp0->grphi, + TPS("acq")); return rnp0; } } @@ -3634,16 +3645,28 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, TPS("acq")); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) + trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, + rnp0->grplo, rnp0->grphi, TPS("acq")); + if (rnp1) { mutex_unlock(&rnp1->exp_funnel_mutex); - else + trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, + rnp1->grplo, rnp1->grphi, + TPS("rel")); + } else { mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_funnel_lock(rsp->name, + rdp->mynode->level + 1, + rdp->cpu, rdp->cpu, + TPS("rel")); + } rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, -- cgit v1.1 From 4f41530245c7fd4837152e264d120d05ae940eb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2016 20:49:49 -0800 Subject: rcu: Add expedited-grace-period event tracing Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79e9206..524026f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3584,17 +3584,18 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); if (rnp) { - mutex_unlock(&rnp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("rel")); + mutex_unlock(&rnp->exp_funnel_mutex); } else if (rdp) { - mutex_unlock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("rel")); + mutex_unlock(&rdp->exp_funnel_mutex); } /* Ensure test happens before caller kfree(). 
*/ smp_mb__before_atomic(); /* ^^^ */ @@ -3624,12 +3625,12 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp0 = rcu_get_root(rsp); if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, rnp0->grplo, rnp0->grphi, TPS("acq")); + if (sync_exp_work_done(rsp, rnp0, NULL, + &rdp->expedited_workdone0, s)) + return NULL; return rnp0; } } @@ -3656,16 +3657,16 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, rnp0->grplo, rnp0->grphi, TPS("acq")); if (rnp1) { - mutex_unlock(&rnp1->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, rnp1->grplo, rnp1->grphi, TPS("rel")); + mutex_unlock(&rnp1->exp_funnel_mutex); } else { - mutex_unlock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("rel")); + mutex_unlock(&rdp->exp_funnel_mutex); } rnp1 = rnp0; } @@ -3895,16 +3896,21 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); rnp = exp_funnel_lock(rsp, s); if (rnp == NULL) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, TPS("rel")); mutex_unlock(&rnp->exp_funnel_mutex); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v1.1 From e2fd9d35847d1936398d44c4df68dceb3d7f64e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 17:23:19 -0800 Subject: rcu: Remove expedited GP funnel-lock bypass Commit #cdacbe1f91264 ("rcu: Add fastpath bypassing funnel locking") turns out to be a pessimization at high load because it forces a tree full of tasks to wait for an expedited grace period that they probably do not need. This commit therefore removes this optimization. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 524026f..62e73e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3617,25 +3617,6 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) struct rcu_node *rnp1 = NULL; /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. 
- */ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, - rnp0->grplo, rnp0->grphi, - TPS("acq")); - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; - } - } - - /* * Each pass through the following loop works its way * up the rcu_node tree, returning if others have done the * work or otherwise falls through holding the root rnp's -- cgit v1.1 From ec3833ed02ae6ef2a933ece9de7cbab0c64c699e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Jan 2016 16:29:29 -0800 Subject: rcu: Force boolean subscript for expedited stall warnings The cpu_online() function can return values other than 0 and 1, which can result in subscript overflow when applied to a two-element array. This commit allows for this behavior by using "!!" on the return value from cpu_online() when used as a subscript. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 62e73e0..64c2e32 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3808,7 +3808,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], + "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } -- cgit v1.1 From d40a4f09a448382961fa9b1a2f7d4f34813f0273 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Mar 2016 14:43:44 -0800 Subject: rcu: Shorten expedited_workdone* to exp_workdone* Just a name change to save a few lines and a bit of typing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 64c2e32..89f0287 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3624,15 +3624,14 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, rdp->cpu, rdp->cpu, TPS("acq")); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, @@ -3651,8 +3650,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } rnp1 = rnp0; } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s)) return NULL; return rnp1; } -- cgit v1.1 From f6a12f34a448cc8a624070fd365c29c890138a48 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 Jan 2016 17:57:35 -0800 Subject: rcu: Enforce expedited-GP fairness via funnel wait queue The current mutex-based funnel-locking approach used by expedited grace periods is subject to severe unfairness. 
The problem arises when a few tasks, making a path from leaves to root, all wake up before other tasks do. A new task can then follow this path all the way to the root, which needlessly delays tasks whose grace period is done, but who do not happen to acquire the lock quickly enough. This commit avoids this problem by maintaining per-rcu_node wait queues, along with a per-rcu_node counter that tracks the latest grace period sought by an earlier task to visit this node. If that grace period would satisfy the current task, instead of proceeding up the tree, it waits on the current rcu_node structure using a pair of wait queues provided for that purpose. This decouples awakening of old tasks from the arrival of new tasks. If the wakeups prove to be a bottleneck, additional kthreads can be brought to bear for that purpose. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 155 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 81 insertions(+), 74 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 89f0287..bd2658e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,6 +102,7 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -3484,7 +3485,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -3500,8 +3501,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -3538,7 +3539,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -3551,8 +3552,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -3570,7 +3571,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. 
*/ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) @@ -3579,24 +3579,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - if (rnp) { - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("rel")); - mutex_unlock(&rnp->exp_funnel_mutex); - } else if (rdp) { - trace_rcu_exp_funnel_lock(rsp->name, - rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, - TPS("rel")); - mutex_unlock(&rdp->exp_funnel_mutex); - } /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3606,53 +3593,53 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. */ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp = rdp->mynode; /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. 
*/ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, TPS("acq")); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - trace_rcu_exp_funnel_lock(rsp->name, rnp0->level, - rnp0->grplo, rnp0->grphi, TPS("acq")); - if (rnp1) { - trace_rcu_exp_funnel_lock(rsp->name, rnp1->level, - rnp1->grplo, rnp1->grphi, - TPS("rel")); - mutex_unlock(&rnp1->exp_funnel_mutex); - } else { - trace_rcu_exp_funnel_lock(rsp->name, - rdp->mynode->level + 1, - rdp->cpu, rdp->cpu, - TPS("rel")); - mutex_unlock(&rdp->exp_funnel_mutex); + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x1], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; } - rnp1 = rnp0; + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); } - if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s)) - return NULL; - return rnp1; + mutex_lock(&rsp->exp_mutex); + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + return false; } /* Invoked on each online non-idle CPU for expedited quiescent state. */ @@ -3841,6 +3828,27 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } +/* + * Wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); + } +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3860,7 +3868,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) void synchronize_sched_expedited(void) { unsigned long s; - struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ @@ -3877,20 +3884,23 @@ void synchronize_sched_expedited(void) s = rcu_exp_gp_seq_snap(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + + /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); + /* Wait and clean up, including waking everyone. 
*/ + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + rcu_exp_wake(rsp, s); + + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -4190,7 +4200,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -4448,10 +4457,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4510,9 +4517,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + spin_lock_init(&rnp->exp_lock); } } -- cgit v1.1 From 356051e1de3cf65575da4ee92d1f5cee86677ee2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 13:22:53 -0700 Subject: rcu: Add exp_funnel_lock() fastpath This commit speeds up the low-contention case, especially for systems with large rcu_node trees, by attempting to directly acquire the ->exp_mutex. This fastpath checks the leaves and root first in order to avoid excessive memory contention on the mutex itself. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bd2658e..892a140 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3603,6 +3603,15 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; /* * Each pass through the following loop works its way up @@ -3635,6 +3644,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp->grphi, TPS("nxtlvl")); } mutex_lock(&rsp->exp_mutex); +fastpath: if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { mutex_unlock(&rsp->exp_mutex); return true; -- cgit v1.1 From 4ea3e85b113ab37a2d55cfabf0d709ddec088bb3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:22:25 -0700 Subject: rcu: Consolidate expedited GP code into rcu_exp_wait_wake() Currently, synchronize_rcu_expedited() and synchronize_sched_expedited() have significant duplicate code. 
This commit therefore consolidates some of this code into rcu_exp_wake(), which is now renamed to rcu_exp_wait_wake() in recognition of its added responsibilities. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 892a140..fd86eca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3839,14 +3839,18 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } /* - * Wake up everyone who piggybacked on the just-completed expedited + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited * grace period. Also update all the ->exp_seq_rq counters as needed * in order to avoid counter-wrap problems. */ -static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) { struct rcu_node *rnp; + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); @@ -3857,6 +3861,8 @@ static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s) } wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_mutex); } /** @@ -3904,13 +3910,7 @@ void synchronize_sched_expedited(void) sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); /* Wait and clean up, including waking everyone. */ - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - rcu_exp_wake(rsp, s); - - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_mutex); + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v1.1 From 179e5dcd1e5bdfac1128431d131b31322aedd2bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:27:44 -0700 Subject: rcu: Consolidate expedited GP tracing into rcu_exp_gp_seq_snap() This commit moves some duplicate code from synchronize_rcu_expedited() and synchronize_sched_expedited() into rcu_exp_gp_seq_snap(). This doesn't save lines of code, but does eliminate a "tell me twice" issue. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fd86eca..5b1c8fd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3392,8 +3392,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + unsigned long s; + smp_mb(); /* Caller's modifications seen first by other CPUs. */ - return rcu_seq_snap(&rsp->expedited_sequence); + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { @@ -3898,8 +3902,6 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. 
*/ -- cgit v1.1 From aff12cdf86e6fa891d1c30c0fad112d138bd7b10 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:32:24 -0700 Subject: rcu: Consolidate expedited GP code into exp_funnel_lock() This commit pulls the grace-period-start counter adjustment and tracing from synchronize_rcu_expedited() and synchronize_sched_expedited() into exp_funnel_lock(), thus eliminating some code duplication. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5b1c8fd..e8fff14 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3653,6 +3653,8 @@ fastpath: mutex_unlock(&rsp->exp_mutex); return true; } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); return false; } @@ -3905,9 +3907,6 @@ void synchronize_sched_expedited(void) if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); -- cgit v1.1 From 3b5f668e715bc19610ad967ef97a7e8c55a186ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Mar 2016 16:47:55 -0700 Subject: rcu: Overlap wakeups with next expedited grace period The current expedited grace-period implementation makes subsequent grace periods wait on wakeups for the prior grace period. This does not fit the dictionary definition of "expedited", so this commit allows these two phases to overlap. Doing this requires four waitqueues rather than two because tasks can now be waiting on the previous, current, and next grace periods. The fourth waitqueue makes the bit masking work out nicely. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e8fff14..1df100c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -3637,7 +3638,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x1], + wait_event(rnp->exp_wq[(s >> 1) & 0x3], sync_exp_work_done(rsp, &rdp->exp_workdone2, s)); return true; @@ -3857,6 +3858,14 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. 
*/ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); /* Recheck, avoid hang in case someone just arrived. */ if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) rnp->exp_seq_rq = s; spin_unlock(&rnp->exp_lock); } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]); + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); } trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_mutex); + mutex_unlock(&rsp->exp_wake_mutex); } /** @@ -4530,6 +4539,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) rcu_init_one_nocb(rnp); init_waitqueue_head(&rnp->exp_wq[0]); init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); } } -- cgit v1.1 From 86057b80ae31d37fcbdb5f57d15aaf1148c69f96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 08:48:36 -0800 Subject: rcu: Awaken grace-period kthread when stalled Recent kernels can fail to awaken the grace-period kthread for quiescent-state forcing. This commit is a crude hack that does a wakeup any time a stall is detected. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328..a327a25 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1224,8 +1224,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } -- cgit v1.1 From fcfd0a237bfcf0c314005007e9d76e55a25e2bad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 Jan 2016 16:42:18 -0800 Subject: rcu: Make FQS schedule advance only if FQS happened Currently, the force-quiescent-state (FQS) code in rcu_gp_kthread() can advance the next FQS even if one was not executed last time. This can happen due to timeout-duration uncertainty. This commit therefore avoids advancing the FQS schedule unless an FQS was just executed. In the corner case where an FQS was not executed, but is due now, the code does a one-jiffy wait. This change prepares for kthread kicking. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a327a25..6116cfa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2146,6 +2146,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2154,14 +2163,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. 
*/ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } -- cgit v1.1 From 8c7c4829a81c1838f18c12ce5a3a5c29a08bf0a8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 3 Jan 2016 20:29:57 -0800 Subject: rcu: Awaken grace-period kthread if too long since FQS Recent kernels can fail to awaken the grace-period kthread for quiescent-state forcing. This commit is a crude hack that does a wakeup if a scheduling-clock interrupt sees that it has been too long since force-quiescent-state (FQS) processing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6116cfa..a739292 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -385,9 +385,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -1251,6 +1253,24 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1262,6 +1282,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1335,6 +1360,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1379,8 +1409,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -2119,8 +2151,11 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); -- cgit v1.1 From 5dffed1e5721f6deae4fd67d32386ef037c5fc56 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 17 Feb 2016 11:54:28 -0800 Subject: rcu: Dump ftrace buffer when kicking grace-period kthread If it is necessary to kick the grace-period kthread, that is a good time to dump the trace buffer in order to learn why kicking was needed. This commit therefore does the dump. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a739292..86edb92 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1266,6 +1266,7 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) j = READ_ONCE(rsp->jiffies_kick_kthreads); if (time_after(jiffies, j) && rsp->gp_kthread) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); } -- cgit v1.1 From 291783b8ad77a83a6fdf91d55eee7f1ad72ed4d1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Jan 2016 13:43:30 -0800 Subject: rcutorture: Expedited-GP batch progress access to torturing This commit provides rcu_exp_batches_completed() and rcu_exp_batches_completed_sched() functions to allow torture-test modules to check how many expedited grace period batches have completed. These are analogous to the existing rcu_batches_completed(), rcu_batches_completed_bh(), and rcu_batches_completed_sched() functions. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 531a328..88df640 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -460,6 +460,28 @@ unsigned long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). + */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + +/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) -- cgit v1.1