From d1ec4c34c7a9f328e43ea87522119258194f28f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 May 2015 10:41:58 -0700 Subject: rcu: Drop RCU_USER_QS in favor of NO_HZ_FULL The RCU_USER_QS Kconfig parameter is now just a synonym for NO_HZ_FULL, so this commit eliminates RCU_USER_QS, replacing all uses with NO_HZ_FULL. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker --- include/linux/rcupdate.h | 4 ++-- init/Kconfig | 9 --------- kernel/rcu/tree.c | 8 ++++---- kernel/time/Kconfig | 2 -- 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 4cf5f51..237f7b8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -309,7 +309,7 @@ static inline void rcu_sysrq_end(void) } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL void rcu_user_enter(void); void rcu_user_exit(void); #else @@ -317,7 +317,7 @@ static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } static inline void rcu_user_hooks_switch(struct task_struct *prev, struct task_struct *next) { } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); diff --git a/init/Kconfig b/init/Kconfig index af09b4f..fdeff4a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -538,15 +538,6 @@ config RCU_STALL_COMMON config CONTEXT_TRACKING bool -config RCU_USER_QS - bool - help - This option sets hooks on kernel / userspace boundaries and - puts RCU in extended quiescent state when the CPU runs in - userspace. It means that when a CPU runs in userspace, it is - excluded from the global RCU state machine and thus doesn't - try to keep the timer tick on for RCU. - config CONTEXT_TRACKING_FORCE bool "Force context tracking" depends on CONTEXT_TRACKING diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65137bc..8b5dd8b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -701,7 +701,7 @@ void rcu_idle_enter(void) } EXPORT_SYMBOL_GPL(rcu_idle_enter); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -714,7 +714,7 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -828,7 +828,7 @@ void rcu_idle_exit(void) } EXPORT_SYMBOL_GPL(rcu_idle_exit); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_exit - inform RCU that we are exiting userspace. * @@ -839,7 +839,7 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 579ce1b..4008d9f 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -92,12 +92,10 @@ config NO_HZ_FULL depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP - # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING # VIRT_CPU_ACCOUNTING_GEN dependency depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON - select RCU_USER_QS select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK -- cgit v1.1 From 21b05de4c8fac08fff08cf84ef1d4fe5786f9608 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 May 2015 17:29:51 -0700 Subject: documentation: Bring rcutorture parameters up to date This commit changes the documentation of the rcutorture parameters to better match reality. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1d6f045..01b5b68 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3135,22 +3135,35 @@ bytes respectively. Such letter suffixes can also be entirely omitted. in a given burst of a callback-flood test. rcutorture.fqs_duration= [KNL] - Set duration of force_quiescent_state bursts. + Set duration of force_quiescent_state bursts + in microseconds. rcutorture.fqs_holdoff= [KNL] - Set holdoff time within force_quiescent_state bursts. + Set holdoff time within force_quiescent_state bursts + in microseconds. rcutorture.fqs_stutter= [KNL] - Set wait time between force_quiescent_state bursts. + Set wait time between force_quiescent_state bursts + in seconds. + + rcutorture.gp_cond= [KNL] + Use conditional/asynchronous update-side + primitives, if available. rcutorture.gp_exp= [KNL] - Use expedited update-side primitives. + Use expedited update-side primitives, if available. rcutorture.gp_normal= [KNL] - Use normal (non-expedited) update-side primitives. - If both gp_exp and gp_normal are set, do both. - If neither gp_exp nor gp_normal are set, still - do both. + Use normal (non-expedited) asynchronous + update-side primitives, if available. + + rcutorture.gp_sync= [KNL] + Use normal (non-expedited) synchronous + update-side primitives, if available. If all + of rcutorture.gp_cond=, rcutorture.gp_exp=, + rcutorture.gp_normal=, and rcutorture.gp_sync= + are zero, rcutorture acts as if is interpreted + they are all non-zero. rcutorture.n_barrier_cbs= [KNL] Set callbacks/threads for rcu_barrier() testing. @@ -3177,9 +3190,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Set time (s) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. - rcutorture.torture_runnable= [BOOT] - Start rcutorture running at boot time. - rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks allows some CPUs to go into dyntick-idle mode @@ -3220,6 +3230,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Test RCU's dyntick-idle handling. See also the rcutorture.shuffle_interval parameter. + rcutorture.torture_runnable= [BOOT] + Start rcutorture running at boot time. + rcutorture.torture_type= [KNL] Specify the RCU implementation to test. -- cgit v1.1 From 297f739938908a4262603314576e32ee7375296c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 May 2015 17:31:07 -0700 Subject: documentation: Fix spelling of "operators" Reported-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney --- Documentation/RCU/rcu_dereference.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt index 1e6c0da..c0bf244 100644 --- a/Documentation/RCU/rcu_dereference.txt +++ b/Documentation/RCU/rcu_dereference.txt @@ -28,7 +28,7 @@ o You must use one of the rcu_dereference() family of primitives o Avoid cancellation when using the "+" and "-" infix arithmetic operators. For example, for a given variable "x", avoid "(x-x)". There are similar arithmetic pitfalls from other - arithmetic operatiors, such as "(x*0)", "(x/(x+1))" or "(x%1)". + arithmetic operators, such as "(x*0)", "(x/(x+1))" or "(x%1)". The compiler is within its rights to substitute zero for all of these expressions, so that subsequent accesses no longer depend on the rcu_dereference(), again possibly resulting in bugs due -- cgit v1.1 From 57aecae950c55ef50934640794160cd118e73256 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 May 2015 18:27:42 -0700 Subject: documentation: Fix variable-name typo in memory-barriers.txt Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 13feb69..3d06f98 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -746,7 +746,7 @@ You must also be careful not to rely too much on boolean short-circuit evaluation. Consider this example: q = READ_ONCE_CTRL(a); - if (a || 1 > 0) + if (q || 1 > 0) ACCESS_ONCE(b) = 1; Because the first condition cannot fault and the second condition is -- cgit v1.1 From 9af194cefc3c40e75a59df4cbb06e1c1064bee7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Jun 2015 14:33:24 -0700 Subject: documentation: Replace ACCESS_ONCE() by READ_ONCE() and WRITE_ONCE() Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 346 +++++++++++++++++++------------------- 1 file changed, 177 insertions(+), 169 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 3d06f98..470c07c 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -194,22 +194,22 @@ There are some minimal guarantees that may be expected of a CPU: (*) On any given CPU, dependent memory accesses will be issued in order, with respect to itself. This means that for: - ACCESS_ONCE(Q) = P; smp_read_barrier_depends(); D = ACCESS_ONCE(*Q); + WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q); the CPU will issue the following memory operations: Q = LOAD P, D = LOAD *Q and always in that order. On most systems, smp_read_barrier_depends() - does nothing, but it is required for DEC Alpha. The ACCESS_ONCE() - is required to prevent compiler mischief. Please note that you - should normally use something like rcu_dereference() instead of - open-coding smp_read_barrier_depends(). + does nothing, but it is required for DEC Alpha. The READ_ONCE() + and WRITE_ONCE() are required to prevent compiler mischief. Please + note that you should normally use something like rcu_dereference() + instead of open-coding smp_read_barrier_depends(). (*) Overlapping loads and stores within a particular CPU will appear to be ordered within that CPU. This means that for: - a = ACCESS_ONCE(*X); ACCESS_ONCE(*X) = b; + a = READ_ONCE(*X); WRITE_ONCE(*X, b); the CPU will only issue the following sequence of memory operations: @@ -217,7 +217,7 @@ There are some minimal guarantees that may be expected of a CPU: And for: - ACCESS_ONCE(*X) = c; d = ACCESS_ONCE(*X); + WRITE_ONCE(*X, c); d = READ_ONCE(*X); the CPU will only issue: @@ -228,11 +228,11 @@ There are some minimal guarantees that may be expected of a CPU: And there are a number of things that _must_ or _must_not_ be assumed: - (*) It _must_not_ be assumed that the compiler will do what you want with - memory references that are not protected by ACCESS_ONCE(). Without - ACCESS_ONCE(), the compiler is within its rights to do all sorts - of "creative" transformations, which are covered in the Compiler - Barrier section. + (*) It _must_not_ be assumed that the compiler will do what you want + with memory references that are not protected by READ_ONCE() and + WRITE_ONCE(). Without them, the compiler is within its rights to + do all sorts of "creative" transformations, which are covered in + the Compiler Barrier section. (*) It _must_not_ be assumed that independent loads and stores will be issued in the order given. This means that for: @@ -520,8 +520,8 @@ following sequence of events: { A == 1, B == 2, C = 3, P == &A, Q == &C } B = 4; - ACCESS_ONCE(P) = &B - Q = ACCESS_ONCE(P); + WRITE_ONCE(P, &B) + Q = READ_ONCE(P); D = *Q; There's a clear data dependency here, and it would seem that by the end of the @@ -547,8 +547,8 @@ between the address load and the data load: { A == 1, B == 2, C = 3, P == &A, Q == &C } B = 4; - ACCESS_ONCE(P) = &B - Q = ACCESS_ONCE(P); + WRITE_ONCE(P, &B); + Q = READ_ONCE(P); D = *Q; @@ -574,8 +574,8 @@ access: { M[0] == 1, M[1] == 2, M[3] = 3, P == 0, Q == 3 } M[1] = 4; - ACCESS_ONCE(P) = 1 - Q = ACCESS_ONCE(P); + WRITE_ONCE(P, 1); + Q = READ_ONCE(P); D = M[Q]; @@ -596,10 +596,10 @@ A load-load control dependency requires a full read memory barrier, not simply a data dependency barrier to make it work correctly. Consider the following bit of code: - q = ACCESS_ONCE(a); + q = READ_ONCE(a); if (q) { /* BUG: No data dependency!!! */ - p = ACCESS_ONCE(b); + p = READ_ONCE(b); } This will not have the desired effect because there is no actual data @@ -608,10 +608,10 @@ by attempting to predict the outcome in advance, so that other CPUs see the load from b as having happened before the load from a. In such a case what's actually required is: - q = ACCESS_ONCE(a); + q = READ_ONCE(a); if (q) { - p = ACCESS_ONCE(b); + p = READ_ONCE(b); } However, stores are not speculated. This means that ordering -is- provided @@ -619,7 +619,7 @@ for load-store control dependencies, as in the following example: q = READ_ONCE_CTRL(a); if (q) { - ACCESS_ONCE(b) = p; + WRITE_ONCE(b, p); } Control dependencies pair normally with other types of barriers. That @@ -647,11 +647,11 @@ branches of the "if" statement as follows: q = READ_ONCE_CTRL(a); if (q) { barrier(); - ACCESS_ONCE(b) = p; + WRITE_ONCE(b, p); do_something(); } else { barrier(); - ACCESS_ONCE(b) = p; + WRITE_ONCE(b, p); do_something_else(); } @@ -660,12 +660,12 @@ optimization levels: q = READ_ONCE_CTRL(a); barrier(); - ACCESS_ONCE(b) = p; /* BUG: No ordering vs. load from a!!! */ + WRITE_ONCE(b, p); /* BUG: No ordering vs. load from a!!! */ if (q) { - /* ACCESS_ONCE(b) = p; -- moved up, BUG!!! */ + /* WRITE_ONCE(b, p); -- moved up, BUG!!! */ do_something(); } else { - /* ACCESS_ONCE(b) = p; -- moved up, BUG!!! */ + /* WRITE_ONCE(b, p); -- moved up, BUG!!! */ do_something_else(); } @@ -676,7 +676,7 @@ assembly code even after all compiler optimizations have been applied. Therefore, if you need ordering in this example, you need explicit memory barriers, for example, smp_store_release(): - q = ACCESS_ONCE(a); + q = READ_ONCE(a); if (q) { smp_store_release(&b, p); do_something(); @@ -690,10 +690,10 @@ ordering is guaranteed only when the stores differ, for example: q = READ_ONCE_CTRL(a); if (q) { - ACCESS_ONCE(b) = p; + WRITE_ONCE(b, p); do_something(); } else { - ACCESS_ONCE(b) = r; + WRITE_ONCE(b, r); do_something_else(); } @@ -706,10 +706,10 @@ the needed conditional. For example: q = READ_ONCE_CTRL(a); if (q % MAX) { - ACCESS_ONCE(b) = p; + WRITE_ONCE(b, p); do_something(); } else { - ACCESS_ONCE(b) = r; + WRITE_ONCE(b, r); do_something_else(); } @@ -718,7 +718,7 @@ equal to zero, in which case the compiler is within its rights to transform the above code into the following: q = READ_ONCE_CTRL(a); - ACCESS_ONCE(b) = p; + WRITE_ONCE(b, p); do_something_else(); Given this transformation, the CPU is not required to respect the ordering @@ -731,10 +731,10 @@ one, perhaps as follows: q = READ_ONCE_CTRL(a); BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */ if (q % MAX) { - ACCESS_ONCE(b) = p; + WRITE_ONCE(b, p); do_something(); } else { - ACCESS_ONCE(b) = r; + WRITE_ONCE(b, r); do_something_else(); } @@ -747,17 +747,17 @@ evaluation. Consider this example: q = READ_ONCE_CTRL(a); if (q || 1 > 0) - ACCESS_ONCE(b) = 1; + WRITE_ONCE(b, 1); Because the first condition cannot fault and the second condition is always true, the compiler can transform this example as following, defeating control dependency: q = READ_ONCE_CTRL(a); - ACCESS_ONCE(b) = 1; + WRITE_ONCE(b, 1); This example underscores the need to ensure that the compiler cannot -out-guess your code. More generally, although ACCESS_ONCE() does force +out-guess your code. More generally, although READ_ONCE() does force the compiler to actually emit code for a given load, it does not force the compiler to use the results. @@ -769,7 +769,7 @@ x and y both being zero: ======================= ======================= r1 = READ_ONCE_CTRL(x); r2 = READ_ONCE_CTRL(y); if (r1 > 0) if (r2 > 0) - ACCESS_ONCE(y) = 1; ACCESS_ONCE(x) = 1; + WRITE_ONCE(y, 1); WRITE_ONCE(x, 1); assert(!(r1 == 1 && r2 == 1)); @@ -779,7 +779,7 @@ then adding the following CPU would guarantee a related assertion: CPU 2 ===================== - ACCESS_ONCE(x) = 2; + WRITE_ONCE(x, 2); assert(!(r1 == 2 && r2 == 1 && x == 2)); /* FAILS!!! */ @@ -798,8 +798,7 @@ In summary: (*) Control dependencies must be headed by READ_ONCE_CTRL(). Or, as a much less preferable alternative, interpose - be headed by READ_ONCE() or an ACCESS_ONCE() read and must - have smp_read_barrier_depends() between this read and the + smp_read_barrier_depends() between a READ_ONCE() and the control-dependent write. (*) Control dependencies can order prior loads against later stores. @@ -815,15 +814,16 @@ In summary: (*) Control dependencies require at least one run-time conditional between the prior load and the subsequent store, and this - conditional must involve the prior load. If the compiler - is able to optimize the conditional away, it will have also - optimized away the ordering. Careful use of ACCESS_ONCE() can - help to preserve the needed conditional. + conditional must involve the prior load. If the compiler is able + to optimize the conditional away, it will have also optimized + away the ordering. Careful use of READ_ONCE_CTRL() READ_ONCE(), + and WRITE_ONCE() can help to preserve the needed conditional. (*) Control dependencies require that the compiler avoid reordering the - dependency into nonexistence. Careful use of ACCESS_ONCE() or - barrier() can help to preserve your control dependency. Please - see the Compiler Barrier section for more information. + dependency into nonexistence. Careful use of READ_ONCE_CTRL() + or smp_read_barrier_depends() can help to preserve your control + dependency. Please see the Compiler Barrier section for more + information. (*) Control dependencies pair normally with other types of barriers. @@ -848,11 +848,11 @@ barrier, an acquire barrier, a release barrier, or a general barrier: CPU 1 CPU 2 =============== =============== - ACCESS_ONCE(a) = 1; + WRITE_ONCE(a, 1); - ACCESS_ONCE(b) = 2; x = ACCESS_ONCE(b); + WRITE_ONCE(b, 2); x = READ_ONCE(b); - y = ACCESS_ONCE(a); + y = READ_ONCE(a); Or: @@ -860,7 +860,7 @@ Or: =============== =============================== a = 1; - ACCESS_ONCE(b) = &a; x = ACCESS_ONCE(b); + WRITE_ONCE(b, &a); x = READ_ONCE(b); y = *x; @@ -868,11 +868,11 @@ Or even: CPU 1 CPU 2 =============== =============================== - r1 = ACCESS_ONCE(y); + r1 = READ_ONCE(y); - ACCESS_ONCE(y) = 1; if (r2 = ACCESS_ONCE(x)) { + WRITE_ONCE(y, 1); if (r2 = READ_ONCE(x)) { - ACCESS_ONCE(y) = 1; + WRITE_ONCE(y, 1); } assert(r1 == 0 || r2 == 0); @@ -886,11 +886,11 @@ versa: CPU 1 CPU 2 =================== =================== - ACCESS_ONCE(a) = 1; }---- --->{ v = ACCESS_ONCE(c); - ACCESS_ONCE(b) = 2; } \ / { w = ACCESS_ONCE(d); + WRITE_ONCE(a, 1); }---- --->{ v = READ_ONCE(c); + WRITE_ONCE(b, 2); } \ / { w = READ_ONCE(d); \ - ACCESS_ONCE(c) = 3; } / \ { x = ACCESS_ONCE(a); - ACCESS_ONCE(d) = 4; }---- --->{ y = ACCESS_ONCE(b); + WRITE_ONCE(c, 3); } / \ { x = READ_ONCE(a); + WRITE_ONCE(d, 4); }---- --->{ y = READ_ONCE(b); EXAMPLES OF MEMORY BARRIER SEQUENCES @@ -1340,10 +1340,10 @@ compiler from moving the memory accesses either side of it to the other side: barrier(); -This is a general barrier -- there are no read-read or write-write variants -of barrier(). However, ACCESS_ONCE() can be thought of as a weak form -for barrier() that affects only the specific accesses flagged by the -ACCESS_ONCE(). +This is a general barrier -- there are no read-read or write-write +variants of barrier(). However, READ_ONCE() and WRITE_ONCE() can be +thought of as weak forms of barrier() that affect only the specific +accesses flagged by the READ_ONCE() or WRITE_ONCE(). The barrier() function has the following effects: @@ -1355,9 +1355,10 @@ The barrier() function has the following effects: (*) Within a loop, forces the compiler to load the variables used in that loop's conditional on each pass through that loop. -The ACCESS_ONCE() function can prevent any number of optimizations that, -while perfectly safe in single-threaded code, can be fatal in concurrent -code. Here are some examples of these sorts of optimizations: +The READ_ONCE() and WRITE_ONCE() functions can prevent any number of +optimizations that, while perfectly safe in single-threaded code, can +be fatal in concurrent code. Here are some examples of these sorts +of optimizations: (*) The compiler is within its rights to reorder loads and stores to the same variable, and in some cases, the CPU is within its @@ -1370,11 +1371,11 @@ code. Here are some examples of these sorts of optimizations: Might result in an older value of x stored in a[1] than in a[0]. Prevent both the compiler and the CPU from doing this as follows: - a[0] = ACCESS_ONCE(x); - a[1] = ACCESS_ONCE(x); + a[0] = READ_ONCE(x); + a[1] = READ_ONCE(x); - In short, ACCESS_ONCE() provides cache coherence for accesses from - multiple CPUs to a single variable. + In short, READ_ONCE() and WRITE_ONCE() provide cache coherence for + accesses from multiple CPUs to a single variable. (*) The compiler is within its rights to merge successive loads from the same variable. Such merging can cause the compiler to "optimize" @@ -1391,9 +1392,9 @@ code. Here are some examples of these sorts of optimizations: for (;;) do_something_with(tmp); - Use ACCESS_ONCE() to prevent the compiler from doing this to you: + Use READ_ONCE() to prevent the compiler from doing this to you: - while (tmp = ACCESS_ONCE(a)) + while (tmp = READ_ONCE(a)) do_something_with(tmp); (*) The compiler is within its rights to reload a variable, for example, @@ -1415,9 +1416,9 @@ code. Here are some examples of these sorts of optimizations: a was modified by some other CPU between the "while" statement and the call to do_something_with(). - Again, use ACCESS_ONCE() to prevent the compiler from doing this: + Again, use READ_ONCE() to prevent the compiler from doing this: - while (tmp = ACCESS_ONCE(a)) + while (tmp = READ_ONCE(a)) do_something_with(tmp); Note that if the compiler runs short of registers, it might save @@ -1437,21 +1438,21 @@ code. Here are some examples of these sorts of optimizations: do { } while (0); - This transformation is a win for single-threaded code because it gets - rid of a load and a branch. The problem is that the compiler will - carry out its proof assuming that the current CPU is the only one - updating variable 'a'. If variable 'a' is shared, then the compiler's - proof will be erroneous. Use ACCESS_ONCE() to tell the compiler - that it doesn't know as much as it thinks it does: + This transformation is a win for single-threaded code because it + gets rid of a load and a branch. The problem is that the compiler + will carry out its proof assuming that the current CPU is the only + one updating variable 'a'. If variable 'a' is shared, then the + compiler's proof will be erroneous. Use READ_ONCE() to tell the + compiler that it doesn't know as much as it thinks it does: - while (tmp = ACCESS_ONCE(a)) + while (tmp = READ_ONCE(a)) do_something_with(tmp); But please note that the compiler is also closely watching what you - do with the value after the ACCESS_ONCE(). For example, suppose you + do with the value after the READ_ONCE(). For example, suppose you do the following and MAX is a preprocessor macro with the value 1: - while ((tmp = ACCESS_ONCE(a)) % MAX) + while ((tmp = READ_ONCE(a)) % MAX) do_something_with(tmp); Then the compiler knows that the result of the "%" operator applied @@ -1475,12 +1476,12 @@ code. Here are some examples of these sorts of optimizations: surprise if some other CPU might have stored to variable 'a' in the meantime. - Use ACCESS_ONCE() to prevent the compiler from making this sort of + Use WRITE_ONCE() to prevent the compiler from making this sort of wrong guess: - ACCESS_ONCE(a) = 0; + WRITE_ONCE(a, 0); /* Code that does not store to variable a. */ - ACCESS_ONCE(a) = 0; + WRITE_ONCE(a, 0); (*) The compiler is within its rights to reorder memory accesses unless you tell it not to. For example, consider the following interaction @@ -1509,40 +1510,43 @@ code. Here are some examples of these sorts of optimizations: } If the interrupt occurs between these two statement, then - interrupt_handler() might be passed a garbled msg. Use ACCESS_ONCE() + interrupt_handler() might be passed a garbled msg. Use WRITE_ONCE() to prevent this as follows: void process_level(void) { - ACCESS_ONCE(msg) = get_message(); - ACCESS_ONCE(flag) = true; + WRITE_ONCE(msg, get_message()); + WRITE_ONCE(flag, true); } void interrupt_handler(void) { - if (ACCESS_ONCE(flag)) - process_message(ACCESS_ONCE(msg)); + if (READ_ONCE(flag)) + process_message(READ_ONCE(msg)); } - Note that the ACCESS_ONCE() wrappers in interrupt_handler() - are needed if this interrupt handler can itself be interrupted - by something that also accesses 'flag' and 'msg', for example, - a nested interrupt or an NMI. Otherwise, ACCESS_ONCE() is not - needed in interrupt_handler() other than for documentation purposes. - (Note also that nested interrupts do not typically occur in modern - Linux kernels, in fact, if an interrupt handler returns with - interrupts enabled, you will get a WARN_ONCE() splat.) - - You should assume that the compiler can move ACCESS_ONCE() past - code not containing ACCESS_ONCE(), barrier(), or similar primitives. - - This effect could also be achieved using barrier(), but ACCESS_ONCE() - is more selective: With ACCESS_ONCE(), the compiler need only forget - the contents of the indicated memory locations, while with barrier() - the compiler must discard the value of all memory locations that - it has currented cached in any machine registers. Of course, - the compiler must also respect the order in which the ACCESS_ONCE()s - occur, though the CPU of course need not do so. + Note that the READ_ONCE() and WRITE_ONCE() wrappers in + interrupt_handler() are needed if this interrupt handler can itself + be interrupted by something that also accesses 'flag' and 'msg', + for example, a nested interrupt or an NMI. Otherwise, READ_ONCE() + and WRITE_ONCE() are not needed in interrupt_handler() other than + for documentation purposes. (Note also that nested interrupts + do not typically occur in modern Linux kernels, in fact, if an + interrupt handler returns with interrupts enabled, you will get a + WARN_ONCE() splat.) + + You should assume that the compiler can move READ_ONCE() and + WRITE_ONCE() past code not containing READ_ONCE(), WRITE_ONCE(), + barrier(), or similar primitives. + + This effect could also be achieved using barrier(), but READ_ONCE() + and WRITE_ONCE() are more selective: With READ_ONCE() and + WRITE_ONCE(), the compiler need only forget the contents of the + indicated memory locations, while with barrier() the compiler must + discard the value of all memory locations that it has currented + cached in any machine registers. Of course, the compiler must also + respect the order in which the READ_ONCE()s and WRITE_ONCE()s occur, + though the CPU of course need not do so. (*) The compiler is within its rights to invent stores to a variable, as in the following example: @@ -1562,16 +1566,16 @@ code. Here are some examples of these sorts of optimizations: a branch. Unfortunately, in concurrent code, this optimization could cause some other CPU to see a spurious value of 42 -- even if variable 'a' was never zero -- when loading variable 'b'. - Use ACCESS_ONCE() to prevent this as follows: + Use WRITE_ONCE() to prevent this as follows: if (a) - ACCESS_ONCE(b) = a; + WRITE_ONCE(b, a); else - ACCESS_ONCE(b) = 42; + WRITE_ONCE(b, 42); The compiler can also invent loads. These are usually less damaging, but they can result in cache-line bouncing and thus in - poor performance and scalability. Use ACCESS_ONCE() to prevent + poor performance and scalability. Use READ_ONCE() to prevent invented loads. (*) For aligned memory locations whose size allows them to be accessed @@ -1590,9 +1594,9 @@ code. Here are some examples of these sorts of optimizations: This optimization can therefore be a win in single-threaded code. In fact, a recent bug (since fixed) caused GCC to incorrectly use this optimization in a volatile store. In the absence of such bugs, - use of ACCESS_ONCE() prevents store tearing in the following example: + use of WRITE_ONCE() prevents store tearing in the following example: - ACCESS_ONCE(p) = 0x00010002; + WRITE_ONCE(p, 0x00010002); Use of packed structures can also result in load and store tearing, as in this example: @@ -1609,22 +1613,23 @@ code. Here are some examples of these sorts of optimizations: foo2.b = foo1.b; foo2.c = foo1.c; - Because there are no ACCESS_ONCE() wrappers and no volatile markings, - the compiler would be well within its rights to implement these three - assignment statements as a pair of 32-bit loads followed by a pair - of 32-bit stores. This would result in load tearing on 'foo1.b' - and store tearing on 'foo2.b'. ACCESS_ONCE() again prevents tearing - in this example: + Because there are no READ_ONCE() or WRITE_ONCE() wrappers and no + volatile markings, the compiler would be well within its rights to + implement these three assignment statements as a pair of 32-bit + loads followed by a pair of 32-bit stores. This would result in + load tearing on 'foo1.b' and store tearing on 'foo2.b'. READ_ONCE() + and WRITE_ONCE() again prevent tearing in this example: foo2.a = foo1.a; - ACCESS_ONCE(foo2.b) = ACCESS_ONCE(foo1.b); + WRITE_ONCE(foo2.b, READ_ONCE(foo1.b)); foo2.c = foo1.c; -All that aside, it is never necessary to use ACCESS_ONCE() on a variable -that has been marked volatile. For example, because 'jiffies' is marked -volatile, it is never necessary to say ACCESS_ONCE(jiffies). The reason -for this is that ACCESS_ONCE() is implemented as a volatile cast, which -has no effect when its argument is already marked volatile. +All that aside, it is never necessary to use READ_ONCE() and +WRITE_ONCE() on a variable that has been marked volatile. For example, +because 'jiffies' is marked volatile, it is never necessary to +say READ_ONCE(jiffies). The reason for this is that READ_ONCE() and +WRITE_ONCE() are implemented as volatile casts, which has no effect when +its argument is already marked volatile. Please note that these compiler barriers have no direct effect on the CPU, which may then reorder things however it wishes. @@ -1646,14 +1651,15 @@ The Linux kernel has eight basic CPU memory barriers: All memory barriers except the data dependency barriers imply a compiler barrier. Data dependencies do not impose any additional compiler ordering. -Aside: In the case of data dependencies, the compiler would be expected to -issue the loads in the correct order (eg. `a[b]` would have to load the value -of b before loading a[b]), however there is no guarantee in the C specification -that the compiler may not speculate the value of b (eg. is equal to 1) and load -a before b (eg. tmp = a[1]; if (b != 1) tmp = a[b]; ). There is also the -problem of a compiler reloading b after having loaded a[b], thus having a newer -copy of b than a[b]. A consensus has not yet been reached about these problems, -however the ACCESS_ONCE macro is a good place to start looking. +Aside: In the case of data dependencies, the compiler would be expected +to issue the loads in the correct order (eg. `a[b]` would have to load +the value of b before loading a[b]), however there is no guarantee in +the C specification that the compiler may not speculate the value of b +(eg. is equal to 1) and load a before b (eg. tmp = a[1]; if (b != 1) +tmp = a[b]; ). There is also the problem of a compiler reloading b after +having loaded a[b], thus having a newer copy of b than a[b]. A consensus +has not yet been reached about these problems, however the READ_ONCE() +macro is a good place to start looking. SMP memory barriers are reduced to compiler barriers on uniprocessor compiled systems because it is assumed that a CPU will appear to be self-consistent, @@ -2126,12 +2132,12 @@ three CPUs; then should the following sequence of events occur: CPU 1 CPU 2 =============================== =============================== - ACCESS_ONCE(*A) = a; ACCESS_ONCE(*E) = e; + WRITE_ONCE(*A, a); WRITE_ONCE(*E, e); ACQUIRE M ACQUIRE Q - ACCESS_ONCE(*B) = b; ACCESS_ONCE(*F) = f; - ACCESS_ONCE(*C) = c; ACCESS_ONCE(*G) = g; + WRITE_ONCE(*B, b); WRITE_ONCE(*F, f); + WRITE_ONCE(*C, c); WRITE_ONCE(*G, g); RELEASE M RELEASE Q - ACCESS_ONCE(*D) = d; ACCESS_ONCE(*H) = h; + WRITE_ONCE(*D, d); WRITE_ONCE(*H, h); Then there is no guarantee as to what order CPU 3 will see the accesses to *A through *H occur in, other than the constraints imposed by the separate locks @@ -2151,18 +2157,18 @@ However, if the following occurs: CPU 1 CPU 2 =============================== =============================== - ACCESS_ONCE(*A) = a; + WRITE_ONCE(*A, a); ACQUIRE M [1] - ACCESS_ONCE(*B) = b; - ACCESS_ONCE(*C) = c; + WRITE_ONCE(*B, b); + WRITE_ONCE(*C, c); RELEASE M [1] - ACCESS_ONCE(*D) = d; ACCESS_ONCE(*E) = e; + WRITE_ONCE(*D, d); WRITE_ONCE(*E, e); ACQUIRE M [2] smp_mb__after_unlock_lock(); - ACCESS_ONCE(*F) = f; - ACCESS_ONCE(*G) = g; + WRITE_ONCE(*F, f); + WRITE_ONCE(*G, g); RELEASE M [2] - ACCESS_ONCE(*H) = h; + WRITE_ONCE(*H, h); CPU 3 might see: @@ -2881,11 +2887,11 @@ A programmer might take it for granted that the CPU will perform memory operations in exactly the order specified, so that if the CPU is, for example, given the following piece of code to execute: - a = ACCESS_ONCE(*A); - ACCESS_ONCE(*B) = b; - c = ACCESS_ONCE(*C); - d = ACCESS_ONCE(*D); - ACCESS_ONCE(*E) = e; + a = READ_ONCE(*A); + WRITE_ONCE(*B, b); + c = READ_ONCE(*C); + d = READ_ONCE(*D); + WRITE_ONCE(*E, e); they would then expect that the CPU will complete the memory operation for each instruction before moving on to the next one, leading to a definite sequence of @@ -2932,12 +2938,12 @@ However, it is guaranteed that a CPU will be self-consistent: it will see its _own_ accesses appear to be correctly ordered, without the need for a memory barrier. For instance with the following code: - U = ACCESS_ONCE(*A); - ACCESS_ONCE(*A) = V; - ACCESS_ONCE(*A) = W; - X = ACCESS_ONCE(*A); - ACCESS_ONCE(*A) = Y; - Z = ACCESS_ONCE(*A); + U = READ_ONCE(*A); + WRITE_ONCE(*A, V); + WRITE_ONCE(*A, W); + X = READ_ONCE(*A); + WRITE_ONCE(*A, Y); + Z = READ_ONCE(*A); and assuming no intervention by an external influence, it can be assumed that the final result will appear to be: @@ -2953,13 +2959,14 @@ accesses: U=LOAD *A, STORE *A=V, STORE *A=W, X=LOAD *A, STORE *A=Y, Z=LOAD *A in that order, but, without intervention, the sequence may have almost any -combination of elements combined or discarded, provided the program's view of -the world remains consistent. Note that ACCESS_ONCE() is -not- optional -in the above example, as there are architectures where a given CPU might -reorder successive loads to the same location. On such architectures, -ACCESS_ONCE() does whatever is necessary to prevent this, for example, on -Itanium the volatile casts used by ACCESS_ONCE() cause GCC to emit the -special ld.acq and st.rel instructions that prevent such reordering. +combination of elements combined or discarded, provided the program's view +of the world remains consistent. Note that READ_ONCE() and WRITE_ONCE() +are -not- optional in the above example, as there are architectures +where a given CPU might reorder successive loads to the same location. +On such architectures, READ_ONCE() and WRITE_ONCE() do whatever is +necessary to prevent this, for example, on Itanium the volatile casts +used by READ_ONCE() and WRITE_ONCE() cause GCC to emit the special ld.acq +and st.rel instructions (respectively) that prevent such reordering. The compiler may also combine, discard or defer elements of the sequence before the CPU even sees them. @@ -2973,13 +2980,14 @@ may be reduced to: *A = W; -since, without either a write barrier or an ACCESS_ONCE(), it can be +since, without either a write barrier or an WRITE_ONCE(), it can be assumed that the effect of the storage of V to *A is lost. Similarly: *A = Y; Z = *A; -may, without a memory barrier or an ACCESS_ONCE(), be reduced to: +may, without a memory barrier or an READ_ONCE() and WRITE_ONCE(), be +reduced to: *A = Y; Z = Y; -- cgit v1.1 From 96d7744e0a5631a1b5fef2a97658150b165f02b6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Jul 2015 15:55:52 -0700 Subject: doc: Call out smp_mb__after_unlock_lock() transitivity Although "full barrier" should be interpreted as providing transitivity, it is worth eliminating any possible confusion. This commit therefore adds "(including transitivity)" to eliminate any possible confusion. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 470c07c..3185238 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1858,11 +1858,12 @@ Similarly, the reverse case of a RELEASE followed by an ACQUIRE does not imply a full memory barrier. If it is necessary for a RELEASE-ACQUIRE pair to produce a full barrier, the ACQUIRE can be followed by an smp_mb__after_unlock_lock() invocation. This will produce a full barrier -if either (a) the RELEASE and the ACQUIRE are executed by the same -CPU or task, or (b) the RELEASE and ACQUIRE act on the same variable. -The smp_mb__after_unlock_lock() primitive is free on many architectures. -Without smp_mb__after_unlock_lock(), the CPU's execution of the critical -sections corresponding to the RELEASE and the ACQUIRE can cross, so that: +(including transitivity) if either (a) the RELEASE and the ACQUIRE are +executed by the same CPU or task, or (b) the RELEASE and ACQUIRE act on +the same variable. The smp_mb__after_unlock_lock() primitive is free +on many architectures. Without smp_mb__after_unlock_lock(), the CPU's +execution of the critical sections corresponding to the RELEASE and the +ACQUIRE can cross, so that: *A = a; RELEASE M -- cgit v1.1 From d5671f6bf2a672cfa72ef2cbac5cc53a4539690d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 26 May 2015 17:48:34 +0200 Subject: rcu: Deinline rcu_read_lock_sched_held() if DEBUG_LOCK_ALLOC DEBUG_LOCK_ALLOC=y is not a production setting, but it is not very unusual either. Many developers routinely use kernels built with it enabled. Apart from being selected by hand, it is also auto-selected by PROVE_LOCKING "Lock debugging: prove locking correctness" and LOCK_STAT "Lock usage statistics" config options. LOCK STAT is necessary for "perf lock" to work. I wouldn't spend too much time optimizing it, but this particular function has a very large cost in code size: when it is deinlined, code size decreases by 830,000 bytes: text data bss dec hex filename 85674192 22294776 20627456 128596424 7aa39c8 vmlinux.before 84837612 22294424 20627456 127759492 79d7484 vmlinux (with this config: http://busybox.net/~vda/kernel_config) Signed-off-by: Denys Vlasenko CC: "Paul E. McKenney" CC: Josh Triplett CC: Mathieu Desnoyers CC: Lai Jiangshan CC: Tejun Heo CC: Oleg Nesterov CC: linux-kernel@vger.kernel.org Reviewed-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 40 ++------------------------------------- kernel/rcu/update.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 38 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 237f7b8..def6d45 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -469,46 +469,10 @@ int rcu_read_lock_bh_held(void); * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side - * critical section unless it can prove otherwise. Note that disabling - * of preemption (including disabling irqs) counts as an RCU-sched - * read-side critical section. This is useful for debug checks in functions - * that required that they be called within an RCU-sched read-side - * critical section. - * - * Check debug_lockdep_rcu_enabled() to prevent false positives during boot - * and while lockdep is disabled. - * - * Note that if the CPU is in the idle loop from an RCU point of - * view (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU - * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs - * that are in such a section, considering these as in extended quiescent - * state, so such a CPU is effectively never in an RCU read-side critical - * section regardless of what RCU primitives it invokes. This state of - * affairs is required --- we need to keep an RCU-free window in idle - * where the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run in - * the idle task. - * - * Similarly, we avoid claiming an SRCU read lock held if the current - * CPU is offline. + * critical section unless it can prove otherwise. */ #ifdef CONFIG_PREEMPT_COUNT -static inline int rcu_read_lock_sched_held(void) -{ - int lockdep_opinion = 0; - - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; - if (debug_locks) - lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); -} +int rcu_read_lock_sched_held(void); #else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index afaecb7..fec5f48 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); +#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +/** + * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * RCU-sched read-side critical section. In absence of + * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side + * critical section unless it can prove otherwise. Note that disabling + * of preemption (including disabling irqs) counts as an RCU-sched + * read-side critical section. This is useful for debug checks in functions + * that required that they be called within an RCU-sched read-side + * critical section. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. This state of + * affairs is required --- we need to keep an RCU-free window in idle + * where the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run in + * the idle task. + * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. + */ +int rcu_read_lock_sched_held(void) +{ + int lockdep_opinion = 0; + + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + if (debug_locks) + lockdep_opinion = lock_is_held(&rcu_sched_lock_map); + return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); +} +EXPORT_SYMBOL(rcu_read_lock_sched_held); +#endif + #ifndef CONFIG_TINY_RCU static atomic_t rcu_expedited_nesting = -- cgit v1.1 From f765d1130700878c2275bc1ea09eed428f870a2a Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Wed, 27 May 2015 08:56:25 +0200 Subject: rcu: Change return type to bool Type-checking coccinelle spatches are being used to locate type mismatches between function signatures and return values in this case this produced: ./kernel/rcu/srcu.c:271 WARNING: return of wrong type int != unsigned long, srcu_readers_active() returns an int that is the sum of per_cpu unsigned long but the only user is cleanup_srcu_struct() which is using it as a boolean (condition) to see if there is any readers rather than actually using the approximate number of readers. The theoretically possible unsigned long overflow case does not need to be handled explicitly - if we had 4G++ readers then something else went wrong a long time ago. proposal: change the return type to boolean. The function name is left unchanged as it fits the naming expectation for a boolean. patch was compile tested for x86_64_defconfig (implies CONFIG_SRCU=y) patch is against 4.1-rc5 (localversion-next is -next-20150525) Signed-off-by: Nicholas Mc Guire Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index fb33d35..de35087 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) } /** - * srcu_readers_active - returns approximate number of readers. + * srcu_readers_active - returns true if there are readers. and false + * otherwise * @sp: which srcu_struct to count active readers (holding srcu_read_lock). * * Note that this is not an atomic primitive, and can therefore suffer * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -static int srcu_readers_active(struct srcu_struct *sp) +static bool srcu_readers_active(struct srcu_struct *sp) { int cpu; unsigned long sum = 0; -- cgit v1.1 From 319362c90f7af92d03714759bea5d2ba7c88e388 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 May 2015 14:16:52 -0700 Subject: rcu: Provide more diagnostics for stalled GP kthread Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++-- kernel/rcu/tree.h | 6 +++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65137bc..1c58cbd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1178,9 +1178,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", rsp->name, j - gpa, - rsp->gpnum, rsp->completed, rsp->gp_flags); + rsp->gpnum, rsp->completed, + rsp->gp_flags, rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : 0); } /* @@ -2041,6 +2043,7 @@ static int __noreturn rcu_gp_kthread(void *arg) wait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); + rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; @@ -2073,6 +2076,7 @@ static int __noreturn rcu_gp_kthread(void *arg) (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); + rsp->gp_state = RCU_GP_DONE_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && @@ -2110,7 +2114,9 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle grace-period end. */ + rsp->gp_state = RCU_GP_CLEANUP; rcu_gp_cleanup(rsp); + rsp->gp_state = RCU_GP_CLEANED; } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4adb7ca..f1f4784 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -527,7 +527,11 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_WAIT_INIT 0 /* Initial state. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ -#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ +#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DONE_FQS 4 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ extern struct list_head rcu_struct_flavors; -- cgit v1.1 From 75cf15a4c0dd57f5d230bd30c2d41bd8e06ae5a9 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:23 +0200 Subject: rcu: Panic if RCU tree can not accommodate all CPUs Currently a condition when RCU tree is unable to accommodate the configured number of CPUs is not permitted and causes a fall back to compile-time values. However, the code has no means to exceed the RCU tree capacity neither at compile-time nor in run-time. Therefore, if the condition is met in run- time then it indicates a serios problem elsewhere and should be handled with a panic. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1c58cbd..fe8d929 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4107,6 +4107,19 @@ static void __init rcu_init_geometry(void) rcu_fanout_leaf, nr_cpu_ids); /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Complain and fall back to the compile- + * time values if these limits are exceeded. + */ + if (rcu_fanout_leaf < RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8) { + WARN_ON(1); + return; + } + + /* * Compute number of nodes that can be handled an rcu_node tree * with the given number of levels. Setting rcu_capacity[0] makes * some of the arithmetic easier. @@ -4117,19 +4130,11 @@ static void __init rcu_init_geometry(void) rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* - * The boot-time rcu_fanout_leaf parameter is only permitted - * to increase the leaf-level fanout, not decrease it. Of course, - * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Finally, the tree must be able to accommodate - * the configured number of CPUs. Complain and fall back to the - * compile-time values if these limits are exceeded. + * The tree must be able to accommodate the configured number of CPUs. + * If this limit is exceeded than we have a serious problem elsewhere. */ - if (rcu_fanout_leaf < RCU_FANOUT_LEAF || - rcu_fanout_leaf > sizeof(unsigned long) * 8 || - n > rcu_capacity[MAX_RCU_LVLS]) { - WARN_ON(1); - return; - } + if (n > rcu_capacity[MAX_RCU_LVLS]) + panic("rcu_init_geometry: rcu_capacity[] is too small"); /* Calculate the number of rcu_nodes at each level of the tree. */ for (i = 1; i <= MAX_RCU_LVLS; i++) -- cgit v1.1 From 372b0ec24f6b516174934d68fd86d2056f1a5bba Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:24 +0200 Subject: rcu: Remove superfluous local variable in rcu_init_geometry() Local variable 'n' mimics 'nr_cpu_ids' while the both are used within one function. There is no reason for 'n' to exist whatsoever. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fe8d929..ad49dbe 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4083,7 +4083,6 @@ static void __init rcu_init_geometry(void) ulong d; int i; int j; - int n = nr_cpu_ids; int rcu_capacity[MAX_RCU_LVLS + 1]; /* @@ -4133,15 +4132,16 @@ static void __init rcu_init_geometry(void) * The tree must be able to accommodate the configured number of CPUs. * If this limit is exceeded than we have a serious problem elsewhere. */ - if (n > rcu_capacity[MAX_RCU_LVLS]) + if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS]) panic("rcu_init_geometry: rcu_capacity[] is too small"); /* Calculate the number of rcu_nodes at each level of the tree. */ for (i = 1; i <= MAX_RCU_LVLS; i++) - if (n <= rcu_capacity[i]) { - for (j = 0; j <= i; j++) - num_rcu_lvl[j] = - DIV_ROUND_UP(n, rcu_capacity[i - j]); + if (nr_cpu_ids <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) { + int cap = rcu_capacity[i - j]; + num_rcu_lvl[j] = DIV_ROUND_UP(nr_cpu_ids, cap); + } rcu_num_lvls = i; for (j = i + 1; j <= MAX_RCU_LVLS; j++) num_rcu_lvl[j] = 0; @@ -4152,7 +4152,7 @@ static void __init rcu_init_geometry(void) rcu_num_nodes = 0; for (i = 0; i <= MAX_RCU_LVLS; i++) rcu_num_nodes += num_rcu_lvl[i]; - rcu_num_nodes -= n; + rcu_num_nodes -= nr_cpu_ids; } /* -- cgit v1.1 From 679f9858b1769d740d933f5f1ad9dbe3292f26d2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:25 +0200 Subject: rcu: Cleanup rcu_init_geometry() code and arithmetics This update simplifies rcu_init_geometry() code flow and makes calculation of the total number of rcu_node structures more easy to read. The update relies on the fact num_rcu_lvl[] is never accessed beyond rcu_num_lvls index by the rest of the code. Therefore, there is no need initialize the whole num_rcu_lvl[]. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ad49dbe..37ca8a8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4082,7 +4082,6 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int j; int rcu_capacity[MAX_RCU_LVLS + 1]; /* @@ -4135,24 +4134,21 @@ static void __init rcu_init_geometry(void) if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS]) panic("rcu_init_geometry: rcu_capacity[] is too small"); + /* Calculate the number of levels in the tree. */ + for (i = 1; nr_cpu_ids > rcu_capacity[i]; i++) { + } + rcu_num_lvls = i; + /* Calculate the number of rcu_nodes at each level of the tree. */ - for (i = 1; i <= MAX_RCU_LVLS; i++) - if (nr_cpu_ids <= rcu_capacity[i]) { - for (j = 0; j <= i; j++) { - int cap = rcu_capacity[i - j]; - num_rcu_lvl[j] = DIV_ROUND_UP(nr_cpu_ids, cap); - } - rcu_num_lvls = i; - for (j = i + 1; j <= MAX_RCU_LVLS; j++) - num_rcu_lvl[j] = 0; - break; - } + for (i = 0; i < rcu_num_lvls; i++) { + int cap = rcu_capacity[rcu_num_lvls - i]; + num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); + } /* Calculate the total number of rcu_node structures. */ rcu_num_nodes = 0; - for (i = 0; i <= MAX_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) rcu_num_nodes += num_rcu_lvl[i]; - rcu_num_nodes -= nr_cpu_ids; } /* -- cgit v1.1 From 9618138b09260e3df5af5a0d7bdc8fca010f6a3f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:26 +0200 Subject: rcu: Simplify rcu_init_geometry() capacity arithmetics Current code suggests that introducing the extra level to rcu_capacity[] array makes some of the arithmetic easier. Well, in fact it appears rather confusing and unnecessary. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37ca8a8..2103bee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4082,7 +4082,7 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int rcu_capacity[MAX_RCU_LVLS + 1]; + int rcu_capacity[MAX_RCU_LVLS]; /* * Initialize any unspecified boot parameters. @@ -4119,29 +4119,27 @@ static void __init rcu_init_geometry(void) /* * Compute number of nodes that can be handled an rcu_node tree - * with the given number of levels. Setting rcu_capacity[0] makes - * some of the arithmetic easier. + * with the given number of levels. */ - rcu_capacity[0] = 1; - rcu_capacity[1] = rcu_fanout_leaf; - for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[0] = rcu_fanout_leaf; + for (i = 1; i < MAX_RCU_LVLS; i++) rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* * The tree must be able to accommodate the configured number of CPUs. * If this limit is exceeded than we have a serious problem elsewhere. */ - if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS]) + if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS - 1]) panic("rcu_init_geometry: rcu_capacity[] is too small"); /* Calculate the number of levels in the tree. */ - for (i = 1; nr_cpu_ids > rcu_capacity[i]; i++) { + for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { } - rcu_num_lvls = i; + rcu_num_lvls = i + 1; /* Calculate the number of rcu_nodes at each level of the tree. */ for (i = 0; i < rcu_num_lvls; i++) { - int cap = rcu_capacity[rcu_num_lvls - i]; + int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); } -- cgit v1.1 From a6d77081e266605c9f4d8c11e0ee00468b9dc614 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:27 +0200 Subject: rcu: Limit rcu_state::levelcnt[] to RCU_NUM_LVLS items Variable rcu_num_lvls is limited by RCU_NUM_LVLS macro. In turn, rcu_state::levelcnt[] array is never accessed beyond rcu_num_lvls. Thus, rcu_state::levelcnt[] is safe to limit to RCU_NUM_LVLS items. Since rcu_num_lvls could be changed during boot (as result of rcutree.rcu_fanout_leaf kernel parameter update) one might assume a new value could overflow the value of RCU_NUM_LVLS. However, that is not the case, since leaf-level fanout is only permitted to increase, resulting in rcu_num_lvls possibly to decrease. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f1f4784..a6faae5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -443,7 +443,7 @@ do { \ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u32 levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ -- cgit v1.1 From 05b84aec465c34da242a224d2438d192ca0feec7 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:28 +0200 Subject: rcu: Limit rcu_capacity[] size to RCU_NUM_LVLS items Number of items in rcu_capacity[] array is defined by macro MAX_RCU_LVLS. However, that array is never accessed beyond RCU_NUM_LVLS index. Therefore, we can limit the array to RCU_NUM_LVLS items and eliminate MAX_RCU_LVLS. As result, in most cases the memory is conserved. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++------ kernel/rcu/tree.h | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2103bee..2ec7b79 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3998,19 +3998,19 @@ static void __init rcu_init_one(struct rcu_state *rsp, "rcu_node_0", "rcu_node_1", "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + "rcu_node_3" }; static const char * const fqs[] = { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + "rcu_node_fqs_3" }; static u8 fl_mask = 0x1; int cpustride = 1; int i; int j; struct rcu_node *rnp; - BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ /* Silence gcc 4.8 false positive about array index out of range. */ if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) @@ -4082,7 +4082,7 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int rcu_capacity[MAX_RCU_LVLS]; + int rcu_capacity[RCU_NUM_LVLS]; /* * Initialize any unspecified boot parameters. @@ -4122,14 +4122,14 @@ static void __init rcu_init_geometry(void) * with the given number of levels. */ rcu_capacity[0] = rcu_fanout_leaf; - for (i = 1; i < MAX_RCU_LVLS; i++) + for (i = 1; i < RCU_NUM_LVLS; i++) rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; /* * The tree must be able to accommodate the configured number of CPUs. * If this limit is exceeded than we have a serious problem elsewhere. */ - if (nr_cpu_ids > rcu_capacity[MAX_RCU_LVLS - 1]) + if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) panic("rcu_init_geometry: rcu_capacity[] is too small"); /* Calculate the number of levels in the tree. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a6faae5..d625e9f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -36,8 +36,6 @@ * Of course, your mileage may vary. */ -#define MAX_RCU_LVLS 4 - #ifdef CONFIG_RCU_FANOUT #define RCU_FANOUT CONFIG_RCU_FANOUT #else /* #ifdef CONFIG_RCU_FANOUT */ -- cgit v1.1 From 199977bff9efceec649d74510fa9754e107ce0c5 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:29 +0200 Subject: rcu: Remove unnecessary fields from rcu_state structure Members rcu_state::levelcnt[] and rcu_state::levelspread[] are only used at init. There is no reason to keep them afterwards. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 27 +++++++++++++++------------ kernel/rcu/tree.h | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2ec7b79..7226e25 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3967,22 +3967,22 @@ void rcu_scheduler_starting(void) * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on the rcu_fanout_exact boot parameter. */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) +static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) { int i; if (rcu_fanout_exact) { - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = RCU_FANOUT; + levelspread[i] = RCU_FANOUT; } else { int ccur; int cprv; cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; } } @@ -4005,6 +4005,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, "rcu_node_fqs_2", "rcu_node_fqs_3" }; static u8 fl_mask = 0x1; + + int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ + int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; int j; @@ -4019,19 +4022,19 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) - rsp->levelcnt[i] = num_rcu_lvl[i]; + levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; - rcu_init_levelspread(rsp); + rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; + rcu_init_levelspread(levelspread, levelcnt); rsp->flavor_mask = fl_mask; fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { - cpustride *= rsp->levelspread[i]; + cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + for (j = 0; j < levelcnt[i]; j++, rnp++) { raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); @@ -4051,10 +4054,10 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->grpmask = 0; rnp->parent = NULL; } else { - rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpnum = j % levelspread[i - 1]; rnp->grpmask = 1UL << rnp->grpnum; rnp->parent = rsp->level[i - 1] + - j / rsp->levelspread[i - 1]; + j / levelspread[i - 1]; } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d625e9f..3413f3c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -441,8 +441,6 @@ do { \ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ - u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ -- cgit v1.1 From cb007102398edd06ffc4488bf841c2e10f14d2e7 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:30 +0200 Subject: rcu: Limit count of static data to the number of RCU levels Although a number of RCU levels may be less than the current maximum of four, some static data associated with each level are allocated for all four levels. As result, the extra data never get accessed and just wast memory. This update limits count of allocated items to the number of used RCU levels. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 ++++----------------- kernel/rcu/tree.h | 12 ++++++++++++ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7226e25..e53bbc5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -124,13 +124,8 @@ module_param(rcu_fanout_exact, bool, 0444); static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; -static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ - NUM_RCU_LVL_0, - NUM_RCU_LVL_1, - NUM_RCU_LVL_2, - NUM_RCU_LVL_3, - NUM_RCU_LVL_4, -}; +/* Number of rcu_nodes at specified level. */ +static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* @@ -3994,16 +3989,8 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static const char * const buf[] = { - "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; - static const char * const fqs[] = { - "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; + static const char * const buf[] = RCU_NODE_NAME_INIT; + static const char * const fqs[] = RCU_FQS_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3413f3c..d44856b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -68,6 +68,9 @@ # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -75,6 +78,9 @@ # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -82,6 +88,9 @@ # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -89,6 +98,9 @@ # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_4 (NR_CPUS) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -- cgit v1.1 From 426216970e0458c1f507860f4837cbde66a72263 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 3 Jun 2015 08:18:31 +0200 Subject: rcu: Simplify arithmetic to calculate number of RCU nodes This update makes arithmetic to calculate number of RCU nodes more straight and easy to read. Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 17 ++++------------- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d44856b..581f8d3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -64,10 +64,7 @@ #if NR_CPUS <= RCU_FANOUT_1 # define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (NR_CPUS) -# define NUM_RCU_LVL_2 0 -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES NUM_RCU_LVL_0 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } @@ -75,9 +72,7 @@ # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_2 (NR_CPUS) -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } @@ -86,8 +81,7 @@ # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_3 (NR_CPUS) -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } @@ -97,7 +91,7 @@ # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_4 (NR_CPUS) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } @@ -105,9 +99,6 @@ # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) -#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) - extern int rcu_num_lvls; extern int rcu_num_nodes; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 013485f..5dac0a1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -84,8 +84,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU torture testing starts during boot.\n"); if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) pr_info("\tAdditional per-CPU info printed with stalls.\n"); - if (NUM_RCU_LVL_4 != 0) - pr_info("\tFour-level hierarchy is enabled.\n"); + if (RCU_NUM_LVLS >= 4) + pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", RCU_FANOUT_LEAF); -- cgit v1.1 From d9eba768839ac24e47606af36e50c14f10c2211c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 May 2015 15:35:43 -0700 Subject: rcutorture: Better bounds checking for n_barrier_cbs A negative value for rcutorture.n_barrier_cbs can pass a negative value to the memory allocator, so this commit instead causes rcu_barrier() testing to be disabled in this case. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59e3268..7e29a32 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1507,7 +1507,7 @@ static int rcu_torture_barrier_init(void) int i; int ret; - if (n_barrier_cbs == 0) + if (n_barrier_cbs <= 0) return 0; if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { pr_alert("%s" TORTURE_FLAG -- cgit v1.1 From 4444d852a99b8f0310f369da8473ec3639e380a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 May 2015 15:42:40 -0700 Subject: rcutorture: Check nfakewriters parameter Currently, a negative value for rcutorture.nfakewriters= can cause rcutorture to pass a negative size to the memory allocator, which is not really a particularly good thing to do. This commit therefore adds bounds checking to this parameter, so that values that are less than or equal to zero disable fake writing. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e29a32..2cbe569 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1786,12 +1786,15 @@ rcu_torture_init(void) writer_task); if (firsterr) goto unwind; - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nfakewriters > 0) { + fakewriter_tasks = kzalloc(nfakewriters * + sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } for (i = 0; i < nfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, -- cgit v1.1 From e8e255f7191fb6491dd1d96cfbbe19981f6eb3dd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 May 2015 16:55:45 -0700 Subject: rcutorture: Bounds-check rcutorture.shuffle_interval Specifying a negative rcutorture.shuffle_interval value will cause a negative value to be used as a sleep time. This commit therefore refuses to start shuffling unless the rcutorture.shuffle_interval value is greater than zero. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2cbe569..1cead78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1821,7 +1821,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - if (test_no_idle_hz) { + if (test_no_idle_hz && shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval * HZ); if (firsterr) goto unwind; -- cgit v1.1 From 3a0af333415830d2a0ca77de832336af5aadced4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Jun 2015 18:11:31 -0700 Subject: rcutorture: Fix rcu_torture_cbflood() for callback-free RCU The rcu_torture_cbflood() function correctly checks for flavors of RCU that lack analogs to call_rcu() and rcu_barrier(), but in that case it fails to terminate correctly. In fact, it terminates so incorrectly that segfaults can result. This commit therefore causes rcu_torture_cbflood() to do the proper wait-for-stop procedure. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1cead78..e0eda3c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -823,9 +823,7 @@ rcu_torture_cbflood(void *arg) } if (err) { VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - while (!torture_must_stop()) - schedule_timeout_interruptible(HZ); - return 0; + goto wait_for_stop; } VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); do { @@ -844,6 +842,7 @@ rcu_torture_cbflood(void *arg) stutter_wait("rcu_torture_cbflood"); } while (!torture_must_stop()); vfree(rhp); +wait_for_stop: torture_kthread_stopping("rcu_torture_cbflood"); return 0; } -- cgit v1.1 From 5be5d1a11775fadc6104789fad72fae46dff348e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 08:57:57 -0700 Subject: rcutorture: Add RCU-tasks qualifier to dereference Although RCU-tasks isn't really designed to support rcu_dereference() and list manipulation, that is how rcutorture tests it. Which means that lockdep-RCU complains about the rcu_dereference_check() invocations because RCU-tasks doesn't have read-side markers. This commit therefore creates a torturing_tasks() to silence the lockdep-RCU complaints from rcu_dereference_check() when RCU-tasks is being tortured. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e0eda3c..67b3f26 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -684,10 +684,20 @@ static struct rcu_torture_ops tasks_ops = { #define RCUTORTURE_TASKS_OPS &tasks_ops, +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + #else /* #ifdef CONFIG_TASKS_RCU */ #define RCUTORTURE_TASKS_OPS +static bool torturing_tasks(void) +{ + return false; +} + #endif /* #else #ifdef CONFIG_TASKS_RCU */ /* @@ -1087,7 +1097,8 @@ static void rcu_torture_timer(unsigned long unused) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -1161,7 +1172,8 @@ rcu_torture_reader(void *arg) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); -- cgit v1.1 From d6a8c6d34222f7d9de4c819fbcfbaebb1e40e8a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 09:14:01 -0700 Subject: rcutorture: Enable lockdep-RCU on TASKS01 Currently none of the RCU-tasks scenarios enables lockdep-RCU, which causes bugs to be missed. This commit therefore enables lockdep-RCU on TASKS01. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TASKS01 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 index 2cc0e60..bafe94c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 @@ -5,6 +5,6 @@ CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y CONFIG_DEBUG_LOCK_ALLOC=y -CONFIG_PROVE_LOCKING=n -#CHECK#CONFIG_PROVE_RCU=n +CONFIG_PROVE_LOCKING=y +#CHECK#CONFIG_PROVE_RCU=y CONFIG_RCU_EXPERT=y -- cgit v1.1 From 032dfc87225c96ec1771e5967436c4b23d1dc5d6 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 9 Jul 2015 15:34:23 +0200 Subject: rcu: Shut up bogus gcc array bounds warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because gcc does not realize a loop would not be entered ever (i.e. in case of rcu_num_lvls == 1): for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; some compiler (pre- 5.x?) versions give a bogus warning: kernel/rcu/tree.c: In function ‘rcu_init_one.isra.55’: kernel/rcu/tree.c:4108:13: warning: array subscript is above array bounds [-Warray-bounds] rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; ^ Fix that warning by adding an extra item to rcu_state::level[] array. Once the bogus warning is fixed in gcc and kernel drops support of older versions, the dummy item may be removed from the array. Cc: "Paul E. McKenney" Suggested-by: "Paul E. McKenney" Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 581f8d3..faee524 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -443,7 +443,9 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ + struct rcu_node *level[RCU_NUM_LVLS + 1]; + /* Hierarchy levels (+1 to */ + /* shut bogus gcc warning) */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ -- cgit v1.1 From 13bd64947f53ba8d7199922be94b6626b8e222d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Jun 2015 10:06:01 -0700 Subject: rcu: Reset rcu_fanout_leaf if out of bounds Currently if the rcu_fanout_leaf boot parameter is out of bounds (that is, less than RCU_FANOUT_LEAF or greater than the number of bits in an unsigned long), a warning is issued and execution continues with the out-of-bounds value. This can result in all manner of failures, so this patch resets rcu_fanout_leaf to RCU_FANOUT_LEAF when an out-of-bounds condition is detected. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e53bbc5..a2147d7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4103,6 +4103,7 @@ static void __init rcu_init_geometry(void) */ if (rcu_fanout_leaf < RCU_FANOUT_LEAF || rcu_fanout_leaf > sizeof(unsigned long) * 8) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; } -- cgit v1.1 From 9b683874504a57cfa97558d403c75e286e20c9ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Jun 2015 14:50:22 -0700 Subject: rcu: Stop disabling CPU hotplug in synchronize_rcu_expedited() The fact that tasks could be migrated from leaf to root rcu_node structures meant that synchronize_rcu_expedited() had to disable CPU hotplug. However, tasks now stay put, so this commit removes the CPU-hotplug disabling from synchronize_rcu_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5dac0a1..7234f03 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -728,42 +728,23 @@ void synchronize_rcu_expedited(void) smp_mb(); /* Above access cannot bleed into critical section. */ /* - * Block CPU-hotplug operations. This means that any CPU-hotplug - * operation that finds an rcu_node structure with tasks in the - * process of being boosted will know that all tasks blocking - * this expedited grace period will already be in the process of - * being boosted. This simplifies the process of moving tasks - * from leaf to root rcu_node structures. - */ - if (!try_get_online_cpus()) { - /* CPU-hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu); - return; - } - - /* * Acquire lock, falling back to synchronize_rcu() if too many * lock-acquisition failures. Of course, if someone does the * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { if (ULONG_CMP_LT(snap, - READ_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); + READ_ONCE(sync_rcu_preempt_exp_count))) goto mb_ret; /* Others did our work for us. */ - } if (trycount++ < 10) { udelay(trycount * num_online_cpus()); } else { - put_online_cpus(); wait_rcu_gp(call_rcu); return; } } - if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); + if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) goto unlock_mb_ret; /* Others did our work for us. */ - } /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); @@ -779,8 +760,6 @@ void synchronize_rcu_expedited(void) rcu_for_each_leaf_node(rsp, rnp) sync_rcu_preempt_exp_init2(rsp, rnp); - put_online_cpus(); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); wait_event(sync_rcu_preempt_exp_wq, -- cgit v1.1 From 75c27f119b6475d95374bdad872c6938b5c26196 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Jun 2015 15:22:43 -0700 Subject: rcu: Remove CONFIG_RCU_CPU_STALL_INFO The CONFIG_RCU_CPU_STALL_INFO has been default-y for a couple of releases with no complaints, so it is time to eliminate this Kconfig option entirely, so that the long-form RCU CPU stall warnings cannot be disabled. This commit does just that. Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 12 +----- kernel/rcu/tree.h | 4 -- kernel/rcu/tree_plugin.h | 45 ---------------------- lib/Kconfig.debug | 14 ------- .../selftests/rcutorture/configs/rcu/TREE01 | 1 - .../selftests/rcutorture/configs/rcu/TREE02 | 1 - .../selftests/rcutorture/configs/rcu/TREE02-T | 1 - .../selftests/rcutorture/configs/rcu/TREE03 | 1 - .../selftests/rcutorture/configs/rcu/TREE04 | 1 - .../selftests/rcutorture/configs/rcu/TREE05 | 1 - .../selftests/rcutorture/configs/rcu/TREE06 | 1 - .../selftests/rcutorture/configs/rcu/TREE07 | 1 - .../selftests/rcutorture/configs/rcu/TREE08 | 1 - .../selftests/rcutorture/configs/rcu/TREE08-T | 1 - .../selftests/rcutorture/configs/rcu/TREE09 | 1 - .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 1 - 16 files changed, 2 insertions(+), 85 deletions(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index b57c0c1..046f326 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -26,12 +26,6 @@ CONFIG_RCU_CPU_STALL_TIMEOUT Stall-warning messages may be enabled and disabled completely via /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress. -CONFIG_RCU_CPU_STALL_INFO - - This kernel configuration parameter causes the stall warning to - print out additional per-CPU diagnostic information, including - information on scheduling-clock ticks and RCU's idle-CPU tracking. - RCU_STALL_DELAY_DELTA Although the lockdep facility is extremely useful, it does add @@ -101,15 +95,13 @@ interact. Please note that it is not possible to entirely eliminate this sort of false positive without resorting to things like stop_machine(), which is overkill for this sort of problem. -If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set, -more information is printed with the stall-warning message, for example: +Recent kernels will print a long form of the stall-warning message: INFO: rcu_preempt detected stall on CPU 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543 (t=65000 jiffies) -In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is -printed: +In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed: INFO: rcu_preempt detected stall on CPU 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index faee524..7c0b09d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -288,12 +288,10 @@ struct rcu_data { bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned long ticks_this_gp; /* The number of scheduling-clock */ /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ /* 2) batch handling */ /* @@ -388,9 +386,7 @@ struct rcu_data { #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned int softirq_snap; /* Snapshot of softirq activity. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ int cpu; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7234f03..ef41c1b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -82,8 +82,6 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) pr_info("\tRCU torture testing starts during boot.\n"); - if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -418,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#ifdef CONFIG_RCU_CPU_STALL_INFO - static void rcu_print_task_stall_begin(struct rcu_node *rnp) { pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", @@ -431,18 +427,6 @@ static void rcu_print_task_stall_end(void) pr_cont("\n"); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ -} - -static void rcu_print_task_stall_end(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -1685,8 +1669,6 @@ early_initcall(rcu_register_oom_notifier); #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_CPU_STALL_INFO - #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) @@ -1775,33 +1757,6 @@ static void increment_cpu_stall_ticks(void) raw_cpu_inc(rsp->rda->ticks_this_gp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void print_cpu_stall_info_begin(void) -{ - pr_cont(" {"); -} - -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) -{ - pr_cont(" %d", cpu); -} - -static void print_cpu_stall_info_end(void) -{ - pr_cont("} "); -} - -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ -} - -static void increment_cpu_stall_ticks(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - #ifdef CONFIG_RCU_NOCB_CPU /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e2894b2..8a34205 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1353,20 +1353,6 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. -config RCU_CPU_STALL_INFO - bool "Print additional diagnostics on RCU CPU stall" - depends on (TREE_RCU || PREEMPT_RCU) && DEBUG_KERNEL - default y - help - For each stalled CPU that is aware of the current RCU grace - period, print out additional per-CPU diagnostic information - regarding scheduling-clock ticks, idle state, and, - for RCU_FAST_NO_HZ kernels, idle-entry state. - - Say N if you are unsure. - - Say Y if you want to enable such diagnostics. - config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 index 8e9137f..f572b87 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 @@ -13,7 +13,6 @@ CONFIG_MAXSMP=y CONFIG_RCU_NOCB_CPU=y CONFIG_RCU_NOCB_CPU_ZERO=y CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 index aeea6a2..ef6a22c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 @@ -17,7 +17,6 @@ CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=n -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T index 2ac9e68..917d251 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T @@ -17,6 +17,5 @@ CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=n -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 index 72aa7d8..7a17c50 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 @@ -13,7 +13,6 @@ CONFIG_RCU_FANOUT=2 CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_BOOST=y CONFIG_RCU_KTHREAD_PRIO=2 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 3f51127..39a2c6d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -17,6 +17,5 @@ CONFIG_RCU_FANOUT=4 CONFIG_RCU_FANOUT_LEAF=4 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 index c04dfea..1257d32 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 @@ -17,6 +17,5 @@ CONFIG_RCU_NOCB_CPU_NONE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 index f51d2c7..d3e456b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 @@ -18,6 +18,5 @@ CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index f422af4..3956b41 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -17,6 +17,5 @@ CONFIG_RCU_FANOUT=2 CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 index a24d2ca..bb9b0c1 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 @@ -19,7 +19,6 @@ CONFIG_RCU_NOCB_CPU_ALL=y CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T index b2b8cea..2ad13f0 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T @@ -17,6 +17,5 @@ CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=y CONFIG_RCU_NOCB_CPU_ALL=y CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index aa4ed08..6710e74 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -13,7 +13,6 @@ CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_CPU_STALL_INFO=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n #CHECK#CONFIG_RCU_EXPERT=n diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index b24c000..657f3a0 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -16,7 +16,6 @@ CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not. CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING. CONFIG_RCU_BOOST -- one of PREEMPT_RCU. CONFIG_RCU_KTHREAD_PRIO -- set to 2 for _BOOST testing. -CONFIG_RCU_CPU_STALL_INFO -- Now default, avoid at least twice. CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others. CONFIG_RCU_FANOUT_LEAF -- Do one non-default. CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL. -- cgit v1.1 From c190c3b16c0f56ff338df12df53c03859155951b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Jun 2015 19:03:45 -0700 Subject: rcu: Switch synchronize_sched_expedited() to stop_one_cpu() The synchronize_sched_expedited() currently invokes try_stop_cpus(), which schedules the stopper kthreads on each online non-idle CPU, and waits until all those kthreads are running before letting any of them stop. This is disastrous for real-time workloads, which get hit with a preemption that is as long as the longest scheduling latency on any CPU, including any non-realtime housekeeping CPUs. This commit therefore switches to using stop_one_cpu() on each CPU in turn. This avoids inflicting the worst-case scheduling latency on the worst-case CPU onto all other CPUs, and also simplifies the code a little bit. Follow-up commits will simplify the counter-snapshotting algorithm and convert a number of the counters that are now protected by the new ->expedited_mutex to non-atomic. Signed-off-by: Peter Zijlstra [ paulmck: Kept stop_one_cpu(), dropped disabling of "guardrails". ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 41 ++++++++++++++--------------------------- kernel/rcu/tree.h | 1 + 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a2147d7..ae39a49 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ } @@ -3305,8 +3306,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { - cpumask_var_t cm; - bool cma = false; int cpu; long firstsnap, s, snap; int trycount = 0; @@ -3342,28 +3341,11 @@ void synchronize_sched_expedited(void) } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ - cma = zalloc_cpumask_var(&cm, GFP_KERNEL); - if (cma) { - cpumask_copy(cm, cpu_online_mask); - cpumask_clear_cpu(raw_smp_processor_id(), cm); - for_each_cpu(cpu, cm) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - cpumask_clear_cpu(cpu, cm); - } - if (cpumask_weight(cm) == 0) - goto all_cpus_idle; - } - /* * Each pass through the following loop attempts to force a * context switch on each CPU. */ - while (try_stop_cpus(cma ? cm : cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { + while (!mutex_trylock(&rsp->expedited_mutex)) { put_online_cpus(); atomic_long_inc(&rsp->expedited_tryfail); @@ -3373,7 +3355,6 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); - free_cpumask_var(cm); return; } @@ -3383,7 +3364,6 @@ void synchronize_sched_expedited(void) } else { wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); return; } @@ -3393,7 +3373,6 @@ void synchronize_sched_expedited(void) /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); - free_cpumask_var(cm); return; } @@ -3408,16 +3387,23 @@ void synchronize_sched_expedited(void) /* CPU hotplug operation in flight, use normal GP. */ wait_rcu_gp(call_rcu_sched); atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); return; } snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } - atomic_long_inc(&rsp->expedited_stoppedcpus); -all_cpus_idle: - free_cpumask_var(cm); + /* Stop each CPU that is online, non-idle, and not us. */ + for_each_online_cpu(cpu) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Skip our CPU and any idle CPUs. */ + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + continue; + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL); + } + atomic_long_inc(&rsp->expedited_stoppedcpus); /* * Everyone up to our most recent fetch is covered by our grace @@ -3436,6 +3422,7 @@ all_cpus_idle: } } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); atomic_long_inc(&rsp->expedited_done_exit); + mutex_unlock(&rsp->expedited_mutex); put_online_cpus(); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7c0b09d..7c25fe4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -480,6 +480,7 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex expedited_mutex; /* Serializes expediting. */ atomic_long_t expedited_start; /* Starting ticket. */ atomic_long_t expedited_done; /* Done ticket. */ atomic_long_t expedited_wrap; /* # near-wrap incidents. */ -- cgit v1.1 From d6ada2cf2f81dab8a231d0ef8fb5dec4f5ac8379 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Jun 2015 10:46:30 -0700 Subject: rcu: Rework synchronize_sched_expedited() counter handling Now that synchronize_sched_expedited() have a mutex, it can use simpler work-already-done detection scheme. This commit simplifies this scheme by using something similar to the sequence-locking counter scheme. A counter is incremented before and after each grace period, so that the counter is odd in the midst of the grace period and even otherwise. So if the counter has advanced to the second even number that is greater than or equal to the snapshot, the required grace period has already happened. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 98 +++++++++++++++---------------------------------- kernel/rcu/tree.h | 9 +---- kernel/rcu/tree_trace.c | 12 ++---- 3 files changed, 36 insertions(+), 83 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ae39a49..3c182fd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3283,56 +3283,24 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * restructure your code to batch your updates, and then use a single * synchronize_sched() instead. * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. We do a mutex_trylock() polling loop, but if we fail + * too many times in a row, we fall back to synchronize_sched(). */ void synchronize_sched_expedited(void) { int cpu; - long firstsnap, s, snap; + long s; int trycount = 0; struct rcu_state *rsp = &rcu_sched_state; - /* - * If we are in danger of counter wrap, just do synchronize_sched(). - * By allowing sync_sched_expedited_started to advance no more than - * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring - * that more than 3.5 billion CPUs would be required to force a - * counter wrap on a 32-bit system. Quite a few more CPUs would of - * course be required on a 64-bit system. - */ - if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), - (ulong)atomic_long_read(&rsp->expedited_done) + - ULONG_MAX / 8)) { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_wrap); - return; - } + /* Take a snapshot of the sequence number. */ + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = (READ_ONCE(rsp->expedited_sequence) + 3) & ~0x1; + smp_mb(); /* Above access must not bleed into critical section. */ - /* - * Take a ticket. Note that atomic_inc_return() implies a - * full memory barrier. - */ - snap = atomic_long_inc_return(&rsp->expedited_start); - firstsnap = snap; if (!try_get_online_cpus()) { /* CPU hotplug operation in flight, fall back to normal GP. */ wait_rcu_gp(call_rcu_sched); @@ -3342,16 +3310,15 @@ void synchronize_sched_expedited(void) WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. + * Each pass through the following loop attempts to acquire + * ->expedited_mutex, checking for others doing our work each time. */ while (!mutex_trylock(&rsp->expedited_mutex)) { put_online_cpus(); atomic_long_inc(&rsp->expedited_tryfail); /* Check to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); @@ -3368,8 +3335,7 @@ void synchronize_sched_expedited(void) } /* Recheck to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { /* ensure test happens before caller kfree */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); @@ -3389,10 +3355,20 @@ void synchronize_sched_expedited(void) atomic_long_inc(&rsp->expedited_normal); return; } - snap = atomic_long_read(&rsp->expedited_start); - smp_mb(); /* ensure read is before try_stop_cpus(). */ } + /* Recheck yet again to see if someone else did our work for us. */ + if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { + rsp->expedited_workdone3++; + mutex_unlock(&rsp->expedited_mutex); + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); + smp_mb(); /* Ensure expedited GP seen after counter increment. */ + WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1)); + /* Stop each CPU that is online, non-idle, and not us. */ for_each_online_cpu(cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -3403,26 +3379,12 @@ void synchronize_sched_expedited(void) continue; stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL); } - atomic_long_inc(&rsp->expedited_stoppedcpus); - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did already did their update. - */ - do { - atomic_long_inc(&rsp->expedited_done_tries); - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_done_lost); - break; - } - } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); - atomic_long_inc(&rsp->expedited_done_exit); + smp_mb(); /* Ensure expedited GP seen before counter increment. */ + WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); + WARN_ON_ONCE(rsp->expedited_sequence & 0x1); mutex_unlock(&rsp->expedited_mutex); + smp_mb(); /* ensure subsequent action seen after grace period. */ put_online_cpus(); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7c25fe4..6a2b741 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -481,17 +481,12 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ struct mutex expedited_mutex; /* Serializes expediting. */ - atomic_long_t expedited_start; /* Starting ticket. */ - atomic_long_t expedited_done; /* Done ticket. */ - atomic_long_t expedited_wrap; /* # near-wrap incidents. */ + unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_tryfail; /* # acquisition failures. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ + unsigned long expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ - atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ - atomic_long_t expedited_done_tries; /* # tries to update _done. */ - atomic_long_t expedited_done_lost; /* # times beaten to _done. */ - atomic_long_t expedited_done_exit; /* # times exited _done loop. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 3ea7ffc..a1ab3a5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,18 +185,14 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", - atomic_long_read(&rsp->expedited_start), - atomic_long_read(&rsp->expedited_done), - atomic_long_read(&rsp->expedited_wrap), + seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu sc=%lu\n", + rsp->expedited_sequence, atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), + rsp->expedited_workdone3, atomic_long_read(&rsp->expedited_normal), - atomic_long_read(&rsp->expedited_stoppedcpus), - atomic_long_read(&rsp->expedited_done_tries), - atomic_long_read(&rsp->expedited_done_lost), - atomic_long_read(&rsp->expedited_done_exit)); + rsp->expedited_sequence / 2); return 0; } -- cgit v1.1 From 385b73c06f6a733547d0a7714d0c4cb4c8788b88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Jun 2015 14:20:08 -0700 Subject: rcu: Get rid of synchronize_sched_expedited()'s polling loop This commit gets rid of synchronize_sched_expedited()'s mutex_trylock() polling loop in favor of a funnel-locking scheme based on the rcu_node tree. The work-done check is done at each level of the tree, allowing high-contention situations to be resolved quickly with reasonable levels of mutex contention. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 95 +++++++++++++++++++++---------------------------- kernel/rcu/tree.h | 8 +++-- kernel/rcu/tree_trace.c | 3 +- 3 files changed, 47 insertions(+), 59 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c182fd..b310b40 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -103,7 +104,6 @@ struct rcu_state sname##_state = { \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ } @@ -3272,6 +3272,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data) return 0; } +/* Common code for synchronize_sched_expedited() work-done checking. */ +static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, + atomic_long_t *stat, unsigned long s) +{ + if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { + if (rnp) + mutex_unlock(&rnp->exp_funnel_mutex); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + put_online_cpus(); + return true; + } + return false; +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3286,15 +3302,15 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * This implementation can be thought of as an application of sequence * locking to expedited grace periods, but using the sequence counter to * determine when someone else has already done the work instead of for - * retrying readers. We do a mutex_trylock() polling loop, but if we fail - * too many times in a row, we fall back to synchronize_sched(). + * retrying readers. */ void synchronize_sched_expedited(void) { int cpu; long s; - int trycount = 0; struct rcu_state *rsp = &rcu_sched_state; + struct rcu_node *rnp0; + struct rcu_node *rnp1 = NULL; /* Take a snapshot of the sequence number. */ smp_mb(); /* Caller's modifications seen first by other CPUs. */ @@ -3310,60 +3326,25 @@ void synchronize_sched_expedited(void) WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); /* - * Each pass through the following loop attempts to acquire - * ->expedited_mutex, checking for others doing our work each time. + * Each pass through the following loop works its way + * up the rcu_node tree, returning if others have done the + * work or otherwise falls through holding the root rnp's + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure + * can be inexact, as it is just promoting locality and is not + * strictly needed for correctness. */ - while (!mutex_trylock(&rsp->expedited_mutex)) { - put_online_cpus(); - atomic_long_inc(&rsp->expedited_tryfail); - - /* Check to see if someone else did our work for us. */ - if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone1); - return; - } - - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); + rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp0 != NULL; rnp0 = rnp0->parent) { + if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s)) return; - } - - /* Recheck to see if someone else did our work for us. */ - if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone2); - return; - } - - /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We retry - * after they started, so our grace period works for them, - * and they started after our first try, so their grace - * period works for us. - */ - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, use normal GP. */ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - return; - } + mutex_lock(&rnp0->exp_funnel_mutex); + if (rnp1) + mutex_unlock(&rnp1->exp_funnel_mutex); + rnp1 = rnp0; } - - /* Recheck yet again to see if someone else did our work for us. */ - if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { - rsp->expedited_workdone3++; - mutex_unlock(&rsp->expedited_mutex); - smp_mb(); /* ensure test happens before caller kfree */ + rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */ + if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s)) return; - } WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); smp_mb(); /* Ensure expedited GP seen after counter increment. */ @@ -3383,7 +3364,7 @@ void synchronize_sched_expedited(void) smp_mb(); /* Ensure expedited GP seen before counter increment. */ WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); WARN_ON_ONCE(rsp->expedited_sequence & 0x1); - mutex_unlock(&rsp->expedited_mutex); + mutex_unlock(&rnp0->exp_funnel_mutex); smp_mb(); /* ensure subsequent action seen after grace period. */ put_online_cpus(); @@ -3940,6 +3921,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; + static const char * const exp[] = RCU_EXP_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -3998,6 +3980,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); + mutex_init(&rnp->exp_funnel_mutex); + lockdep_set_class_and_name(&rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a2b741..2ef036b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -68,6 +68,7 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -76,6 +77,7 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -85,6 +87,7 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -95,6 +98,7 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -237,6 +241,8 @@ struct rcu_node { int need_future_gp[2]; /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; + + struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* @@ -480,12 +486,10 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ - struct mutex expedited_mutex; /* Serializes expediting. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_tryfail; /* # acquisition failures. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ - unsigned long expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index a1ab3a5..d2aab8d 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,12 +185,11 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu sc=%lu\n", + seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n", rsp->expedited_sequence, atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), - rsp->expedited_workdone3, atomic_long_read(&rsp->expedited_normal), rsp->expedited_sequence / 2); return 0; -- cgit v1.1 From 3a6d7c64d78a78d279851524d39999637a549363 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jun 2015 11:27:10 -0700 Subject: rcu: Make expedited GP CPU stoppage asynchronous Sequentially stopping the CPUs slows down expedited grace periods by at least a factor of two, based on rcutorture's grace-period-per-second rate. This is a conservative measure because rcutorture uses unusually long RCU read-side critical sections and because rcutorture periodically quiesces the system in order to test RCU's ability to ramp down to and up from the idle state. This commit therefore replaces the stop_one_cpu() with stop_one_cpu_nowait(), using an atomic-counter scheme to determine when all CPUs have passed through the stopped state. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 31 +++++++++++++++++-------------- kernel/rcu/tree.h | 6 ++++++ kernel/rcu/tree_trace.c | 3 ++- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b310b40..c5c8509 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3257,18 +3257,11 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu); static int synchronize_sched_expedited_cpu_stop(void *data) { - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ + struct rcu_state *rsp = data; + + /* We are here: If we are last, do the wakeup. */ + if (atomic_dec_and_test(&rsp->expedited_need_qs)) + wake_up(&rsp->expedited_wq); return 0; } @@ -3308,9 +3301,9 @@ void synchronize_sched_expedited(void) { int cpu; long s; - struct rcu_state *rsp = &rcu_sched_state; struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; + struct rcu_state *rsp = &rcu_sched_state; /* Take a snapshot of the sequence number. */ smp_mb(); /* Caller's modifications seen first by other CPUs. */ @@ -3351,16 +3344,26 @@ void synchronize_sched_expedited(void) WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1)); /* Stop each CPU that is online, non-idle, and not us. */ + init_waitqueue_head(&rsp->expedited_wq); + atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ for_each_online_cpu(cpu) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); /* Skip our CPU and any idle CPUs. */ if (raw_smp_processor_id() == cpu || !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) continue; - stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL); + atomic_inc(&rsp->expedited_need_qs); + stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, + rsp, &rdp->exp_stop_work); } + /* Remove extra count and, if necessary, wait for CPUs to stop. */ + if (!atomic_dec_and_test(&rsp->expedited_need_qs)) + wait_event(rsp->expedited_wq, + !atomic_read(&rsp->expedited_need_qs)); + smp_mb(); /* Ensure expedited GP seen before counter increment. */ WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); WARN_ON_ONCE(rsp->expedited_sequence & 0x1); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ef036b..4edc277 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include #include #include +#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -298,6 +299,9 @@ struct rcu_data { /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ + struct cpu_stop_work exp_stop_work; + /* Expedited grace-period control */ + /* for CPU stopping. */ /* 2) batch handling */ /* @@ -491,6 +495,8 @@ struct rcu_state { atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ + atomic_t expedited_need_qs; /* # CPUs left to check in. */ + wait_queue_head_t expedited_wq; /* Wait for check-ins. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d2aab8d..36c04b4 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,12 +185,13 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n", + seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), atomic_long_read(&rsp->expedited_normal), + atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); return 0; } -- cgit v1.1 From 28f00767e3db933cacc3030f4d9736acd037be2c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 15:00:58 -0700 Subject: rcu: Abstract sequence counting from synchronize_sched_expedited() This commit creates rcu_exp_gp_seq_start() and rcu_exp_gp_seq_end() to bracket an expedited grace period, rcu_exp_gp_seq_snap() to snapshot the sequence counter, and rcu_exp_gp_seq_done() to check to see if a full expedited grace period has elapsed since the snapshot. These will be applied to synchronize_rcu_expedited(). These are defined in terms of underlying rcu_seq_start(), rcu_seq_end(), rcu_seq_snap(), rcu_seq_done(), which will be applied to _rcu_barrier(). One reason that this commit doesn't use the seqcount primitives themselves is that the smp_wmb() in those primitive is insufficient due to the fact that expedited grace periods do reads as well as writes. In addition, the read-side seqcount primitives detect a potentially partial change, where the expedited primitives instead need a guaranteed full change. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c5c8509..67fe757 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3255,6 +3255,60 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); +/* Adjust sequence number for start of update-side operation. */ +static void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(!(*sp & 0x1)); +} + +/* Adjust sequence number for end of update-side operation. */ +static void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WRITE_ONCE(*sp, *sp + 1); + WARN_ON_ONCE(*sp & 0x1); +} + +/* Take a snapshot of the update side's sequence number. */ +static unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = (READ_ONCE(*sp) + 3) & ~0x1; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. + */ +static bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + return rcu_seq_snap(&rsp->expedited_sequence); +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + static int synchronize_sched_expedited_cpu_stop(void *data) { struct rcu_state *rsp = data; @@ -3269,7 +3323,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data) static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) { - if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) { + if (rcu_exp_gp_seq_done(rsp, s)) { if (rnp) mutex_unlock(&rnp->exp_funnel_mutex); /* Ensure test happens before caller kfree(). */ @@ -3306,9 +3360,7 @@ void synchronize_sched_expedited(void) struct rcu_state *rsp = &rcu_sched_state; /* Take a snapshot of the sequence number. */ - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = (READ_ONCE(rsp->expedited_sequence) + 3) & ~0x1; - smp_mb(); /* Above access must not bleed into critical section. */ + s = rcu_exp_gp_seq_snap(rsp); if (!try_get_online_cpus()) { /* CPU hotplug operation in flight, fall back to normal GP. */ @@ -3339,9 +3391,7 @@ void synchronize_sched_expedited(void) if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s)) return; - WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); - smp_mb(); /* Ensure expedited GP seen after counter increment. */ - WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1)); + rcu_exp_gp_seq_start(rsp); /* Stop each CPU that is online, non-idle, and not us. */ init_waitqueue_head(&rsp->expedited_wq); @@ -3364,9 +3414,7 @@ void synchronize_sched_expedited(void) wait_event(rsp->expedited_wq, !atomic_read(&rsp->expedited_need_qs)); - smp_mb(); /* Ensure expedited GP seen before counter increment. */ - WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1); - WARN_ON_ONCE(rsp->expedited_sequence & 0x1); + rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp0->exp_funnel_mutex); smp_mb(); /* ensure subsequent action seen after grace period. */ -- cgit v1.1 From 543c6158f6dff20a741dfa492771f18ceaa1a109 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 15:52:50 -0700 Subject: rcu: Make synchronize_rcu_expedited() use sequence-counter scheme Although synchronize_rcu_expedited() uses a sequence-counter scheme, it is based on a single increment per grace period, which means that tasks piggybacking off of concurrent grace periods may be forced to wait longer than necessary. This commit therefore applies the new sequence-count functions developed for synchronize_sched_expedited() to speed things up a bit and to consolidate the sequence-counter implementation. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ef41c1b..759883f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -536,7 +536,6 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* @@ -704,12 +703,10 @@ void synchronize_rcu_expedited(void) { struct rcu_node *rnp; struct rcu_state *rsp = rcu_state_p; - unsigned long snap; + unsigned long s; int trycount = 0; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1; - smp_mb(); /* Above access cannot bleed into critical section. */ + s = rcu_exp_gp_seq_snap(rsp); /* * Acquire lock, falling back to synchronize_rcu() if too many @@ -717,8 +714,7 @@ void synchronize_rcu_expedited(void) * expedited grace period for us, just leave. */ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (ULONG_CMP_LT(snap, - READ_ONCE(sync_rcu_preempt_exp_count))) + if (rcu_exp_gp_seq_done(rsp, s)) goto mb_ret; /* Others did our work for us. */ if (trycount++ < 10) { udelay(trycount * num_online_cpus()); @@ -727,8 +723,9 @@ void synchronize_rcu_expedited(void) return; } } - if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) + if (rcu_exp_gp_seq_done(rsp, s)) goto unlock_mb_ret; /* Others did our work for us. */ + rcu_exp_gp_seq_start(rsp); /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); @@ -750,8 +747,7 @@ void synchronize_rcu_expedited(void) sync_rcu_preempt_exp_done(rnp)); /* Clean up and exit. */ - smp_mb(); /* ensure expedited GP seen before counter increment. */ - WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1); + rcu_exp_gp_seq_end(rsp); unlock_mb_ret: mutex_unlock(&sync_rcu_preempt_exp_mutex); mb_ret: -- cgit v1.1 From b09e5f8601d7e5b8d45348c9c09e1fb4109e8dc6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 16:30:54 -0700 Subject: rcu: Abstract funnel locking from synchronize_sched_expedited() This commit abstracts funnel locking from synchronize_sched_expedited() so that it may be used by synchronize_rcu_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 80 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 33 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 67fe757..f79a1c6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3309,16 +3309,6 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) return rcu_seq_done(&rsp->expedited_sequence, s); } -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - struct rcu_state *rsp = data; - - /* We are here: If we are last, do the wakeup. */ - if (atomic_dec_and_test(&rsp->expedited_need_qs)) - wake_up(&rsp->expedited_wq); - return 0; -} - /* Common code for synchronize_sched_expedited() work-done checking. */ static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, atomic_long_t *stat, unsigned long s) @@ -3335,6 +3325,48 @@ static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, return false; } +/* + * Funnel-lock acquisition for expedited grace periods. Returns a + * pointer to the root rcu_node structure, or NULL if some other + * task did the expedited grace period for us. + */ +static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp0; + struct rcu_node *rnp1 = NULL; + + /* + * Each pass through the following loop works its way + * up the rcu_node tree, returning if others have done the + * work or otherwise falls through holding the root rnp's + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure + * can be inexact, as it is just promoting locality and is not + * strictly needed for correctness. + */ + rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp0 != NULL; rnp0 = rnp0->parent) { + if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s)) + return NULL; + mutex_lock(&rnp0->exp_funnel_mutex); + if (rnp1) + mutex_unlock(&rnp1->exp_funnel_mutex); + rnp1 = rnp0; + } + if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone2, s)) + return NULL; + return rnp1; +} + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + struct rcu_state *rsp = data; + + /* We are here: If we are last, do the wakeup. */ + if (atomic_dec_and_test(&rsp->expedited_need_qs)) + wake_up(&rsp->expedited_wq); + return 0; +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3355,8 +3387,7 @@ void synchronize_sched_expedited(void) { int cpu; long s; - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* Take a snapshot of the sequence number. */ @@ -3370,26 +3401,9 @@ void synchronize_sched_expedited(void) } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. - */ - rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s)) - return; - mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) - mutex_unlock(&rnp1->exp_funnel_mutex); - rnp1 = rnp0; - } - rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */ - if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s)) - return; + rnp = exp_funnel_lock(rsp, s); + if (rnp == NULL) + return; /* Someone else did our work for us. */ rcu_exp_gp_seq_start(rsp); @@ -3415,7 +3429,7 @@ void synchronize_sched_expedited(void) !atomic_read(&rsp->expedited_need_qs)); rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp0->exp_funnel_mutex); + mutex_unlock(&rnp->exp_funnel_mutex); smp_mb(); /* ensure subsequent action seen after grace period. */ put_online_cpus(); -- cgit v1.1 From 7fd0ddc5bf1ab5259c80a53a01984e13befd658b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 16:35:03 -0700 Subject: rcu: Fix synchronize_sched_expedited() type error for "s" The type of "s" has been "long" rather than the correct "unsigned long" for quite some time. This commit fixes this type error. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f79a1c6..094ed8f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3386,7 +3386,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data) void synchronize_sched_expedited(void) { int cpu; - long s; + unsigned long s; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; -- cgit v1.1 From 29fd930940193a9a035a75a3847457160d65559a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Jun 2015 19:03:16 -0700 Subject: rcu: Use funnel locking for synchronize_rcu_expedited()'s polling loop This commit gets rid of synchronize_rcu_expedited()'s mutex_trylock() polling loop in favor of the funnel-locking scheme that was abstracted from synchronize_sched_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 ++++++++------- kernel/rcu/tree_plugin.h | 36 ++++++++++-------------------------- 2 files changed, 18 insertions(+), 33 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 094ed8f..338ea61 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3309,9 +3309,9 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) return rcu_seq_done(&rsp->expedited_sequence, s); } -/* Common code for synchronize_sched_expedited() work-done checking. */ -static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, - atomic_long_t *stat, unsigned long s) +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, + atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { if (rnp) @@ -3319,7 +3319,6 @@ static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp, /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); - put_online_cpus(); return true; } return false; @@ -3345,14 +3344,14 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) */ rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, rnp1, &rsp->expedited_workdone1, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) mutex_unlock(&rnp1->exp_funnel_mutex); rnp1 = rnp0; } - if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone2, s)) + if (sync_exp_work_done(rsp, rnp1, &rsp->expedited_workdone2, s)) return NULL; return rnp1; } @@ -3402,8 +3401,10 @@ void synchronize_sched_expedited(void) WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (rnp == NULL) { + put_online_cpus(); return; /* Someone else did our work for us. */ + } rcu_exp_gp_seq_start(rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 759883f..f0d7144 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -536,7 +536,6 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* * Return non-zero if there are any tasks in RCU read-side critical @@ -556,7 +555,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -572,7 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -611,7 +610,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 * that work is needed here. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) @@ -654,7 +653,7 @@ sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, * enabling rcu_read_unlock_special() to do the bit-clearing. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) @@ -702,29 +701,16 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) void synchronize_rcu_expedited(void) { struct rcu_node *rnp; + struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; - int trycount = 0; s = rcu_exp_gp_seq_snap(rsp); - /* - * Acquire lock, falling back to synchronize_rcu() if too many - * lock-acquisition failures. Of course, if someone does the - * expedited grace period for us, just leave. - */ - while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (rcu_exp_gp_seq_done(rsp, s)) - goto mb_ret; /* Others did our work for us. */ - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - wait_rcu_gp(call_rcu); - return; - } - } - if (rcu_exp_gp_seq_done(rsp, s)) - goto unlock_mb_ret; /* Others did our work for us. */ + rnp_unlock = exp_funnel_lock(rsp, s); + if (rnp_unlock == NULL) + return; /* Someone else did our work for us. */ + rcu_exp_gp_seq_start(rsp); /* force all RCU readers onto ->blkd_tasks lists. */ @@ -748,9 +734,7 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); -mb_ret: + mutex_unlock(&rnp_unlock->exp_funnel_mutex); smp_mb(); /* ensure subsequent action seen after grace period. */ } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- cgit v1.1 From 4f525a528b9e75571c6bedc6202beff1ced24c32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 Jun 2015 11:20:00 -0700 Subject: rcu: Apply rcu_seq operations to _rcu_barrier() The rcu_seq operations were open-coded in _rcu_barrier(), so this commit replaces the open-coding with the shiny new rcu_seq operations. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 - kernel/rcu/tree.c | 72 ++++++++++++---------------------------------- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_trace.c | 4 +-- 4 files changed, 22 insertions(+), 57 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index c78e88c..ef72c4a 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -661,7 +661,6 @@ TRACE_EVENT(rcu_torture_read, * Tracepoint for _rcu_barrier() execution. The string "s" describes * the _rcu_barrier phase: * "Begin": _rcu_barrier() started. - * "Check": _rcu_barrier() checking for piggybacking. * "EarlyExit": _rcu_barrier() piggybacked, thus early exit. * "Inc1": _rcu_barrier() piggyback check counter incremented. * "OfflineNoCB": _rcu_barrier() found callback on never-online CPU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 338ea61..44245ae 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3568,10 +3568,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); } } @@ -3583,7 +3583,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -3596,55 +3596,24 @@ static void _rcu_barrier(struct rcu_state *rsp) { int cpu; struct rcu_data *rdp; - unsigned long snap = READ_ONCE(rsp->n_barrier_done); - unsigned long snap_done; + unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, snap); + _rcu_barrier_trace(rsp, "Begin", -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); - /* - * Ensure that all prior references, including to ->n_barrier_done, - * are ordered before the _rcu_barrier() machinery. - */ - smp_mb(); /* See above block comment. */ - - /* - * Recheck ->n_barrier_done to see if others did our work for us. - * This means checking ->n_barrier_done for an even-to-odd-to-even - * transition. The "if" expression below therefore rounds the old - * value up to the next even number and adds two before comparing. - */ - snap_done = rsp->n_barrier_done; - _rcu_barrier_trace(rsp, "Check", -1, snap_done); - - /* - * If the value in snap is odd, we needed to wait for the current - * rcu_barrier() to complete, then wait for the next one, in other - * words, we need the value of snap_done to be three larger than - * the value of snap. On the other hand, if the value in snap is - * even, we only had to wait for the next rcu_barrier() to complete, - * in other words, we need the value of snap_done to be only two - * greater than the value of snap. The "(snap + 3) & ~0x1" computes - * this for us (thank you, Linus!). - */ - if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + /* Did someone else do our work for us? */ + if (rcu_seq_done(&rsp->barrier_sequence, s)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; } - /* - * Increment ->n_barrier_done to avoid duplicate work. Use - * WRITE_ONCE() to prevent the compiler from speculating - * the increment to precede the early-exit check. - */ - WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); - smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ + /* Mark the start of the barrier operation. */ + rcu_seq_start(&rsp->barrier_sequence); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3668,10 +3637,10 @@ static void _rcu_barrier(struct rcu_state *rsp) if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } else { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); __call_rcu(&rdp->barrier_head, @@ -3679,11 +3648,11 @@ static void _rcu_barrier(struct rcu_state *rsp) } } else if (READ_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } } put_online_cpus(); @@ -3695,16 +3664,13 @@ static void _rcu_barrier(struct rcu_state *rsp) if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rsp->barrier_completion); - /* Increment ->n_barrier_done to prevent duplicate work. */ - smp_mb(); /* Keep increment after above mechanism. */ - WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); - smp_mb(); /* Keep increment before caller's subsequent code. */ - /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rsp->barrier_completion); + /* Mark the end of the barrier operation. */ + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + rcu_seq_end(&rsp->barrier_sequence); + /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4edc277..5c1042d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -486,7 +486,7 @@ struct rcu_state { struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ - unsigned long n_barrier_done; /* ++ at start and end of */ + unsigned long barrier_sequence; /* ++ at start and end of */ /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 36c04b4..d9982a2 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v) static int show_rcubarrier(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d nbd: %lu\n", + seq_printf(m, "bcc: %d bseq: %lu\n", atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); + rsp->barrier_sequence); return 0; } -- cgit v1.1 From 704dd435ac7eaefa89fcd82fd2876b8330e00ff3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 27 Jun 2015 09:36:29 -0700 Subject: rcu: Consolidate last open-coded expedited memory barrier One of the requirements on RCU grace periods is that if there is a causal chain of operations that starts after one grace period and ends before another grace period, then the two grace periods must be serialized. There has been (and might still be) code that relies on this, for example, certain types of reference-counting code that does a call_rcu() within an RCU callback function. This requirement is why there is an smp_mb() at the end of both synchronize_sched_expedited() and synchronize_rcu_expedited(). However, this is the only smp_mb() in these functions, so it would be nicer to consolidate it into rcu_exp_gp_seq_end(). This commit does just that. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 44245ae..a905d3b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3299,6 +3299,7 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) static void rcu_exp_gp_seq_end(struct rcu_state *rsp) { rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { @@ -3431,7 +3432,6 @@ void synchronize_sched_expedited(void) rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); - smp_mb(); /* ensure subsequent action seen after grace period. */ put_online_cpus(); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f0d7144..27b7146 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -735,7 +735,6 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp_unlock->exp_funnel_mutex); - smp_mb(); /* ensure subsequent action seen after grace period. */ } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); -- cgit v1.1 From 2cd6ffafec066118365f6d7eb7a42ea16c1f032c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Jun 2015 17:06:39 -0700 Subject: rcu: Extend expedited funnel locking to rcu_data structure The strictly rcu_node based funnel-locking scheme works well in many cases, but systems with CONFIG_RCU_FANOUT_LEAF=64 won't necessarily get all that much concurrency. This commit therefore extends the funnel locking into the per-CPU rcu_data structure, providing concurrency equal to the number of CPUs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 19 ++++++++++++++++--- kernel/rcu/tree.h | 4 +++- kernel/rcu/tree_trace.c | 3 ++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a905d3b..e45097fc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3312,11 +3312,14 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp, atomic_long_t *stat, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { if (rnp) mutex_unlock(&rnp->exp_funnel_mutex); + else if (rdp) + mutex_unlock(&rdp->exp_funnel_mutex); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3332,6 +3335,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { + struct rcu_data *rdp; struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3343,16 +3347,24 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + return NULL; + mutex_lock(&rdp->exp_funnel_mutex); + rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) mutex_unlock(&rnp1->exp_funnel_mutex); + else + mutex_unlock(&rdp->exp_funnel_mutex); rnp1 = rnp0; } - if (sync_exp_work_done(rsp, rnp1, &rsp->expedited_workdone2, s)) + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone3, s)) return NULL; return rnp1; } @@ -3733,6 +3745,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; + mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5c1042d..efee84c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -364,11 +364,12 @@ struct rcu_data { unsigned long n_rp_nocb_defer_wakeup; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() and OOM callbacks. */ + /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + struct mutex exp_funnel_mutex; /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -494,6 +495,7 @@ struct rcu_state { atomic_long_t expedited_tryfail; /* # acquisition failures. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d9982a2..ec62369 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,11 +185,12 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, atomic_long_read(&rsp->expedited_tryfail), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_workdone3), atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); -- cgit v1.1 From cf3620a6c7798be3395163d3bb863ab378a6aa80 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 11:14:32 -0700 Subject: rcu: Add stall warnings to synchronize_sched_expedited() Although synchronize_sched_expedited() historically has no RCU CPU stall warnings, the availability of the rcupdate.rcu_expedited boot parameter invalidates the old assumption that synchronize_sched()'s stall warnings would suffice. This commit therefore adds RCU CPU stall warnings to synchronize_sched_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++---- kernel/rcu/tree.h | 1 + 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e45097fc..4b6594c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3369,16 +3369,65 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) return rnp1; } +/* Invoked on each online non-idle CPU for expedited quiescent state. */ static int synchronize_sched_expedited_cpu_stop(void *data) { - struct rcu_state *rsp = data; + struct rcu_data *rdp = data; + struct rcu_state *rsp = rdp->rsp; /* We are here: If we are last, do the wakeup. */ + rdp->exp_done = true; if (atomic_dec_and_test(&rsp->expedited_need_qs)) wake_up(&rsp->expedited_wq); return 0; } +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + struct rcu_data *rdp; + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = wait_event_interruptible_timeout( + rsp->expedited_wq, + !atomic_read(&rsp->expedited_need_qs), + jiffies_stall); + if (ret > 0) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + wait_event(rsp->expedited_wq, + !atomic_read(&rsp->expedited_need_qs)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs: {", + rsp->name); + for_each_online_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + + if (rdp->exp_done) + continue; + pr_cont(" %d", cpu); + } + pr_cont(" } %lu jiffies s: %lu\n", + jiffies - jiffies_start, rsp->expedited_sequence); + for_each_online_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + + if (rdp->exp_done) + continue; + dump_cpu_task(cpu); + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3428,19 +3477,20 @@ void synchronize_sched_expedited(void) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + rdp->exp_done = false; + /* Skip our CPU and any idle CPUs. */ if (raw_smp_processor_id() == cpu || !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) continue; atomic_inc(&rsp->expedited_need_qs); stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, - rsp, &rdp->exp_stop_work); + rdp, &rdp->exp_stop_work); } /* Remove extra count and, if necessary, wait for CPUs to stop. */ if (!atomic_dec_and_test(&rsp->expedited_need_qs)) - wait_event(rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs)); + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index efee84c..b3ae8d3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -370,6 +370,7 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + bool exp_done; /* Expedited QS for this CPU? */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU -- cgit v1.1 From 99a930b0d2b018f31474d69137f311ce3581a4c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 14:54:09 -0700 Subject: documentation: Describe new expedited stall warnings Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 046f326..efb9454 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -163,6 +163,23 @@ message will be about three times the interval between the beginning of the stall and the first message. +Stall Warnings for Expedited Grace Periods + +If an expedited grace period detects a stall, it will place a message +like the following in dmesg: + + INFO: rcu_sched detected expedited stalls on CPUs: { 1 2 6 } 26009 jiffies s: 1043 + +This indicates that CPUs 1, 2, and 6 have failed to respond to a +reschedule IPI, that the expedited grace period has been going on for +26,009 jiffies, and that the expedited grace-period sequence counter is +1043. The fact that this last value is odd indicates that an expedited +grace period is in flight. + +It is entirely possible to see stall warnings from normal and from +expedited grace periods at about the same time from the same run. + + What Causes RCU CPU Stall Warnings? So your kernel printed an RCU CPU stall warning. The next question is -- cgit v1.1 From b9a425cfcb3c473b4ca2f3dfaeaf13848f4a7976 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jul 2015 13:50:28 -0700 Subject: rcu: Pull out wait_event*() condition into helper function The condition for the wait_event_interruptible_timeout() that waits to do the next force-quiescent-state scan is a bit ornate: ((gf = READ_ONCE(rsp->gp_flags)) & RCU_GP_FLAG_FQS) || (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) This commit therefore pulls this condition out into a helper function and comments its component conditions. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4b6594c..b280373 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1904,6 +1904,26 @@ static int rcu_gp_init(struct rcu_state *rsp) } /* + * Helper function for wait_event_interruptible_timeout() wakeup + * at force-quiescent-state time. + */ +static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Someone like call_rcu() requested a force-quiescent-state scan. */ + *gfp = READ_ONCE(rsp->gp_flags); + if (*gfp & RCU_GP_FLAG_FQS) + return true; + + /* The current grace period has completed. */ + if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) + return true; + + return false; +} + +/* * Do one round of quiescent-state forcing. */ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) @@ -2067,11 +2087,7 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, - ((gf = READ_ONCE(rsp->gp_flags)) & - RCU_GP_FLAG_FQS) || - (!READ_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)), - j); + rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DONE_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ -- cgit v1.1 From 32bb1c79996069ef9e4e53b428050749f9841c3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jul 2015 12:27:31 -0700 Subject: rcu: Rename RCU_GP_DONE_FQS to RCU_GP_DOING_FQS The grace-period kthread sleeps waiting to do a force-quiescent-state scan, and when awakened sets rsp->gp_state to RCU_GP_DONE_FQS. However, this is confusing because the kthread has not done the force-quiescent-state, but is instead just starting to do it. This commit therefore renames RCU_GP_DONE_FQS to RCU_GP_DOING_FQS in order to make things a bit easier on reviewers. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b280373..f66f6e7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2088,7 +2088,7 @@ static int __noreturn rcu_gp_kthread(void *arg) rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); - rsp->gp_state = RCU_GP_DONE_FQS; + rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b3ae8d3..543ba72 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -535,7 +535,7 @@ struct rcu_state { #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ -#define RCU_GP_DONE_FQS 4 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ -- cgit v1.1 From cdacbe1f91264687af956e810278030f2ab5a3d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 11 Jul 2015 16:24:45 -0700 Subject: rcu: Add fastpath bypassing funnel locking In the common case, there will be only one expedited grace period in the system at a given time, in which case it is not helpful to use funnel locking. This commit therefore adds a fastpath that bypasses funnel locking when the root ->exp_funnel_mutex is not held. Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 36 ++++++++++-------------------------- kernel/rcu/tree.c | 16 ++++++++++++++++ kernel/rcu/tree.h | 2 +- kernel/rcu/tree_trace.c | 4 ++-- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 08651da..97f17e9 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,42 +237,26 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 d=21872 w=0 tf=0 wd1=0 wd2=0 n=0 sc=21872 dt=21872 dl=0 dx=21872 +s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 These fields are as follows: -o "s" is the starting sequence number. +o "s" is the sequence number, with an odd number indicating that + an expedited grace period is in progress. -o "d" is the ending sequence number. When the starting and ending - numbers differ, there is an expedited grace period in progress. - -o "w" is the number of times that the sequence numbers have been - in danger of wrapping. - -o "tf" is the number of times that contention has resulted in a - failure to begin an expedited grace period. - -o "wd1" and "wd2" are the number of times that an attempt to - start an expedited grace period found that someone else had - completed an expedited grace period that satisfies the +o "wd0", "wd1", "wd2", and "wd3" are the number of times that an + attempt to start an expedited grace period found that someone + else had completed an expedited grace period that satisfies the attempted request. "Our work is done." -o "n" is number of times that contention was so great that - the request was demoted from an expedited grace period to - a normal grace period. +o "n" is number of times that a concurrent CPU-hotplug operation + forced a fallback to a normal grace period. + +o "enq" is the number of quiescent states still outstanding. o "sc" is the number of times that the attempt to start a new expedited grace period succeeded. -o "dt" is the number of times that we attempted to update - the "d" counter. - -o "dl" is the number of times that we failed to update the "d" - counter. - -o "dx" is the number of times that we succeeded in updating - the "d" counter. - The output of "cat rcu/rcu_preempt/rcugp" looks as follows: diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f66f6e7..3af0dee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3356,6 +3356,22 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) struct rcu_node *rnp1 = NULL; /* + * First try directly acquiring the root lock in order to reduce + * latency in the common case where expedited grace periods are + * rare. We check mutex_is_locked() to avoid pathological levels of + * memory contention on ->exp_funnel_mutex in the heavy-load case. + */ + rnp0 = rcu_get_root(rsp); + if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { + if (mutex_trylock(&rnp0->exp_funnel_mutex)) { + if (sync_exp_work_done(rsp, rnp0, NULL, + &rsp->expedited_workdone0, s)) + return NULL; + return rnp0; + } + } + + /* * Each pass through the following loop works its way * up the rcu_node tree, returning if others have done the * work or otherwise falls through holding the root rnp's diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 543ba72..80d974d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -493,7 +493,7 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_tryfail; /* # acquisition failures. */ + atomic_long_t expedited_workdone0; /* # done by others #0. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ atomic_long_t expedited_workdone3; /* # done by others #3. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ec62369..6fc4c5f 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,9 +185,9 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_tryfail), + atomic_long_read(&rsp->expedited_workdone0), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), atomic_long_read(&rsp->expedited_workdone3), -- cgit v1.1 From 24560056de61d86153cecb84d04e4237437f5888 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 30 May 2015 10:11:24 -0700 Subject: rcu: Add RCU-sched flavors of get-state and cond-sync The get_state_synchronize_rcu() and cond_synchronize_rcu() functions allow polling for grace-period completion, with an actual wait for a grace period occurring only when cond_synchronize_rcu() is called too soon after the corresponding get_state_synchronize_rcu(). However, these functions work only for vanilla RCU. This commit adds the get_state_synchronize_sched() and cond_synchronize_sched(), which provide the same capability for RCU-sched. Reported-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 10 ++++++++++ include/linux/rcutree.h | 2 ++ kernel/rcu/rcutorture.c | 2 ++ kernel/rcu/tree.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 3df6c1e..ff968b7 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -37,6 +37,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate) might_sleep(); } +static inline unsigned long get_state_synchronize_sched(void) +{ + return 0; +} + +static inline void cond_synchronize_sched(unsigned long oldstate) +{ + might_sleep(); +} + static inline void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 4568791..5abec82 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -76,6 +76,8 @@ void rcu_barrier_bh(void); void rcu_barrier_sched(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); +unsigned long get_state_synchronize_sched(void); +void cond_synchronize_sched(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59e3268..0f2cb55 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, + .get_state = get_state_synchronize_sched, + .cond_sync = cond_synchronize_sched, .call = call_rcu_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8b5dd8b..9629298 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3253,6 +3253,58 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); +/** + * get_state_synchronize_sched - Snapshot current RCU-sched state + * + * Returns a cookie that is used by a later call to cond_synchronize_sched() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_sched(void) +{ + /* + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. + */ + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_sched() + * and cond_synchronize_sched(). + */ + return smp_load_acquire(&rcu_sched_state.gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_sched); + +/** + * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_sched() + * + * If a full RCU-sched grace period has elapsed since the earlier call to + * get_state_synchronize_sched(), just return. Otherwise, invoke + * synchronize_sched() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_sched(unsigned long oldstate) +{ + unsigned long newstate; + + /* + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. + */ + newstate = smp_load_acquire(&rcu_sched_state.completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_sched); + static int synchronize_sched_expedited_cpu_stop(void *data) { /* -- cgit v1.1 From be55fa2ad244678beed11d2827d88b7f80ff4730 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jun 2015 05:29:18 -0700 Subject: rcu: Hide RCU_NOCB_CPU behind RCU_EXPERT This commit prevents Kconfig from asking the user about RCU_NOCB_CPU unless the user really wants to be asked. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index fdeff4a..ba1e6ea 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -698,6 +698,7 @@ config RCU_BOOST_DELAY config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU || PREEMPT_RCU + depends on RCU_EXPERT || NO_HZ_FULL default n help Use this option to reduce OS jitter for aggressive HPC or -- cgit v1.1 From 155d1d12786386d21732f9bba036343ffa43847d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2015 17:26:48 +0200 Subject: rcu: Use WRITE_ONCE in RCU_INIT_POINTER For the paranoid amongst us GCC would be in its right to use byte stores to write our NULL value, tell it not to do that. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index def6d45..c63428c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -995,7 +995,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_INIT_POINTER(p, v) \ do { \ rcu_dereference_sparse(p, __rcu); \ - p = RCU_INITIALIZER(v); \ + WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** -- cgit v1.1 From bc17ea1092c48227334a311a130c1a41966333fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 6 Jun 2015 08:11:43 -0700 Subject: rcu: Fix obsolete priority-boosting comment Tasks are no longer migrated to the root rcu_node, so there is no longer any need for a boost kthread for the root rcu_node, and there no longer is such a kthread. This commit therefore fixes the comment in rcu_boost_kthread()'s header to reflect this new reality. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 013485f..a983bc6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1061,8 +1061,7 @@ static int rcu_boost(struct rcu_node *rnp) } /* - * Priority-boosting kthread. One per leaf rcu_node and one for the - * root rcu_node. + * Priority-boosting kthread, one per leaf rcu_node. */ static int rcu_boost_kthread(void *arg) { -- cgit v1.1 From ec90a194ae2cb8b8e9fe4f6f70dd3d4dc0269b4b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jun 2015 12:53:06 -0700 Subject: rcu: Create a synchronize_rcu_mult() There have been several requests for a primitive that waits for grace periods for several RCU flavors concurrently, so this commit creates it. This is a variadic macro, and you pass in the call_rcu() functions of the flavors of RCU that you wish to wait for. Note that you cannot pass in call_srcu() for two reasons: (1) This would result in a type mismatch and (2) You need to specify which srcu_struct you want to use. Handle this by creating a wrapper function for your SRCU domain, for example: void call_srcu_mine(struct rcu_head *head, rcu_callback_t func) { call_srcu(&ss_mine, head, func); } You can then do something like this: synchronize_rcu_mult(call_srcu_mine, call_rcu, call_rcu_sched); Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 35 +++++++++++++++++++++++++++++++---- include/linux/types.h | 3 +++ kernel/rcu/update.c | 37 +++++++++++++++++++++++++++---------- 3 files changed, 61 insertions(+), 14 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c63428c..33ec16b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -226,6 +226,37 @@ struct rcu_synchronize { }; void wakeme_after_rcu(struct rcu_head *head); +void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, + struct rcu_synchronize *rs_array); + +#define _wait_rcu_gp(checktiny, ...) \ +do { \ + call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ + const int __n = ARRAY_SIZE(__crcu_array); \ + struct rcu_synchronize __rs_array[__n]; \ + \ + __wait_rcu_gp(checktiny, __n, __crcu_array, __rs_array); \ +} while (0) + +#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) + +/** + * synchronize_rcu_mult - Wait concurrently for multiple grace periods + * @...: List of call_rcu() functions for the flavors to wait on. + * + * This macro waits concurrently for multiple flavors of RCU grace periods. + * For example, synchronize_rcu_mult(call_rcu, call_rcu_bh) would wait + * on concurrent RCU and RCU-bh grace periods. Waiting on a give SRCU + * domain requires you to write a wrapper function for that SRCU domain's + * call_srcu() function, supplying the corresponding srcu_struct. + * + * If Tiny RCU, tell _wait_rcu_gp() not to bother waiting for RCU + * or RCU-bh, given that anywhere synchronize_rcu_mult() can be called + * is automatically a grace period. + */ +#define synchronize_rcu_mult(...) \ + _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) + /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period * @head: structure to be used for queueing the RCU updates. @@ -392,10 +423,6 @@ bool __rcu_is_watching(void); * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ -typedef void call_rcu_func_t(struct rcu_head *head, - void (*func)(struct rcu_head *head)); -void wait_rcu_gp(call_rcu_func_t crf); - #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) #include #elif defined(CONFIG_TINY_RCU) diff --git a/include/linux/types.h b/include/linux/types.h index 8715287..c314989 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -212,6 +212,9 @@ struct callback_head { }; #define rcu_head callback_head +typedef void (*rcu_callback_t)(struct rcu_head *head); +typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func); + /* clocksource cycle base type */ typedef u64 cycle_t; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fec5f48..a0a0dd0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -318,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } +EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void wait_rcu_gp(call_rcu_func_t crf) +void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, + struct rcu_synchronize *rs_array) { - struct rcu_synchronize rcu; + int i; - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - crf(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + /* Initialize and register callbacks for each flavor specified. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) { + might_sleep(); + continue; + } + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } + + /* Wait for all callbacks to be invoked. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) + continue; + wait_for_completion(&rs_array[i].completion); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } -EXPORT_SYMBOL_GPL(wait_rcu_gp); +EXPORT_SYMBOL_GPL(__wait_rcu_gp); #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) -- cgit v1.1 From 779de6ce54f627f955d4a3d0c5b3dcfaab74fea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Jun 2015 13:34:41 -0700 Subject: cpu: Wait for RCU grace periods concurrently In kernels built with CONFIG_PREEMPT, _cpu_down() waits for RCU and RCU-sched grace periods back-to-back, incurring quite a bit more latency than required. This commit therefore uses the new synchronize_rcu_mult() to allow waiting for both grace periods concurrently. Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c9c9fa..d63b062 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -380,14 +380,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * will observe it. * * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so explicitly call both. + * not imply sync_sched(), so wait for both. * * Do sync before park smpboot threads to take care the rcu boost case. */ -#ifdef CONFIG_PREEMPT - synchronize_sched(); -#endif - synchronize_rcu(); + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); smpboot_park_threads(cpu); -- cgit v1.1 From 46f00d18fca42cc954c2e9e99a48b6f3a7741ed7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Jun 2015 10:35:18 -0700 Subject: rcu: Make rcu_is_watching() really notrace Although rcu_is_watching() is marked notrace, it invokes preempt_disable() and preempt_enable(), both of which can be traced. This defeats the purpose of the notrace on rcu_is_watching(), so this commit substitutes preempt_disable_notrace() and preempt_enable_notrace(). Signed-off-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9629298..cb64d7e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -978,9 +978,9 @@ bool notrace rcu_is_watching(void) { bool ret; - preempt_disable(); + preempt_disable_notrace(); ret = __rcu_is_watching(); - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_is_watching); -- cgit v1.1 From f78f5b90c4ffa559e400c3919a02236101f29f3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Jun 2015 15:50:02 -0700 Subject: rcu: Rename rcu_lockdep_assert() to RCU_LOCKDEP_WARN() This commit renames rcu_lockdep_assert() to RCU_LOCKDEP_WARN() for consistency with the WARN() series of macros. This also requires inverting the sense of the conditional, which this commit also does. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Reviewed-by: Ingo Molnar --- Documentation/RCU/whatisRCU.txt | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++-- arch/x86/kernel/traps.c | 2 +- drivers/base/power/opp.c | 4 +-- include/linux/fdtable.h | 4 +-- include/linux/rcupdate.h | 63 ++++++++++++++++++++++++++-------------- kernel/cgroup.c | 4 +-- kernel/pid.c | 5 ++-- kernel/rcu/srcu.c | 10 +++---- kernel/rcu/tiny.c | 8 ++--- kernel/rcu/tree.c | 28 +++++++++--------- kernel/rcu/tree_plugin.h | 8 ++--- kernel/rcu/update.c | 4 +-- kernel/sched/core.c | 8 ++--- kernel/workqueue.c | 20 ++++++------- security/device_cgroup.c | 6 ++-- 16 files changed, 101 insertions(+), 81 deletions(-) diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 5746b0c..adc2184 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -883,7 +883,7 @@ All: lockdep-checked RCU-protected pointer access rcu_access_pointer rcu_dereference_raw - rcu_lockdep_assert + RCU_LOCKDEP_WARN rcu_sleep_check RCU_NONIDLE diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index df919ff..3d6b526 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -54,9 +54,9 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define rcu_dereference_check_mce(p) \ ({ \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious rcu_dereference_check_mce() usage"); \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious rcu_dereference_check_mce() usage"); \ smp_load_acquire(&(p)); \ }) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f579192..c5a5231 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -136,7 +136,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) preempt_count_add(HARDIRQ_OFFSET); /* This code is a bit fragile. Test it. */ - rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work"); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); return prev_state; } diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 677fb28..3b188f2 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -110,8 +110,8 @@ static DEFINE_MUTEX(dev_opp_list_lock); #define opp_rcu_lockdep_assert() \ do { \ - rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&dev_opp_list_lock), \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&dev_opp_list_lock), \ "Missing rcu_read_lock() or " \ "dev_opp_list_lock protection"); \ } while (0) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index fbb8874..674e3e2 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -86,8 +86,8 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) { - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&files->file_lock), + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && + !lockdep_is_held(&files->file_lock), "suspicious rcu_dereference_check() usage"); return __fcheck_files(files, fd); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 33ec16b..ff47651 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -536,6 +536,11 @@ static inline int rcu_read_lock_sched_held(void) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Deprecate rcu_lockdep_assert(): Use RCU_LOCKDEP_WARN() instead. */ +static inline void __attribute((deprecated)) deprecate_rcu_lockdep_assert(void) +{ +} + #ifdef CONFIG_PROVE_RCU /** @@ -546,17 +551,32 @@ static inline int rcu_read_lock_sched_held(void) #define rcu_lockdep_assert(c, s) \ do { \ static bool __section(.data.unlikely) __warned; \ + deprecate_rcu_lockdep_assert(); \ if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) +/** + * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met + * @c: condition to check + * @s: informative message + */ +#define RCU_LOCKDEP_WARN(c, s) \ + do { \ + static bool __section(.data.unlikely) __warned; \ + if (debug_lockdep_rcu_enabled() && !__warned && (c)) { \ + __warned = true; \ + lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ + } \ + } while (0) + #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) static inline void rcu_preempt_sleep_check(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal context switch in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), + "Illegal context switch in RCU read-side critical section"); } #else /* #ifdef CONFIG_PROVE_RCU */ static inline void rcu_preempt_sleep_check(void) @@ -567,15 +587,16 @@ static inline void rcu_preempt_sleep_check(void) #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), \ - "Illegal context switch in RCU-bh read-side critical section"); \ - rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), \ - "Illegal context switch in RCU-sched read-side critical section"); \ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ + "Illegal context switch in RCU-bh read-side critical section"); \ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ + "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) #else /* #ifdef CONFIG_PROVE_RCU */ -#define rcu_lockdep_assert(c, s) do { } while (0) +#define rcu_lockdep_assert(c, s) deprecate_rcu_lockdep_assert() +#define RCU_LOCKDEP_WARN(c, s) do { } while (0) #define rcu_sleep_check() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ @@ -606,13 +627,13 @@ static inline void rcu_preempt_sleep_check(void) ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ - rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \ + RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ - rcu_lockdep_assert(c, "suspicious rcu_dereference_protected() usage"); \ + RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) @@ -836,8 +857,8 @@ static inline void rcu_read_lock(void) __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); - rcu_lockdep_assert(rcu_is_watching(), - "rcu_read_lock() used illegally while idle"); + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "rcu_read_lock() used illegally while idle"); } /* @@ -887,8 +908,8 @@ static inline void rcu_read_lock(void) */ static inline void rcu_read_unlock(void) { - rcu_lockdep_assert(rcu_is_watching(), - "rcu_read_unlock() used illegally while idle"); + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "rcu_read_unlock() used illegally while idle"); __release(RCU); __rcu_read_unlock(); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ @@ -916,8 +937,8 @@ static inline void rcu_read_lock_bh(void) local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); - rcu_lockdep_assert(rcu_is_watching(), - "rcu_read_lock_bh() used illegally while idle"); + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "rcu_read_lock_bh() used illegally while idle"); } /* @@ -927,8 +948,8 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { - rcu_lockdep_assert(rcu_is_watching(), - "rcu_read_unlock_bh() used illegally while idle"); + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); @@ -952,8 +973,8 @@ static inline void rcu_read_lock_sched(void) preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); - rcu_lockdep_assert(rcu_is_watching(), - "rcu_read_lock_sched() used illegally while idle"); + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ @@ -970,8 +991,8 @@ static inline notrace void rcu_read_lock_sched_notrace(void) */ static inline void rcu_read_unlock_sched(void) { - rcu_lockdep_assert(rcu_is_watching(), - "rcu_read_unlock_sched() used illegally while idle"); + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f89d929..b89f316 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock); struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ - rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_mutex), \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* diff --git a/kernel/pid.c b/kernel/pid.c index 4fd07d5..ca36879 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - rcu_lockdep_assert(rcu_read_lock_held(), - "find_task_by_pid_ns() needs rcu_read_lock()" - " protection"); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index de35087..d3fcb2e 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -415,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) struct rcu_head *head = &rcu.head; bool done = false; - rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && - !lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); might_sleep(); init_completion(&rcu.completion); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c291bd6..d047105 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU read-side critical section"); cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb64d7e..0a73d26 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -649,12 +649,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user) * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ - rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); } /* @@ -3161,10 +3161,10 @@ static inline int rcu_blocking_is_gp(void) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU-sched read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) @@ -3188,10 +3188,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a983bc6..9e922f1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -538,10 +538,10 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; if (rcu_gp_is_expedited()) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a0a0dd0..47268fb 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -589,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - rcu_lockdep_assert(!rcu_scheduler_active, - "synchronize_rcu_tasks called too soon"); + RCU_LOCKDEP_WARN(rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ wait_rcu_gp(call_rcu_tasks); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78b4bad10..5e73c79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2200,8 +2200,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -2210,8 +2210,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c4f061..cb91c63 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include #define assert_rcu_or_pool_mutex() \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq_pool_mutex), \ + "sched RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq->mutex), \ + "sched RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex) || \ - lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq->mutex) && \ + !lockdep_is_held(&wq_pool_mutex), \ + "sched RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 188c1d2..73455089 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -400,9 +400,9 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, { bool match = false; - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&devcgroup_mutex), - "device_cgroup:verify_new_ex called without proper synchronization"); + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && + lockdep_is_held(&devcgroup_mutex), + "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { if (behavior == DEVCG_DEFAULT_ALLOW) { -- cgit v1.1 From a76a9a485d730024a7cbd76efcd9c6eb46003829 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 08:17:40 -0700 Subject: rcu: Fix backwards RCU_LOCKDEP_WARN() in synchronize_rcu_tasks() The RCU_LOCKDEP_WARN() in synchronize_rcu_tasks() triggers if the scheduler is active, which is backwards. This commit therefore negates the test. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 47268fb..7a0b3bc 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -589,7 +589,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active, + RCU_LOCKDEP_WARN(!rcu_scheduler_active, "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ -- cgit v1.1 From eb6d5b0a5c64192ec13914caef1b87a9aa520d66 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Jun 2015 09:56:31 -0700 Subject: rcu: Clarify CONFIG_RCU_EQS_DEBUG help text Reported-by: Geert Uytterhoeven Signed-off-by: Paul E. McKenney --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e2894b2..26da2ca 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1379,7 +1379,7 @@ config RCU_TRACE Say N if you are unsure. config RCU_EQS_DEBUG - bool "Use this when adding any sort of NO_HZ support to your arch" + bool "Provide debugging asserts for adding NO_HZ support to an arch" depends on DEBUG_KERNEL help This option provides consistency checks in RCU's handling of -- cgit v1.1 From 96a39bc434f94e213df43859fb8b01acda902634 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 1 Jul 2015 15:26:27 +0800 Subject: rcu: Update MAINTAINERS entry This commit updates Lai Jiangshan's email address because the old laijs@cn.fujitsu.com address will expire after July 10, 2015. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8133cef..f92d813 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8445,7 +8445,7 @@ M: "Paul E. McKenney" M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers -R: Lai Jiangshan +R: Lai Jiangshan L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git @@ -8472,7 +8472,7 @@ M: "Paul E. McKenney" M: Josh Triplett R: Steven Rostedt R: Mathieu Desnoyers -R: Lai Jiangshan +R: Lai Jiangshan L: linux-kernel@vger.kernel.org W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported @@ -9340,7 +9340,7 @@ F: include/linux/sl?b*.h F: mm/sl?b* SLEEPABLE READ-COPY UPDATE (SRCU) -M: Lai Jiangshan +M: Lai Jiangshan M: "Paul E. McKenney" M: Josh Triplett R: Steven Rostedt -- cgit v1.1 From 3ad81779ad35cd2e2ab34ba3f47f6f485696a105 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jul 2015 11:55:40 -0700 Subject: scripts: Make checkpatch.pl warn on expedited RCU grace periods The synchronize_rcu_expedited() and synchronize_sched_expedited() expedited-grace-period primitives induce OS jitter, which can degrade real-time response. This commit therefore adds a checkpatch.pl warning on any patch adding them. Note that this patch does not warn on synchronize_srcu_expedited() because it does not induce OS jitter, courtesy of its otherwise much-maligned read-side memory barriers. Signed-off-by: Paul E. McKenney Cc: Andy Whitcroft Cc: Joe Perches --- scripts/checkpatch.pl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 90e1edc..976e711 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5011,6 +5011,7 @@ sub process { "memory barrier without comment\n" . $herecurr); } } + # check for waitqueue_active without a comment. if ($line =~ /\bwaitqueue_active\s*\(/) { if (!ctx_has_comment($first_line, $linenr)) { @@ -5018,6 +5019,24 @@ sub process { "waitqueue_active without comment\n" . $herecurr); } } + +# Check for expedited grace periods that interrupt non-idle non-nohz +# online CPUs. These expedited can therefore degrade real-time response +# if used carelessly, and should be avoided where not absolutely +# needed. It is always OK to use synchronize_rcu_expedited() and +# synchronize_sched_expedited() at boot time (before real-time applications +# start) and in error situations where real-time response is compromised in +# any case. Note that synchronize_srcu_expedited() does -not- interrupt +# other CPUs, so don't warn on uses of synchronize_srcu_expedited(). +# Of course, nothing comes for free, and srcu_read_lock() and +# srcu_read_unlock() do contain full memory barriers in payment for +# synchronize_srcu_expedited() non-interruption properties. + if ($line =~ /\b(synchronize_rcu_expedited|synchronize_sched_expedited)\(/) { + WARN("EXPEDITED_RCU_GRACE_PERIOD", + "expedited RCU grace periods should be avoided where they can degrade real-time response\n" . $herecurr); + + } + # check of hardware specific defines if ($line =~ m@^.\s*\#\s*if.*\b(__i386__|__powerpc64__|__sun__|__s390x__)\b@ && $realfile !~ m@include/asm-@) { CHK("ARCH_DEFINES", -- cgit v1.1 From 9a54f98e341d09793247a6e598012edefb5ae7cb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Jul 2015 16:24:14 -0700 Subject: rcu: Don't disable CPU hotplug during OOM notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RCU's rcu_oom_notify() disables CPU hotplug in order to stabilize the list of online CPUs, which it traverses. However, this is completely pointless because smp_call_function_single() will quietly fail if invoked on an offline CPU. Because the count of requests is incremented in the rcu_oom_notify_cpu() function that is remotely invoked, everything works nicely even in the face of concurrent CPU-hotplug operations. Furthermore, in recent kernels, invoking get_online_cpus() from an OOM notifier can result in deadlock. This commit therefore removes the call to get_online_cpus() and put_online_cpus() from rcu_oom_notify(). Reported-by: Marcin Ślusarz Reported-by: David Rientjes Signed-off-by: Paul E. McKenney Acked-by: David Rientjes Tested-by: Marcin Ślusarz --- kernel/rcu/tree_plugin.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9e922f1..80a7c17 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1679,12 +1679,10 @@ static int rcu_oom_notify(struct notifier_block *self, */ atomic_set(&oom_callback_count, 1); - get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); cond_resched_rcu_qs(); } - put_online_cpus(); /* Unconditionally decrement: no need to wake ourselves up. */ atomic_dec(&oom_callback_count); -- cgit v1.1 From af859beaaba4d57883b08f4acbcb3974bc1f975e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 19 Jul 2015 15:13:40 -0700 Subject: rcu: Silence lockdep false positive for expedited grace periods In a CONFIG_PREEMPT=y kernel, synchronize_rcu_expedited() acquires the ->exp_funnel_mutex in rcu_preempt_state, then invokes synchronize_sched_expedited, which acquires the ->exp_funnel_mutex in rcu_sched_state. There can be no deadlock because rcu_preempt_state ->exp_funnel_mutex acquisition always precedes that of rcu_sched_state. But lockdep does not know that, so it gives false-positive splats. This commit therefore associates a separate lock_class_key structure with the rcu_sched_state structure's ->exp_funnel_mutex, allowing lockdep to see the lock ordering, avoiding the false positives. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 ++++++++++-- kernel/rcu/tree.h | 8 ++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3af0dee..439112e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -71,6 +71,7 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -4049,6 +4050,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4108,8 +4110,14 @@ static void __init rcu_init_one(struct rcu_state *rsp, INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + if (rsp == &rcu_sched_state) + lockdep_set_class_and_name( + &rnp->exp_funnel_mutex, + &rcu_exp_sched_class[i], exp_sched[i]); + else + lockdep_set_class_and_name( + &rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 80d974d..0412030 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,6 +70,8 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,6 +81,8 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,6 +93,8 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,6 +106,8 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -- cgit v1.1