summaryrefslogtreecommitdiffstats
path: root/kernel/rcu
Commit message (Collapse)AuthorAgeFilesLines
* treewide: Use array_size() in vmalloc()Kees Cook2018-06-121-2/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The vmalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: vmalloc(a * b) with: vmalloc(array_size(a, b)) as well as handling cases of: vmalloc(a * b * c) with: vmalloc(array3_size(a, b, c)) This does, however, attempt to ignore constant size factors like: vmalloc(4 * 1024) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( vmalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | vmalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( vmalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | vmalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | vmalloc( - sizeof(char) * (COUNT) + COUNT , ...) | vmalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | vmalloc( - sizeof(u8) * COUNT + COUNT , ...) | vmalloc( - sizeof(__u8) * COUNT + COUNT , ...) | vmalloc( - sizeof(char) * COUNT + COUNT , ...) | vmalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( vmalloc( - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | vmalloc( - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ vmalloc( - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( vmalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vmalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( vmalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | vmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | vmalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | vmalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | vmalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | vmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( vmalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( vmalloc(C1 * C2 * C3, ...) | vmalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression E1, E2; constant C1, C2; @@ ( vmalloc(C1 * C2, ...) | vmalloc( - E1 * E2 + array_size(E1, E2) , ...) ) Signed-off-by: Kees Cook <keescook@chromium.org>
* rcu/x86: Provide early rcu_cpu_starting() callbackPeter Zijlstra2018-05-221-0/+9
| | | | | | | | | | | | | | | | The x86/mtrr code does horrific things because hardware. It uses stop_machine_from_inactive_cpu(), which does a wakeup (of the stopper thread on another CPU), which uses RCU, all before the CPU is onlined. RCU complains about this, because wakeups use RCU and RCU does (rightfully) not consider offline CPUs for grace-periods. Fix this by initializing RCU way early in the MTRR case. Tested-by: Mike Galbraith <efault@gmx.de> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> [ paulmck: Add !SMP support, per 0day Test Robot report. ]
*---. Merge branches 'exp.2018.05.15a', 'fixes.2018.05.15a', 'lock.2018.05.15a' ↵Paul E. McKenney2018-05-1512-376/+250
|\ \ \ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | and 'torture.2018.05.15a' into HEAD exp.2018.05.15a: Parallelize expedited grace-period initialization. fixes.2018.05.15a: Miscellaneous fixes. lock.2018.05.15a: Decrease lock contention on root rcu_node structure, which is a step towards merging RCU flavors. torture.2018.05.15a: Torture-test updates.
| | | * rcutorture: Print end-of-test statePaul E. McKenney2018-05-151-0/+8
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This commit adds end-of-test state printout to help check whether RCU shut down nicely. Note that this printout only helps for flavors of RCU that are not used much by the kernel. In particular, for normal RCU having a grace period in progress is expected behavior. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Drop early GP request check from rcu_gp_kthread()Paul E. McKenney2018-05-151-6/+0
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Now that grace-period requests use funnel locking and now that they set ->gp_flags to RCU_GP_FLAG_INIT even when the RCU grace-period kthread has not yet started, rcu_gp_kthread() no longer needs to check need_any_future_gp() at startup time. This commit therefore removes this check. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Simplify and inline cpu_needs_another_gp()Paul E. McKenney2018-05-153-57/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Now that RCU no longer relies on failsafe checks, cpu_needs_another_gp() can be greatly simplified. This simplification eliminates the last call to rcu_future_needs_gp() and to rcu_segcblist_future_gp_needed(), both of which which can then be eliminated. And then, because cpu_needs_another_gp() is called only from __rcu_pending(), it can be inlined and eliminated. This commit carries out the simplification, inlining, and elimination called out above. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: The rcu_gp_cleanup() function does not need cpu_needs_another_gp()Paul E. McKenney2018-05-151-4/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | All of the cpu_needs_another_gp() function's checks (except for newly arrived callbacks) have been subsumed into the rcu_gp_cleanup() function's scan of the rcu_node tree. This commit therefore drops the call to cpu_needs_another_gp(). The check for newly arrived callbacks is supplied by rcu_accelerate_cbs(). Any needed advancing (as in the earlier rcu_advance_cbs() call) will be supplied when the corresponding CPU becomes aware of the end of the now-completed grace period. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Make rcu_start_this_gp() check for out-of-range requestsPaul E. McKenney2018-05-151-0/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | If rcu_start_this_gp() is invoked with a requested grace period more than three in the future, then either the ->need_future_gp[] array needs to be bigger or the caller needs to be repaired. This commit therefore adds a WARN_ON_ONCE() checking for this condition. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Add funnel locking to rcu_start_this_gp()Paul E. McKenney2018-05-151-57/+35
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_start_this_gp() function had a simple form of funnel locking that used only the leaves and root of the rcu_node tree, which is fine for systems with only a few hundred CPUs, but sub-optimal for systems having thousands of CPUs. This commit therefore adds full-tree funnel locking. This variant of funnel locking is unusual in the following ways: 1. The leaf-level rcu_node structure's ->lock is held throughout. Other funnel-locking implementations drop the leaf-level lock before progressing to the next level of the tree. 2. Funnel locking can be started at the root, which is convenient for code that already holds the root rcu_node structure's ->lock. Other funnel-locking implementations start at the leaves. 3. If an rcu_node structure other than the initial one believes that a grace period is in progress, it is not necessary to go further up the tree. This is because grace-period cleanup scans the full tree, so that marking the need for a subsequent grace period anywhere in the tree suffices -- but only if a grace period is currently in progress. 4. It is possible that the RCU grace-period kthread has not yet started, and this case must be handled appropriately. However, the general approach of using a tree to control lock contention is still in place. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Make rcu_start_future_gp() caller select grace periodPaul E. McKenney2018-05-152-35/+27
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_accelerate_cbs() function selects a grace-period target, which it uses to have rcu_segcblist_accelerate() assign numbers to recently queued callbacks. Then it invokes rcu_start_future_gp(), which selects a grace-period target again, which is a bit pointless. This commit therefore changes rcu_start_future_gp() to take the grace-period target as a parameter, thus avoiding double selection. This commit also changes the name of rcu_start_future_gp() to rcu_start_this_gp() to reflect this change in functionality, and also makes a similar change to the name of trace_rcu_future_gp(). Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Inline rcu_start_gp_advanced() into rcu_start_future_gp()Paul E. McKenney2018-05-151-44/+12
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_start_gp_advanced() is invoked only from rcu_start_future_gp() and much of its code is redundant when invoked from that context. This commit therefore inlines rcu_start_gp_advanced() into rcu_start_future_gp(), then removes rcu_start_gp_advanced(). Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Clear request other than RCU_GP_FLAG_INIT at GP endPaul E. McKenney2018-05-151-0/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Once the grace period has ended, any RCU_GP_FLAG_FQS requests are irrelevant: The grace period has ended, so there is no longer any point in forcing quiescent states in order to try to make it end sooner. This commit therefore causes rcu_gp_cleanup() to clear any bits other than RCU_GP_FLAG_INIT from ->gp_flags at the end of the grace period. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Cleanup, don't put ->completed into an intPaul E. McKenney2018-05-151-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | It is true that currently only the low-order two bits are used, so there should be no problem given modern machines and compilers, but good hygiene and maintainability dictates use of an unsigned long instead of an int. This commit therefore makes this change. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Switch __rcu_process_callbacks() to rcu_accelerate_cbs()Paul E. McKenney2018-05-151-38/+15
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The __rcu_process_callbacks() function currently checks to see if the current CPU needs a grace period and also if there is any other reason to kick off a new grace period. This is one of the fail-safe checks that has been rendered unnecessary by the changes that increase the accuracy of rcu_gp_cleanup()'s estimate as to whether another grace period is required. Because this particular fail-safe involved acquiring the root rcu_node structure's ->lock, which has seen excessive contention in real life, this fail-safe needs to go. However, one check must remain, namely the check for newly arrived RCU callbacks that have not yet been associated with a grace period. One might hope that the checks in __note_gp_changes(), which is invoked indirectly from rcu_check_quiescent_state(), would suffice, but this function won't be invoked at all if RCU is idle. It is therefore necessary to replace the fail-safe checks with a simpler check for newly arrived callbacks during an RCU idle period, which is exactly what this commit does. This change removes the final call to rcu_start_gp(), so this function is removed as well. Note that lockless use of cpu_needs_another_gp() is racy, but that these races are harmless in this case. If RCU really is idle, the values will not change, so the return value from cpu_needs_another_gp() will be correct. If RCU is not idle, the resulting redundant call to rcu_accelerate_cbs() will be harmless, and might even have the benefit of reducing grace-period latency a bit. This commit also moves interrupt disabling into the "if" statement to improve real-time response a bit. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Avoid __call_rcu_core() root rcu_node ->lock acquisitionPaul E. McKenney2018-05-151-4/+4
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When __call_rcu_core() notices excessive numbers of callbacks pending on the current CPU, we know that at least one of them is not yet classified, namely the one that was just now queued. Therefore, it is not necessary to invoke rcu_start_gp() and thus not necessary to acquire the root rcu_node structure's ->lock. This commit therefore replaces the rcu_start_gp() with rcu_accelerate_cbs(), thus replacing an acquisition of the root rcu_node structure's ->lock with that of this CPU's leaf rcu_node structure. This decreases contention on the root rcu_node structure's ->lock. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Make rcu_migrate_callbacks wake GP kthread when neededPaul E. McKenney2018-05-151-2/+6
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_migrate_callbacks() function invokes rcu_advance_cbs() twice, ignoring the return value. This is OK at pressent because of failsafe code that does the wakeup when needed. However, this failsafe code acquires the root rcu_node structure's lock frequently, while rcu_migrate_callbacks() does so only once per CPU-offline operation. This commit therefore makes rcu_migrate_callbacks() wake up the RCU GP kthread when either call to rcu_advance_cbs() returns true, thus removing need for the failsafe code. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Convert ->need_future_gp[] array to booleanPaul E. McKenney2018-05-152-4/+4
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | There is no longer any need for ->need_future_gp[] to count the number of requests for future grace periods, so this commit converts the additions to assignments to "true" and reduces the size of each element to one byte. While we are in the area, fix an obsolete comment. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Make rcu_future_needs_gp() check all ->need_future_gps[] elementsPaul E. McKenney2018-05-152-2/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently, the rcu_future_needs_gp() function checks only the current element of the ->need_future_gps[] array, which might miss elements that were offset from the expected element, for example, due to races with the start or the end of a grace period. This commit therefore makes rcu_future_needs_gp() use the need_any_future_gp() macro to check all of the elements of this array. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Avoid losing ->need_future_gp[] values due to GP start/end racesPaul E. McKenney2018-05-151-2/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_cbs_completed() function provides the value of ->completed at which new callbacks can safely be invoked. This is recorded in two-element ->need_future_gp[] arrays in the rcu_node structure, and the elements of these arrays corresponding to the just-completed grace period are zeroed at the end of that grace period. However, the rcu_cbs_completed() function can return the current ->completed value plus either one or two, so it is possible for the corresponding ->need_future_gp[] entry to be cleared just after it was set, thus losing a request for a future grace period. This commit avoids this race by expanding ->need_future_gp[] to four elements. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Make rcu_gp_cleanup() more accurately predict need for new GPPaul E. McKenney2018-05-153-24/+10
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently, rcu_gp_cleanup() scans the rcu_node tree in order to reset state to reflect the end of the grace period. It also checks to see whether a new grace period is needed, but in a number of cases, rather than directly cause the new grace period to be immediately started, it instead leaves the grace-period-needed state where various fail-safes can find it. This works fine, but results in higher contention on the root rcu_node structure's ->lock, which is undesirable, and contention on that lock has recently become noticeable. This commit therefore makes rcu_gp_cleanup() immediately start a new grace period if there is any need for one. It is quite possible that it will later be necessary to throttle the grace-period rate, but that can be dealt with when and if. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Make rcu_gp_kthread() check for early-boot activityPaul E. McKenney2018-05-151-0/+6
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_gp_kthread() function immediately sleeps waiting to be notified of the need for a new grace period, which currently works because there are a number of code sequences that will provide the needed wakeup later. However, some of these code sequences need to acquire the root rcu_node structure's ->lock, and contention on that lock has started manifesting. This commit therefore makes rcu_gp_kthread() check for early-boot activity when it starts up, omitting the initial sleep in that case. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Add accessor macros for the ->need_future_gp[] arrayPaul E. McKenney2018-05-153-10/+23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Accessors for the ->need_future_gp[] array are currently open-coded, which makes them difficult to change. To improve maintainability, this commit adds need_future_gp_mask() to compute the indexing mask from the array size, need_future_gp_element() to access the element corresponding to the specified grace-period number, and need_any_future_gp() to determine if any future grace period is needed. This commit also applies need_future_gp_element() to existing open-coded single-element accesses. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Make rcu_start_future_gp()'s grace-period check more precisePaul E. McKenney2018-05-151-13/+5
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_start_future_gp() function uses a sloppy check for a grace period being in progress, which works today because there are a number of code sequences that resolve the resulting races. However, some of these race-resolution code sequences must acquire the root rcu_node structure's ->lock, and contention on that lock has started manifesting. This commit therefore makes rcu_start_future_gp() check more precise, eliminating the sloppy lockless check of the rcu_state structure's ->gpnum and ->completed fields. The effect is that rcu_start_future_gp() will sometimes unnecessarily attempt to start a new grace period, but this overhead will be reduced later using funnel locking. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| | * | rcu: Improve non-root rcu_cbs_completed() accuracyPaul E. McKenney2018-05-151-0/+15
| | |/ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When rcu_cbs_completed() is invoked on a non-root rcu_node structure, it unconditionally assumes that two grace periods must complete before the callbacks at hand can be invoked. This is overly conservative because if that non-root rcu_node structure believes that no grace period is in progress, and if the corresponding rcu_state structure's ->gpnum field has not yet been incremented, then these callbacks may safely be invoked after only one grace period has completed. This change is required to permit grace-period start requests to use funnel locking, which is in turn permitted to reduce root rcu_node ->lock contention, which has been observed by Nick Piggin. Furthermore, such contention will likely be increased by the merging of RCU-bh, RCU-preempt, and RCU-sched, so it makes sense to take steps to decrease it. This commit therefore improves the accuracy of rcu_cbs_completed() when invoked on a non-root rcu_node structure as described above. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Add leaf-node macrosPaul E. McKenney2018-05-153-7/+12
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This commit adds rcu_first_leaf_node() that returns a pointer to the first leaf rcu_node structure in the specified RCU flavor and an rcu_is_leaf_node() that returns true iff the specified rcu_node structure is a leaf. This commit also uses these macros where appropriate. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | srcu: Add cleanup_srcu_struct_quiesced()Paul E. McKenney2018-05-153-17/+29
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The current cleanup_srcu_struct() flushes work, which prevents it from being invoked from some workqueue contexts, as well as from atomic (non-blocking) contexts. This patch therefore introduced a cleanup_srcu_struct_quiesced(), which can be invoked only after all activity on the specified srcu_struct has completed. This restriction allows cleanup_srcu_struct_quiesced() to be invoked from workqueue contexts as well as from atomic contexts. Suggested-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nitzan Carmi <nitzanc@mellanox.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Declare rcu_eqs_special_set() in public headerYury Norov2018-05-151-1/+0
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Because rcu_eqs_special_set() is declared only in internal header kernel/rcu/tree.h and stubbed in include/linux/rcutiny.h, it is inaccessible outside of the RCU implementation. This patch therefore moves the rcu_eqs_special_set() declaration to include/linux/rcutree.h, which allows it to be used in non-rcu kernel code. Signed-off-by: Yury Norov <ynorov@caviumnetworks.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Update rcu_bind_gp_kthread() header commentPaul E. McKenney2018-05-151-2/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The header comment for rcu_bind_gp_kthread() refers to sysidle, which is no longer with us. However, it is still important to bind RCU's grace-period kthreads to the housekeeping CPU(s), so rather than remove rcu_bind_gp_kthread(), this commit updates the comment. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Move __rcu_read_lock() and __rcu_read_unlock() to tree_plugin.hPaul E. McKenney2018-05-152-48/+44
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The __rcu_read_lock() and __rcu_read_unlock() functions were moved to kernel/rcu/update.c in order to implement tiny preemptible RCU. However, tiny preemptible RCU was removed from the kernel a long time ago, so this commit belatedly moves them back into the only remaining preemptible-RCU code. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Rename cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs()Paul E. McKenney2018-05-154-14/+14
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor of cond_resched()") substituted cond_resched() for the earlier call to cond_resched_rcu_qs(). However, the new-age cond_resched() does not do anything to help RCU-tasks grace periods because (1) RCU-tasks is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a complete no-op when preemption is enabled. This situation results in hangs when running the trace benchmarks. A number of potential fixes were discussed on LKML (https://lkml.kernel.org/r/20180224151240.0d63a059@vmware.local.home), including making cond_resched() not be a no-op; making cond_resched() not be a no-op, but only when running tracing benchmarks; reverting the aforementioned commit (which works because cond_resched_rcu_qs() does provide an RCU-tasks quiescent state; and adding a call to the scheduler/RCU rcu_note_voluntary_context_switch() function. All were deemed unsatisfactory, either due to added cond_resched() overhead or due to magic functions inviting cargo culting. This commit renames cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs(), which provides a clear hint as to what this function is doing and why and where it should be used, and then replaces the call to cond_resched() with cond_resched_tasks_rcu_qs() in the trace benchmark's benchmark_event_kthread() function. Reported-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Remove deprecated RCU debugfs tracing codeByungchul Park2018-05-152-12/+5
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Commit ae91aa0adb14 ("rcu: Remove debugfs tracing") removed the RCU debugfs tracing code, but did not remove the no-longer used ->exp_workdone{0,1,2,3} fields in the srcu_data structure. This commit therefore removes these fields along with the code that uselessly updates them. Signed-off-by: Byungchul Park <byungchul.park@lge.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Call wake_nocb_leader_defer() with 'FORCE' when nocb_q_count is highByungchul Park2018-05-151-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | If an excessive number of callbacks have been queued, but the NOCB leader kthread's wakeup must be deferred, then we should wake up the leader unconditionally once it is safe to do so. This was handled correctly in commit fbce7497ee ("rcu: Parallelize and economize NOCB kthread wakeups"), but then commit 8be6e1b15c ("rcu: Use timer as backstop for NOCB deferred wakeups") passed RCU_NOCB_WAKE instead of the correct RCU_NOCB_WAKE_FORCE to wake_nocb_leader_defer(). As an interesting aside, RCU_NOCB_WAKE_FORCE is never passed to anything, which should have been taken as a hint. ;-) This commit therefore passes RCU_NOCB_WAKE_FORCE instead of RCU_NOCB_WAKE to wake_nocb_leader_defer() when a callback is queued onto a NOCB CPU that already has an excessive number of callbacks pending. Signed-off-by: Byungchul Park <byungchul.park@lge.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Don't allocate rcu_nocb_mask if no one needs itPaul E. McKenney2018-05-151-2/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Commit 44c65ff2e3b0 ("rcu: Eliminate NOCBs CPU-state Kconfig options") made allocation of rcu_nocb_mask depend only on the rcu_nocbs=, nohz_full=, or isolcpus= kernel boot parameters. However, it failed to change the initial value of rcu_init_nohz()'s local variable need_rcu_nocb_mask to false, which can result in useless allocation of an all-zero rcu_nocb_mask. This commit therefore fixes this bug by changing the initial value of need_rcu_nocb_mask to false. While we are in the area, also correct the error message that is printed when someone specifies that can-never-exist CPUs should be NOCBs CPUs. Reported-by: Byungchul Park <byungchul.park@lge.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Byungchul Park <byungchul.park@lge.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
| * | rcu: Inline rcu_preempt_do_callback() into its sole callerByungchul Park2018-05-152-11/+1
| |/ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_preempt_do_callbacks() function was introduced in commit 09223371dea(rcu: Use softirq to address performance regression), where it was necessary to handle kernel builds both containing and not containing RCU-preempt. Since then, various changes (most notably f8b7fc6b51 ("rcu: use softirq instead of kthreads except when RCU_BOOST=y")) have resulted in this function being invoked only from rcu_kthread_do_work(), which is present only in kernels containing RCU-preempt, which in turn means that the rcu_preempt_do_callbacks() function is no longer needed. This commit therefore inlines rcu_preempt_do_callbacks() into its sole remaining caller and also removes the rcu_state_p and rcu_data_p indirection for added clarity. Signed-off-by: Byungchul Park <byungchul.park@lge.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> [ paulmck: Remove the rcu_state_p and rcu_data_p indirection. ] Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
* | rcu: exp: Protect all sync_rcu_preempt_exp_done() with rcu_node lockBoqun Feng2018-05-151-3/+25
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently some callsites of sync_rcu_preempt_exp_done() are not called with the corresponding rcu_node's ->lock held, which could introduces bugs as per Paul: o CPU 0 in sync_rcu_preempt_exp_done() reads ->exp_tasks and sees that it is NULL. o CPU 1 blocks within an RCU read-side critical section, so it enqueues the task and points ->exp_tasks at it and clears CPU 1's bit in ->expmask. o All other CPUs clear their bits in ->expmask. o CPU 0 reads ->expmask, sees that it is zero, so incorrectly concludes that all quiescent states have completed, despite the fact that ->exp_tasks is non-NULL. To fix this, sync_rcu_preempt_exp_unlocked() is introduced to replace lockless callsites of sync_rcu_preempt_exp_done(). Further, a lockdep annotation is added into sync_rcu_preempt_exp_done() to prevent mis-use in the future. Signed-off-by: Boqun Feng <boqun.feng@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
* | rcu: exp: Fix "must hold exp_mutex" comments for QS reporting functionsBoqun Feng2018-05-151-7/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Since commit d9a3da0699b2 ("rcu: Add expedited grace-period support for preemptible RCU"), there are comments for some funtions in rcu_report_exp_rnp()'s call-chain saying that exp_mutex or its predecessors needs to be held. However, exp_mutex and its predecessors were used only to synchronize between GPs, and it is clear that all variables visited by those functions are under the protection of rcu_node's ->lock. Moreover, those functions are currently called without held exp_mutex, and seems that doesn't introduce any trouble. So this patch fixes this problem by updating the comments to match the current code. Signed-off-by: Boqun Feng <boqun.feng@gmail.com> Fixes: d9a3da0699b2 ("rcu: Add expedited grace-period support for preemptible RCU") Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
* | rcu: Parallelize expedited grace-period initializationPaul E. McKenney2018-05-154-78/+120
|/ | | | | | | | | | | | | | | The latency of RCU expedited grace periods grows with increasing numbers of CPUs, eventually failing to be all that expedited. Much of the growth in latency is in the initialization phase, so this commit uses workqueues to carry out this initialization concurrently on a rcu_node-by-rcu_node basis. This change makes use of a new rcu_par_gp_wq because flushing a work item from another work item running from the same workqueue can result in deadlock. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>
*-. Merge branches 'fixes.2018.02.23a', 'srcu.2018.02.20a' and ↵Paul E. McKenney2018-02-233-40/+70
|\ \ | | | | | | | | | | | | | | | | | | | | | 'torture.2018.02.20a' into HEAD fixes.2018.02.23a: Miscellaneous fixes srcu.2018.02.20a: SRCU updates torture.2018.02.20a: Torture-test updates
| | * torture: Provide more sensible nreader/nwriter defaults for rcuperfPaul E. McKenney2018-02-201-1/+20
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The default values for nreader and nwriter are apparently not all that user-friendly, resulting in people doing scalability tests that ran all runs at large scale. This commit therefore makes both the nreaders and nwriters module default to the number of CPUs, and adds a comment to rcuperf.c stating that the number of CPUs should be specified using the nr_cpus kernel boot parameter. This commit also eliminates the redundant rcuperf scripting specification of default values for these parameters. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| | * rcutorture: Record which grace-period primitives are testedPaul E. McKenney2018-02-201-8/+16
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The rcu_torture_writer() function adapts to requested testing from module parameters as well as the function pointers in the structure referenced by cur_ops. However, as long as the module parameters do not conflict with the function pointers, this adaptation is silent. This silence can result in confusion as to exactly what was tested, which could in turn result in untested RCU code making its way into mainline. This commit therefore makes rcu_torture_writer() announce exactly which portions of RCU's API it ends up testing. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| | * rcutorture: Re-enable testing of dynamic expeditingPaul E. McKenney2018-02-201-6/+9
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | During boot, normal grace periods are processed as expedited. When rcutorture is built into the kernel, it starts during boot and thus detects that normal grace periods are unconditionally expedited. Therefore, rcutorture concludes that there is no point in trying to dynamically enable expediting, do it disables this aspect of testing, which is a bit of an overreaction to the temporary boot-time expediting. This commit therefore rechecks forced expediting throughout the test, enabling dynamic expediting if normal grace periods are processed normally at any point. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| | * rcutorture: Avoid fake-writer use of undefined primitivesPaul E. McKenney2018-02-201-4/+4
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently the rcu_torture_fakewriter() function invokes cur_ops->sync() and cur_ops->exp_sync() without first checking to see if they are in fact non-NULL. This results in kernel NULL pointer dereferences when testing RCU implementations that choose not to provide the full set of primitives. Given that it is perfectly reasonable to have specialized RCU implementations that provide only a subset of the RCU API, this is a bug in rcutorture. This commit therefore makes rcu_torture_fakewriter() check function pointers before invoking them, thus allowing it to test subsetted RCU implementations. Reported-by: Lihao Liang <lianglihao@huawei.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| | * rcutorture: Abstract function and module namesPaul E. McKenney2018-02-201-8/+8
| | | | | | | | | | | | | | | | | | | | | This commit moves to __func__ for function names and for KBUILD_MODNAME for module names, all in the name of better resilience to change. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| | * rcutorture: Replace multi-instance kzalloc() with kcalloc()Paul E. McKenney2018-02-201-5/+4
| | | | | | | | | | | | | | | | | | | | | This commit replaces array-allocation calls to kzalloc() with equivalent calls to kcalloc(). Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| * | rcu: Remove SRCU throttlingPaul E. McKenney2018-02-201-3/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The code in srcu_gp_end() inserts a delay every 0x3ff grace periods in order to prevent SRCU grace-period work from consuming an entire CPU when there is a long sequence of expedited SRCU grace-period requests. However, all of SRCU's grace-period work is carried out in workqueues, which are in turn within kthreads, which are automatically throttled as needed by the scheduler. In particular, if there is plenty of idle time, there is no point in throttling. This commit therefore removes the expedited SRCU grace-period throttling. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| * | srcu: Remove dead code in srcu_gp_end()Byungchul Park2018-02-201-2/+0
| | | | | | | | | | | | | | | | | | | | | | | | | | | Of course, compilers will optimize out a dead code. Anyway, remove any dead code for better readibility. Signed-off-by: Byungchul Park <byungchul.park@lge.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| * | srcu: Reduce scans of srcu_data in counter wrap checkIldar Ismagilov2018-02-201-2/+4
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Currently, given a multi-level srcu_node tree, SRCU can scan the full set of srcu_data structures at each level when cleaning up after a grace period. This, though harmless otherwise, represents pointless overhead. This commit therefore eliminates this overhead by scanning the srcu_data structures only when traversing the leaf srcu_node structures. Signed-off-by: Ildar Ismagilov <devix84@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| * | srcu: Prevent sdp->srcu_gp_seq_needed_exp counter wrapIldar Ismagilov2018-02-201-0/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | SRCU checks each srcu_data structure's grace-period number for counter wrap four times per cycle by default. This frequency guarantees that normal comparisons will detect potential wrap. However, the expedited grace-period number is not checked. The consquences are not too horrible (a failure to expedite a grace period when requested), but it would be good to avoid such things. This commit therefore adds this check to the expedited grace-period number. Signed-off-by: Ildar Ismagilov <devix84@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
| * | srcu: Abstract function namePaul E. McKenney2018-02-201-1/+1
| |/ | | | | | | | | | | | | This commit moves to __func__ for function names in the name of better resilience to change. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
* | rcu: Create RCU-specific workqueues with rescuersPaul E. McKenney2018-02-234-6/+11
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | RCU's expedited grace periods can participate in out-of-memory deadlocks due to all available system_wq kthreads being blocked and there not being memory available to create more. This commit prevents such deadlocks by allocating an RCU-specific workqueue_struct at early boot time, and providing it with a rescuer to ensure forward progress. This uses the shiny new init_rescuer() function provided by Tejun (but indirectly). This commit also causes SRCU to use this new RCU-specific workqueue_struct. Note that SRCU's use of workqueues never blocks them waiting for readers, so this should be safe from a forward-progress viewpoint. Note that this moves SRCU from system_power_efficient_wq to a normal workqueue. In the unlikely event that this results in measurable degradation, a separate power-efficient workqueue will be creates for SRCU. Reported-by: Prateek Sood <prsood@codeaurora.org> Reported-by: Tejun Heo <tj@kernel.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Tejun Heo <tj@kernel.org>
OpenPOWER on IntegriCloud