diff options
65 files changed, 2827 insertions, 523 deletions
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt new file mode 100644 index 0000000..5550bfdc --- /dev/null +++ b/Documentation/atomic_bitops.txt @@ -0,0 +1,66 @@ + +On atomic bitops. + + +While our bitmap_{}() functions are non-atomic, we have a number of operations +operating on single bits in a bitmap that are atomic. + + +API +--- + +The single bit operations are: + +Non-RMW ops: + + test_bit() + +RMW atomic operations without return value: + + {set,clear,change}_bit() + clear_bit_unlock() + +RMW atomic operations with return value: + + test_and_{set,clear,change}_bit() + test_and_set_bit_lock() + +Barriers: + + smp_mb__{before,after}_atomic() + + +All RMW atomic operations have a '__' prefixed variant which is non-atomic. + + +SEMANTICS +--------- + +Non-atomic ops: + +In particular __clear_bit_unlock() suffers the same issue as atomic_set(), +which is why the generic version maps to clear_bit_unlock(), see atomic_t.txt. + + +RMW ops: + +The test_and_{}_bit() operations return the original value of the bit. + + +ORDERING +-------- + +Like with atomic_t, the rule of thumb is: + + - non-RMW operations are unordered; + + - RMW operations that have no return value are unordered; + + - RMW operations that have a return value are fully ordered. + +Except for test_and_set_bit_lock() which has ACQUIRE semantics and +clear_bit_unlock() which has RELEASE semantics. + +Since a platform only has a single means of achieving atomic operations +the same barriers as for atomic_t are used, see atomic_t.txt. + diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt new file mode 100644 index 0000000..eee1271 --- /dev/null +++ b/Documentation/atomic_t.txt @@ -0,0 +1,200 @@ + +On atomic types (atomic_t atomic64_t and atomic_long_t). + +The atomic type provides an interface to the architecture's means of atomic +RMW operations between CPUs (atomic operations on MMIO are not supported and +can lead to fatal traps on some platforms). + +API +--- + +The 'full' API consists of (atomic64_ and atomic_long_ prefixes omitted for +brevity): + +Non-RMW ops: + + atomic_read(), atomic_set() + atomic_read_acquire(), atomic_set_release() + + +RMW atomic operations: + +Arithmetic: + + atomic_{add,sub,inc,dec}() + atomic_{add,sub,inc,dec}_return{,_relaxed,_acquire,_release}() + atomic_fetch_{add,sub,inc,dec}{,_relaxed,_acquire,_release}() + + +Bitwise: + + atomic_{and,or,xor,andnot}() + atomic_fetch_{and,or,xor,andnot}{,_relaxed,_acquire,_release}() + + +Swap: + + atomic_xchg{,_relaxed,_acquire,_release}() + atomic_cmpxchg{,_relaxed,_acquire,_release}() + atomic_try_cmpxchg{,_relaxed,_acquire,_release}() + + +Reference count (but please see refcount_t): + + atomic_add_unless(), atomic_inc_not_zero() + atomic_sub_and_test(), atomic_dec_and_test() + + +Misc: + + atomic_inc_and_test(), atomic_add_negative() + atomic_dec_unless_positive(), atomic_inc_unless_negative() + + +Barriers: + + smp_mb__{before,after}_atomic() + + + +SEMANTICS +--------- + +Non-RMW ops: + +The non-RMW ops are (typically) regular LOADs and STOREs and are canonically +implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and +smp_store_release() respectively. + +The one detail to this is that atomic_set{}() should be observable to the RMW +ops. That is: + + C atomic-set + + { + atomic_set(v, 1); + } + + P1(atomic_t *v) + { + atomic_add_unless(v, 1, 0); + } + + P2(atomic_t *v) + { + atomic_set(v, 0); + } + + exists + (v=2) + +In this case we would expect the atomic_set() from CPU1 to either happen +before the atomic_add_unless(), in which case that latter one would no-op, or +_after_ in which case we'd overwrite its result. In no case is "2" a valid +outcome. + +This is typically true on 'normal' platforms, where a regular competing STORE +will invalidate a LL/SC or fail a CMPXCHG. + +The obvious case where this is not so is when we need to implement atomic ops +with a lock: + + CPU0 CPU1 + + atomic_add_unless(v, 1, 0); + lock(); + ret = READ_ONCE(v->counter); // == 1 + atomic_set(v, 0); + if (ret != u) WRITE_ONCE(v->counter, 0); + WRITE_ONCE(v->counter, ret + 1); + unlock(); + +the typical solution is to then implement atomic_set{}() with atomic_xchg(). + + +RMW ops: + +These come in various forms: + + - plain operations without return value: atomic_{}() + + - operations which return the modified value: atomic_{}_return() + + these are limited to the arithmetic operations because those are + reversible. Bitops are irreversible and therefore the modified value + is of dubious utility. + + - operations which return the original value: atomic_fetch_{}() + + - swap operations: xchg(), cmpxchg() and try_cmpxchg() + + - misc; the special purpose operations that are commonly used and would, + given the interface, normally be implemented using (try_)cmpxchg loops but + are time critical and can, (typically) on LL/SC architectures, be more + efficiently implemented. + +All these operations are SMP atomic; that is, the operations (for a single +atomic variable) can be fully ordered and no intermediate state is lost or +visible. + + +ORDERING (go read memory-barriers.txt first) +-------- + +The rule of thumb: + + - non-RMW operations are unordered; + + - RMW operations that have no return value are unordered; + + - RMW operations that have a return value are fully ordered; + + - RMW operations that are conditional are unordered on FAILURE, + otherwise the above rules apply. + +Except of course when an operation has an explicit ordering like: + + {}_relaxed: unordered + {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE + {}_release: the W of the RMW (or atomic_set) is a RELEASE + +Where 'unordered' is against other memory locations. Address dependencies are +not defeated. + +Fully ordered primitives are ordered against everything prior and everything +subsequent. Therefore a fully ordered primitive is like having an smp_mb() +before and an smp_mb() after the primitive. + + +The barriers: + + smp_mb__{before,after}_atomic() + +only apply to the RMW ops and can be used to augment/upgrade the ordering +inherent to the used atomic op. These barriers provide a full smp_mb(). + +These helper barriers exist because architectures have varying implicit +ordering on their SMP atomic primitives. For example our TSO architectures +provide full ordered atomics and these barriers are no-ops. + +Thus: + + atomic_fetch_add(); + +is equivalent to: + + smp_mb__before_atomic(); + atomic_fetch_add_relaxed(); + smp_mb__after_atomic(); + +However the atomic_fetch_add() might be implemented more efficiently. + +Further, while something like: + + smp_mb__before_atomic(); + atomic_dec(&X); + +is a 'typical' RELEASE pattern, the barrier is strictly stronger than +a RELEASE. Similarly for something like: + + diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt new file mode 100644 index 0000000..bdf1423 --- /dev/null +++ b/Documentation/locking/crossrelease.txt @@ -0,0 +1,874 @@ +Crossrelease +============ + +Started by Byungchul Park <byungchul.park@lge.com> + +Contents: + + (*) Background + + - What causes deadlock + - How lockdep works + + (*) Limitation + + - Limit lockdep + - Pros from the limitation + - Cons from the limitation + - Relax the limitation + + (*) Crossrelease + + - Introduce crossrelease + - Introduce commit + + (*) Implementation + + - Data structures + - How crossrelease works + + (*) Optimizations + + - Avoid duplication + - Lockless for hot paths + + (*) APPENDIX A: What lockdep does to work aggresively + + (*) APPENDIX B: How to avoid adding false dependencies + + +========== +Background +========== + +What causes deadlock +-------------------- + +A deadlock occurs when a context is waiting for an event to happen, +which is impossible because another (or the) context who can trigger the +event is also waiting for another (or the) event to happen, which is +also impossible due to the same reason. + +For example: + + A context going to trigger event C is waiting for event A to happen. + A context going to trigger event A is waiting for event B to happen. + A context going to trigger event B is waiting for event C to happen. + +A deadlock occurs when these three wait operations run at the same time, +because event C cannot be triggered if event A does not happen, which in +turn cannot be triggered if event B does not happen, which in turn +cannot be triggered if event C does not happen. After all, no event can +be triggered since any of them never meets its condition to wake up. + +A dependency might exist between two waiters and a deadlock might happen +due to an incorrect releationship between dependencies. Thus, we must +define what a dependency is first. A dependency exists between them if: + + 1. There are two waiters waiting for each event at a given time. + 2. The only way to wake up each waiter is to trigger its event. + 3. Whether one can be woken up depends on whether the other can. + +Each wait in the example creates its dependency like: + + Event C depends on event A. + Event A depends on event B. + Event B depends on event C. + + NOTE: Precisely speaking, a dependency is one between whether a + waiter for an event can be woken up and whether another waiter for + another event can be woken up. However from now on, we will describe + a dependency as if it's one between an event and another event for + simplicity. + +And they form circular dependencies like: + + -> C -> A -> B - + / \ + \ / + ---------------- + + where 'A -> B' means that event A depends on event B. + +Such circular dependencies lead to a deadlock since no waiter can meet +its condition to wake up as described. + +CONCLUSION + +Circular dependencies cause a deadlock. + + +How lockdep works +----------------- + +Lockdep tries to detect a deadlock by checking dependencies created by +lock operations, acquire and release. Waiting for a lock corresponds to +waiting for an event, and releasing a lock corresponds to triggering an +event in the previous section. + +In short, lockdep does: + + 1. Detect a new dependency. + 2. Add the dependency into a global graph. + 3. Check if that makes dependencies circular. + 4. Report a deadlock or its possibility if so. + +For example, consider a graph built by lockdep that looks like: + + A -> B - + \ + -> E + / + C -> D - + + where A, B,..., E are different lock classes. + +Lockdep will add a dependency into the graph on detection of a new +dependency. For example, it will add a dependency 'E -> C' when a new +dependency between lock E and lock C is detected. Then the graph will be: + + A -> B - + \ + -> E - + / \ + -> C -> D - \ + / / + \ / + ------------------ + + where A, B,..., E are different lock classes. + +This graph contains a subgraph which demonstrates circular dependencies: + + -> E - + / \ + -> C -> D - \ + / / + \ / + ------------------ + + where C, D and E are different lock classes. + +This is the condition under which a deadlock might occur. Lockdep +reports it on detection after adding a new dependency. This is the way +how lockdep works. + +CONCLUSION + +Lockdep detects a deadlock or its possibility by checking if circular +dependencies were created after adding each new dependency. + + +========== +Limitation +========== + +Limit lockdep +------------- + +Limiting lockdep to work on only typical locks e.g. spin locks and +mutexes, which are released within the acquire context, the +implementation becomes simple but its capacity for detection becomes +limited. Let's check pros and cons in next section. + + +Pros from the limitation +------------------------ + +Given the limitation, when acquiring a lock, locks in a held_locks +cannot be released if the context cannot acquire it so has to wait to +acquire it, which means all waiters for the locks in the held_locks are +stuck. It's an exact case to create dependencies between each lock in +the held_locks and the lock to acquire. + +For example: + + CONTEXT X + --------- + acquire A + acquire B /* Add a dependency 'A -> B' */ + release B + release A + + where A and B are different lock classes. + +When acquiring lock A, the held_locks of CONTEXT X is empty thus no +dependency is added. But when acquiring lock B, lockdep detects and adds +a new dependency 'A -> B' between lock A in the held_locks and lock B. +They can be simply added whenever acquiring each lock. + +And data required by lockdep exists in a local structure, held_locks +embedded in task_struct. Forcing to access the data within the context, +lockdep can avoid racy problems without explicit locks while handling +the local data. + +Lastly, lockdep only needs to keep locks currently being held, to build +a dependency graph. However, relaxing the limitation, it needs to keep +even locks already released, because a decision whether they created +dependencies might be long-deferred. + +To sum up, we can expect several advantages from the limitation: + + 1. Lockdep can easily identify a dependency when acquiring a lock. + 2. Races are avoidable while accessing local locks in a held_locks. + 3. Lockdep only needs to keep locks currently being held. + +CONCLUSION + +Given the limitation, the implementation becomes simple and efficient. + + +Cons from the limitation +------------------------ + +Given the limitation, lockdep is applicable only to typical locks. For +example, page locks for page access or completions for synchronization +cannot work with lockdep. + +Can we detect deadlocks below, under the limitation? + +Example 1: + + CONTEXT X CONTEXT Y CONTEXT Z + --------- --------- ---------- + mutex_lock A + lock_page B + lock_page B + mutex_lock A /* DEADLOCK */ + unlock_page B held by X + unlock_page B + mutex_unlock A + mutex_unlock A + + where A and B are different lock classes. + +No, we cannot. + +Example 2: + + CONTEXT X CONTEXT Y + --------- --------- + mutex_lock A + mutex_lock A + wait_for_complete B /* DEADLOCK */ + complete B + mutex_unlock A + mutex_unlock A + + where A is a lock class and B is a completion variable. + +No, we cannot. + +CONCLUSION + +Given the limitation, lockdep cannot detect a deadlock or its +possibility caused by page locks or completions. + + +Relax the limitation +-------------------- + +Under the limitation, things to create dependencies are limited to +typical locks. However, synchronization primitives like page locks and +completions, which are allowed to be released in any context, also +create dependencies and can cause a deadlock. So lockdep should track +these locks to do a better job. We have to relax the limitation for +these locks to work with lockdep. + +Detecting dependencies is very important for lockdep to work because +adding a dependency means adding an opportunity to check whether it +causes a deadlock. The more lockdep adds dependencies, the more it +thoroughly works. Thus Lockdep has to do its best to detect and add as +many true dependencies into a graph as possible. + +For example, considering only typical locks, lockdep builds a graph like: + + A -> B - + \ + -> E + / + C -> D - + + where A, B,..., E are different lock classes. + +On the other hand, under the relaxation, additional dependencies might +be created and added. Assuming additional 'FX -> C' and 'E -> GX' are +added thanks to the relaxation, the graph will be: + + A -> B - + \ + -> E -> GX + / + FX -> C -> D - + + where A, B,..., E, FX and GX are different lock classes, and a suffix + 'X' is added on non-typical locks. + +The latter graph gives us more chances to check circular dependencies +than the former. However, it might suffer performance degradation since +relaxing the limitation, with which design and implementation of lockdep +can be efficient, might introduce inefficiency inevitably. So lockdep +should provide two options, strong detection and efficient detection. + +Choosing efficient detection: + + Lockdep works with only locks restricted to be released within the + acquire context. However, lockdep works efficiently. + +Choosing strong detection: + + Lockdep works with all synchronization primitives. However, lockdep + suffers performance degradation. + +CONCLUSION + +Relaxing the limitation, lockdep can add additional dependencies giving +additional opportunities to check circular dependencies. + + +============ +Crossrelease +============ + +Introduce crossrelease +---------------------- + +In order to allow lockdep to handle additional dependencies by what +might be released in any context, namely 'crosslock', we have to be able +to identify those created by crosslocks. The proposed 'crossrelease' +feature provoides a way to do that. + +Crossrelease feature has to do: + + 1. Identify dependencies created by crosslocks. + 2. Add the dependencies into a dependency graph. + +That's all. Once a meaningful dependency is added into graph, then +lockdep would work with the graph as it did. The most important thing +crossrelease feature has to do is to correctly identify and add true +dependencies into the global graph. + +A dependency e.g. 'A -> B' can be identified only in the A's release +context because a decision required to identify the dependency can be +made only in the release context. That is to decide whether A can be +released so that a waiter for A can be woken up. It cannot be made in +other than the A's release context. + +It's no matter for typical locks because each acquire context is same as +its release context, thus lockdep can decide whether a lock can be +released in the acquire context. However for crosslocks, lockdep cannot +make the decision in the acquire context but has to wait until the +release context is identified. + +Therefore, deadlocks by crosslocks cannot be detected just when it +happens, because those cannot be identified until the crosslocks are +released. However, deadlock possibilities can be detected and it's very +worth. See 'APPENDIX A' section to check why. + +CONCLUSION + +Using crossrelease feature, lockdep can work with what might be released +in any context, namely crosslock. + + +Introduce commit +---------------- + +Since crossrelease defers the work adding true dependencies of +crosslocks until they are actually released, crossrelease has to queue +all acquisitions which might create dependencies with the crosslocks. +Then it identifies dependencies using the queued data in batches at a +proper time. We call it 'commit'. + +There are four types of dependencies: + +1. TT type: 'typical lock A -> typical lock B' + + Just when acquiring B, lockdep can see it's in the A's release + context. So the dependency between A and B can be identified + immediately. Commit is unnecessary. + +2. TC type: 'typical lock A -> crosslock BX' + + Just when acquiring BX, lockdep can see it's in the A's release + context. So the dependency between A and BX can be identified + immediately. Commit is unnecessary, too. + +3. CT type: 'crosslock AX -> typical lock B' + + When acquiring B, lockdep cannot identify the dependency because + there's no way to know if it's in the AX's release context. It has + to wait until the decision can be made. Commit is necessary. + +4. CC type: 'crosslock AX -> crosslock BX' + + When acquiring BX, lockdep cannot identify the dependency because + there's no way to know if it's in the AX's release context. It has + to wait until the decision can be made. Commit is necessary. + But, handling CC type is not implemented yet. It's a future work. + +Lockdep can work without commit for typical locks, but commit step is +necessary once crosslocks are involved. Introducing commit, lockdep +performs three steps. What lockdep does in each step is: + +1. Acquisition: For typical locks, lockdep does what it originally did + and queues the lock so that CT type dependencies can be checked using + it at the commit step. For crosslocks, it saves data which will be + used at the commit step and increases a reference count for it. + +2. Commit: No action is reauired for typical locks. For crosslocks, + lockdep adds CT type dependencies using the data saved at the + acquisition step. + +3. Release: No changes are required for typical locks. When a crosslock + is released, it decreases a reference count for it. + +CONCLUSION + +Crossrelease introduces commit step to handle dependencies of crosslocks +in batches at a proper time. + + +============== +Implementation +============== + +Data structures +--------------- + +Crossrelease introduces two main data structures. + +1. hist_lock + + This is an array embedded in task_struct, for keeping lock history so + that dependencies can be added using them at the commit step. Since + it's local data, it can be accessed locklessly in the owner context. + The array is filled at the acquisition step and consumed at the + commit step. And it's managed in circular manner. + +2. cross_lock + + One per lockdep_map exists. This is for keeping data of crosslocks + and used at the commit step. + + +How crossrelease works +---------------------- + +It's the key of how crossrelease works, to defer necessary works to an +appropriate point in time and perform in at once at the commit step. +Let's take a look with examples step by step, starting from how lockdep +works without crossrelease for typical locks. + + acquire A /* Push A onto held_locks */ + acquire B /* Push B onto held_locks and add 'A -> B' */ + acquire C /* Push C onto held_locks and add 'B -> C' */ + release C /* Pop C from held_locks */ + release B /* Pop B from held_locks */ + release A /* Pop A from held_locks */ + + where A, B and C are different lock classes. + + NOTE: This document assumes that readers already understand how + lockdep works without crossrelease thus omits details. But there's + one thing to note. Lockdep pretends to pop a lock from held_locks + when releasing it. But it's subtly different from the original pop + operation because lockdep allows other than the top to be poped. + +In this case, lockdep adds 'the top of held_locks -> the lock to acquire' +dependency every time acquiring a lock. + +After adding 'A -> B', a dependency graph will be: + + A -> B + + where A and B are different lock classes. + +And after adding 'B -> C', the graph will be: + + A -> B -> C + + where A, B and C are different lock classes. + +Let's performs commit step even for typical locks to add dependencies. +Of course, commit step is not necessary for them, however, it would work +well because this is a more general way. + + acquire A + /* + * Queue A into hist_locks + * + * In hist_locks: A + * In graph: Empty + */ + + acquire B + /* + * Queue B into hist_locks + * + * In hist_locks: A, B + * In graph: Empty + */ + + acquire C + /* + * Queue C into hist_locks + * + * In hist_locks: A, B, C + * In graph: Empty + */ + + commit C + /* + * Add 'C -> ?' + * Answer the following to decide '?' + * What has been queued since acquire C: Nothing + * + * In hist_locks: A, B, C + * In graph: Empty + */ + + release C + + commit B + /* + * Add 'B -> ?' + * Answer the following to decide '?' + * What has been queued since acquire B: C + * + * In hist_locks: A, B, C + * In graph: 'B -> C' + */ + + release B + + commit A + /* + * Add 'A -> ?' + * Answer the following to decide '?' + * What has been queued since acquire A: B, C + * + * In hist_locks: A, B, C + * In graph: 'B -> C', 'A -> B', 'A -> C' + */ + + release A + + where A, B and C are different lock classes. + +In this case, dependencies are added at the commit step as described. + +After commits for A, B and C, the graph will be: + + A -> B -> C + + where A, B and C are different lock classes. + + NOTE: A dependency 'A -> C' is optimized out. + +We can see the former graph built without commit step is same as the +latter graph built using commit steps. Of course the former way leads to +earlier finish for building the graph, which means we can detect a +deadlock or its possibility sooner. So the former way would be prefered +when possible. But we cannot avoid using the latter way for crosslocks. + +Let's look at how commit steps work for crosslocks. In this case, the +commit step is performed only on crosslock AX as real. And it assumes +that the AX release context is different from the AX acquire context. + + BX RELEASE CONTEXT BX ACQUIRE CONTEXT + ------------------ ------------------ + acquire A + /* + * Push A onto held_locks + * Queue A into hist_locks + * + * In held_locks: A + * In hist_locks: A + * In graph: Empty + */ + + acquire BX + /* + * Add 'the top of held_locks -> BX' + * + * In held_locks: A + * In hist_locks: A + * In graph: 'A -> BX' + */ + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + It must be guaranteed that the following operations are seen after + acquiring BX globally. It can be done by things like barrier. + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + acquire C + /* + * Push C onto held_locks + * Queue C into hist_locks + * + * In held_locks: C + * In hist_locks: C + * In graph: 'A -> BX' + */ + + release C + /* + * Pop C from held_locks + * + * In held_locks: Empty + * In hist_locks: C + * In graph: 'A -> BX' + */ + acquire D + /* + * Push D onto held_locks + * Queue D into hist_locks + * Add 'the top of held_locks -> D' + * + * In held_locks: A, D + * In hist_locks: A, D + * In graph: 'A -> BX', 'A -> D' + */ + acquire E + /* + * Push E onto held_locks + * Queue E into hist_locks + * + * In held_locks: E + * In hist_locks: C, E + * In graph: 'A -> BX', 'A -> D' + */ + + release E + /* + * Pop E from held_locks + * + * In held_locks: Empty + * In hist_locks: D, E + * In graph: 'A -> BX', 'A -> D' + */ + release D + /* + * Pop D from held_locks + * + * In held_locks: A + * In hist_locks: A, D + * In graph: 'A -> BX', 'A -> D' + */ + commit BX + /* + * Add 'BX -> ?' + * What has been queued since acquire BX: C, E + * + * In held_locks: Empty + * In hist_locks: D, E + * In graph: 'A -> BX', 'A -> D', + * 'BX -> C', 'BX -> E' + */ + + release BX + /* + * In held_locks: Empty + * In hist_locks: D, E + * In graph: 'A -> BX', 'A -> D', + * 'BX -> C', 'BX -> E' + */ + release A + /* + * Pop A from held_locks + * + * In held_locks: Empty + * In hist_locks: A, D + * In graph: 'A -> BX', 'A -> D', + * 'BX -> C', 'BX -> E' + */ + + where A, BX, C,..., E are different lock classes, and a suffix 'X' is + added on crosslocks. + +Crossrelease considers all acquisitions after acqiuring BX are +candidates which might create dependencies with BX. True dependencies +will be determined when identifying the release context of BX. Meanwhile, +all typical locks are queued so that they can be used at the commit step. +And then two dependencies 'BX -> C' and 'BX -> E' are added at the +commit step when identifying the release context. + +The final graph will be, with crossrelease: + + -> C + / + -> BX - + / \ + A - -> E + \ + -> D + + where A, BX, C,..., E are different lock classes, and a suffix 'X' is + added on crosslocks. + +However, the final graph will be, without crossrelease: + + A -> D + + where A and D are different lock classes. + +The former graph has three more dependencies, 'A -> BX', 'BX -> C' and +'BX -> E' giving additional opportunities to check if they cause +deadlocks. This way lockdep can detect a deadlock or its possibility +caused by crosslocks. + +CONCLUSION + +We checked how crossrelease works with several examples. + + +============= +Optimizations +============= + +Avoid duplication +----------------- + +Crossrelease feature uses a cache like what lockdep already uses for +dependency chains, but this time it's for caching CT type dependencies. +Once that dependency is cached, the same will never be added again. + + +Lockless for hot paths +---------------------- + +To keep all locks for later use at the commit step, crossrelease adopts +a local array embedded in task_struct, which makes access to the data +lockless by forcing it to happen only within the owner context. It's +like how lockdep handles held_locks. Lockless implmentation is important +since typical locks are very frequently acquired and released. + + +================================================= +APPENDIX A: What lockdep does to work aggresively +================================================= + +A deadlock actually occurs when all wait operations creating circular +dependencies run at the same time. Even though they don't, a potential +deadlock exists if the problematic dependencies exist. Thus it's +meaningful to detect not only an actual deadlock but also its potential +possibility. The latter is rather valuable. When a deadlock occurs +actually, we can identify what happens in the system by some means or +other even without lockdep. However, there's no way to detect possiblity +without lockdep unless the whole code is parsed in head. It's terrible. +Lockdep does the both, and crossrelease only focuses on the latter. + +Whether or not a deadlock actually occurs depends on several factors. +For example, what order contexts are switched in is a factor. Assuming +circular dependencies exist, a deadlock would occur when contexts are +switched so that all wait operations creating the dependencies run +simultaneously. Thus to detect a deadlock possibility even in the case +that it has not occured yet, lockdep should consider all possible +combinations of dependencies, trying to: + +1. Use a global dependency graph. + + Lockdep combines all dependencies into one global graph and uses them, + regardless of which context generates them or what order contexts are + switched in. Aggregated dependencies are only considered so they are + prone to be circular if a problem exists. + +2. Check dependencies between classes instead of instances. + + What actually causes a deadlock are instances of lock. However, + lockdep checks dependencies between classes instead of instances. + This way lockdep can detect a deadlock which has not happened but + might happen in future by others but the same class. + +3. Assume all acquisitions lead to waiting. + + Although locks might be acquired without waiting which is essential + to create dependencies, lockdep assumes all acquisitions lead to + waiting since it might be true some time or another. + +CONCLUSION + +Lockdep detects not only an actual deadlock but also its possibility, +and the latter is more valuable. + + +================================================== +APPENDIX B: How to avoid adding false dependencies +================================================== + +Remind what a dependency is. A dependency exists if: + + 1. There are two waiters waiting for each event at a given time. + 2. The only way to wake up each waiter is to trigger its event. + 3. Whether one can be woken up depends on whether the other can. + +For example: + + acquire A + acquire B /* A dependency 'A -> B' exists */ + release B + release A + + where A and B are different lock classes. + +A depedency 'A -> B' exists since: + + 1. A waiter for A and a waiter for B might exist when acquiring B. + 2. Only way to wake up each is to release what it waits for. + 3. Whether the waiter for A can be woken up depends on whether the + other can. IOW, TASK X cannot release A if it fails to acquire B. + +For another example: + + TASK X TASK Y + ------ ------ + acquire AX + acquire B /* A dependency 'AX -> B' exists */ + release B + release AX held by Y + + where AX and B are different lock classes, and a suffix 'X' is added + on crosslocks. + +Even in this case involving crosslocks, the same rule can be applied. A +depedency 'AX -> B' exists since: + + 1. A waiter for AX and a waiter for B might exist when acquiring B. + 2. Only way to wake up each is to release what it waits for. + 3. Whether the waiter for AX can be woken up depends on whether the + other can. IOW, TASK X cannot release AX if it fails to acquire B. + +Let's take a look at more complicated example: + + TASK X TASK Y + ------ ------ + acquire B + release B + fork Y + acquire AX + acquire C /* A dependency 'AX -> C' exists */ + release C + release AX held by Y + + where AX, B and C are different lock classes, and a suffix 'X' is + added on crosslocks. + +Does a dependency 'AX -> B' exist? Nope. + +Two waiters are essential to create a dependency. However, waiters for +AX and B to create 'AX -> B' cannot exist at the same time in this +example. Thus the dependency 'AX -> B' cannot be created. + +It would be ideal if the full set of true ones can be considered. But +we can ensure nothing but what actually happened. Relying on what +actually happens at runtime, we can anyway add only true ones, though +they might be a subset of true ones. It's similar to how lockdep works +for typical locks. There might be more true dependencies than what +lockdep has detected in runtime. Lockdep has no choice but to rely on +what actually happens. Crossrelease also relies on it. + +CONCLUSION + +Relying on what actually happens, lockdep can avoid adding false +dependencies. diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index c4ddfcd..d1d1716 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -498,11 +498,11 @@ And a couple of implicit varieties: This means that ACQUIRE acts as a minimal "acquire" operation and RELEASE acts as a minimal "release" operation. -A subset of the atomic operations described in core-api/atomic_ops.rst have -ACQUIRE and RELEASE variants in addition to fully-ordered and relaxed (no -barrier semantics) definitions. For compound atomics performing both a load -and a store, ACQUIRE semantics apply only to the load and RELEASE semantics -apply only to the store portion of the operation. +A subset of the atomic operations described in atomic_t.txt have ACQUIRE and +RELEASE variants in addition to fully-ordered and relaxed (no barrier +semantics) definitions. For compound atomics performing both a load and a +store, ACQUIRE semantics apply only to the load and RELEASE semantics apply +only to the store portion of the operation. Memory barriers are only required where there's a possibility of interaction between two CPUs or between a CPU and a device. If it can be guaranteed that @@ -1876,8 +1876,7 @@ There are some more advanced barrier functions: This makes sure that the death mark on the object is perceived to be set *before* the reference counter is decremented. - See Documentation/core-api/atomic_ops.rst for more information. See the - "Atomic operations" subsection for information on where to use these. + See Documentation/atomic_{t,bitops}.txt for more information. (*) lockless_dereference(); @@ -1982,10 +1981,7 @@ for each construct. These operations all imply certain barriers: ACQUIRE operation has completed. Memory operations issued before the ACQUIRE may be completed after - the ACQUIRE operation has completed. An smp_mb__before_spinlock(), - combined with a following ACQUIRE, orders prior stores against - subsequent loads and stores. Note that this is weaker than smp_mb()! - The smp_mb__before_spinlock() primitive is free on many architectures. + the ACQUIRE operation has completed. (2) RELEASE operation implication: @@ -2503,88 +2499,7 @@ operations are noted specially as some of them imply full memory barriers and some don't, but they're very heavily relied on as a group throughout the kernel. -Any atomic operation that modifies some state in memory and returns information -about the state (old or new) implies an SMP-conditional general memory barrier -(smp_mb()) on each side of the actual operation (with the exception of -explicit lock operations, described later). These include: - - xchg(); - atomic_xchg(); atomic_long_xchg(); - atomic_inc_return(); atomic_long_inc_return(); - atomic_dec_return(); atomic_long_dec_return(); - atomic_add_return(); atomic_long_add_return(); - atomic_sub_return(); atomic_long_sub_return(); - atomic_inc_and_test(); atomic_long_inc_and_test(); - atomic_dec_and_test(); atomic_long_dec_and_test(); - atomic_sub_and_test(); atomic_long_sub_and_test(); - atomic_add_negative(); atomic_long_add_negative(); - test_and_set_bit(); - test_and_clear_bit(); - test_and_change_bit(); - - /* when succeeds */ - cmpxchg(); - atomic_cmpxchg(); atomic_long_cmpxchg(); - atomic_add_unless(); atomic_long_add_unless(); - -These are used for such things as implementing ACQUIRE-class and RELEASE-class -operations and adjusting reference counters towards object destruction, and as -such the implicit memory barrier effects are necessary. - - -The following operations are potential problems as they do _not_ imply memory -barriers, but might be used for implementing such things as RELEASE-class -operations: - - atomic_set(); - set_bit(); - clear_bit(); - change_bit(); - -With these the appropriate explicit memory barrier should be used if necessary -(smp_mb__before_atomic() for instance). - - -The following also do _not_ imply memory barriers, and so may require explicit -memory barriers under some circumstances (smp_mb__before_atomic() for -instance): - - atomic_add(); - atomic_sub(); - atomic_inc(); - atomic_dec(); - -If they're used for statistics generation, then they probably don't need memory -barriers, unless there's a coupling between statistical data. - -If they're used for reference counting on an object to control its lifetime, -they probably don't need memory barriers because either the reference count -will be adjusted inside a locked section, or the caller will already hold -sufficient references to make the lock, and thus a memory barrier unnecessary. - -If they're used for constructing a lock of some description, then they probably -do need memory barriers as a lock primitive generally has to do things in a -specific order. - -Basically, each usage case has to be carefully considered as to whether memory -barriers are needed or not. - -The following operations are special locking primitives: - - test_and_set_bit_lock(); - clear_bit_unlock(); - __clear_bit_unlock(); - -These implement ACQUIRE-class and RELEASE-class operations. These should be -used in preference to other operations when implementing locking primitives, -because their implementations can be optimised on many architectures. - -[!] Note that special memory barrier primitives are available for these -situations because on some CPUs the atomic instructions used imply full memory -barriers, and so barrier instructions are superfluous in conjunction with them, -and in such cases the special barrier primitives will be no-ops. - -See Documentation/core-api/atomic_ops.rst for more information. +See Documentation/atomic_t.txt for more information. ACCESSING DEVICES diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt index b83dfa1..ab16efe 100644 --- a/Documentation/static-keys.txt +++ b/Documentation/static-keys.txt @@ -149,6 +149,26 @@ static_branch_inc(), will change the branch back to true. Likewise, if the key is initialized false, a 'static_branch_inc()', will change the branch to true. And then a 'static_branch_dec()', will again make the branch false. +The state and the reference count can be retrieved with 'static_key_enabled()' +and 'static_key_count()'. In general, if you use these functions, they +should be protected with the same mutex used around the enable/disable +or increment/decrement function. + +Note that switching branches results in some locks being taken, +particularly the CPU hotplug lock (in order to avoid races against +CPUs being brought in the kernel whilst the kernel is getting +patched). Calling the static key API from within a hotplug notifier is +thus a sure deadlock recipe. In order to still allow use of the +functionnality, the following functions are provided: + + static_key_enable_cpuslocked() + static_key_disable_cpuslocked() + static_branch_enable_cpuslocked() + static_branch_disable_cpuslocked() + +These functions are *not* general purpose, and must only be used when +you really know that you're in the above context, and no other. + Where an array of keys is required, it can be defined as:: DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 38310dc..bc80fc0 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -1956,10 +1956,7 @@ MMIO 쓰기 배리어 뒤에 완료됩니다. ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에 - 완료될 수 있습니다. smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는 - 코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서 - 맞춥니다. 이건 smp_mb() 보다 완화된 것임을 기억하세요! 많은 아키텍쳐에서 - smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다. + 완료될 수 있습니다. (2) RELEASE 오퍼레이션의 영향: diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 54b54da..1185928 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i) atomic_ops_unlock(flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #endif /* diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index cae331d..ae4241a 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -358,14 +358,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* - * Accesses appearing in program order before a spin_lock() operation - * can be reordered with accesses inside the critical section, by virtue - * of arch_spin_lock being constructed using acquire semantics. - * - * In cases where this is problematic (e.g. try_to_wake_up), an - * smp_mb__before_spinlock() can restore the required ordering. - */ -#define smp_mb__before_spinlock() smp_mb() +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index a62ba36..fb3dfb2 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new) ); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /** * atomic_read - reads a word, atomically * @v: pointer to atomic value diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h index 6c1380a..eee779f 100644 --- a/arch/metag/include/asm/atomic_lock1.h +++ b/arch/metag/include/asm/atomic_lock1.h @@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i) return i; } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define ATOMIC_OP(op, c_op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 5394b9c..17b98a8 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i) _atomic_spin_unlock_irqrestore(v, flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + static __inline__ int atomic_read(const atomic_t *v) { return READ_ONCE((v)->counter); diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index 25d42bd..9c601ad 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,13 +74,6 @@ do { \ ___p1; \ }) -/* - * This must resolve to hwsync on SMP for the context switch path. - * See _switch, and core scheduler context switch memory ordering - * comments. - */ -#define smp_mb__before_spinlock() smp_mb() - #include <asm-generic/barrier.h> #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 8c1b913..c1b1ec9 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -342,5 +342,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) __rw_yield(lock) #define arch_write_relax(lock) __rw_yield(lock) +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index ee3f11c..7643e97 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int); int __atomic_add_unless(atomic_t *, int, int); void atomic_set(atomic_t *, int); +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define atomic_read(v) ACCESS_ONCE((v)->counter) #define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index a9377425..53a423e 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n) _atomic_xchg(&v->counter, n); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /* A 64bit atomic type */ typedef struct { diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 33380b8..0874ebd 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline void atomic_or(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, &c, c + a)); + return c; } diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705..9e206f3 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a43..5d9de36 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } @@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d..b5069e8 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -157,7 +157,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index aae87c4..c62e716 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -455,7 +455,11 @@ void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa per_cpu(timer_unstable_counter_workaround, i) = wa; } - static_branch_enable(&arch_timer_read_ool_enabled); + /* + * Use the locked version, as we're called from the CPU + * hotplug framework. Otherwise, we end-up in deadlock-land. + */ + static_branch_enable_cpuslocked(&arch_timer_read_ool_enabled); /* * Don't use the vdso fastpath if errata require using the diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 00d8967..0511fce 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -28,6 +28,7 @@ #include <linux/debugfs.h> #include <linux/sort.h> +#include <linux/sched/mm.h> #include "intel_drv.h" static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node) @@ -4331,7 +4332,7 @@ i915_drop_caches_set(void *data, u64 val) mutex_unlock(&dev->struct_mutex); } - lockdep_set_current_reclaim_state(GFP_KERNEL); + fs_reclaim_acquire(GFP_KERNEL); if (val & DROP_BOUND) i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND); @@ -4340,7 +4341,7 @@ i915_drop_caches_set(void *data, u64 val) if (val & DROP_SHRINK_ALL) i915_gem_shrink_all(dev_priv); - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(GFP_KERNEL); if (val & DROP_FREED) { synchronize_rcu(); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 3d424a5..f0fd3ad 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -446,14 +446,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); - smp_mb__before_spinlock(); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { inode_unlock(inode); return PTR_ERR(realfile); } - od->upperfile = realfile; + smp_store_release(&od->upperfile, realfile); } else { /* somebody has beaten us to it */ if (!IS_ERR(realfile)) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index b0d5897..886085b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, goto out; WRITE_ONCE(uwq->waken, true); /* - * The implicit smp_mb__before_spinlock in try_to_wake_up() - * renders uwq->waken visible to other CPUs before the task is - * waken. + * The Program-Order guarantees provided by the scheduler + * ensure uwq->waken is visible before the task is woken. */ ret = wake_up_state(wq->private, mode); - if (ret) + if (ret) { /* * Wake only once, autoremove behavior. * - * After the effect of list_del_init is visible to the - * other CPUs, the waitqueue may disappear from under - * us, see the !list_empty_careful() in - * handle_userfault(). try_to_wake_up() has an - * implicit smp_mb__before_spinlock, and the - * wq->private is read before calling the extern - * function "wake_up_state" (which in turns calls - * try_to_wake_up). While the spin_lock;spin_unlock; - * wouldn't be enough, the smp_mb__before_spinlock is - * enough to avoid an explicit smp_mb() here. + * After the effect of list_del_init is visible to the other + * CPUs, the waitqueue may disappear from under us, see the + * !list_empty_careful() in handle_userfault(). + * + * try_to_wake_up() has an implicit smp_mb(), and the + * wq->private is read before calling the extern function + * "wake_up_state" (which in turns calls try_to_wake_up). */ list_del_init(&wq->entry); + } out: return ret; } diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h index dad68bf..8d28eb0 100644 --- a/include/asm-generic/atomic64.h +++ b/include/asm-generic/atomic64.h @@ -21,6 +21,8 @@ typedef struct { extern long long atomic64_read(const atomic64_t *v); extern void atomic64_set(atomic64_t *v, long long i); +#define atomic64_set_release(v, i) atomic64_set((v), (i)) + #define ATOMIC64_OP(op) \ extern void atomic64_##op(long long a, atomic64_t *v); diff --git a/include/linux/atomic.h b/include/linux/atomic.h index c56be74..40d6bfe 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -38,6 +38,9 @@ * Besides, if an arch has a special barrier for acquire/release, it could * implement its own __atomic_op_* and use the same framework for building * variants + * + * If an architecture overrides __atomic_op_acquire() it will probably want + * to define smp_mb__after_spinlock(). */ #ifndef __atomic_op_acquire #define __atomic_op_acquire(op, args...) \ diff --git a/include/linux/completion.h b/include/linux/completion.h index 5d5aaae..9bcebf5 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -9,6 +9,9 @@ */ #include <linux/wait.h> +#ifdef CONFIG_LOCKDEP_COMPLETE +#include <linux/lockdep.h> +#endif /* * struct completion - structure used to maintain state for a "completion" @@ -25,10 +28,50 @@ struct completion { unsigned int done; wait_queue_head_t wait; +#ifdef CONFIG_LOCKDEP_COMPLETE + struct lockdep_map_cross map; +#endif }; +#ifdef CONFIG_LOCKDEP_COMPLETE +static inline void complete_acquire(struct completion *x) +{ + lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_); +} + +static inline void complete_release(struct completion *x) +{ + lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_); +} + +static inline void complete_release_commit(struct completion *x) +{ + lock_commit_crosslock((struct lockdep_map *)&x->map); +} + +#define init_completion(x) \ +do { \ + static struct lock_class_key __key; \ + lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ + "(complete)" #x, \ + &__key, 0); \ + __init_completion(x); \ +} while (0) +#else +#define init_completion(x) __init_completion(x) +static inline void complete_acquire(struct completion *x) {} +static inline void complete_release(struct completion *x) {} +static inline void complete_release_commit(struct completion *x) {} +#endif + +#ifdef CONFIG_LOCKDEP_COMPLETE +#define COMPLETION_INITIALIZER(work) \ + { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ + STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) } +#else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } +#endif #define COMPLETION_INITIALIZER_ONSTACK(work) \ ({ init_completion(&work); work; }) @@ -70,7 +113,7 @@ struct completion { * This inline function will initialize a dynamically created completion * structure. */ -static inline void init_completion(struct completion *x) +static inline void __init_completion(struct completion *x) { x->done = 0; init_waitqueue_head(&x->wait); diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 898cfe2..e74655d 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -37,12 +37,6 @@ static inline bool cpusets_enabled(void) return static_branch_unlikely(&cpusets_enabled_key); } -static inline int nr_cpusets(void) -{ - /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key.key) + 1; -} - static inline void cpuset_inc(void) { static_branch_inc(&cpusets_pre_enable_key); diff --git a/include/linux/futex.h b/include/linux/futex.h index 7c5b694..f36bfd2 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -54,7 +54,6 @@ union futex_key { #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); -extern void exit_pi_state_list(struct task_struct *curr); #ifdef CONFIG_HAVE_FUTEX_CMPXCHG #define futex_cmpxchg_enabled 1 #else @@ -64,8 +63,14 @@ extern int futex_cmpxchg_enabled; static inline void exit_robust_list(struct task_struct *curr) { } +#endif + +#ifdef CONFIG_FUTEX_PI +extern void exit_pi_state_list(struct task_struct *curr); +#else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif + #endif diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5dd1272..5fdd93b 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -23,10 +23,26 @@ # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) -# define trace_hardirq_enter() do { current->hardirq_context++; } while (0) -# define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) +# define trace_hardirq_enter() \ +do { \ + current->hardirq_context++; \ + crossrelease_hist_start(XHLOCK_HARD); \ +} while (0) +# define trace_hardirq_exit() \ +do { \ + current->hardirq_context--; \ + crossrelease_hist_end(XHLOCK_HARD); \ +} while (0) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ + crossrelease_hist_start(XHLOCK_SOFT); \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ + crossrelease_hist_end(XHLOCK_SOFT); \ +} while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 2afd74b..cd58616 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -163,6 +163,8 @@ extern void jump_label_apply_nops(struct module *mod); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); +extern void static_key_enable_cpuslocked(struct static_key *key); +extern void static_key_disable_cpuslocked(struct static_key *key); /* * We should be using ATOMIC_INIT() for initializing .enabled, but @@ -234,24 +236,29 @@ static inline int jump_label_apply_nops(struct module *mod) static inline void static_key_enable(struct static_key *key) { - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); + STATIC_KEY_CHECK_USE(); - if (!count) - static_key_slow_inc(key); + if (atomic_read(&key->enabled) != 0) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 1); + return; + } + atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); + STATIC_KEY_CHECK_USE(); - if (count) - static_key_slow_dec(key); + if (atomic_read(&key->enabled) != 1) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 0); + return; + } + atomic_set(&key->enabled, 0); } +#define static_key_enable_cpuslocked(k) static_key_enable((k)) +#define static_key_disable_cpuslocked(k) static_key_disable((k)) + #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } @@ -413,8 +420,10 @@ extern bool ____wrong_branch_error(void); * Normal usage; boolean enable/disable. */ -#define static_branch_enable(x) static_key_enable(&(x)->key) -#define static_branch_disable(x) static_key_disable(&(x)->key) +#define static_branch_enable(x) static_key_enable(&(x)->key) +#define static_branch_disable(x) static_key_disable(&(x)->key) +#define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) +#define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index b7f8ace..41960fe 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,11 +2,13 @@ #define _LINUX_KASAN_CHECKS_H #ifdef CONFIG_KASAN -void kasan_check_read(const void *p, unsigned int size); -void kasan_check_write(const void *p, unsigned int size); +void kasan_check_read(const volatile void *p, unsigned int size); +void kasan_check_write(const volatile void *p, unsigned int size); #else -static inline void kasan_check_read(const void *p, unsigned int size) { } -static inline void kasan_check_write(const void *p, unsigned int size) { } +static inline void kasan_check_read(const volatile void *p, unsigned int size) +{ } +static inline void kasan_check_write(const volatile void *p, unsigned int size) +{ } #endif #endif diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index fffe49f..651cc61 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -29,7 +29,7 @@ extern int lock_stat; * We'd rather not expose kernel/lockdep_states.h this wide, but we do need * the total number of states... :-( */ -#define XXX_LOCK_USAGE_STATES (1+3*4) +#define XXX_LOCK_USAGE_STATES (1+2*4) /* * NR_LOCKDEP_CACHING_CLASSES ... Number of classes @@ -155,6 +155,12 @@ struct lockdep_map { int cpu; unsigned long ip; #endif +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + /* + * Whether it's a crosslock. + */ + int cross; +#endif }; static inline void lockdep_copy_map(struct lockdep_map *to, @@ -258,8 +264,95 @@ struct held_lock { unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + /* + * Generation id. + * + * A value of cross_gen_id will be stored when holding this, + * which is globally increased whenever each crosslock is held. + */ + unsigned int gen_id; +#endif +}; + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#define MAX_XHLOCK_TRACE_ENTRIES 5 + +/* + * This is for keeping locks waiting for commit so that true dependencies + * can be added at commit step. + */ +struct hist_lock { + /* + * Id for each entry in the ring buffer. This is used to + * decide whether the ring buffer was overwritten or not. + * + * For example, + * + * |<----------- hist_lock ring buffer size ------->| + * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii + * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii....................... + * + * where 'p' represents an acquisition in process + * context, 'i' represents an acquisition in irq + * context. + * + * In this example, the ring buffer was overwritten by + * acquisitions in irq context, that should be detected on + * rollback or commit. + */ + unsigned int hist_id; + + /* + * Seperate stack_trace data. This will be used at commit step. + */ + struct stack_trace trace; + unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES]; + + /* + * Seperate hlock instance. This will be used at commit step. + * + * TODO: Use a smaller data structure containing only necessary + * data. However, we should make lockdep code able to handle the + * smaller one first. + */ + struct held_lock hlock; +}; + +/* + * To initialize a lock as crosslock, lockdep_init_map_crosslock() should + * be called instead of lockdep_init_map(). + */ +struct cross_lock { + /* + * When more than one acquisition of crosslocks are overlapped, + * we have to perform commit for them based on cross_gen_id of + * the first acquisition, which allows us to add more true + * dependencies. + * + * Moreover, when no acquisition of a crosslock is in progress, + * we should not perform commit because the lock might not exist + * any more, which might cause incorrect memory access. So we + * have to track the number of acquisitions of a crosslock. + */ + int nr_acquire; + + /* + * Seperate hlock instance. This will be used at commit step. + * + * TODO: Use a smaller data structure containing only necessary + * data. However, we should make lockdep code able to handle the + * smaller one first. + */ + struct held_lock hlock; }; +struct lockdep_map_cross { + struct lockdep_map map; + struct cross_lock xlock; +}; +#endif + /* * Initialization, self-test and debugging-output methods: */ @@ -282,13 +375,6 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass); /* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), } - -/* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) @@ -363,10 +449,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock, extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); -extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); -extern void lockdep_clear_current_reclaim_state(void); -extern void lockdep_trace_alloc(gfp_t mask); - struct pin_cookie { unsigned int val; }; #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } @@ -375,7 +457,7 @@ extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); -# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, +# define INIT_LOCKDEP .lockdep_recursion = 0, #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) @@ -416,9 +498,6 @@ static inline void lockdep_on(void) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) -# define lockdep_set_current_reclaim_state(g) do { } while (0) -# define lockdep_clear_current_reclaim_state() do { } while (0) -# define lockdep_trace_alloc(g) do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) @@ -467,6 +546,56 @@ struct pin_cookie { }; #endif /* !LOCKDEP */ +enum xhlock_context_t { + XHLOCK_HARD, + XHLOCK_SOFT, + XHLOCK_PROC, + XHLOCK_CTX_NR, +}; + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +extern void lockdep_init_map_crosslock(struct lockdep_map *lock, + const char *name, + struct lock_class_key *key, + int subclass); +extern void lock_commit_crosslock(struct lockdep_map *lock); + +/* + * What we essencially have to initialize is 'nr_acquire'. Other members + * will be initialized in add_xlock(). + */ +#define STATIC_CROSS_LOCK_INIT() \ + { .nr_acquire = 0,} + +#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \ + { .map.name = (_name), .map.key = (void *)(_key), \ + .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), } + +/* + * To initialize a lockdep_map statically use this macro. + * Note that _name must not be NULL. + */ +#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ + { .name = (_name), .key = (void *)(_key), .cross = 0, } + +extern void crossrelease_hist_start(enum xhlock_context_t c); +extern void crossrelease_hist_end(enum xhlock_context_t c); +extern void lockdep_init_task(struct task_struct *task); +extern void lockdep_free_task(struct task_struct *task); +#else +/* + * To initialize a lockdep_map statically use this macro. + * Note that _name must not be NULL. + */ +#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ + { .name = (_name), .key = (void *)(_key), } + +static inline void crossrelease_hist_start(enum xhlock_context_t c) {} +static inline void crossrelease_hist_end(enum xhlock_context_t c) {} +static inline void lockdep_init_task(struct task_struct *task) {} +static inline void lockdep_free_task(struct task_struct *task) {} +#endif + #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cadee0..dc1edec 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -535,6 +535,10 @@ extern void tlb_finish_mmu(struct mmu_gather *tlb, */ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { + /* + * Must be called with PTL held; such that our PTL acquire will have + * observed the store from set_tlb_flush_pending(). + */ return atomic_read(&mm->tlb_flush_pending) > 0; } @@ -556,10 +560,29 @@ static inline void inc_tlb_flush_pending(struct mm_struct *mm) atomic_inc(&mm->tlb_flush_pending); /* - * Guarantee that the tlb_flush_pending increase does not leak into the - * critical section updating the page tables + * The only time this value is relevant is when there are indeed pages + * to flush. And we'll only flush pages after changing them, which + * requires the PTL. + * + * So the ordering here is: + * + * atomic_inc(&mm->tlb_flush_pending); + * spin_lock(&ptl); + * ... + * set_pte_at(); + * spin_unlock(&ptl); + * + * spin_lock(&ptl) + * mm_tlb_flush_pending(); + * .... + * spin_unlock(&ptl); + * + * flush_tlb_range(); + * atomic_dec(&mm->tlb_flush_pending); + * + * So the =true store is constrained by the PTL unlock, and the =false + * store is constrained by the TLB invalidate. */ - smp_mb__before_spinlock(); } /* Clearing is done after a TLB flush, which also provides a barrier. */ diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index ae0528b..e784761 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -32,6 +32,7 @@ struct rw_semaphore { #define RWSEM_UNLOCKED_VALUE 0x00000000 extern void __down_read(struct rw_semaphore *sem); +extern int __must_check __down_read_killable(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); extern void __down_write(struct rw_semaphore *sem); extern int __must_check __down_write_killable(struct rw_semaphore *sem); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index dd1d142..0ad7318 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -44,6 +44,7 @@ struct rw_semaphore { }; extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2d..772c5f6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -846,7 +846,17 @@ struct task_struct { int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; - gfp_t lockdep_reclaim_gfp; +#endif + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#define MAX_XHLOCKS_NR 64UL + struct hist_lock *xhlocks; /* Crossrelease history locks */ + unsigned int xhlock_idx; + /* For restoring at history boundaries */ + unsigned int xhlock_idx_hist[XHLOCK_CTX_NR]; + unsigned int hist_id; + /* For overwrite check at each context exit */ + unsigned int hist_id_save[XHLOCK_CTX_NR]; #endif #ifdef CONFIG_UBSAN diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2b24a69..2b0a281 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -167,6 +167,14 @@ static inline gfp_t current_gfp_context(gfp_t flags) return flags; } +#ifdef CONFIG_LOCKDEP +extern void fs_reclaim_acquire(gfp_t gfp_mask); +extern void fs_reclaim_release(gfp_t gfp_mask); +#else +static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } +static inline void fs_reclaim_release(gfp_t gfp_mask) { } +#endif + static inline unsigned int memalloc_noio_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_NOIO; diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index d9510e8..4e8cce1 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -118,16 +118,39 @@ do { \ #endif /* - * Despite its name it doesn't necessarily has to be a full barrier. - * It should only guarantee that a STORE before the critical section - * can not be reordered with LOADs and STOREs inside this section. - * spin_lock() is the one-way barrier, this LOAD can not escape out - * of the region. So the default implementation simply ensures that - * a STORE can not move into the critical section, smp_wmb() should - * serialize it with another STORE done by spin_lock(). + * This barrier must provide two things: + * + * - it must guarantee a STORE before the spin_lock() is ordered against a + * LOAD after it, see the comments at its two usage sites. + * + * - it must ensure the critical section is RCsc. + * + * The latter is important for cases where we observe values written by other + * CPUs in spin-loops, without barriers, while being subject to scheduling. + * + * CPU0 CPU1 CPU2 + * + * for (;;) { + * if (READ_ONCE(X)) + * break; + * } + * X=1 + * <sched-out> + * <sched-in> + * r = X; + * + * without transitivity it could be that CPU1 observes X!=0 breaks the loop, + * we get migrated and CPU2 sees X==0. + * + * Since most load-store architectures implement ACQUIRE with an smp_mb() after + * the LL/SC loop, they need no further barriers. Similarly all our TSO + * architectures imply an smp_mb() for each atomic instruction and equally don't + * need more. + * + * Architectures that can implement ACQUIRE better need to take care. */ -#ifndef smp_mb__before_spinlock -#define smp_mb__before_spinlock() smp_wmb() +#ifndef smp_mb__after_spinlock +#define smp_mb__after_spinlock() do { } while (0) #endif /** diff --git a/init/Kconfig b/init/Kconfig index 8514b25..5f0ef85 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1275,12 +1275,17 @@ config BASE_FULL config FUTEX bool "Enable futex support" if EXPERT default y - select RT_MUTEXES + imply RT_MUTEXES help Disabling this option will cause the kernel to be built without support for "fast userspace mutexes". The resulting kernel may not run glibc-based applications correctly. +config FUTEX_PI + bool + depends on FUTEX && RT_MUTEXES + default y + config HAVE_FUTEX_CMPXCHG bool depends on FUTEX diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8d51516..9ed6a05 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } +/* Must be called with cpuset_mutex held. */ +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + /* * generate_sched_domains() * diff --git a/kernel/exit.c b/kernel/exit.c index c5548fa..fa72d57 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -920,6 +920,7 @@ void __noreturn do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + lockdep_free_task(tsk); do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/fork.c b/kernel/fork.c index e075b77..5fc0991 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -484,6 +484,8 @@ void __init fork_init(void) cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif + + lockdep_init_task(&init_task); } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -1691,6 +1693,7 @@ static __latent_entropy struct task_struct *copy_process( p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; + lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1949,6 +1952,7 @@ bad_fork_cleanup_audit: bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: + lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/futex.c b/kernel/futex.c index f50b434..0939255 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -876,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid) return p; } +#ifdef CONFIG_FUTEX_PI + /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. @@ -933,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } +#endif + /* * We need to check the following states: * @@ -1800,6 +1804,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + /* + * When PI not supported: return -ENOSYS if requeue_pi is true, + * consequently the compiler knows requeue_pi is always false past + * this point which will optimize away all the conditional code + * further down. + */ + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; + if (requeue_pi) { /* * Requeue PI only works on two distinct uaddrs. This @@ -2595,6 +2608,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (refill_pi_state_cache()) return -ENOMEM; @@ -2774,6 +2790,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) struct futex_q *top_waiter; int ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + retry: if (get_user(uval, uaddr)) return -EFAULT; @@ -2984,6 +3003,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (uaddr == uaddr2) return -EINVAL; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index d11c506..0bf2e8f5 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,29 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_enable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (!count) - static_key_slow_inc(key); -} -EXPORT_SYMBOL_GPL(static_key_enable); - -void static_key_disable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (count) - static_key_slow_dec(key); -} -EXPORT_SYMBOL_GPL(static_key_disable); - -void static_key_slow_inc(struct static_key *key) +static void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key) return; } - cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); - atomic_set(&key->enabled, 1); + /* + * Ensure that if the above cmpxchg loop observes our positive + * value, it must also observe all the text changes. + */ + atomic_set_release(&key->enabled, 1); } else { atomic_inc(&key->enabled); } jump_label_unlock(); +} + +void static_key_slow_inc(struct static_key *key) +{ + cpus_read_lock(); + static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __static_key_slow_dec(struct static_key *key, - unsigned long rate_limit, struct delayed_work *work) +void static_key_enable_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + + if (atomic_read(&key->enabled) > 0) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 1); + return; + } + + jump_label_lock(); + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); + jump_label_update(key); + /* + * See static_key_slow_inc(). + */ + atomic_set_release(&key->enabled, 1); + } + jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); + +void static_key_enable(struct static_key *key) +{ + cpus_read_lock(); + static_key_enable_cpuslocked(key); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + + if (atomic_read(&key->enabled) != 1) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 0); + return; + } + + jump_label_lock(); + if (atomic_cmpxchg(&key->enabled, 1, 0)) + jump_label_update(key); + jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); + +void static_key_disable(struct static_key *key) { cpus_read_lock(); + static_key_disable_cpuslocked(key); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable); + +static void static_key_slow_dec_cpuslocked(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); - cpus_read_unlock(); return; } @@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); +} + +static void __static_key_slow_dec(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ + cpus_read_lock(); + static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7d2499b..1114dc4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -58,6 +58,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/lock.h> +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#include <linux/slab.h> +#endif + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 -# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define SOFTIRQ_VERBOSE 0 -# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE /* * Quick filtering for interesting events: */ @@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +static void cross_init(struct lockdep_map *lock, int cross); +static int cross_lock(struct lockdep_map *lock); +static int lock_acquire_crosslock(struct held_lock *hlock); +static int lock_release_crosslock(struct lockdep_map *lock); +#else +static inline void cross_init(struct lockdep_map *lock, int cross) {} +static inline int cross_lock(struct lockdep_map *lock) { return 0; } +static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } +static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } +#endif + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); + if (cross_lock(tgt->instance)) { + printk(" Possible unsafe locking scenario by crosslock:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk(" unlock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); + } else { + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); + } } /* @@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - pr_warn("\nbut task is already holding lock:\n"); + + if (cross_lock(check_tgt->instance)) + pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); + else + pr_warn("\nbut task is already holding lock:\n"); + print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); @@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data) static noinline int print_circular_bug(struct lock_list *this, struct lock_list *target, struct held_lock *check_src, - struct held_lock *check_tgt) + struct held_lock *check_tgt, + struct stack_trace *trace) { struct task_struct *curr = current; struct lock_list *parent; @@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (!save_trace(&this->trace)) + if (cross_lock(check_tgt->instance)) + this->trace = *trace; + else if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target, return result; } +static noinline int +check_redundant(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_redundant_checks); + + result = __bfs_forwards(root, target, class_equal, target_entry); + + return result; +} + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of @@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; + if (cross_lock(prev->instance)) + continue; + return print_deadlock_bug(curr, prev, next); } return 1; @@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int *stack_saved) + struct held_lock *next, int distance, struct stack_trace *trace, + int (*save)(struct stack_trace *trace)) { struct lock_list *entry; int ret; struct lock_list this; struct lock_list *uninitialized_var(target_entry); - /* - * Static variable, serialized by the graph_lock(). - * - * We use this static variable to save the stack trace in case - * we call into this function multiple times due to encountering - * trylocks in the held lock stack. - */ - static struct stack_trace trace; /* * Prove that the new <prev> -> <next> dependency would not @@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); if (unlikely(!ret)) - return print_circular_bug(&this, target_entry, next, prev); + return print_circular_bug(&this, target_entry, next, prev, trace); else if (unlikely(ret < 0)) return print_bfs_bug(ret); @@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 2; + return 1; } } - if (!*stack_saved) { - if (!save_trace(&trace)) - return 0; - *stack_saved = 1; + /* + * Is the <prev> -> <next> link redundant? + */ + this.class = hlock_class(prev); + this.parent = NULL; + ret = check_redundant(&this, hlock_class(next), &target_entry); + if (!ret) { + debug_atomic_inc(nr_redundant); + return 2; } + if (ret < 0) + return print_bfs_bug(ret); + + + if (save && !save(trace)) + return 0; /* * Ok, all validations passed, add the new lock @@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, &trace); + next->acquire_ip, distance, trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, - next->acquire_ip, distance, &trace); + next->acquire_ip, distance, trace); if (!ret) return 0; @@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - /* We drop graph lock, so another thread can overwrite trace. */ - *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, print_lock_name(hlock_class(next)); printk(KERN_CONT "\n"); dump_stack(); - return graph_lock(); + if (!graph_lock()) + return 0; } - return 1; + return 2; } /* @@ -1925,8 +1985,9 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int stack_saved = 0; struct held_lock *hlock; + struct stack_trace trace; + int (*save)(struct stack_trace *trace) = save_trace; /* * Debugging checks. @@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; /* - * Only non-recursive-read entries get new dependencies - * added: + * Only non-crosslock entries get new dependencies added. + * Crosslock entries will be added by commit later: */ - if (hlock->read != 2 && hlock->check) { - if (!check_prev_add(curr, hlock, next, - distance, &stack_saved)) - return 0; + if (!cross_lock(hlock->instance)) { /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!hlock->trylock) - break; + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, + distance, &trace, save); + if (!ret) + return 0; + + /* + * Stop saving stack_trace if save_trace() was + * called at least once: + */ + if (save && ret == 2) + save = NULL; + + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; + } } depth--; /* @@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr, } /* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) + * This is for building a chain between just two different classes, + * instead of adding a new hlock upon current, which is done by + * add_chain_cache(). + * + * This can be called in any context with two classes, while + * add_chain_cache() must be done within the lock owener's context + * since it uses hlock which might be racy in another context. */ -static inline int lookup_chain_cache(struct task_struct *curr, - struct held_lock *hlock, - u64 chain_key) +static inline int add_chain_cache_classes(unsigned int prev, + unsigned int next, + unsigned int irq_context, + u64 chain_key) { - struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - int i, j; + + /* + * Allocate a new chain entry from the static array, and add + * it to the hash: + */ /* * We might need to take the graph lock, ensure we've got IRQs @@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr, */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; + + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); + dump_stack(); + return 0; + } + + chain = lock_chains + nr_lock_chains++; + chain->chain_key = chain_key; + chain->irq_context = irq_context; + chain->depth = 2; + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = nr_chain_hlocks; + nr_chain_hlocks += chain->depth; + chain_hlocks[chain->base] = prev - 1; + chain_hlocks[chain->base + 1] = next -1; + } +#ifdef CONFIG_DEBUG_LOCKDEP /* - * We can walk it lock-free, because entries only get added - * to the hash: + * Important for check_no_collision(). */ - hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { -cache_hit: - debug_atomic_inc(chain_lookup_hits); - if (!check_no_collision(curr, hlock, chain)) - return 0; - - if (very_verbose(class)) - printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, - class->key, class->name); + else { + if (!debug_locks_off_graph_unlock()) return 0; - } + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); + dump_stack(); + return 0; } - if (very_verbose(class)) - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, class->key, class->name); +#endif + + hlist_add_head_rcu(&chain->entry, hash_head); + debug_atomic_inc(chain_lookup_misses); + inc_chains(); + + return 1; +} + +/* + * Adds a dependency chain into chain hashtable. And must be called with + * graph_lock held. + * + * Return 0 if fail, and graph_lock is released. + * Return 1 if succeed, with graph_lock held. + */ +static inline int add_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + int i, j; + /* * Allocate a new chain entry from the static array, and add * it to the hash: */ - if (!graph_lock()) - return 0; + /* - * We have to walk the chain again locked - to avoid duplicates: + * We might need to take the graph lock, ensure we've got IRQs + * disabled to make this an IRQ-safe lock.. for recursion reasons + * lockdep won't complain about its own locking errors. */ - hlist_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { - graph_unlock(); - goto cache_hit; - } - } + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2235,6 +2351,78 @@ cache_hit: return 1; } +/* + * Look up a dependency chain. + */ +static inline struct lock_chain *lookup_chain_cache(u64 chain_key) +{ + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + + /* + * We can walk it lock-free, because entries only get added + * to the hash: + */ + hlist_for_each_entry_rcu(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { + debug_atomic_inc(chain_lookup_hits); + return chain; + } + } + return NULL; +} + +/* + * If the key is not present yet in dependency chain cache then + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) + */ +static inline int lookup_chain_cache_add(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct lock_chain *chain = lookup_chain_cache(chain_key); + + if (chain) { +cache_hit: + if (!check_no_collision(curr, hlock, chain)) + return 0; + + if (very_verbose(class)) { + printk("\nhash chain already cached, key: " + "%016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, + class->key, class->name); + } + + return 0; + } + + if (very_verbose(class)) { + printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, class->key, class->name); + } + + if (!graph_lock()) + return 0; + + /* + * We have to walk the chain again locked - to avoid duplicates: + */ + chain = lookup_chain_cache(chain_key); + if (chain) { + graph_unlock(); + goto cache_hit; + } + + if (!add_chain_cache(curr, hlock, chain_key)) + return 0; + + return 1; +} + static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, struct held_lock *hlock, int chain_head, u64 chain_key) { @@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * * We look up the chain_key and do the O(N^2) check and update of * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires + * (If lookup_chain_cache_add() return with 1 it acquires * graph_lock for us) */ if (!hlock->trylock && hlock->check && - lookup_chain_cache(curr, hlock, chain_key)) { + lookup_chain_cache_add(curr, hlock, chain_key)) { /* * Check whether last held lock: * @@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * Add dependency only if this lock is not the head * of the chain, and if it's not a secondary read-lock: */ - if (!chain_head && ret != 2) + if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) return 0; + } + graph_unlock(); - } else - /* after lookup_chain_cache(): */ + } else { + /* after lookup_chain_cache_add(): */ if (unlikely(!debug_locks)) return 0; + } return 1; } @@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE - return class_filter(class); -#endif - return 0; -} - #define STRICT_READ_CHECKS 1 static int (*state_verbose_f[])(struct lock_class *class) = { @@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - gfp_mask = current_gfp_context(gfp_mask); - - /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) - return; - - /* this guy won't enter reclaim */ - if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) - return; - - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) - return; - - /* - * Oi! Can't be having __GFP_FS allocations with IRQs disabled. - */ - if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) - return; - - /* Disable lockdep if explicitly requested */ - if (gfp_mask & __GFP_NOLOCKDEP) - return; - - mark_held_locks(curr, RECLAIM_FS); -} - -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lockdep_trace_alloc(gfp_mask, flags); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* @@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } - /* - * We reuse the irq context infrastructure more broadly as a general - * context checking code. This tests GFP_FS recursion (a lock taken - * during reclaim for a GFP_FS allocation is held over a GFP_FS - * allocation). - */ - if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { - if (hlock->read) { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) - return 0; - } - } - return 1; } @@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} - #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ /* @@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, +static void __lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { int i; @@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } + +void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 0); + __lockdep_init_map(lock, name, key, subclass); +} EXPORT_SYMBOL_GPL(lockdep_init_map); +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 1); + __lockdep_init_map(lock, name, key, subclass); +} +EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); +#endif + struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; + int ret; if (unlikely(!debug_locks)) return 0; @@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - if (depth) { + /* TODO: nest_lock is not implemented for crosslock yet. */ + if (depth && !cross_lock(lock)) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; + ret = lock_acquire_crosslock(hlock); + /* + * 2 means normal acquire operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int i; + int ret, i; if (unlikely(!debug_locks)) return 0; + ret = lock_release_crosslock(lock); + /* + * 2 means normal release operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) } EXPORT_SYMBOL_GPL(lock_unpin_lock); -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ - current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); -} -EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); - -void lockdep_clear_current_reclaim_state(void) -{ - current->lockdep_reclaim_gfp = 0; -} -EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); - #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, @@ -4484,6 +4619,13 @@ asmlinkage __visible void lockdep_sys_exit(void) curr->comm, curr->pid); lockdep_print_held_locks(curr); } + + /* + * The lock history for each syscall should be independent. So wipe the + * slate clean on return to userspace. + */ + crossrelease_hist_end(XHLOCK_PROC); + crossrelease_hist_start(XHLOCK_PROC); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4532,3 +4674,470 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + +/* + * Crossrelease works by recording a lock history for each thread and + * connecting those historic locks that were taken after the + * wait_for_completion() in the complete() context. + * + * Task-A Task-B + * + * mutex_lock(&A); + * mutex_unlock(&A); + * + * wait_for_completion(&C); + * lock_acquire_crosslock(); + * atomic_inc_return(&cross_gen_id); + * | + * | mutex_lock(&B); + * | mutex_unlock(&B); + * | + * | complete(&C); + * `-- lock_commit_crosslock(); + * + * Which will then add a dependency between B and C. + */ + +#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) + +/* + * Whenever a crosslock is held, cross_gen_id will be increased. + */ +static atomic_t cross_gen_id; /* Can be wrapped */ + +/* + * Make an entry of the ring buffer invalid. + */ +static inline void invalidate_xhlock(struct hist_lock *xhlock) +{ + /* + * Normally, xhlock->hlock.instance must be !NULL. + */ + xhlock->hlock.instance = NULL; +} + +/* + * Lock history stacks; we have 3 nested lock history stacks: + * + * Hard IRQ + * Soft IRQ + * History / Task + * + * The thing is that once we complete a (Hard/Soft) IRQ the future task locks + * should not depend on any of the locks observed while running the IRQ. + * + * So what we do is rewind the history buffer and erase all our knowledge of + * that temporal event. + */ + +/* + * We need this to annotate lock history boundaries. Take for instance + * workqueues; each work is independent of the last. The completion of a future + * work does not depend on the completion of a past work (in general). + * Therefore we must not carry that (lock) dependency across works. + * + * This is true for many things; pretty much all kthreads fall into this + * pattern, where they have an 'idle' state and future completions do not + * depend on past completions. Its just that since they all have the 'same' + * form -- the kthread does the same over and over -- it doesn't typically + * matter. + * + * The same is true for system-calls, once a system call is completed (we've + * returned to userspace) the next system call does not depend on the lock + * history of the previous system call. + */ +void crossrelease_hist_start(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (cur->xhlocks) { + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; + } +} + +void crossrelease_hist_end(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (cur->xhlocks) { + unsigned int idx = cur->xhlock_idx_hist[c]; + struct hist_lock *h = &xhlock(idx); + + cur->xhlock_idx = idx; + + /* Check if the ring was overwritten. */ + if (h->hist_id != cur->hist_id_save[c]) + invalidate_xhlock(h); + } +} + +static int cross_lock(struct lockdep_map *lock) +{ + return lock ? lock->cross : 0; +} + +/* + * This is needed to decide the relationship between wrapable variables. + */ +static inline int before(unsigned int a, unsigned int b) +{ + return (int)(a - b) < 0; +} + +static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) +{ + return hlock_class(&xhlock->hlock); +} + +static inline struct lock_class *xlock_class(struct cross_lock *xlock) +{ + return hlock_class(&xlock->hlock); +} + +/* + * Should we check a dependency with previous one? + */ +static inline int depend_before(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check && !hlock->trylock; +} + +/* + * Should we check a dependency with next one? + */ +static inline int depend_after(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check; +} + +/* + * Check if the xhlock is valid, which would be false if, + * + * 1. Has not used after initializaion yet. + * 2. Got invalidated. + * + * Remind hist_lock is implemented as a ring buffer. + */ +static inline int xhlock_valid(struct hist_lock *xhlock) +{ + /* + * xhlock->hlock.instance must be !NULL. + */ + return !!xhlock->hlock.instance; +} + +/* + * Record a hist_lock entry. + * + * Irq disable is only required. + */ +static void add_xhlock(struct held_lock *hlock) +{ + unsigned int idx = ++current->xhlock_idx; + struct hist_lock *xhlock = &xhlock(idx); + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * This can be done locklessly because they are all task-local + * state, we must however ensure IRQs are disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); +#endif + + /* Initialize hist_lock's members */ + xhlock->hlock = *hlock; + xhlock->hist_id = current->hist_id++; + + xhlock->trace.nr_entries = 0; + xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; + xhlock->trace.entries = xhlock->trace_entries; + xhlock->trace.skip = 3; + save_stack_trace(&xhlock->trace); +} + +static inline int same_context_xhlock(struct hist_lock *xhlock) +{ + return xhlock->hlock.irq_context == task_irq_context(current); +} + +/* + * This should be lockless as far as possible because this would be + * called very frequently. + */ +static void check_add_xhlock(struct held_lock *hlock) +{ + /* + * Record a hist_lock, only in case that acquisitions ahead + * could depend on the held_lock. For example, if the held_lock + * is trylock then acquisitions ahead never depends on that. + * In that case, we don't need to record it. Just return. + */ + if (!current->xhlocks || !depend_before(hlock)) + return; + + add_xhlock(hlock); +} + +/* + * For crosslock. + */ +static int add_xlock(struct held_lock *hlock) +{ + struct cross_lock *xlock; + unsigned int gen_id; + + if (!graph_lock()) + return 0; + + xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; + + /* + * When acquisitions for a crosslock are overlapped, we use + * nr_acquire to perform commit for them, based on cross_gen_id + * of the first acquisition, which allows to add additional + * dependencies. + * + * Moreover, when no acquisition of a crosslock is in progress, + * we should not perform commit because the lock might not exist + * any more, which might cause incorrect memory access. So we + * have to track the number of acquisitions of a crosslock. + * + * depend_after() is necessary to initialize only the first + * valid xlock so that the xlock can be used on its commit. + */ + if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) + goto unlock; + + gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); + xlock->hlock = *hlock; + xlock->hlock.gen_id = gen_id; +unlock: + graph_unlock(); + return 1; +} + +/* + * Called for both normal and crosslock acquires. Normal locks will be + * pushed on the hist_lock queue. Cross locks will record state and + * stop regular lock_acquire() to avoid being placed on the held_lock + * stack. + * + * Return: 0 - failure; + * 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_acquire_crosslock(struct held_lock *hlock) +{ + /* + * CONTEXT 1 CONTEXT 2 + * --------- --------- + * lock A (cross) + * X = atomic_inc_return(&cross_gen_id) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Y = atomic_read_acquire(&cross_gen_id) + * lock B + * + * atomic_read_acquire() is for ordering between A and B, + * IOW, A happens before B, when CONTEXT 2 see Y >= X. + * + * Pairs with atomic_inc_return() in add_xlock(). + */ + hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); + + if (cross_lock(hlock->instance)) + return add_xlock(hlock); + + check_add_xhlock(hlock); + return 2; +} + +static int copy_trace(struct stack_trace *trace) +{ + unsigned long *buf = stack_trace + nr_stack_trace_entries; + unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + unsigned int nr = min(max_nr, trace->nr_entries); + + trace->nr_entries = nr; + memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); + trace->entries = buf; + nr_stack_trace_entries += nr; + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); + dump_stack(); + + return 0; + } + + return 1; +} + +static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) +{ + unsigned int xid, pid; + u64 chain_key; + + xid = xlock_class(xlock) - lock_classes; + chain_key = iterate_chain_key((u64)0, xid); + pid = xhlock_class(xhlock) - lock_classes; + chain_key = iterate_chain_key(chain_key, pid); + + if (lookup_chain_cache(chain_key)) + return 1; + + if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, + chain_key)) + return 0; + + if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, + &xhlock->trace, copy_trace)) + return 0; + + return 1; +} + +static void commit_xhlocks(struct cross_lock *xlock) +{ + unsigned int cur = current->xhlock_idx; + unsigned int prev_hist_id = xhlock(cur).hist_id; + unsigned int i; + + if (!graph_lock()) + return; + + if (xlock->nr_acquire) { + for (i = 0; i < MAX_XHLOCKS_NR; i++) { + struct hist_lock *xhlock = &xhlock(cur - i); + + if (!xhlock_valid(xhlock)) + break; + + if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) + break; + + if (!same_context_xhlock(xhlock)) + break; + + /* + * Filter out the cases that the ring buffer was + * overwritten and the previous entry has a bigger + * hist_id than the following one, which is impossible + * otherwise. + */ + if (unlikely(before(xhlock->hist_id, prev_hist_id))) + break; + + prev_hist_id = xhlock->hist_id; + + /* + * commit_xhlock() returns 0 with graph_lock already + * released if fail. + */ + if (!commit_xhlock(xlock, xhlock)) + return; + } + } + + graph_unlock(); +} + +void lock_commit_crosslock(struct lockdep_map *lock) +{ + struct cross_lock *xlock; + unsigned long flags; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (!current->xhlocks) + return; + + /* + * Do commit hist_locks with the cross_lock, only in case that + * the cross_lock could depend on acquisitions after that. + * + * For example, if the cross_lock does not have the 'check' flag + * then we don't need to check dependencies and commit for that. + * Just skip it. In that case, of course, the cross_lock does + * not depend on acquisitions ahead, either. + * + * WARNING: Don't do that in add_xlock() in advance. When an + * acquisition context is different from the commit context, + * invalid(skipped) cross_lock might be accessed. + */ + if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + xlock = &((struct lockdep_map_cross *)lock)->xlock; + commit_xhlocks(xlock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_commit_crosslock); + +/* + * Return: 0 - failure; + * 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_release_crosslock(struct lockdep_map *lock) +{ + if (cross_lock(lock)) { + if (!graph_lock()) + return 0; + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; + graph_unlock(); + return 1; + } + return 2; +} + +static void cross_init(struct lockdep_map *lock, int cross) +{ + if (cross) + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; + + lock->cross = cross; + + /* + * Crossrelease assumes that the ring buffer size of xhlocks + * is aligned with power of 2. So force it on build. + */ + BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); +} + +void lockdep_init_task(struct task_struct *task) +{ + int i; + + task->xhlock_idx = UINT_MAX; + task->hist_id = 0; + + for (i = 0; i < XHLOCK_CTX_NR; i++) { + task->xhlock_idx_hist[i] = UINT_MAX; + task->hist_id_save[i] = 0; + } + + task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, + GFP_KERNEL); +} + +void lockdep_free_task(struct task_struct *task) +{ + if (task->xhlocks) { + void *tmp = task->xhlocks; + /* Diable crossrelease for current */ + task->xhlocks = NULL; + kfree(tmp); + } +} +#endif diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c08fbd2..1da4669 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -143,6 +143,8 @@ struct lockdep_stats { int redundant_softirqs_on; int redundant_softirqs_off; int nr_unused_locks; + int nr_redundant_checks; + int nr_redundant; int nr_cyclic_checks; int nr_cyclic_check_recursions; int nr_find_usage_forwards_checks; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6d1fcc7..68d9e26 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m) debug_atomic_read(chain_lookup_hits)); seq_printf(m, " cyclic checks: %11llu\n", debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " redundant checks: %11llu\n", + debug_atomic_read(nr_redundant_checks)); + seq_printf(m, " redundant links: %11llu\n", + debug_atomic_read(nr_redundant)); seq_printf(m, " find-mask forwards checks: %11llu\n", debug_atomic_read(nr_find_usage_forwards_checks)); seq_printf(m, " find-mask backwards checks: %11llu\n", diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc..35ca09f 100644 --- a/kernel/locking/lockdep_states.h +++ b/kernel/locking/lockdep_states.h @@ -6,4 +6,3 @@ */ LOCKDEP_STATE(HARDIRQ) LOCKDEP_STATE(SOFTIRQ) -LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index a316794..a74ee6a 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + WRITE_ONCE(prev->next, node); /* diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 72ad45a..8d039b9 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,6 +40,9 @@ struct rt_mutex_waiter { /* * Various helpers to access the waiters-tree: */ + +#ifdef CONFIG_RT_MUTEXES + static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters); @@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p) pi_tree_entry); } +#else + +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return false; +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + return NULL; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return false; +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return NULL; +} + +#endif + /* * lock->owner state tracking: */ diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 20819df..0848634 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +int __sched __down_read_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; unsigned long flags; @@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - set_current_state(TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; @@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - /* wait to be given the lock */ for (;;) { if (!waiter.task) break; + if (signal_pending_state(state, current)) + goto out_nolock; + set_current_state(state); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + raw_spin_lock_irqsave(&sem->wait_lock, flags); } - __set_current_state(TASK_RUNNING); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); out: - ; + return 0; + +out_nolock: + /* + * We didn't take the lock, so that there is a writer, which + * is owner or the first waiter of the sem. If it's a waiter, + * it will be woken by current owner. Not need to wake anybody. + */ + list_del(&waiter.list); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + return -EINTR; +} + +void __sched __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_read_killable(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 34e727f..02f6606 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Wait for the read lock to be granted */ -__visible -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; @@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); if (!waiter.task) break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } schedule(); } __set_current_state(TASK_RUNNING); return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_read_failed); +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + /* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae..566b6ec 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -32,6 +32,12 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); + + /* + * Perform commit of crossrelease here. + */ + complete_release_commit(x); + if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); @@ -92,9 +98,14 @@ __wait_for_common(struct completion *x, { might_sleep(); + complete_acquire(x); + spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); + + complete_release(x); + return timeout; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20..9fece58 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1967,8 +1967,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with mb() in * set_current_state() the waiting thread does. */ - smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); if (!(p->state & state)) goto out; @@ -3281,8 +3281,8 @@ static void __sched notrace __schedule(bool preempt) * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ - smp_mb__before_spinlock(); rq_lock(rq, &rf); + smp_mb__after_spinlock(); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 3d5610d..2227e18 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q) { unsigned long flags; - if (!swait_active(q)) - return; - raw_spin_lock_irqsave(&q->lock, flags); swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); @@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q) struct swait_queue *curr; LIST_HEAD(tmp); - if (!swait_active(q)) - return; - raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca937b0..e86733a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2093,6 +2093,7 @@ __acquires(&pool->lock) lock_map_acquire_read(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); + crossrelease_hist_start(XHLOCK_PROC); trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2100,6 +2101,7 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work); + crossrelease_hist_end(XHLOCK_PROC); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715..ebd40d3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1150,6 +1150,27 @@ config LOCK_STAT CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. (CONFIG_LOCKDEP defines "acquire" and "release" events.) +config LOCKDEP_CROSSRELEASE + bool "Lock debugging: make lockdep work for crosslocks" + depends on PROVE_LOCKING + default n + help + This makes lockdep work for crosslock which is a lock allowed to + be released in a different context from the acquisition context. + Normally a lock must be released in the context acquiring the lock. + However, relexing this constraint helps synchronization primitives + such as page locks or completions can use the lock correctness + detector, lockdep. + +config LOCKDEP_COMPLETE + bool "Lock debugging: allow completions to use deadlock detector" + depends on PROVE_LOCKING + select LOCKDEP_CROSSRELEASE + default n + help + A deadlock caused by wait_for_completion() and complete() can be + detected by lockdep using crossrelease feature. + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 216114f..ce88345 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1410,6 +1410,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; + bool need_flush = false; bool page_locked; bool migrated = false; bool was_writable; @@ -1503,10 +1504,29 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); /* + * Since we took the NUMA fault, we must have observed the !accessible + * bit. Make sure all other CPUs agree with that, to avoid them + * modifying the page we're about to migrate. + * + * Must be done under PTL such that we'll observe the relevant + * set_tlb_flush_pending(). + */ + if (mm_tlb_flush_pending(vma->vm_mm)) + need_flush = true; + + /* * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ spin_unlock(vmf->ptl); + + /* + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant + */ + if (need_flush) + flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index ca11bc4..6f319fb 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } -void kasan_check_read(const void *p, unsigned int size) +void kasan_check_read(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(kasan_check_read); -void kasan_check_write(const void *p, unsigned int size) +void kasan_check_write(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, true, _RET_IP_); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d00f74..6acf612 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> +#include <linux/lockdep.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -3490,6 +3491,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } #endif /* CONFIG_COMPACTION */ +#ifdef CONFIG_LOCKDEP +struct lockdep_map __fs_reclaim_map = + STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); + +static bool __need_fs_reclaim(gfp_t gfp_mask) +{ + gfp_mask = current_gfp_context(gfp_mask); + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* this guy won't enter reclaim */ + if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return false; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return false; + + if (gfp_mask & __GFP_NOLOCKDEP) + return false; + + return true; +} + +void fs_reclaim_acquire(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_acquire(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_acquire); + +void fs_reclaim_release(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_release(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_release); +#endif + /* Perform direct synchronous page reclaim */ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -3504,7 +3546,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(gfp_mask); + fs_reclaim_acquire(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3512,7 +3554,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -4041,7 +4083,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - lockdep_trace_alloc(gfp_mask); + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); @@ -43,6 +43,7 @@ struct kmem_cache { #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/random.h> +#include <linux/sched/mm.h> /* * State of the slab allocator. @@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s, flags)) @@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp &= gfp_allowed_mask; - lockdep_trace_alloc(gfp); + fs_reclaim_acquire(gfp); + fs_reclaim_release(gfp); if (size < PAGE_SIZE - align) { if (!size) @@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); diff --git a/mm/vmscan.c b/mm/vmscan.c index a1af041..f957afe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3525,8 +3525,6 @@ static int kswapd(void *p) }; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - lockdep_set_current_reclaim_state(GFP_KERNEL); - if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; @@ -3585,14 +3583,15 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); + fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); return 0; } @@ -3655,14 +3654,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned int noreclaim_flag; noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; @@ -3847,7 +3846,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3862,9 +3861,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; + fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); - lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a7c804f..b34f09b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1809,8 +1809,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static struct static_key udp_encap_needed __read_mostly; void udp_encap_enable(void) { - if (!static_key_enabled(&udp_encap_needed)) - static_key_slow_inc(&udp_encap_needed); + static_key_enable(&udp_encap_needed); } EXPORT_SYMBOL(udp_encap_enable); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 578142b..96d2407 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -574,8 +574,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb, static struct static_key udpv6_encap_needed __read_mostly; void udpv6_encap_enable(void) { - if (!static_key_enabled(&udpv6_encap_needed)) - static_key_slow_inc(&udpv6_encap_needed); + static_key_enable(&udpv6_encap_needed); } EXPORT_SYMBOL(udpv6_encap_enable); |