diff options
author | mjg <mjg@FreeBSD.org> | 2017-03-16 06:35:53 +0000 |
---|---|---|
committer | mjg <mjg@FreeBSD.org> | 2017-03-16 06:35:53 +0000 |
commit | 75c730bce9acd14b7ecb04b7be87b14661c92be3 (patch) | |
tree | a495b1d47bb6dabb7a674823c4cd6dfa1959abc9 /sys/sys | |
parent | 871eda175a84b535a6be9a606ffb7d37a9ac194c (diff) | |
download | FreeBSD-src-75c730bce9acd14b7ecb04b7be87b14661c92be3.zip FreeBSD-src-75c730bce9acd14b7ecb04b7be87b14661c92be3.tar.gz |
MFC r313269,r313270,r313271,r313272,r313274,r313278,r313279,r313996,r314474
mtx: switch to fcmpset
The found value is passed to locking routines in order to reduce cacheline
accesses.
mtx_unlock grows an explicit check for regular unlock. On ll/sc architectures
the routine can fail even if the lock could have been handled by the inline
primitive.
==
rwlock: switch to fcmpset
==
sx: switch to fcmpset
==
sx: uninline slock/sunlock
Shared locking routines explicitly read the value and test it. If the
change attempt fails, they fall back to a regular function which would
retry in a loop.
The problem is that with many concurrent readers the risk of failure is pretty
high and even the value returned by fcmpset is very likely going to be stale
by the time the loop in the fallback routine is reached.
Uninline said primitives. It gives a throughput increase when doing concurrent
slocks/sunlocks with 80 hardware threads from ~50 mln/s to ~56 mln/s.
Interestingly, rwlock primitives are already not inlined.
==
sx: add witness support missed in r313272
==
mtx: fix up _mtx_obtain_lock_fetch usage in thread lock
Since _mtx_obtain_lock_fetch no longer sets the argument to MTX_UNOWNED,
callers have to do it on their own.
==
mtx: fixup r313278, the assignemnt was supposed to go inside the loop
==
mtx: fix spin mutexes interaction with failed fcmpset
While doing so move recursion support down to the fallback routine.
==
locks: ensure proper barriers are used with atomic ops when necessary
Unclear how, but the locking routine for mutexes was using the *release*
barrier instead of acquire. This must have been either a copy-pasto or bad
completion.
Going through other uses of atomics shows no barriers in:
- upgrade routines (addressed in this patch)
- sections protected with turnstile locks - this should be fine as necessary
barriers are in the worst case provided by turnstile unlock
I would like to thank Mark Millard and andreast@ for reporting the problem and
testing previous patches before the issue got identified.
Diffstat (limited to 'sys/sys')
-rw-r--r-- | sys/sys/mutex.h | 38 | ||||
-rw-r--r-- | sys/sys/rwlock.h | 20 | ||||
-rw-r--r-- | sys/sys/sx.h | 63 |
3 files changed, 44 insertions, 77 deletions
diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h index 11ed9d7..841bd76 100644 --- a/sys/sys/mutex.h +++ b/sys/sys/mutex.h @@ -98,13 +98,13 @@ void mtx_sysinit(void *arg); int _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line); void mutex_init(void); -void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts, - const char *file, int line); +void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t v, uintptr_t tid, + int opts, const char *file, int line); void __mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line); #ifdef SMP -void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts, - const char *file, int line); +void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t v, uintptr_t tid, + int opts, const char *file, int line); #endif void __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line); @@ -140,13 +140,13 @@ void thread_lock_flags_(struct thread *, int, const char *, int); _mtx_destroy(&(m)->mtx_lock) #define mtx_trylock_flags_(m, o, f, l) \ _mtx_trylock_flags_(&(m)->mtx_lock, o, f, l) -#define _mtx_lock_sleep(m, t, o, f, l) \ - __mtx_lock_sleep(&(m)->mtx_lock, t, o, f, l) +#define _mtx_lock_sleep(m, v, t, o, f, l) \ + __mtx_lock_sleep(&(m)->mtx_lock, v, t, o, f, l) #define _mtx_unlock_sleep(m, o, f, l) \ __mtx_unlock_sleep(&(m)->mtx_lock, o, f, l) #ifdef SMP -#define _mtx_lock_spin(m, t, o, f, l) \ - _mtx_lock_spin_cookie(&(m)->mtx_lock, t, o, f, l) +#define _mtx_lock_spin(m, v, t, o, f, l) \ + _mtx_lock_spin_cookie(&(m)->mtx_lock, v, t, o, f, l) #endif #define _mtx_lock_flags(m, o, f, l) \ __mtx_lock_flags(&(m)->mtx_lock, o, f, l) @@ -171,6 +171,11 @@ void thread_lock_flags_(struct thread *, int, const char *, int); #define _mtx_obtain_lock(mp, tid) \ atomic_cmpset_acq_ptr(&(mp)->mtx_lock, MTX_UNOWNED, (tid)) +#define _mtx_obtain_lock_fetch(mp, vp, tid) ({ \ + *vp = MTX_UNOWNED; \ + atomic_fcmpset_acq_ptr(&(mp)->mtx_lock, vp, (tid)); \ +}) + /* Try to release mtx_lock if it is unrecursed and uncontested. */ #define _mtx_release_lock(mp, tid) \ atomic_cmpset_rel_ptr(&(mp)->mtx_lock, (tid), MTX_UNOWNED) @@ -188,9 +193,10 @@ void thread_lock_flags_(struct thread *, int, const char *, int); /* Lock a normal mutex. */ #define __mtx_lock(mp, tid, opts, file, line) do { \ uintptr_t _tid = (uintptr_t)(tid); \ + uintptr_t _v; \ \ - if (((mp)->mtx_lock != MTX_UNOWNED || !_mtx_obtain_lock((mp), _tid)))\ - _mtx_lock_sleep((mp), _tid, (opts), (file), (line)); \ + if (!_mtx_obtain_lock_fetch((mp), &_v, _tid)) \ + _mtx_lock_sleep((mp), _v, _tid, (opts), (file), (line));\ else \ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, \ mp, 0, 0, file, line); \ @@ -205,14 +211,12 @@ void thread_lock_flags_(struct thread *, int, const char *, int); #ifdef SMP #define __mtx_lock_spin(mp, tid, opts, file, line) do { \ uintptr_t _tid = (uintptr_t)(tid); \ + uintptr_t _v; \ \ spinlock_enter(); \ - if (((mp)->mtx_lock != MTX_UNOWNED || !_mtx_obtain_lock((mp), _tid))) {\ - if ((mp)->mtx_lock == _tid) \ - (mp)->mtx_recurse++; \ - else \ - _mtx_lock_spin((mp), _tid, (opts), (file), (line)); \ - } else \ + if (!_mtx_obtain_lock_fetch((mp), &_v, _tid)) \ + _mtx_lock_spin((mp), _v, _tid, (opts), (file), (line)); \ + else \ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, \ mp, 0, 0, file, line); \ } while (0) @@ -265,7 +269,7 @@ void thread_lock_flags_(struct thread *, int, const char *, int); \ if ((mp)->mtx_recurse == 0) \ LOCKSTAT_PROFILE_RELEASE_LOCK(adaptive__release, mp); \ - if ((mp)->mtx_lock != _tid || !_mtx_release_lock((mp), _tid)) \ + if (!_mtx_release_lock((mp), _tid)) \ _mtx_unlock_sleep((mp), (opts), (file), (line)); \ } while (0) diff --git a/sys/sys/rwlock.h b/sys/sys/rwlock.h index e5b1166..816037e 100644 --- a/sys/sys/rwlock.h +++ b/sys/sys/rwlock.h @@ -84,6 +84,11 @@ #define _rw_write_lock(rw, tid) \ atomic_cmpset_acq_ptr(&(rw)->rw_lock, RW_UNLOCKED, (tid)) +#define _rw_write_lock_fetch(rw, vp, tid) ({ \ + *vp = RW_UNLOCKED; \ + atomic_fcmpset_acq_ptr(&(rw)->rw_lock, vp, (tid)); \ +}) + /* Release a write lock quickly if there are no waiters. */ #define _rw_write_unlock(rw, tid) \ atomic_cmpset_rel_ptr(&(rw)->rw_lock, (tid), RW_UNLOCKED) @@ -97,9 +102,10 @@ /* Acquire a write lock. */ #define __rw_wlock(rw, tid, file, line) do { \ uintptr_t _tid = (uintptr_t)(tid); \ + uintptr_t _v; \ \ - if ((rw)->rw_lock != RW_UNLOCKED || !_rw_write_lock((rw), _tid))\ - _rw_wlock_hard((rw), _tid, (file), (line)); \ + if (!_rw_write_lock_fetch((rw), &_v, _tid)) \ + _rw_wlock_hard((rw), _v, _tid, (file), (line)); \ else \ LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, \ 0, 0, file, line, LOCKSTAT_WRITER); \ @@ -114,7 +120,7 @@ else { \ LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, \ LOCKSTAT_WRITER); \ - if ((rw)->rw_lock != _tid || !_rw_write_unlock((rw), _tid))\ + if (!_rw_write_unlock((rw), _tid)) \ _rw_wunlock_hard((rw), _tid, (file), (line)); \ } \ } while (0) @@ -135,8 +141,8 @@ void _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line); void __rw_rlock(volatile uintptr_t *c, const char *file, int line); int __rw_try_rlock(volatile uintptr_t *c, const char *file, int line); void _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line); -void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file, - int line); +void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t v, uintptr_t tid, + const char *file, int line); void __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file, int line); int __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line); @@ -171,8 +177,8 @@ void __rw_assert(const volatile uintptr_t *c, int what, const char *file, __rw_try_rlock(&(rw)->rw_lock, f, l) #define _rw_runlock(rw, f, l) \ _rw_runlock_cookie(&(rw)->rw_lock, f, l) -#define _rw_wlock_hard(rw, t, f, l) \ - __rw_wlock_hard(&(rw)->rw_lock, t, f, l) +#define _rw_wlock_hard(rw, v, t, f, l) \ + __rw_wlock_hard(&(rw)->rw_lock, v, t, f, l) #define _rw_wunlock_hard(rw, t, f, l) \ __rw_wunlock_hard(&(rw)->rw_lock, t, f, l) #define _rw_try_upgrade(rw, f, l) \ diff --git a/sys/sys/sx.h b/sys/sys/sx.h index e8cffaa..a676d2a 100644 --- a/sys/sys/sx.h +++ b/sys/sys/sx.h @@ -109,12 +109,10 @@ int _sx_slock(struct sx *sx, int opts, const char *file, int line); int _sx_xlock(struct sx *sx, int opts, const char *file, int line); void _sx_sunlock(struct sx *sx, const char *file, int line); void _sx_xunlock(struct sx *sx, const char *file, int line); -int _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, +int _sx_xlock_hard(struct sx *sx, uintptr_t v, uintptr_t tid, int opts, const char *file, int line); -int _sx_slock_hard(struct sx *sx, int opts, const char *file, int line); void _sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line); -void _sx_sunlock_hard(struct sx *sx, const char *file, int line); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void _sx_assert(const struct sx *sx, int what, const char *file, int line); #endif @@ -153,11 +151,12 @@ __sx_xlock(struct sx *sx, struct thread *td, int opts, const char *file, int line) { uintptr_t tid = (uintptr_t)td; + uintptr_t v; int error = 0; - if (sx->sx_lock != SX_LOCK_UNLOCKED || - !atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) - error = _sx_xlock_hard(sx, tid, opts, file, line); + v = SX_LOCK_UNLOCKED; + if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &v, tid)) + error = _sx_xlock_hard(sx, v, tid, opts, file, line); else LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); @@ -174,46 +173,10 @@ __sx_xunlock(struct sx *sx, struct thread *td, const char *file, int line) if (sx->sx_recurse == 0) LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_WRITER); - if (sx->sx_lock != tid || - !atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED)) + if (!atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED)) _sx_xunlock_hard(sx, tid, file, line); } -/* Acquire a shared lock. */ -static __inline int -__sx_slock(struct sx *sx, int opts, const char *file, int line) -{ - uintptr_t x = sx->sx_lock; - int error = 0; - - if (!(x & SX_LOCK_SHARED) || - !atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) - error = _sx_slock_hard(sx, opts, file, line); - else - LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, - 0, 0, file, line, LOCKSTAT_READER); - - return (error); -} - -/* - * Release a shared lock. We can just drop a single shared lock so - * long as we aren't trying to drop the last shared lock when other - * threads are waiting for an exclusive lock. This takes advantage of - * the fact that an unlocked lock is encoded as a shared lock with a - * count of 0. - */ -static __inline void -__sx_sunlock(struct sx *sx, const char *file, int line) -{ - uintptr_t x = sx->sx_lock; - - LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_READER); - if (x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS) || - !atomic_cmpset_rel_ptr(&sx->sx_lock, x, x - SX_ONE_SHARER)) - _sx_sunlock_hard(sx, file, line); -} - /* * Public interface for lock operations. */ @@ -227,12 +190,6 @@ __sx_sunlock(struct sx *sx, const char *file, int line) _sx_xlock((sx), SX_INTERRUPTIBLE, (file), (line)) #define sx_xunlock_(sx, file, line) \ _sx_xunlock((sx), (file), (line)) -#define sx_slock_(sx, file, line) \ - (void)_sx_slock((sx), 0, (file), (line)) -#define sx_slock_sig_(sx, file, line) \ - _sx_slock((sx), SX_INTERRUPTIBLE, (file) , (line)) -#define sx_sunlock_(sx, file, line) \ - _sx_sunlock((sx), (file), (line)) #else #define sx_xlock_(sx, file, line) \ (void)__sx_xlock((sx), curthread, 0, (file), (line)) @@ -240,13 +197,13 @@ __sx_sunlock(struct sx *sx, const char *file, int line) __sx_xlock((sx), curthread, SX_INTERRUPTIBLE, (file), (line)) #define sx_xunlock_(sx, file, line) \ __sx_xunlock((sx), curthread, (file), (line)) +#endif /* LOCK_DEBUG > 0 || SX_NOINLINE */ #define sx_slock_(sx, file, line) \ - (void)__sx_slock((sx), 0, (file), (line)) + (void)_sx_slock((sx), 0, (file), (line)) #define sx_slock_sig_(sx, file, line) \ - __sx_slock((sx), SX_INTERRUPTIBLE, (file), (line)) + _sx_slock((sx), SX_INTERRUPTIBLE, (file) , (line)) #define sx_sunlock_(sx, file, line) \ - __sx_sunlock((sx), (file), (line)) -#endif /* LOCK_DEBUG > 0 || SX_NOINLINE */ + _sx_sunlock((sx), (file), (line)) #define sx_try_slock(sx) sx_try_slock_((sx), LOCK_FILE, LOCK_LINE) #define sx_try_xlock(sx) sx_try_xlock_((sx), LOCK_FILE, LOCK_LINE) #define sx_try_upgrade(sx) sx_try_upgrade_((sx), LOCK_FILE, LOCK_LINE) |