summary refs log tree commit diff stats
path: root/libexec/rtld-elf/i386
diff options
context:
space:
mode:
author    jdp <jdp@FreeBSD.org>  2000-07-08 04:10:38 +0000
committer jdp <jdp@FreeBSD.org>  2000-07-08 04:10:38 +0000
commit    3fa5480ba30a5687028cb2783eb1ae21513d4b9c (patch)
tree      db0497618ec72d42ebbd228c98ab3cc747ac3aeb /libexec/rtld-elf/i386
parent    aa26657bfbbf4260f14797da8a632778e067951e (diff)
download  FreeBSD-src-3fa5480ba30a5687028cb2783eb1ae21513d4b9c.zip
          FreeBSD-src-3fa5480ba30a5687028cb2783eb1ae21513d4b9c.tar.gz
Solve the dynamic linker's problems with multithreaded programs once
and for all (I hope). Packages such as wine, JDK, and linuxthreads should no longer have any problems with re-entering the dynamic linker. This commit replaces the locking used in the dynamic linker with a new spinlock-based reader/writer lock implementation. Brian Fundakowski Feldman <green> argued for this from the very beginning, but it took me a long time to come around to his point of view. Spinlocks are the only kinds of locks that work with all thread packages. But on uniprocessor systems they can be inefficient, because while a contender for the lock is spinning the holder of the lock cannot make any progress toward releasing it. To alleviate this disadvantage I have borrowed a trick from Sleepycat's Berkeley DB implementation. When spinning for a lock, the requester does a nanosleep() call for 1 usec. each time around the loop. This will generally yield the CPU to other threads, allowing the lock holder to finish its business and release the lock. I chose 1 usec. as the minimum sleep which would with reasonable certainty not be rounded down to 0. The formerly machine-independent file "lockdflt.c" has been moved into the architecture-specific subdirectories by repository copy. It now contains the machine-dependent spinlocking code. For the spinlocks I used the very nifty "simple, non-scalable reader-preference lock" which I found at <http://www.cs.rochester.edu/u/scott/synchronization/pseudocode/rw.html> on all CPUs except the 80386 (the specific CPU model, not the architecture). The 80386 CPU doesn't support the necessary "cmpxchg" instruction, so on that CPU a simple exclusive test-and-set lock is used instead. 80386 CPUs are detected at initialization time by trying to execute "cmpxchg" and catching the resulting SIGILL signal. To reduce contention for the locks, I have revamped a couple of key data structures, permitting all common operations to be done under non-exclusive (reader) locking. 
The only operations that require exclusive locking now are the rare intrusive operations such as dlopen() and dlclose(). The dllockinit() interface is now deprecated. It still exists, but only as a do-nothing stub. I plan to remove it as soon as is reasonably possible. (From the very beginning it was clearly labeled as experimental and subject to change.) As far as I know, only the linuxthreads port uses dllockinit(). This interface turned out to have several problems. As one example, when the dynamic linker called a client-supplied locking function, that function sometimes needed lazy binding, causing re-entry into the dynamic linker and a big looping mess. And in any case, it turned out to be too burdensome to require threads packages to register themselves with the dynamic linker.
Diffstat (limited to 'libexec/rtld-elf/i386')
-rw-r--r--  libexec/rtld-elf/i386/lockdflt.c      | 242
-rw-r--r--  libexec/rtld-elf/i386/rtld_machdep.h  |  23
2 files changed, 225 insertions, 40 deletions
diff --git a/libexec/rtld-elf/i386/lockdflt.c b/libexec/rtld-elf/i386/lockdflt.c
index 4233b36..b2ca9a5 100644
--- a/libexec/rtld-elf/i386/lockdflt.c
+++ b/libexec/rtld-elf/i386/lockdflt.c
@@ -26,64 +26,228 @@
*/
/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
+ * Thread locking implementation for the dynamic linker.
+ *
+ * On 80486 and later CPUs we use the "simple, non-scalable
+ * reader-preference lock" from:
+ *
+ * J. M. Mellor-Crummey and M. L. Scott. "Scalable Reader-Writer
+ * Synchronization for Shared-Memory Multiprocessors." 3rd ACM Symp. on
+ * Principles and Practice of Parallel Programming, April 1991.
+ *
+ * In this algorithm the lock is a single word. Its low-order bit is
+ * set when a writer holds the lock. The remaining high-order bits
+ * contain a count of readers desiring the lock. The algorithm requires
+ * atomic "compare_and_store" and "add" operations.
+ *
+ * The "compare_and_store" operation requires the "cmpxchg" instruction
+ * on the x86. Unfortunately, the 80386 CPU does not support that
+ * instruction -- only the 80486 and later models support it. So on the
+ * 80386 we must use simple test-and-set exclusive locks instead. We
+ * determine which kind of lock to use by trying to execute a "cmpxchg"
+ * instruction and catching the SIGILL which results on the 80386.
+ *
+ * These are spinlocks. When spinning we call nanosleep() for 1
+ * microsecond each time around the loop. This will most likely yield
+ * the CPU to other threads (including, we hope, the lockholder) allowing
+ * them to make some progress.
*/
-#include <dlfcn.h>
+#include <setjmp.h>
#include <signal.h>
#include <stdlib.h>
+#include <time.h>
#include "debug.h"
#include "rtld.h"
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
+#define CACHE_LINE_SIZE 32
-void
-lockdflt_acquire(void *lock)
+#define WAFLAG 0x1 /* A writer holds the lock */
+#define RC_INCR 0x2 /* Adjusts count of readers desiring lock */
+
+typedef struct Struct_Lock {
+ volatile int lock;
+ void *base;
+} Lock;
+
+static const struct timespec usec = { 0, 1000 }; /* 1 usec. */
+
+static inline int
+cmpxchgl(int old, int new, volatile int *m)
{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
+ int result;
+
+ __asm __volatile ("lock; cmpxchgl %2, %0"
+ : "=m"(*m), "=a"(result)
+ : "r"(new), "0"(*m), "1"(old)
+ : "cc");
+
+ return result;
}
-void *
-lockdflt_create(void *context)
+static inline int
+xchgl(int v, volatile int *m)
{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
+ int result;
+
+ __asm __volatile ("xchgl %0, %1"
+ : "=r"(result), "=m"(*m)
+ : "0"(v), "1"(*m));
+
+ return result;
+}
+
+static void *
+lock_create(void *context)
+{
+ void *base;
+ char *p;
+ uintptr_t r;
+ Lock *l;
+
+ /*
+ * Arrange for the lock to occupy its own cache line. First, we
+ * optimistically allocate just a cache line, hoping that malloc
+ * will give us a well-aligned block of memory. If that doesn't
+ * work, we allocate a larger block and take a well-aligned cache
+ * line from it.
+ */
+ base = xmalloc(CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((uintptr_t)p % CACHE_LINE_SIZE != 0) {
+ free(base);
+ base = xmalloc(2 * CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((r = (uintptr_t)p % CACHE_LINE_SIZE) != 0)
+ p += CACHE_LINE_SIZE - r;
+ }
+ l = (Lock *)p;
+ l->base = base;
+ l->lock = 0;
return l;
}
-void
-lockdflt_destroy(void *lock)
+static void
+lock_destroy(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ free(l->base);
+}
+
+/*
+ * Crude exclusive locks for the 80386, which does not support the
+ * cmpxchg instruction.
+ */
+static void
+lock80386_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (xchgl(1, &l->lock) != 0)
+ while (l->lock != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+lock80386_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ l->lock = 0;
+}
+
+/*
+ * Better reader/writer locks for the 80486 and later CPUs.
+ */
+static void
+rlock_acquire(void *lock)
{
- LockDflt *l = (LockDflt *)lock;
- free(l);
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, RC_INCR);
+ while (l->lock & WAFLAG)
+ nanosleep(&usec, NULL);
+}
+
+static void
+wlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (cmpxchgl(0, WAFLAG, &l->lock) != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+rlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -RC_INCR);
+}
+
+static void
+wlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -WAFLAG);
+}
+
+/*
+ * Code to determine at runtime whether the CPU supports the cmpxchg
+ * instruction. This instruction allows us to use locks that are more
+ * efficient, but it didn't exist on the 80386.
+ */
+static jmp_buf sigill_env;
+
+static void
+sigill(int sig)
+{
+ longjmp(sigill_env, 1);
+}
+
+static int
+cpu_supports_cmpxchg(void)
+{
+ struct sigaction act, oact;
+ int result;
+ volatile int lock;
+
+ memset(&act, 0, sizeof act);
+ act.sa_handler = sigill;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = 0;
+
+ sigaction(SIGILL, &act, &oact);
+ if (setjmp(sigill_env) == 0) {
+ lock = 0;
+ cmpxchgl(0, 1, &lock);
+ result = 1;
+ } else
+ result = 0;
+ sigaction(SIGILL, &oact, NULL);
+ return result;
}
void
-lockdflt_release(void *lock)
+lockdflt_init(LockInfo *li)
{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
+ li->context = NULL;
+ li->context_destroy = NULL;
+ li->lock_create = lock_create;
+ li->lock_destroy = lock_destroy;
+ if (cpu_supports_cmpxchg()) {
+ /* Use fast locks that require an 80486 or later. */
+ li->rlock_acquire = rlock_acquire;
+ li->wlock_acquire = wlock_acquire;
+ li->rlock_release = rlock_release;
+ li->wlock_release = wlock_release;
+ } else {
+ /* It's a cruddy old 80386. */
+ li->rlock_acquire = li->wlock_acquire = lock80386_acquire;
+ li->rlock_release = li->wlock_release = lock80386_release;
+ }
}
diff --git a/libexec/rtld-elf/i386/rtld_machdep.h b/libexec/rtld-elf/i386/rtld_machdep.h
index b44129a..37a81d3 100644
--- a/libexec/rtld-elf/i386/rtld_machdep.h
+++ b/libexec/rtld-elf/i386/rtld_machdep.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1999 John D. Polstra.
+ * Copyright (c) 1999, 2000 John D. Polstra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,4 +41,25 @@
(*(Elf_Addr *)(where) = (Elf_Addr)(target)); \
} while (0)
+static inline void
+atomic_decr_int(volatile int *p)
+{
+ __asm __volatile ("lock; decl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_incr_int(volatile int *p)
+{
+ __asm __volatile ("lock; incl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_add_int(volatile int *p, int val)
+{
+ __asm __volatile ("lock; addl %1, %0"
+ : "=m"(*p)
+ : "ri"(val), "0"(*p)
+ : "cc");
+}
+
#endif
OpenPOWER on IntegriCloud