author     jdp <jdp@FreeBSD.org>    2000-07-08 04:10:38 +0000
committer  jdp <jdp@FreeBSD.org>    2000-07-08 04:10:38 +0000
commit     3fa5480ba30a5687028cb2783eb1ae21513d4b9c (patch)
tree       db0497618ec72d42ebbd228c98ab3cc747ac3aeb /libexec/rtld-elf
parent     aa26657bfbbf4260f14797da8a632778e067951e (diff)
Solve the dynamic linker's problems with multithreaded programs once
and for all (I hope). Packages such as wine, JDK, and linuxthreads
should no longer have any problems with re-entering the dynamic linker.

This commit replaces the locking used in the dynamic linker with a new
spinlock-based reader/writer lock implementation. Brian Fundakowski
Feldman <green> argued for this from the very beginning, but it took me
a long time to come around to his point of view.

Spinlocks are the only kinds of locks that work with all thread
packages. But on uniprocessor systems they can be inefficient, because
while a contender for the lock is spinning the holder of the lock cannot
make any progress toward releasing it. To alleviate this disadvantage I
have borrowed a trick from Sleepycat's Berkeley DB implementation. When
spinning for a lock, the requester does a nanosleep() call for 1 usec.
each time around the loop. This will generally yield the CPU to other
threads, allowing the lock holder to finish its business and release the
lock. I chose 1 usec. as the minimum sleep which would with reasonable
certainty not be rounded down to 0.

The formerly machine-independent file "lockdflt.c" has been moved into
the architecture-specific subdirectories by repository copy. It now
contains the machine-dependent spinlocking code.

For the spinlocks I used the very nifty "simple, non-scalable
reader-preference lock" which I found at
<http://www.cs.rochester.edu/u/scott/synchronization/pseudocode/rw.html>
on all CPUs except the 80386 (the specific CPU model, not the
architecture). The 80386 CPU doesn't support the necessary "cmpxchg"
instruction, so on that CPU a simple exclusive test-and-set lock is used
instead. 80386 CPUs are detected at initialization time by trying to
execute "cmpxchg" and catching the resulting SIGILL signal.

To reduce contention for the locks, I have revamped a couple of key data
structures, permitting all common operations to be done under
non-exclusive (reader) locking. The only operations that require
exclusive locking now are the rare intrusive operations such as dlopen()
and dlclose().

The dllockinit() interface is now deprecated. It still exists, but only
as a do-nothing stub. I plan to remove it as soon as is reasonably
possible. (From the very beginning it was clearly labeled as
experimental and subject to change.) As far as I know, only the
linuxthreads port uses dllockinit(). This interface turned out to have
several problems. As one example, when the dynamic linker called a
client-supplied locking function, that function sometimes needed lazy
binding, causing re-entry into the dynamic linker and a big looping
mess. And in any case, it turned out to be too burdensome to require
threads packages to register themselves with the dynamic linker.
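
[Editor's note: before the per-architecture diffs below, here is a minimal
sketch of the reader/writer spinlock scheme the commit message describes,
using the same word layout as the new lockdflt.c files (low-order bit =
writer flag, upper bits = count of readers desiring the lock). It is
illustrative only, not the committed code: the real implementation supplies
its own per-architecture cmpxchg / ldl_l-stl_c primitives, whereas this
sketch substitutes C11 <stdatomic.h>, and the rwspin_* names are invented
for the example.]

#include <stdatomic.h>
#include <time.h>

#define WAFLAG  0x1             /* a writer holds the lock */
#define RC_INCR 0x2             /* one unit in the reader count */

typedef struct {
    atomic_int lock;            /* writer flag + reader count in one word */
} rwspin_t;

static const struct timespec usec = { 0, 1000 };    /* 1 usec */

static void
rwspin_rlock(rwspin_t *l)
{
    /* Announce the reader first, then wait for any writer to drain. */
    atomic_fetch_add(&l->lock, RC_INCR);
    while (atomic_load(&l->lock) & WAFLAG)
        nanosleep(&usec, NULL);         /* yield the CPU while spinning */
}

static void
rwspin_runlock(rwspin_t *l)
{
    atomic_fetch_sub(&l->lock, RC_INCR);
}

static void
rwspin_wlock(rwspin_t *l)
{
    int expected;

    /* Succeeds only when no reader or writer is present (word == 0). */
    for (;;) {
        expected = 0;
        if (atomic_compare_exchange_strong(&l->lock, &expected, WAFLAG))
            break;
        nanosleep(&usec, NULL);
    }
}

static void
rwspin_wunlock(rwspin_t *l)
{
    atomic_fetch_sub(&l->lock, WAFLAG);
}

int
main(void)
{
    rwspin_t lk = { 0 };

    rwspin_rlock(&lk);          /* many readers may hold the lock at once */
    rwspin_runlock(&lk);
    rwspin_wlock(&lk);          /* a writer excludes everyone else */
    rwspin_wunlock(&lk);
    return 0;
}

Note that the write-side release subtracts WAFLAG rather than storing 0,
so any reader counts that accumulated while the writer held the lock are
preserved; this matches the wlock_release() functions in the diff.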
Diffstat (limited to 'libexec/rtld-elf')
-rw-r--r--  libexec/rtld-elf/alpha/lockdflt.c      151
-rw-r--r--  libexec/rtld-elf/alpha/rtld_machdep.h    8
-rw-r--r--  libexec/rtld-elf/alpha/rtld_start.S     53
-rw-r--r--  libexec/rtld-elf/amd64/lockdflt.c      242
-rw-r--r--  libexec/rtld-elf/amd64/rtld_machdep.h   23
-rw-r--r--  libexec/rtld-elf/i386/lockdflt.c       242
-rw-r--r--  libexec/rtld-elf/i386/rtld_machdep.h    23
-rw-r--r--  libexec/rtld-elf/lockdflt.c             89
-rw-r--r--  libexec/rtld-elf/rtld.c                250
-rw-r--r--  libexec/rtld-elf/rtld.h                 23
10 files changed, 770 insertions(+), 334 deletions(-)
diff --git a/libexec/rtld-elf/alpha/lockdflt.c b/libexec/rtld-elf/alpha/lockdflt.c
index 4233b36..65900a6 100644
--- a/libexec/rtld-elf/alpha/lockdflt.c
+++ b/libexec/rtld-elf/alpha/lockdflt.c
@@ -26,64 +26,133 @@
*/
/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
+ * Thread locking implementation for the dynamic linker.
+ *
+ * We use the "simple, non-scalable reader-preference lock" from:
+ *
+ * J. M. Mellor-Crummey and M. L. Scott. "Scalable Reader-Writer
+ * Synchronization for Shared-Memory Multiprocessors." 3rd ACM Symp. on
+ * Principles and Practice of Parallel Programming, April 1991.
+ *
+ * In this algorithm the lock is a single word. Its low-order bit is
+ * set when a writer holds the lock. The remaining high-order bits
+ * contain a count of readers desiring the lock. The algorithm requires
+ * atomic "compare_and_store" and "add" operations, which we implement
+ * using assembly language sequences in "rtld_start.S".
+ *
+ * These are spinlocks. When spinning we call nanosleep() for 1
+ * microsecond each time around the loop. This will most likely yield
+ * the CPU to other threads (including, we hope, the lockholder) allowing
+ * them to make some progress.
*/
-#include <dlfcn.h>
-#include <signal.h>
#include <stdlib.h>
+#include <time.h>
#include "debug.h"
#include "rtld.h"
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
+/*
+ * This value of CACHE_LINE_SIZE is conservative. The actual size
+ * is 32 on the 21064, 21064A, 21066, 21066A, and 21164. It is 64
+ * on the 21264. Compaq recommends sequestering each lock in its own
+ * 128-byte block to allow for future implementations with larger
+ * cache lines.
+ */
+#define CACHE_LINE_SIZE 128
-void
-lockdflt_acquire(void *lock)
+#define WAFLAG 0x1 /* A writer holds the lock */
+#define RC_INCR 0x2 /* Adjusts count of readers desiring lock */
+
+typedef struct Struct_Lock {
+ volatile int lock;
+ void *base;
+} Lock;
+
+static const struct timespec usec = { 0, 1000 }; /* 1 usec. */
+
+static void *
+lock_create(void *context)
{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
+ void *base;
+ char *p;
+ uintptr_t r;
+ Lock *l;
+
+ /*
+ * Arrange for the lock to occupy its own cache line. First, we
+ * optimistically allocate just a cache line, hoping that malloc
+ * will give us a well-aligned block of memory. If that doesn't
+ * work, we allocate a larger block and take a well-aligned cache
+ * line from it.
+ */
+ base = xmalloc(CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((uintptr_t)p % CACHE_LINE_SIZE != 0) {
+ free(base);
+ base = xmalloc(2 * CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((r = (uintptr_t)p % CACHE_LINE_SIZE) != 0)
+ p += CACHE_LINE_SIZE - r;
+ }
+ l = (Lock *)p;
+ l->base = base;
+ l->lock = 0;
+ return l;
}
-void *
-lockdflt_create(void *context)
+static void
+lock_destroy(void *lock)
{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
- return l;
+ Lock *l = (Lock *)lock;
+
+ free(l->base);
}
-void
-lockdflt_destroy(void *lock)
+static void
+rlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, RC_INCR);
+ while (l->lock & WAFLAG)
+ nanosleep(&usec, NULL);
+}
+
+static void
+wlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (cmp0_and_store_int(&l->lock, WAFLAG) != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+rlock_release(void *lock)
{
- LockDflt *l = (LockDflt *)lock;
- free(l);
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -RC_INCR);
+}
+
+static void
+wlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -WAFLAG);
}
void
-lockdflt_release(void *lock)
+lockdflt_init(LockInfo *li)
{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
+ li->context = NULL;
+ li->lock_create = lock_create;
+ li->rlock_acquire = rlock_acquire;
+ li->wlock_acquire = wlock_acquire;
+ li->rlock_release = rlock_release;
+ li->wlock_release = wlock_release;
+ li->lock_destroy = lock_destroy;
+ li->context_destroy = NULL;
}
diff --git a/libexec/rtld-elf/alpha/rtld_machdep.h b/libexec/rtld-elf/alpha/rtld_machdep.h
index 11927d6..13921c4 100644
--- a/libexec/rtld-elf/alpha/rtld_machdep.h
+++ b/libexec/rtld-elf/alpha/rtld_machdep.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1999 John D. Polstra.
+ * Copyright (c) 1999, 2000 John D. Polstra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -34,4 +34,10 @@
void reloc_jmpslot(Elf_Addr *, Elf_Addr);
+/* Atomic operations. */
+int cmp0_and_store_int(volatile int *, int);
+void atomic_add_int(volatile int *, int);
+void atomic_incr_int(volatile int *);
+void atomic_decr_int(volatile int *);
+
#endif
diff --git a/libexec/rtld-elf/alpha/rtld_start.S b/libexec/rtld-elf/alpha/rtld_start.S
index d7ec0d3..29d6178 100644
--- a/libexec/rtld-elf/alpha/rtld_start.S
+++ b/libexec/rtld-elf/alpha/rtld_start.S
@@ -3,6 +3,7 @@
/*
* Copyright 1996 Matt Thomas <matt@3am-software.com>
+ * Copyright 2000 John D. Polstra
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -165,7 +166,53 @@ $100: ldgp gp, 0(gp)
lda sp, 168(sp)
jmp $31, ($27)
.end _rtld_bind_start
-
-
-
+/*
+ * int cmp0_and_store_int(volatile int *p, int newval);
+ *
+ * If an int holds 0, store newval into it; else do nothing. Returns
+ * the previous value.
+ */
+LEAF(cmp0_and_store_int, 2)
+1: mov a1, t0
+ ldl_l v0, 0(a0)
+ bne v0, 3f
+ stl_c t0, 0(a0)
+ beq t0, 2f
+ mb
+ RET
+2: br 1b
+3: RET
+END(cmp0_and_store_int)
+
+LEAF(atomic_add_int, 2)
+0: ldl_l t0, 0(a0)
+ addq t0, a1, t0
+ stl_c t0, 0(a0)
+ beq t0, 1f
+ mb
+ RET
+1: br 0b
+END(atomic_add_int)
+
+/* Atomically increment an int. */
+LEAF(atomic_incr_int, 1)
+0: ldl_l t0, 0(a0)
+ addq t0, 1, t0
+ stl_c t0, 0(a0)
+ beq t0, 1f
+ mb
+ RET
+1: br 0b
+END(atomic_incr_int)
+
+/* Atomically decrement an int. */
+LEAF(atomic_decr_int, 1)
+0: ldl_l t0, 0(a0)
+ subq t0, 1, t0
+ stl_c t0, 0(a0)
+ beq t0, 1f
+ mb
+ RET
+1: br 0b
+END(atomic_decr_int)
diff --git a/libexec/rtld-elf/amd64/lockdflt.c b/libexec/rtld-elf/amd64/lockdflt.c
index 4233b36..b2ca9a5 100644
--- a/libexec/rtld-elf/amd64/lockdflt.c
+++ b/libexec/rtld-elf/amd64/lockdflt.c
@@ -26,64 +26,228 @@
*/
/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
+ * Thread locking implementation for the dynamic linker.
+ *
+ * On 80486 and later CPUs we use the "simple, non-scalable
+ * reader-preference lock" from:
+ *
+ * J. M. Mellor-Crummey and M. L. Scott. "Scalable Reader-Writer
+ * Synchronization for Shared-Memory Multiprocessors." 3rd ACM Symp. on
+ * Principles and Practice of Parallel Programming, April 1991.
+ *
+ * In this algorithm the lock is a single word. Its low-order bit is
+ * set when a writer holds the lock. The remaining high-order bits
+ * contain a count of readers desiring the lock. The algorithm requires
+ * atomic "compare_and_store" and "add" operations.
+ *
+ * The "compare_and_store" operation requires the "cmpxchg" instruction
+ * on the x86. Unfortunately, the 80386 CPU does not support that
+ * instruction -- only the 80486 and later models support it. So on the
+ * 80386 we must use simple test-and-set exclusive locks instead. We
+ * determine which kind of lock to use by trying to execute a "cmpxchg"
+ * instruction and catching the SIGILL which results on the 80386.
+ *
+ * These are spinlocks. When spinning we call nanosleep() for 1
+ * microsecond each time around the loop. This will most likely yield
+ * the CPU to other threads (including, we hope, the lockholder) allowing
+ * them to make some progress.
*/
-#include <dlfcn.h>
+#include <setjmp.h>
#include <signal.h>
#include <stdlib.h>
+#include <time.h>
#include "debug.h"
#include "rtld.h"
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
+#define CACHE_LINE_SIZE 32
-void
-lockdflt_acquire(void *lock)
+#define WAFLAG 0x1 /* A writer holds the lock */
+#define RC_INCR 0x2 /* Adjusts count of readers desiring lock */
+
+typedef struct Struct_Lock {
+ volatile int lock;
+ void *base;
+} Lock;
+
+static const struct timespec usec = { 0, 1000 }; /* 1 usec. */
+
+static inline int
+cmpxchgl(int old, int new, volatile int *m)
{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
+ int result;
+
+ __asm __volatile ("lock; cmpxchgl %2, %0"
+ : "=m"(*m), "=a"(result)
+ : "r"(new), "0"(*m), "1"(old)
+ : "cc");
+
+ return result;
}
-void *
-lockdflt_create(void *context)
+static inline int
+xchgl(int v, volatile int *m)
{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
+ int result;
+
+ __asm __volatile ("xchgl %0, %1"
+ : "=r"(result), "=m"(*m)
+ : "0"(v), "1"(*m));
+
+ return result;
+}
+
+static void *
+lock_create(void *context)
+{
+ void *base;
+ char *p;
+ uintptr_t r;
+ Lock *l;
+
+ /*
+ * Arrange for the lock to occupy its own cache line. First, we
+ * optimistically allocate just a cache line, hoping that malloc
+ * will give us a well-aligned block of memory. If that doesn't
+ * work, we allocate a larger block and take a well-aligned cache
+ * line from it.
+ */
+ base = xmalloc(CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((uintptr_t)p % CACHE_LINE_SIZE != 0) {
+ free(base);
+ base = xmalloc(2 * CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((r = (uintptr_t)p % CACHE_LINE_SIZE) != 0)
+ p += CACHE_LINE_SIZE - r;
+ }
+ l = (Lock *)p;
+ l->base = base;
+ l->lock = 0;
return l;
}
-void
-lockdflt_destroy(void *lock)
+static void
+lock_destroy(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ free(l->base);
+}
+
+/*
+ * Crude exclusive locks for the 80386, which does not support the
+ * cmpxchg instruction.
+ */
+static void
+lock80386_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (xchgl(1, &l->lock) != 0)
+ while (l->lock != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+lock80386_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ l->lock = 0;
+}
+
+/*
+ * Better reader/writer locks for the 80486 and later CPUs.
+ */
+static void
+rlock_acquire(void *lock)
{
- LockDflt *l = (LockDflt *)lock;
- free(l);
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, RC_INCR);
+ while (l->lock & WAFLAG)
+ nanosleep(&usec, NULL);
+}
+
+static void
+wlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (cmpxchgl(0, WAFLAG, &l->lock) != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+rlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -RC_INCR);
+}
+
+static void
+wlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -WAFLAG);
+}
+
+/*
+ * Code to determine at runtime whether the CPU supports the cmpxchg
+ * instruction. This instruction allows us to use locks that are more
+ * efficient, but it didn't exist on the 80386.
+ */
+static jmp_buf sigill_env;
+
+static void
+sigill(int sig)
+{
+ longjmp(sigill_env, 1);
+}
+
+static int
+cpu_supports_cmpxchg(void)
+{
+ struct sigaction act, oact;
+ int result;
+ volatile int lock;
+
+ memset(&act, 0, sizeof act);
+ act.sa_handler = sigill;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = 0;
+
+ sigaction(SIGILL, &act, &oact);
+ if (setjmp(sigill_env) == 0) {
+ lock = 0;
+ cmpxchgl(0, 1, &lock);
+ result = 1;
+ } else
+ result = 0;
+ sigaction(SIGILL, &oact, NULL);
+ return result;
}
void
-lockdflt_release(void *lock)
+lockdflt_init(LockInfo *li)
{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
+ li->context = NULL;
+ li->context_destroy = NULL;
+ li->lock_create = lock_create;
+ li->lock_destroy = lock_destroy;
+ if (cpu_supports_cmpxchg()) {
+ /* Use fast locks that require an 80486 or later. */
+ li->rlock_acquire = rlock_acquire;
+ li->wlock_acquire = wlock_acquire;
+ li->rlock_release = rlock_release;
+ li->wlock_release = wlock_release;
+ } else {
+ /* It's a cruddy old 80386. */
+ li->rlock_acquire = li->wlock_acquire = lock80386_acquire;
+ li->rlock_release = li->wlock_release = lock80386_release;
+ }
}
diff --git a/libexec/rtld-elf/amd64/rtld_machdep.h b/libexec/rtld-elf/amd64/rtld_machdep.h
index b44129a..37a81d3 100644
--- a/libexec/rtld-elf/amd64/rtld_machdep.h
+++ b/libexec/rtld-elf/amd64/rtld_machdep.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1999 John D. Polstra.
+ * Copyright (c) 1999, 2000 John D. Polstra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,4 +41,25 @@
(*(Elf_Addr *)(where) = (Elf_Addr)(target)); \
} while (0)
+static inline void
+atomic_decr_int(volatile int *p)
+{
+ __asm __volatile ("lock; decl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_incr_int(volatile int *p)
+{
+ __asm __volatile ("lock; incl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_add_int(volatile int *p, int val)
+{
+ __asm __volatile ("lock; addl %1, %0"
+ : "=m"(*p)
+ : "ri"(val), "0"(*p)
+ : "cc");
+}
+
#endif
diff --git a/libexec/rtld-elf/i386/lockdflt.c b/libexec/rtld-elf/i386/lockdflt.c
index 4233b36..b2ca9a5 100644
--- a/libexec/rtld-elf/i386/lockdflt.c
+++ b/libexec/rtld-elf/i386/lockdflt.c
@@ -26,64 +26,228 @@
*/
/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
+ * Thread locking implementation for the dynamic linker.
+ *
+ * On 80486 and later CPUs we use the "simple, non-scalable
+ * reader-preference lock" from:
+ *
+ * J. M. Mellor-Crummey and M. L. Scott. "Scalable Reader-Writer
+ * Synchronization for Shared-Memory Multiprocessors." 3rd ACM Symp. on
+ * Principles and Practice of Parallel Programming, April 1991.
+ *
+ * In this algorithm the lock is a single word. Its low-order bit is
+ * set when a writer holds the lock. The remaining high-order bits
+ * contain a count of readers desiring the lock. The algorithm requires
+ * atomic "compare_and_store" and "add" operations.
+ *
+ * The "compare_and_store" operation requires the "cmpxchg" instruction
+ * on the x86. Unfortunately, the 80386 CPU does not support that
+ * instruction -- only the 80486 and later models support it. So on the
+ * 80386 we must use simple test-and-set exclusive locks instead. We
+ * determine which kind of lock to use by trying to execute a "cmpxchg"
+ * instruction and catching the SIGILL which results on the 80386.
+ *
+ * These are spinlocks. When spinning we call nanosleep() for 1
+ * microsecond each time around the loop. This will most likely yield
+ * the CPU to other threads (including, we hope, the lockholder) allowing
+ * them to make some progress.
*/
-#include <dlfcn.h>
+#include <setjmp.h>
#include <signal.h>
#include <stdlib.h>
+#include <time.h>
#include "debug.h"
#include "rtld.h"
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
+#define CACHE_LINE_SIZE 32
-void
-lockdflt_acquire(void *lock)
+#define WAFLAG 0x1 /* A writer holds the lock */
+#define RC_INCR 0x2 /* Adjusts count of readers desiring lock */
+
+typedef struct Struct_Lock {
+ volatile int lock;
+ void *base;
+} Lock;
+
+static const struct timespec usec = { 0, 1000 }; /* 1 usec. */
+
+static inline int
+cmpxchgl(int old, int new, volatile int *m)
{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
+ int result;
+
+ __asm __volatile ("lock; cmpxchgl %2, %0"
+ : "=m"(*m), "=a"(result)
+ : "r"(new), "0"(*m), "1"(old)
+ : "cc");
+
+ return result;
}
-void *
-lockdflt_create(void *context)
+static inline int
+xchgl(int v, volatile int *m)
{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
+ int result;
+
+ __asm __volatile ("xchgl %0, %1"
+ : "=r"(result), "=m"(*m)
+ : "0"(v), "1"(*m));
+
+ return result;
+}
+
+static void *
+lock_create(void *context)
+{
+ void *base;
+ char *p;
+ uintptr_t r;
+ Lock *l;
+
+ /*
+ * Arrange for the lock to occupy its own cache line. First, we
+ * optimistically allocate just a cache line, hoping that malloc
+ * will give us a well-aligned block of memory. If that doesn't
+ * work, we allocate a larger block and take a well-aligned cache
+ * line from it.
+ */
+ base = xmalloc(CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((uintptr_t)p % CACHE_LINE_SIZE != 0) {
+ free(base);
+ base = xmalloc(2 * CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((r = (uintptr_t)p % CACHE_LINE_SIZE) != 0)
+ p += CACHE_LINE_SIZE - r;
+ }
+ l = (Lock *)p;
+ l->base = base;
+ l->lock = 0;
return l;
}
-void
-lockdflt_destroy(void *lock)
+static void
+lock_destroy(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ free(l->base);
+}
+
+/*
+ * Crude exclusive locks for the 80386, which does not support the
+ * cmpxchg instruction.
+ */
+static void
+lock80386_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (xchgl(1, &l->lock) != 0)
+ while (l->lock != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+lock80386_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ l->lock = 0;
+}
+
+/*
+ * Better reader/writer locks for the 80486 and later CPUs.
+ */
+static void
+rlock_acquire(void *lock)
{
- LockDflt *l = (LockDflt *)lock;
- free(l);
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, RC_INCR);
+ while (l->lock & WAFLAG)
+ nanosleep(&usec, NULL);
+}
+
+static void
+wlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (cmpxchgl(0, WAFLAG, &l->lock) != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+rlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -RC_INCR);
+}
+
+static void
+wlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -WAFLAG);
+}
+
+/*
+ * Code to determine at runtime whether the CPU supports the cmpxchg
+ * instruction. This instruction allows us to use locks that are more
+ * efficient, but it didn't exist on the 80386.
+ */
+static jmp_buf sigill_env;
+
+static void
+sigill(int sig)
+{
+ longjmp(sigill_env, 1);
+}
+
+static int
+cpu_supports_cmpxchg(void)
+{
+ struct sigaction act, oact;
+ int result;
+ volatile int lock;
+
+ memset(&act, 0, sizeof act);
+ act.sa_handler = sigill;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = 0;
+
+ sigaction(SIGILL, &act, &oact);
+ if (setjmp(sigill_env) == 0) {
+ lock = 0;
+ cmpxchgl(0, 1, &lock);
+ result = 1;
+ } else
+ result = 0;
+ sigaction(SIGILL, &oact, NULL);
+ return result;
}
void
-lockdflt_release(void *lock)
+lockdflt_init(LockInfo *li)
{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
+ li->context = NULL;
+ li->context_destroy = NULL;
+ li->lock_create = lock_create;
+ li->lock_destroy = lock_destroy;
+ if (cpu_supports_cmpxchg()) {
+ /* Use fast locks that require an 80486 or later. */
+ li->rlock_acquire = rlock_acquire;
+ li->wlock_acquire = wlock_acquire;
+ li->rlock_release = rlock_release;
+ li->wlock_release = wlock_release;
+ } else {
+ /* It's a cruddy old 80386. */
+ li->rlock_acquire = li->wlock_acquire = lock80386_acquire;
+ li->rlock_release = li->wlock_release = lock80386_release;
+ }
}
diff --git a/libexec/rtld-elf/i386/rtld_machdep.h b/libexec/rtld-elf/i386/rtld_machdep.h
index b44129a..37a81d3 100644
--- a/libexec/rtld-elf/i386/rtld_machdep.h
+++ b/libexec/rtld-elf/i386/rtld_machdep.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1999 John D. Polstra.
+ * Copyright (c) 1999, 2000 John D. Polstra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,4 +41,25 @@
(*(Elf_Addr *)(where) = (Elf_Addr)(target)); \
} while (0)
+static inline void
+atomic_decr_int(volatile int *p)
+{
+ __asm __volatile ("lock; decl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_incr_int(volatile int *p)
+{
+ __asm __volatile ("lock; incl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_add_int(volatile int *p, int val)
+{
+ __asm __volatile ("lock; addl %1, %0"
+ : "=m"(*p)
+ : "ri"(val), "0"(*p)
+ : "cc");
+}
+
#endif
diff --git a/libexec/rtld-elf/lockdflt.c b/libexec/rtld-elf/lockdflt.c
deleted file mode 100644
index 4233b36..0000000
--- a/libexec/rtld-elf/lockdflt.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*-
- * Copyright 1999, 2000 John D. Polstra.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
- */
-
-#include <dlfcn.h>
-#include <signal.h>
-#include <stdlib.h>
-
-#include "debug.h"
-#include "rtld.h"
-
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
-
-void
-lockdflt_acquire(void *lock)
-{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
-}
-
-void *
-lockdflt_create(void *context)
-{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
- return l;
-}
-
-void
-lockdflt_destroy(void *lock)
-{
- LockDflt *l = (LockDflt *)lock;
- free(l);
-}
-
-void
-lockdflt_release(void *lock)
-{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
-}
diff --git a/libexec/rtld-elf/rtld.c b/libexec/rtld-elf/rtld.c
index 146b9b2..6a1ccf3 100644
--- a/libexec/rtld-elf/rtld.c
+++ b/libexec/rtld-elf/rtld.c
@@ -58,16 +58,15 @@
/* Types. */
typedef void (*func_ptr_type)();
-typedef struct Struct_LockInfo {
- void *context; /* Client context for creating locks */
- void *thelock; /* The one big lock */
- /* Methods */
- void (*rlock_acquire)(void *lock);
- void (*wlock_acquire)(void *lock);
- void (*lock_release)(void *lock);
- void (*lock_destroy)(void *lock);
- void (*context_destroy)(void *context);
-} LockInfo;
+/*
+ * This structure provides a reentrant way to keep a list of objects and
+ * check which ones have already been processed in some way.
+ */
+typedef struct Struct_DoneList {
+ Obj_Entry **objs; /* Array of object pointers */
+ unsigned int num_alloc; /* Allocated size of the array */
+ unsigned int num_used; /* Number of array slots used */
+} DoneList;
/*
* Function declarations.
@@ -77,6 +76,7 @@ static void die(void);
static void digest_dynamic(Obj_Entry *);
static Obj_Entry *digest_phdr(const Elf_Phdr *, int, caddr_t, const char *);
static Obj_Entry *dlcheck(void *);
+static bool donelist_check(DoneList *, Obj_Entry *);
static char *find_library(const char *, const Obj_Entry *);
static void funclist_call(Funclist *);
static void funclist_clear(Funclist *);
@@ -85,7 +85,7 @@ static void funclist_push_head(Funclist *, InitFunc);
static void funclist_push_tail(Funclist *, InitFunc);
static const char *gethints(void);
static void init_dag(Obj_Entry *);
-static void init_dag1(Obj_Entry *root, Obj_Entry *obj);
+static void init_dag1(Obj_Entry *root, Obj_Entry *obj, DoneList *);
static void init_rtld(caddr_t);
static bool is_exported(const Elf_Sym *);
static void linkmap_add(Obj_Entry *);
@@ -93,18 +93,17 @@ static void linkmap_delete(Obj_Entry *);
static int load_needed_objects(Obj_Entry *);
static int load_preload_objects(void);
static Obj_Entry *load_object(char *);
-static void lock_nop(void *);
+static void lock_check(void);
static Obj_Entry *obj_from_addr(const void *);
static void objlist_add(Objlist *, Obj_Entry *);
static Objlist_Entry *objlist_find(Objlist *, const Obj_Entry *);
static void objlist_remove(Objlist *, Obj_Entry *);
-static void prebind(void *);
static int relocate_objects(Obj_Entry *, bool);
static void rtld_exit(void);
static char *search_library_path(const char *, const char *);
static void set_program_var(const char *, const void *);
static const Elf_Sym *symlook_list(const char *, unsigned long,
- Objlist *, const Obj_Entry **, bool in_plt);
+ Objlist *, const Obj_Entry **, bool in_plt, DoneList *);
static void trace_loaded_objects(Obj_Entry *obj);
static void unload_object(Obj_Entry *);
static void unref_dag(Obj_Entry *);
@@ -128,7 +127,7 @@ static Obj_Entry *obj_list; /* Head of linked list of shared objects */
static Obj_Entry **obj_tail; /* Link field of last object in list */
static Obj_Entry *obj_main; /* The main program shared object */
static Obj_Entry obj_rtld; /* The dynamic linker shared object */
-static unsigned long curmark; /* Current mark value */
+static unsigned int obj_count; /* Number of objects in obj_list */
static Objlist list_global = /* Objects dlopened with RTLD_GLOBAL */
STAILQ_HEAD_INITIALIZER(list_global);
@@ -167,22 +166,45 @@ static func_ptr_type exports[] = {
char *__progname;
char **environ;
+/*
+ * Fill in a DoneList with an allocation large enough to hold all of
+ * the currently-loaded objects. Keep this as a macro since it calls
+ * alloca and we want that to occur within the scope of the caller.
+ */
+#define donelist_init(dlp) \
+ ((dlp)->objs = alloca(obj_count * sizeof (dlp)->objs[0]), \
+ assert((dlp)->objs != NULL), \
+ (dlp)->num_alloc = obj_count, \
+ (dlp)->num_used = 0)
+
static __inline void
rlock_acquire(void)
{
lockinfo.rlock_acquire(lockinfo.thelock);
+ atomic_incr_int(&lockinfo.rcount);
+ lock_check();
}
static __inline void
wlock_acquire(void)
{
lockinfo.wlock_acquire(lockinfo.thelock);
+ atomic_incr_int(&lockinfo.wcount);
+ lock_check();
}
static __inline void
-lock_release(void)
+rlock_release(void)
{
- lockinfo.lock_release(lockinfo.thelock);
+ atomic_decr_int(&lockinfo.rcount);
+ lockinfo.rlock_release(lockinfo.thelock);
+}
+
+static __inline void
+wlock_release(void)
+{
+ atomic_decr_int(&lockinfo.wcount);
+ lockinfo.wlock_release(lockinfo.thelock);
}
/*
@@ -316,6 +338,7 @@ _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp)
/* Link the main program into the list of objects. */
*obj_tail = obj_main;
obj_tail = &obj_main->next;
+ obj_count++;
obj_main->refcount++;
/* Initialize a fake symbol for resolving undefined weak references. */
@@ -358,15 +381,16 @@ _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp)
set_program_var("__progname", argv[0] != NULL ? basename(argv[0]) : "");
set_program_var("environ", env);
- dbg("initializing default locks");
- dllockinit(NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+ dbg("initializing thread locks");
+ lockdflt_init(&lockinfo);
+ lockinfo.thelock = lockinfo.lock_create(lockinfo.context);
r_debug_state(); /* say hello to gdb! */
funclist_call(&initlist);
wlock_acquire();
funclist_clear(&initlist);
- lock_release();
+ wlock_release();
dbg("transferring control to program entry point = %p", obj_main->entry);
@@ -385,7 +409,7 @@ _rtld_bind(Obj_Entry *obj, Elf_Word reloff)
Elf_Addr *where;
Elf_Addr target;
- wlock_acquire();
+ rlock_acquire();
if (obj->pltrel)
rel = (const Elf_Rel *) ((caddr_t) obj->pltrel + reloff);
else
@@ -403,7 +427,7 @@ _rtld_bind(Obj_Entry *obj, Elf_Word reloff)
(void *)target, basename(defobj->path));
reloc_jmpslot(where, target);
- lock_release();
+ rlock_release();
return target;
}
@@ -671,6 +695,29 @@ dlcheck(void *handle)
}
/*
+ * If the given object is already in the donelist, return true. Otherwise
+ * add the object to the list and return false.
+ */
+static bool
+donelist_check(DoneList *dlp, Obj_Entry *obj)
+{
+ unsigned int i;
+
+ for (i = 0; i < dlp->num_used; i++)
+ if (dlp->objs[i] == obj)
+ return true;
+ /*
+ * Our donelist allocation should always be sufficient. But if
+ * our threads locking isn't working properly, more shared objects
+ * could have been loaded since we allocated the list. That should
+ * never happen, but we'll handle it properly just in case it does.
+ */
+ if (dlp->num_used < dlp->num_alloc)
+ dlp->objs[dlp->num_used++] = obj;
+ return false;
+}
+
+/*
* Hash function for symbol table lookup. Don't even think about changing
* this. It is specified by the System V ABI.
*/
@@ -741,6 +788,7 @@ const Elf_Sym *
find_symdef(unsigned long symnum, Obj_Entry *refobj,
const Obj_Entry **defobj_out, bool in_plt)
{
+ DoneList donelist;
const Elf_Sym *ref;
const Elf_Sym *def;
const Elf_Sym *symp;
@@ -755,11 +803,11 @@ find_symdef(unsigned long symnum, Obj_Entry *refobj,
hash = elf_hash(name);
def = NULL;
defobj = NULL;
- curmark++;
+ donelist_init(&donelist);
- if (refobj->symbolic) { /* Look first in the referencing object */
+ /* Look first in the referencing object if linked symbolically. */
+ if (refobj->symbolic && !donelist_check(&donelist, refobj)) {
symp = symlook_obj(name, hash, refobj, in_plt);
- refobj->mark = curmark;
if (symp != NULL) {
def = symp;
defobj = refobj;
@@ -768,7 +816,7 @@ find_symdef(unsigned long symnum, Obj_Entry *refobj,
/* Search all objects loaded at program start up. */
if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
- symp = symlook_list(name, hash, &list_main, &obj, in_plt);
+ symp = symlook_list(name, hash, &list_main, &obj, in_plt, &donelist);
if (symp != NULL &&
(def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
def = symp;
@@ -780,7 +828,8 @@ find_symdef(unsigned long symnum, Obj_Entry *refobj,
STAILQ_FOREACH(elm, &refobj->dldags, link) {
if (def != NULL && ELF_ST_BIND(def->st_info) != STB_WEAK)
break;
- symp = symlook_list(name, hash, &elm->obj->dagmembers, &obj, in_plt);
+ symp = symlook_list(name, hash, &elm->obj->dagmembers, &obj, in_plt,
+ &donelist);
if (symp != NULL &&
(def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
def = symp;
@@ -790,7 +839,7 @@ find_symdef(unsigned long symnum, Obj_Entry *refobj,
/* Search all RTLD_GLOBAL objects. */
if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
- symp = symlook_list(name, hash, &list_global, &obj, in_plt);
+ symp = symlook_list(name, hash, &list_global, &obj, in_plt, &donelist);
if (symp != NULL &&
(def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
def = symp;
@@ -919,23 +968,24 @@ gethints(void)
static void
init_dag(Obj_Entry *root)
{
- curmark++;
- init_dag1(root, root);
+ DoneList donelist;
+
+ donelist_init(&donelist);
+ init_dag1(root, root, &donelist);
}
static void
-init_dag1(Obj_Entry *root, Obj_Entry *obj)
+init_dag1(Obj_Entry *root, Obj_Entry *obj, DoneList *dlp)
{
const Needed_Entry *needed;
- if (obj->mark == curmark)
+ if (donelist_check(dlp, obj))
return;
- obj->mark = curmark;
objlist_add(&obj->dldags, root);
objlist_add(&root->dagmembers, obj);
for (needed = obj->needed; needed != NULL; needed = needed->next)
if (needed->obj != NULL)
- init_dag1(root, needed->obj);
+ init_dag1(root, needed->obj, dlp);
}
/*
@@ -971,6 +1021,7 @@ init_rtld(caddr_t mapbase)
*/
obj_list = &obj_rtld;
obj_tail = &obj_rtld.next;
+ obj_count = 1;
relocate_objects(&obj_rtld, true);
}
@@ -978,6 +1029,7 @@ init_rtld(caddr_t mapbase)
/* Make the object list empty again. */
obj_list = NULL;
obj_tail = &obj_list;
+ obj_count = 0;
/* Replace the path with a dynamically allocated copy. */
obj_rtld.path = xstrdup(obj_rtld.path);
@@ -1118,6 +1170,7 @@ load_object(char *path)
*obj_tail = obj;
obj_tail = &obj->next;
+ obj_count++;
linkmap_add(obj); /* for GDB */
dbg(" %p .. %p: %s", obj->mapbase,
@@ -1131,9 +1184,24 @@ load_object(char *path)
return obj;
}
+/*
+ * Check for locking violations and die if one is found.
+ */
static void
-lock_nop(void *lock)
+lock_check(void)
{
+ int rcount, wcount;
+
+ rcount = lockinfo.rcount;
+ wcount = lockinfo.wcount;
+ assert(rcount >= 0);
+ assert(wcount >= 0);
+ if (wcount > 1 || (wcount != 0 && rcount != 0)) {
+ _rtld_error("Application locking error: %d readers and %d writers"
+ " in dynamic linker. See DLLOCKINIT(3) in manual pages.",
+ rcount, wcount);
+ die();
+ }
}
static Obj_Entry *
@@ -1317,7 +1385,7 @@ dlclose(void *handle)
wlock_acquire();
root = dlcheck(handle);
if (root == NULL) {
- lock_release();
+ wlock_release();
return -1;
}
@@ -1336,7 +1404,7 @@ dlclose(void *handle)
if (obj->refcount == 0 && obj->fini != NULL)
funclist_push_tail(&finilist, obj->fini);
- lock_release();
+ wlock_release();
funclist_call(&finilist);
wlock_acquire();
funclist_clear(&finilist);
@@ -1346,7 +1414,7 @@ dlclose(void *handle)
unload_object(root);
GDB_STATE(RT_CONSISTENT);
}
- lock_release();
+ wlock_release();
return 0;
}
@@ -1358,6 +1426,9 @@ dlerror(void)
return msg;
}
+/*
+ * This function is deprecated and has no effect.
+ */
void
dllockinit(void *context,
void *(*lock_create)(void *context),
@@ -1367,68 +1438,14 @@ dllockinit(void *context,
void (*lock_destroy)(void *lock),
void (*context_destroy)(void *context))
{
- bool is_dflt = false;
-
- /* NULL arguments mean reset to the built-in locks. */
- if (lock_create == NULL) {
- is_dflt = true;
- context = NULL;
- lock_create = lockdflt_create;
- rlock_acquire = wlock_acquire = lockdflt_acquire;
- lock_release = lockdflt_release;
- lock_destroy = lockdflt_destroy;
- context_destroy = NULL;
- }
-
- /* Temporarily set locking methods to no-ops. */
- lockinfo.rlock_acquire = lock_nop;
- lockinfo.wlock_acquire = lock_nop;
- lockinfo.lock_release = lock_nop;
-
- /* Release any existing locks and context. */
- if (lockinfo.lock_destroy != NULL)
- lockinfo.lock_destroy(lockinfo.thelock);
- if (lockinfo.context_destroy != NULL)
- lockinfo.context_destroy(lockinfo.context);
-
- /*
- * Make sure the shared objects containing the locking methods are
- * fully bound, to avoid infinite recursion when they are called
- * from the lazy binding code.
- */
- if (!is_dflt) {
- prebind((void *)rlock_acquire);
- prebind((void *)wlock_acquire);
- prebind((void *)lock_release);
- }
-
- /* Allocate our lock. */
- lockinfo.thelock = lock_create(lockinfo.context);
-
- /* Record the new method information. */
- lockinfo.context = context;
- lockinfo.rlock_acquire = rlock_acquire;
- lockinfo.wlock_acquire = wlock_acquire;
- lockinfo.lock_release = lock_release;
- lockinfo.lock_destroy = lock_destroy;
- lockinfo.context_destroy = context_destroy;
-}
-
-static void
-prebind(void *addr)
-{
- Obj_Entry *obj;
-
- if ((obj = obj_from_addr(addr)) == NULL) {
- _rtld_error("Cannot determine shared object of locking method at %p",
- addr);
- die();
- }
- if (!obj->rtld && !obj->jmpslots_done) {
- dbg("Pre-binding %s for locking", obj->path);
- if (reloc_jmpslots(obj) == -1)
- die();
- }
+ static void *cur_context;
+ static void (*cur_context_destroy)(void *);
+
+ /* Just destroy the context from the previous call, if necessary. */
+ if (cur_context_destroy != NULL)
+ cur_context_destroy(cur_context);
+ cur_context = context;
+ cur_context_destroy = context_destroy;
}
void *
@@ -1482,11 +1499,11 @@ dlopen(const char *name, int mode)
GDB_STATE(RT_CONSISTENT);
/* Call the init functions with no locks held. */
- lock_release();
+ wlock_release();
funclist_call(&initlist);
wlock_acquire();
funclist_clear(&initlist);
- lock_release();
+ wlock_release();
return obj;
}
@@ -1502,14 +1519,14 @@ dlsym(void *handle, const char *name)
def = NULL;
defobj = NULL;
- wlock_acquire();
+ rlock_acquire();
if (handle == NULL || handle == RTLD_NEXT) {
void *retaddr;
retaddr = __builtin_return_address(0); /* __GNUC__ only */
if ((obj = obj_from_addr(retaddr)) == NULL) {
_rtld_error("Cannot determine caller's shared object");
- lock_release();
+ rlock_release();
return NULL;
}
if (handle == NULL) { /* Just the caller's shared object. */
@@ -1525,14 +1542,17 @@ dlsym(void *handle, const char *name)
}
} else {
if ((obj = dlcheck(handle)) == NULL) {
- lock_release();
+ rlock_release();
return NULL;
}
if (obj->mainprog) {
+ DoneList donelist;
+
/* Search main program and all libraries loaded by it. */
- curmark++;
- def = symlook_list(name, hash, &list_main, &defobj, true);
+ donelist_init(&donelist);
+ def = symlook_list(name, hash, &list_main, &defobj, true,
+ &donelist);
} else {
/*
* XXX - This isn't correct. The search should include the whole
@@ -1544,12 +1564,12 @@ dlsym(void *handle, const char *name)
}
if (def != NULL) {
- lock_release();
+ rlock_release();
return defobj->relocbase + def->st_value;
}
_rtld_error("Undefined symbol \"%s\"", name);
- lock_release();
+ rlock_release();
return NULL;
}
@@ -1561,11 +1581,11 @@ dladdr(const void *addr, Dl_info *info)
void *symbol_addr;
unsigned long symoffset;
- wlock_acquire();
+ rlock_acquire();
obj = obj_from_addr(addr);
if (obj == NULL) {
_rtld_error("No shared object contains address");
- lock_release();
+ rlock_release();
return 0;
}
info->dli_fname = obj->path;
@@ -1604,7 +1624,7 @@ dladdr(const void *addr, Dl_info *info)
if (info->dli_saddr == addr)
break;
}
- lock_release();
+ rlock_release();
return 1;
}
@@ -1695,7 +1715,7 @@ set_program_var(const char *name, const void *value)
static const Elf_Sym *
symlook_list(const char *name, unsigned long hash, Objlist *objlist,
- const Obj_Entry **defobj_out, bool in_plt)
+ const Obj_Entry **defobj_out, bool in_plt, DoneList *dlp)
{
const Elf_Sym *symp;
const Elf_Sym *def;
@@ -1705,9 +1725,8 @@ symlook_list(const char *name, unsigned long hash, Objlist *objlist,
def = NULL;
defobj = NULL;
STAILQ_FOREACH(elm, objlist, link) {
- if (elm->obj->mark == curmark)
+ if (donelist_check(dlp, elm->obj))
continue;
- elm->obj->mark = curmark;
if ((symp = symlook_obj(name, hash, elm->obj, in_plt)) != NULL) {
if (def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK) {
def = symp;
@@ -1877,6 +1896,7 @@ unload_object(Obj_Entry *root)
munmap(obj->mapbase, obj->mapsize);
linkmap_delete(obj);
*linkp = obj->next;
+ obj_count--;
obj_free(obj);
} else
linkp = &obj->next;
diff --git a/libexec/rtld-elf/rtld.h b/libexec/rtld-elf/rtld.h
index 6d1ebbf..ab00437 100644
--- a/libexec/rtld-elf/rtld.h
+++ b/libexec/rtld-elf/rtld.h
@@ -77,6 +77,23 @@ typedef struct Struct_Needed_Entry {
unsigned long name; /* Offset of name in string table */
} Needed_Entry;
+/* Lock object */
+typedef struct Struct_LockInfo {
+ void *context; /* Client context for creating locks */
+ void *thelock; /* The one big lock */
+ /* Debugging aids. */
+ volatile int rcount; /* Number of readers holding lock */
+ volatile int wcount; /* Number of writers holding lock */
+ /* Methods */
+ void *(*lock_create)(void *context);
+ void (*rlock_acquire)(void *lock);
+ void (*wlock_acquire)(void *lock);
+ void (*rlock_release)(void *lock);
+ void (*wlock_release)(void *lock);
+ void (*lock_destroy)(void *lock);
+ void (*context_destroy)(void *context);
+} LockInfo;
+
/*
* Shared object descriptor.
*
@@ -149,7 +166,6 @@ typedef struct Struct_Obj_Entry {
Objlist dagmembers; /* DAG has these members (%) */
dev_t dev; /* Object's filesystem's device */
ino_t ino; /* Object's inode number */
- unsigned long mark; /* Set to "curmark" to avoid repeat visits */
} Obj_Entry;
#define RTLD_MAGIC 0xd550b87a
@@ -170,10 +186,7 @@ unsigned long elf_hash(const char *);
const Elf_Sym *find_symdef(unsigned long, Obj_Entry *, const Obj_Entry **,
bool);
void init_pltgot(Obj_Entry *);
-void lockdflt_acquire(void *);
-void *lockdflt_create(void *);
-void lockdflt_destroy(void *);
-void lockdflt_release(void *);
+void lockdflt_init(LockInfo *);
void obj_free(Obj_Entry *);
Obj_Entry *obj_new(void);
int reloc_non_plt(Obj_Entry *, Obj_Entry *);