Diffstat (limited to 'libexec/rtld-elf')
-rw-r--r--  libexec/rtld-elf/alpha/lockdflt.c     151
-rw-r--r--  libexec/rtld-elf/alpha/rtld_machdep.h   8
-rw-r--r--  libexec/rtld-elf/alpha/rtld_start.S    53
-rw-r--r--  libexec/rtld-elf/amd64/lockdflt.c     242
-rw-r--r--  libexec/rtld-elf/amd64/rtld_machdep.h  23
-rw-r--r--  libexec/rtld-elf/i386/lockdflt.c      242
-rw-r--r--  libexec/rtld-elf/i386/rtld_machdep.h   23
-rw-r--r--  libexec/rtld-elf/lockdflt.c            89
-rw-r--r--  libexec/rtld-elf/rtld.c               250
-rw-r--r--  libexec/rtld-elf/rtld.h                23
10 files changed, 770 insertions(+), 334 deletions(-)
diff --git a/libexec/rtld-elf/alpha/lockdflt.c b/libexec/rtld-elf/alpha/lockdflt.c
index 4233b36..65900a6 100644
--- a/libexec/rtld-elf/alpha/lockdflt.c
+++ b/libexec/rtld-elf/alpha/lockdflt.c
@@ -26,64 +26,133 @@
*/
/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
+ * Thread locking implementation for the dynamic linker.
+ *
+ * We use the "simple, non-scalable reader-preference lock" from:
+ *
+ * J. M. Mellor-Crummey and M. L. Scott. "Scalable Reader-Writer
+ * Synchronization for Shared-Memory Multiprocessors." 3rd ACM Symp. on
+ * Principles and Practice of Parallel Programming, April 1991.
+ *
+ * In this algorithm the lock is a single word. Its low-order bit is
+ * set when a writer holds the lock. The remaining high-order bits
+ * contain a count of readers desiring the lock. The algorithm requires
+ * atomic "compare_and_store" and "add" operations, which we implement
+ * using assembly language sequences in "rtld_start.S".
+ *
+ * These are spinlocks. When spinning we call nanosleep() for 1
+ * microsecond each time around the loop. This will most likely yield
+ * the CPU to other threads (including, we hope, the lockholder) allowing
+ * them to make some progress.
*/
-#include <dlfcn.h>
-#include <signal.h>
#include <stdlib.h>
+#include <time.h>
#include "debug.h"
#include "rtld.h"
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
+/*
+ * This value of CACHE_LINE_SIZE is conservative. The actual size
+ * is 32 on the 21064, 21064A, 21066, 21066A, and 21164. It is 64
+ * on the 21264. Compaq recommends sequestering each lock in its own
+ * 128-byte block to allow for future implementations with larger
+ * cache lines.
+ */
+#define CACHE_LINE_SIZE 128
-void
-lockdflt_acquire(void *lock)
+#define WAFLAG 0x1 /* A writer holds the lock */
+#define RC_INCR 0x2 /* Adjusts count of readers desiring lock */
+
+typedef struct Struct_Lock {
+ volatile int lock;
+ void *base;
+} Lock;
+
+static const struct timespec usec = { 0, 1000 }; /* 1 usec. */
+
+static void *
+lock_create(void *context)
{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
+ void *base;
+ char *p;
+ uintptr_t r;
+ Lock *l;
+
+ /*
+ * Arrange for the lock to occupy its own cache line. First, we
+ * optimistically allocate just a cache line, hoping that malloc
+ * will give us a well-aligned block of memory. If that doesn't
+ * work, we allocate a larger block and take a well-aligned cache
+ * line from it.
+ */
+ base = xmalloc(CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((uintptr_t)p % CACHE_LINE_SIZE != 0) {
+ free(base);
+ base = xmalloc(2 * CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((r = (uintptr_t)p % CACHE_LINE_SIZE) != 0)
+ p += CACHE_LINE_SIZE - r;
+ }
+ l = (Lock *)p;
+ l->base = base;
+ l->lock = 0;
+ return l;
}
-void *
-lockdflt_create(void *context)
+static void
+lock_destroy(void *lock)
{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
- return l;
+ Lock *l = (Lock *)lock;
+
+ free(l->base);
}
-void
-lockdflt_destroy(void *lock)
+static void
+rlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, RC_INCR);
+ while (l->lock & WAFLAG)
+ nanosleep(&usec, NULL);
+}
+
+static void
+wlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (cmp0_and_store_int(&l->lock, WAFLAG) != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+rlock_release(void *lock)
{
- LockDflt *l = (LockDflt *)lock;
- free(l);
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -RC_INCR);
+}
+
+static void
+wlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -WAFLAG);
}
void
-lockdflt_release(void *lock)
+lockdflt_init(LockInfo *li)
{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
+ li->context = NULL;
+ li->lock_create = lock_create;
+ li->rlock_acquire = rlock_acquire;
+ li->wlock_acquire = wlock_acquire;
+ li->rlock_release = rlock_release;
+ li->wlock_release = wlock_release;
+ li->lock_destroy = lock_destroy;
+ li->context_destroy = NULL;
}
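The comment block above describes the whole locking scheme: the lock is a single word, bit 0 is the writer flag, and the high-order bits count readers. The sketch below restates that reader-preference algorithm as self-contained C, using GCC's __sync builtins in place of the linker's private cmp0_and_store_int/atomic_add_int primitives; it is an illustration of the algorithm only, not the rtld code itself.

#include <time.h>

#define WAFLAG  0x1             /* a writer holds the lock */
#define RC_INCR 0x2             /* one reader, counted in the high bits */

static const struct timespec usec = { 0, 1000 };    /* 1 microsecond */
static volatile int lockword;                        /* single-word lock */

static void
rlock(void)
{
        /* Announce this reader, then wait for any writer to finish. */
        __sync_fetch_and_add(&lockword, RC_INCR);
        while (lockword & WAFLAG)
                nanosleep(&usec, NULL);
}

static void
runlock(void)
{
        __sync_fetch_and_add(&lockword, -RC_INCR);
}

static void
wlock(void)
{
        /* A writer may enter only while the word is 0: no readers, no writer. */
        while (!__sync_bool_compare_and_swap(&lockword, 0, WAFLAG))
                nanosleep(&usec, NULL);
}

static void
wunlock(void)
{
        __sync_fetch_and_add(&lockword, -WAFLAG);
}

Because a waiting reader bumps the count before checking WAFLAG, the word cannot return to 0 while readers are queued, so new writers are held off and readers get preference, as the cited Mellor-Crummey/Scott paper describes.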
diff --git a/libexec/rtld-elf/alpha/rtld_machdep.h b/libexec/rtld-elf/alpha/rtld_machdep.h
index 11927d6..13921c4 100644
--- a/libexec/rtld-elf/alpha/rtld_machdep.h
+++ b/libexec/rtld-elf/alpha/rtld_machdep.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1999 John D. Polstra.
+ * Copyright (c) 1999, 2000 John D. Polstra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -34,4 +34,10 @@
void reloc_jmpslot(Elf_Addr *, Elf_Addr);
+/* Atomic operations. */
+int cmp0_and_store_int(volatile int *, int);
+void atomic_add_int(volatile int *, int);
+void atomic_incr_int(volatile int *);
+void atomic_decr_int(volatile int *);
+
#endif
diff --git a/libexec/rtld-elf/alpha/rtld_start.S b/libexec/rtld-elf/alpha/rtld_start.S
index d7ec0d3..29d6178 100644
--- a/libexec/rtld-elf/alpha/rtld_start.S
+++ b/libexec/rtld-elf/alpha/rtld_start.S
@@ -3,6 +3,7 @@
/*
* Copyright 1996 Matt Thomas <matt@3am-software.com>
+ * Copyright 2000 John D. Polstra
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -165,7 +166,53 @@ $100: ldgp gp, 0(gp)
lda sp, 168(sp)
jmp $31, ($27)
.end _rtld_bind_start
-
-
-
+/*
+ * int cmp0_and_store_int(volatile int *p, int newval);
+ *
+ * If an int holds 0, store newval into it; else do nothing. Returns
+ * the previous value.
+ */
+LEAF(cmp0_and_store_int, 2)
+1: mov a1, t0
+ ldl_l v0, 0(a0)
+ bne v0, 3f
+ stl_c t0, 0(a0)
+ beq t0, 2f
+ mb
+ RET
+2: br 1b
+3: RET
+END(cmp0_and_store_int)
+
+LEAF(atomic_add_int, 2)
+0: ldl_l t0, 0(a0)
+ addq t0, a1, t0
+ stl_c t0, 0(a0)
+ beq t0, 1f
+ mb
+ RET
+1: br 0b
+END(atomic_add_int)
+
+/* Atomically increment an int. */
+LEAF(atomic_incr_int, 1)
+0: ldl_l t0, 0(a0)
+ addq t0, 1, t0
+ stl_c t0, 0(a0)
+ beq t0, 1f
+ mb
+ RET
+1: br 0b
+END(atomic_incr_int)
+
+/* Atomically decrement an int. */
+LEAF(atomic_decr_int, 1)
+0: ldl_l t0, 0(a0)
+ subq t0, 1, t0
+ stl_c t0, 0(a0)
+ beq t0, 1f
+ mb
+ RET
+1: br 0b
+END(atomic_decr_int)
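Each assembly routine above is a load-locked/store-conditional retry loop followed by a memory barrier. Ignoring barrier placement, their effect is the same as the following C, written with GCC __sync builtins purely as a functional paraphrase of the Alpha sequences:

/* Store newval into *p only if *p currently holds 0; return the old value. */
int
cmp0_and_store_int(volatile int *p, int newval)
{
        return __sync_val_compare_and_swap(p, 0, newval);
}

void
atomic_add_int(volatile int *p, int incr)
{
        __sync_fetch_and_add(p, incr);
}

void
atomic_incr_int(volatile int *p)
{
        __sync_fetch_and_add(p, 1);
}

void
atomic_decr_int(volatile int *p)
{
        __sync_fetch_and_sub(p, 1);
}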
diff --git a/libexec/rtld-elf/amd64/lockdflt.c b/libexec/rtld-elf/amd64/lockdflt.c
index 4233b36..b2ca9a5 100644
--- a/libexec/rtld-elf/amd64/lockdflt.c
+++ b/libexec/rtld-elf/amd64/lockdflt.c
@@ -26,64 +26,228 @@
*/
/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
+ * Thread locking implementation for the dynamic linker.
+ *
+ * On 80486 and later CPUs we use the "simple, non-scalable
+ * reader-preference lock" from:
+ *
+ * J. M. Mellor-Crummey and M. L. Scott. "Scalable Reader-Writer
+ * Synchronization for Shared-Memory Multiprocessors." 3rd ACM Symp. on
+ * Principles and Practice of Parallel Programming, April 1991.
+ *
+ * In this algorithm the lock is a single word. Its low-order bit is
+ * set when a writer holds the lock. The remaining high-order bits
+ * contain a count of readers desiring the lock. The algorithm requires
+ * atomic "compare_and_store" and "add" operations.
+ *
+ * The "compare_and_store" operation requires the "cmpxchg" instruction
+ * on the x86. Unfortunately, the 80386 CPU does not support that
+ * instruction -- only the 80486 and later models support it. So on the
+ * 80386 we must use simple test-and-set exclusive locks instead. We
+ * determine which kind of lock to use by trying to execute a "cmpxchg"
+ * instruction and catching the SIGILL which results on the 80386.
+ *
+ * These are spinlocks. When spinning we call nanosleep() for 1
+ * microsecond each time around the loop. This will most likely yield
+ * the CPU to other threads (including, we hope, the lockholder) allowing
+ * them to make some progress.
*/
-#include <dlfcn.h>
+#include <setjmp.h>
#include <signal.h>
#include <stdlib.h>
+#include <time.h>
#include "debug.h"
#include "rtld.h"
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
+#define CACHE_LINE_SIZE 32
-void
-lockdflt_acquire(void *lock)
+#define WAFLAG 0x1 /* A writer holds the lock */
+#define RC_INCR 0x2 /* Adjusts count of readers desiring lock */
+
+typedef struct Struct_Lock {
+ volatile int lock;
+ void *base;
+} Lock;
+
+static const struct timespec usec = { 0, 1000 }; /* 1 usec. */
+
+static inline int
+cmpxchgl(int old, int new, volatile int *m)
{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
+ int result;
+
+ __asm __volatile ("lock; cmpxchgl %2, %0"
+ : "=m"(*m), "=a"(result)
+ : "r"(new), "0"(*m), "1"(old)
+ : "cc");
+
+ return result;
}
-void *
-lockdflt_create(void *context)
+static inline int
+xchgl(int v, volatile int *m)
{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
+ int result;
+
+ __asm __volatile ("xchgl %0, %1"
+ : "=r"(result), "=m"(*m)
+ : "0"(v), "1"(*m));
+
+ return result;
+}
+
+static void *
+lock_create(void *context)
+{
+ void *base;
+ char *p;
+ uintptr_t r;
+ Lock *l;
+
+ /*
+ * Arrange for the lock to occupy its own cache line. First, we
+ * optimistically allocate just a cache line, hoping that malloc
+ * will give us a well-aligned block of memory. If that doesn't
+ * work, we allocate a larger block and take a well-aligned cache
+ * line from it.
+ */
+ base = xmalloc(CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((uintptr_t)p % CACHE_LINE_SIZE != 0) {
+ free(base);
+ base = xmalloc(2 * CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((r = (uintptr_t)p % CACHE_LINE_SIZE) != 0)
+ p += CACHE_LINE_SIZE - r;
+ }
+ l = (Lock *)p;
+ l->base = base;
+ l->lock = 0;
return l;
}
-void
-lockdflt_destroy(void *lock)
+static void
+lock_destroy(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ free(l->base);
+}
+
+/*
+ * Crude exclusive locks for the 80386, which does not support the
+ * cmpxchg instruction.
+ */
+static void
+lock80386_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (xchgl(1, &l->lock) != 0)
+ while (l->lock != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+lock80386_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ l->lock = 0;
+}
+
+/*
+ * Better reader/writer locks for the 80486 and later CPUs.
+ */
+static void
+rlock_acquire(void *lock)
{
- LockDflt *l = (LockDflt *)lock;
- free(l);
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, RC_INCR);
+ while (l->lock & WAFLAG)
+ nanosleep(&usec, NULL);
+}
+
+static void
+wlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (cmpxchgl(0, WAFLAG, &l->lock) != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+rlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -RC_INCR);
+}
+
+static void
+wlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -WAFLAG);
+}
+
+/*
+ * Code to determine at runtime whether the CPU supports the cmpxchg
+ * instruction. This instruction allows us to use locks that are more
+ * efficient, but it didn't exist on the 80386.
+ */
+static jmp_buf sigill_env;
+
+static void
+sigill(int sig)
+{
+ longjmp(sigill_env, 1);
+}
+
+static int
+cpu_supports_cmpxchg(void)
+{
+ struct sigaction act, oact;
+ int result;
+ volatile int lock;
+
+ memset(&act, 0, sizeof act);
+ act.sa_handler = sigill;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = 0;
+
+ sigaction(SIGILL, &act, &oact);
+ if (setjmp(sigill_env) == 0) {
+ lock = 0;
+ cmpxchgl(0, 1, &lock);
+ result = 1;
+ } else
+ result = 0;
+ sigaction(SIGILL, &oact, NULL);
+ return result;
}
void
-lockdflt_release(void *lock)
+lockdflt_init(LockInfo *li)
{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
+ li->context = NULL;
+ li->context_destroy = NULL;
+ li->lock_create = lock_create;
+ li->lock_destroy = lock_destroy;
+ if (cpu_supports_cmpxchg()) {
+ /* Use fast locks that require an 80486 or later. */
+ li->rlock_acquire = rlock_acquire;
+ li->wlock_acquire = wlock_acquire;
+ li->rlock_release = rlock_release;
+ li->wlock_release = wlock_release;
+ } else {
+ /* It's a cruddy old 80386. */
+ li->rlock_acquire = li->wlock_acquire = lock80386_acquire;
+ li->rlock_release = li->wlock_release = lock80386_release;
+ }
}
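The cpu_supports_cmpxchg() routine above illustrates a general pattern for probing an instruction at run time: install a temporary SIGILL handler that longjmps out, try the instruction, and report whether it trapped. A stand-alone sketch of the same probe follows; the main() scaffolding is hypothetical and exists only so the fragment can be compiled and run on its own.

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

static jmp_buf sigill_env;

static void
sigill(int sig)
{
        longjmp(sigill_env, 1);         /* unwind back to setjmp() */
}

int
main(void)
{
        struct sigaction act, oact;
        volatile int lock = 0;
        int result, old = 0;

        memset(&act, 0, sizeof act);
        act.sa_handler = sigill;
        sigemptyset(&act.sa_mask);
        sigaction(SIGILL, &act, &oact);

        if (setjmp(sigill_env) == 0) {
                /* The same cmpxchgl sequence the diff uses; traps on an 80386. */
                __asm __volatile ("lock; cmpxchgl %2, %0"
                    : "=m"(lock), "=a"(result)
                    : "r"(1), "0"(lock), "1"(old)
                    : "cc");
                result = 1;
        } else
                result = 0;

        sigaction(SIGILL, &oact, NULL);         /* restore the old handler */
        printf("cmpxchg supported: %s\n", result ? "yes" : "no");
        return 0;
}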
diff --git a/libexec/rtld-elf/amd64/rtld_machdep.h b/libexec/rtld-elf/amd64/rtld_machdep.h
index b44129a..37a81d3 100644
--- a/libexec/rtld-elf/amd64/rtld_machdep.h
+++ b/libexec/rtld-elf/amd64/rtld_machdep.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1999 John D. Polstra.
+ * Copyright (c) 1999, 2000 John D. Polstra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,4 +41,25 @@
(*(Elf_Addr *)(where) = (Elf_Addr)(target)); \
} while (0)
+static inline void
+atomic_decr_int(volatile int *p)
+{
+ __asm __volatile ("lock; decl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_incr_int(volatile int *p)
+{
+ __asm __volatile ("lock; incl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_add_int(volatile int *p, int val)
+{
+ __asm __volatile ("lock; addl %1, %0"
+ : "=m"(*p)
+ : "ri"(val), "0"(*p)
+ : "cc");
+}
+
#endif
diff --git a/libexec/rtld-elf/i386/lockdflt.c b/libexec/rtld-elf/i386/lockdflt.c
index 4233b36..b2ca9a5 100644
--- a/libexec/rtld-elf/i386/lockdflt.c
+++ b/libexec/rtld-elf/i386/lockdflt.c
@@ -26,64 +26,228 @@
*/
/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
+ * Thread locking implementation for the dynamic linker.
+ *
+ * On 80486 and later CPUs we use the "simple, non-scalable
+ * reader-preference lock" from:
+ *
+ * J. M. Mellor-Crummey and M. L. Scott. "Scalable Reader-Writer
+ * Synchronization for Shared-Memory Multiprocessors." 3rd ACM Symp. on
+ * Principles and Practice of Parallel Programming, April 1991.
+ *
+ * In this algorithm the lock is a single word. Its low-order bit is
+ * set when a writer holds the lock. The remaining high-order bits
+ * contain a count of readers desiring the lock. The algorithm requires
+ * atomic "compare_and_store" and "add" operations.
+ *
+ * The "compare_and_store" operation requires the "cmpxchg" instruction
+ * on the x86. Unfortunately, the 80386 CPU does not support that
+ * instruction -- only the 80486 and later models support it. So on the
+ * 80386 we must use simple test-and-set exclusive locks instead. We
+ * determine which kind of lock to use by trying to execute a "cmpxchg"
+ * instruction and catching the SIGILL which results on the 80386.
+ *
+ * These are spinlocks. When spinning we call nanosleep() for 1
+ * microsecond each time around the loop. This will most likely yield
+ * the CPU to other threads (including, we hope, the lockholder) allowing
+ * them to make some progress.
*/
-#include <dlfcn.h>
+#include <setjmp.h>
#include <signal.h>
#include <stdlib.h>
+#include <time.h>
#include "debug.h"
#include "rtld.h"
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
+#define CACHE_LINE_SIZE 32
-void
-lockdflt_acquire(void *lock)
+#define WAFLAG 0x1 /* A writer holds the lock */
+#define RC_INCR 0x2 /* Adjusts count of readers desiring lock */
+
+typedef struct Struct_Lock {
+ volatile int lock;
+ void *base;
+} Lock;
+
+static const struct timespec usec = { 0, 1000 }; /* 1 usec. */
+
+static inline int
+cmpxchgl(int old, int new, volatile int *m)
{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
+ int result;
+
+ __asm __volatile ("lock; cmpxchgl %2, %0"
+ : "=m"(*m), "=a"(result)
+ : "r"(new), "0"(*m), "1"(old)
+ : "cc");
+
+ return result;
}
-void *
-lockdflt_create(void *context)
+static inline int
+xchgl(int v, volatile int *m)
{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
+ int result;
+
+ __asm __volatile ("xchgl %0, %1"
+ : "=r"(result), "=m"(*m)
+ : "0"(v), "1"(*m));
+
+ return result;
+}
+
+static void *
+lock_create(void *context)
+{
+ void *base;
+ char *p;
+ uintptr_t r;
+ Lock *l;
+
+ /*
+ * Arrange for the lock to occupy its own cache line. First, we
+ * optimistically allocate just a cache line, hoping that malloc
+ * will give us a well-aligned block of memory. If that doesn't
+ * work, we allocate a larger block and take a well-aligned cache
+ * line from it.
+ */
+ base = xmalloc(CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((uintptr_t)p % CACHE_LINE_SIZE != 0) {
+ free(base);
+ base = xmalloc(2 * CACHE_LINE_SIZE);
+ p = (char *)base;
+ if ((r = (uintptr_t)p % CACHE_LINE_SIZE) != 0)
+ p += CACHE_LINE_SIZE - r;
+ }
+ l = (Lock *)p;
+ l->base = base;
+ l->lock = 0;
return l;
}
-void
-lockdflt_destroy(void *lock)
+static void
+lock_destroy(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ free(l->base);
+}
+
+/*
+ * Crude exclusive locks for the 80386, which does not support the
+ * cmpxchg instruction.
+ */
+static void
+lock80386_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (xchgl(1, &l->lock) != 0)
+ while (l->lock != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+lock80386_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ l->lock = 0;
+}
+
+/*
+ * Better reader/writer locks for the 80486 and later CPUs.
+ */
+static void
+rlock_acquire(void *lock)
{
- LockDflt *l = (LockDflt *)lock;
- free(l);
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, RC_INCR);
+ while (l->lock & WAFLAG)
+ nanosleep(&usec, NULL);
+}
+
+static void
+wlock_acquire(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ while (cmpxchgl(0, WAFLAG, &l->lock) != 0)
+ nanosleep(&usec, NULL);
+}
+
+static void
+rlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -RC_INCR);
+}
+
+static void
+wlock_release(void *lock)
+{
+ Lock *l = (Lock *)lock;
+
+ atomic_add_int(&l->lock, -WAFLAG);
+}
+
+/*
+ * Code to determine at runtime whether the CPU supports the cmpxchg
+ * instruction. This instruction allows us to use locks that are more
+ * efficient, but it didn't exist on the 80386.
+ */
+static jmp_buf sigill_env;
+
+static void
+sigill(int sig)
+{
+ longjmp(sigill_env, 1);
+}
+
+static int
+cpu_supports_cmpxchg(void)
+{
+ struct sigaction act, oact;
+ int result;
+ volatile int lock;
+
+ memset(&act, 0, sizeof act);
+ act.sa_handler = sigill;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = 0;
+
+ sigaction(SIGILL, &act, &oact);
+ if (setjmp(sigill_env) == 0) {
+ lock = 0;
+ cmpxchgl(0, 1, &lock);
+ result = 1;
+ } else
+ result = 0;
+ sigaction(SIGILL, &oact, NULL);
+ return result;
}
void
-lockdflt_release(void *lock)
+lockdflt_init(LockInfo *li)
{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
+ li->context = NULL;
+ li->context_destroy = NULL;
+ li->lock_create = lock_create;
+ li->lock_destroy = lock_destroy;
+ if (cpu_supports_cmpxchg()) {
+ /* Use fast locks that require an 80486 or later. */
+ li->rlock_acquire = rlock_acquire;
+ li->wlock_acquire = wlock_acquire;
+ li->rlock_release = rlock_release;
+ li->wlock_release = wlock_release;
+ } else {
+ /* It's a cruddy old 80386. */
+ li->rlock_acquire = li->wlock_acquire = lock80386_acquire;
+ li->rlock_release = li->wlock_release = lock80386_release;
+ }
}
diff --git a/libexec/rtld-elf/i386/rtld_machdep.h b/libexec/rtld-elf/i386/rtld_machdep.h
index b44129a..37a81d3 100644
--- a/libexec/rtld-elf/i386/rtld_machdep.h
+++ b/libexec/rtld-elf/i386/rtld_machdep.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1999 John D. Polstra.
+ * Copyright (c) 1999, 2000 John D. Polstra.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -41,4 +41,25 @@
(*(Elf_Addr *)(where) = (Elf_Addr)(target)); \
} while (0)
+static inline void
+atomic_decr_int(volatile int *p)
+{
+ __asm __volatile ("lock; decl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_incr_int(volatile int *p)
+{
+ __asm __volatile ("lock; incl %0" : "=m"(*p) : "0"(*p) : "cc");
+}
+
+static inline void
+atomic_add_int(volatile int *p, int val)
+{
+ __asm __volatile ("lock; addl %1, %0"
+ : "=m"(*p)
+ : "ri"(val), "0"(*p)
+ : "cc");
+}
+
#endif
diff --git a/libexec/rtld-elf/lockdflt.c b/libexec/rtld-elf/lockdflt.c
deleted file mode 100644
index 4233b36..0000000
--- a/libexec/rtld-elf/lockdflt.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*-
- * Copyright 1999, 2000 John D. Polstra.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-/*
- * Default thread locking implementation for the dynamic linker. It
- * is used until the client registers a different implementation with
- * dllockinit(). The default implementation does mutual exclusion by
- * blocking almost all signals. This is based on the observation that
- * most userland thread packages use signals to support preemption.
- */
-
-#include <dlfcn.h>
-#include <signal.h>
-#include <stdlib.h>
-
-#include "debug.h"
-#include "rtld.h"
-
-typedef struct Struct_LockDflt {
- sigset_t lock_mask;
- sigset_t old_mask;
- int depth;
-} LockDflt;
-
-void
-lockdflt_acquire(void *lock)
-{
- LockDflt *l = (LockDflt *)lock;
- sigprocmask(SIG_BLOCK, &l->lock_mask, &l->old_mask);
- assert(l->depth == 0);
- l->depth++;
-}
-
-void *
-lockdflt_create(void *context)
-{
- LockDflt *l;
-
- l = NEW(LockDflt);
- l->depth = 0;
- sigfillset(&l->lock_mask);
- sigdelset(&l->lock_mask, SIGTRAP);
- sigdelset(&l->lock_mask, SIGABRT);
- sigdelset(&l->lock_mask, SIGBUS);
- sigdelset(&l->lock_mask, SIGSEGV);
- sigdelset(&l->lock_mask, SIGKILL);
- sigdelset(&l->lock_mask, SIGSTOP);
- return l;
-}
-
-void
-lockdflt_destroy(void *lock)
-{
- LockDflt *l = (LockDflt *)lock;
- free(l);
-}
-
-void
-lockdflt_release(void *lock)
-{
- LockDflt *l = (LockDflt *)lock;
- assert(l->depth == 1);
- l->depth--;
- sigprocmask(SIG_SETMASK, &l->old_mask, NULL);
-}
diff --git a/libexec/rtld-elf/rtld.c b/libexec/rtld-elf/rtld.c
index 146b9b2..6a1ccf3 100644
--- a/libexec/rtld-elf/rtld.c
+++ b/libexec/rtld-elf/rtld.c
@@ -58,16 +58,15 @@
/* Types. */
typedef void (*func_ptr_type)();
-typedef struct Struct_LockInfo {
- void *context; /* Client context for creating locks */
- void *thelock; /* The one big lock */
- /* Methods */
- void (*rlock_acquire)(void *lock);
- void (*wlock_acquire)(void *lock);
- void (*lock_release)(void *lock);
- void (*lock_destroy)(void *lock);
- void (*context_destroy)(void *context);
-} LockInfo;
+/*
+ * This structure provides a reentrant way to keep a list of objects and
+ * check which ones have already been processed in some way.
+ */
+typedef struct Struct_DoneList {
+ Obj_Entry **objs; /* Array of object pointers */
+ unsigned int num_alloc; /* Allocated size of the array */
+ unsigned int num_used; /* Number of array slots used */
+} DoneList;
/*
* Function declarations.
@@ -77,6 +76,7 @@ static void die(void);
static void digest_dynamic(Obj_Entry *);
static Obj_Entry *digest_phdr(const Elf_Phdr *, int, caddr_t, const char *);
static Obj_Entry *dlcheck(void *);
+static bool donelist_check(DoneList *, Obj_Entry *);
static char *find_library(const char *, const Obj_Entry *);
static void funclist_call(Funclist *);
static void funclist_clear(Funclist *);
@@ -85,7 +85,7 @@ static void funclist_push_head(Funclist *, InitFunc);
static void funclist_push_tail(Funclist *, InitFunc);
static const char *gethints(void);
static void init_dag(Obj_Entry *);
-static void init_dag1(Obj_Entry *root, Obj_Entry *obj);
+static void init_dag1(Obj_Entry *root, Obj_Entry *obj, DoneList *);
static void init_rtld(caddr_t);
static bool is_exported(const Elf_Sym *);
static void linkmap_add(Obj_Entry *);
@@ -93,18 +93,17 @@ static void linkmap_delete(Obj_Entry *);
static int load_needed_objects(Obj_Entry *);
static int load_preload_objects(void);
static Obj_Entry *load_object(char *);
-static void lock_nop(void *);
+static void lock_check(void);
static Obj_Entry *obj_from_addr(const void *);
static void objlist_add(Objlist *, Obj_Entry *);
static Objlist_Entry *objlist_find(Objlist *, const Obj_Entry *);
static void objlist_remove(Objlist *, Obj_Entry *);
-static void prebind(void *);
static int relocate_objects(Obj_Entry *, bool);
static void rtld_exit(void);
static char *search_library_path(const char *, const char *);
static void set_program_var(const char *, const void *);
static const Elf_Sym *symlook_list(const char *, unsigned long,
- Objlist *, const Obj_Entry **, bool in_plt);
+ Objlist *, const Obj_Entry **, bool in_plt, DoneList *);
static void trace_loaded_objects(Obj_Entry *obj);
static void unload_object(Obj_Entry *);
static void unref_dag(Obj_Entry *);
@@ -128,7 +127,7 @@ static Obj_Entry *obj_list; /* Head of linked list of shared objects */
static Obj_Entry **obj_tail; /* Link field of last object in list */
static Obj_Entry *obj_main; /* The main program shared object */
static Obj_Entry obj_rtld; /* The dynamic linker shared object */
-static unsigned long curmark; /* Current mark value */
+static unsigned int obj_count; /* Number of objects in obj_list */
static Objlist list_global = /* Objects dlopened with RTLD_GLOBAL */
STAILQ_HEAD_INITIALIZER(list_global);
@@ -167,22 +166,45 @@ static func_ptr_type exports[] = {
char *__progname;
char **environ;
+/*
+ * Fill in a DoneList with an allocation large enough to hold all of
+ * the currently-loaded objects. Keep this as a macro since it calls
+ * alloca and we want that to occur within the scope of the caller.
+ */
+#define donelist_init(dlp) \
+ ((dlp)->objs = alloca(obj_count * sizeof (dlp)->objs[0]), \
+ assert((dlp)->objs != NULL), \
+ (dlp)->num_alloc = obj_count, \
+ (dlp)->num_used = 0)
+
static __inline void
rlock_acquire(void)
{
lockinfo.rlock_acquire(lockinfo.thelock);
+ atomic_incr_int(&lockinfo.rcount);
+ lock_check();
}
static __inline void
wlock_acquire(void)
{
lockinfo.wlock_acquire(lockinfo.thelock);
+ atomic_incr_int(&lockinfo.wcount);
+ lock_check();
}
static __inline void
-lock_release(void)
+rlock_release(void)
{
- lockinfo.lock_release(lockinfo.thelock);
+ atomic_decr_int(&lockinfo.rcount);
+ lockinfo.rlock_release(lockinfo.thelock);
+}
+
+static __inline void
+wlock_release(void)
+{
+ atomic_decr_int(&lockinfo.wcount);
+ lockinfo.wlock_release(lockinfo.thelock);
}
/*
@@ -316,6 +338,7 @@ _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp)
/* Link the main program into the list of objects. */
*obj_tail = obj_main;
obj_tail = &obj_main->next;
+ obj_count++;
obj_main->refcount++;
/* Initialize a fake symbol for resolving undefined weak references. */
@@ -358,15 +381,16 @@ _rtld(Elf_Addr *sp, func_ptr_type *exit_proc, Obj_Entry **objp)
set_program_var("__progname", argv[0] != NULL ? basename(argv[0]) : "");
set_program_var("environ", env);
- dbg("initializing default locks");
- dllockinit(NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+ dbg("initializing thread locks");
+ lockdflt_init(&lockinfo);
+ lockinfo.thelock = lockinfo.lock_create(lockinfo.context);
r_debug_state(); /* say hello to gdb! */
funclist_call(&initlist);
wlock_acquire();
funclist_clear(&initlist);
- lock_release();
+ wlock_release();
dbg("transferring control to program entry point = %p", obj_main->entry);
@@ -385,7 +409,7 @@ _rtld_bind(Obj_Entry *obj, Elf_Word reloff)
Elf_Addr *where;
Elf_Addr target;
- wlock_acquire();
+ rlock_acquire();
if (obj->pltrel)
rel = (const Elf_Rel *) ((caddr_t) obj->pltrel + reloff);
else
@@ -403,7 +427,7 @@ _rtld_bind(Obj_Entry *obj, Elf_Word reloff)
(void *)target, basename(defobj->path));
reloc_jmpslot(where, target);
- lock_release();
+ rlock_release();
return target;
}
@@ -671,6 +695,29 @@ dlcheck(void *handle)
}
/*
+ * If the given object is already in the donelist, return true. Otherwise
+ * add the object to the list and return false.
+ */
+static bool
+donelist_check(DoneList *dlp, Obj_Entry *obj)
+{
+ unsigned int i;
+
+ for (i = 0; i < dlp->num_used; i++)
+ if (dlp->objs[i] == obj)
+ return true;
+ /*
+ * Our donelist allocation should always be sufficient. But if
+	 * our thread locking isn't working properly, more shared objects
+ * could have been loaded since we allocated the list. That should
+ * never happen, but we'll handle it properly just in case it does.
+ */
+ if (dlp->num_used < dlp->num_alloc)
+ dlp->objs[dlp->num_used++] = obj;
+ return false;
+}
+
+/*
* Hash function for symbol table lookup. Don't even think about changing
* this. It is specified by the System V ABI.
*/
@@ -741,6 +788,7 @@ const Elf_Sym *
find_symdef(unsigned long symnum, Obj_Entry *refobj,
const Obj_Entry **defobj_out, bool in_plt)
{
+ DoneList donelist;
const Elf_Sym *ref;
const Elf_Sym *def;
const Elf_Sym *symp;
@@ -755,11 +803,11 @@ find_symdef(unsigned long symnum, Obj_Entry *refobj,
hash = elf_hash(name);
def = NULL;
defobj = NULL;
- curmark++;
+ donelist_init(&donelist);
- if (refobj->symbolic) { /* Look first in the referencing object */
+ /* Look first in the referencing object if linked symbolically. */
+ if (refobj->symbolic && !donelist_check(&donelist, refobj)) {
symp = symlook_obj(name, hash, refobj, in_plt);
- refobj->mark = curmark;
if (symp != NULL) {
def = symp;
defobj = refobj;
@@ -768,7 +816,7 @@ find_symdef(unsigned long symnum, Obj_Entry *refobj,
/* Search all objects loaded at program start up. */
if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
- symp = symlook_list(name, hash, &list_main, &obj, in_plt);
+ symp = symlook_list(name, hash, &list_main, &obj, in_plt, &donelist);
if (symp != NULL &&
(def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
def = symp;
@@ -780,7 +828,8 @@ find_symdef(unsigned long symnum, Obj_Entry *refobj,
STAILQ_FOREACH(elm, &refobj->dldags, link) {
if (def != NULL && ELF_ST_BIND(def->st_info) != STB_WEAK)
break;
- symp = symlook_list(name, hash, &elm->obj->dagmembers, &obj, in_plt);
+ symp = symlook_list(name, hash, &elm->obj->dagmembers, &obj, in_plt,
+ &donelist);
if (symp != NULL &&
(def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
def = symp;
@@ -790,7 +839,7 @@ find_symdef(unsigned long symnum, Obj_Entry *refobj,
/* Search all RTLD_GLOBAL objects. */
if (def == NULL || ELF_ST_BIND(def->st_info) == STB_WEAK) {
- symp = symlook_list(name, hash, &list_global, &obj, in_plt);
+ symp = symlook_list(name, hash, &list_global, &obj, in_plt, &donelist);
if (symp != NULL &&
(def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK)) {
def = symp;
@@ -919,23 +968,24 @@ gethints(void)
static void
init_dag(Obj_Entry *root)
{
- curmark++;
- init_dag1(root, root);
+ DoneList donelist;
+
+ donelist_init(&donelist);
+ init_dag1(root, root, &donelist);
}
static void
-init_dag1(Obj_Entry *root, Obj_Entry *obj)
+init_dag1(Obj_Entry *root, Obj_Entry *obj, DoneList *dlp)
{
const Needed_Entry *needed;
- if (obj->mark == curmark)
+ if (donelist_check(dlp, obj))
return;
- obj->mark = curmark;
objlist_add(&obj->dldags, root);
objlist_add(&root->dagmembers, obj);
for (needed = obj->needed; needed != NULL; needed = needed->next)
if (needed->obj != NULL)
- init_dag1(root, needed->obj);
+ init_dag1(root, needed->obj, dlp);
}
/*
@@ -971,6 +1021,7 @@ init_rtld(caddr_t mapbase)
*/
obj_list = &obj_rtld;
obj_tail = &obj_rtld.next;
+ obj_count = 1;
relocate_objects(&obj_rtld, true);
}
@@ -978,6 +1029,7 @@ init_rtld(caddr_t mapbase)
/* Make the object list empty again. */
obj_list = NULL;
obj_tail = &obj_list;
+ obj_count = 0;
/* Replace the path with a dynamically allocated copy. */
obj_rtld.path = xstrdup(obj_rtld.path);
@@ -1118,6 +1170,7 @@ load_object(char *path)
*obj_tail = obj;
obj_tail = &obj->next;
+ obj_count++;
linkmap_add(obj); /* for GDB */
dbg(" %p .. %p: %s", obj->mapbase,
@@ -1131,9 +1184,24 @@ load_object(char *path)
return obj;
}
+/*
+ * Check for locking violations and die if one is found.
+ */
static void
-lock_nop(void *lock)
+lock_check(void)
{
+ int rcount, wcount;
+
+ rcount = lockinfo.rcount;
+ wcount = lockinfo.wcount;
+ assert(rcount >= 0);
+ assert(wcount >= 0);
+ if (wcount > 1 || (wcount != 0 && rcount != 0)) {
+ _rtld_error("Application locking error: %d readers and %d writers"
+ " in dynamic linker. See DLLOCKINIT(3) in manual pages.",
+ rcount, wcount);
+ die();
+ }
}
static Obj_Entry *
@@ -1317,7 +1385,7 @@ dlclose(void *handle)
wlock_acquire();
root = dlcheck(handle);
if (root == NULL) {
- lock_release();
+ wlock_release();
return -1;
}
@@ -1336,7 +1404,7 @@ dlclose(void *handle)
if (obj->refcount == 0 && obj->fini != NULL)
funclist_push_tail(&finilist, obj->fini);
- lock_release();
+ wlock_release();
funclist_call(&finilist);
wlock_acquire();
funclist_clear(&finilist);
@@ -1346,7 +1414,7 @@ dlclose(void *handle)
unload_object(root);
GDB_STATE(RT_CONSISTENT);
}
- lock_release();
+ wlock_release();
return 0;
}
@@ -1358,6 +1426,9 @@ dlerror(void)
return msg;
}
+/*
+ * This function is deprecated and has no effect.
+ */
void
dllockinit(void *context,
void *(*lock_create)(void *context),
@@ -1367,68 +1438,14 @@ dllockinit(void *context,
void (*lock_destroy)(void *lock),
void (*context_destroy)(void *context))
{
- bool is_dflt = false;
-
- /* NULL arguments mean reset to the built-in locks. */
- if (lock_create == NULL) {
- is_dflt = true;
- context = NULL;
- lock_create = lockdflt_create;
- rlock_acquire = wlock_acquire = lockdflt_acquire;
- lock_release = lockdflt_release;
- lock_destroy = lockdflt_destroy;
- context_destroy = NULL;
- }
-
- /* Temporarily set locking methods to no-ops. */
- lockinfo.rlock_acquire = lock_nop;
- lockinfo.wlock_acquire = lock_nop;
- lockinfo.lock_release = lock_nop;
-
- /* Release any existing locks and context. */
- if (lockinfo.lock_destroy != NULL)
- lockinfo.lock_destroy(lockinfo.thelock);
- if (lockinfo.context_destroy != NULL)
- lockinfo.context_destroy(lockinfo.context);
-
- /*
- * Make sure the shared objects containing the locking methods are
- * fully bound, to avoid infinite recursion when they are called
- * from the lazy binding code.
- */
- if (!is_dflt) {
- prebind((void *)rlock_acquire);
- prebind((void *)wlock_acquire);
- prebind((void *)lock_release);
- }
-
- /* Allocate our lock. */
- lockinfo.thelock = lock_create(lockinfo.context);
-
- /* Record the new method information. */
- lockinfo.context = context;
- lockinfo.rlock_acquire = rlock_acquire;
- lockinfo.wlock_acquire = wlock_acquire;
- lockinfo.lock_release = lock_release;
- lockinfo.lock_destroy = lock_destroy;
- lockinfo.context_destroy = context_destroy;
-}
-
-static void
-prebind(void *addr)
-{
- Obj_Entry *obj;
-
- if ((obj = obj_from_addr(addr)) == NULL) {
- _rtld_error("Cannot determine shared object of locking method at %p",
- addr);
- die();
- }
- if (!obj->rtld && !obj->jmpslots_done) {
- dbg("Pre-binding %s for locking", obj->path);
- if (reloc_jmpslots(obj) == -1)
- die();
- }
+ static void *cur_context;
+ static void (*cur_context_destroy)(void *);
+
+ /* Just destroy the context from the previous call, if necessary. */
+ if (cur_context_destroy != NULL)
+ cur_context_destroy(cur_context);
+ cur_context = context;
+ cur_context_destroy = context_destroy;
}
void *
@@ -1482,11 +1499,11 @@ dlopen(const char *name, int mode)
GDB_STATE(RT_CONSISTENT);
/* Call the init functions with no locks held. */
- lock_release();
+ wlock_release();
funclist_call(&initlist);
wlock_acquire();
funclist_clear(&initlist);
- lock_release();
+ wlock_release();
return obj;
}
@@ -1502,14 +1519,14 @@ dlsym(void *handle, const char *name)
def = NULL;
defobj = NULL;
- wlock_acquire();
+ rlock_acquire();
if (handle == NULL || handle == RTLD_NEXT) {
void *retaddr;
retaddr = __builtin_return_address(0); /* __GNUC__ only */
if ((obj = obj_from_addr(retaddr)) == NULL) {
_rtld_error("Cannot determine caller's shared object");
- lock_release();
+ rlock_release();
return NULL;
}
if (handle == NULL) { /* Just the caller's shared object. */
@@ -1525,14 +1542,17 @@ dlsym(void *handle, const char *name)
}
} else {
if ((obj = dlcheck(handle)) == NULL) {
- lock_release();
+ rlock_release();
return NULL;
}
if (obj->mainprog) {
+ DoneList donelist;
+
/* Search main program and all libraries loaded by it. */
- curmark++;
- def = symlook_list(name, hash, &list_main, &defobj, true);
+ donelist_init(&donelist);
+ def = symlook_list(name, hash, &list_main, &defobj, true,
+ &donelist);
} else {
/*
* XXX - This isn't correct. The search should include the whole
@@ -1544,12 +1564,12 @@ dlsym(void *handle, const char *name)
}
if (def != NULL) {
- lock_release();
+ rlock_release();
return defobj->relocbase + def->st_value;
}
_rtld_error("Undefined symbol \"%s\"", name);
- lock_release();
+ rlock_release();
return NULL;
}
@@ -1561,11 +1581,11 @@ dladdr(const void *addr, Dl_info *info)
void *symbol_addr;
unsigned long symoffset;
- wlock_acquire();
+ rlock_acquire();
obj = obj_from_addr(addr);
if (obj == NULL) {
_rtld_error("No shared object contains address");
- lock_release();
+ rlock_release();
return 0;
}
info->dli_fname = obj->path;
@@ -1604,7 +1624,7 @@ dladdr(const void *addr, Dl_info *info)
if (info->dli_saddr == addr)
break;
}
- lock_release();
+ rlock_release();
return 1;
}
@@ -1695,7 +1715,7 @@ set_program_var(const char *name, const void *value)
static const Elf_Sym *
symlook_list(const char *name, unsigned long hash, Objlist *objlist,
- const Obj_Entry **defobj_out, bool in_plt)
+ const Obj_Entry **defobj_out, bool in_plt, DoneList *dlp)
{
const Elf_Sym *symp;
const Elf_Sym *def;
@@ -1705,9 +1725,8 @@ symlook_list(const char *name, unsigned long hash, Objlist *objlist,
def = NULL;
defobj = NULL;
STAILQ_FOREACH(elm, objlist, link) {
- if (elm->obj->mark == curmark)
+ if (donelist_check(dlp, elm->obj))
continue;
- elm->obj->mark = curmark;
if ((symp = symlook_obj(name, hash, elm->obj, in_plt)) != NULL) {
if (def == NULL || ELF_ST_BIND(symp->st_info) != STB_WEAK) {
def = symp;
@@ -1877,6 +1896,7 @@ unload_object(Obj_Entry *root)
munmap(obj->mapbase, obj->mapsize);
linkmap_delete(obj);
*linkp = obj->next;
+ obj_count--;
obj_free(obj);
} else
linkp = &obj->next;
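The DoneList added to rtld.c replaces the old global curmark/mark scheme with a per-call visited set, which is what makes the symbol-search and DAG-traversal paths safe to reenter under the new read locks. The fragment below sketches the idea in isolation; the Node type and search_both() driver are hypothetical stand-ins for Obj_Entry and the real search lists, and it uses malloc() where the diff uses an alloca()-based macro so the storage lives in the caller's frame.

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

typedef struct Node {
        struct Node *next;              /* stand-in for Obj_Entry linkage */
} Node;

typedef struct DoneList {
        Node **objs;                    /* nodes already visited */
        unsigned int num_alloc;         /* allocated slots */
        unsigned int num_used;          /* slots in use */
} DoneList;

/* Return true if obj was seen before; otherwise record it and return false. */
static bool
donelist_check(DoneList *dlp, Node *obj)
{
        unsigned int i;

        for (i = 0; i < dlp->num_used; i++)
                if (dlp->objs[i] == obj)
                        return true;
        if (dlp->num_used < dlp->num_alloc)
                dlp->objs[dlp->num_used++] = obj;
        return false;
}

/* Search two lists, processing a node only once even if it is on both. */
static void
search_both(Node *lista, Node *listb, unsigned int total)
{
        Node *heads[2] = { lista, listb };
        DoneList dl;
        Node *n;
        int i;

        dl.objs = malloc(total * sizeof dl.objs[0]);
        assert(dl.objs != NULL);
        dl.num_alloc = total;
        dl.num_used = 0;
        for (i = 0; i < 2; i++)
                for (n = heads[i]; n != NULL; n = n->next) {
                        if (donelist_check(&dl, n))
                                continue;
                        /* process n exactly once here */
                }
        free(dl.objs);
}

Because each traversal carries its own DoneList instead of stamping a shared mark field, concurrent lookups under the read lock no longer interfere with one another.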
diff --git a/libexec/rtld-elf/rtld.h b/libexec/rtld-elf/rtld.h
index 6d1ebbf..ab00437 100644
--- a/libexec/rtld-elf/rtld.h
+++ b/libexec/rtld-elf/rtld.h
@@ -77,6 +77,23 @@ typedef struct Struct_Needed_Entry {
unsigned long name; /* Offset of name in string table */
} Needed_Entry;
+/* Lock object */
+typedef struct Struct_LockInfo {
+ void *context; /* Client context for creating locks */
+ void *thelock; /* The one big lock */
+ /* Debugging aids. */
+ volatile int rcount; /* Number of readers holding lock */
+ volatile int wcount; /* Number of writers holding lock */
+ /* Methods */
+ void *(*lock_create)(void *context);
+ void (*rlock_acquire)(void *lock);
+ void (*wlock_acquire)(void *lock);
+ void (*rlock_release)(void *lock);
+ void (*wlock_release)(void *lock);
+ void (*lock_destroy)(void *lock);
+ void (*context_destroy)(void *context);
+} LockInfo;
+
/*
* Shared object descriptor.
*
@@ -149,7 +166,6 @@ typedef struct Struct_Obj_Entry {
Objlist dagmembers; /* DAG has these members (%) */
dev_t dev; /* Object's filesystem's device */
ino_t ino; /* Object's inode number */
- unsigned long mark; /* Set to "curmark" to avoid repeat visits */
} Obj_Entry;
#define RTLD_MAGIC 0xd550b87a
@@ -170,10 +186,7 @@ unsigned long elf_hash(const char *);
const Elf_Sym *find_symdef(unsigned long, Obj_Entry *, const Obj_Entry **,
bool);
void init_pltgot(Obj_Entry *);
-void lockdflt_acquire(void *);
-void *lockdflt_create(void *);
-void lockdflt_destroy(void *);
-void lockdflt_release(void *);
+void lockdflt_init(LockInfo *);
void obj_free(Obj_Entry *);
Obj_Entry *obj_new(void);
int reloc_non_plt(Obj_Entry *, Obj_Entry *);