author     jasone <jasone@FreeBSD.org>  2000-09-07 01:33:02 +0000
committer  jasone <jasone@FreeBSD.org>  2000-09-07 01:33:02 +0000
commit     769e0f974d8929599ba599ac496510fffc90ff34 (patch)
tree       9387522900085835de81e7830e570ef3f6b3ea80 /sys/kern
parent     acf1927de02afda4855ec278b1128fd9446405ea (diff)
Major update to the way synchronization is done in the kernel. Highlights
include:

* Mutual exclusion is used instead of spl*(). See mutex(9). (Note: The
  alpha port is still in transition and currently uses both.)
* Per-CPU idle processes.
* Interrupts are run in their own separate kernel threads and can be
  preempted (i386 only).

Partially contributed by: BSDi (BSD/OS)
Submissions by (at least): cp, dfr, dillon, grog, jake, jhb, sheldonh
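As a rough illustration of the first highlight, here is a minimal sketch (not part of the commit) contrasting the old spl*() style with the mutex API this change introduces. The counter and its mutex are hypothetical; mtx_enter()/mtx_exit() and the MTX_DEF flag appear throughout the diffs below.

static int counter;		/* hypothetical shared datum */
static mtx_t counter_mtx;	/* hypothetical; assumed mtx_init()'d at boot */

/* Before: mask interrupt classes around the critical section (UP only). */
static void
bump_counter_old(void)
{
	int s;

	s = splhigh();		/* raise interrupt priority level */
	counter++;		/* critical section */
	splx(s);		/* restore previous level */
}

/* After: a mutex gives real mutual exclusion across CPUs. */
static void
bump_counter_new(void)
{
	mtx_enter(&counter_mtx, MTX_DEF);
	counter++;
	mtx_exit(&counter_mtx, MTX_DEF);
}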
Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/init_main.c       33
-rw-r--r--  sys/kern/kern_clock.c      29
-rw-r--r--  sys/kern/kern_exit.c        1
-rw-r--r--  sys/kern/kern_fork.c       80
-rw-r--r--  sys/kern/kern_idle.c      108
-rw-r--r--  sys/kern/kern_kthread.c    17
-rw-r--r--  sys/kern/kern_mutex.c     799
-rw-r--r--  sys/kern/kern_proc.c        1
-rw-r--r--  sys/kern/kern_resource.c    2
-rw-r--r--  sys/kern/kern_shutdown.c    9
-rw-r--r--  sys/kern/kern_sig.c         3
-rw-r--r--  sys/kern/kern_subr.c        7
-rw-r--r--  sys/kern/kern_switch.c    100
-rw-r--r--  sys/kern/kern_synch.c     115
-rw-r--r--  sys/kern/kern_tc.c          9
-rw-r--r--  sys/kern/kern_threads.c     5
-rw-r--r--  sys/kern/subr_prf.c         3
-rw-r--r--  sys/kern/subr_prof.c        4
-rw-r--r--  sys/kern/subr_smp.c        88
-rw-r--r--  sys/kern/subr_trap.c      391
-rw-r--r--  sys/kern/subr_turnstile.c 799
-rw-r--r--  sys/kern/subr_witness.c   799
-rw-r--r--  sys/kern/tty.c              3
-rw-r--r--  sys/kern/vfs_bio.c         20
-rw-r--r--  sys/kern/vfs_export.c       4
-rw-r--r--  sys/kern/vfs_subr.c         4
26 files changed, 3116 insertions, 317 deletions
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 6d0d915..f5ae66c 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -48,6 +48,7 @@
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
+#include <sys/ktr.h>
#include <sys/mount.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
@@ -64,6 +65,8 @@
#include <sys/conf.h>
#include <machine/cpu.h>
+#include <machine/globals.h>
+#include <machine/mutex.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -260,6 +263,11 @@ proc0_init(void *dummy __unused)
p = &proc0;
/*
+ * Initialize magic number.
+ */
+ p->p_magic = P_MAGIC;
+
+ /*
* Initialize process and pgrp structures.
*/
procinit();
@@ -364,11 +372,20 @@ proc0_init(void *dummy __unused)
*/
(void)chgproccnt(cred0.p_uidinfo, 1, 0);
+ LIST_INIT(&p->p_heldmtx);
+ LIST_INIT(&p->p_contested);
+
/*
* Initialize the current process pointer (curproc) before
* any possible traps/probes to simplify trap processing.
*/
- SET_CURPROC(p);
+ PCPU_SET(curproc, p);
+
+ /*
+ * Enter the Giant mutex.
+ * XXX This should be done BEFORE cpu_startup().
+ */
+ mtx_enter(&Giant, MTX_DEF);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)
@@ -389,7 +406,7 @@ proc0_post(void *dummy __unused)
p->p_runtime = 0;
}
microuptime(&switchtime);
- switchticks = ticks;
+ PCPU_SET(switchticks, ticks);
/*
* Give the ``random'' number generator a thump.
@@ -418,7 +435,6 @@ SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)
***************************************************************************
*/
-
/*
* List of paths to try when searching for "init".
*/
@@ -444,6 +460,8 @@ start_init(void *dummy)
char *ucp, **uap, *arg0, *arg1;
struct proc *p;
+ mtx_enter(&Giant, MTX_DEF);
+
p = curproc;
/* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. */
@@ -562,16 +580,12 @@ static void
create_init(const void *udata __unused)
{
int error;
- int s;
- s = splhigh();
- error = fork1(&proc0, RFFDG | RFPROC, &initproc);
+ error = fork1(&proc0, RFFDG | RFPROC | RFSTOPPED, &initproc);
if (error)
panic("cannot fork init: %d\n", error);
initproc->p_flag |= P_INMEM | P_SYSTEM;
cpu_set_fork_handler(initproc, start_init, NULL);
- remrunqueue(initproc);
- splx(s);
}
SYSINIT(init,SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)
@@ -581,6 +595,9 @@ SYSINIT(init,SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)
static void
kick_init(const void *udata __unused)
{
+ mtx_enter(&sched_lock, MTX_SPIN);
+ initproc->p_stat = SRUN;
setrunqueue(initproc);
+ mtx_exit(&sched_lock, MTX_SPIN);
}
SYSINIT(kickinit,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
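The create_init()/kick_init() change above is an instance of a pattern this commit uses in several places: fork with RFSTOPPED so the child stays off the run queue, finish setting it up, and only then mark it runnable under sched_lock. A minimal sketch with a hypothetical child pointer:

static struct proc *child;	/* hypothetical */

static void
create_child(void)
{
	/* Create the process but keep it off the run queue. */
	if (fork1(&proc0, RFFDG | RFPROC | RFSTOPPED, &child) != 0)
		panic("cannot fork child");
	/* ... complete any setup the child needs before it first runs ... */
}

static void
kick_child(void)
{
	/* Scheduler state is now guarded by the sched_lock spin mutex. */
	mtx_enter(&sched_lock, MTX_SPIN);
	child->p_stat = SRUN;
	setrunqueue(child);
	mtx_exit(&sched_lock, MTX_SPIN);
}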
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index 11e63a7..33eef3c 100644
--- a/sys/kern/kern_clock.c
+++ b/sys/kern/kern_clock.c
@@ -70,11 +70,7 @@ static void initclocks __P((void *dummy));
SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
/* Some of these don't belong here, but it's easiest to concentrate them. */
-#if defined(SMP) && defined(BETTER_CLOCK)
long cp_time[CPUSTATES];
-#else
-static long cp_time[CPUSTATES];
-#endif
long tk_cancc;
long tk_nin;
@@ -156,7 +152,7 @@ hardclock(frame)
register struct proc *p;
p = curproc;
- if (p) {
+ if (p != idleproc) {
register struct pstats *pstats;
/*
@@ -325,12 +321,12 @@ statclock(frame)
struct rusage *ru;
struct vmspace *vm;
- if (curproc != NULL && CLKF_USERMODE(frame)) {
+ if (CLKF_USERMODE(frame)) {
/*
* Came from user mode; CPU was in user state.
* If this process is being profiled, record the tick.
*/
- p = curproc;
+ p = prevproc;
if (p->p_flag & P_PROFIL)
addupc_intr(p, CLKF_PC(frame), 1);
#if defined(SMP) && defined(BETTER_CLOCK)
@@ -379,20 +375,21 @@ statclock(frame)
* so that we know how much of its real time was spent
* in ``non-process'' (i.e., interrupt) work.
*/
- p = curproc;
- if (CLKF_INTR(frame)) {
- if (p != NULL)
- p->p_iticks++;
+ p = prevproc;
+ if (p->p_ithd) {
+ p->p_iticks++;
cp_time[CP_INTR]++;
- } else if (p != NULL) {
+ } else {
p->p_sticks++;
- cp_time[CP_SYS]++;
- } else
- cp_time[CP_IDLE]++;
+ if (p != idleproc)
+ cp_time[CP_SYS]++;
+ else
+ cp_time[CP_IDLE]++;
+ }
}
pscnt = psdiv;
- if (p != NULL) {
+ if (p != idleproc) {
schedclock(p);
/* Update resource usage integrals and maximums. */
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index df71fe0..7fccc16 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -349,7 +349,6 @@ exit1(p, rv)
*
* Other substructures are freed from wait().
*/
- SET_CURPROC(NULL);
if (--p->p_limit->p_refcnt == 0) {
FREE(p->p_limit, M_SUBPROC);
p->p_limit = NULL;
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index f24c97e..0aa31ab 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -52,6 +52,7 @@
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
+#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/jail.h>
@@ -65,6 +66,8 @@
#include <sys/user.h>
+#include <machine/mutex.h>
+
static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
static int fast_vfork = 1;
@@ -131,7 +134,8 @@ rfork(p, uap)
int error;
struct proc *p2;
- error = fork1(p, uap->flags, &p2);
+ /* mask kernel only flags out of the user flags */
+ error = fork1(p, uap->flags & ~RFKERNELONLY, &p2);
if (error == 0) {
p->p_retval[0] = p2 ? p2->p_pid : 0;
p->p_retval[1] = 0;
@@ -177,17 +181,19 @@ SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
int
fork1(p1, flags, procp)
- struct proc *p1;
+ struct proc *p1; /* parent proc */
int flags;
- struct proc **procp;
+ struct proc **procp; /* child proc */
{
struct proc *p2, *pptr;
uid_t uid;
struct proc *newproc;
+ int trypid;
int ok;
static int pidchecked = 0;
struct forklist *ep;
+ /* Can't copy and clear */
if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
return (EINVAL);
@@ -278,47 +284,56 @@ fork1(p1, flags, procp)
/*
* Find an unused process ID. We remember a range of unused IDs
* ready to use (from nextpid+1 through pidchecked-1).
+ *
+ * If RFHIGHPID is set (used during system boot), do not allocate
+ * low-numbered pids.
*/
- nextpid++;
+ trypid = nextpid + 1;
+ if (flags & RFHIGHPID) {
+ if (trypid < 10) {
+ trypid = 10;
+ }
+ } else {
if (randompid)
- nextpid += arc4random() % randompid;
+ trypid += arc4random() % randompid;
+ }
retry:
/*
* If the process ID prototype has wrapped around,
* restart somewhat above 0, as the low-numbered procs
* tend to include daemons that don't exit.
*/
- if (nextpid >= PID_MAX) {
- nextpid = nextpid % PID_MAX;
- if (nextpid < 100)
- nextpid += 100;
+ if (trypid >= PID_MAX) {
+ trypid = trypid % PID_MAX;
+ if (trypid < 100)
+ trypid += 100;
pidchecked = 0;
}
- if (nextpid >= pidchecked) {
+ if (trypid >= pidchecked) {
int doingzomb = 0;
pidchecked = PID_MAX;
/*
* Scan the active and zombie procs to check whether this pid
* is in use. Remember the lowest pid that's greater
- * than nextpid, so we can avoid checking for a while.
+ * than trypid, so we can avoid checking for a while.
*/
p2 = LIST_FIRST(&allproc);
again:
for (; p2 != 0; p2 = LIST_NEXT(p2, p_list)) {
- while (p2->p_pid == nextpid ||
- p2->p_pgrp->pg_id == nextpid ||
- p2->p_session->s_sid == nextpid) {
- nextpid++;
- if (nextpid >= pidchecked)
+ while (p2->p_pid == trypid ||
+ p2->p_pgrp->pg_id == trypid ||
+ p2->p_session->s_sid == trypid) {
+ trypid++;
+ if (trypid >= pidchecked)
goto retry;
}
- if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
+ if (p2->p_pid > trypid && pidchecked > p2->p_pid)
pidchecked = p2->p_pid;
- if (p2->p_pgrp->pg_id > nextpid &&
+ if (p2->p_pgrp->pg_id > trypid &&
pidchecked > p2->p_pgrp->pg_id)
pidchecked = p2->p_pgrp->pg_id;
- if (p2->p_session->s_sid > nextpid &&
+ if (p2->p_session->s_sid > trypid &&
pidchecked > p2->p_session->s_sid)
pidchecked = p2->p_session->s_sid;
}
@@ -331,11 +346,19 @@ again:
p2 = newproc;
p2->p_stat = SIDL; /* protect against others */
- p2->p_pid = nextpid;
+ p2->p_pid = trypid;
LIST_INSERT_HEAD(&allproc, p2, p_list);
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
/*
+ * RFHIGHPID does not mess with the nextpid counter during boot.
+ */
+ if (flags & RFHIGHPID)
+ pidchecked = 0;
+ else
+ nextpid = trypid;
+
+ /*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
@@ -456,6 +479,8 @@ again:
p2->p_pptr = pptr;
LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
LIST_INIT(&p2->p_children);
+ LIST_INIT(&p2->p_heldmtx);
+ LIST_INIT(&p2->p_contested);
#ifdef KTRACE
/*
@@ -496,14 +521,19 @@ again:
}
/*
- * Make child runnable and add to run queue.
+ * If RFSTOPPED not requested, make child runnable and add to
+ * run queue.
*/
microtime(&(p2->p_stats->p_start));
p2->p_acflag = AFORK;
- (void) splhigh();
- p2->p_stat = SRUN;
- setrunqueue(p2);
- (void) spl0();
+ if ((flags & RFSTOPPED) == 0) {
+ splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
+ p2->p_stat = SRUN;
+ setrunqueue(p2);
+ mtx_exit(&sched_lock, MTX_SPIN);
+ spl0();
+ }
/*
* Now can be swapped.
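To separate the new PID selection policy from the in-use scan, here is a sketch of the trypid logic above as a hypothetical helper. PID_MAX, RFHIGHPID, nextpid, randompid and arc4random() are as in the diff; the allproc/zombproc collision scan is elided.

static int
pick_trypid(int flags)
{
	int trypid;

	trypid = nextpid + 1;
	if (flags & RFHIGHPID) {
		if (trypid < 10)
			trypid = 10;	/* boot-time threads: avoid low pids */
	} else if (randompid)
		trypid += arc4random() % randompid;

	if (trypid >= PID_MAX) {
		trypid %= PID_MAX;	/* wrapped: restart above 0 ... */
		if (trypid < 100)
			trypid += 100;	/* ... skipping long-lived daemons */
	}
	return (trypid);
}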
diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c
new file mode 100644
index 0000000..840c0f9
--- /dev/null
+++ b/sys/kern/kern_idle.c
@@ -0,0 +1,108 @@
+/*-
+ * Copyright (c) 2000, All rights reserved. See /usr/src/COPYRIGHT
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#include <sys/vmmeter.h>
+#include <sys/sysctl.h>
+#include <sys/unistd.h>
+#include <sys/kthread.h>
+#include <sys/queue.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/cpu.h>
+#include <machine/ipl.h>
+#include <machine/mutex.h>
+#include <machine/smp.h>
+
+#include <machine/globaldata.h>
+#include <machine/globals.h>
+
+#ifdef SMP_DEBUG
+#include <sys/bus.h>
+#include <i386/isa/icu.h>
+#include <i386/isa/intr_machdep.h>
+#endif
+
+static void idle_setup(void *dummy);
+SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL)
+
+static void idle_proc(void *dummy);
+
+/*
+ * setup per-cpu idle process contexts
+ */
+static void
+idle_setup(void *dummy)
+{
+ struct globaldata *gd;
+ int error;
+
+ SLIST_FOREACH(gd, &cpuhead, gd_allcpu) {
+#ifdef SMP
+ error = kthread_create(idle_proc, NULL, &gd->gd_idleproc,
+ RFSTOPPED|RFHIGHPID, "idle: cpu%d",
+ gd->gd_cpuid);
+#else
+ error = kthread_create(idle_proc, NULL, &gd->gd_idleproc,
+ RFSTOPPED|RFHIGHPID, "idle");
+#endif
+ if (error)
+ panic("idle_setup: kthread_create error %d\n", error);
+
+ gd->gd_idleproc->p_stat = SWAIT;
+ }
+}
+
+/*
+ * idle process context
+ */
+static void
+idle_proc(void *dummy)
+{
+ int count;
+
+ for (;;) {
+ /*
+ * Clear switchtime, which prevents the idle process's time
+ * from being counted.
+ switchtime.tv_usec = 0;
+ switchtime.tv_sec = 0;
+ */
+
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ count = 0;
+
+ while (count >= 0 && procrunnable() == 0) {
+ /*
+ * This is a good place to put things to be done in
+ * the background, including sanity checks.
+ */
+ if (count++ < 0)
+ CTR0(KTR_PROC, "idle_proc: timed out waiting"
+ " for a process");
+ }
+
+ mtx_enter(&sched_lock, MTX_SPIN);
+ idleproc->p_stat = SWAIT;
+ mi_switch();
+ mtx_exit(&sched_lock, MTX_SPIN);
+ spl0();
+ }
+}
diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c
index 6373750..e684b78 100644
--- a/sys/kern/kern_kthread.c
+++ b/sys/kern/kern_kthread.c
@@ -52,24 +52,33 @@ kproc_start(udata)
int error;
error = kthread_create((void (*)(void *))kp->func, NULL,
- kp->global_procpp, kp->arg0);
+ kp->global_procpp, 0, kp->arg0);
if (error)
panic("kproc_start: %s: error %d", kp->arg0, error);
}
/*
- * Create a kernel process/thread/whatever. It shares it's address space
+ * Create a kernel process/thread/whatever. It shares its address space
* with proc0 - ie: kernel only.
+ *
+ * func is the function to start.
+ * arg is the parameter to pass to function on first startup.
+ * newpp is the return value pointing to the thread's struct proc.
+ * flags are flags to fork1 (in unistd.h)
+ * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.).
*/
int
kthread_create(void (*func)(void *), void *arg,
- struct proc **newpp, const char *fmt, ...)
+ struct proc **newpp, int flags, const char *fmt, ...)
{
int error;
va_list ap;
struct proc *p2;
- error = fork1(&proc0, RFMEM | RFFDG | RFPROC, &p2);
+ if (!proc0.p_stats /* || proc0.p_stats->p_start.tv_sec == 0 */)
+ panic("kthread_create called too soon");
+
+ error = fork1(&proc0, RFMEM | RFFDG | RFPROC | flags, &p2);
if (error)
return error;
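A usage sketch for the widened kthread_create() interface documented above; the thread body and all "example" names are hypothetical, and the new flags argument is simply forwarded to fork1():

static struct proc *exampleproc;	/* hypothetical */

static void
example_kthread(void *arg)
{
	for (;;) {
		/* ... periodic housekeeping ... */
		tsleep(&exampleproc, PWAIT, "example", hz);
	}
}

static void
example_start(void *dummy)
{
	int error;

	/* e.g. RFSTOPPED to start stopped and setrunqueue() it later */
	error = kthread_create(example_kthread, NULL, &exampleproc,
	    RFSTOPPED, "example");
	if (error)
		panic("example_start: error %d", error);
}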
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
new file mode 100644
index 0000000..1ac3f58
--- /dev/null
+++ b/sys/kern/kern_mutex.c
@@ -0,0 +1,799 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * $FreeBSD$
+ */
+
+/*
+ * Main Entry: witness
+ * Pronunciation: 'wit-n&s
+ * Function: noun
+ * Etymology: Middle English witnesse, from Old English witnes knowledge,
+ * testimony, witness, from 2wit
+ * Date: before 12th century
+ * 1 : attestation of a fact or event : TESTIMONY
+ * 2 : one that gives evidence; specifically : one who testifies in
+ * a cause or before a judicial tribunal
+ * 3 : one asked to be present at a transaction so as to be able to
+ * testify to its having taken place
+ * 4 : one who has personal knowledge of something
+ * 5 a : something serving as evidence or proof : SIGN
+ * b : public affirmation by word or example of usually
+ * religious faith or conviction <the heroic witness to divine
+ * life -- Pilot>
+ * 6 capitalized : a member of the Jehovah's Witnesses
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+
+#include <machine/cpu.h>
+#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */
+#include <machine/mutex.h>
+
+/*
+ * The non-inlined versions of the mtx_*() functions are always built (above),
+ * but the witness code depends on the SMP_DEBUG and WITNESS kernel options
+ * being specified.
+ */
+#if (defined(SMP_DEBUG) && defined(WITNESS))
+
+#define WITNESS_COUNT 200
+#define WITNESS_NCHILDREN 2
+
+#ifndef WITNESS
+#define WITNESS 0 /* default off */
+#endif
+
+#ifndef SMP
+extern int witness_spin_check;
+#endif
+
+int witness_watch;
+
+typedef struct witness {
+ struct witness *w_next;
+ char *w_description;
+ char *w_file;
+ int w_line;
+ struct witness *w_morechildren;
+ u_char w_childcnt;
+ u_char w_Giant_squawked:1;
+ u_char w_other_squawked:1;
+ u_char w_same_squawked:1;
+ u_char w_sleep:1;
+ u_char w_spin:1; /* this is a spin mutex */
+ u_int w_level;
+ struct witness *w_children[WITNESS_NCHILDREN];
+} witness_t;
+
+typedef struct witness_blessed {
+ char *b_lock1;
+ char *b_lock2;
+} witness_blessed_t;
+
+#ifdef KDEBUG
+/*
+ * When WITNESS_KDEBUG is set to 1, it will cause the system to
+ * drop into kdebug() when:
+ * - a lock hierarchy violation occurs
+ * - locks are held when going to sleep.
+ */
+#ifndef WITNESS_KDEBUG
+#define WITNESS_KDEBUG 0
+#endif
+int witness_kdebug = WITNESS_KDEBUG;
+#endif /* KDEBUG */
+
+#ifndef WITNESS_SKIPSPIN
+#define WITNESS_SKIPSPIN 0
+#endif
+int witness_skipspin = WITNESS_SKIPSPIN;
+
+
+static mtx_t w_mtx;
+static witness_t *w_free;
+static witness_t *w_all;
+static int w_inited;
+static int witness_dead; /* fatal error, probably no memory */
+
+static witness_t w_data[WITNESS_COUNT];
+
+static witness_t *enroll __P((char *description, int flag));
+static int itismychild __P((witness_t *parent, witness_t *child));
+static void removechild __P((witness_t *parent, witness_t *child));
+static int isitmychild __P((witness_t *parent, witness_t *child));
+static int isitmydescendant __P((witness_t *parent, witness_t *child));
+static int dup_ok __P((witness_t *));
+static int blessed __P((witness_t *, witness_t *));
+static void witness_displaydescendants
+ __P((void(*)(const char *fmt, ...), witness_t *));
+static void witness_leveldescendents __P((witness_t *parent, int level));
+static void witness_levelall __P((void));
+static witness_t * witness_get __P((void));
+static void witness_free __P((witness_t *m));
+
+
+static char *ignore_list[] = {
+ "witness lock",
+ "Kdebug", /* breaks rules and may or may not work */
+ "Page Alias", /* sparc only, witness lock won't block intr */
+ NULL
+};
+
+static char *spin_order_list[] = {
+ "sched lock",
+ "log mtx",
+ "zslock", /* sparc only above log, this one is a real hack */
+ "time lock", /* above callout */
+ "callout mtx", /* above wayout */
+ /*
+ * leaf locks
+ */
+ "wayout mtx",
+ "kernel_pmap", /* sparc only, logically equal "pmap" below */
+ "pmap", /* sparc only */
+ NULL
+};
+
+static char *order_list[] = {
+ "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL,
+ "udb", "inp", NULL,
+ "unp head", "unp", "so_snd", NULL,
+ "de0", "Giant lock", NULL,
+ "ifnet", "Giant lock", NULL,
+ "fifo", "so_snd", NULL,
+ "hme0", "Giant lock", NULL,
+ "esp0", "Giant lock", NULL,
+ "hfa0", "Giant lock", NULL,
+ "so_rcv", "atm_global", NULL,
+ "so_snd", "atm_global", NULL,
+ "NFS", "Giant lock", NULL,
+ NULL
+};
+
+static char *dup_list[] = {
+ "inp",
+ "process group",
+ "session",
+ "unp",
+ "rtentry",
+ "rawcb",
+ NULL
+};
+
+static char *sleep_list[] = {
+ "Giant lock",
+ NULL
+};
+
+/*
+ * Pairs of locks which have been blessed
+ * Don't complain about order problems with blessed locks
+ */
+static witness_blessed_t blessed_list[] = {
+};
+static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t);
+
+void
+witness_init(mtx_t *m, int flag)
+{
+ m->mtx_witness = enroll(m->mtx_description, flag);
+}
+
+void
+witness_destroy(mtx_t *m)
+{
+ mtx_t *m1;
+ struct proc *p;
+ p = CURPROC;
+ for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL;
+ m1 = LIST_NEXT(m1, mtx_held)) {
+ if (m1 == m) {
+ LIST_REMOVE(m, mtx_held);
+ break;
+ }
+ }
+ return;
+
+}
+
+void
+witness_enter(mtx_t *m, int flags, char *file, int line)
+{
+ witness_t *w, *w1;
+ mtx_t *m1;
+ struct proc *p;
+ int i;
+#ifdef KDEBUG
+ int go_into_kdebug = 0;
+#endif /* KDEBUG */
+
+ w = m->mtx_witness;
+ p = CURPROC;
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ i = witness_spin_check;
+ if (i != 0 && w->w_level < i) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d"
+ " already holding %s:%x",
+ m->mtx_description, w->w_level, file, line,
+ spin_order_list[ffs(i)-1], i);
+ }
+ PCPU_SET(witness_spin_check, i | w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+ if (w->w_spin)
+ panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+ if (witness_dead)
+ goto out;
+ if (cold)
+ goto out;
+
+ if (!mtx_legal2block())
+ panic("blockable mtx_enter() of %s when not legal @ %s:%d",
+ m->mtx_description, file, line);
+ /*
+ * Is this the first mutex acquired
+ */
+ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
+ goto out;
+
+
+ if ((w1 = m1->mtx_witness) == w) {
+ if (w->w_same_squawked || dup_ok(w))
+ goto out;
+ w->w_same_squawked = 1;
+ printf("acquring duplicate lock of same type: \"%s\"\n",
+ m->mtx_description);
+ printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
+ printf(" 2nd @ %s:%d\n", file, line);
+#ifdef KDEBUG
+ go_into_kdebug = 1;
+#endif /* KDEBUG */
+ goto out;
+ }
+ MPASS(!mtx_owned(&w_mtx));
+ mtx_enter(&w_mtx, MTX_SPIN);
+ /*
+ * If we have a known higher number just say ok
+ */
+ if (witness_watch > 1 && w->w_level > w1->w_level) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ goto out;
+ }
+ if (isitmydescendant(m1->mtx_witness, w)) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ goto out;
+ }
+ for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {
+
+ ASS(i < 200);
+ w1 = m1->mtx_witness;
+ if (isitmydescendant(w, w1)) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ if (blessed(w, w1))
+ goto out;
+ if (m1 == &Giant) {
+ if (w1->w_Giant_squawked)
+ goto out;
+ else
+ w1->w_Giant_squawked = 1;
+ } else {
+ if (w1->w_other_squawked)
+ goto out;
+ else
+ w1->w_other_squawked = 1;
+ }
+ printf("lock order reversal\n");
+ printf(" 1st %s last acquired @ %s:%d\n",
+ w->w_description, w->w_file, w->w_line);
+ printf(" 2nd %p %s @ %s:%d\n",
+ m1, w1->w_description, w1->w_file, w1->w_line);
+ printf(" 3rd %p %s @ %s:%d\n",
+ m, w->w_description, file, line);
+#ifdef KDEBUG
+ go_into_kdebug = 1;
+#endif /* KDEBUG */
+ goto out;
+ }
+ }
+ m1 = LIST_FIRST(&p->p_heldmtx);
+ if (!itismychild(m1->mtx_witness, w))
+ mtx_exit(&w_mtx, MTX_SPIN);
+
+out:
+#ifdef KDEBUG
+ if (witness_kdebug && go_into_kdebug)
+ kdebug();
+#endif /* KDEBUG */
+ w->w_file = file;
+ w->w_line = line;
+ m->mtx_line = line;
+ m->mtx_file = file;
+
+ /*
+ * If this pays off it likely means that a mutex being witnessed
+ * is acquired in hardclock. Put it in the ignore list. It is
+ * likely not the mutex this assert fails on.
+ */
+ ASS(m->mtx_held.le_prev == NULL);
+ LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
+}
+
+void
+witness_exit(mtx_t *m, int flags, char *file, int line)
+{
+ witness_t *w;
+
+ w = m->mtx_witness;
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+ if (w->w_spin)
+ panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+
+ if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
+ panic("switchable mtx_exit() of %s when not legal @ %s:%d",
+ m->mtx_description, file, line);
+ LIST_REMOVE(m, mtx_held);
+ m->mtx_held.le_prev = NULL;
+}
+
+void
+witness_try_enter(mtx_t *m, int flags, char *file, int line)
+{
+ struct proc *p;
+ witness_t *w = m->mtx_witness;
+
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_try_enter: "
+ "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ PCPU_SET(witness_spin_check, witness_spin_check | w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+
+ if (w->w_spin)
+ panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+
+ w->w_file = file;
+ w->w_line = line;
+ m->mtx_line = line;
+ m->mtx_file = file;
+ p = CURPROC;
+ ASS(m->mtx_held.le_prev == NULL);
+ LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
+}
+
+void
+witness_display(void(*prnt)(const char *fmt, ...))
+{
+ witness_t *w, *w1;
+
+ witness_levelall();
+
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_file == NULL)
+ continue;
+ for (w1 = w_all; w1; w1 = w1->w_next) {
+ if (isitmychild(w1, w))
+ break;
+ }
+ if (w1 != NULL)
+ continue;
+ /*
+ * This lock has no ancestors, display its descendants.
+ */
+ witness_displaydescendants(prnt, w);
+ }
+ prnt("\nMutex which were never acquired\n");
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_file != NULL)
+ continue;
+ prnt("%s\n", w->w_description);
+ }
+}
+
+int
+witness_sleep(int check_only, mtx_t *mtx, char *file, int line)
+{
+ mtx_t *m;
+ struct proc *p;
+ char **sleep;
+ int n = 0;
+
+ p = CURPROC;
+ for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
+ m = LIST_NEXT(m, mtx_held)) {
+ if (m == mtx)
+ continue;
+ for (sleep = sleep_list; *sleep!= NULL; sleep++)
+ if (strcmp(m->mtx_description, *sleep) == 0)
+ goto next;
+ printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
+ file, line, check_only ? "could sleep" : "sleeping",
+ m->mtx_description,
+ m->mtx_witness->w_file, m->mtx_witness->w_line);
+ n++;
+ next:
+ }
+#ifdef KDEBUG
+ if (witness_kdebug && n)
+ kdebug();
+#endif /* KDEBUG */
+ return (n);
+}
+
+static witness_t *
+enroll(char *description, int flag)
+{
+ int i;
+ witness_t *w, *w1;
+ char **ignore;
+ char **order;
+
+ if (!witness_watch)
+ return (NULL);
+ for (ignore = ignore_list; *ignore != NULL; ignore++)
+ if (strcmp(description, *ignore) == 0)
+ return (NULL);
+
+ if (w_inited == 0) {
+ mtx_init(&w_mtx, "witness lock", MTX_DEF);
+ for (i = 0; i < WITNESS_COUNT; i++) {
+ w = &w_data[i];
+ witness_free(w);
+ }
+ w_inited = 1;
+ for (order = order_list; *order != NULL; order++) {
+ w = enroll(*order, MTX_DEF);
+ w->w_file = "order list";
+ for (order++; *order != NULL; order++) {
+ w1 = enroll(*order, MTX_DEF);
+ w1->w_file = "order list";
+ itismychild(w, w1);
+ w = w1;
+ }
+ }
+ }
+ if ((flag & MTX_SPIN) && witness_skipspin)
+ return (NULL);
+ mtx_enter(&w_mtx, MTX_SPIN);
+ for (w = w_all; w; w = w->w_next) {
+ if (strcmp(description, w->w_description) == 0) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return (w);
+ }
+ }
+ if ((w = witness_get()) == NULL)
+ return (NULL);
+ w->w_next = w_all;
+ w_all = w;
+ w->w_description = description;
+ mtx_exit(&w_mtx, MTX_SPIN);
+ if (flag & MTX_SPIN) {
+ w->w_spin = 1;
+
+ i = 1;
+ for (order = spin_order_list; *order != NULL; order++) {
+ if (strcmp(description, *order) == 0)
+ break;
+ i <<= 1;
+ }
+ if (*order == NULL)
+ panic("spin lock %s not in order list", description);
+ w->w_level = i;
+ }
+ return (w);
+}
+
+static int
+itismychild(witness_t *parent, witness_t *child)
+{
+ static int recursed;
+
+ /*
+ * Insert "child" after "parent"
+ */
+ while (parent->w_morechildren)
+ parent = parent->w_morechildren;
+
+ if (parent->w_childcnt == WITNESS_NCHILDREN) {
+ if ((parent->w_morechildren = witness_get()) == NULL)
+ return (1);
+ parent = parent->w_morechildren;
+ }
+ ASS(child != NULL);
+ parent->w_children[parent->w_childcnt++] = child;
+ /*
+ * now prune whole tree
+ */
+ if (recursed)
+ return (0);
+ recursed = 1;
+ for (child = w_all; child != NULL; child = child->w_next) {
+ for (parent = w_all; parent != NULL;
+ parent = parent->w_next) {
+ if (!isitmychild(parent, child))
+ continue;
+ removechild(parent, child);
+ if (isitmydescendant(parent, child))
+ continue;
+ itismychild(parent, child);
+ }
+ }
+ recursed = 0;
+ witness_levelall();
+ return (0);
+}
+
+static void
+removechild(witness_t *parent, witness_t *child)
+{
+ witness_t *w, *w1;
+ int i;
+
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ if (w->w_children[i] == child)
+ goto found;
+ return;
+found:
+ for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
+ continue;
+ w->w_children[i] = w1->w_children[--w1->w_childcnt];
+ ASS(w->w_children[i] != NULL);
+
+ if (w1->w_childcnt != 0)
+ return;
+
+ if (w1 == parent)
+ return;
+ for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
+ continue;
+ w->w_morechildren = 0;
+ witness_free(w1);
+}
+
+static int
+isitmychild(witness_t *parent, witness_t *child)
+{
+ witness_t *w;
+ int i;
+
+ for (w = parent; w != NULL; w = w->w_morechildren) {
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (w->w_children[i] == child)
+ return (1);
+ }
+ }
+ return (0);
+}
+
+static int
+isitmydescendant(witness_t *parent, witness_t *child)
+{
+ witness_t *w;
+ int i;
+ int j;
+
+ for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
+ ASS(j < 1000);
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (w->w_children[i] == child)
+ return (1);
+ }
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (isitmydescendant(w->w_children[i], child))
+ return (1);
+ }
+ }
+ return (0);
+}
+
+void
+witness_levelall (void)
+{
+ witness_t *w, *w1;
+
+ for (w = w_all; w; w = w->w_next)
+ if (!w->w_spin)
+ w->w_level = 0;
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_spin)
+ continue;
+ for (w1 = w_all; w1; w1 = w1->w_next) {
+ if (isitmychild(w1, w))
+ break;
+ }
+ if (w1 != NULL)
+ continue;
+ witness_leveldescendents(w, 0);
+ }
+}
+
+static void
+witness_leveldescendents(witness_t *parent, int level)
+{
+ int i;
+ witness_t *w;
+
+ if (parent->w_level < level)
+ parent->w_level = level;
+ level++;
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ witness_leveldescendents(w->w_children[i], level);
+}
+
+static void
+witness_displaydescendants(void(*prnt)(const char *fmt, ...), witness_t *parent)
+{
+ witness_t *w;
+ int i;
+ int level = parent->w_level;
+
+ prnt("%d", level);
+ if (level < 10)
+ prnt(" ");
+ for (i = 0; i < level; i++)
+ prnt(" ");
+ prnt("%s", parent->w_description);
+ if (parent->w_file != NULL) {
+ prnt(" -- last acquired @ %s", parent->w_file);
+#ifndef W_USE_WHERE
+ prnt(":%d", parent->w_line);
+#endif
+ prnt("\n");
+ }
+
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ witness_displaydescendants(prnt, w->w_children[i]);
+ }
+
+static int
+dup_ok(witness_t *w)
+{
+ char **dup;
+
+ for (dup = dup_list; *dup!= NULL; dup++)
+ if (strcmp(w->w_description, *dup) == 0)
+ return (1);
+ return (0);
+}
+
+static int
+blessed(witness_t *w1, witness_t *w2)
+{
+ int i;
+ witness_blessed_t *b;
+
+ for (i = 0; i < blessed_count; i++) {
+ b = &blessed_list[i];
+ if (strcmp(w1->w_description, b->b_lock1) == 0) {
+ if (strcmp(w2->w_description, b->b_lock2) == 0)
+ return (1);
+ continue;
+ }
+ if (strcmp(w1->w_description, b->b_lock2) == 0)
+ if (strcmp(w2->w_description, b->b_lock1) == 0)
+ return (1);
+ }
+ return (0);
+}
+
+static witness_t *
+witness_get()
+{
+ witness_t *w;
+
+ if ((w = w_free) == NULL) {
+ witness_dead = 1;
+ mtx_exit(&w_mtx, MTX_SPIN);
+ printf("witness exhausted\n");
+ return (NULL);
+ }
+ w_free = w->w_next;
+ bzero(w, sizeof (*w));
+ return (w);
+}
+
+static void
+witness_free(witness_t *w)
+{
+ w->w_next = w_free;
+ w_free = w;
+}
+
+void
+witness_list(struct proc *p)
+{
+ mtx_t *m;
+
+ for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
+ m = LIST_NEXT(m, mtx_held)) {
+ printf("\t\"%s\" (%p) locked at %s:%d\n",
+ m->mtx_description, m,
+ m->mtx_witness->w_file, m->mtx_witness->w_line);
+ }
+}
+
+void
+witness_save(mtx_t *m, char **filep, int *linep)
+{
+ *filep = m->mtx_witness->w_file;
+ *linep = m->mtx_witness->w_line;
+}
+
+void
+witness_restore(mtx_t *m, char *file, int line)
+{
+ m->mtx_witness->w_file = file;
+ m->mtx_witness->w_line = line;
+}
+
+#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */
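A sketch (not from the commit) of how the witness code above catches an ordering bug: the first thread to nest two mutexes establishes a parent/child edge via itismychild(), and a later acquisition in the opposite order trips the isitmydescendant() check in witness_enter(). The two mutexes are hypothetical.

static mtx_t lock_a, lock_b;	/* hypothetical */

static void
order_example(void)
{
	mtx_init(&lock_a, "lock a", MTX_DEF);
	mtx_init(&lock_b, "lock b", MTX_DEF);

	/* First nested acquisition: "lock b" is recorded as a child of
	 * "lock a" in the witness graph. */
	mtx_enter(&lock_a, MTX_DEF);
	mtx_enter(&lock_b, MTX_DEF);
	mtx_exit(&lock_b, MTX_DEF);
	mtx_exit(&lock_a, MTX_DEF);

	/* Acquiring in the opposite order now makes isitmydescendant()
	 * fire, and witness_enter() prints a "lock order reversal". */
	mtx_enter(&lock_b, MTX_DEF);
	mtx_enter(&lock_a, MTX_DEF);
	mtx_exit(&lock_a, MTX_DEF);
	mtx_exit(&lock_b, MTX_DEF);
}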
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 7ec2628..4800747 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -73,6 +73,7 @@ u_long pgrphash;
struct proclist allproc;
struct proclist zombproc;
vm_zone_t proc_zone;
+vm_zone_t ithread_zone;
/*
* Initialize global process hashing structures.
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index f2a8fa6..3344f7e 100644
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -530,7 +530,7 @@ calcru(p, up, sp, ip)
microuptime(&tv);
if (timevalcmp(&tv, &switchtime, <))
printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
- switchtime.tv_sec, switchtime.tv_usec,
+ switchtime.tv_sec, switchtime.tv_usec,
tv.tv_sec, tv.tv_usec);
else
tu += (tv.tv_usec - switchtime.tv_usec) +
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
index 9c744c7..8a6ccd8 100644
--- a/sys/kern/kern_shutdown.c
+++ b/sys/kern/kern_shutdown.c
@@ -63,6 +63,7 @@
#include <machine/pcb.h>
#include <machine/clock.h>
+#include <machine/lock.h>
#include <machine/md_var.h>
#include <machine/smp.h> /* smp_active, cpuid */
@@ -524,6 +525,11 @@ panic(const char *fmt, ...)
va_list ap;
static char buf[256];
+#ifdef SMP
+ /* Only 1 CPU can panic at a time */
+ s_lock(&panic_lock);
+#endif
+
bootopt = RB_AUTOBOOT | RB_DUMP;
if (panicstr)
bootopt |= RB_NOSYNC;
@@ -537,8 +543,7 @@ panic(const char *fmt, ...)
va_end(ap);
printf("panic: %s\n", buf);
#ifdef SMP
- /* three seperate prints in case of an unmapped page and trap */
- printf("mp_lock = %08x; ", mp_lock);
+ /* two separate prints in case of an unmapped page and trap */
printf("cpuid = %d; ", cpuid);
printf("lapic.id = %08x\n", lapic.id);
#endif
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index a2ff2ef..a39a4c8 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -56,6 +56,7 @@
#include <sys/acct.h>
#include <sys/fcntl.h>
#include <sys/wait.h>
+#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/syslog.h>
#include <sys/stat.h>
@@ -1465,6 +1466,8 @@ killproc(p, why)
struct proc *p;
char *why;
{
+ CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
+ p, p->p_pid, p->p_comm);
log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why);
psignal(p, SIGKILL);
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
index c0f7f64..d9a599a 100644
--- a/sys/kern/kern_subr.c
+++ b/sys/kern/kern_subr.c
@@ -42,6 +42,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/lock.h>
@@ -52,6 +53,8 @@
#include <vm/vm_page.h>
#include <vm/vm_map.h>
+#include <machine/mutex.h>
+
static void uio_yield __P((void));
int
@@ -421,10 +424,12 @@ uio_yield()
int s;
p = curproc;
- p->p_priority = p->p_usrpri;
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
+ p->p_priority = p->p_usrpri;
setrunqueue(p);
p->p_stats->p_ru.ru_nivcsw++;
mi_switch();
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
}
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
index 3146f9e..8f47dba 100644
--- a/sys/kern/kern_switch.c
+++ b/sys/kern/kern_switch.c
@@ -29,27 +29,39 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
+#include <machine/mutex.h>
+
/*
* We have NQS (32) run queues per scheduling class. For the normal
* class, there are 128 priorities scaled onto these 32 queues. New
* processes are added to the last entry in each queue, and processes
* are selected for running by taking them from the head and maintaining
- * a simple FIFO arrangement. Realtime and Idle priority processes have
- * and explicit 0-31 priority which maps directly onto their class queue
- * index. When a queue has something in it, the corresponding bit is
- * set in the queuebits variable, allowing a single read to determine
- * the state of all 32 queues and then a ffs() to find the first busy
+ * a simple FIFO arrangement.
+ *
+ * Interrupt, real time and idle priority processes have an explicit
+ * 0-31 priority which maps directly onto their class queue index.
+ * When a queue has something in it, the corresponding bit is set in
+ * the queuebits variable, allowing a single read to determine the
+ * state of all 32 queues and then a ffs() to find the first busy
* queue.
+ *
+ * XXX This needs fixing. First, we only have one idle process, so we
+ * hardly need 32 queues for it. Secondly, the number of classes
+ * makes things unwieldy. We should be able to merge them into a
+ * single 96 or 128 entry queue.
*/
-struct rq queues[NQS];
-struct rq rtqueues[NQS];
-struct rq idqueues[NQS];
-u_int32_t queuebits;
+struct rq itqueues[NQS]; /* interrupt threads */
+struct rq rtqueues[NQS]; /* real time processes */
+struct rq queues[NQS]; /* time sharing processes */
+struct rq idqueues[NQS]; /* idle process */
+u_int32_t itqueuebits;
u_int32_t rtqueuebits;
+u_int32_t queuebits;
u_int32_t idqueuebits;
/*
@@ -61,8 +73,9 @@ rqinit(void *dummy)
int i;
for (i = 0; i < NQS; i++) {
- TAILQ_INIT(&queues[i]);
+ TAILQ_INIT(&itqueues[i]);
TAILQ_INIT(&rtqueues[i]);
+ TAILQ_INIT(&queues[i]);
TAILQ_INIT(&idqueues[i]);
}
}
@@ -81,22 +94,37 @@ setrunqueue(struct proc *p)
struct rq *q;
u_int8_t pri;
- KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
- if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
- pri = p->p_priority >> 2;
- q = &queues[pri];
- queuebits |= 1 << pri;
- } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(p->p_stat == SRUN, ("setrunqueue: proc %p (%s) not SRUN", p, \
+ p->p_comm));
+
+ /*
+ * Decide which class we want to run. We now have four
+ * queues, and this is becoming ugly. We should be able to
+ * collapse the first three classes into a single contiguous
+ * queue. XXX FIXME.
+ */
+ CTR4(KTR_PROC, "setrunqueue: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
+ if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { /* interrupt thread */
+ pri = p->p_rtprio.prio;
+ q = &itqueues[pri];
+ itqueuebits |= 1 << pri;
+ } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || /* real time */
p->p_rtprio.type == RTP_PRIO_FIFO) {
pri = p->p_rtprio.prio;
q = &rtqueues[pri];
rtqueuebits |= 1 << pri;
- } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
+ } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { /* time sharing */
+ pri = p->p_priority >> 2;
+ q = &queues[pri];
+ queuebits |= 1 << pri;
+ } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { /* idle proc */
pri = p->p_rtprio.prio;
q = &idqueues[pri];
idqueuebits |= 1 << pri;
} else {
- panic("setrunqueue: invalid rtprio type");
+ panic("setrunqueue: invalid rtprio type %d", p->p_rtprio.type);
}
p->p_rqindex = pri; /* remember the queue index */
TAILQ_INSERT_TAIL(q, p, p_procq);
@@ -114,14 +142,20 @@ remrunqueue(struct proc *p)
u_int32_t *which;
u_int8_t pri;
+ CTR4(KTR_PROC, "remrunqueue: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
+ mtx_assert(&sched_lock, MA_OWNED);
pri = p->p_rqindex;
- if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
- q = &queues[pri];
- which = &queuebits;
+ if (p->p_rtprio.type == RTP_PRIO_ITHREAD) {
+ q = &itqueues[pri];
+ which = &itqueuebits;
} else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
p->p_rtprio.type == RTP_PRIO_FIFO) {
q = &rtqueues[pri];
which = &rtqueuebits;
+ } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
+ q = &queues[pri];
+ which = &queuebits;
} else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
q = &idqueues[pri];
which = &idqueuebits;
@@ -142,11 +176,17 @@ remrunqueue(struct proc *p)
* loop to avoid the more expensive (and destructive) chooseproc().
*
* MP SAFE. CALLED WITHOUT THE MP LOCK
+ *
+ * XXX I doubt this. It's possibly fail-safe, but there's obviously
+ * the case here where one of the bits words gets loaded, the
+ * processor gets preempted, and by the time it returns from this
+ * function, some other processor has picked the runnable process.
+ * What am I missing? (grog, 23 July 2000).
*/
u_int32_t
procrunnable(void)
{
- return (rtqueuebits || queuebits || idqueuebits);
+ return (itqueuebits || rtqueuebits || queuebits || idqueuebits);
}
/*
@@ -173,7 +213,12 @@ chooseproc(void)
u_char id;
#endif
- if (rtqueuebits) {
+ mtx_assert(&sched_lock, MA_OWNED);
+ if (itqueuebits) {
+ pri = ffs(itqueuebits) - 1;
+ q = &itqueues[pri];
+ which = &itqueuebits;
+ } else if (rtqueuebits) {
pri = ffs(rtqueuebits) - 1;
q = &rtqueues[pri];
which = &rtqueuebits;
@@ -186,10 +231,12 @@ chooseproc(void)
q = &idqueues[pri];
which = &idqueuebits;
} else {
- return NULL;
+ CTR1(KTR_PROC, "chooseproc: idleproc, schedlock %x",
+ sched_lock.mtx_lock);
+ idleproc->p_stat = SRUN;
+ return idleproc;
}
p = TAILQ_FIRST(q);
- KASSERT(p, ("chooseproc: no proc on busy queue"));
#ifdef SMP
/* wander down the current run queue for this pri level for a match */
id = cpuid;
@@ -201,6 +248,9 @@ chooseproc(void)
}
}
#endif
+ CTR4(KTR_PROC, "chooseproc: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
+ KASSERT(p, ("chooseproc: no proc on busy queue"));
TAILQ_REMOVE(q, p, p_procq);
if (TAILQ_EMPTY(q))
*which &= ~(1 << pri);
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index f747759..f397f40 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -45,6 +45,7 @@
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
+#include <sys/ktr.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
@@ -59,6 +60,7 @@
#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/smp.h>
+#include <machine/mutex.h>
static void sched_setup __P((void *dummy));
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
@@ -135,7 +137,7 @@ maybe_resched(chk)
* standard process becomes runaway cpu-bound, the system can lockup
* due to idle-scheduler processes in wakeup never getting any cpu.
*/
- if (p == NULL) {
+ if (p == idleproc) {
#if 0
need_resched();
#endif
@@ -169,7 +171,7 @@ roundrobin(arg)
need_resched();
forward_roundrobin();
#else
- if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type))
+ if (p == idleproc || RTP_PRIO_NEED_RR(p->p_rtprio.type))
need_resched();
#endif
@@ -284,6 +286,8 @@ schedcpu(arg)
* Increment time in/out of memory and sleep time
* (if sleeping). We ignore overflow; with 16-bit int's
* (remember them?) overflow takes 45 days.
+ if (p->p_stat == SWAIT)
+ continue;
*/
p->p_swtime++;
if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
@@ -295,7 +299,12 @@ schedcpu(arg)
*/
if (p->p_slptime > 1)
continue;
- s = splhigh(); /* prevent state changes and protect run queue */
+ /*
+ * prevent state changes and protect run queue
+ */
+ s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
+
/*
* p_pctcpu is only for ps.
*/
@@ -325,6 +334,7 @@ schedcpu(arg)
} else
p->p_priority = p->p_usrpri;
}
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
}
vmmeter();
@@ -364,6 +374,7 @@ updatepri(p)
static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1))
+#if 0
/*
* During autoconfiguration or after a panic, a sleep will simply
* lower the priority briefly to allow interrupts, then return.
@@ -374,6 +385,7 @@ static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
* higher to block network software interrupts after panics.
*/
int safepri;
+#endif
void
sleepinit(void)
@@ -406,11 +418,15 @@ tsleep(ident, priority, wmesg, timo)
struct proc *p = curproc;
int s, sig, catch = priority & PCATCH;
struct callout_handle thandle;
+ int rval = 0;
#ifdef KTRACE
if (p && KTRPOINT(p, KTR_CSW))
ktrcsw(p->p_tracep, 1, 0);
#endif
+ mtx_assert(&Giant, MA_OWNED);
+ mtx_enter(&sched_lock, MTX_SPIN);
+
s = splhigh();
if (cold || panicstr) {
/*
@@ -419,10 +435,14 @@ tsleep(ident, priority, wmesg, timo)
* don't run any other procs or panic below,
* in case this is the idle process and already asleep.
*/
+ mtx_exit(&sched_lock, MTX_SPIN);
+#if 0
splx(safepri);
+#endif
splx(s);
return (0);
}
+
KASSERT(p != NULL, ("tsleep1"));
KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep"));
/*
@@ -436,6 +456,9 @@ tsleep(ident, priority, wmesg, timo)
p->p_wmesg = wmesg;
p->p_slptime = 0;
p->p_priority = priority & PRIMASK;
+ p->p_nativepri = p->p_priority;
+ CTR4(KTR_PROC, "tsleep: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
if (timo)
thandle = timeout(endtsleep, (void *)p, timo);
@@ -449,6 +472,9 @@ tsleep(ident, priority, wmesg, timo)
* stopped, p->p_wchan will be 0 upon return from CURSIG.
*/
if (catch) {
+ CTR4(KTR_PROC,
+ "tsleep caught: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
p->p_flag |= P_SINTR;
if ((sig = CURSIG(p))) {
if (p->p_wchan)
@@ -465,6 +491,9 @@ tsleep(ident, priority, wmesg, timo)
p->p_stat = SSLEEP;
p->p_stats->p_ru.ru_nvcsw++;
mi_switch();
+ CTR4(KTR_PROC,
+ "tsleep resume: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
resume:
curpriority = p->p_usrpri;
splx(s);
@@ -476,7 +505,8 @@ resume:
if (KTRPOINT(p, KTR_CSW))
ktrcsw(p->p_tracep, 0, 0);
#endif
- return (EWOULDBLOCK);
+ rval = EWOULDBLOCK;
+ goto out;
}
} else if (timo)
untimeout(endtsleep, (void *)p, thandle);
@@ -486,14 +516,19 @@ resume:
ktrcsw(p->p_tracep, 0, 0);
#endif
if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
- return (EINTR);
- return (ERESTART);
+ rval = EINTR;
+ else
+ rval = ERESTART;
+ goto out;
}
+out:
+ mtx_exit(&sched_lock, MTX_SPIN);
#ifdef KTRACE
if (KTRPOINT(p, KTR_CSW))
ktrcsw(p->p_tracep, 0, 0);
#endif
- return (0);
+
+ return (rval);
}
/*
@@ -519,13 +554,14 @@ asleep(void *ident, int priority, const char *wmesg, int timo)
int s;
/*
- * splhigh() while manipulating sleep structures and slpque.
+ * obtain sched_lock while manipulating sleep structures and slpque.
*
* Remove preexisting wait condition (if any) and place process
* on appropriate slpque, but do not put process to sleep.
*/
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
if (p->p_wchan != NULL)
unsleep(p);
@@ -539,6 +575,7 @@ asleep(void *ident, int priority, const char *wmesg, int timo)
TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
}
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
return(0);
@@ -560,8 +597,12 @@ int
await(int priority, int timo)
{
struct proc *p = curproc;
+ int rval = 0;
int s;
+ mtx_assert(&Giant, MA_OWNED);
+ mtx_enter(&sched_lock, MTX_SPIN);
+
s = splhigh();
if (p->p_wchan != NULL) {
@@ -616,7 +657,8 @@ resume:
if (KTRPOINT(p, KTR_CSW))
ktrcsw(p->p_tracep, 0, 0);
#endif
- return (EWOULDBLOCK);
+ rval = EWOULDBLOCK;
+ goto out;
}
} else if (timo)
untimeout(endtsleep, (void *)p, thandle);
@@ -626,8 +668,10 @@ resume:
ktrcsw(p->p_tracep, 0, 0);
#endif
if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
- return (EINTR);
- return (ERESTART);
+ rval = EINTR;
+ else
+ rval = ERESTART;
+ goto out;
}
#ifdef KTRACE
if (KTRPOINT(p, KTR_CSW))
@@ -655,7 +699,10 @@ resume:
*/
p->p_asleep.as_priority = 0;
- return (0);
+out:
+ mtx_exit(&sched_lock, MTX_SPIN);
+
+ return (rval);
}
/*
@@ -673,7 +720,11 @@ endtsleep(arg)
int s;
p = (struct proc *)arg;
+ CTR4(KTR_PROC,
+ "endtsleep: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
if (p->p_wchan) {
if (p->p_stat == SSLEEP)
setrunnable(p);
@@ -681,6 +732,7 @@ endtsleep(arg)
unsleep(p);
p->p_flag |= P_TIMEOUT;
}
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
}
@@ -694,10 +746,12 @@ unsleep(p)
int s;
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
if (p->p_wchan) {
TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
p->p_wchan = 0;
}
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
}
@@ -713,6 +767,7 @@ wakeup(ident)
int s;
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
qp = &slpque[LOOKUP(ident)];
restart:
TAILQ_FOREACH(p, qp, p_procq) {
@@ -721,6 +776,9 @@ restart:
p->p_wchan = 0;
if (p->p_stat == SSLEEP) {
/* OPTIMIZED EXPANSION OF setrunnable(p); */
+ CTR4(KTR_PROC,
+ "wakeup: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
if (p->p_slptime > 1)
updatepri(p);
p->p_slptime = 0;
@@ -737,6 +795,7 @@ restart:
}
}
}
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
}
@@ -754,6 +813,7 @@ wakeup_one(ident)
int s;
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
qp = &slpque[LOOKUP(ident)];
TAILQ_FOREACH(p, qp, p_procq) {
@@ -762,6 +822,9 @@ wakeup_one(ident)
p->p_wchan = 0;
if (p->p_stat == SSLEEP) {
/* OPTIMIZED EXPANSION OF setrunnable(p); */
+ CTR4(KTR_PROC,
+ "wakeup1: proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
if (p->p_slptime > 1)
updatepri(p);
p->p_slptime = 0;
@@ -778,6 +841,7 @@ wakeup_one(ident)
}
}
}
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
}
@@ -791,7 +855,9 @@ mi_switch()
struct timeval new_switchtime;
register struct proc *p = curproc; /* XXX */
register struct rlimit *rlim;
+ int giantreleased;
int x;
+ WITNESS_SAVE_DECL(Giant);
/*
* XXX this spl is almost unnecessary. It is partly to allow for
@@ -812,6 +878,14 @@ mi_switch()
*/
x = splstatclock();
+ CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
+ mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY);
+
+ WITNESS_SAVE(&Giant, Giant);
+ for (giantreleased = 0; mtx_owned(&Giant); giantreleased++)
+ mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH);
+
#ifdef SIMPLELOCK_DEBUG
if (p->p_simple_locks)
printf("sleep: holding simple lock\n");
@@ -823,7 +897,7 @@ mi_switch()
microuptime(&new_switchtime);
if (timevalcmp(&new_switchtime, &switchtime, <)) {
printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
- switchtime.tv_sec, switchtime.tv_usec,
+ switchtime.tv_sec, switchtime.tv_usec,
new_switchtime.tv_sec, new_switchtime.tv_usec);
new_switchtime = switchtime;
} else {
@@ -834,6 +908,8 @@ mi_switch()
/*
* Check if the process exceeds its cpu resource allocation.
* If over max, kill it.
+ *
+ * XXX drop sched_lock, pickup Giant
*/
if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
p->p_runtime > p->p_limit->p_cpulimit) {
@@ -854,10 +930,18 @@ mi_switch()
*/
cnt.v_swtch++;
switchtime = new_switchtime;
- cpu_switch(p);
+ CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
+ cpu_switch();
+ CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %x",
+ p, p->p_pid, p->p_comm, sched_lock.mtx_lock);
if (switchtime.tv_sec == 0)
microuptime(&switchtime);
switchticks = ticks;
+ mtx_exit(&sched_lock, MTX_SPIN);
+ while (giantreleased--)
+ mtx_enter(&Giant, MTX_DEF);
+ WITNESS_RESTORE(&Giant, Giant);
splx(x);
}
@@ -874,10 +958,12 @@ setrunnable(p)
register int s;
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
switch (p->p_stat) {
case 0:
case SRUN:
case SZOMB:
+ case SWAIT:
default:
panic("setrunnable");
case SSTOP:
@@ -891,6 +977,7 @@ setrunnable(p)
p->p_stat = SRUN;
if (p->p_flag & P_INMEM)
setrunqueue(p);
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
if (p->p_slptime > 1)
updatepri(p);
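The Giant handoff that mi_switch() now performs, sketched in isolation: Giant may be held recursively, so it is released completely (without switching) before the context switch and re-entered to the same depth afterwards. This is a sketch only; the sched_lock and WITNESS_SAVE/WITNESS_RESTORE handling around it is trimmed.

static void
giant_handoff_sketch(void)
{
	int giantreleased;

	/* Drop Giant entirely; MTX_NOSWITCH defers the switch itself. */
	for (giantreleased = 0; mtx_owned(&Giant); giantreleased++)
		mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH);

	cpu_switch();	/* run another process; we resume here later */

	/* Re-acquire Giant to the recursion depth held before. */
	while (giantreleased--)
		mtx_enter(&Giant, MTX_DEF);
}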
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
index b8d5833..1128c2e 100644
--- a/sys/kern/kern_tc.c
+++ b/sys/kern/kern_tc.c
@@ -24,7 +24,7 @@
* Number of timecounters used to implement stable storage
*/
#ifndef NTIMECOUNTER
-#define NTIMECOUNTER 5
+#define NTIMECOUNTER 45
#endif
static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter",
@@ -148,6 +148,13 @@ nanotime(struct timespec *ts)
nnanotime++;
tc = timecounter;
+#ifdef KTR
+ if (tc == NULL) { /* called before initialization */
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+ return;
+ }
+#endif
ts->tv_sec = tc->tc_offset_sec;
count = tco_delta(tc);
delta = tc->tc_offset_nano;
diff --git a/sys/kern/kern_threads.c b/sys/kern/kern_threads.c
index 3531e2c..ba2b4bf 100644
--- a/sys/kern/kern_threads.c
+++ b/sys/kern/kern_threads.c
@@ -52,10 +52,13 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysproto.h>
+#include <machine/mutex.h>
+
/*
* Low level support for sleep/wakeup paradigm
* If a timeout is specified:
@@ -145,10 +148,12 @@ yield(struct proc *p, struct yield_args *uap) {
p->p_retval[0] = 0;
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
p->p_priority = MAXPRI;
setrunqueue(p);
p->p_stats->p_ru.ru_nvcsw++;
mi_switch();
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
return(0);
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
index 3794ccf..a989152 100644
--- a/sys/kern/subr_prf.c
+++ b/sys/kern/subr_prf.c
@@ -110,7 +110,8 @@ uprintf(const char *fmt, ...)
struct putchar_arg pca;
int retval = 0;
- if (p && p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
+ if (p && p != idleproc && p->p_flag & P_CONTROLT &&
+ p->p_session->s_ttyvp) {
va_start(ap, fmt);
pca.tty = p->p_session->s_ttyp;
pca.flags = TOTTY;
diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c
index 4fa5223..294c649 100644
--- a/sys/kern/subr_prof.c
+++ b/sys/kern/subr_prof.c
@@ -93,6 +93,7 @@ kmstartup(dummy)
int nullfunc_loop_profiled_time;
uintfptr_t tmp_addr;
#endif
+ int intrstate;
/*
* Round lowpc and highpc to multiples of the density we're using
@@ -135,6 +136,7 @@ kmstartup(dummy)
* Disable interrupts to avoid interference while we calibrate
* things.
*/
+ intrstate = save_intr();
disable_intr();
/*
@@ -189,7 +191,7 @@ kmstartup(dummy)
p->state = GMON_PROF_OFF;
stopguprof(p);
- enable_intr();
+ restore_intr(intrstate);
nullfunc_loop_profiled_time = 0;
for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled;
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 61c5ecf..95b5759 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -36,6 +36,7 @@
#endif
#include <sys/param.h>
+#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
@@ -65,6 +66,7 @@
#include <machine/apic.h>
#include <machine/atomic.h>
#include <machine/cpufunc.h>
+#include <machine/mutex.h>
#include <machine/mpapic.h>
#include <machine/psl.h>
#include <machine/segments.h>
@@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY {
#define MP_ANNOUNCE_POST 0x19
+/* used to hold the AP's until we are ready to release them */
+struct simplelock ap_boot_lock;
/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
int current_postcode;
@@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr);
static void install_ap_tramp(u_int boot_addr);
static int start_ap(int logicalCpu, u_int boot_addr);
static int apic_int_is_bus_type(int intr, int bus_type);
+static void release_aps(void *dummy);
/*
* Calculate usable address in base memory for AP trampoline code.
@@ -403,7 +408,7 @@ found:
/*
- * Startup the SMP processors.
+ * Initialize the SMP hardware and the APIC and start up the AP's.
*/
void
mp_start(void)
@@ -619,6 +624,9 @@ mp_enable(u_int boot_addr)
/* initialize all SMP locks */
init_locks();
+ /* obtain the ap_boot_lock */
+ s_lock(&ap_boot_lock);
+
/* start each Application Processor */
start_all_aps(boot_addr);
}
@@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock;
/* critical region around INTR() routines */
struct simplelock intr_lock;
-/* lock regions protected in UP kernel via cli/sti */
-struct simplelock mpintr_lock;
-
/* lock region used by kernel profiling */
struct simplelock mcount_lock;
@@ -1885,26 +1890,16 @@ struct simplelock clock_lock;
/* lock around the MP rendezvous */
static struct simplelock smp_rv_lock;
+/* only 1 CPU can panic at a time :) */
+struct simplelock panic_lock;
+
static void
init_locks(void)
{
- /*
- * Get the initial mp_lock with a count of 1 for the BSP.
- * This uses a LOGICAL cpu ID, ie BSP == 0.
- */
- mp_lock = 0x00000001;
-
-#if 0
- /* ISR uses its own "giant lock" */
- isr_lock = FREE_LOCK;
-#endif
-
#if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ)
s_lock_init((struct simplelock*)&apic_itrace_debuglock);
#endif
- s_lock_init((struct simplelock*)&mpintr_lock);
-
s_lock_init((struct simplelock*)&mcount_lock);
s_lock_init((struct simplelock*)&fast_intr_lock);
@@ -1912,6 +1907,7 @@ init_locks(void)
s_lock_init((struct simplelock*)&imen_lock);
s_lock_init((struct simplelock*)&cpl_lock);
s_lock_init(&smp_rv_lock);
+ s_lock_init(&panic_lock);
#ifdef USE_COMLOCK
s_lock_init((struct simplelock*)&com_lock);
@@ -1919,11 +1915,9 @@ init_locks(void)
#ifdef USE_CLOCKLOCK
s_lock_init((struct simplelock*)&clock_lock);
#endif /* USE_CLOCKLOCK */
-}
-
-/* Wait for all APs to be fully initialized */
-extern int wait_ap(unsigned int);
+ s_lock_init(&ap_boot_lock);
+}
/*
* start each AP in our list
@@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr)
SMPpt[pg + 4] = 0; /* *prv_PMAP1 */
/* prime data page for it to use */
+ SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu);
gd->gd_cpuid = x;
gd->gd_cpu_lockid = x << 24;
gd->gd_prv_CMAP1 = &SMPpt[pg + 1];
@@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr)
return 0; /* return FAILURE */
}
-
/*
* Flush the TLB on all other CPU's
*
@@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW,
void ap_init(void);
void
-ap_init()
+ap_init(void)
{
u_int apic_id;
+ /* lock against other AP's that are waking up */
+ s_lock(&ap_boot_lock);
+
/* BSP may have changed PTD while we're waiting for the lock */
cpu_invltlb();
@@ -2397,6 +2394,30 @@ ap_init()
smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */
smp_active = 1; /* historic */
}
+
+ /* let other AP's wake up now */
+ s_unlock(&ap_boot_lock);
+
+ /* wait until all the AP's are up */
+ while (smp_started == 0)
+ ; /* nothing */
+
+ /*
+ * Set curproc to our per-cpu idleproc so that mutexes have
+ * something unique to lock with.
+ */
+	PCPU_SET(curproc, idleproc);
+	PCPU_SET(prevproc, idleproc);
+
+ microuptime(&switchtime);
+ switchticks = ticks;
+
+ /* ok, now grab sched_lock and enter the scheduler */
+ enable_intr();
+ mtx_enter(&sched_lock, MTX_SPIN);
+ cpu_throw(); /* doesn't return */
+
+ panic("scheduler returned us to ap_init");
}
#ifdef BETTER_CLOCK
@@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap)
p = checkstate_curproc[id];
cpustate = checkstate_cpustate[id];
+ /* XXX */
+ if (p->p_ithd)
+ cpustate = CHECKSTATE_INTR;
+ else if (p == idleproc)
+ cpustate = CHECKSTATE_SYS;
+
switch (cpustate) {
case CHECKSTATE_USER:
if (p->p_flag & P_PROFIL)
@@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap)
if (pscnt > 1)
return;
- if (!p)
+ if (p == idleproc) {
+ p->p_sticks++;
cp_time[CP_IDLE]++;
- else {
+ } else {
p->p_sticks++;
cp_time[CP_SYS]++;
}
@@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap)
p->p_iticks++;
cp_time[CP_INTR]++;
}
- if (p != NULL) {
+ if (p != idleproc) {
schedclock(p);
/* Update resource usage integrals and maximums. */
@@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *),
/* release lock */
s_unlock(&smp_rv_lock);
}
+
+void
+release_aps(void *dummy __unused)
+{
+ s_unlock(&ap_boot_lock);
+}
+
+SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
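
The AP bootstrap is gated by a single simplelock: the BSP acquires ap_boot_lock in mp_enable() before starting any AP, each AP blocks on it at the top of ap_init() and releases it once its own setup is done, and the release_aps() SYSINIT at SI_SUB_SMP performs the first unlock when the kernel is ready, so the APs initialize one at a time. A reduced sketch of the gate, assuming a spinning s_lock()/s_unlock() as in the diff:

    /* Minimal stand-in for the kernel's simplelock. */
    struct simplelock { volatile int locked; };
    void s_lock_init(struct simplelock *);
    void s_lock(struct simplelock *);       /* spins until acquired */
    void s_unlock(struct simplelock *);

    static struct simplelock ap_boot_lock;

    void
    bsp_sketch(void)
    {
            s_lock_init(&ap_boot_lock);
            s_lock(&ap_boot_lock);          /* hold the APs at the gate */
            /* ... start_all_aps(); a SYSINIT later does the first unlock ... */
    }

    void
    ap_init_sketch(void)
    {
            s_lock(&ap_boot_lock);          /* wait for release; serialize APs */
            /* ... per-CPU setup ... */
            s_unlock(&ap_boot_lock);        /* admit the next AP */
    }
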
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
index 51de1ac..f32dfae 100644
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -49,10 +49,12 @@
#include "opt_trap.h"
#include <sys/param.h>
+#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/kernel.h>
+#include <sys/ktr.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
@@ -76,12 +78,14 @@
#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/md_var.h>
+#include <machine/mutex.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>
+#include <i386/isa/icu.h>
#include <i386/isa/intr_machdep.h>
#ifdef POWERFAIL_NMI
@@ -96,11 +100,14 @@
#include "isa.h"
#include "npx.h"
+#include <sys/sysctl.h>
+
int (*pmath_emulate) __P((struct trapframe *));
extern void trap __P((struct trapframe frame));
extern int trapwrite __P((unsigned addr));
extern void syscall2 __P((struct trapframe frame));
+extern void ast __P((struct trapframe frame));
static int trap_pfault __P((struct trapframe *, int, vm_offset_t));
static void trap_fatal __P((struct trapframe *, vm_offset_t));
@@ -142,7 +149,7 @@ static char *trap_msg[] = {
};
static __inline int userret __P((struct proc *p, struct trapframe *frame,
- u_quad_t oticks, int have_mplock));
+ u_quad_t oticks, int have_giant));
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
@@ -158,18 +165,18 @@ SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
&panic_on_nmi, 0, "Panic on NMI");
static __inline int
-userret(p, frame, oticks, have_mplock)
+userret(p, frame, oticks, have_giant)
struct proc *p;
struct trapframe *frame;
u_quad_t oticks;
- int have_mplock;
+ int have_giant;
{
int sig, s;
while ((sig = CURSIG(p)) != 0) {
- if (have_mplock == 0) {
- get_mplock();
- have_mplock = 1;
+ if (have_giant == 0) {
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
}
postsig(sig);
}
@@ -184,31 +191,34 @@ userret(p, frame, oticks, have_mplock)
* mi_switch()'ed, we might not be on the queue indicated by
* our priority.
*/
- if (have_mplock == 0) {
- get_mplock();
- have_mplock = 1;
- }
s = splhigh();
+ mtx_enter(&sched_lock, MTX_SPIN);
setrunqueue(p);
p->p_stats->p_ru.ru_nivcsw++;
mi_switch();
+ mtx_exit(&sched_lock, MTX_SPIN);
splx(s);
- while ((sig = CURSIG(p)) != 0)
+ while ((sig = CURSIG(p)) != 0) {
+ if (have_giant == 0) {
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
+ }
postsig(sig);
+ }
}
/*
* Charge system time if profiling.
*/
if (p->p_flag & P_PROFIL) {
- if (have_mplock == 0) {
- get_mplock();
- have_mplock = 1;
+ if (have_giant == 0) {
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
}
addupc_task(p, frame->tf_eip,
(u_int)(p->p_sticks - oticks) * psratio);
}
curpriority = p->p_priority;
- return(have_mplock);
+ return(have_giant);
}
/*
@@ -226,13 +236,20 @@ trap(frame)
u_quad_t sticks = 0;
int i = 0, ucode = 0, type, code;
vm_offset_t eva;
+#ifdef POWERFAIL_NMI
+ static int lastalert = 0;
+#endif
- if (!(frame.tf_eflags & PSL_I)) {
+ atomic_add_int(&cnt.v_trap, 1);
+
+ if ((frame.tf_eflags & PSL_I) == 0) {
/*
- * Buggy application or kernel code has disabled interrupts
- * and then trapped. Enabling interrupts now is wrong, but
- * it is better than running with interrupts disabled until
- * they are accidentally enabled later.
+ * Buggy application or kernel code has disabled
+ * interrupts and then trapped. Enabling interrupts
+ * now is wrong, but it is better than running with
+ * interrupts disabled until they are accidentally
+			 * enabled later.  XXX Consider whether this is
+			 * still correct.
*/
type = frame.tf_trapno;
if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM))
@@ -252,54 +269,27 @@ trap(frame)
eva = 0;
if (frame.tf_trapno == T_PAGEFLT) {
/*
- * For some Cyrix CPUs, %cr2 is clobbered by interrupts.
- * This problem is worked around by using an interrupt
- * gate for the pagefault handler. We are finally ready
- * to read %cr2 and then must reenable interrupts.
- *
- * XXX this should be in the switch statement, but the
- * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the
- * flow of control too much for this to be obviously
- * correct.
+ * For some Cyrix CPUs, %cr2 is clobbered by
+ * interrupts. This problem is worked around by using
+ * an interrupt gate for the pagefault handler. We
+ * are finally ready to read %cr2 and then must
+ * reenable interrupts.
*/
eva = rcr2();
enable_intr();
- }
+ }
+
+ mtx_enter(&Giant, MTX_DEF);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
restart:
#endif
+
type = frame.tf_trapno;
code = frame.tf_err;
- if (in_vm86call) {
- if (frame.tf_eflags & PSL_VM &&
- (type == T_PROTFLT || type == T_STKFLT)) {
- i = vm86_emulate((struct vm86frame *)&frame);
- if (i != 0)
- /*
- * returns to original process
- */
- vm86_trap((struct vm86frame *)&frame);
- return;
- }
- switch (type) {
- /*
- * these traps want either a process context, or
- * assume a normal userspace trap.
- */
- case T_PROTFLT:
- case T_SEGNPFLT:
- trap_fatal(&frame, eva);
- return;
- case T_TRCTRAP:
- type = T_BPTFLT; /* kernel breakpoint */
- /* FALL THROUGH */
- }
- goto kernel_trap; /* normal kernel trap handling */
- }
-
- if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) {
+ if ((ISPL(frame.tf_cs) == SEL_UPL) ||
+ ((frame.tf_eflags & PSL_VM) && !in_vm86call)) {
/* user trap */
sticks = p->p_sticks;
@@ -322,16 +312,6 @@ restart:
i = SIGFPE;
break;
- case T_ASTFLT: /* Allow process switch */
- astoff();
- cnt.v_soft++;
- if (p->p_flag & P_OWEUPC) {
- p->p_flag &= ~P_OWEUPC;
- addupc_task(p, p->p_stats->p_prof.pr_addr,
- p->p_stats->p_prof.pr_ticks);
- }
- goto out;
-
/*
* The following two traps can happen in
* vm86 mode, and, if so, we want to handle
@@ -342,7 +322,7 @@ restart:
if (frame.tf_eflags & PSL_VM) {
i = vm86_emulate((struct vm86frame *)&frame);
if (i == 0)
- goto out;
+ goto user;
break;
}
/* FALL THROUGH */
@@ -357,14 +337,20 @@ restart:
case T_PAGEFLT: /* page fault */
i = trap_pfault(&frame, TRUE, eva);
- if (i == -1)
- return;
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
- if (i == -2)
+ if (i == -2) {
+ /*
+ * f00f hack workaround has triggered, treat
+ * as illegal instruction not page fault.
+ */
+ frame.tf_trapno = T_PRIVINFLT;
goto restart;
+ }
#endif
- if (i == 0)
+ if (i == -1)
goto out;
+ if (i == 0)
+ goto user;
ucode = T_PAGEFLT;
break;
@@ -377,7 +363,15 @@ restart:
#if NISA > 0
case T_NMI:
#ifdef POWERFAIL_NMI
- goto handle_powerfail;
+#ifndef TIMER_FREQ
+# define TIMER_FREQ 1193182
+#endif
+ if (time_second - lastalert > 10) {
+ log(LOG_WARNING, "NMI: power fail\n");
+ sysbeep(TIMER_FREQ/880, hz);
+ lastalert = time_second;
+ }
+ goto out;
#else /* !POWERFAIL_NMI */
/* machine/parity/power fail/"kitchen sink" faults */
if (isa_nmi(code) == 0) {
@@ -391,7 +385,7 @@ restart:
kdb_trap (type, 0, &frame);
}
#endif /* DDB */
- return;
+ goto out;
} else if (panic_on_nmi)
panic("NMI indicates hardware failure");
break;
@@ -410,9 +404,9 @@ restart:
case T_DNA:
#if NNPX > 0
- /* if a transparent fault (due to context switch "late") */
+ /* transparent fault (due to context switch "late") */
if (npxdna())
- return;
+ goto out;
#endif
if (!pmath_emulate) {
i = SIGFPE;
@@ -422,7 +416,7 @@ restart:
i = (*pmath_emulate)(&frame);
if (i == 0) {
if (!(frame.tf_eflags & PSL_T))
- return;
+ goto out;
frame.tf_eflags &= ~PSL_T;
i = SIGTRAP;
}
@@ -435,13 +429,12 @@ restart:
break;
}
} else {
-kernel_trap:
/* kernel trap */
switch (type) {
case T_PAGEFLT: /* page fault */
(void) trap_pfault(&frame, FALSE, eva);
- return;
+ goto out;
case T_DNA:
#if NNPX > 0
@@ -451,31 +444,35 @@ kernel_trap:
* registered such use.
*/
if (npxdna())
- return;
+ goto out;
#endif
break;
- case T_PROTFLT: /* general protection fault */
- case T_SEGNPFLT: /* segment not present fault */
/*
- * Invalid segment selectors and out of bounds
- * %eip's and %esp's can be set up in user mode.
- * This causes a fault in kernel mode when the
- * kernel tries to return to user mode. We want
- * to get this fault so that we can fix the
- * problem here and not have to check all the
- * selectors and pointers when the user changes
- * them.
+ * The following two traps can happen in
+ * vm86 mode, and, if so, we want to handle
+ * them specially.
*/
-#define MAYBE_DORETI_FAULT(where, whereto) \
- do { \
- if (frame.tf_eip == (int)where) { \
- frame.tf_eip = (int)whereto; \
- return; \
- } \
- } while (0)
-
- if (intr_nesting_level == 0) {
+ case T_PROTFLT: /* general protection fault */
+ case T_STKFLT: /* stack fault */
+ if (frame.tf_eflags & PSL_VM) {
+ i = vm86_emulate((struct vm86frame *)&frame);
+ if (i != 0)
+ /*
+ * returns to original process
+ */
+ vm86_trap((struct vm86frame *)&frame);
+ goto out;
+ }
+ /* FALL THROUGH */
+
+ case T_SEGNPFLT: /* segment not present fault */
+ if (in_vm86call)
+ break;
+
+ if (intr_nesting_level != 0)
+ break;
+
/*
* Invalid %fs's and %gs's can be created using
* procfs or PT_SETREGS or by invalidating the
@@ -488,20 +485,38 @@ kernel_trap:
if (frame.tf_eip == (int)cpu_switch_load_gs) {
curpcb->pcb_gs = 0;
psignal(p, SIGBUS);
- return;
+ goto out;
+ }
+
+ /*
+ * Invalid segment selectors and out of bounds
+ * %eip's and %esp's can be set up in user mode.
+ * This causes a fault in kernel mode when the
+ * kernel tries to return to user mode. We want
+ * to get this fault so that we can fix the
+ * problem here and not have to check all the
+ * selectors and pointers when the user changes
+ * them.
+ */
+ if (frame.tf_eip == (int)doreti_iret) {
+ frame.tf_eip = (int)doreti_iret_fault;
+ goto out;
+ }
+ if (frame.tf_eip == (int)doreti_popl_ds) {
+ frame.tf_eip = (int)doreti_popl_ds_fault;
+ goto out;
+ }
+ if (frame.tf_eip == (int)doreti_popl_es) {
+ frame.tf_eip = (int)doreti_popl_es_fault;
+ goto out;
}
- MAYBE_DORETI_FAULT(doreti_iret,
- doreti_iret_fault);
- MAYBE_DORETI_FAULT(doreti_popl_ds,
- doreti_popl_ds_fault);
- MAYBE_DORETI_FAULT(doreti_popl_es,
- doreti_popl_es_fault);
- MAYBE_DORETI_FAULT(doreti_popl_fs,
- doreti_popl_fs_fault);
+ if (frame.tf_eip == (int)doreti_popl_fs) {
+ frame.tf_eip = (int)doreti_popl_fs_fault;
+ goto out;
+ }
if (curpcb && curpcb->pcb_onfault) {
frame.tf_eip = (int)curpcb->pcb_onfault;
- return;
- }
+ goto out;
}
break;
@@ -517,7 +532,7 @@ kernel_trap:
*/
if (frame.tf_eflags & PSL_NT) {
frame.tf_eflags &= ~PSL_NT;
- return;
+ goto out;
}
break;
@@ -529,7 +544,7 @@ kernel_trap:
* silently until the syscall handler has
* saved the flags.
*/
- return;
+ goto out;
}
if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
/*
@@ -537,7 +552,7 @@ kernel_trap:
* flags. Stop single stepping it.
*/
frame.tf_eflags &= ~PSL_T;
- return;
+ goto out;
}
/*
* Ignore debug register trace traps due to
@@ -549,13 +564,13 @@ kernel_trap:
* in kernel space because that is useful when
* debugging the kernel.
*/
- if (user_dbreg_trap()) {
+ if (user_dbreg_trap() && !in_vm86call) {
/*
* Reset breakpoint bits because the
* processor doesn't
*/
load_dr6(rdr6() & 0xfffffff0);
- return;
+ goto out;
}
/*
* Fall through (TRCTRAP kernel mode, kernel address)
@@ -567,28 +582,19 @@ kernel_trap:
*/
#ifdef DDB
if (kdb_trap (type, 0, &frame))
- return;
+ goto out;
#endif
break;
#if NISA > 0
case T_NMI:
#ifdef POWERFAIL_NMI
-#ifndef TIMER_FREQ
-# define TIMER_FREQ 1193182
-#endif
- handle_powerfail:
- {
- static unsigned lastalert = 0;
-
- if(time_second - lastalert > 10)
- {
+ if (time_second - lastalert > 10) {
log(LOG_WARNING, "NMI: power fail\n");
sysbeep(TIMER_FREQ/880, hz);
lastalert = time_second;
- }
- return;
}
+ goto out;
#else /* !POWERFAIL_NMI */
/* machine/parity/power fail/"kitchen sink" faults */
if (isa_nmi(code) == 0) {
@@ -602,16 +608,16 @@ kernel_trap:
kdb_trap (type, 0, &frame);
}
#endif /* DDB */
- return;
+ goto out;
} else if (panic_on_nmi == 0)
- return;
+ goto out;
/* FALL THROUGH */
#endif /* POWERFAIL_NMI */
#endif /* NISA > 0 */
}
trap_fatal(&frame, eva);
- return;
+ goto out;
}
/* Translate fault for emulators (e.g. Linux) */
@@ -630,8 +636,10 @@ kernel_trap:
}
#endif
-out:
+user:
userret(p, &frame, sticks, 1);
+out:
+ mtx_exit(&Giant, MTX_DEF);
}
#ifdef notyet
@@ -769,10 +777,8 @@ trap_pfault(frame, usermode, eva)
* fault.
*/
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
- if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
- frame->tf_trapno = T_PRIVINFLT;
+ if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
return -2;
- }
#endif
if (usermode)
goto nogo;
@@ -869,8 +875,7 @@ trap_fatal(frame, eva)
frame->tf_eflags & PSL_VM ? "vm86" :
ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
- /* three seperate prints in case of a trap on an unmapped page */
- printf("mp_lock = %08x; ", mp_lock);
+	/* two separate prints in case of a trap on an unmapped page */
printf("cpuid = %d; ", cpuid);
printf("lapic.id = %08x\n", lapic.id);
#endif
@@ -917,26 +922,6 @@ trap_fatal(frame, eva)
} else {
printf("Idle\n");
}
- printf("interrupt mask = ");
- if ((cpl & net_imask) == net_imask)
- printf("net ");
- if ((cpl & tty_imask) == tty_imask)
- printf("tty ");
- if ((cpl & bio_imask) == bio_imask)
- printf("bio ");
- if ((cpl & cam_imask) == cam_imask)
- printf("cam ");
- if (cpl == 0)
- printf("none");
-#ifdef SMP
-/**
- * XXX FIXME:
- * we probably SHOULD have stopped the other CPUs before now!
- * another CPU COULD have been touching cpl at this moment...
- */
- printf(" <- SMP: XXX");
-#endif
- printf("\n");
#ifdef KDB
if (kdb_trap(&psl))
@@ -973,8 +958,7 @@ dblfault_handler()
printf("esp = 0x%x\n", common_tss.tss_esp);
printf("ebp = 0x%x\n", common_tss.tss_ebp);
#ifdef SMP
- /* three seperate prints in case of a trap on an unmapped page */
- printf("mp_lock = %08x; ", mp_lock);
+	/* two separate prints in case of a trap on an unmapped page */
printf("cpuid = %d; ", cpuid);
printf("lapic.id = %08x\n", lapic.id);
#endif
@@ -1048,12 +1032,14 @@ syscall2(frame)
int error;
int narg;
int args[8];
- int have_mplock = 0;
+ int have_giant = 0;
u_int code;
+ atomic_add_int(&cnt.v_syscall, 1);
+
#ifdef DIAGNOSTIC
if (ISPL(frame.tf_cs) != SEL_UPL) {
- get_mplock();
+ mtx_enter(&Giant, MTX_DEF);
panic("syscall");
/* NOT REACHED */
}
@@ -1075,9 +1061,9 @@ syscall2(frame)
/*
* The prep code is not MP aware.
*/
- get_mplock();
+ mtx_enter(&Giant, MTX_DEF);
(*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
- rel_mplock();
+ mtx_exit(&Giant, MTX_DEF);
} else {
/*
* Need to check if this is a 32 bit or 64 bit syscall.
@@ -1114,8 +1100,8 @@ syscall2(frame)
*/
if (params && (i = narg * sizeof(int)) &&
(error = copyin(params, (caddr_t)args, (u_int)i))) {
- get_mplock();
- have_mplock = 1;
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
#ifdef KTRACE
if (KTRPOINT(p, KTR_SYSCALL))
ktrsyscall(p->p_tracep, code, narg, args);
@@ -1129,15 +1115,15 @@ syscall2(frame)
* we are ktracing
*/
if ((callp->sy_narg & SYF_MPSAFE) == 0) {
- get_mplock();
- have_mplock = 1;
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
}
#ifdef KTRACE
if (KTRPOINT(p, KTR_SYSCALL)) {
- if (have_mplock == 0) {
- get_mplock();
- have_mplock = 1;
+ if (have_giant == 0) {
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
}
ktrsyscall(p->p_tracep, code, narg, args);
}
@@ -1192,9 +1178,9 @@ bad:
* Traced syscall. trapsignal() is not MP aware.
*/
if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) {
- if (have_mplock == 0) {
- get_mplock();
- have_mplock = 1;
+ if (have_giant == 0) {
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
}
frame.tf_eflags &= ~PSL_T;
trapsignal(p, SIGTRAP, 0);
@@ -1203,13 +1189,13 @@ bad:
/*
* Handle reschedule and other end-of-syscall issues
*/
- have_mplock = userret(p, &frame, sticks, have_mplock);
+ have_giant = userret(p, &frame, sticks, have_giant);
#ifdef KTRACE
if (KTRPOINT(p, KTR_SYSRET)) {
- if (have_mplock == 0) {
- get_mplock();
- have_mplock = 1;
+ if (have_giant == 0) {
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
}
ktrsysret(p->p_tracep, code, error, p->p_retval[0]);
}
@@ -1225,27 +1211,66 @@ bad:
/*
* Release the MP lock if we had to get it
*/
- if (have_mplock)
- rel_mplock();
+ if (have_giant)
+ mtx_exit(&Giant, MTX_DEF);
+
+ mtx_assert(&sched_lock, MA_NOTOWNED);
+ mtx_assert(&Giant, MA_NOTOWNED);
+}
+
+void
+ast(frame)
+ struct trapframe frame;
+{
+ struct proc *p = CURPROC;
+ u_quad_t sticks;
+
+ /*
+	 * handle atomicity by looping since interrupts are enabled and the
+ * MP lock is not held.
+ */
+ sticks = ((volatile struct proc *)p)->p_sticks;
+ while (sticks != ((volatile struct proc *)p)->p_sticks)
+ sticks = ((volatile struct proc *)p)->p_sticks;
+
+ astoff();
+ atomic_add_int(&cnt.v_soft, 1);
+ if (p->p_flag & P_OWEUPC) {
+ mtx_enter(&Giant, MTX_DEF);
+ p->p_flag &= ~P_OWEUPC;
+ addupc_task(p, p->p_stats->p_prof.pr_addr,
+ p->p_stats->p_prof.pr_ticks);
+	}
+ if (userret(p, &frame, sticks, mtx_owned(&Giant)) != 0)
+ mtx_exit(&Giant, MTX_DEF);
}
/*
* Simplified back end of syscall(), used when returning from fork()
- * directly into user mode. MP lock is held on entry and should be
- * held on return.
+ * directly into user mode. Giant is not held on entry, and must not
+ * be held on return.
*/
void
fork_return(p, frame)
struct proc *p;
struct trapframe frame;
{
+ int have_giant;
+
frame.tf_eax = 0; /* Child returns zero */
frame.tf_eflags &= ~PSL_C; /* success */
frame.tf_edx = 1;
- userret(p, &frame, 0, 1);
+ have_giant = userret(p, &frame, 0, mtx_owned(&Giant));
#ifdef KTRACE
- if (KTRPOINT(p, KTR_SYSRET))
+ if (KTRPOINT(p, KTR_SYSRET)) {
+ if (have_giant == 0) {
+ mtx_enter(&Giant, MTX_DEF);
+ have_giant = 1;
+ }
ktrsysret(p->p_tracep, SYS_fork, 0, 0);
+ }
#endif
+ if (have_giant)
+ mtx_exit(&Giant, MTX_DEF);
}
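
The have_giant bookkeeping that replaces have_mplock throughout this file is one pattern: take Giant lazily, only on the paths that still need it (signal delivery, profiling, ktrace), and pass the ownership flag back so the outermost caller releases exactly what was acquired; the mtx_assert() calls at the end of syscall2() then verify that neither Giant nor sched_lock leaks back to user mode. Condensed into a hypothetical helper:

    #define MTX_DEF         0x0

    struct mtx;                     /* opaque stand-in */
    extern struct mtx Giant;
    void mtx_enter(struct mtx *, int);
    void mtx_exit(struct mtx *, int);

    /* Returns whether Giant is now held; the caller releases if so. */
    static int
    slow_path_sketch(int have_giant, int need_giant)
    {
            if (need_giant && have_giant == 0) {
                    mtx_enter(&Giant, MTX_DEF);
                    have_giant = 1;
            }
            /* ... work that still requires Giant runs here ... */
            return (have_giant);
    }

    void
    return_path_sketch(int need_giant)
    {
            int have_giant = 0;             /* enter without Giant */

            have_giant = slow_path_sketch(have_giant, need_giant);
            if (have_giant)
                    mtx_exit(&Giant, MTX_DEF);      /* release only if taken */
    }
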
diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c
new file mode 100644
index 0000000..1ac3f58
--- /dev/null
+++ b/sys/kern/subr_turnstile.c
@@ -0,0 +1,799 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * $FreeBSD$
+ */
+
+/*
+ * Main Entry: witness
+ * Pronunciation: 'wit-n&s
+ * Function: noun
+ * Etymology: Middle English witnesse, from Old English witnes knowledge,
+ * testimony, witness, from 2wit
+ * Date: before 12th century
+ * 1 : attestation of a fact or event : TESTIMONY
+ * 2 : one that gives evidence; specifically : one who testifies in
+ * a cause or before a judicial tribunal
+ * 3 : one asked to be present at a transaction so as to be able to
+ * testify to its having taken place
+ * 4 : one who has personal knowledge of something
+ * 5 a : something serving as evidence or proof : SIGN
+ * b : public affirmation by word or example of usually
+ * religious faith or conviction <the heroic witness to divine
+ * life -- Pilot>
+ * 6 capitalized : a member of the Jehovah's Witnesses
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+
+#include <machine/cpu.h>
+#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */
+#include <machine/mutex.h>
+
+/*
+ * The non-inlined versions of the mtx_*() functions are always built (above),
+ * but the witness code depends on the SMP_DEBUG and WITNESS kernel options
+ * being specified.
+ */
+#if (defined(SMP_DEBUG) && defined(WITNESS))
+
+#define WITNESS_COUNT 200
+#define WITNESS_NCHILDREN 2
+
+#ifndef WITNESS
+#define WITNESS 0 /* default off */
+#endif
+
+#ifndef SMP
+extern int witness_spin_check;
+#endif
+
+int witness_watch;
+
+typedef struct witness {
+ struct witness *w_next;
+ char *w_description;
+ char *w_file;
+ int w_line;
+ struct witness *w_morechildren;
+ u_char w_childcnt;
+ u_char w_Giant_squawked:1;
+ u_char w_other_squawked:1;
+ u_char w_same_squawked:1;
+ u_char w_sleep:1;
+ u_char w_spin:1; /* this is a spin mutex */
+ u_int w_level;
+ struct witness *w_children[WITNESS_NCHILDREN];
+} witness_t;
+
+typedef struct witness_blessed {
+ char *b_lock1;
+ char *b_lock2;
+} witness_blessed_t;
+
+#ifdef KDEBUG
+/*
+ * When WITNESS_KDEBUG is set to 1, it will cause the system to
+ * drop into kdebug() when:
+ * - a lock heirarchy violation occurs
+ * - locks are held when going to sleep.
+ */
+#ifndef WITNESS_KDEBUG
+#define WITNESS_KDEBUG 0
+#endif
+int witness_kdebug = WITNESS_KDEBUG;
+#endif /* KDEBUG */
+
+#ifndef WITNESS_SKIPSPIN
+#define WITNESS_SKIPSPIN 0
+#endif
+int witness_skipspin = WITNESS_SKIPSPIN;
+
+
+static mtx_t w_mtx;
+static witness_t *w_free;
+static witness_t *w_all;
+static int w_inited;
+static int witness_dead; /* fatal error, probably no memory */
+
+static witness_t w_data[WITNESS_COUNT];
+
+static witness_t *enroll __P((char *description, int flag));
+static int itismychild __P((witness_t *parent, witness_t *child));
+static void removechild __P((witness_t *parent, witness_t *child));
+static int isitmychild __P((witness_t *parent, witness_t *child));
+static int isitmydescendant __P((witness_t *parent, witness_t *child));
+static int dup_ok __P((witness_t *));
+static int blessed __P((witness_t *, witness_t *));
+static void witness_displaydescendants
+ __P((void(*)(const char *fmt, ...), witness_t *));
+static void witness_leveldescendents __P((witness_t *parent, int level));
+static void witness_levelall __P((void));
+static witness_t * witness_get __P((void));
+static void witness_free __P((witness_t *m));
+
+
+static char *ignore_list[] = {
+ "witness lock",
+ "Kdebug", /* breaks rules and may or may not work */
+ "Page Alias", /* sparc only, witness lock won't block intr */
+ NULL
+};
+
+static char *spin_order_list[] = {
+ "sched lock",
+ "log mtx",
+ "zslock", /* sparc only above log, this one is a real hack */
+ "time lock", /* above callout */
+ "callout mtx", /* above wayout */
+ /*
+ * leaf locks
+ */
+ "wayout mtx",
+ "kernel_pmap", /* sparc only, logically equal "pmap" below */
+ "pmap", /* sparc only */
+ NULL
+};
+
+static char *order_list[] = {
+ "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL,
+ "udb", "inp", NULL,
+ "unp head", "unp", "so_snd", NULL,
+ "de0", "Giant lock", NULL,
+ "ifnet", "Giant lock", NULL,
+ "fifo", "so_snd", NULL,
+ "hme0", "Giant lock", NULL,
+ "esp0", "Giant lock", NULL,
+ "hfa0", "Giant lock", NULL,
+ "so_rcv", "atm_global", NULL,
+ "so_snd", "atm_global", NULL,
+ "NFS", "Giant lock", NULL,
+ NULL
+};
+
+static char *dup_list[] = {
+ "inp",
+ "process group",
+ "session",
+ "unp",
+ "rtentry",
+ "rawcb",
+ NULL
+};
+
+static char *sleep_list[] = {
+ "Giant lock",
+ NULL
+};
+
+/*
+ * Pairs of locks which have been blessed
+ * Don't complain about order problems with blessed locks
+ */
+static witness_blessed_t blessed_list[] = {
+};
+static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t);
+
+void
+witness_init(mtx_t *m, int flag)
+{
+ m->mtx_witness = enroll(m->mtx_description, flag);
+}
+
+void
+witness_destroy(mtx_t *m)
+{
+ mtx_t *m1;
+ struct proc *p;
+ p = CURPROC;
+ for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL;
+ m1 = LIST_NEXT(m1, mtx_held)) {
+ if (m1 == m) {
+ LIST_REMOVE(m, mtx_held);
+ break;
+ }
+ }
+ return;
+
+}
+
+void
+witness_enter(mtx_t *m, int flags, char *file, int line)
+{
+ witness_t *w, *w1;
+ mtx_t *m1;
+ struct proc *p;
+ int i;
+#ifdef KDEBUG
+ int go_into_kdebug = 0;
+#endif /* KDEBUG */
+
+ w = m->mtx_witness;
+ p = CURPROC;
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ i = witness_spin_check;
+ if (i != 0 && w->w_level < i) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d"
+ " already holding %s:%x",
+ m->mtx_description, w->w_level, file, line,
+ spin_order_list[ffs(i)-1], i);
+ }
+ PCPU_SET(witness_spin_check, i | w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+ if (w->w_spin)
+ panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+ if (witness_dead)
+ goto out;
+ if (cold)
+ goto out;
+
+ if (!mtx_legal2block())
+ panic("blockable mtx_enter() of %s when not legal @ %s:%d",
+ m->mtx_description, file, line);
+ /*
+	 * Is this the first mutex acquired?
+ */
+ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
+ goto out;
+
+
+ if ((w1 = m1->mtx_witness) == w) {
+ if (w->w_same_squawked || dup_ok(w))
+ goto out;
+ w->w_same_squawked = 1;
+		printf("acquiring duplicate lock of same type: \"%s\"\n",
+ m->mtx_description);
+ printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
+ printf(" 2nd @ %s:%d\n", file, line);
+#ifdef KDEBUG
+ go_into_kdebug = 1;
+#endif /* KDEBUG */
+ goto out;
+ }
+ MPASS(!mtx_owned(&w_mtx));
+ mtx_enter(&w_mtx, MTX_SPIN);
+ /*
+ * If we have a known higher number just say ok
+ */
+ if (witness_watch > 1 && w->w_level > w1->w_level) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ goto out;
+ }
+ if (isitmydescendant(m1->mtx_witness, w)) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ goto out;
+ }
+ for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {
+
+ ASS(i < 200);
+ w1 = m1->mtx_witness;
+ if (isitmydescendant(w, w1)) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ if (blessed(w, w1))
+ goto out;
+ if (m1 == &Giant) {
+ if (w1->w_Giant_squawked)
+ goto out;
+ else
+ w1->w_Giant_squawked = 1;
+ } else {
+ if (w1->w_other_squawked)
+ goto out;
+ else
+ w1->w_other_squawked = 1;
+ }
+ printf("lock order reversal\n");
+ printf(" 1st %s last acquired @ %s:%d\n",
+ w->w_description, w->w_file, w->w_line);
+ printf(" 2nd %p %s @ %s:%d\n",
+ m1, w1->w_description, w1->w_file, w1->w_line);
+ printf(" 3rd %p %s @ %s:%d\n",
+ m, w->w_description, file, line);
+#ifdef KDEBUG
+ go_into_kdebug = 1;
+#endif /* KDEBUG */
+ goto out;
+ }
+ }
+ m1 = LIST_FIRST(&p->p_heldmtx);
+ if (!itismychild(m1->mtx_witness, w))
+ mtx_exit(&w_mtx, MTX_SPIN);
+
+out:
+#ifdef KDEBUG
+ if (witness_kdebug && go_into_kdebug)
+ kdebug();
+#endif /* KDEBUG */
+ w->w_file = file;
+ w->w_line = line;
+ m->mtx_line = line;
+ m->mtx_file = file;
+
+ /*
+ * If this pays off it likely means that a mutex being witnessed
+ * is acquired in hardclock. Put it in the ignore list. It is
+ * likely not the mutex this assert fails on.
+ */
+ ASS(m->mtx_held.le_prev == NULL);
+ LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
+}
+
+void
+witness_exit(mtx_t *m, int flags, char *file, int line)
+{
+ witness_t *w;
+
+ w = m->mtx_witness;
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+ if (w->w_spin)
+ panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+
+ if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
+ panic("switchable mtx_exit() of %s when not legal @ %s:%d",
+ m->mtx_description, file, line);
+ LIST_REMOVE(m, mtx_held);
+ m->mtx_held.le_prev = NULL;
+}
+
+void
+witness_try_enter(mtx_t *m, int flags, char *file, int line)
+{
+ struct proc *p;
+ witness_t *w = m->mtx_witness;
+
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_try_enter: "
+ "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ PCPU_SET(witness_spin_check, witness_spin_check | w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+
+ if (w->w_spin)
+ panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+
+ w->w_file = file;
+ w->w_line = line;
+ m->mtx_line = line;
+ m->mtx_file = file;
+ p = CURPROC;
+ ASS(m->mtx_held.le_prev == NULL);
+ LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
+}
+
+void
+witness_display(void(*prnt)(const char *fmt, ...))
+{
+ witness_t *w, *w1;
+
+ witness_levelall();
+
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_file == NULL)
+ continue;
+ for (w1 = w_all; w1; w1 = w1->w_next) {
+ if (isitmychild(w1, w))
+ break;
+ }
+ if (w1 != NULL)
+ continue;
+ /*
+		 * This lock has no ancestors, display its descendants.
+ */
+ witness_displaydescendants(prnt, w);
+ }
+	prnt("\nMutexes which were never acquired\n");
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_file != NULL)
+ continue;
+ prnt("%s\n", w->w_description);
+ }
+}
+
+int
+witness_sleep(int check_only, mtx_t *mtx, char *file, int line)
+{
+ mtx_t *m;
+ struct proc *p;
+ char **sleep;
+ int n = 0;
+
+ p = CURPROC;
+ for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
+ m = LIST_NEXT(m, mtx_held)) {
+ if (m == mtx)
+ continue;
+		for (sleep = sleep_list; *sleep != NULL; sleep++)
+ if (strcmp(m->mtx_description, *sleep) == 0)
+ goto next;
+ printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
+ file, line, check_only ? "could sleep" : "sleeping",
+ m->mtx_description,
+ m->mtx_witness->w_file, m->mtx_witness->w_line);
+ n++;
+	next:;
+ }
+#ifdef KDEBUG
+ if (witness_kdebug && n)
+ kdebug();
+#endif /* KDEBUG */
+ return (n);
+}
+
+static witness_t *
+enroll(char *description, int flag)
+{
+ int i;
+ witness_t *w, *w1;
+ char **ignore;
+ char **order;
+
+ if (!witness_watch)
+ return (NULL);
+ for (ignore = ignore_list; *ignore != NULL; ignore++)
+ if (strcmp(description, *ignore) == 0)
+ return (NULL);
+
+ if (w_inited == 0) {
+ mtx_init(&w_mtx, "witness lock", MTX_DEF);
+ for (i = 0; i < WITNESS_COUNT; i++) {
+ w = &w_data[i];
+ witness_free(w);
+ }
+ w_inited = 1;
+ for (order = order_list; *order != NULL; order++) {
+ w = enroll(*order, MTX_DEF);
+ w->w_file = "order list";
+ for (order++; *order != NULL; order++) {
+ w1 = enroll(*order, MTX_DEF);
+ w1->w_file = "order list";
+ itismychild(w, w1);
+ w = w1;
+ }
+ }
+ }
+ if ((flag & MTX_SPIN) && witness_skipspin)
+ return (NULL);
+ mtx_enter(&w_mtx, MTX_SPIN);
+ for (w = w_all; w; w = w->w_next) {
+ if (strcmp(description, w->w_description) == 0) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return (w);
+ }
+ }
+ if ((w = witness_get()) == NULL)
+ return (NULL);
+ w->w_next = w_all;
+ w_all = w;
+ w->w_description = description;
+ mtx_exit(&w_mtx, MTX_SPIN);
+ if (flag & MTX_SPIN) {
+ w->w_spin = 1;
+
+ i = 1;
+ for (order = spin_order_list; *order != NULL; order++) {
+ if (strcmp(description, *order) == 0)
+ break;
+ i <<= 1;
+ }
+ if (*order == NULL)
+ panic("spin lock %s not in order list", description);
+ w->w_level = i;
+ }
+ return (w);
+}
+
+static int
+itismychild(witness_t *parent, witness_t *child)
+{
+ static int recursed;
+
+ /*
+ * Insert "child" after "parent"
+ */
+ while (parent->w_morechildren)
+ parent = parent->w_morechildren;
+
+ if (parent->w_childcnt == WITNESS_NCHILDREN) {
+ if ((parent->w_morechildren = witness_get()) == NULL)
+ return (1);
+ parent = parent->w_morechildren;
+ }
+ ASS(child != NULL);
+ parent->w_children[parent->w_childcnt++] = child;
+ /*
+ * now prune whole tree
+ */
+ if (recursed)
+ return (0);
+ recursed = 1;
+ for (child = w_all; child != NULL; child = child->w_next) {
+ for (parent = w_all; parent != NULL;
+ parent = parent->w_next) {
+ if (!isitmychild(parent, child))
+ continue;
+ removechild(parent, child);
+ if (isitmydescendant(parent, child))
+ continue;
+ itismychild(parent, child);
+ }
+ }
+ recursed = 0;
+ witness_levelall();
+ return (0);
+}
+
+static void
+removechild(witness_t *parent, witness_t *child)
+{
+ witness_t *w, *w1;
+ int i;
+
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ if (w->w_children[i] == child)
+ goto found;
+ return;
+found:
+ for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
+ continue;
+ w->w_children[i] = w1->w_children[--w1->w_childcnt];
+ ASS(w->w_children[i] != NULL);
+
+ if (w1->w_childcnt != 0)
+ return;
+
+ if (w1 == parent)
+ return;
+ for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
+ continue;
+ w->w_morechildren = 0;
+ witness_free(w1);
+}
+
+static int
+isitmychild(witness_t *parent, witness_t *child)
+{
+ witness_t *w;
+ int i;
+
+ for (w = parent; w != NULL; w = w->w_morechildren) {
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (w->w_children[i] == child)
+ return (1);
+ }
+ }
+ return (0);
+}
+
+static int
+isitmydescendant(witness_t *parent, witness_t *child)
+{
+ witness_t *w;
+ int i;
+ int j;
+
+ for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
+ ASS(j < 1000);
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (w->w_children[i] == child)
+ return (1);
+ }
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (isitmydescendant(w->w_children[i], child))
+ return (1);
+ }
+ }
+ return (0);
+}
+
+void
+witness_levelall(void)
+{
+ witness_t *w, *w1;
+
+ for (w = w_all; w; w = w->w_next)
+ if (!w->w_spin)
+ w->w_level = 0;
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_spin)
+ continue;
+ for (w1 = w_all; w1; w1 = w1->w_next) {
+ if (isitmychild(w1, w))
+ break;
+ }
+ if (w1 != NULL)
+ continue;
+ witness_leveldescendents(w, 0);
+ }
+}
+
+static void
+witness_leveldescendents(witness_t *parent, int level)
+{
+ int i;
+ witness_t *w;
+
+ if (parent->w_level < level)
+ parent->w_level = level;
+ level++;
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ witness_leveldescendents(w->w_children[i], level);
+}
+
+static void
+witness_displaydescendants(void(*prnt)(const char *fmt, ...), witness_t *parent)
+{
+ witness_t *w;
+ int i;
+ int level = parent->w_level;
+
+ prnt("%d", level);
+ if (level < 10)
+ prnt(" ");
+ for (i = 0; i < level; i++)
+ prnt(" ");
+ prnt("%s", parent->w_description);
+ if (parent->w_file != NULL) {
+ prnt(" -- last acquired @ %s", parent->w_file);
+#ifndef W_USE_WHERE
+ prnt(":%d", parent->w_line);
+#endif
+ prnt("\n");
+ }
+
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ witness_displaydescendants(prnt, w->w_children[i]);
+}
+
+static int
+dup_ok(witness_t *w)
+{
+ char **dup;
+
+	for (dup = dup_list; *dup != NULL; dup++)
+ if (strcmp(w->w_description, *dup) == 0)
+ return (1);
+ return (0);
+}
+
+static int
+blessed(witness_t *w1, witness_t *w2)
+{
+ int i;
+ witness_blessed_t *b;
+
+ for (i = 0; i < blessed_count; i++) {
+ b = &blessed_list[i];
+ if (strcmp(w1->w_description, b->b_lock1) == 0) {
+ if (strcmp(w2->w_description, b->b_lock2) == 0)
+ return (1);
+ continue;
+ }
+ if (strcmp(w1->w_description, b->b_lock2) == 0)
+ if (strcmp(w2->w_description, b->b_lock1) == 0)
+ return (1);
+ }
+ return (0);
+}
+
+static witness_t *
+witness_get()
+{
+ witness_t *w;
+
+ if ((w = w_free) == NULL) {
+ witness_dead = 1;
+ mtx_exit(&w_mtx, MTX_SPIN);
+ printf("witness exhausted\n");
+ return (NULL);
+ }
+ w_free = w->w_next;
+ bzero(w, sizeof (*w));
+ return (w);
+}
+
+static void
+witness_free(witness_t *w)
+{
+ w->w_next = w_free;
+ w_free = w;
+}
+
+void
+witness_list(struct proc *p)
+{
+ mtx_t *m;
+
+ for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
+ m = LIST_NEXT(m, mtx_held)) {
+ printf("\t\"%s\" (%p) locked at %s:%d\n",
+ m->mtx_description, m,
+ m->mtx_witness->w_file, m->mtx_witness->w_line);
+ }
+}
+
+void
+witness_save(mtx_t *m, char **filep, int *linep)
+{
+ *filep = m->mtx_witness->w_file;
+ *linep = m->mtx_witness->w_line;
+}
+
+void
+witness_restore(mtx_t *m, char *file, int line)
+{
+ m->mtx_witness->w_file = file;
+ m->mtx_witness->w_line = line;
+}
+
+#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */
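
The spin-mutex side of witness encodes the entire ordering in one word: each name in spin_order_list is assigned the bit 1 << (its position), the per-CPU witness_spin_check word accumulates the bits of every spin mutex held, and witness_enter() panics when the incoming lock's level is numerically below the held mask, meaning a later-ordered lock is already held. A standalone sketch of just that check, with hypothetical names (the real code also handles recursion and serializes on w_mtx):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static const char *spin_order[] =
        { "sched lock", "log mtx", "callout mtx", NULL };
    static unsigned spin_check;     /* bits of spin locks currently held */

    static unsigned
    spin_level(const char *name)    /* 1 << position in the order list */
    {
            const char **p;
            unsigned level = 1;

            for (p = spin_order; *p != NULL; p++, level <<= 1)
                    if (strcmp(*p, name) == 0)
                            return (level);
            abort();                /* not in the order list: hard error */
    }

    void
    spin_enter_sketch(const char *name)
    {
            unsigned level = spin_level(name);

            if (spin_check != 0 && level < spin_check)
                    printf("out of order: %s with 0x%x held\n",
                        name, spin_check);
            spin_check |= level;    /* record the acquisition */
    }
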
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
new file mode 100644
index 0000000..1ac3f58
--- /dev/null
+++ b/sys/kern/subr_witness.c
@@ -0,0 +1,799 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * $FreeBSD$
+ */
+
+/*
+ * Main Entry: witness
+ * Pronunciation: 'wit-n&s
+ * Function: noun
+ * Etymology: Middle English witnesse, from Old English witnes knowledge,
+ * testimony, witness, from 2wit
+ * Date: before 12th century
+ * 1 : attestation of a fact or event : TESTIMONY
+ * 2 : one that gives evidence; specifically : one who testifies in
+ * a cause or before a judicial tribunal
+ * 3 : one asked to be present at a transaction so as to be able to
+ * testify to its having taken place
+ * 4 : one who has personal knowledge of something
+ * 5 a : something serving as evidence or proof : SIGN
+ * b : public affirmation by word or example of usually
+ * religious faith or conviction <the heroic witness to divine
+ * life -- Pilot>
+ * 6 capitalized : a member of the Jehovah's Witnesses
+ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+
+#include <machine/cpu.h>
+#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */
+#include <machine/mutex.h>
+
+/*
+ * The non-inlined versions of the mtx_*() functions are always built (above),
+ * but the witness code depends on the SMP_DEBUG and WITNESS kernel options
+ * being specified.
+ */
+#if (defined(SMP_DEBUG) && defined(WITNESS))
+
+#define WITNESS_COUNT 200
+#define WITNESS_NCHILDREN 2
+
+#ifndef WITNESS
+#define WITNESS 0 /* default off */
+#endif
+
+#ifndef SMP
+extern int witness_spin_check;
+#endif
+
+int witness_watch;
+
+typedef struct witness {
+ struct witness *w_next;
+ char *w_description;
+ char *w_file;
+ int w_line;
+ struct witness *w_morechildren;
+ u_char w_childcnt;
+ u_char w_Giant_squawked:1;
+ u_char w_other_squawked:1;
+ u_char w_same_squawked:1;
+ u_char w_sleep:1;
+ u_char w_spin:1; /* this is a spin mutex */
+ u_int w_level;
+ struct witness *w_children[WITNESS_NCHILDREN];
+} witness_t;
+
+typedef struct witness_blessed {
+ char *b_lock1;
+ char *b_lock2;
+} witness_blessed_t;
+
+#ifdef KDEBUG
+/*
+ * When WITNESS_KDEBUG is set to 1, it will cause the system to
+ * drop into kdebug() when:
+ * - a lock heirarchy violation occurs
+ * - locks are held when going to sleep.
+ */
+#ifndef WITNESS_KDEBUG
+#define WITNESS_KDEBUG 0
+#endif
+int witness_kdebug = WITNESS_KDEBUG;
+#endif /* KDEBUG */
+
+#ifndef WITNESS_SKIPSPIN
+#define WITNESS_SKIPSPIN 0
+#endif
+int witness_skipspin = WITNESS_SKIPSPIN;
+
+
+static mtx_t w_mtx;
+static witness_t *w_free;
+static witness_t *w_all;
+static int w_inited;
+static int witness_dead; /* fatal error, probably no memory */
+
+static witness_t w_data[WITNESS_COUNT];
+
+static witness_t *enroll __P((char *description, int flag));
+static int itismychild __P((witness_t *parent, witness_t *child));
+static void removechild __P((witness_t *parent, witness_t *child));
+static int isitmychild __P((witness_t *parent, witness_t *child));
+static int isitmydescendant __P((witness_t *parent, witness_t *child));
+static int dup_ok __P((witness_t *));
+static int blessed __P((witness_t *, witness_t *));
+static void witness_displaydescendants
+ __P((void(*)(const char *fmt, ...), witness_t *));
+static void witness_leveldescendents __P((witness_t *parent, int level));
+static void witness_levelall __P((void));
+static witness_t * witness_get __P((void));
+static void witness_free __P((witness_t *m));
+
+
+static char *ignore_list[] = {
+ "witness lock",
+ "Kdebug", /* breaks rules and may or may not work */
+ "Page Alias", /* sparc only, witness lock won't block intr */
+ NULL
+};
+
+static char *spin_order_list[] = {
+ "sched lock",
+ "log mtx",
+ "zslock", /* sparc only above log, this one is a real hack */
+ "time lock", /* above callout */
+ "callout mtx", /* above wayout */
+ /*
+ * leaf locks
+ */
+ "wayout mtx",
+ "kernel_pmap", /* sparc only, logically equal "pmap" below */
+ "pmap", /* sparc only */
+ NULL
+};
+
+static char *order_list[] = {
+ "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL,
+ "udb", "inp", NULL,
+ "unp head", "unp", "so_snd", NULL,
+ "de0", "Giant lock", NULL,
+ "ifnet", "Giant lock", NULL,
+ "fifo", "so_snd", NULL,
+ "hme0", "Giant lock", NULL,
+ "esp0", "Giant lock", NULL,
+ "hfa0", "Giant lock", NULL,
+ "so_rcv", "atm_global", NULL,
+ "so_snd", "atm_global", NULL,
+ "NFS", "Giant lock", NULL,
+ NULL
+};
+
+static char *dup_list[] = {
+ "inp",
+ "process group",
+ "session",
+ "unp",
+ "rtentry",
+ "rawcb",
+ NULL
+};
+
+static char *sleep_list[] = {
+ "Giant lock",
+ NULL
+};
+
+/*
+ * Pairs of locks which have been blessed
+ * Don't complain about order problems with blessed locks
+ */
+static witness_blessed_t blessed_list[] = {
+};
+static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t);
+
+void
+witness_init(mtx_t *m, int flag)
+{
+ m->mtx_witness = enroll(m->mtx_description, flag);
+}
+
+void
+witness_destroy(mtx_t *m)
+{
+ mtx_t *m1;
+ struct proc *p;
+ p = CURPROC;
+ for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL;
+ m1 = LIST_NEXT(m1, mtx_held)) {
+ if (m1 == m) {
+ LIST_REMOVE(m, mtx_held);
+ break;
+ }
+ }
+ return;
+
+}
+
+void
+witness_enter(mtx_t *m, int flags, char *file, int line)
+{
+ witness_t *w, *w1;
+ mtx_t *m1;
+ struct proc *p;
+ int i;
+#ifdef KDEBUG
+ int go_into_kdebug = 0;
+#endif /* KDEBUG */
+
+ w = m->mtx_witness;
+ p = CURPROC;
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ i = witness_spin_check;
+ if (i != 0 && w->w_level < i) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d"
+ " already holding %s:%x",
+ m->mtx_description, w->w_level, file, line,
+ spin_order_list[ffs(i)-1], i);
+ }
+ PCPU_SET(witness_spin_check, i | w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+ if (w->w_spin)
+ panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+ if (witness_dead)
+ goto out;
+ if (cold)
+ goto out;
+
+ if (!mtx_legal2block())
+ panic("blockable mtx_enter() of %s when not legal @ %s:%d",
+ m->mtx_description, file, line);
+ /*
+	 * Is this the first mutex acquired?
+ */
+ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
+ goto out;
+
+
+ if ((w1 = m1->mtx_witness) == w) {
+ if (w->w_same_squawked || dup_ok(w))
+ goto out;
+ w->w_same_squawked = 1;
+		printf("acquiring duplicate lock of same type: \"%s\"\n",
+ m->mtx_description);
+ printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
+ printf(" 2nd @ %s:%d\n", file, line);
+#ifdef KDEBUG
+ go_into_kdebug = 1;
+#endif /* KDEBUG */
+ goto out;
+ }
+ MPASS(!mtx_owned(&w_mtx));
+ mtx_enter(&w_mtx, MTX_SPIN);
+ /*
+ * If we have a known higher number just say ok
+ */
+ if (witness_watch > 1 && w->w_level > w1->w_level) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ goto out;
+ }
+ if (isitmydescendant(m1->mtx_witness, w)) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ goto out;
+ }
+ for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {
+
+ ASS(i < 200);
+ w1 = m1->mtx_witness;
+ if (isitmydescendant(w, w1)) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ if (blessed(w, w1))
+ goto out;
+ if (m1 == &Giant) {
+ if (w1->w_Giant_squawked)
+ goto out;
+ else
+ w1->w_Giant_squawked = 1;
+ } else {
+ if (w1->w_other_squawked)
+ goto out;
+ else
+ w1->w_other_squawked = 1;
+ }
+ printf("lock order reversal\n");
+ printf(" 1st %s last acquired @ %s:%d\n",
+ w->w_description, w->w_file, w->w_line);
+ printf(" 2nd %p %s @ %s:%d\n",
+ m1, w1->w_description, w1->w_file, w1->w_line);
+ printf(" 3rd %p %s @ %s:%d\n",
+ m, w->w_description, file, line);
+#ifdef KDEBUG
+ go_into_kdebug = 1;
+#endif /* KDEBUG */
+ goto out;
+ }
+ }
+ m1 = LIST_FIRST(&p->p_heldmtx);
+ if (!itismychild(m1->mtx_witness, w))
+ mtx_exit(&w_mtx, MTX_SPIN);
+
+out:
+#ifdef KDEBUG
+ if (witness_kdebug && go_into_kdebug)
+ kdebug();
+#endif /* KDEBUG */
+ w->w_file = file;
+ w->w_line = line;
+ m->mtx_line = line;
+ m->mtx_file = file;
+
+ /*
+ * If this pays off it likely means that a mutex being witnessed
+ * is acquired in hardclock. Put it in the ignore list. It is
+ * likely not the mutex this assert fails on.
+ */
+ ASS(m->mtx_held.le_prev == NULL);
+ LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
+}
+
+void
+witness_exit(mtx_t *m, int flags, char *file, int line)
+{
+ witness_t *w;
+
+ w = m->mtx_witness;
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+ if (w->w_spin)
+ panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+
+ if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
+ panic("switchable mtx_exit() of %s when not legal @ %s:%d",
+ m->mtx_description, file, line);
+ LIST_REMOVE(m, mtx_held);
+ m->mtx_held.le_prev = NULL;
+}
+
+void
+witness_try_enter(mtx_t *m, int flags, char *file, int line)
+{
+ struct proc *p;
+ witness_t *w = m->mtx_witness;
+
+
+ if (flags & MTX_SPIN) {
+ if (!w->w_spin)
+ panic("mutex_try_enter: "
+ "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+ if (m->mtx_recurse != 0)
+ return;
+ mtx_enter(&w_mtx, MTX_SPIN);
+ PCPU_SET(witness_spin_check, witness_spin_check | w->w_level);
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return;
+ }
+
+ if (w->w_spin)
+ panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
+ m->mtx_description, file, line);
+
+ if (m->mtx_recurse != 0)
+ return;
+
+ w->w_file = file;
+ w->w_line = line;
+ m->mtx_line = line;
+ m->mtx_file = file;
+ p = CURPROC;
+ ASS(m->mtx_held.le_prev == NULL);
+ LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
+}
+
+void
+witness_display(void(*prnt)(const char *fmt, ...))
+{
+ witness_t *w, *w1;
+
+ witness_levelall();
+
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_file == NULL)
+ continue;
+ for (w1 = w_all; w1; w1 = w1->w_next) {
+ if (isitmychild(w1, w))
+ break;
+ }
+ if (w1 != NULL)
+ continue;
+ /*
+		 * This lock has no ancestors, display its descendants.
+ */
+ witness_displaydescendants(prnt, w);
+ }
+	prnt("\nMutexes which were never acquired\n");
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_file != NULL)
+ continue;
+ prnt("%s\n", w->w_description);
+ }
+}
+
+int
+witness_sleep(int check_only, mtx_t *mtx, char *file, int line)
+{
+ mtx_t *m;
+ struct proc *p;
+ char **sleep;
+ int n = 0;
+
+ p = CURPROC;
+ for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
+ m = LIST_NEXT(m, mtx_held)) {
+ if (m == mtx)
+ continue;
+		for (sleep = sleep_list; *sleep != NULL; sleep++)
+ if (strcmp(m->mtx_description, *sleep) == 0)
+ goto next;
+ printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
+ file, line, check_only ? "could sleep" : "sleeping",
+ m->mtx_description,
+ m->mtx_witness->w_file, m->mtx_witness->w_line);
+ n++;
+	next:;
+ }
+#ifdef KDEBUG
+ if (witness_kdebug && n)
+ kdebug();
+#endif /* KDEBUG */
+ return (n);
+}
+
+static witness_t *
+enroll(char *description, int flag)
+{
+ int i;
+ witness_t *w, *w1;
+ char **ignore;
+ char **order;
+
+ if (!witness_watch)
+ return (NULL);
+ for (ignore = ignore_list; *ignore != NULL; ignore++)
+ if (strcmp(description, *ignore) == 0)
+ return (NULL);
+
+ if (w_inited == 0) {
+ mtx_init(&w_mtx, "witness lock", MTX_DEF);
+ for (i = 0; i < WITNESS_COUNT; i++) {
+ w = &w_data[i];
+ witness_free(w);
+ }
+ w_inited = 1;
+ for (order = order_list; *order != NULL; order++) {
+ w = enroll(*order, MTX_DEF);
+ w->w_file = "order list";
+ for (order++; *order != NULL; order++) {
+ w1 = enroll(*order, MTX_DEF);
+ w1->w_file = "order list";
+ itismychild(w, w1);
+ w = w1;
+ }
+ }
+ }
+ if ((flag & MTX_SPIN) && witness_skipspin)
+ return (NULL);
+ mtx_enter(&w_mtx, MTX_SPIN);
+ for (w = w_all; w; w = w->w_next) {
+ if (strcmp(description, w->w_description) == 0) {
+ mtx_exit(&w_mtx, MTX_SPIN);
+ return (w);
+ }
+ }
+ if ((w = witness_get()) == NULL)
+ return (NULL);
+ w->w_next = w_all;
+ w_all = w;
+ w->w_description = description;
+ mtx_exit(&w_mtx, MTX_SPIN);
+ if (flag & MTX_SPIN) {
+ w->w_spin = 1;
+
+ i = 1;
+ for (order = spin_order_list; *order != NULL; order++) {
+ if (strcmp(description, *order) == 0)
+ break;
+ i <<= 1;
+ }
+ if (*order == NULL)
+ panic("spin lock %s not in order list", description);
+ w->w_level = i;
+ }
+ return (w);
+}
+
+static int
+itismychild(witness_t *parent, witness_t *child)
+{
+ static int recursed;
+
+ /*
+ * Insert "child" after "parent"
+ */
+ while (parent->w_morechildren)
+ parent = parent->w_morechildren;
+
+ if (parent->w_childcnt == WITNESS_NCHILDREN) {
+ if ((parent->w_morechildren = witness_get()) == NULL)
+ return (1);
+ parent = parent->w_morechildren;
+ }
+ ASS(child != NULL);
+ parent->w_children[parent->w_childcnt++] = child;
+ /*
+ * now prune whole tree
+ */
+ if (recursed)
+ return (0);
+ recursed = 1;
+ for (child = w_all; child != NULL; child = child->w_next) {
+ for (parent = w_all; parent != NULL;
+ parent = parent->w_next) {
+ if (!isitmychild(parent, child))
+ continue;
+ removechild(parent, child);
+ if (isitmydescendant(parent, child))
+ continue;
+ itismychild(parent, child);
+ }
+ }
+ recursed = 0;
+ witness_levelall();
+ return (0);
+}
+
+static void
+removechild(witness_t *parent, witness_t *child)
+{
+ witness_t *w, *w1;
+ int i;
+
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ if (w->w_children[i] == child)
+ goto found;
+ return;
+found:
+ for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
+ continue;
+ w->w_children[i] = w1->w_children[--w1->w_childcnt];
+ ASS(w->w_children[i] != NULL);
+
+ if (w1->w_childcnt != 0)
+ return;
+
+ if (w1 == parent)
+ return;
+ for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
+ continue;
+ w->w_morechildren = 0;
+ witness_free(w1);
+}
+
+static int
+isitmychild(witness_t *parent, witness_t *child)
+{
+ witness_t *w;
+ int i;
+
+ for (w = parent; w != NULL; w = w->w_morechildren) {
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (w->w_children[i] == child)
+ return (1);
+ }
+ }
+ return (0);
+}
+
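+/*
+ * Returns non-zero if "child" is reachable from "parent" through any
+ * chain of child links.  The ASS() check bounds a runaway
+ * w_morechildren chain.
+ */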
+static int
+isitmydescendant(witness_t *parent, witness_t *child)
+{
+ witness_t *w;
+ int i;
+ int j;
+
+ for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
+ ASS(j < 1000);
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (w->w_children[i] == child)
+ return (1);
+ }
+ for (i = 0; i < w->w_childcnt; i++) {
+ if (isitmydescendant(w->w_children[i], child))
+ return (1);
+ }
+ }
+ return (0);
+}
+
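+/*
+ * Recompute the levels of all sleep-lock witnesses: witnesses that are
+ * no one's child are roots at level 0, and levels propagate down from
+ * there.  Spin locks keep the fixed levels assigned at enroll time.
+ */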
+void
+witness_levelall(void)
+{
+ witness_t *w, *w1;
+
+ for (w = w_all; w; w = w->w_next)
+ if (!w->w_spin)
+ w->w_level = 0;
+ for (w = w_all; w; w = w->w_next) {
+ if (w->w_spin)
+ continue;
+ for (w1 = w_all; w1; w1 = w1->w_next) {
+ if (isitmychild(w1, w))
+ break;
+ }
+ if (w1 != NULL)
+ continue;
+ witness_leveldescendents(w, 0);
+ }
+}
+
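+/*
+ * Raise "parent" to at least "level" and recurse into its children at
+ * level + 1, so each witness ends up with the depth of the longest
+ * parent chain above it.
+ */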
+static void
+witness_leveldescendents(witness_t *parent, int level)
+{
+ int i;
+ witness_t *w;
+
+ if (parent->w_level < level)
+ parent->w_level = level;
+ level++;
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ witness_leveldescendents(w->w_children[i], level);
+}
+
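+/*
+ * Print "parent" indented by its level, along with where it was last
+ * acquired, then recurse through its descendants.
+ */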
+static void
+witness_displaydescendants(void (*prnt)(const char *fmt, ...), witness_t *parent)
+{
+ witness_t *w;
+ int i;
+ int level = parent->w_level;
+
+ prnt("%d", level);
+ if (level < 10)
+ prnt(" ");
+ for (i = 0; i < level; i++)
+ prnt(" ");
+ prnt("%s", parent->w_description);
+ if (parent->w_file != NULL) {
+ prnt(" -- last acquired @ %s", parent->w_file);
+#ifndef W_USE_WHERE
+ prnt(":%d", parent->w_line);
+#endif
+ }
+ prnt("\n");
+
+ for (w = parent; w != NULL; w = w->w_morechildren)
+ for (i = 0; i < w->w_childcnt; i++)
+ witness_displaydescendants(prnt, w->w_children[i]);
+}
+
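+/*
+ * Returns non-zero if the witness is on dup_list, i.e. duplicate
+ * acquisitions of locks with this description are tolerated.
+ */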
+static int
+dup_ok(witness_t *w)
+{
+ char **dup;
+
+ for (dup = dup_list; *dup != NULL; dup++)
+ if (strcmp(w->w_description, *dup) == 0)
+ return (1);
+ return (0);
+}
+
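+/*
+ * Returns non-zero if the pair (w1, w2) appears in blessed_list in
+ * either order; blessed pairs are exempt from order checking.
+ */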
+static int
+blessed(witness_t *w1, witness_t *w2)
+{
+ int i;
+ witness_blessed_t *b;
+
+ for (i = 0; i < blessed_count; i++) {
+ b = &blessed_list[i];
+ if (strcmp(w1->w_description, b->b_lock1) == 0) {
+ if (strcmp(w2->w_description, b->b_lock2) == 0)
+ return (1);
+ continue;
+ }
+ if (strcmp(w1->w_description, b->b_lock2) == 0)
+ if (strcmp(w2->w_description, b->b_lock1) == 0)
+ return (1);
+ }
+ return (0);
+}
+
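+/*
+ * Allocate a witness from the free list.  Called with w_mtx held; if
+ * the pool is exhausted this sets witness_dead, drops w_mtx and
+ * returns NULL, so a failed caller must not release w_mtx again.
+ */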
+static witness_t *
+witness_get(void)
+{
+ witness_t *w;
+
+ if ((w = w_free) == NULL) {
+ witness_dead = 1;
+ mtx_exit(&w_mtx, MTX_SPIN);
+ printf("witness exhausted\n");
+ return (NULL);
+ }
+ w_free = w->w_next;
+ bzero(w, sizeof (*w));
+ return (w);
+}
+
+static void
+witness_free(witness_t *w)
+{
+ w->w_next = w_free;
+ w_free = w;
+}
+
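+/*
+ * Print all mutexes held by process "p" and where each was acquired.
+ */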
+void
+witness_list(struct proc *p)
+{
+ mtx_t *m;
+
+ for (m = LIST_FIRST(&p->p_heldmtx); m != NULL;
+ m = LIST_NEXT(m, mtx_held)) {
+ printf("\t\"%s\" (%p) locked at %s:%d\n",
+ m->mtx_description, m,
+ m->mtx_witness->w_file, m->mtx_witness->w_line);
+ }
+}
+
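+/*
+ * Save the file and line where "m" was last acquired, so they can be
+ * put back by witness_restore() after the mutex has been dropped and
+ * re-entered, e.g. across a sleep.
+ */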
+void
+witness_save(mtx_t *m, char **filep, int *linep)
+{
+ *filep = m->mtx_witness->w_file;
+ *linep = m->mtx_witness->w_line;
+}
+
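+/*
+ * Restore a file and line previously saved by witness_save().
+ */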
+void
+witness_restore(mtx_t *m, char *file, int line)
+{
+ m->mtx_witness->w_file = file;
+ m->mtx_witness->w_line = line;
+}
+
+#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
index 29b6288..87fb980 100644
--- a/sys/kern/tty.c
+++ b/sys/kern/tty.c
@@ -2266,7 +2266,8 @@ ttyinfo(tp)
tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT;
ttyprintf(tp, "%d%% %ldk\n",
tmp / 100,
- pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 :
+ pick->p_stat == SIDL || pick->p_stat == SWAIT ||
+ pick->p_stat == SZOMB ? 0 :
(long)pgtok(vmspace_resident_count(pick->p_vmspace)));
}
tp->t_rocount = 0; /* so pending input will be retyped if BS */
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 99c0754..34cff17 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -37,6 +37,7 @@
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
+#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
@@ -52,6 +53,8 @@
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
+#include <machine/mutex.h>
+
static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
struct bio_ops bioops; /* I/O operation notification */
@@ -461,7 +464,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
/* if not found in cache, do some I/O */
if ((bp->b_flags & B_CACHE) == 0) {
- if (curproc != NULL)
+ if (curproc != idleproc)
curproc->p_stats->p_ru.ru_inblock++;
KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
bp->b_iocmd = BIO_READ;
@@ -498,7 +501,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
/* if not found in cache, do some I/O */
if ((bp->b_flags & B_CACHE) == 0) {
- if (curproc != NULL)
+ if (curproc != idleproc)
curproc->p_stats->p_ru.ru_inblock++;
bp->b_iocmd = BIO_READ;
bp->b_flags &= ~B_INVAL;
@@ -519,7 +522,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size,
rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
if ((rabp->b_flags & B_CACHE) == 0) {
- if (curproc != NULL)
+ if (curproc != idleproc)
curproc->p_stats->p_ru.ru_inblock++;
rabp->b_flags |= B_ASYNC;
rabp->b_flags &= ~B_INVAL;
@@ -640,7 +643,7 @@ bwrite(struct buf * bp)
bp->b_vp->v_numoutput++;
vfs_busy_pages(bp, 1);
- if (curproc != NULL)
+ if (curproc != idleproc)
curproc->p_stats->p_ru.ru_oublock++;
splx(s);
if (oldflags & B_ASYNC)
@@ -1420,7 +1423,8 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
int isspecial;
static int flushingbufs;
- if (curproc && (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0)
+ if (curproc != idleproc &&
+ (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0)
isspecial = 0;
else
isspecial = 1;
@@ -1745,6 +1749,8 @@ buf_daemon()
{
int s;
+ mtx_enter(&Giant, MTX_DEF);
+
/*
* This process needs to be suspended prior to shutdown sync.
*/
@@ -2070,9 +2076,9 @@ loop:
* move it into the else, when gbincore() fails. At the moment
* it isn't a problem.
*/
- if (!curproc || (curproc->p_flag & P_BUFEXHAUST)) {
+ if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
if (numfreebuffers == 0) {
- if (!curproc)
+ if (curproc == idleproc)
return NULL;
needsbuffer |= VFS_BIO_NEED_ANY;
tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
index 3e4b17f..52ad0ef 100644
--- a/sys/kern/vfs_export.c
+++ b/sys/kern/vfs_export.c
@@ -56,6 +56,7 @@
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
+#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
@@ -68,6 +69,7 @@
#include <sys/vnode.h>
#include <machine/limits.h>
+#include <machine/mutex.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
@@ -960,6 +962,8 @@ sched_sync(void)
int s;
struct proc *p = updateproc;
+ mtx_enter(&Giant, MTX_DEF);
+
EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
SHUTDOWN_PRI_LAST);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 3e4b17f..52ad0ef 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -56,6 +56,7 @@
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
+#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
@@ -68,6 +69,7 @@
#include <sys/vnode.h>
#include <machine/limits.h>
+#include <machine/mutex.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
@@ -960,6 +962,8 @@ sched_sync(void)
int s;
struct proc *p = updateproc;
+ mtx_enter(&Giant, MTX_DEF);
+
EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
SHUTDOWN_PRI_LAST);