From 769e0f974d8929599ba599ac496510fffc90ff34 Mon Sep 17 00:00:00 2001 From: jasone Date: Thu, 7 Sep 2000 01:33:02 +0000 Subject: Major update to the way synchronization is done in the kernel. Highlights include: * Mutual exclusion is used instead of spl*(). See mutex(9). (Note: The alpha port is still in transition and currently uses both.) * Per-CPU idle processes. * Interrupts are run in their own separate kernel threads and can be preempted (i386 only). Partially contributed by: BSDi (BSD/OS) Submissions by (at least): cp, dfr, dillon, grog, jake, jhb, sheldonh --- sys/kern/init_main.c | 33 +- sys/kern/kern_clock.c | 29 +- sys/kern/kern_exit.c | 1 - sys/kern/kern_fork.c | 80 +++-- sys/kern/kern_idle.c | 108 +++++++ sys/kern/kern_kthread.c | 17 +- sys/kern/kern_mutex.c | 799 ++++++++++++++++++++++++++++++++++++++++++++++ sys/kern/kern_proc.c | 1 + sys/kern/kern_resource.c | 2 +- sys/kern/kern_shutdown.c | 9 +- sys/kern/kern_sig.c | 3 + sys/kern/kern_subr.c | 7 +- sys/kern/kern_switch.c | 100 ++++-- sys/kern/kern_synch.c | 115 ++++++- sys/kern/kern_tc.c | 9 +- sys/kern/kern_threads.c | 5 + sys/kern/subr_prf.c | 3 +- sys/kern/subr_prof.c | 4 +- sys/kern/subr_smp.c | 88 +++-- sys/kern/subr_trap.c | 391 ++++++++++++----------- sys/kern/subr_turnstile.c | 799 ++++++++++++++++++++++++++++++++++++++++++++++ sys/kern/subr_witness.c | 799 ++++++++++++++++++++++++++++++++++++++++++++++ sys/kern/tty.c | 3 +- sys/kern/vfs_bio.c | 20 +- sys/kern/vfs_export.c | 4 + sys/kern/vfs_subr.c | 4 + 26 files changed, 3116 insertions(+), 317 deletions(-) create mode 100644 sys/kern/kern_idle.c create mode 100644 sys/kern/kern_mutex.c create mode 100644 sys/kern/subr_turnstile.c create mode 100644 sys/kern/subr_witness.c (limited to 'sys/kern') diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 6d0d915..f5ae66c 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,8 @@ #include #include +#include +#include #include #include @@ -260,6 +263,11 @@ proc0_init(void *dummy __unused) p = &proc0; /* + * Initialize magic number. + */ + p->p_magic = P_MAGIC; + + /* * Initialize process and pgrp structures. */ procinit(); @@ -364,11 +372,20 @@ proc0_init(void *dummy __unused) */ (void)chgproccnt(cred0.p_uidinfo, 1, 0); + LIST_INIT(&p->p_heldmtx); + LIST_INIT(&p->p_contested); + /* * Initialize the current process pointer (curproc) before * any possible traps/probes to simplify trap processing. */ - SET_CURPROC(p); + PCPU_SET(curproc, p); + + /* + * Enter the Giant mutex. + * XXX This should be done BEFORE cpu_startup(). + */ + mtx_enter(&Giant, MTX_DEF); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) @@ -389,7 +406,7 @@ proc0_post(void *dummy __unused) p->p_runtime = 0; } microuptime(&switchtime); - switchticks = ticks; + PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. @@ -418,7 +435,6 @@ SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) *************************************************************************** */ - /* * List of paths to try when searching for "init". */ @@ -444,6 +460,8 @@ start_init(void *dummy) char *ucp, **uap, *arg0, *arg1; struct proc *p; + mtx_enter(&Giant, MTX_DEF); + p = curproc; /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. 
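/*
 * Editorial sketch, not part of the patch: the general conversion
 * pattern this commit introduces.  A critical section that used to
 * be guarded by spl*() becomes a region covered by a mutex; the
 * mutex "foo_mtx" and counter "foo_count" are hypothetical, while
 * mtx_init(), mtx_enter(), mtx_exit() and MTX_DEF are the
 * interfaces added by this change (see mutex(9)).
 */
static mtx_t	foo_mtx;
static int	foo_count;

static void
foo_setup(void)
{
	mtx_init(&foo_mtx, "foo mtx", MTX_DEF);
}

static void
foo_bump(void)
{
	/* Was: s = splfoo(); foo_count++; splx(s); */
	mtx_enter(&foo_mtx, MTX_DEF);
	foo_count++;
	mtx_exit(&foo_mtx, MTX_DEF);
}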
*/ @@ -562,16 +580,12 @@ static void create_init(const void *udata __unused) { int error; - int s; - s = splhigh(); - error = fork1(&proc0, RFFDG | RFPROC, &initproc); + error = fork1(&proc0, RFFDG | RFPROC | RFSTOPPED, &initproc); if (error) panic("cannot fork init: %d\n", error); initproc->p_flag |= P_INMEM | P_SYSTEM; cpu_set_fork_handler(initproc, start_init, NULL); - remrunqueue(initproc); - splx(s); } SYSINIT(init,SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) @@ -581,6 +595,9 @@ SYSINIT(init,SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) static void kick_init(const void *udata __unused) { + mtx_enter(&sched_lock, MTX_SPIN); + initproc->p_stat = SRUN; setrunqueue(initproc); + mtx_exit(&sched_lock, MTX_SPIN); } SYSINIT(kickinit,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 11e63a7..33eef3c 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -70,11 +70,7 @@ static void initclocks __P((void *dummy)); SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) /* Some of these don't belong here, but it's easiest to concentrate them. */ -#if defined(SMP) && defined(BETTER_CLOCK) long cp_time[CPUSTATES]; -#else -static long cp_time[CPUSTATES]; -#endif long tk_cancc; long tk_nin; @@ -156,7 +152,7 @@ hardclock(frame) register struct proc *p; p = curproc; - if (p) { + if (p != idleproc) { register struct pstats *pstats; /* @@ -325,12 +321,12 @@ statclock(frame) struct rusage *ru; struct vmspace *vm; - if (curproc != NULL && CLKF_USERMODE(frame)) { + if (CLKF_USERMODE(frame)) { /* * Came from user mode; CPU was in user state. * If this process is being profiled, record the tick. */ - p = curproc; + p = prevproc; if (p->p_flag & P_PROFIL) addupc_intr(p, CLKF_PC(frame), 1); #if defined(SMP) && defined(BETTER_CLOCK) @@ -379,20 +375,21 @@ statclock(frame) * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ - p = curproc; - if (CLKF_INTR(frame)) { - if (p != NULL) - p->p_iticks++; + p = prevproc; + if (p->p_ithd) { + p->p_iticks++; cp_time[CP_INTR]++; - } else if (p != NULL) { + } else { p->p_sticks++; - cp_time[CP_SYS]++; - } else - cp_time[CP_IDLE]++; + if (p != idleproc) + cp_time[CP_SYS]++; + else + cp_time[CP_IDLE]++; + } } pscnt = psdiv; - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. */ diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index df71fe0..7fccc16 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -349,7 +349,6 @@ exit1(p, rv) * * Other substructures are freed from wait(). */ - SET_CURPROC(NULL); if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); p->p_limit = NULL; diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index f24c97e..0aa31ab 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,8 @@ #include +#include + static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback"); static int fast_vfork = 1; @@ -131,7 +134,8 @@ rfork(p, uap) int error; struct proc *p2; - error = fork1(p, uap->flags, &p2); + /* mask kernel only flags out of the user flags */ + error = fork1(p, uap->flags & ~RFKERNELONLY, &p2); if (error == 0) { p->p_retval[0] = p2 ? 
p2->p_pid : 0; p->p_retval[1] = 0; @@ -177,17 +181,19 @@ SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, int fork1(p1, flags, procp) - struct proc *p1; + struct proc *p1; /* parent proc */ int flags; - struct proc **procp; + struct proc **procp; /* child proc */ { struct proc *p2, *pptr; uid_t uid; struct proc *newproc; + int trypid; int ok; static int pidchecked = 0; struct forklist *ep; + /* Can't copy and clear */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); @@ -278,47 +284,56 @@ fork1(p1, flags, procp) /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from nextpid+1 through pidchecked-1). + * + * If RFHIGHPID is set (used during system boot), do not allocate + * low-numbered pids. */ - nextpid++; + trypid = nextpid + 1; + if (flags & RFHIGHPID) { + if (trypid < 10) { + trypid = 10; + } + } else { if (randompid) - nextpid += arc4random() % randompid; + trypid += arc4random() % randompid; + } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ - if (nextpid >= PID_MAX) { - nextpid = nextpid % PID_MAX; - if (nextpid < 100) - nextpid += 100; + if (trypid >= PID_MAX) { + trypid = trypid % PID_MAX; + if (trypid < 100) + trypid += 100; pidchecked = 0; } - if (nextpid >= pidchecked) { + if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater - * than nextpid, so we can avoid checking for a while. + * than trypid, so we can avoid checking for a while. */ p2 = LIST_FIRST(&allproc); again: for (; p2 != 0; p2 = LIST_NEXT(p2, p_list)) { - while (p2->p_pid == nextpid || - p2->p_pgrp->pg_id == nextpid || - p2->p_session->s_sid == nextpid) { - nextpid++; - if (nextpid >= pidchecked) + while (p2->p_pid == trypid || + p2->p_pgrp->pg_id == trypid || + p2->p_session->s_sid == trypid) { + trypid++; + if (trypid >= pidchecked) goto retry; } - if (p2->p_pid > nextpid && pidchecked > p2->p_pid) + if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; - if (p2->p_pgrp->pg_id > nextpid && + if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; - if (p2->p_session->s_sid > nextpid && + if (p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; } @@ -331,11 +346,19 @@ again: p2 = newproc; p2->p_stat = SIDL; /* protect against others */ - p2->p_pid = nextpid; + p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); /* + * RFHIGHPID does not mess with the nextpid counter during boot. + */ + if (flags & RFHIGHPID) + pidchecked = 0; + else + nextpid = trypid; + + /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. @@ -456,6 +479,8 @@ again: p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_children); + LIST_INIT(&p2->p_heldmtx); + LIST_INIT(&p2->p_contested); #ifdef KTRACE /* @@ -496,14 +521,19 @@ again: } /* - * Make child runnable and add to run queue. + * If RFSTOPPED not requested, make child runnable and add to + * run queue. 
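/*
 * Editorial sketch: the RFSTOPPED protocol from the caller's side.
 * A parent that must finish setting a child up before it can run
 * forks with RFSTOPPED and later moves the child to SRUN itself,
 * under sched_lock, just as create_init()/kick_init() now do.
 * "newp" is hypothetical.
 */
	struct proc *newp;
	int error;

	error = fork1(&proc0, RFFDG | RFPROC | RFSTOPPED, &newp);
	if (error)
		panic("cannot fork: %d", error);
	/* ... set fork handler, flags, etc. on newp ... */
	mtx_enter(&sched_lock, MTX_SPIN);
	newp->p_stat = SRUN;
	setrunqueue(newp);
	mtx_exit(&sched_lock, MTX_SPIN);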
*/ microtime(&(p2->p_stats->p_start)); p2->p_acflag = AFORK; - (void) splhigh(); - p2->p_stat = SRUN; - setrunqueue(p2); - (void) spl0(); + if ((flags & RFSTOPPED) == 0) { + splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + p2->p_stat = SRUN; + setrunqueue(p2); + mtx_exit(&sched_lock, MTX_SPIN); + spl0(); + } /* * Now can be swapped. diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c new file mode 100644 index 0000000..840c0f9 --- /dev/null +++ b/sys/kern/kern_idle.c @@ -0,0 +1,108 @@ +/*- + * Copyright (c) 2000, All rights reserved. See /usr/src/COPYRIGHT + * + * $FreeBSD$ + */ + +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#include +#endif + +#include +#include +#include +#include + +#include +#include + +#ifdef SMP_DEBUG +#include +#include +#include +#endif + +static void idle_setup(void *dummy); +SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL) + +static void idle_proc(void *dummy); + +/* + * setup per-cpu idle process contexts + */ +static void +idle_setup(void *dummy) +{ + struct globaldata *gd; + int error; + + SLIST_FOREACH(gd, &cpuhead, gd_allcpu) { +#ifdef SMP + error = kthread_create(idle_proc, NULL, &gd->gd_idleproc, + RFSTOPPED|RFHIGHPID, "idle: cpu%d", + gd->gd_cpuid); +#else + error = kthread_create(idle_proc, NULL, &gd->gd_idleproc, + RFSTOPPED|RFHIGHPID, "idle"); +#endif + if (error) + panic("idle_setup: kthread_create error %d\n", error); + + gd->gd_idleproc->p_stat = SWAIT; + } +} + +/* + * idle process context + */ +static void +idle_proc(void *dummy) +{ + int count; + + for (;;) { + /* + * Clear switchtime, which prevents the idle process's time + * from being counted. + switchtime.tv_usec = 0; + switchtime.tv_sec = 0; + */ + + mtx_assert(&Giant, MA_NOTOWNED); + + count = 0; + + while (count >= 0 && procrunnable() == 0) { + /* + * This is a good place to put things to be done in + * the background, including sanity checks. + */ + if (count++ < 0) + CTR0(KTR_PROC, "idle_proc: timed out waiting" + " for a process"); + } + + mtx_enter(&sched_lock, MTX_SPIN); + idleproc->p_stat = SWAIT; + mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); + spl0(); + } +} diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c index 6373750..e684b78 100644 --- a/sys/kern/kern_kthread.c +++ b/sys/kern/kern_kthread.c @@ -52,24 +52,33 @@ kproc_start(udata) int error; error = kthread_create((void (*)(void *))kp->func, NULL, - kp->global_procpp, kp->arg0); + kp->global_procpp, 0, kp->arg0); if (error) panic("kproc_start: %s: error %d", kp->arg0, error); } /* - * Create a kernel process/thread/whatever. It shares it's address space + * Create a kernel process/thread/whatever. It shares its address space * with proc0 - ie: kernel only. + * + * func is the function to start. + * arg is the parameter to pass to function on first startup. + * newpp is the return value pointing to the thread's struct proc. + * flags are flags to fork1 (in unistd.h) + * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.). */ int kthread_create(void (*func)(void *), void *arg, - struct proc **newpp, const char *fmt, ...) + struct proc **newpp, int flags, const char *fmt, ...) 
{ int error; va_list ap; struct proc *p2; - error = fork1(&proc0, RFMEM | RFFDG | RFPROC, &p2); + if (!proc0.p_stats /* || proc0.p_stats->p_start.tv_sec == 0 */) + panic("kthread_create called too soon"); + + error = fork1(&proc0, RFMEM | RFFDG | RFPROC | flags, &p2); if (error) return error; diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c new file mode 100644 index 0000000..1ac3f58 --- /dev/null +++ b/sys/kern/kern_mutex.c @@ -0,0 +1,799 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * $FreeBSD$ + */ + +/* + * Main Entry: witness + * Pronunciation: 'wit-n&s + * Function: noun + * Etymology: Middle English witnesse, from Old English witnes knowledge, + * testimony, witness, from 2wit + * Date: before 12th century + * 1 : attestation of a fact or event : TESTIMONY + * 2 : one that gives evidence; specifically : one who testifies in + * a cause or before a judicial tribunal + * 3 : one asked to be present at a transaction so as to be able to + * testify to its having taken place + * 4 : one who has personal knowledge of something + * 5 a : something serving as evidence or proof : SIGN + * b : public affirmation by word or example of usually + * religious faith or conviction + * 6 capitalized : a member of the Jehovah's Witnesses + */ + +#include +#include +#include +#include + +#include +#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */ +#include + +/* + * The non-inlined versions of the mtx_*() functions are always built (above), + * but the witness code depends on the SMP_DEBUG and WITNESS kernel options + * being specified. 
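/*
 * Editorial sketch: a caller of the extended kthread_create().
 * The new "flags" argument is passed through to fork1(); 0 keeps
 * the old behaviour.  "mythread" and "mythreadp" are hypothetical.
 */
static struct proc *mythreadp;

static void
mythread(void *arg)
{
	for (;;) {
		/* ... do work; tsleep() waiting for more ... */
	}
}

static void
mythread_init(void *dummy)
{
	int error;

	error = kthread_create(mythread, NULL, &mythreadp, 0, "mythread");
	if (error)
		panic("mythread_init: error %d", error);
}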
+ */ +#if (defined(SMP_DEBUG) && defined(WITNESS)) + +#define WITNESS_COUNT 200 +#define WITNESS_NCHILDREN 2 + +#ifndef WITNESS +#define WITNESS 0 /* default off */ +#endif + +#ifndef SMP +extern int witness_spin_check; +#endif + +int witness_watch; + +typedef struct witness { + struct witness *w_next; + char *w_description; + char *w_file; + int w_line; + struct witness *w_morechildren; + u_char w_childcnt; + u_char w_Giant_squawked:1; + u_char w_other_squawked:1; + u_char w_same_squawked:1; + u_char w_sleep:1; + u_char w_spin:1; /* this is a spin mutex */ + u_int w_level; + struct witness *w_children[WITNESS_NCHILDREN]; +} witness_t; + +typedef struct witness_blessed { + char *b_lock1; + char *b_lock2; +} witness_blessed_t; + +#ifdef KDEBUG +/* + * When WITNESS_KDEBUG is set to 1, it will cause the system to + * drop into kdebug() when: + * - a lock heirarchy violation occurs + * - locks are held when going to sleep. + */ +#ifndef WITNESS_KDEBUG +#define WITNESS_KDEBUG 0 +#endif +int witness_kdebug = WITNESS_KDEBUG; +#endif /* KDEBUG */ + +#ifndef WITNESS_SKIPSPIN +#define WITNESS_SKIPSPIN 0 +#endif +int witness_skipspin = WITNESS_SKIPSPIN; + + +static mtx_t w_mtx; +static witness_t *w_free; +static witness_t *w_all; +static int w_inited; +static int witness_dead; /* fatal error, probably no memory */ + +static witness_t w_data[WITNESS_COUNT]; + +static witness_t *enroll __P((char *description, int flag)); +static int itismychild __P((witness_t *parent, witness_t *child)); +static void removechild __P((witness_t *parent, witness_t *child)); +static int isitmychild __P((witness_t *parent, witness_t *child)); +static int isitmydescendant __P((witness_t *parent, witness_t *child)); +static int dup_ok __P((witness_t *)); +static int blessed __P((witness_t *, witness_t *)); +static void witness_displaydescendants + __P((void(*)(const char *fmt, ...), witness_t *)); +static void witness_leveldescendents __P((witness_t *parent, int level)); +static void witness_levelall __P((void)); +static witness_t * witness_get __P((void)); +static void witness_free __P((witness_t *m)); + + +static char *ignore_list[] = { + "witness lock", + "Kdebug", /* breaks rules and may or may not work */ + "Page Alias", /* sparc only, witness lock won't block intr */ + NULL +}; + +static char *spin_order_list[] = { + "sched lock", + "log mtx", + "zslock", /* sparc only above log, this one is a real hack */ + "time lock", /* above callout */ + "callout mtx", /* above wayout */ + /* + * leaf locks + */ + "wayout mtx", + "kernel_pmap", /* sparc only, logically equal "pmap" below */ + "pmap", /* sparc only */ + NULL +}; + +static char *order_list[] = { + "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL, + "udb", "inp", NULL, + "unp head", "unp", "so_snd", NULL, + "de0", "Giant lock", NULL, + "ifnet", "Giant lock", NULL, + "fifo", "so_snd", NULL, + "hme0", "Giant lock", NULL, + "esp0", "Giant lock", NULL, + "hfa0", "Giant lock", NULL, + "so_rcv", "atm_global", NULL, + "so_snd", "atm_global", NULL, + "NFS", "Giant lock", NULL, + NULL +}; + +static char *dup_list[] = { + "inp", + "process group", + "session", + "unp", + "rtentry", + "rawcb", + NULL +}; + +static char *sleep_list[] = { + "Giant lock", + NULL +}; + +/* + * Pairs of locks which have been blessed + * Don't complain about order problems with blessed locks + */ +static witness_blessed_t blessed_list[] = { +}; +static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t); + +void +witness_init(mtx_t *m, int flag) +{ + m->mtx_witness = 
enroll(m->mtx_description, flag); +} + +void +witness_destroy(mtx_t *m) +{ + mtx_t *m1; + struct proc *p; + p = CURPROC; + for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL; + m1 = LIST_NEXT(m1, mtx_held)) { + if (m1 == m) { + LIST_REMOVE(m, mtx_held); + break; + } + } + return; + +} + +void +witness_enter(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w, *w1; + mtx_t *m1; + struct proc *p; + int i; +#ifdef KDEBUG + int go_into_kdebug = 0; +#endif /* KDEBUG */ + + w = m->mtx_witness; + p = CURPROC; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + i = witness_spin_check; + if (i != 0 && w->w_level < i) { + mtx_exit(&w_mtx, MTX_SPIN); + panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d" + " already holding %s:%x", + m->mtx_description, w->w_level, file, line, + spin_order_list[ffs(i)-1], i); + } + PCPU_SET(witness_spin_check, i | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + if (witness_dead) + goto out; + if (cold) + goto out; + + if (!mtx_legal2block()) + panic("blockable mtx_enter() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + /* + * Is this the first mutex acquired + */ + if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) + goto out; + + + if ((w1 = m1->mtx_witness) == w) { + if (w->w_same_squawked || dup_ok(w)) + goto out; + w->w_same_squawked = 1; + printf("acquring duplicate lock of same type: \"%s\"\n", + m->mtx_description); + printf(" 1st @ %s:%d\n", w->w_file, w->w_line); + printf(" 2nd @ %s:%d\n", file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + MPASS(!mtx_owned(&w_mtx)); + mtx_enter(&w_mtx, MTX_SPIN); + /* + * If we have a known higher number just say ok + */ + if (witness_watch > 1 && w->w_level > w1->w_level) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + if (isitmydescendant(m1->mtx_witness, w)) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { + + ASS(i < 200); + w1 = m1->mtx_witness; + if (isitmydescendant(w, w1)) { + mtx_exit(&w_mtx, MTX_SPIN); + if (blessed(w, w1)) + goto out; + if (m1 == &Giant) { + if (w1->w_Giant_squawked) + goto out; + else + w1->w_Giant_squawked = 1; + } else { + if (w1->w_other_squawked) + goto out; + else + w1->w_other_squawked = 1; + } + printf("lock order reversal\n"); + printf(" 1st %s last acquired @ %s:%d\n", + w->w_description, w->w_file, w->w_line); + printf(" 2nd %p %s @ %s:%d\n", + m1, w1->w_description, w1->w_file, w1->w_line); + printf(" 3rd %p %s @ %s:%d\n", + m, w->w_description, file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + } + m1 = LIST_FIRST(&p->p_heldmtx); + if (!itismychild(m1->mtx_witness, w)) + mtx_exit(&w_mtx, MTX_SPIN); + +out: +#ifdef KDEBUG + if (witness_kdebug && go_into_kdebug) + kdebug(); +#endif /* KDEBUG */ + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + + /* + * If this pays off it likely means that a mutex being witnessed + * is acquired in hardclock. Put it in the ignore list. It is + * likely not the mutex this assert fails on. 
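/*
 * Editorial sketch: the kind of bug the witness_enter() code above
 * exists to catch.  The first path establishes a_mtx -> b_mtx as
 * the expected order; the second path then trips the "lock order
 * reversal" report.  a_mtx and b_mtx are hypothetical MTX_DEF
 * mutexes.
 */
	/* Path 1: records that b_mtx is acquired after a_mtx. */
	mtx_enter(&a_mtx, MTX_DEF);
	mtx_enter(&b_mtx, MTX_DEF);
	mtx_exit(&b_mtx, MTX_DEF);
	mtx_exit(&a_mtx, MTX_DEF);

	/* Path 2: reversed order; witness prints "lock order reversal". */
	mtx_enter(&b_mtx, MTX_DEF);
	mtx_enter(&a_mtx, MTX_DEF);
	mtx_exit(&a_mtx, MTX_DEF);
	mtx_exit(&b_mtx, MTX_DEF);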
+ */ + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_exit(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w; + + w = m->mtx_witness; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) + panic("switchable mtx_exit() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + LIST_REMOVE(m, mtx_held); + m->mtx_held.le_prev = NULL; +} + +void +witness_try_enter(mtx_t *m, int flags, char *file, int line) +{ + struct proc *p; + witness_t *w = m->mtx_witness; + + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_try_enter: " + "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + + if (w->w_spin) + panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + p = CURPROC; + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_display(void(*prnt)(const char *fmt, ...)) +{ + witness_t *w, *w1; + + witness_levelall(); + + for (w = w_all; w; w = w->w_next) { + if (w->w_file == NULL) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + /* + * This lock has no anscestors, display its descendants. + */ + witness_displaydescendants(prnt, w); + } + prnt("\nMutex which were never acquired\n"); + for (w = w_all; w; w = w->w_next) { + if (w->w_file != NULL) + continue; + prnt("%s\n", w->w_description); + } +} + +int +witness_sleep(int check_only, mtx_t *mtx, char *file, int line) +{ + mtx_t *m; + struct proc *p; + char **sleep; + int n = 0; + + p = CURPROC; + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + if (m == mtx) + continue; + for (sleep = sleep_list; *sleep!= NULL; sleep++) + if (strcmp(m->mtx_description, *sleep) == 0) + goto next; + printf("%s:%d: %s with \"%s\" locked from %s:%d\n", + file, line, check_only ? 
"could sleep" : "sleeping", + m->mtx_description, + m->mtx_witness->w_file, m->mtx_witness->w_line); + n++; + next: + } +#ifdef KDEBUG + if (witness_kdebug && n) + kdebug(); +#endif /* KDEBUG */ + return (n); +} + +static witness_t * +enroll(char *description, int flag) +{ + int i; + witness_t *w, *w1; + char **ignore; + char **order; + + if (!witness_watch) + return (NULL); + for (ignore = ignore_list; *ignore != NULL; ignore++) + if (strcmp(description, *ignore) == 0) + return (NULL); + + if (w_inited == 0) { + mtx_init(&w_mtx, "witness lock", MTX_DEF); + for (i = 0; i < WITNESS_COUNT; i++) { + w = &w_data[i]; + witness_free(w); + } + w_inited = 1; + for (order = order_list; *order != NULL; order++) { + w = enroll(*order, MTX_DEF); + w->w_file = "order list"; + for (order++; *order != NULL; order++) { + w1 = enroll(*order, MTX_DEF); + w1->w_file = "order list"; + itismychild(w, w1); + w = w1; + } + } + } + if ((flag & MTX_SPIN) && witness_skipspin) + return (NULL); + mtx_enter(&w_mtx, MTX_SPIN); + for (w = w_all; w; w = w->w_next) { + if (strcmp(description, w->w_description) == 0) { + mtx_exit(&w_mtx, MTX_SPIN); + return (w); + } + } + if ((w = witness_get()) == NULL) + return (NULL); + w->w_next = w_all; + w_all = w; + w->w_description = description; + mtx_exit(&w_mtx, MTX_SPIN); + if (flag & MTX_SPIN) { + w->w_spin = 1; + + i = 1; + for (order = spin_order_list; *order != NULL; order++) { + if (strcmp(description, *order) == 0) + break; + i <<= 1; + } + if (*order == NULL) + panic("spin lock %s not in order list", description); + w->w_level = i; + } + return (w); +} + +static int +itismychild(witness_t *parent, witness_t *child) +{ + static int recursed; + + /* + * Insert "child" after "parent" + */ + while (parent->w_morechildren) + parent = parent->w_morechildren; + + if (parent->w_childcnt == WITNESS_NCHILDREN) { + if ((parent->w_morechildren = witness_get()) == NULL) + return (1); + parent = parent->w_morechildren; + } + ASS(child != NULL); + parent->w_children[parent->w_childcnt++] = child; + /* + * now prune whole tree + */ + if (recursed) + return (0); + recursed = 1; + for (child = w_all; child != NULL; child = child->w_next) { + for (parent = w_all; parent != NULL; + parent = parent->w_next) { + if (!isitmychild(parent, child)) + continue; + removechild(parent, child); + if (isitmydescendant(parent, child)) + continue; + itismychild(parent, child); + } + } + recursed = 0; + witness_levelall(); + return (0); +} + +static void +removechild(witness_t *parent, witness_t *child) +{ + witness_t *w, *w1; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + if (w->w_children[i] == child) + goto found; + return; +found: + for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) + continue; + w->w_children[i] = w1->w_children[--w1->w_childcnt]; + ASS(w->w_children[i] != NULL); + + if (w1->w_childcnt != 0) + return; + + if (w1 == parent) + return; + for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) + continue; + w->w_morechildren = 0; + witness_free(w1); +} + +static int +isitmychild(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) { + for (i = 0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + } + return (0); +} + +static int +isitmydescendant(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + int j; + + for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { + ASS(j < 1000); + for (i = 
0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + for (i = 0; i < w->w_childcnt; i++) { + if (isitmydescendant(w->w_children[i], child)) + return (1); + } + } + return (0); +} + +void +witness_levelall (void) +{ + witness_t *w, *w1; + + for (w = w_all; w; w = w->w_next) + if (!w->w_spin) + w->w_level = 0; + for (w = w_all; w; w = w->w_next) { + if (w->w_spin) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + witness_leveldescendents(w, 0); + } +} + +static void +witness_leveldescendents(witness_t *parent, int level) +{ + int i; + witness_t *w; + + if (parent->w_level < level) + parent->w_level = level; + level++; + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_leveldescendents(w->w_children[i], level); +} + +static void +witness_displaydescendants(void(*prnt)(const char *fmt, ...), witness_t *parent) +{ + witness_t *w; + int i; + int level = parent->w_level; + + prnt("%d", level); + if (level < 10) + prnt(" "); + for (i = 0; i < level; i++) + prnt(" "); + prnt("%s", parent->w_description); + if (parent->w_file != NULL) { + prnt(" -- last acquired @ %s", parent->w_file); +#ifndef W_USE_WHERE + prnt(":%d", parent->w_line); +#endif + prnt("\n"); + } + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_displaydescendants(prnt, w->w_children[i]); + } + +static int +dup_ok(witness_t *w) +{ + char **dup; + + for (dup = dup_list; *dup!= NULL; dup++) + if (strcmp(w->w_description, *dup) == 0) + return (1); + return (0); +} + +static int +blessed(witness_t *w1, witness_t *w2) +{ + int i; + witness_blessed_t *b; + + for (i = 0; i < blessed_count; i++) { + b = &blessed_list[i]; + if (strcmp(w1->w_description, b->b_lock1) == 0) { + if (strcmp(w2->w_description, b->b_lock2) == 0) + return (1); + continue; + } + if (strcmp(w1->w_description, b->b_lock2) == 0) + if (strcmp(w2->w_description, b->b_lock1) == 0) + return (1); + } + return (0); +} + +static witness_t * +witness_get() +{ + witness_t *w; + + if ((w = w_free) == NULL) { + witness_dead = 1; + mtx_exit(&w_mtx, MTX_SPIN); + printf("witness exhausted\n"); + return (NULL); + } + w_free = w->w_next; + bzero(w, sizeof (*w)); + return (w); +} + +static void +witness_free(witness_t *w) +{ + w->w_next = w_free; + w_free = w; +} + +void +witness_list(struct proc *p) +{ + mtx_t *m; + + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + printf("\t\"%s\" (%p) locked at %s:%d\n", + m->mtx_description, m, + m->mtx_witness->w_file, m->mtx_witness->w_line); + } +} + +void +witness_save(mtx_t *m, char **filep, int *linep) +{ + *filep = m->mtx_witness->w_file; + *linep = m->mtx_witness->w_line; +} + +void +witness_restore(mtx_t *m, char *file, int line) +{ + m->mtx_witness->w_file = file; + m->mtx_witness->w_line = line; +} + +#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */ diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index 7ec2628..4800747 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -73,6 +73,7 @@ u_long pgrphash; struct proclist allproc; struct proclist zombproc; vm_zone_t proc_zone; +vm_zone_t ithread_zone; /* * Initialize global process hashing structures. 
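/*
 * Editorial sketch: what the WITNESS_SAVE()/WITNESS_RESTORE() pair
 * used by mi_switch() later in this patch boils down to.  When a
 * mutex is temporarily given up, its recorded acquisition point is
 * saved and then put back, so witness reports keep pointing at the
 * real acquirer.
 */
	char *file;
	int line;

	witness_save(&Giant, &file, &line);
	/* ... release and re-acquire Giant ... */
	witness_restore(&Giant, file, line);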
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index f2a8fa6..3344f7e 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -530,7 +530,7 @@ calcru(p, up, sp, ip) microuptime(&tv); if (timevalcmp(&tv, &switchtime, <)) printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n", - switchtime.tv_sec, switchtime.tv_usec, + switchtime.tv_sec, switchtime.tv_usec, tv.tv_sec, tv.tv_usec); else tu += (tv.tv_usec - switchtime.tv_usec) + diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index 9c744c7..8a6ccd8 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -63,6 +63,7 @@ #include #include +#include #include #include /* smp_active, cpuid */ @@ -524,6 +525,11 @@ panic(const char *fmt, ...) va_list ap; static char buf[256]; +#ifdef SMP + /* Only 1 CPU can panic at a time */ + s_lock(&panic_lock); +#endif + bootopt = RB_AUTOBOOT | RB_DUMP; if (panicstr) bootopt |= RB_NOSYNC; @@ -537,8 +543,7 @@ panic(const char *fmt, ...) va_end(ap); printf("panic: %s\n", buf); #ifdef SMP - /* three seperate prints in case of an unmapped page and trap */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of an unmapped page and trap */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index a2ff2ef..a39a4c8 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -1465,6 +1466,8 @@ killproc(p, why) struct proc *p; char *why; { + CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", + p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index c0f7f64..d9a599a 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ #include #include +#include + static void uio_yield __P((void)); int @@ -421,10 +424,12 @@ uio_yield() int s; p = curproc; - p->p_priority = p->p_usrpri; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + p->p_priority = p->p_usrpri; setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 3146f9e..8f47dba 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -29,27 +29,39 @@ #include #include #include +#include #include #include #include +#include + /* * We have NQS (32) run queues per scheduling class. For the normal * class, there are 128 priorities scaled onto these 32 queues. New * processes are added to the last entry in each queue, and processes * are selected for running by taking them from the head and maintaining - * a simple FIFO arrangement. Realtime and Idle priority processes have - * and explicit 0-31 priority which maps directly onto their class queue - * index. When a queue has something in it, the corresponding bit is - * set in the queuebits variable, allowing a single read to determine - * the state of all 32 queues and then a ffs() to find the first busy + * a simple FIFO arrangement. + * + * Interrupt, real time and idle priority processes have and explicit + * 0-31 priority which maps directly onto their class queue index. 
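/*
 * Editorial sketch: the voluntary context-switch idiom that
 * uio_yield() above and yield() later in this patch now follow --
 * the run queues and mi_switch() are protected by the sched_lock
 * spin mutex in addition to the interim splhigh().
 */
	int s;

	s = splhigh();
	mtx_enter(&sched_lock, MTX_SPIN);
	curproc->p_priority = curproc->p_usrpri;
	setrunqueue(curproc);
	curproc->p_stats->p_ru.ru_nivcsw++;
	mi_switch();
	mtx_exit(&sched_lock, MTX_SPIN);
	splx(s);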
+ * When a queue has something in it, the corresponding bit is set in + * the queuebits variable, allowing a single read to determine the + * state of all 32 queues and then a ffs() to find the first busy * queue. + * + * XXX This needs fixing. First, we only have one idle process, so we + * hardly need 32 queues for it. Secondly, the number of classes + * makes things unwieldy. We should be able to merge them into a + * single 96 or 128 entry queue. */ -struct rq queues[NQS]; -struct rq rtqueues[NQS]; -struct rq idqueues[NQS]; -u_int32_t queuebits; +struct rq itqueues[NQS]; /* interrupt threads */ +struct rq rtqueues[NQS]; /* real time processes */ +struct rq queues[NQS]; /* time sharing processes */ +struct rq idqueues[NQS]; /* idle process */ +u_int32_t itqueuebits; u_int32_t rtqueuebits; +u_int32_t queuebits; u_int32_t idqueuebits; /* @@ -61,8 +73,9 @@ rqinit(void *dummy) int i; for (i = 0; i < NQS; i++) { - TAILQ_INIT(&queues[i]); + TAILQ_INIT(&itqueues[i]); TAILQ_INIT(&rtqueues[i]); + TAILQ_INIT(&queues[i]); TAILQ_INIT(&idqueues[i]); } } @@ -81,22 +94,37 @@ setrunqueue(struct proc *p) struct rq *q; u_int8_t pri; - KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN")); - if (p->p_rtprio.type == RTP_PRIO_NORMAL) { - pri = p->p_priority >> 2; - q = &queues[pri]; - queuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(p->p_stat == SRUN, ("setrunqueue: proc %p (%s) not SRUN", p, \ + p->p_comm)); + + /* + * Decide which class we want to run. We now have four + * queues, and this is becoming ugly. We should be able to + * collapse the first three classes into a single contiguous + * queue. XXX FIXME. + */ + CTR4(KTR_PROC, "setrunqueue: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { /* interrupt thread */ + pri = p->p_rtprio.prio; + q = &itqueues[pri]; + itqueuebits |= 1 << pri; + } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || /* real time */ p->p_rtprio.type == RTP_PRIO_FIFO) { pri = p->p_rtprio.prio; q = &rtqueues[pri]; rtqueuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { + } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { /* time sharing */ + pri = p->p_priority >> 2; + q = &queues[pri]; + queuebits |= 1 << pri; + } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { /* idle proc */ pri = p->p_rtprio.prio; q = &idqueues[pri]; idqueuebits |= 1 << pri; } else { - panic("setrunqueue: invalid rtprio type"); + panic("setrunqueue: invalid rtprio type %d", p->p_rtprio.type); } p->p_rqindex = pri; /* remember the queue index */ TAILQ_INSERT_TAIL(q, p, p_procq); @@ -114,14 +142,20 @@ remrunqueue(struct proc *p) u_int32_t *which; u_int8_t pri; + CTR4(KTR_PROC, "remrunqueue: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + mtx_assert(&sched_lock, MA_OWNED); pri = p->p_rqindex; - if (p->p_rtprio.type == RTP_PRIO_NORMAL) { - q = &queues[pri]; - which = &queuebits; + if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { + q = &itqueues[pri]; + which = &itqueuebits; } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || p->p_rtprio.type == RTP_PRIO_FIFO) { q = &rtqueues[pri]; which = &rtqueuebits; + } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { + q = &queues[pri]; + which = &queuebits; } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { q = &idqueues[pri]; which = &idqueuebits; @@ -142,11 +176,17 @@ remrunqueue(struct proc *p) * loop to avoid the more expensive (and destructive) chooseproc(). 
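/*
 * Editorial sketch: the queue-bitmap technique described above.
 * Each of the 32 queues in a class owns one bit of a u_int32_t;
 * enqueueing sets the bit, and the scheduler finds the best
 * non-empty queue with a single ffs().  "pri" is hypothetical.
 */
	u_int32_t bits = 0;
	u_int8_t pri = 5;

	bits |= 1 << pri;			/* queue 5 became non-empty */
	if (bits != 0) {
		int best = ffs(bits) - 1;	/* lowest set bit wins */
		/* ... TAILQ_REMOVE() the head of queues[best] ... */
		if (TAILQ_EMPTY(&queues[best]))
			bits &= ~(1 << best);
	}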
* * MP SAFE. CALLED WITHOUT THE MP LOCK + * + * XXX I doubt this. It's possibly fail-safe, but there's obviously + * the case here where one of the bits words gets loaded, the + * processor gets preempted, and by the time it returns from this + * function, some other processor has picked the runnable process. + * What am I missing? (grog, 23 July 2000). */ u_int32_t procrunnable(void) { - return (rtqueuebits || queuebits || idqueuebits); + return (itqueuebits || rtqueuebits || queuebits || idqueuebits); } /* @@ -173,7 +213,12 @@ chooseproc(void) u_char id; #endif - if (rtqueuebits) { + mtx_assert(&sched_lock, MA_OWNED); + if (itqueuebits) { + pri = ffs(itqueuebits) - 1; + q = &itqueues[pri]; + which = &itqueuebits; + } else if (rtqueuebits) { pri = ffs(rtqueuebits) - 1; q = &rtqueues[pri]; which = &rtqueuebits; @@ -186,10 +231,12 @@ chooseproc(void) q = &idqueues[pri]; which = &idqueuebits; } else { - return NULL; + CTR1(KTR_PROC, "chooseproc: idleproc, schedlock %x", + sched_lock.mtx_lock); + idleproc->p_stat = SRUN; + return idleproc; } p = TAILQ_FIRST(q); - KASSERT(p, ("chooseproc: no proc on busy queue")); #ifdef SMP /* wander down the current run queue for this pri level for a match */ id = cpuid; @@ -201,6 +248,9 @@ chooseproc(void) } } #endif + CTR4(KTR_PROC, "chooseproc: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + KASSERT(p, ("chooseproc: no proc on busy queue")); TAILQ_REMOVE(q, p, p_procq); if (TAILQ_EMPTY(q)) *which &= ~(1 << pri); diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index f747759..f397f40 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,7 @@ #include #include #include +#include static void sched_setup __P((void *dummy)); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) @@ -135,7 +137,7 @@ maybe_resched(chk) * standard process becomes runaway cpu-bound, the system can lockup * due to idle-scheduler processes in wakeup never getting any cpu. */ - if (p == NULL) { + if (p == idleproc) { #if 0 need_resched(); #endif @@ -169,7 +171,7 @@ roundrobin(arg) need_resched(); forward_roundrobin(); #else - if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type)) + if (p == idleproc || RTP_PRIO_NEED_RR(p->p_rtprio.type)) need_resched(); #endif @@ -284,6 +286,8 @@ schedcpu(arg) * Increment time in/out of memory and sleep time * (if sleeping). We ignore overflow; with 16-bit int's * (remember them?) overflow takes 45 days. + if (p->p_stat == SWAIT) + continue; */ p->p_swtime++; if (p->p_stat == SSLEEP || p->p_stat == SSTOP) @@ -295,7 +299,12 @@ schedcpu(arg) */ if (p->p_slptime > 1) continue; - s = splhigh(); /* prevent state changes and protect run queue */ + /* + * prevent state changes and protect run queue + */ + s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); + /* * p_pctcpu is only for ps. */ @@ -325,6 +334,7 @@ schedcpu(arg) } else p->p_priority = p->p_usrpri; } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } vmmeter(); @@ -364,6 +374,7 @@ updatepri(p) static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) +#if 0 /* * During autoconfiguration or after a panic, a sleep will simply * lower the priority briefly to allow interrupts, then return. @@ -374,6 +385,7 @@ static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; * higher to block network software interrupts after panics. 
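/*
 * Editorial sketch: why the unlocked read questioned in the XXX
 * comment above is tolerable.  procrunnable() is only a hint; the
 * idle loop re-enters the scheduler under sched_lock, and
 * chooseproc() (which runs with sched_lock held) simply returns
 * idleproc again if another CPU already claimed the process.
 */
	while (procrunnable() == 0)
		;	/* unlocked peek, as in idle_proc() */
	mtx_enter(&sched_lock, MTX_SPIN);
	idleproc->p_stat = SWAIT;
	mi_switch();	/* chooseproc() may just reselect idleproc */
	mtx_exit(&sched_lock, MTX_SPIN);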
*/ int safepri; +#endif void sleepinit(void) @@ -406,11 +418,15 @@ tsleep(ident, priority, wmesg, timo) struct proc *p = curproc; int s, sig, catch = priority & PCATCH; struct callout_handle thandle; + int rval = 0; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif + mtx_assert(&Giant, MA_OWNED); + mtx_enter(&sched_lock, MTX_SPIN); + s = splhigh(); if (cold || panicstr) { /* @@ -419,10 +435,14 @@ tsleep(ident, priority, wmesg, timo) * don't run any other procs or panic below, * in case this is the idle process and already asleep. */ + mtx_exit(&sched_lock, MTX_SPIN); +#if 0 splx(safepri); +#endif splx(s); return (0); } + KASSERT(p != NULL, ("tsleep1")); KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep")); /* @@ -436,6 +456,9 @@ tsleep(ident, priority, wmesg, timo) p->p_wmesg = wmesg; p->p_slptime = 0; p->p_priority = priority & PRIMASK; + p->p_nativepri = p->p_priority; + CTR4(KTR_PROC, "tsleep: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); if (timo) thandle = timeout(endtsleep, (void *)p, timo); @@ -449,6 +472,9 @@ tsleep(ident, priority, wmesg, timo) * stopped, p->p_wchan will be 0 upon return from CURSIG. */ if (catch) { + CTR4(KTR_PROC, + "tsleep caught: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); p->p_flag |= P_SINTR; if ((sig = CURSIG(p))) { if (p->p_wchan) @@ -465,6 +491,9 @@ tsleep(ident, priority, wmesg, timo) p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; mi_switch(); + CTR4(KTR_PROC, + "tsleep resume: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); resume: curpriority = p->p_usrpri; splx(s); @@ -476,7 +505,8 @@ resume: if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif - return (EWOULDBLOCK); + rval = EWOULDBLOCK; + goto out; } } else if (timo) untimeout(endtsleep, (void *)p, thandle); @@ -486,14 +516,19 @@ resume: ktrcsw(p->p_tracep, 0, 0); #endif if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) - return (EINTR); - return (ERESTART); + rval = EINTR; + else + rval = ERESTART; + goto out; } +out: + mtx_exit(&sched_lock, MTX_SPIN); #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif - return (0); + + return (rval); } /* @@ -519,13 +554,14 @@ asleep(void *ident, int priority, const char *wmesg, int timo) int s; /* - * splhigh() while manipulating sleep structures and slpque. + * obtain sched_lock while manipulating sleep structures and slpque. * * Remove preexisting wait condition (if any) and place process * on appropriate slpque, but do not put process to sleep. 
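/*
 * Editorial sketch: the tsleep()/wakeup() contract is unchanged by
 * the locking rework above -- sleep in a loop and re-test the
 * condition, since wakeup() wakes every sleeper on the channel.
 * "foo_ready" is a hypothetical condition; note that tsleep() now
 * asserts that Giant is held.
 */
	int error;

	while (foo_ready == 0) {
		error = tsleep(&foo_ready, PZERO | PCATCH, "foowt", 0);
		if (error)
			break;	/* EINTR/ERESTART from PCATCH */
	}

	/* Producer side: */
	foo_ready = 1;
	wakeup(&foo_ready);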
*/ s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); if (p->p_wchan != NULL) unsleep(p); @@ -539,6 +575,7 @@ asleep(void *ident, int priority, const char *wmesg, int timo) TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); return(0); @@ -560,8 +597,12 @@ int await(int priority, int timo) { struct proc *p = curproc; + int rval = 0; int s; + mtx_assert(&Giant, MA_OWNED); + mtx_enter(&sched_lock, MTX_SPIN); + s = splhigh(); if (p->p_wchan != NULL) { @@ -616,7 +657,8 @@ resume: if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif - return (EWOULDBLOCK); + rval = EWOULDBLOCK; + goto out; } } else if (timo) untimeout(endtsleep, (void *)p, thandle); @@ -626,8 +668,10 @@ resume: ktrcsw(p->p_tracep, 0, 0); #endif if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) - return (EINTR); - return (ERESTART); + rval = EINTR; + else + rval = ERESTART; + goto out; } #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) @@ -655,7 +699,10 @@ resume: */ p->p_asleep.as_priority = 0; - return (0); +out: + mtx_exit(&sched_lock, MTX_SPIN); + + return (rval); } /* @@ -673,7 +720,11 @@ endtsleep(arg) int s; p = (struct proc *)arg; + CTR4(KTR_PROC, + "endtsleep: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); if (p->p_wchan) { if (p->p_stat == SSLEEP) setrunnable(p); @@ -681,6 +732,7 @@ endtsleep(arg) unsleep(p); p->p_flag |= P_TIMEOUT; } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } @@ -694,10 +746,12 @@ unsleep(p) int s; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); if (p->p_wchan) { TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq); p->p_wchan = 0; } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } @@ -713,6 +767,7 @@ wakeup(ident) int s; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); qp = &slpque[LOOKUP(ident)]; restart: TAILQ_FOREACH(p, qp, p_procq) { @@ -721,6 +776,9 @@ restart: p->p_wchan = 0; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ + CTR4(KTR_PROC, + "wakeup: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; @@ -737,6 +795,7 @@ restart: } } } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } @@ -754,6 +813,7 @@ wakeup_one(ident) int s; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); qp = &slpque[LOOKUP(ident)]; TAILQ_FOREACH(p, qp, p_procq) { @@ -762,6 +822,9 @@ wakeup_one(ident) p->p_wchan = 0; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ + CTR4(KTR_PROC, + "wakeup1: proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; @@ -778,6 +841,7 @@ wakeup_one(ident) } } } + mtx_exit(&sched_lock, MTX_SPIN); splx(s); } @@ -791,7 +855,9 @@ mi_switch() struct timeval new_switchtime; register struct proc *p = curproc; /* XXX */ register struct rlimit *rlim; + int giantreleased; int x; + WITNESS_SAVE_DECL(Giant); /* * XXX this spl is almost unnecessary. 
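/*
 * Editorial sketch of the asleep()/await() split shown above, under
 * the assumption (from the asleep(9) interface) that -1 arguments
 * to await() mean "use the values given to asleep()".  asleep()
 * only registers the wait, so the caller can shed resources before
 * actually blocking.
 */
	asleep(&foo_ready, PZERO, "foowt", 0);
	/* ... release locks that must not be held while sleeping ... */
	if (foo_ready == 0)
		(void) await(-1, -1);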
It is partly to allow for @@ -812,6 +878,14 @@ mi_switch() */ x = splstatclock(); + CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY); + + WITNESS_SAVE(&Giant, Giant); + for (giantreleased = 0; mtx_owned(&Giant); giantreleased++) + mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH); + #ifdef SIMPLELOCK_DEBUG if (p->p_simple_locks) printf("sleep: holding simple lock\n"); @@ -823,7 +897,7 @@ mi_switch() microuptime(&new_switchtime); if (timevalcmp(&new_switchtime, &switchtime, <)) { printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n", - switchtime.tv_sec, switchtime.tv_usec, + switchtime.tv_sec, switchtime.tv_usec, new_switchtime.tv_sec, new_switchtime.tv_usec); new_switchtime = switchtime; } else { @@ -834,6 +908,8 @@ mi_switch() /* * Check if the process exceeds its cpu resource allocation. * If over max, kill it. + * + * XXX drop sched_lock, pickup Giant */ if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && p->p_runtime > p->p_limit->p_cpulimit) { @@ -854,10 +930,18 @@ mi_switch() */ cnt.v_swtch++; switchtime = new_switchtime; - cpu_switch(p); + CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); + cpu_switch(); + CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %x", + p, p->p_pid, p->p_comm, sched_lock.mtx_lock); if (switchtime.tv_sec == 0) microuptime(&switchtime); switchticks = ticks; + mtx_exit(&sched_lock, MTX_SPIN); + while (giantreleased--) + mtx_enter(&Giant, MTX_DEF); + WITNESS_RESTORE(&Giant, Giant); splx(x); } @@ -874,10 +958,12 @@ setrunnable(p) register int s; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); switch (p->p_stat) { case 0: case SRUN: case SZOMB: + case SWAIT: default: panic("setrunnable"); case SSTOP: @@ -891,6 +977,7 @@ setrunnable(p) p->p_stat = SRUN; if (p->p_flag & P_INMEM) setrunqueue(p); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); if (p->p_slptime > 1) updatepri(p); diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index b8d5833..1128c2e 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -24,7 +24,7 @@ * Number of timecounters used to implement stable storage */ #ifndef NTIMECOUNTER -#define NTIMECOUNTER 5 +#define NTIMECOUNTER 45 #endif static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter", @@ -148,6 +148,13 @@ nanotime(struct timespec *ts) nnanotime++; tc = timecounter; +#ifdef KTR + if (tc == NULL) { /* called before initialization */ + ts->tv_sec = 0; + ts->tv_nsec = 0; + return; + } +#endif ts->tv_sec = tc->tc_offset_sec; count = tco_delta(tc); delta = tc->tc_offset_nano; diff --git a/sys/kern/kern_threads.c b/sys/kern/kern_threads.c index 3531e2c..ba2b4bf 100644 --- a/sys/kern/kern_threads.c +++ b/sys/kern/kern_threads.c @@ -52,10 +52,13 @@ #include #include #include +#include #include #include #include +#include + /* * Low level support for sleep/wakeup paradigm * If a timeout is specified: @@ -145,10 +148,12 @@ yield(struct proc *p, struct yield_args *uap) { p->p_retval[0] = 0; s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); p->p_priority = MAXPRI; setrunqueue(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); return(0); diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index 3794ccf..a989152 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -110,7 +110,8 @@ uprintf(const char *fmt, ...) 
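/*
 * Editorial sketch: the Giant hand-off performed by mi_switch()
 * above.  Giant is recursive, so it is released as many times as
 * it was entered (MTX_NOSWITCH keeps the release itself from
 * switching) and re-entered the same number of times after
 * cpu_switch().
 */
	int released;

	for (released = 0; mtx_owned(&Giant); released++)
		mtx_exit(&Giant, MTX_DEF | MTX_NOSWITCH);
	/* ... cpu_switch(); other processes run ... */
	while (released--)
		mtx_enter(&Giant, MTX_DEF);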
struct putchar_arg pca; int retval = 0; - if (p && p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { + if (p && p != idleproc && p->p_flag & P_CONTROLT && + p->p_session->s_ttyvp) { va_start(ap, fmt); pca.tty = p->p_session->s_ttyp; pca.flags = TOTTY; diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c index 4fa5223..294c649 100644 --- a/sys/kern/subr_prof.c +++ b/sys/kern/subr_prof.c @@ -93,6 +93,7 @@ kmstartup(dummy) int nullfunc_loop_profiled_time; uintfptr_t tmp_addr; #endif + int intrstate; /* * Round lowpc and highpc to multiples of the density we're using @@ -135,6 +136,7 @@ kmstartup(dummy) * Disable interrupts to avoid interference while we calibrate * things. */ + intrstate = save_intr(); disable_intr(); /* @@ -189,7 +191,7 @@ kmstartup(dummy) p->state = GMON_PROF_OFF; stopguprof(p); - enable_intr(); + restore_intr(intrstate); nullfunc_loop_profiled_time = 0; for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled; diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 61c5ecf..95b5759 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -65,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +238,8 @@ typedef struct BASETABLE_ENTRY { #define MP_ANNOUNCE_POST 0x19 +/* used to hold the AP's until we are ready to release them */ +struct simplelock ap_boot_lock; /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ int current_postcode; @@ -336,6 +340,7 @@ static int start_all_aps(u_int boot_addr); static void install_ap_tramp(u_int boot_addr); static int start_ap(int logicalCpu, u_int boot_addr); static int apic_int_is_bus_type(int intr, int bus_type); +static void release_aps(void *dummy); /* * Calculate usable address in base memory for AP trampoline code. @@ -403,7 +408,7 @@ found: /* - * Startup the SMP processors. + * Initialize the SMP hardware and the APIC and start up the AP's. */ void mp_start(void) @@ -619,6 +624,9 @@ mp_enable(u_int boot_addr) /* initialize all SMP locks */ init_locks(); + /* obtain the ap_boot_lock */ + s_lock(&ap_boot_lock); + /* start each Application Processor */ start_all_aps(boot_addr); } @@ -1866,9 +1874,6 @@ struct simplelock fast_intr_lock; /* critical region around INTR() routines */ struct simplelock intr_lock; -/* lock regions protected in UP kernel via cli/sti */ -struct simplelock mpintr_lock; - /* lock region used by kernel profiling */ struct simplelock mcount_lock; @@ -1885,26 +1890,16 @@ struct simplelock clock_lock; /* lock around the MP rendezvous */ static struct simplelock smp_rv_lock; +/* only 1 CPU can panic at a time :) */ +struct simplelock panic_lock; + static void init_locks(void) { - /* - * Get the initial mp_lock with a count of 1 for the BSP. - * This uses a LOGICAL cpu ID, ie BSP == 0. 
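/*
 * Editorial sketch: the interrupt-state idiom adopted in
 * kmstartup() above.  The previous state is saved and restored
 * rather than unconditionally re-enabled, so the pattern nests
 * safely inside callers that already run with interrupts off.
 */
	int intrstate;

	intrstate = save_intr();
	disable_intr();
	/* ... timing-sensitive calibration ... */
	restore_intr(intrstate);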
- */ - mp_lock = 0x00000001; - -#if 0 - /* ISR uses its own "giant lock" */ - isr_lock = FREE_LOCK; -#endif - #if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) s_lock_init((struct simplelock*)&apic_itrace_debuglock); #endif - s_lock_init((struct simplelock*)&mpintr_lock); - s_lock_init((struct simplelock*)&mcount_lock); s_lock_init((struct simplelock*)&fast_intr_lock); @@ -1912,6 +1907,7 @@ init_locks(void) s_lock_init((struct simplelock*)&imen_lock); s_lock_init((struct simplelock*)&cpl_lock); s_lock_init(&smp_rv_lock); + s_lock_init(&panic_lock); #ifdef USE_COMLOCK s_lock_init((struct simplelock*)&com_lock); @@ -1919,11 +1915,9 @@ init_locks(void) #ifdef USE_CLOCKLOCK s_lock_init((struct simplelock*)&clock_lock); #endif /* USE_CLOCKLOCK */ -} - -/* Wait for all APs to be fully initialized */ -extern int wait_ap(unsigned int); + s_lock_init(&ap_boot_lock); +} /* * start each AP in our list @@ -1987,6 +1981,7 @@ start_all_aps(u_int boot_addr) SMPpt[pg + 4] = 0; /* *prv_PMAP1 */ /* prime data page for it to use */ + SLIST_INSERT_HEAD(&cpuhead, gd, gd_allcpu); gd->gd_cpuid = x; gd->gd_cpu_lockid = x << 24; gd->gd_prv_CMAP1 = &SMPpt[pg + 1]; @@ -2211,7 +2206,6 @@ start_ap(int logical_cpu, u_int boot_addr) return 0; /* return FAILURE */ } - /* * Flush the TLB on all other CPU's * @@ -2348,10 +2342,13 @@ SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, void ap_init(void); void -ap_init() +ap_init(void) { u_int apic_id; + /* lock against other AP's that are waking up */ + s_lock(&ap_boot_lock); + /* BSP may have changed PTD while we're waiting for the lock */ cpu_invltlb(); @@ -2397,6 +2394,30 @@ ap_init() smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ smp_active = 1; /* historic */ } + + /* let other AP's wake up now */ + s_unlock(&ap_boot_lock); + + /* wait until all the AP's are up */ + while (smp_started == 0) + ; /* nothing */ + + /* + * Set curproc to our per-cpu idleproc so that mutexes have + * something unique to lock with. + */ + PCPU_SET(curproc,idleproc); + PCPU_SET(prevproc,idleproc); + + microuptime(&switchtime); + switchticks = ticks; + + /* ok, now grab sched_lock and enter the scheduler */ + enable_intr(); + mtx_enter(&sched_lock, MTX_SPIN); + cpu_throw(); /* doesn't return */ + + panic("scheduler returned us to ap_init"); } #ifdef BETTER_CLOCK @@ -2453,6 +2474,12 @@ forwarded_statclock(int id, int pscnt, int *astmap) p = checkstate_curproc[id]; cpustate = checkstate_cpustate[id]; + /* XXX */ + if (p->p_ithd) + cpustate = CHECKSTATE_INTR; + else if (p == idleproc) + cpustate = CHECKSTATE_SYS; + switch (cpustate) { case CHECKSTATE_USER: if (p->p_flag & P_PROFIL) @@ -2482,9 +2509,10 @@ forwarded_statclock(int id, int pscnt, int *astmap) if (pscnt > 1) return; - if (!p) + if (p == idleproc) { + p->p_sticks++; cp_time[CP_IDLE]++; - else { + } else { p->p_sticks++; cp_time[CP_SYS]++; } @@ -2510,7 +2538,7 @@ forwarded_statclock(int id, int pscnt, int *astmap) p->p_iticks++; cp_time[CP_INTR]++; } - if (p != NULL) { + if (p != idleproc) { schedclock(p); /* Update resource usage integrals and maximums. 
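/*
 * Editorial sketch: the boot handshake built from ap_boot_lock
 * above.  The BSP takes the lock in mp_enable(); every AP blocks
 * on it at the top of ap_init(); release_aps() (the SYSINIT just
 * below) unlocks it at SI_SUB_SMP, letting the first AP through,
 * and each AP's s_unlock() then admits the next.  The AP side:
 */
	s_lock(&ap_boot_lock);		/* wait for BSP/previous AP */
	/* ... per-CPU setup; point curproc at this CPU's idleproc ... */
	s_unlock(&ap_boot_lock);	/* let the next AP in */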
*/ @@ -2863,3 +2891,11 @@ smp_rendezvous(void (* setup_func)(void *), /* release lock */ s_unlock(&smp_rv_lock); } + +void +release_aps(void *dummy __unused) +{ + s_unlock(&ap_boot_lock); +} + +SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 51de1ac..f32dfae 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -49,10 +49,12 @@ #include "opt_trap.h" #include +#include #include #include #include #include +#include #include #include #include @@ -76,12 +78,14 @@ #include #include #include +#include #include #ifdef SMP #include #endif #include +#include #include #ifdef POWERFAIL_NMI #include #include @@ -96,11 +100,14 @@ #include "isa.h" #include "npx.h" +#include + int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall2 __P((struct trapframe frame)); +extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); @@ -142,7 +149,7 @@ static char *trap_msg[] = { }; static __inline int userret __P((struct proc *p, struct trapframe *frame, - u_quad_t oticks, int have_mplock)); + u_quad_t oticks, int have_giant)); #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif @@ -158,18 +165,18 @@ SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); static __inline int -userret(p, frame, oticks, have_mplock) +userret(p, frame, oticks, have_giant) struct proc *p; struct trapframe *frame; u_quad_t oticks; - int have_mplock; + int have_giant; { int sig, s; while ((sig = CURSIG(p)) != 0) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } postsig(sig); } @@ -184,31 +191,34 @@ userret(p, frame, oticks, have_mplock) * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; - } s = splhigh(); + mtx_enter(&sched_lock, MTX_SPIN); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); + mtx_exit(&sched_lock, MTX_SPIN); splx(s); - while ((sig = CURSIG(p)) != 0) + while ((sig = CURSIG(p)) != 0) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } postsig(sig); + } } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } addupc_task(p, frame->tf_eip, (u_int)(p->p_sticks - oticks) * psratio); } curpriority = p->p_priority; - return(have_mplock); + return(have_giant); } /* @@ -226,13 +236,20 @@ trap(frame) u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; +#ifdef POWERFAIL_NMI + static int lastalert = 0; +#endif - if (!(frame.tf_eflags & PSL_I)) { + atomic_add_int(&cnt.v_trap, 1); + + if ((frame.tf_eflags & PSL_I) == 0) { /* - * Buggy application or kernel code has disabled interrupts - * and then trapped. Enabling interrupts now is wrong, but - * it is better than running with interrupts disabled until - * they are accidentally enabled later. + * Buggy application or kernel code has disabled + * interrupts and then trapped. Enabling interrupts + * now is wrong, but it is better than running with + * interrupts disabled until they are accidentally + * enabled later. XXX Consider whether this is still + * correct. 
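userret() above now threads a have_giant flag through in place of the old have_mplock: Giant is taken lazily, only on the paths that actually need it (signal delivery, profiling), and ownership is reported back so the caller can release it exactly once. The recurring idiom, reduced to a sketch:

	int have_giant = 0;

	/* the first path that needs Giant takes it, and remembers */
	if (have_giant == 0) {
		mtx_enter(&Giant, MTX_DEF);
		have_giant = 1;
	}
	/* ... Giant-protected work ... */

	/* a single release point once everything is done */
	if (have_giant)
		mtx_exit(&Giant, MTX_DEF);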
*/ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) @@ -252,54 +269,27 @@ trap(frame) eva = 0; if (frame.tf_trapno == T_PAGEFLT) { /* - * For some Cyrix CPUs, %cr2 is clobbered by interrupts. - * This problem is worked around by using an interrupt - * gate for the pagefault handler. We are finally ready - * to read %cr2 and then must reenable interrupts. - * - * XXX this should be in the switch statement, but the - * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the - * flow of control too much for this to be obviously - * correct. + * For some Cyrix CPUs, %cr2 is clobbered by + * interrupts. This problem is worked around by using + * an interrupt gate for the pagefault handler. We + * are finally ready to read %cr2 and then must + * reenable interrupts. */ eva = rcr2(); enable_intr(); - } + } + + mtx_enter(&Giant, MTX_DEF); #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif + type = frame.tf_trapno; code = frame.tf_err; - if (in_vm86call) { - if (frame.tf_eflags & PSL_VM && - (type == T_PROTFLT || type == T_STKFLT)) { - i = vm86_emulate((struct vm86frame *)&frame); - if (i != 0) - /* - * returns to original process - */ - vm86_trap((struct vm86frame *)&frame); - return; - } - switch (type) { - /* - * these traps want either a process context, or - * assume a normal userspace trap. - */ - case T_PROTFLT: - case T_SEGNPFLT: - trap_fatal(&frame, eva); - return; - case T_TRCTRAP: - type = T_BPTFLT; /* kernel breakpoint */ - /* FALL THROUGH */ - } - goto kernel_trap; /* normal kernel trap handling */ - } - - if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) { + if ((ISPL(frame.tf_cs) == SEL_UPL) || + ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ sticks = p->p_sticks; @@ -322,16 +312,6 @@ restart: i = SIGFPE; break; - case T_ASTFLT: /* Allow process switch */ - astoff(); - cnt.v_soft++; - if (p->p_flag & P_OWEUPC) { - p->p_flag &= ~P_OWEUPC; - addupc_task(p, p->p_stats->p_prof.pr_addr, - p->p_stats->p_prof.pr_ticks); - } - goto out; - /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle @@ -342,7 +322,7 @@ restart: if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i == 0) - goto out; + goto user; break; } /* FALL THROUGH */ @@ -357,14 +337,20 @@ restart: case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE, eva); - if (i == -1) - return; #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if (i == -2) + if (i == -2) { + /* + * f00f hack workaround has triggered, treat + * as illegal instruction not page fault. 
+ */ + frame.tf_trapno = T_PRIVINFLT; goto restart; + } #endif - if (i == 0) + if (i == -1) goto out; + if (i == 0) + goto user; ucode = T_PAGEFLT; break; @@ -377,7 +363,15 @@ restart: #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI - goto handle_powerfail; +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 +#endif + if (time_second - lastalert > 10) { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -391,7 +385,7 @@ restart: kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; @@ -410,9 +404,9 @@ restart: case T_DNA: #if NNPX > 0 - /* if a transparent fault (due to context switch "late") */ + /* transparent fault (due to context switch "late") */ if (npxdna()) - return; + goto out; #endif if (!pmath_emulate) { i = SIGFPE; @@ -422,7 +416,7 @@ restart: i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) - return; + goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } @@ -435,13 +429,12 @@ restart: break; } } else { -kernel_trap: /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE, eva); - return; + goto out; case T_DNA: #if NNPX > 0 @@ -451,31 +444,35 @@ kernel_trap: * registered such use. */ if (npxdna()) - return; + goto out; #endif break; - case T_PROTFLT: /* general protection fault */ - case T_SEGNPFLT: /* segment not present fault */ /* - * Invalid segment selectors and out of bounds - * %eip's and %esp's can be set up in user mode. - * This causes a fault in kernel mode when the - * kernel tries to return to user mode. We want - * to get this fault so that we can fix the - * problem here and not have to check all the - * selectors and pointers when the user changes - * them. + * The following two traps can happen in + * vm86 mode, and, if so, we want to handle + * them specially. */ -#define MAYBE_DORETI_FAULT(where, whereto) \ - do { \ - if (frame.tf_eip == (int)where) { \ - frame.tf_eip = (int)whereto; \ - return; \ - } \ - } while (0) - - if (intr_nesting_level == 0) { + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ + if (frame.tf_eflags & PSL_VM) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i != 0) + /* + * returns to original process + */ + vm86_trap((struct vm86frame *)&frame); + goto out; + } + /* FALL THROUGH */ + + case T_SEGNPFLT: /* segment not present fault */ + if (in_vm86call) + break; + + if (intr_nesting_level != 0) + break; + /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the @@ -488,20 +485,38 @@ kernel_trap: if (frame.tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; psignal(p, SIGBUS); - return; + goto out; + } + + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. 
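The MAYBE_DORETI_FAULT() macro disappears in the hunk just below because its hidden return would now bypass the mtx_exit(&Giant, ...) at the out: label; each fixup is open-coded so the redirect falls through to the common unlock path instead. Every fixup has the same shape:

	/*
	 * Redirect the faulting instruction to its recovery stub,
	 * then leave via out: so Giant is released exactly once.
	 */
	if (frame.tf_eip == (int)doreti_iret) {
		frame.tf_eip = (int)doreti_iret_fault;
		goto out;
	}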
+ */ + if (frame.tf_eip == (int)doreti_iret) { + frame.tf_eip = (int)doreti_iret_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_ds) { + frame.tf_eip = (int)doreti_popl_ds_fault; + goto out; + } + if (frame.tf_eip == (int)doreti_popl_es) { + frame.tf_eip = (int)doreti_popl_es_fault; + goto out; } - MAYBE_DORETI_FAULT(doreti_iret, - doreti_iret_fault); - MAYBE_DORETI_FAULT(doreti_popl_ds, - doreti_popl_ds_fault); - MAYBE_DORETI_FAULT(doreti_popl_es, - doreti_popl_es_fault); - MAYBE_DORETI_FAULT(doreti_popl_fs, - doreti_popl_fs_fault); + if (frame.tf_eip == (int)doreti_popl_fs) { + frame.tf_eip = (int)doreti_popl_fs_fault; + goto out; + } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; - return; - } + goto out; } break; @@ -517,7 +532,7 @@ kernel_trap: */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; - return; + goto out; } break; @@ -529,7 +544,7 @@ kernel_trap: * silently until the syscall handler has * saved the flags. */ - return; + goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* @@ -537,7 +552,7 @@ kernel_trap: * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; - return; + goto out; } /* * Ignore debug register trace traps due to @@ -549,13 +564,13 @@ kernel_trap: * in kernel space because that is useful when * debugging the kernel. */ - if (user_dbreg_trap()) { + if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); - return; + goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) @@ -567,28 +582,19 @@ kernel_trap: */ #ifdef DDB if (kdb_trap (type, 0, &frame)) - return; + goto out; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI -#ifndef TIMER_FREQ -# define TIMER_FREQ 1193182 -#endif - handle_powerfail: - { - static unsigned lastalert = 0; - - if(time_second - lastalert > 10) - { + if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; - } - return; } + goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { @@ -602,16 +608,16 @@ kernel_trap: kdb_trap (type, 0, &frame); } #endif /* DDB */ - return; + goto out; } else if (panic_on_nmi == 0) - return; + goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame, eva); - return; + goto out; } /* Translate fault for emulators (e.g. Linux) */ @@ -630,8 +636,10 @@ kernel_trap: } #endif -out: +user: userret(p, &frame, sticks, 1); +out: + mtx_exit(&Giant, MTX_DEF); } #ifdef notyet @@ -769,10 +777,8 @@ trap_pfault(frame, usermode, eva) * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) - if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) { - frame->tf_trapno = T_PRIVINFLT; + if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; - } #endif if (usermode) goto nogo; @@ -869,8 +875,7 @@ trap_fatal(frame, eva) frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? 
"user" : "kernel"); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -917,26 +922,6 @@ trap_fatal(frame, eva) } else { printf("Idle\n"); } - printf("interrupt mask = "); - if ((cpl & net_imask) == net_imask) - printf("net "); - if ((cpl & tty_imask) == tty_imask) - printf("tty "); - if ((cpl & bio_imask) == bio_imask) - printf("bio "); - if ((cpl & cam_imask) == cam_imask) - printf("cam "); - if (cpl == 0) - printf("none"); -#ifdef SMP -/** - * XXX FIXME: - * we probably SHOULD have stopped the other CPUs before now! - * another CPU COULD have been touching cpl at this moment... - */ - printf(" <- SMP: XXX"); -#endif - printf("\n"); #ifdef KDB if (kdb_trap(&psl)) @@ -973,8 +958,7 @@ dblfault_handler() printf("esp = 0x%x\n", common_tss.tss_esp); printf("ebp = 0x%x\n", common_tss.tss_ebp); #ifdef SMP - /* three seperate prints in case of a trap on an unmapped page */ - printf("mp_lock = %08x; ", mp_lock); + /* two seperate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", cpuid); printf("lapic.id = %08x\n", lapic.id); #endif @@ -1048,12 +1032,14 @@ syscall2(frame) int error; int narg; int args[8]; - int have_mplock = 0; + int have_giant = 0; u_int code; + atomic_add_int(&cnt.v_syscall, 1); + #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { - get_mplock(); + mtx_enter(&Giant, MTX_DEF); panic("syscall"); /* NOT REACHED */ } @@ -1075,9 +1061,9 @@ syscall2(frame) /* * The prep code is not MP aware. */ - get_mplock(); + mtx_enter(&Giant, MTX_DEF); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); - rel_mplock(); + mtx_exit(&Giant, MTX_DEF); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. @@ -1114,8 +1100,8 @@ syscall2(frame) */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); @@ -1129,15 +1115,15 @@ syscall2(frame) * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { - get_mplock(); - have_mplock = 1; + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsyscall(p->p_tracep, code, narg, args); } @@ -1192,9 +1178,9 @@ bad: * Traced syscall. trapsignal() is not MP aware. 
*/ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); @@ -1203,13 +1189,13 @@ bad: /* * Handle reschedule and other end-of-syscall issues */ - have_mplock = userret(p, &frame, sticks, have_mplock); + have_giant = userret(p, &frame, sticks, have_giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { - if (have_mplock == 0) { - get_mplock(); - have_mplock = 1; + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; } ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } @@ -1225,27 +1211,66 @@ bad: /* * Release the MP lock if we had to get it */ - if (have_mplock) - rel_mplock(); + if (have_giant) + mtx_exit(&Giant, MTX_DEF); + + mtx_assert(&sched_lock, MA_NOTOWNED); + mtx_assert(&Giant, MA_NOTOWNED); +} + +void +ast(frame) + struct trapframe frame; +{ + struct proc *p = CURPROC; + u_quad_t sticks; + + /* + * handle atomicity by looping since interrupts are enabled and the + * MP lock is not held. + */ + sticks = ((volatile struct proc *)p)->p_sticks; + while (sticks != ((volatile struct proc *)p)->p_sticks) + sticks = ((volatile struct proc *)p)->p_sticks; + + astoff(); + atomic_add_int(&cnt.v_soft, 1); + if (p->p_flag & P_OWEUPC) { + mtx_enter(&Giant, MTX_DEF); + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); + } + if (userret(p, &frame, sticks, mtx_owned(&Giant)) != 0) + mtx_exit(&Giant, MTX_DEF); } /* * Simplified back end of syscall(), used when returning from fork() - * directly into user mode. MP lock is held on entry and should be - * held on return. + * directly into user mode. Giant is not held on entry, and must not + * be held on return. */ void fork_return(p, frame) struct proc *p; struct trapframe frame; { + int have_giant; + frame.tf_eax = 0; /* Child returns zero */ frame.tf_eflags &= ~PSL_C; /* success */ frame.tf_edx = 1; - userret(p, &frame, 0, 1); + have_giant = userret(p, &frame, 0, mtx_owned(&Giant)); #ifdef KTRACE - if (KTRPOINT(p, KTR_SYSRET)) + if (KTRPOINT(p, KTR_SYSRET)) { + if (have_giant == 0) { + mtx_enter(&Giant, MTX_DEF); + have_giant = 1; + } ktrsysret(p->p_tracep, SYS_fork, 0, 0); + } #endif + if (have_giant) + mtx_exit(&Giant, MTX_DEF); } diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c new file mode 100644 index 0000000..1ac3f58 --- /dev/null +++ b/sys/kern/subr_turnstile.c @@ -0,0 +1,799 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * $FreeBSD$ + */ + +/* + * Main Entry: witness + * Pronunciation: 'wit-n&s + * Function: noun + * Etymology: Middle English witnesse, from Old English witnes knowledge, + * testimony, witness, from 2wit + * Date: before 12th century + * 1 : attestation of a fact or event : TESTIMONY + * 2 : one that gives evidence; specifically : one who testifies in + * a cause or before a judicial tribunal + * 3 : one asked to be present at a transaction so as to be able to + * testify to its having taken place + * 4 : one who has personal knowledge of something + * 5 a : something serving as evidence or proof : SIGN + * b : public affirmation by word or example of usually + * religious faith or conviction + * 6 capitalized : a member of the Jehovah's Witnesses + */ + +#include +#include +#include +#include + +#include +#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */ +#include + +/* + * The non-inlined versions of the mtx_*() functions are always built (above), + * but the witness code depends on the SMP_DEBUG and WITNESS kernel options + * being specified. + */ +#if (defined(SMP_DEBUG) && defined(WITNESS)) + +#define WITNESS_COUNT 200 +#define WITNESS_NCHILDREN 2 + +#ifndef WITNESS +#define WITNESS 0 /* default off */ +#endif + +#ifndef SMP +extern int witness_spin_check; +#endif + +int witness_watch; + +typedef struct witness { + struct witness *w_next; + char *w_description; + char *w_file; + int w_line; + struct witness *w_morechildren; + u_char w_childcnt; + u_char w_Giant_squawked:1; + u_char w_other_squawked:1; + u_char w_same_squawked:1; + u_char w_sleep:1; + u_char w_spin:1; /* this is a spin mutex */ + u_int w_level; + struct witness *w_children[WITNESS_NCHILDREN]; +} witness_t; + +typedef struct witness_blessed { + char *b_lock1; + char *b_lock2; +} witness_blessed_t; + +#ifdef KDEBUG +/* + * When WITNESS_KDEBUG is set to 1, it will cause the system to + * drop into kdebug() when: + * - a lock hierarchy violation occurs + * - locks are held when going to sleep. 
+ */ +#ifndef WITNESS_KDEBUG +#define WITNESS_KDEBUG 0 +#endif +int witness_kdebug = WITNESS_KDEBUG; +#endif /* KDEBUG */ + +#ifndef WITNESS_SKIPSPIN +#define WITNESS_SKIPSPIN 0 +#endif +int witness_skipspin = WITNESS_SKIPSPIN; + + +static mtx_t w_mtx; +static witness_t *w_free; +static witness_t *w_all; +static int w_inited; +static int witness_dead; /* fatal error, probably no memory */ + +static witness_t w_data[WITNESS_COUNT]; + +static witness_t *enroll __P((char *description, int flag)); +static int itismychild __P((witness_t *parent, witness_t *child)); +static void removechild __P((witness_t *parent, witness_t *child)); +static int isitmychild __P((witness_t *parent, witness_t *child)); +static int isitmydescendant __P((witness_t *parent, witness_t *child)); +static int dup_ok __P((witness_t *)); +static int blessed __P((witness_t *, witness_t *)); +static void witness_displaydescendants + __P((void(*)(const char *fmt, ...), witness_t *)); +static void witness_leveldescendents __P((witness_t *parent, int level)); +static void witness_levelall __P((void)); +static witness_t * witness_get __P((void)); +static void witness_free __P((witness_t *m)); + + +static char *ignore_list[] = { + "witness lock", + "Kdebug", /* breaks rules and may or may not work */ + "Page Alias", /* sparc only, witness lock won't block intr */ + NULL +}; + +static char *spin_order_list[] = { + "sched lock", + "log mtx", + "zslock", /* sparc only above log, this one is a real hack */ + "time lock", /* above callout */ + "callout mtx", /* above wayout */ + /* + * leaf locks + */ + "wayout mtx", + "kernel_pmap", /* sparc only, logically equal "pmap" below */ + "pmap", /* sparc only */ + NULL +}; + +static char *order_list[] = { + "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL, + "udb", "inp", NULL, + "unp head", "unp", "so_snd", NULL, + "de0", "Giant lock", NULL, + "ifnet", "Giant lock", NULL, + "fifo", "so_snd", NULL, + "hme0", "Giant lock", NULL, + "esp0", "Giant lock", NULL, + "hfa0", "Giant lock", NULL, + "so_rcv", "atm_global", NULL, + "so_snd", "atm_global", NULL, + "NFS", "Giant lock", NULL, + NULL +}; + +static char *dup_list[] = { + "inp", + "process group", + "session", + "unp", + "rtentry", + "rawcb", + NULL +}; + +static char *sleep_list[] = { + "Giant lock", + NULL +}; + +/* + * Pairs of locks which have been blessed + * Don't complain about order problems with blessed locks + */ +static witness_blessed_t blessed_list[] = { +}; +static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t); + +void +witness_init(mtx_t *m, int flag) +{ + m->mtx_witness = enroll(m->mtx_description, flag); +} + +void +witness_destroy(mtx_t *m) +{ + mtx_t *m1; + struct proc *p; + p = CURPROC; + for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL; + m1 = LIST_NEXT(m1, mtx_held)) { + if (m1 == m) { + LIST_REMOVE(m, mtx_held); + break; + } + } + return; + +} + +void +witness_enter(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w, *w1; + mtx_t *m1; + struct proc *p; + int i; +#ifdef KDEBUG + int go_into_kdebug = 0; +#endif /* KDEBUG */ + + w = m->mtx_witness; + p = CURPROC; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + i = witness_spin_check; + if (i != 0 && w->w_level < i) { + mtx_exit(&w_mtx, MTX_SPIN); + panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d" + " already holding %s:%x", + m->mtx_description, w->w_level, file, 
line, + spin_order_list[ffs(i)-1], i); + } + PCPU_SET(witness_spin_check, i | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + if (witness_dead) + goto out; + if (cold) + goto out; + + if (!mtx_legal2block()) + panic("blockable mtx_enter() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + /* + * Is this the first mutex acquired + */ + if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) + goto out; + + + if ((w1 = m1->mtx_witness) == w) { + if (w->w_same_squawked || dup_ok(w)) + goto out; + w->w_same_squawked = 1; + printf("acquiring duplicate lock of same type: \"%s\"\n", + m->mtx_description); + printf(" 1st @ %s:%d\n", w->w_file, w->w_line); + printf(" 2nd @ %s:%d\n", file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + MPASS(!mtx_owned(&w_mtx)); + mtx_enter(&w_mtx, MTX_SPIN); + /* + * If we have a known higher number just say ok + */ + if (witness_watch > 1 && w->w_level > w1->w_level) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + if (isitmydescendant(m1->mtx_witness, w)) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { + + ASS(i < 200); + w1 = m1->mtx_witness; + if (isitmydescendant(w, w1)) { + mtx_exit(&w_mtx, MTX_SPIN); + if (blessed(w, w1)) + goto out; + if (m1 == &Giant) { + if (w1->w_Giant_squawked) + goto out; + else + w1->w_Giant_squawked = 1; + } else { + if (w1->w_other_squawked) + goto out; + else + w1->w_other_squawked = 1; + } + printf("lock order reversal\n"); + printf(" 1st %s last acquired @ %s:%d\n", + w->w_description, w->w_file, w->w_line); + printf(" 2nd %p %s @ %s:%d\n", + m1, w1->w_description, w1->w_file, w1->w_line); + printf(" 3rd %p %s @ %s:%d\n", + m, w->w_description, file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + } + m1 = LIST_FIRST(&p->p_heldmtx); + if (!itismychild(m1->mtx_witness, w)) + mtx_exit(&w_mtx, MTX_SPIN); + +out: +#ifdef KDEBUG + if (witness_kdebug && go_into_kdebug) + kdebug(); +#endif /* KDEBUG */ + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + + /* + * If this pays off it likely means that a mutex being witnessed + * is acquired in hardclock. Put it in the ignore list. It is + * likely not the mutex this assert fails on. 
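The heart of witness_enter() above is the walk over p_heldmtx: if the witness being acquired is recorded as an ancestor of one already held, some other code path must have taken the two locks in the opposite order, and both acquisition sites are printed. Distilled (simplified from the loop above; the real code also handles blessed pairs and the squawk-once flags):

	for (m1 = LIST_FIRST(&p->p_heldmtx); m1 != NULL;
	    m1 = LIST_NEXT(m1, mtx_held)) {
		/*
		 * A held lock is ordered after the one being taken,
		 * so the order has been seen reversed elsewhere.
		 */
		if (isitmydescendant(w, m1->mtx_witness)) {
			printf("lock order reversal\n");
			break;
		}
	}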
+ */ + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_exit(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w; + + w = m->mtx_witness; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) + panic("switchable mtx_exit() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + LIST_REMOVE(m, mtx_held); + m->mtx_held.le_prev = NULL; +} + +void +witness_try_enter(mtx_t *m, int flags, char *file, int line) +{ + struct proc *p; + witness_t *w = m->mtx_witness; + + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_try_enter: " + "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + + if (w->w_spin) + panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + p = CURPROC; + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_display(void(*prnt)(const char *fmt, ...)) +{ + witness_t *w, *w1; + + witness_levelall(); + + for (w = w_all; w; w = w->w_next) { + if (w->w_file == NULL) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + /* + * This lock has no ancestors, display its descendants. + */ + witness_displaydescendants(prnt, w); + } + prnt("\nMutexes which were never acquired\n"); + for (w = w_all; w; w = w->w_next) { + if (w->w_file != NULL) + continue; + prnt("%s\n", w->w_description); + } +} + +int +witness_sleep(int check_only, mtx_t *mtx, char *file, int line) +{ + mtx_t *m; + struct proc *p; + char **sleep; + int n = 0; + + p = CURPROC; + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + if (m == mtx) + continue; + for (sleep = sleep_list; *sleep != NULL; sleep++) + if (strcmp(m->mtx_description, *sleep) == 0) + goto next; + printf("%s:%d: %s with \"%s\" locked from %s:%d\n", + file, line, check_only ? 
"could sleep" : "sleeping", + m->mtx_description, + m->mtx_witness->w_file, m->mtx_witness->w_line); + n++; + next: + } +#ifdef KDEBUG + if (witness_kdebug && n) + kdebug(); +#endif /* KDEBUG */ + return (n); +} + +static witness_t * +enroll(char *description, int flag) +{ + int i; + witness_t *w, *w1; + char **ignore; + char **order; + + if (!witness_watch) + return (NULL); + for (ignore = ignore_list; *ignore != NULL; ignore++) + if (strcmp(description, *ignore) == 0) + return (NULL); + + if (w_inited == 0) { + mtx_init(&w_mtx, "witness lock", MTX_DEF); + for (i = 0; i < WITNESS_COUNT; i++) { + w = &w_data[i]; + witness_free(w); + } + w_inited = 1; + for (order = order_list; *order != NULL; order++) { + w = enroll(*order, MTX_DEF); + w->w_file = "order list"; + for (order++; *order != NULL; order++) { + w1 = enroll(*order, MTX_DEF); + w1->w_file = "order list"; + itismychild(w, w1); + w = w1; + } + } + } + if ((flag & MTX_SPIN) && witness_skipspin) + return (NULL); + mtx_enter(&w_mtx, MTX_SPIN); + for (w = w_all; w; w = w->w_next) { + if (strcmp(description, w->w_description) == 0) { + mtx_exit(&w_mtx, MTX_SPIN); + return (w); + } + } + if ((w = witness_get()) == NULL) + return (NULL); + w->w_next = w_all; + w_all = w; + w->w_description = description; + mtx_exit(&w_mtx, MTX_SPIN); + if (flag & MTX_SPIN) { + w->w_spin = 1; + + i = 1; + for (order = spin_order_list; *order != NULL; order++) { + if (strcmp(description, *order) == 0) + break; + i <<= 1; + } + if (*order == NULL) + panic("spin lock %s not in order list", description); + w->w_level = i; + } + return (w); +} + +static int +itismychild(witness_t *parent, witness_t *child) +{ + static int recursed; + + /* + * Insert "child" after "parent" + */ + while (parent->w_morechildren) + parent = parent->w_morechildren; + + if (parent->w_childcnt == WITNESS_NCHILDREN) { + if ((parent->w_morechildren = witness_get()) == NULL) + return (1); + parent = parent->w_morechildren; + } + ASS(child != NULL); + parent->w_children[parent->w_childcnt++] = child; + /* + * now prune whole tree + */ + if (recursed) + return (0); + recursed = 1; + for (child = w_all; child != NULL; child = child->w_next) { + for (parent = w_all; parent != NULL; + parent = parent->w_next) { + if (!isitmychild(parent, child)) + continue; + removechild(parent, child); + if (isitmydescendant(parent, child)) + continue; + itismychild(parent, child); + } + } + recursed = 0; + witness_levelall(); + return (0); +} + +static void +removechild(witness_t *parent, witness_t *child) +{ + witness_t *w, *w1; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + if (w->w_children[i] == child) + goto found; + return; +found: + for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) + continue; + w->w_children[i] = w1->w_children[--w1->w_childcnt]; + ASS(w->w_children[i] != NULL); + + if (w1->w_childcnt != 0) + return; + + if (w1 == parent) + return; + for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) + continue; + w->w_morechildren = 0; + witness_free(w1); +} + +static int +isitmychild(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) { + for (i = 0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + } + return (0); +} + +static int +isitmydescendant(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + int j; + + for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { + ASS(j < 1000); + for (i = 
0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + for (i = 0; i < w->w_childcnt; i++) { + if (isitmydescendant(w->w_children[i], child)) + return (1); + } + } + return (0); +} + +void +witness_levelall (void) +{ + witness_t *w, *w1; + + for (w = w_all; w; w = w->w_next) + if (!w->w_spin) + w->w_level = 0; + for (w = w_all; w; w = w->w_next) { + if (w->w_spin) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + witness_leveldescendents(w, 0); + } +} + +static void +witness_leveldescendents(witness_t *parent, int level) +{ + int i; + witness_t *w; + + if (parent->w_level < level) + parent->w_level = level; + level++; + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_leveldescendents(w->w_children[i], level); +} + +static void +witness_displaydescendants(void(*prnt)(const char *fmt, ...), witness_t *parent) +{ + witness_t *w; + int i; + int level = parent->w_level; + + prnt("%d", level); + if (level < 10) + prnt(" "); + for (i = 0; i < level; i++) + prnt(" "); + prnt("%s", parent->w_description); + if (parent->w_file != NULL) { + prnt(" -- last acquired @ %s", parent->w_file); +#ifndef W_USE_WHERE + prnt(":%d", parent->w_line); +#endif + prnt("\n"); + } + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_displaydescendants(prnt, w->w_children[i]); + } + +static int +dup_ok(witness_t *w) +{ + char **dup; + + for (dup = dup_list; *dup!= NULL; dup++) + if (strcmp(w->w_description, *dup) == 0) + return (1); + return (0); +} + +static int +blessed(witness_t *w1, witness_t *w2) +{ + int i; + witness_blessed_t *b; + + for (i = 0; i < blessed_count; i++) { + b = &blessed_list[i]; + if (strcmp(w1->w_description, b->b_lock1) == 0) { + if (strcmp(w2->w_description, b->b_lock2) == 0) + return (1); + continue; + } + if (strcmp(w1->w_description, b->b_lock2) == 0) + if (strcmp(w2->w_description, b->b_lock1) == 0) + return (1); + } + return (0); +} + +static witness_t * +witness_get() +{ + witness_t *w; + + if ((w = w_free) == NULL) { + witness_dead = 1; + mtx_exit(&w_mtx, MTX_SPIN); + printf("witness exhausted\n"); + return (NULL); + } + w_free = w->w_next; + bzero(w, sizeof (*w)); + return (w); +} + +static void +witness_free(witness_t *w) +{ + w->w_next = w_free; + w_free = w; +} + +void +witness_list(struct proc *p) +{ + mtx_t *m; + + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + printf("\t\"%s\" (%p) locked at %s:%d\n", + m->mtx_description, m, + m->mtx_witness->w_file, m->mtx_witness->w_line); + } +} + +void +witness_save(mtx_t *m, char **filep, int *linep) +{ + *filep = m->mtx_witness->w_file; + *linep = m->mtx_witness->w_line; +} + +void +witness_restore(mtx_t *m, char *file, int line) +{ + m->mtx_witness->w_file = file; + m->mtx_witness->w_line = line; +} + +#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */ diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c new file mode 100644 index 0000000..1ac3f58 --- /dev/null +++ b/sys/kern/subr_witness.c @@ -0,0 +1,799 @@ +/*- + * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Berkeley Software Design Inc's name may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ + * $FreeBSD$ + */ + +/* + * Main Entry: witness + * Pronunciation: 'wit-n&s + * Function: noun + * Etymology: Middle English witnesse, from Old English witnes knowledge, + * testimony, witness, from 2wit + * Date: before 12th century + * 1 : attestation of a fact or event : TESTIMONY + * 2 : one that gives evidence; specifically : one who testifies in + * a cause or before a judicial tribunal + * 3 : one asked to be present at a transaction so as to be able to + * testify to its having taken place + * 4 : one who has personal knowledge of something + * 5 a : something serving as evidence or proof : SIGN + * b : public affirmation by word or example of usually + * religious faith or conviction + * 6 capitalized : a member of the Jehovah's Witnesses + */ + +#include +#include +#include +#include + +#include +#define _KERN_MUTEX_C_ /* Cause non-inlined mtx_*() to be compiled. */ +#include + +/* + * The non-inlined versions of the mtx_*() functions are always built (above), + * but the witness code depends on the SMP_DEBUG and WITNESS kernel options + * being specified. + */ +#if (defined(SMP_DEBUG) && defined(WITNESS)) + +#define WITNESS_COUNT 200 +#define WITNESS_NCHILDREN 2 + +#ifndef WITNESS +#define WITNESS 0 /* default off */ +#endif + +#ifndef SMP +extern int witness_spin_check; +#endif + +int witness_watch; + +typedef struct witness { + struct witness *w_next; + char *w_description; + char *w_file; + int w_line; + struct witness *w_morechildren; + u_char w_childcnt; + u_char w_Giant_squawked:1; + u_char w_other_squawked:1; + u_char w_same_squawked:1; + u_char w_sleep:1; + u_char w_spin:1; /* this is a spin mutex */ + u_int w_level; + struct witness *w_children[WITNESS_NCHILDREN]; +} witness_t; + +typedef struct witness_blessed { + char *b_lock1; + char *b_lock2; +} witness_blessed_t; + +#ifdef KDEBUG +/* + * When WITNESS_KDEBUG is set to 1, it will cause the system to + * drop into kdebug() when: + * - a lock hierarchy violation occurs + * - locks are held when going to sleep. 
+ */ +#ifndef WITNESS_KDEBUG +#define WITNESS_KDEBUG 0 +#endif +int witness_kdebug = WITNESS_KDEBUG; +#endif /* KDEBUG */ + +#ifndef WITNESS_SKIPSPIN +#define WITNESS_SKIPSPIN 0 +#endif +int witness_skipspin = WITNESS_SKIPSPIN; + + +static mtx_t w_mtx; +static witness_t *w_free; +static witness_t *w_all; +static int w_inited; +static int witness_dead; /* fatal error, probably no memory */ + +static witness_t w_data[WITNESS_COUNT]; + +static witness_t *enroll __P((char *description, int flag)); +static int itismychild __P((witness_t *parent, witness_t *child)); +static void removechild __P((witness_t *parent, witness_t *child)); +static int isitmychild __P((witness_t *parent, witness_t *child)); +static int isitmydescendant __P((witness_t *parent, witness_t *child)); +static int dup_ok __P((witness_t *)); +static int blessed __P((witness_t *, witness_t *)); +static void witness_displaydescendants + __P((void(*)(const char *fmt, ...), witness_t *)); +static void witness_leveldescendents __P((witness_t *parent, int level)); +static void witness_levelall __P((void)); +static witness_t * witness_get __P((void)); +static void witness_free __P((witness_t *m)); + + +static char *ignore_list[] = { + "witness lock", + "Kdebug", /* breaks rules and may or may not work */ + "Page Alias", /* sparc only, witness lock won't block intr */ + NULL +}; + +static char *spin_order_list[] = { + "sched lock", + "log mtx", + "zslock", /* sparc only above log, this one is a real hack */ + "time lock", /* above callout */ + "callout mtx", /* above wayout */ + /* + * leaf locks + */ + "wayout mtx", + "kernel_pmap", /* sparc only, logically equal "pmap" below */ + "pmap", /* sparc only */ + NULL +}; + +static char *order_list[] = { + "tcb", "inp", "so_snd", "so_rcv", "Giant lock", NULL, + "udb", "inp", NULL, + "unp head", "unp", "so_snd", NULL, + "de0", "Giant lock", NULL, + "ifnet", "Giant lock", NULL, + "fifo", "so_snd", NULL, + "hme0", "Giant lock", NULL, + "esp0", "Giant lock", NULL, + "hfa0", "Giant lock", NULL, + "so_rcv", "atm_global", NULL, + "so_snd", "atm_global", NULL, + "NFS", "Giant lock", NULL, + NULL +}; + +static char *dup_list[] = { + "inp", + "process group", + "session", + "unp", + "rtentry", + "rawcb", + NULL +}; + +static char *sleep_list[] = { + "Giant lock", + NULL +}; + +/* + * Pairs of locks which have been blessed + * Don't complain about order problems with blessed locks + */ +static witness_blessed_t blessed_list[] = { +}; +static int blessed_count = sizeof (blessed_list) / sizeof (witness_blessed_t); + +void +witness_init(mtx_t *m, int flag) +{ + m->mtx_witness = enroll(m->mtx_description, flag); +} + +void +witness_destroy(mtx_t *m) +{ + mtx_t *m1; + struct proc *p; + p = CURPROC; + for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL; + m1 = LIST_NEXT(m1, mtx_held)) { + if (m1 == m) { + LIST_REMOVE(m, mtx_held); + break; + } + } + return; + +} + +void +witness_enter(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w, *w1; + mtx_t *m1; + struct proc *p; + int i; +#ifdef KDEBUG + int go_into_kdebug = 0; +#endif /* KDEBUG */ + + w = m->mtx_witness; + p = CURPROC; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + i = witness_spin_check; + if (i != 0 && w->w_level < i) { + mtx_exit(&w_mtx, MTX_SPIN); + panic("mutex_enter(%s:%x, MTX_SPIN) out of order @ %s:%d" + " already holding %s:%x", + m->mtx_description, w->w_level, file, 
line, + spin_order_list[ffs(i)-1], i); + } + PCPU_SET(witness_spin_check, i | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + if (witness_dead) + goto out; + if (cold) + goto out; + + if (!mtx_legal2block()) + panic("blockable mtx_enter() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + /* + * Is this the first mutex acquired + */ + if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) + goto out; + + + if ((w1 = m1->mtx_witness) == w) { + if (w->w_same_squawked || dup_ok(w)) + goto out; + w->w_same_squawked = 1; + printf("acquiring duplicate lock of same type: \"%s\"\n", + m->mtx_description); + printf(" 1st @ %s:%d\n", w->w_file, w->w_line); + printf(" 2nd @ %s:%d\n", file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + MPASS(!mtx_owned(&w_mtx)); + mtx_enter(&w_mtx, MTX_SPIN); + /* + * If we have a known higher number just say ok + */ + if (witness_watch > 1 && w->w_level > w1->w_level) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + if (isitmydescendant(m1->mtx_witness, w)) { + mtx_exit(&w_mtx, MTX_SPIN); + goto out; + } + for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { + + ASS(i < 200); + w1 = m1->mtx_witness; + if (isitmydescendant(w, w1)) { + mtx_exit(&w_mtx, MTX_SPIN); + if (blessed(w, w1)) + goto out; + if (m1 == &Giant) { + if (w1->w_Giant_squawked) + goto out; + else + w1->w_Giant_squawked = 1; + } else { + if (w1->w_other_squawked) + goto out; + else + w1->w_other_squawked = 1; + } + printf("lock order reversal\n"); + printf(" 1st %s last acquired @ %s:%d\n", + w->w_description, w->w_file, w->w_line); + printf(" 2nd %p %s @ %s:%d\n", + m1, w1->w_description, w1->w_file, w1->w_line); + printf(" 3rd %p %s @ %s:%d\n", + m, w->w_description, file, line); +#ifdef KDEBUG + go_into_kdebug = 1; +#endif /* KDEBUG */ + goto out; + } + } + m1 = LIST_FIRST(&p->p_heldmtx); + if (!itismychild(m1->mtx_witness, w)) + mtx_exit(&w_mtx, MTX_SPIN); + +out: +#ifdef KDEBUG + if (witness_kdebug && go_into_kdebug) + kdebug(); +#endif /* KDEBUG */ + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + + /* + * If this pays off it likely means that a mutex being witnessed + * is acquired in hardclock. Put it in the ignore list. It is + * likely not the mutex this assert fails on. 
+ */ + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_exit(mtx_t *m, int flags, char *file, int line) +{ + witness_t *w; + + w = m->mtx_witness; + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + if (w->w_spin) + panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) + panic("switchable mtx_exit() of %s when not legal @ %s:%d", + m->mtx_description, file, line); + LIST_REMOVE(m, mtx_held); + m->mtx_held.le_prev = NULL; +} + +void +witness_try_enter(mtx_t *m, int flags, char *file, int line) +{ + struct proc *p; + witness_t *w = m->mtx_witness; + + + if (flags & MTX_SPIN) { + if (!w->w_spin) + panic("mutex_try_enter: " + "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", + m->mtx_description, file, line); + if (m->mtx_recurse != 0) + return; + mtx_enter(&w_mtx, MTX_SPIN); + PCPU_SET(witness_spin_check, witness_spin_check | w->w_level); + mtx_exit(&w_mtx, MTX_SPIN); + return; + } + + if (w->w_spin) + panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", + m->mtx_description, file, line); + + if (m->mtx_recurse != 0) + return; + + w->w_file = file; + w->w_line = line; + m->mtx_line = line; + m->mtx_file = file; + p = CURPROC; + ASS(m->mtx_held.le_prev == NULL); + LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); +} + +void +witness_display(void(*prnt)(const char *fmt, ...)) +{ + witness_t *w, *w1; + + witness_levelall(); + + for (w = w_all; w; w = w->w_next) { + if (w->w_file == NULL) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + /* + * This lock has no ancestors, display its descendants. + */ + witness_displaydescendants(prnt, w); + } + prnt("\nMutexes which were never acquired\n"); + for (w = w_all; w; w = w->w_next) { + if (w->w_file != NULL) + continue; + prnt("%s\n", w->w_description); + } +} + +int +witness_sleep(int check_only, mtx_t *mtx, char *file, int line) +{ + mtx_t *m; + struct proc *p; + char **sleep; + int n = 0; + + p = CURPROC; + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + if (m == mtx) + continue; + for (sleep = sleep_list; *sleep != NULL; sleep++) + if (strcmp(m->mtx_description, *sleep) == 0) + goto next; + printf("%s:%d: %s with \"%s\" locked from %s:%d\n", + file, line, check_only ? 
"could sleep" : "sleeping", + m->mtx_description, + m->mtx_witness->w_file, m->mtx_witness->w_line); + n++; + next: + } +#ifdef KDEBUG + if (witness_kdebug && n) + kdebug(); +#endif /* KDEBUG */ + return (n); +} + +static witness_t * +enroll(char *description, int flag) +{ + int i; + witness_t *w, *w1; + char **ignore; + char **order; + + if (!witness_watch) + return (NULL); + for (ignore = ignore_list; *ignore != NULL; ignore++) + if (strcmp(description, *ignore) == 0) + return (NULL); + + if (w_inited == 0) { + mtx_init(&w_mtx, "witness lock", MTX_DEF); + for (i = 0; i < WITNESS_COUNT; i++) { + w = &w_data[i]; + witness_free(w); + } + w_inited = 1; + for (order = order_list; *order != NULL; order++) { + w = enroll(*order, MTX_DEF); + w->w_file = "order list"; + for (order++; *order != NULL; order++) { + w1 = enroll(*order, MTX_DEF); + w1->w_file = "order list"; + itismychild(w, w1); + w = w1; + } + } + } + if ((flag & MTX_SPIN) && witness_skipspin) + return (NULL); + mtx_enter(&w_mtx, MTX_SPIN); + for (w = w_all; w; w = w->w_next) { + if (strcmp(description, w->w_description) == 0) { + mtx_exit(&w_mtx, MTX_SPIN); + return (w); + } + } + if ((w = witness_get()) == NULL) + return (NULL); + w->w_next = w_all; + w_all = w; + w->w_description = description; + mtx_exit(&w_mtx, MTX_SPIN); + if (flag & MTX_SPIN) { + w->w_spin = 1; + + i = 1; + for (order = spin_order_list; *order != NULL; order++) { + if (strcmp(description, *order) == 0) + break; + i <<= 1; + } + if (*order == NULL) + panic("spin lock %s not in order list", description); + w->w_level = i; + } + return (w); +} + +static int +itismychild(witness_t *parent, witness_t *child) +{ + static int recursed; + + /* + * Insert "child" after "parent" + */ + while (parent->w_morechildren) + parent = parent->w_morechildren; + + if (parent->w_childcnt == WITNESS_NCHILDREN) { + if ((parent->w_morechildren = witness_get()) == NULL) + return (1); + parent = parent->w_morechildren; + } + ASS(child != NULL); + parent->w_children[parent->w_childcnt++] = child; + /* + * now prune whole tree + */ + if (recursed) + return (0); + recursed = 1; + for (child = w_all; child != NULL; child = child->w_next) { + for (parent = w_all; parent != NULL; + parent = parent->w_next) { + if (!isitmychild(parent, child)) + continue; + removechild(parent, child); + if (isitmydescendant(parent, child)) + continue; + itismychild(parent, child); + } + } + recursed = 0; + witness_levelall(); + return (0); +} + +static void +removechild(witness_t *parent, witness_t *child) +{ + witness_t *w, *w1; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + if (w->w_children[i] == child) + goto found; + return; +found: + for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) + continue; + w->w_children[i] = w1->w_children[--w1->w_childcnt]; + ASS(w->w_children[i] != NULL); + + if (w1->w_childcnt != 0) + return; + + if (w1 == parent) + return; + for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) + continue; + w->w_morechildren = 0; + witness_free(w1); +} + +static int +isitmychild(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + + for (w = parent; w != NULL; w = w->w_morechildren) { + for (i = 0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + } + return (0); +} + +static int +isitmydescendant(witness_t *parent, witness_t *child) +{ + witness_t *w; + int i; + int j; + + for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { + ASS(j < 1000); + for (i = 
0; i < w->w_childcnt; i++) { + if (w->w_children[i] == child) + return (1); + } + for (i = 0; i < w->w_childcnt; i++) { + if (isitmydescendant(w->w_children[i], child)) + return (1); + } + } + return (0); +} + +void +witness_levelall (void) +{ + witness_t *w, *w1; + + for (w = w_all; w; w = w->w_next) + if (!w->w_spin) + w->w_level = 0; + for (w = w_all; w; w = w->w_next) { + if (w->w_spin) + continue; + for (w1 = w_all; w1; w1 = w1->w_next) { + if (isitmychild(w1, w)) + break; + } + if (w1 != NULL) + continue; + witness_leveldescendents(w, 0); + } +} + +static void +witness_leveldescendents(witness_t *parent, int level) +{ + int i; + witness_t *w; + + if (parent->w_level < level) + parent->w_level = level; + level++; + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_leveldescendents(w->w_children[i], level); +} + +static void +witness_displaydescendants(void(*prnt)(const char *fmt, ...), witness_t *parent) +{ + witness_t *w; + int i; + int level = parent->w_level; + + prnt("%d", level); + if (level < 10) + prnt(" "); + for (i = 0; i < level; i++) + prnt(" "); + prnt("%s", parent->w_description); + if (parent->w_file != NULL) { + prnt(" -- last acquired @ %s", parent->w_file); +#ifndef W_USE_WHERE + prnt(":%d", parent->w_line); +#endif + prnt("\n"); + } + + for (w = parent; w != NULL; w = w->w_morechildren) + for (i = 0; i < w->w_childcnt; i++) + witness_displaydescendants(prnt, w->w_children[i]); + } + +static int +dup_ok(witness_t *w) +{ + char **dup; + + for (dup = dup_list; *dup!= NULL; dup++) + if (strcmp(w->w_description, *dup) == 0) + return (1); + return (0); +} + +static int +blessed(witness_t *w1, witness_t *w2) +{ + int i; + witness_blessed_t *b; + + for (i = 0; i < blessed_count; i++) { + b = &blessed_list[i]; + if (strcmp(w1->w_description, b->b_lock1) == 0) { + if (strcmp(w2->w_description, b->b_lock2) == 0) + return (1); + continue; + } + if (strcmp(w1->w_description, b->b_lock2) == 0) + if (strcmp(w2->w_description, b->b_lock1) == 0) + return (1); + } + return (0); +} + +static witness_t * +witness_get() +{ + witness_t *w; + + if ((w = w_free) == NULL) { + witness_dead = 1; + mtx_exit(&w_mtx, MTX_SPIN); + printf("witness exhausted\n"); + return (NULL); + } + w_free = w->w_next; + bzero(w, sizeof (*w)); + return (w); +} + +static void +witness_free(witness_t *w) +{ + w->w_next = w_free; + w_free = w; +} + +void +witness_list(struct proc *p) +{ + mtx_t *m; + + for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL; + m = LIST_NEXT(m, mtx_held)) { + printf("\t\"%s\" (%p) locked at %s:%d\n", + m->mtx_description, m, + m->mtx_witness->w_file, m->mtx_witness->w_line); + } +} + +void +witness_save(mtx_t *m, char **filep, int *linep) +{ + *filep = m->mtx_witness->w_file; + *linep = m->mtx_witness->w_line; +} + +void +witness_restore(mtx_t *m, char *file, int line) +{ + m->mtx_witness->w_file = file; + m->mtx_witness->w_line = line; +} + +#endif /* (defined(SMP_DEBUG) && defined(WITNESS)) */ diff --git a/sys/kern/tty.c b/sys/kern/tty.c index 29b6288..87fb980 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -2266,7 +2266,8 @@ ttyinfo(tp) tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT; ttyprintf(tp, "%d%% %ldk\n", tmp / 100, - pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 : + pick->p_stat == SIDL || pick->p_stat == SWAIT || + pick->p_stat == SZOMB ? 
0 : (long)pgtok(vmspace_resident_count(pick->p_vmspace))); } tp->t_rocount = 0; /* so pending input will be retyped if BS */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 99c0754..34cff17 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ #include #include +#include + static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ @@ -461,7 +464,7 @@ bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { - if (curproc != NULL) + if (curproc != idleproc) curproc->p_stats->p_ru.ru_inblock++; KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); bp->b_iocmd = BIO_READ; @@ -498,7 +501,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size, /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { - if (curproc != NULL) + if (curproc != idleproc) curproc->p_stats->p_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; @@ -519,7 +522,7 @@ breadn(struct vnode * vp, daddr_t blkno, int size, rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { - if (curproc != NULL) + if (curproc != idleproc) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; @@ -640,7 +643,7 @@ bwrite(struct buf * bp) bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); - if (curproc != NULL) + if (curproc != idleproc) curproc->p_stats->p_ru.ru_oublock++; splx(s); if (oldflags & B_ASYNC) @@ -1420,7 +1423,8 @@ getnewbuf(int slpflag, int slptimeo, int size, int maxsize) int isspecial; static int flushingbufs; - if (curproc && (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0) + if (curproc != idleproc && + (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0) isspecial = 0; else isspecial = 1; @@ -1745,6 +1749,8 @@ buf_daemon() { int s; + mtx_enter(&Giant, MTX_DEF); + /* * This process needs to be suspended prior to shutdown sync. */ @@ -2070,9 +2076,9 @@ loop: * move it into the else, when gbincore() fails. At the moment * it isn't a problem. */ - if (!curproc || (curproc->p_flag & P_BUFEXHAUST)) { + if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) { if (numfreebuffers == 0) { - if (!curproc) + if (curproc == idleproc) return NULL; needsbuffer |= VFS_BIO_NEED_ANY; tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 3e4b17f..52ad0ef 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ #include #include +#include #include #include @@ -960,6 +962,8 @@ sched_sync(void) int s; struct proc *p = updateproc; + mtx_enter(&Giant, MTX_DEF); + EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p, SHUTDOWN_PRI_LAST); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 3e4b17f..52ad0ef 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ #include #include +#include #include #include @@ -960,6 +962,8 @@ sched_sync(void) int s; struct proc *p = updateproc; + mtx_enter(&Giant, MTX_DEF); + EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p, SHUTDOWN_PRI_LAST); -- cgit v1.1
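The buf_daemon() and sched_sync() hunks at the end follow one more recurring recipe from this change: a kernel thread no longer starts life holding any lock, so its main loop must take Giant itself before touching unconverted subsystems. A generic sketch (the thread body and wait channel are hypothetical, not from this diff):

static int example_wchan;		/* hypothetical wait channel */

static void
example_kthread(void)
{
	mtx_enter(&Giant, MTX_DEF);	/* kthreads start without Giant */
	for (;;) {
		/* ... do work under Giant ... */
		tsleep(&example_wchan, PVM, "idle", hz);
	}
}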