-rw-r--r--  sys/kern/kern_switch.c  |  92
-rw-r--r--  sys/kern/sched_4bsd.c   | 160
-rw-r--r--  sys/kern/sched_core.c   |  59
-rw-r--r--  sys/kern/sched_ule.c    | 175
-rw-r--r--  sys/sys/mutex.h         |   9
-rw-r--r--  sys/sys/proc.h          | 113
-rw-r--r--  sys/sys/sched.h         |  16
7 files changed, 460 insertions, 164 deletions
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 1ccf64c..13bba12 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #endif +#include <machine/cpu.h> + /* Uncomment this to enable logging of critical_enter/exit. */ #if 0 #define KTR_CRITICAL KTR_SCHED @@ -77,6 +79,49 @@ static int kern_sched_preemption = 0; SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD, &kern_sched_preemption, 0, "Kernel preemption enabled"); +#ifdef SCHED_STATS +long switch_preempt; +long switch_owepreempt; +long switch_turnstile; +long switch_sleepq; +long switch_sleepqtimo; +long switch_relinquish; +long switch_needresched; +static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats"); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, ""); +SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, ""); +static int +sysctl_stats_reset(SYSCTL_HANDLER_ARGS) +{ + int error; + int val; + + val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (val == 0) + return (0); + switch_preempt = 0; + switch_owepreempt = 0; + switch_turnstile = 0; + switch_sleepq = 0; + switch_sleepqtimo = 0; + switch_relinquish = 0; + switch_needresched = 0; + + return (0); +} + +SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL, + 0, sysctl_stats_reset, "I", "Reset scheduler statistics"); +#endif + /************************************************************************ * Functions that manipulate runnability from a thread perspective. * ************************************************************************/ @@ -142,13 +187,13 @@ critical_exit(void) #ifdef PREEMPTION if (td->td_critnest == 1) { td->td_critnest = 0; - mtx_assert(&sched_lock, MA_NOTOWNED); if (td->td_owepreempt) { td->td_critnest = 1; - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_critnest--; + SCHED_STAT_INC(switch_owepreempt); mi_switch(SW_INVOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } else #endif @@ -173,7 +218,6 @@ maybe_preempt(struct thread *td) int cpri, pri; #endif - mtx_assert(&sched_lock, MA_OWNED); #ifdef PREEMPTION /* * The new thread should not preempt the current thread if any of the @@ -199,6 +243,7 @@ maybe_preempt(struct thread *td) * to the new thread. */ ctd = curthread; + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT ((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd), ("thread has no (or wrong) sched-private part.")); KASSERT((td->td_inhibitors == 0), @@ -219,15 +264,25 @@ maybe_preempt(struct thread *td) ctd->td_owepreempt = 1; return (0); } - /* * Thread is runnable but not yet put on system run queue. 
*/ + MPASS(ctd->td_lock == &sched_lock); + MPASS(td->td_lock == &sched_lock); MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_proc->p_comm); + SCHED_STAT_INC(switch_preempt); mi_switch(SW_INVOL|SW_PREEMPT, td); + /* + * td's lock pointer may have changed. We have to return with it + * locked. + */ + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); return (1); #else return (0); @@ -442,7 +497,6 @@ runq_choose(struct runq *rq) struct td_sched *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; #if defined(SMP) && defined(SCHED_4BSD) @@ -484,7 +538,6 @@ runq_choose_from(struct runq *rq, u_char idx) struct td_sched *ts; int pri; - mtx_assert(&sched_lock, MA_OWNED); if ((pri = runq_findbit_from(rq, idx)) != -1) { rqh = &rq->rq_queues[pri]; ts = TAILQ_FIRST(rqh); @@ -519,9 +572,20 @@ runq_remove_idx(struct runq *rq, struct td_sched *ts, u_char *idx) KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM, ("runq_remove_idx: process swapped out")); pri = ts->ts_rqindex; + KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri)); rqh = &rq->rq_queues[pri]; CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p", ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh); + { + struct td_sched *nts; + + TAILQ_FOREACH(nts, rqh, ts_procq) + if (nts == ts) + break; + if (ts != nts) + panic("runq_remove_idx: ts %p not on rqindex %d", + ts, pri); + } TAILQ_REMOVE(rqh, ts, ts_procq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove_idx: empty"); @@ -589,18 +653,4 @@ sched_set_concurrency(struct proc *p, int concurrency) { } -/* - * Called from thread_exit() for all exiting thread - * - * Not to be confused with sched_exit_thread() - * that is only called from thread_exit() for threads exiting - * without the rest of the process exiting because it is also called from - * sched_exit() and we wouldn't want to call it twice. - * XXX This can probably be fixed. - */ -void -sched_thread_exit(struct thread *td) -{ -} - #endif /* KERN_SWITCH_INCLUDE */ diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 66952ec..a4b1e08 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -248,7 +248,7 @@ static void maybe_resched(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } @@ -377,10 +377,7 @@ schedcpu(void) realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { - /* - * Prevent state changes and protect run queue. - */ - mtx_lock_spin(&sched_lock); + PROC_SLOCK(p); /* * Increment time in/out of memory. We ignore overflow; with * 16-bit int's (remember them?) overflow takes 45 days. @@ -388,6 +385,7 @@ schedcpu(void) p->p_swtime++; FOREACH_THREAD_IN_PROC(p, td) { awake = 0; + thread_lock(td); ts = td->td_sched; /* * Increment sleep time (if sleeping). 
We @@ -456,13 +454,16 @@ XXX this is broken td->td_slptime = 0; } else td->td_slptime++; - if (td->td_slptime > 1) + if (td->td_slptime > 1) { + thread_unlock(td); continue; + } td->td_estcpu = decay_cpu(loadfac, td->td_estcpu); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } /* end of thread loop */ - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* end of process loop */ sx_sunlock(&allproc_lock); } @@ -575,6 +576,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; td_sched0.ts_thread = &thread0; } @@ -615,7 +617,7 @@ sched_clock(struct thread *td) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; ts->ts_cpticks++; @@ -635,22 +637,23 @@ sched_exit(struct proc *p, struct thread *td) CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", td, td->td_proc->p_comm, td->td_priority); - + PROC_SLOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); } void sched_exit_thread(struct thread *td, struct thread *child) { - struct proc *childproc = child->td_proc; CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", - child, childproc->p_comm, child->td_priority); + child, child->td_proc->p_comm, child->td_priority); + thread_lock(td); td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu); - childproc->p_estcpu = ESTCPULIM(childproc->p_estcpu + - child->td_estcpu); + thread_unlock(td); + mtx_lock_spin(&sched_lock); if ((child->td_proc->p_flag & P_NOLOAD) == 0) sched_load_rem(); + mtx_unlock_spin(&sched_lock); } void @@ -663,6 +666,7 @@ void sched_fork_thread(struct thread *td, struct thread *childtd) { childtd->td_estcpu = td->td_estcpu; + childtd->td_lock = &sched_lock; sched_newthread(childtd); } @@ -672,18 +676,20 @@ sched_nice(struct proc *p, int nice) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } } void sched_class(struct thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } @@ -697,7 +703,7 @@ sched_priority(struct thread *td, u_char prio) td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; td->td_priority = prio; @@ -818,7 +824,7 @@ void sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptime = 0; } @@ -831,26 +837,18 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) ts = td->td_sched; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if ((p->p_flag & P_NOLOAD) == 0) sched_load_rem(); -#if 0 - /* - * We are volunteering to switch out so we get to nominate - * a successor for the rest of our quantum - * First try another thread in our process - * - * this is too expensive to do without per process run queues - * so skip it for now. - * XXX keep this comment as a marker. 
- */ - if (sched_followon && - (p->p_flag & P_HADTHREADS) && - (flags & SW_VOL) && - newtd == NULL) - newtd = mumble(); -#endif if (newtd) newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED); @@ -896,6 +894,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) } else { newtd = choosethread(); } + MPASS(newtd->td_lock == &sched_lock); if (td != newtd) { #ifdef HWPMC_HOOKS @@ -904,7 +903,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) #endif /* I feel sleepy */ - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); /* * Where am I? What year is it? * We are in the same thread that went to sleep above, @@ -932,12 +931,13 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void sched_wakeup(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_slptime > 1) { updatepri(td); resetpriority(td); @@ -1079,7 +1079,7 @@ sched_add(struct thread *td, int flags) int single_cpu = 0; ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), @@ -1089,6 +1089,14 @@ sched_add(struct thread *td, int flags) CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } TD_SET_RUNQ(td); if (td->td_pinned != 0) { @@ -1140,7 +1148,7 @@ sched_add(struct thread *td, int flags) { struct td_sched *ts; ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), @@ -1150,6 +1158,14 @@ sched_add(struct thread *td, int flags) CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. 
+ */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } TD_SET_RUNQ(td); CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); ts->ts_runq = &runq; @@ -1207,6 +1223,7 @@ sched_choose(void) struct td_sched *ts; struct runq *rq; + mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP struct td_sched *kecpu; @@ -1256,10 +1273,10 @@ sched_userret(struct thread *td) KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -1268,7 +1285,7 @@ sched_bind(struct thread *td, int cpu) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("sched_bind: cannot bind non-running thread")); @@ -1287,25 +1304,26 @@ sched_bind(struct thread *td, int cpu) void sched_unbind(struct thread* td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_sched->ts_flags &= ~TSF_BOUND; } int sched_is_bound(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } void sched_relinquish(struct thread *td) { - mtx_lock_spin(&sched_lock); + thread_lock(td); if (td->td_pri_class == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -1363,5 +1381,57 @@ sched_idletd(void *dummy) } } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. 
+ */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + #define KERN_SWITCH_INCLUDE 1 #include "kern/kern_switch.c" diff --git a/sys/kern/sched_core.c b/sys/kern/sched_core.c index b0994f8..4cec09b 100644 --- a/sys/kern/sched_core.c +++ b/sys/kern/sched_core.c @@ -784,6 +784,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &kse0; + thread0.td_lock = &sched_lock; kse0.ts_thread = &thread0; kse0.ts_slice = 100; } @@ -1018,7 +1019,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); @@ -1110,6 +1111,7 @@ sched_fork_thread(struct thread *td, struct thread *child) ts = td->td_sched; ts2 = child->td_sched; + child->td_lock = td->td_lock; ts2->ts_slptime = ts2->ts_slptime * CHILD_WEIGHT / 100; if (child->td_pri_class == PRI_TIMESHARE) sched_user_prio(child, sched_calc_pri(ts2)); @@ -1142,7 +1144,8 @@ sched_class(struct thread *td, int class) void sched_exit(struct proc *p, struct thread *childtd) { - mtx_assert(&sched_lock, MA_OWNED); + + PROC_SLOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), childtd); } @@ -1747,5 +1750,57 @@ sched_idletd(void *dummy) } } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. + */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + #define KERN_SWITCH_INCLUDE 1 #include "kern/kern_switch.c" diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 4f4cf41..30761fb 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -229,6 +229,7 @@ static int ipi_thresh = PRI_MIN_KERN; static int steal_htt = 1; static int steal_busy = 1; static int busy_thresh = 4; +static int topology = 0; /* * One thread queue per processor. 
@@ -434,7 +435,7 @@ tdq_load_add(struct tdq *tdq, struct td_sched *ts) mtx_assert(&sched_lock, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); tdq->tdq_load++; - CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); + CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) #ifdef SMP @@ -997,7 +998,7 @@ sched_setup(void *dummy) tdq = &tdq_cpu[i]; tdq_setup(&tdq_cpu[i]); } - if (1) { + if (smp_topology == NULL) { struct tdq_group *tdg; struct tdq *tdq; int cpus; @@ -1027,6 +1028,7 @@ sched_setup(void *dummy) struct cpu_group *cg; int j; + topology = 1; for (i = 0; i < smp_topology->ct_count; i++) { cg = &smp_topology->ct_group[i]; tdg = &tdq_groups[i]; @@ -1248,6 +1250,7 @@ schedinit(void) */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; td_sched0.ts_ltick = ticks; td_sched0.ts_ftick = ticks; td_sched0.ts_thread = &thread0; @@ -1296,7 +1299,7 @@ sched_thread_priority(struct thread *td, u_char prio) td, td->td_proc->p_comm, td->td_priority, prio, curthread, curthread->td_proc->p_comm); ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; @@ -1307,9 +1310,10 @@ sched_thread_priority(struct thread *td, u_char prio) * queue. This could be optimized to not re-add in some * cases. */ + MPASS(td->td_lock == &sched_lock); sched_rem(td); td->td_priority = prio; - sched_add(td, SRQ_BORROWING); + sched_add(td, SRQ_BORROWING|SRQ_OURSELF); } else td->td_priority = prio; } @@ -1427,7 +1431,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) struct td_sched *ts; int preempt; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); preempt = flags & SW_PREEMPT; tdq = TDQ_SELF(); @@ -1440,24 +1444,33 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) * If the thread has been assigned it may be in the process of switching * to the new cpu. This is the case in sched_bind(). */ + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if (TD_IS_IDLETHREAD(td)) { + MPASS(td->td_lock == &sched_lock); TD_SET_CAN_RUN(td); - } else { + } else if (TD_IS_RUNNING(td)) { + /* + * Don't allow the thread to migrate + * from a preemption. + */ tdq_load_rem(tdq, ts); - if (TD_IS_RUNNING(td)) { - /* - * Don't allow the thread to migrate - * from a preemption. - */ - if (preempt) - sched_pin_td(td); - sched_add(td, preempt ? - SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : - SRQ_OURSELF|SRQ_YIELDING); - if (preempt) - sched_unpin_td(td); - } - } + if (preempt) + sched_pin_td(td); + sched_add(td, preempt ? 
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : + SRQ_OURSELF|SRQ_YIELDING); + if (preempt) + sched_unpin_td(td); + } else + tdq_load_rem(tdq, ts); + mtx_assert(&sched_lock, MA_OWNED); if (newtd != NULL) { /* * If we bring in a thread account for it as if it had been @@ -1473,7 +1486,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif - cpu_switch(td, newtd); + cpu_switch(td, newtd, td->td_lock); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); @@ -1481,6 +1494,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) } sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void @@ -1489,12 +1503,14 @@ sched_nice(struct proc *p, int nice) struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); + thread_unlock(td); } } @@ -1502,7 +1518,7 @@ void sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_sched->ts_slptime = ticks; } @@ -1513,7 +1529,7 @@ sched_wakeup(struct thread *td) struct td_sched *ts; int slptime; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; /* * If we slept for more than a tick update our interactivity and @@ -1542,7 +1558,7 @@ sched_wakeup(struct thread *td) void sched_fork(struct thread *td, struct thread *child) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. @@ -1563,7 +1579,9 @@ sched_fork_thread(struct thread *td, struct thread *child) /* * Initialize child. */ + THREAD_LOCK_ASSERT(td, MA_OWNED); sched_newthread(child); + child->td_lock = &sched_lock; ts = td->td_sched; ts2 = child->td_sched; ts2->ts_cpu = ts->ts_cpu; @@ -1588,7 +1606,7 @@ void sched_class(struct thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; @@ -1627,6 +1645,7 @@ sched_exit(struct proc *p, struct thread *child) CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", child, child->td_proc->p_comm, child->td_priority); + PROC_SLOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } @@ -1638,7 +1657,9 @@ sched_exit_thread(struct thread *td, struct thread *child) CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", child, child->td_proc->p_comm, child->td_priority); + thread_lock(child); tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched); + thread_unlock(child); #ifdef KSE /* * KSE forks and exits so often that this penalty causes short-lived @@ -1653,9 +1674,11 @@ sched_exit_thread(struct thread *td, struct thread *child) * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. 
*/ + thread_lock(td); td->td_sched->skg_runtime += child->td_sched->skg_runtime; sched_interact_update(td); sched_priority(td); + thread_unlock(td); } void @@ -1673,10 +1696,10 @@ sched_userret(struct thread *td) KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { - mtx_lock_spin(&sched_lock); + thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } } @@ -1805,9 +1828,22 @@ sched_preempt(struct thread *td) */ MPASS(TD_ON_RUNQ(td)); TD_SET_RUNNING(td); + MPASS(ctd->td_lock == &sched_lock); + MPASS(td->td_lock == &sched_lock); CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td, td->td_proc->p_pid, td->td_proc->p_comm); + /* + * We enter the switch with two runnable threads that both have + * the same lock. When we return td may be sleeping so we need + * to switch locks to make sure he's locked correctly. + */ + SCHED_STAT_INC(switch_preempt); mi_switch(SW_INVOL|SW_PREEMPT, td); + spinlock_enter(); + thread_unlock(ctd); + thread_lock(td); + spinlock_exit(); + return (1); } @@ -1824,7 +1860,7 @@ sched_add(struct thread *td, int flags) #endif ts = td->td_sched; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); @@ -1834,8 +1870,15 @@ sched_add(struct thread *td, int flags) ("sched_add: bad thread state")); KASSERT(td->td_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); - KASSERT(ts->ts_runq == NULL, - ("sched_add: thread %p is still assigned to a run queue", td)); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. 
+ */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); TD_SET_RUNQ(td); tdq = TDQ_SELF(); class = PRI_BASE(td->td_pri_class); @@ -1920,7 +1963,7 @@ sched_rem(struct thread *td) CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", td, td->td_proc->p_comm, td->td_priority, curthread, curthread->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); @@ -1942,7 +1985,7 @@ sched_pctcpu(struct thread *td) if (ts == NULL) return (0); - mtx_lock_spin(&sched_lock); + thread_lock(td); if (ts->ts_ticks) { int rtick; @@ -1952,7 +1995,7 @@ sched_pctcpu(struct thread *td) pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick; - mtx_unlock_spin(&sched_lock); + thread_unlock(td); return (pctcpu); } @@ -1962,7 +2005,7 @@ sched_bind(struct thread *td, int cpu) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if (ts->ts_flags & TSF_BOUND) sched_unbind(td); @@ -1982,7 +2025,7 @@ sched_unbind(struct thread *td) { struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if ((ts->ts_flags & TSF_BOUND) == 0) return; @@ -1995,18 +2038,19 @@ sched_unbind(struct thread *td) int sched_is_bound(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } void sched_relinquish(struct thread *td) { - mtx_lock_spin(&sched_lock); + thread_lock(td); if (td->td_pri_class == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); + SCHED_STAT_INC(switch_relinquish); mi_switch(SW_VOL, NULL); - mtx_unlock_spin(&sched_lock); + thread_unlock(td); } int @@ -2071,6 +2115,58 @@ sched_idletd(void *dummy) cpu_idle(); } +/* + * A CPU is entering for the first time or a thread is exiting. + */ +void +sched_throw(struct thread *td) +{ + /* + * Correct spinlock nesting. The idle thread context that we are + * borrowing was created so that it would start out with a single + * spin lock (sched_lock) held in fork_trampoline(). Since we've + * explicitly acquired locks in this function, the nesting count + * is now 2 rather than 1. Since we are nested, calling + * spinlock_exit() will simply adjust the counts without allowing + * spin lock using code to interrupt us. + */ + if (td == NULL) { + mtx_lock_spin(&sched_lock); + spinlock_exit(); + } else { + MPASS(td->td_lock == &sched_lock); + } + mtx_assert(&sched_lock, MA_OWNED); + KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); + PCPU_SET(switchtime, cpu_ticks()); + PCPU_SET(switchticks, ticks); + cpu_throw(td, choosethread()); /* doesn't return */ +} + +void +sched_fork_exit(struct thread *ctd) +{ + struct thread *td; + + /* + * Finish setting up thread glue so that it begins execution in a + * non-nested critical section with sched_lock held but not recursed. + */ + ctd->td_oncpu = PCPU_GET(cpuid); + sched_lock.mtx_lock = (uintptr_t)ctd; + THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED); + /* + * Processes normally resume in mi_switch() after being + * cpu_switch()'ed to, but when children start up they arrive here + * instead, so we must do much the same things as mi_switch() would. 
+ */ + if ((td = PCPU_GET(deadthread))) { + PCPU_SET(deadthread, NULL); + thread_stash(td); + } + thread_unlock(ctd); +} + static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, "Scheduler name"); @@ -2093,6 +2189,7 @@ SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, ""); SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, ""); +SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, ""); #endif /* ps compat */ diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h index caa1311..d18061a 100644 --- a/sys/sys/mutex.h +++ b/sys/sys/mutex.h @@ -125,6 +125,14 @@ void _mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void _mtx_assert(struct mtx *m, int what, const char *file, int line); #endif +void _thread_lock_flags(struct thread *, int, const char *, int); + +#define thread_lock(tdp) \ + _thread_lock_flags((tdp), 0, __FILE__, __LINE__) +#define thread_lock_flags(tdp, opt) \ + _thread_lock_flags((tdp), (opt), __FILE__, __LINE__) +#define thread_unlock(tdp) \ + mtx_unlock_spin(__DEVOLATILE(struct mtx *, (tdp)->td_lock)) /* * We define our machine-independent (unoptimized) mutex micro-operations @@ -349,6 +357,7 @@ extern struct mtx_pool *mtxpool_sleep; */ extern struct mtx sched_lock; extern struct mtx Giant; +extern struct mtx blocked_lock; /* * Giant lock manipulation and clean exit macros. diff --git a/sys/sys/proc.h b/sys/sys/proc.h index a73d2d5..acde39d 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -134,7 +134,7 @@ struct pargs { * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx - * j - locked by sched_lock mtx + * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * l - the attaching proc or attaching proc parent @@ -144,6 +144,7 @@ struct pargs { * p - select lock (sellock) * q - td_contested lock * r - p_peers lock + * t - thread lock * x - created at fork, only changes during single threading in exec * z - zombie threads lock * @@ -195,32 +196,19 @@ struct mqueue_notifier; * other than CPU cycles, which are parceled out to the threads. */ -/*************** - * Threads are the unit of execution - With a single run queue used by all processors: - - RUNQ: --->THREAD---THREAD--... SLEEPQ:[]---THREAD---THREAD---THREAD - []---THREAD - [] - []---THREAD---THREAD - -With PER-CPU run queues: -it gets more complicated. - * - *****************/ - /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { + volatile struct mtx *td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ /* The two queues below should someday be merged. */ - TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. */ - TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. */ + TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ + TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. 
*/ @@ -232,20 +220,20 @@ struct thread { /* Cleared during fork1() or thread_schedule_upcall(). */ #define td_startzero td_flags - int td_flags; /* (j) TDF_* flags. */ - int td_inhibitors; /* (j) Why can not run. */ + int td_flags; /* (t) TDF_* flags. */ + int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ - int td_sqqueue; /* (j) Sleepqueue queue blocked on. */ - void *td_wchan; /* (j) Sleep address. */ - const char *td_wmesg; /* (j) Reason for sleep. */ - u_char td_lastcpu; /* (j) Last cpu we were on. */ - u_char td_oncpu; /* (j) Which cpu we are on. */ + int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ + void *td_wchan; /* (t) Sleep address. */ + const char *td_wmesg; /* (t) Reason for sleep. */ + u_char td_lastcpu; /* (t) Last cpu we were on. */ + u_char td_oncpu; /* (t) Which cpu we are on. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ short td_locks; /* (k) Count of non-spin locks. */ - u_char td_tsqueue; /* (j) Turnstile queue blocked on. */ - struct turnstile *td_blocked; /* (j) Lock thread is blocked on. */ - const char *td_lockname; /* (j) Name of lock blocked on. */ + u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ + struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ + const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ @@ -253,18 +241,18 @@ struct thread { struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct thread *td_standin; /* (k + a) Use this for an upcall. */ - struct kse_upcall *td_upcall; /* (k + j) Upcall structure. */ - u_int td_estcpu; /* (j) Sum of the same field in KSEs. */ - u_int td_slptime; /* (j) How long completely blocked. */ - struct rusage td_ru; /* (j) rusage information */ - uint64_t td_runtime; /* (j) How many cpu ticks we've run. */ - u_int td_pticks; /* (j) Statclock hits for profiling */ - u_int td_sticks; /* (j) Statclock hits in system mode. */ - u_int td_iticks; /* (j) Statclock hits in intr mode. */ - u_int td_uticks; /* (j) Statclock hits in user mode. */ + struct kse_upcall *td_upcall; /* (k + t) Upcall structure. */ + u_int td_estcpu; /* (t) estimated cpu utilization */ + u_int td_slptime; /* (t) How long completely blocked. */ + struct rusage td_ru; /* (t) rusage information */ + uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ + u_int td_pticks; /* (t) Statclock hits for profiling */ + u_int td_sticks; /* (t) Statclock hits in system mode. */ + u_int td_iticks; /* (t) Statclock hits in intr mode. */ + u_int td_uticks; /* (t) Statclock hits in user mode. */ u_int td_uuticks; /* (k) Statclock hits (usr), for UTS. */ u_int td_usticks; /* (k) Statclock hits (sys), for UTS. */ - int td_intrval; /* (j) Return value of TDF_INTERRUPT. */ + int td_intrval; /* (t) Return value of TDF_INTERRUPT. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ sigset_t td_sigmask; /* (c) Current signal mask. */ volatile u_int td_generation; /* (k) For detection of preemption */ @@ -278,11 +266,11 @@ struct thread { /* Copied during fork1() or thread_sched_upcall(). */ #define td_startcopy td_endzero - u_char td_base_pri; /* (j) Thread base kernel priority. 
*/ - u_char td_priority; /* (j) Thread active priority. */ - u_char td_pri_class; /* (j) Scheduling class. */ - u_char td_user_pri; /* (j) User pri from estcpu and nice. */ - u_char td_base_user_pri; /* (j) Base user pri */ + u_char td_base_pri; /* (t) Thread base kernel priority. */ + u_char td_priority; /* (t) Thread active priority. */ + u_char td_pri_class; /* (t) Scheduling class. */ + u_char td_user_pri; /* (t) User pri from estcpu and nice. */ + u_char td_base_user_pri; /* (t) Base user pri */ #define td_endcopy td_pcb /* @@ -296,7 +284,7 @@ struct thread { TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING - } td_state; + } td_state; /* (t) thread state */ register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ @@ -313,6 +301,16 @@ struct thread { int td_syscalls; /* per-thread syscall count (used by NFS :)) */ }; +struct mtx *thread_lock_block(struct thread *); +void thread_lock_unblock(struct thread *, struct mtx *); +void thread_lock_set(struct thread *, struct mtx *); +#define THREAD_LOCK_ASSERT(td, type) \ +do { \ + struct mtx *__m = __DEVOLATILE(struct mtx *, (td)->td_lock); \ + if (__m != &blocked_lock) \ + mtx_assert(__m, (type)); \ +} while (0) + /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. @@ -324,22 +322,22 @@ struct thread { #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ -#define TDF_TSNOBLOCK 0x00000100 /* Don't block on a turnstile due to race. */ +#define TDF_UNUSEDx100 0x00000100 /* --available-- */ #define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x00002000 /* Thread is marked as interrupted. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ -#define TDF_UNUSED15 0x00008000 /* --available -- */ +#define TDF_UNUSED15 0x00008000 /* --available-- */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_XSIG 0x00040000 /* Thread is exchanging signal under trace */ #define TDF_UNUSED19 0x00080000 /* Thread is sleeping on a umtx. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */ -#define TDF_UNUSED22 0x00400000 /* --available -- */ -#define TDF_UNUSED23 0x00800000 /* --available -- */ +#define TDF_UNUSED22 0x00400000 /* --available-- */ +#define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ @@ -482,7 +480,8 @@ struct rusage_ext { */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ - TAILQ_HEAD(, thread) p_threads; /* (j)(td_plist) Threads. (shortcut) */ + TAILQ_HEAD(, thread) p_threads; /* (j) all threads. */ + struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. 
*/ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ @@ -491,7 +490,7 @@ struct proc { struct plimit *p_limit; /* (c) Process limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ - TAILQ_HEAD(, kse_upcall) p_upcalls; /* All upcalls in the proc. */ + TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */ /* * The following don't make too much sense. @@ -504,7 +503,6 @@ struct proc { PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) S* process status. */ - pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ @@ -542,14 +540,12 @@ struct proc { struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (c) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ - int p_suspcount; /* (c) Num threads in suspended mode. */ + int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (c) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ /* from ksegrp */ - u_int p_estcpu; /* (j) Sum of the field in threads. */ - u_int p_slptime; /* (j) How long completely blocked. */ int p_numupcalls; /* (j) Num upcalls. */ int p_upsleeps; /* (c) Num threads in kse_release(). */ struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */ @@ -592,6 +588,9 @@ struct proc { #define NOCPU 0xff /* For when we aren't on a CPU. */ +#define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) +#define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) +#define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ @@ -626,7 +625,7 @@ struct proc { #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) -/* These flags are kept in p_sflag and are protected with sched_lock. */ +/* These flags are kept in p_sflag and are protected with proc slock. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ @@ -861,8 +860,8 @@ void stopevent(struct proc *, u_int, u_int); void threadinit(void); void cpu_idle(void); extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */ -void cpu_switch(struct thread *old, struct thread *new); -void cpu_throw(struct thread *old, struct thread *new) __dead2; +void cpu_switch(struct thread *, struct thread *, struct mtx *); +void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); @@ -872,6 +871,7 @@ void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. 
*/ +void kse_unlink(struct thread *); void kse_GC(void); void kseinit(void); void cpu_set_upcall(struct thread *td, struct thread *td0); @@ -900,6 +900,7 @@ void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); +void thread_suspend_switch(struct thread *); void thread_suspend_one(struct thread *td); struct thread *thread_switchout(struct thread *td, int flags, struct thread *newtd); diff --git a/sys/sys/sched.h b/sys/sys/sched.h index 1342906..0dcf369 100644 --- a/sys/sys/sched.h +++ b/sys/sys/sched.h @@ -81,6 +81,7 @@ int sched_runnable(void); */ void sched_exit(struct proc *p, struct thread *childtd); void sched_fork(struct thread *td, struct thread *childtd); +void sched_fork_exit(struct thread *td); /* * KSE Groups contain scheduling priority information. They record the @@ -101,6 +102,7 @@ fixpt_t sched_pctcpu(struct thread *td); void sched_prio(struct thread *td, u_char prio); void sched_sleep(struct thread *td); void sched_switch(struct thread *td, struct thread *newtd, int flags); +void sched_throw(struct thread *td); void sched_unlend_prio(struct thread *td, u_char prio); void sched_unlend_user_prio(struct thread *td, u_char pri); void sched_user_prio(struct thread *td, u_char prio); @@ -155,6 +157,19 @@ sched_unpin(void) #define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */ #define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */ +/* Switch stats. */ +#ifdef SCHED_STATS +extern long switch_preempt; +extern long switch_owepreempt; +extern long switch_turnstile; +extern long switch_sleepq; +extern long switch_sleepqtimo; +extern long switch_relinquish; +extern long switch_needresched; +#define SCHED_STAT_INC(var) atomic_add_long(&(var), 1) +#else +#define SCHED_STAT_INC(var) +#endif /* temporarily here */ void schedinit(void); @@ -162,7 +177,6 @@ void sched_init_concurrency(struct proc *p); void sched_set_concurrency(struct proc *p, int cuncurrency); void sched_schedinit(void); void sched_newproc(struct proc *p, struct thread *td); -void sched_thread_exit(struct thread *td); void sched_newthread(struct thread *td); #endif /* _KERNEL */ |
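The common thread through these hunks is that a thread's scheduling state is no longer protected by the global sched_lock but by whatever mutex its new td_lock field points to: thread_lock() (sys/sys/mutex.h above) must chase that pointer, and helpers such as thread_lock_set() and thread_lock_block()/thread_lock_unblock() (prototyped in sys/sys/proc.h above) move it between containers such as run queues. The following is a minimal user-space sketch of that pointer-chasing idea, not the kernel implementation; the thread_model_* names, the blocked_marker sentinel, and the use of pthread mutexes in place of spin mutexes are illustrative assumptions.

#include <pthread.h>
#include <stdatomic.h>

/*
 * Stand-in for the kernel's blocked_lock: never actually handed out,
 * it only marks a thread whose lock pointer is in transit.
 */
static pthread_mutex_t blocked_marker = PTHREAD_MUTEX_INITIALIZER;

struct thread_model {
	/* Analogue of td_lock: the mutex currently protecting this thread. */
	_Atomic(pthread_mutex_t *) td_lock;
};

/*
 * Analogue of thread_lock(): lock whatever td_lock points to, retrying
 * if the pointer moves (or is parked on the blocked marker) before the
 * lock is actually held.
 */
void
thread_model_lock(struct thread_model *td)
{
	pthread_mutex_t *m;

	for (;;) {
		m = atomic_load(&td->td_lock);
		if (m == &blocked_marker)
			continue;		/* mid-switch; spin */
		pthread_mutex_lock(m);
		if (atomic_load(&td->td_lock) == m)
			return;			/* pointer stable; we own it */
		pthread_mutex_unlock(m);	/* lost a race; retry */
	}
}

/* Analogue of thread_unlock(). */
void
thread_model_unlock(struct thread_model *td)
{

	pthread_mutex_unlock(atomic_load(&td->td_lock));
}

/*
 * Analogue of thread_lock_set(), the helper sched_add() uses above:
 * with both the old and the new lock held, repoint td_lock at the new
 * lock and drop the old one.
 */
void
thread_model_lock_set(struct thread_model *td, pthread_mutex_t *new)
{
	pthread_mutex_t *old;

	old = atomic_exchange(&td->td_lock, new);
	pthread_mutex_unlock(old);
}

/*
 * Analogues of thread_lock_block()/thread_lock_unblock(): park the
 * thread on the blocked marker around a context switch and install
 * the real lock once the switch has completed.
 */
pthread_mutex_t *
thread_model_lock_block(struct thread_model *td)
{

	return (atomic_exchange(&td->td_lock, &blocked_marker));
}

void
thread_model_lock_unblock(struct thread_model *td, pthread_mutex_t *new)
{

	atomic_store(&td->td_lock, new);
}

This pointer hand-off is also why THREAD_LOCK_ASSERT() in the proc.h hunk skips its check while td_lock points at blocked_lock, and why maybe_preempt() and sched_preempt() re-acquire the preempted-to thread's lock after mi_switch(): the lock pointer may have moved while the thread was switched out.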
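The kern.sched.stats tree added under SCHED_STATS in kern_switch.c exports the switch counters read-only and adds a write-only reset node. Below is a minimal user-space sketch of reading and resetting them with sysctlbyname(3); it assumes a kernel built with SCHED_STATS (otherwise the nodes do not exist) and reads the counters as ints because the diff exports them with SYSCTL_INT.

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	int preempt, owepreempt, reset;
	size_t len;

	/* Read two of the counters added by this change. */
	len = sizeof(preempt);
	if (sysctlbyname("kern.sched.stats.preempt", &preempt, &len,
	    NULL, 0) == -1)
		err(1, "kern.sched.stats.preempt");
	len = sizeof(owepreempt);
	if (sysctlbyname("kern.sched.stats.owepreempt", &owepreempt, &len,
	    NULL, 0) == -1)
		err(1, "kern.sched.stats.owepreempt");
	printf("preempt: %d owepreempt: %d\n", preempt, owepreempt);

	/*
	 * Per sysctl_stats_reset() above, writing any non-zero value to
	 * the reset node zeroes every counter.
	 */
	reset = 1;
	if (sysctlbyname("kern.sched.stats.reset", NULL, NULL,
	    &reset, sizeof(reset)) == -1)
		err(1, "kern.sched.stats.reset");
	return (0);
}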