From aa2dc0a5d9e7a19420c153cd414fefa8498eab71 Mon Sep 17 00:00:00 2001 From: julian Date: Sat, 29 Jun 2002 17:26:22 +0000 Subject: Part 1 of KSE-III The ability to schedule multiple threads per process (one one cpu) by making ALL system calls optionally asynchronous. to come: ia64 and power-pc patches, patches for gdb, test program (in tools) Reviewed by: Almost everyone who counts (at various times, peter, jhb, matt, alfred, mini, bernd, and a cast of thousands) NOTE: this is still Beta code, and contains lots of debugging stuff. expect slight instability in signals.. --- sys/kern/init_main.c | 33 ++- sys/kern/init_sysent.c | 2 +- sys/kern/kern_condvar.c | 89 ++++++- sys/kern/kern_exec.c | 10 +- sys/kern/kern_exit.c | 97 ++++++- sys/kern/kern_fork.c | 75 ++++-- sys/kern/kern_idle.c | 19 +- sys/kern/kern_intr.c | 27 +- sys/kern/kern_kthread.c | 3 +- sys/kern/kern_mutex.c | 31 +-- sys/kern/kern_poll.c | 1 - sys/kern/kern_proc.c | 217 ++++++++++----- sys/kern/kern_shutdown.c | 1 - sys/kern/kern_sig.c | 386 +++++++++++++++------------ sys/kern/kern_subr.c | 1 - sys/kern/kern_switch.c | 662 +++++++++++++++++++++++++++++++++++++++++++--- sys/kern/kern_synch.c | 275 +++++++++++++------ sys/kern/ksched.c | 27 +- sys/kern/subr_smp.c | 4 +- sys/kern/subr_trap.c | 37 ++- sys/kern/subr_turnstile.c | 31 +-- sys/kern/subr_witness.c | 1 + sys/kern/sys_generic.c | 2 +- sys/kern/sys_process.c | 6 +- sys/kern/syscalls.master | 2 +- sys/kern/tty.c | 53 +++- 26 files changed, 1601 insertions(+), 491 deletions(-) (limited to 'sys/kern') diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index d5c5656..06cc8d8 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -289,6 +289,7 @@ proc0_init(void *dummy __unused) * Initialize thread, process and pgrp structures. */ procinit(); + threadinit(); /* * Initialize sleep queue hash table @@ -322,19 +323,34 @@ proc0_init(void *dummy __unused) p->p_sysent = &aout_sysvec; #endif + /* + * proc_linkup was already done in init_i386() or alphainit() etc. + * because the earlier code needed to follow td->td_proc. Otherwise + * I would have done it here.. maybe this means this should be + * done earlier too. + */ ke = &proc0.p_kse; /* XXXKSE */ kg = &proc0.p_ksegrp; /* XXXKSE */ p->p_flag = P_SYSTEM; p->p_sflag = PS_INMEM; - p->p_stat = SRUN; - p->p_ksegrp.kg_nice = NZERO; - kg->kg_pri_class = PRI_TIMESHARE; - kg->kg_user_pri = PUSER; - td->td_priority = PVM; - td->td_base_pri = PUSER; - + p->p_state = PRS_NORMAL; + td->td_state = TDS_RUNNING; + kg->kg_nice = NZERO; + kg->kg_pri_class = PRI_TIMESHARE; + kg->kg_user_pri = PUSER; + td->td_priority = PVM; + td->td_base_pri = PUSER; + td->td_kse = ke; /* XXXKSE */ + ke->ke_oncpu = 0; + ke->ke_state = KES_RUNNING; + ke->ke_thread = td; + /* proc_linkup puts it in the idle queue, that's not what we want. 
*/ + TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); + kg->kg_idle_kses--; p->p_peers = 0; p->p_leader = p; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + bcopy("swapper", p->p_comm, sizeof ("swapper")); @@ -662,8 +678,7 @@ kick_init(const void *udata __unused) td = FIRST_THREAD_IN_PROC(initproc); mtx_lock_spin(&sched_lock); - initproc->p_stat = SRUN; - setrunqueue(FIRST_THREAD_IN_PROC(initproc)); /* XXXKSE */ + setrunqueue(td); /* XXXKSE */ mtx_unlock_spin(&sched_lock); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 425e3b7..cf8ba80 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -405,7 +405,7 @@ struct sysent sysent[] = { { 0, (sy_call_t *)kse_wakeup }, /* 380 = kse_wakeup */ { AS(kse_new_args), (sy_call_t *)kse_new }, /* 381 = kse_new */ { AS(thread_wakeup_args), (sy_call_t *)thread_wakeup }, /* 382 = thread_wakeup */ - { 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */ + { SYF_MPSAFE | 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */ { 0, (sy_call_t *)nosys }, /* 384 = __mac_get_proc */ { 0, (sy_call_t *)nosys }, /* 385 = __mac_set_proc */ { 0, (sy_call_t *)nosys }, /* 386 = __mac_get_fd */ diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c index 9d30d25..78585b2 100644 --- a/sys/kern/kern_condvar.c +++ b/sys/kern/kern_condvar.c @@ -48,7 +48,7 @@ */ #define CV_ASSERT(cvp, mp, td) do { \ KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \ - KASSERT((td)->td_proc->p_stat == SRUN, ("%s: not SRUN", __func__)); \ + KASSERT((td)->td_state == TDS_RUNNING, ("%s: not TDS_RUNNING", __func__)); \ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ KASSERT((mp) != NULL, ("%s: mp NULL", __func__)); \ mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \ @@ -80,6 +80,7 @@ #endif static void cv_timedwait_end(void *arg); +static void cv_check_upcall(struct thread *td); /* * Initialize a condition variable. Must be called before use. @@ -109,14 +110,47 @@ cv_destroy(struct cv *cvp) */ /* + * Decide if we need to queue an upcall. + * This is copied from msleep(), perhaps this should be a common function. + */ +static void +cv_check_upcall(struct thread *td) +{ + + /* + * If we are capable of async syscalls and there isn't already + * another one ready to return, start a new thread + * and queue it as ready to run. Note that there is danger here + * because we need to make sure that we don't sleep allocating + * the thread (recursion here might be bad). + * Hence the TDF_INMSLEEP flag. + */ + if ((td->td_proc->p_flag & P_KSES) && td->td_mailbox && + (td->td_flags & TDF_INMSLEEP) == 0) { + /* + * If we have no queued work to do, + * upcall to the UTS to see if it has more work. + * We don't need to upcall now, just queue it. + */ + if (TAILQ_FIRST(&td->td_ksegrp->kg_runq) == NULL) { + /* Don't recurse here! */ + td->td_flags |= TDF_INMSLEEP; + thread_schedule_upcall(td, td->td_kse); + td->td_flags &= ~TDF_INMSLEEP; + } + } +} + +/* * Switch context. 
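 *
 * Roughly how the pieces above fit together for a KSE-mode thread
 * (a sketch of the call sequence, not the exact code that follows):
 *
 *	cv_wait(cvp, mp)
 *	    cv_waitq_add(cvp, td);	put td on the cv wait queue
 *	    cv_switch(td);		set TDS_SLP, then
 *					cv_check_upcall(td) may hand the
 *					    KSE to the UTS via
 *					    thread_schedule_upcall(),
 *					then mi_switch() to sleep
 *	...
 *	cv_wakeup(cvp)			later: setrunqueue(td) again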
*/ static __inline void cv_switch(struct thread *td) { - td->td_proc->p_stat = SSLEEP; + td->td_state = TDS_SLP; td->td_proc->p_stats->p_ru.ru_nvcsw++; + cv_check_upcall(td); mi_switch(); CTR3(KTR_PROC, "cv_switch: resume thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); @@ -135,7 +169,7 @@ cv_switch_catch(struct thread *td) * We put ourselves on the sleep queue and start our timeout before * calling cursig, as we could stop there, and a wakeup or a SIGCONT (or * both) could occur while we were stopped. A SIGCONT would cause us to - * be marked as SSLEEP without resuming us, thus we must be ready for + * be marked as TDS_SLP without resuming us, thus we must be ready for * sleep when cursig is called. If the wakeup happens while we're * stopped, td->td_wchan will be 0 upon return from cursig. */ @@ -143,13 +177,15 @@ cv_switch_catch(struct thread *td) mtx_unlock_spin(&sched_lock); p = td->td_proc; PROC_LOCK(p); - sig = cursig(p); /* XXXKSE */ + sig = cursig(td); /* XXXKSE */ + if (thread_suspend_check(1)) + sig = SIGSTOP; mtx_lock_spin(&sched_lock); PROC_UNLOCK(p); if (sig != 0) { if (td->td_wchan != NULL) cv_waitq_remove(td); - td->td_proc->p_stat = SRUN; + td->td_state = TDS_RUNNING; /* XXXKSE */ } else if (td->td_wchan != NULL) { cv_switch(td); } @@ -175,7 +211,6 @@ cv_waitq_add(struct cv *cvp, struct thread *td) td->td_flags |= TDF_CVWAITQ; td->td_wchan = cvp; td->td_wmesg = cvp->cv_description; - td->td_kse->ke_slptime = 0; /* XXXKSE */ td->td_ksegrp->kg_slptime = 0; /* XXXKSE */ td->td_base_pri = td->td_priority; CTR3(KTR_PROC, "cv_waitq_add: thread %p (pid %d, %s)", td, @@ -285,7 +320,7 @@ cv_wait_sig(struct cv *cvp, struct mtx *mp) PROC_LOCK(p); if (sig == 0) - sig = cursig(p); /* XXXKSE */ + sig = cursig(td); /* XXXKSE */ if (sig != 0) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; @@ -293,6 +328,8 @@ cv_wait_sig(struct cv *cvp, struct mtx *mp) rval = ERESTART; } PROC_UNLOCK(p); + if (p->p_flag & P_WEXIT) + rval = EINTR; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) @@ -363,6 +400,8 @@ cv_timedwait(struct cv *cvp, struct mtx *mp, int timo) mi_switch(); } + if (td->td_proc->p_flag & P_WEXIT) + rval = EWOULDBLOCK; mtx_unlock_spin(&sched_lock); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) @@ -436,12 +475,11 @@ cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo) td->td_proc->p_stats->p_ru.ru_nivcsw++; mi_switch(); } - mtx_unlock_spin(&sched_lock); PROC_LOCK(p); if (sig == 0) - sig = cursig(p); + sig = cursig(td); if (sig != 0) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; @@ -450,6 +488,9 @@ cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo) } PROC_UNLOCK(p); + if (p->p_flag & P_WEXIT) + rval = EINTR; + #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); @@ -477,15 +518,13 @@ cv_wakeup(struct cv *cvp) TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq); td->td_flags &= ~TDF_CVWAITQ; td->td_wchan = 0; - if (td->td_proc->p_stat == SSLEEP) { + if (td->td_state == TDS_SLP) { /* OPTIMIZED EXPANSION OF setrunnable(td); */ CTR3(KTR_PROC, "cv_signal: thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); if (td->td_ksegrp->kg_slptime > 1) /* XXXKSE */ updatepri(td); - td->td_kse->ke_slptime = 0; td->td_ksegrp->kg_slptime = 0; - td->td_proc->p_stat = SRUN; if (td->td_proc->p_sflag & PS_INMEM) { setrunqueue(td); maybe_resched(td); @@ -568,7 +607,7 @@ cv_timedwait_end(void *arg) td->td_flags &= ~TDF_TIMEOUT; setrunqueue(td); } else if (td->td_wchan != NULL) { - if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */ + if 
(td->td_state == TDS_SLP) /* XXXKSE */ setrunnable(td); else cv_waitq_remove(td); @@ -577,3 +616,27 @@ cv_timedwait_end(void *arg) td->td_flags |= TDF_TIMOFAIL; mtx_unlock_spin(&sched_lock); } + +/* + * For now only abort interruptable waits. + * The others will have to either complete on their own or have a timeout. + */ +void +cv_abort(struct thread *td) +{ + + CTR3(KTR_PROC, "cv_abort: thread %p (pid %d, %s)", td, + td->td_proc->p_pid, + td->td_proc->p_comm); + mtx_lock_spin(&sched_lock); + if ((td->td_flags & (TDF_SINTR|TDF_TIMEOUT)) == TDF_SINTR) { + if (td->td_wchan != NULL) { + if (td->td_state == TDS_SLP) + setrunnable(td); + else + cv_waitq_remove(td); + } + } + mtx_unlock_spin(&sched_lock); +} + diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index feaa123..0cd7f27 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -154,12 +154,14 @@ execve(td, uap) PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); + if ((p->p_flag & P_KSES) && thread_single(SNGLE_EXIT)) { + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (ERESTART); /* Try again later. */ + } + /* If we get here all other threads are dead. */ p->p_flag |= P_INEXEC; PROC_UNLOCK(p); - -/* XXXKSE */ -/* !!!!!!!! we need abort all the other threads of this process before we */ -/* proceed beyond his point! */ /* * Initialize part of the common data diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 63a5135..fea5438 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -145,6 +145,67 @@ exit1(td, rv) /* * XXXXKSE: MUST abort all other threads before proceeding past here. */ + PROC_LOCK(p); + if (p->p_flag & P_KSES) { + /* + * First check if some other thread got here before us.. + * if so, act apropriatly, (exit or suspend); + */ + thread_suspend_check(0); + /* + * Here is a trick.. + * We need to free up our KSE to process other threads + * so that we can safely set the UNBOUND flag + * (whether or not we have a mailbox) as we are NEVER + * going to return to the user. + * The flag will not be set yet if we are exiting + * because of a signal, pagefault, or similar + * (or even an exit(2) from the UTS). + */ + td->td_flags |= TDF_UNBOUND; + + /* + * Kill off the other threads. This requires + * Some co-operation from other parts of the kernel + * so it may not be instant. + * With this state set: + * Any thread entering the kernel from userspace will + * thread_exit() in trap(). Any thread attempting to + * sleep will return immediatly + * with EINTR or EWOULDBLOCK, which will hopefully force them + * to back out to userland, freeing resources as they go, and + * anything attempting to return to userland will thread_exit() + * from userret(). thread_exit() will unsuspend us + * when the last other thread exits. + */ + if (thread_single(SNGLE_EXIT)) { + panic ("Exit: Single threading fouled up"); + } + /* + * All other activity in this process is now stopped. + * Remove excess KSEs and KSEGRPS. XXXKSE (when we have them) + * ... + * Turn off threading support. + */ + p->p_flag &= ~P_KSES; + td->td_flags &= ~TDF_UNBOUND; + thread_single_end(); /* Don't need this any more. */ + } + /* + * With this state set: + * Any thread entering the kernel from userspace will thread_exit() + * in trap(). 
Any thread attempting to sleep will return immediatly + * with EINTR or EWOULDBLOCK, which will hopefully force them + * to back out to userland, freeing resources as they go, and + * anything attempting to return to userland will thread_exit() + * from userret(). thread_exit() will do a wakeup on p->p_numthreads + * if it transitions to 1. + */ + + p->p_flag |= P_WEXIT; + PROC_UNLOCK(p); + if (td->td_kse->ke_mdstorage) + cpu_free_kse_mdstorage(td->td_kse); /* Are we a task leader? */ PROC_LOCK(p); @@ -185,7 +246,6 @@ exit1(td, rv) */ PROC_LOCK(p); p->p_flag &= ~(P_TRACED | P_PPWAIT); - p->p_flag |= P_WEXIT; SIGEMPTYSET(p->p_siglist); PROC_UNLOCK(p); if (timevalisset(&p->p_realtimer.it_value)) @@ -434,22 +494,24 @@ exit1(td, rv) /* * We have to wait until after releasing all locks before - * changing p_stat. If we block on a mutex then we will be + * changing p_state. If we block on a mutex then we will be * back at SRUN when we resume and our parent will never * harvest us. */ - p->p_stat = SZOMB; + p->p_state = PRS_ZOMBIE; wakeup(p->p_pptr); PROC_UNLOCK(p->p_pptr); - PROC_UNLOCK(p); - cnt.v_swtch++; binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_sched_exit(td); - cpu_throw(); + cpu_sched_exit(td); /* XXXKSE check if this should be in thread_exit */ + /* + * Make sure this thread is discarded from the zombie. + * This will also release this thread's reference to the ucred. + */ + thread_exit(); panic("exit1"); } @@ -504,6 +566,8 @@ wait1(td, uap, compat) register int nfound; register struct proc *p, *q, *t; int status, error; + struct kse *ke; + struct ksegrp *kg; q = td->td_proc; if (uap->pid == 0) { @@ -540,7 +604,7 @@ loop: } nfound++; - if (p->p_stat == SZOMB) { + if (p->p_state == PRS_ZOMBIE) { /* * charge childs scheduling cpu usage to parent * XXXKSE assume only one thread & kse & ksegrp @@ -656,6 +720,21 @@ loop: } /* + * There should only be one KSE/KSEGRP but + * do it right anyhow. + */ + FOREACH_KSEGRP_IN_PROC(p, kg) { + FOREACH_KSE_IN_GROUP(kg, ke) { + /* Free the KSE spare thread. */ + if (ke->ke_tdspare != NULL) { + thread_free(ke->ke_tdspare); + p->p_kse.ke_tdspare = NULL; + } + } + } + thread_reap(); /* check for zombie threads */ + + /* * Give vm and machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. @@ -669,7 +748,7 @@ loop: mtx_unlock(&Giant); return (0); } - if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && + if (P_SHOULDSTOP(p) && ((p->p_flag & P_WAITED) == 0) && (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 016653b..eac0267 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -212,23 +212,6 @@ sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); -#if 0 -void -kse_init(struct kse *kse1, struct kse *kse2) -{ -} - -void -thread_init(struct thread *thread1, struct thread *thread2) -{ -} - -void -ksegrp_init(struct ksegrp *ksegrp1, struct ksegrp *ksegrp2) -{ -} -#endif - int fork1(td, flags, procp) struct thread *td; /* parent proc */ @@ -296,6 +279,29 @@ fork1(td, flags, procp) return (0); } + if (p1->p_flag & P_KSES) { + /* + * Idle the other threads for a second. + * Since the user space is copied, it must remain stable. 
+ * In addition, all threads (from the user perspective) + * need to either be suspended or in the kernel, + * where they will try restart in the parent and will + * be aborted in the child. + */ + PROC_LOCK(p1); + if (thread_single(SNGLE_NO_EXIT)) { + /* Abort.. someone else is single threading before us */ + PROC_UNLOCK(p1); + return (ERESTART); + } + PROC_UNLOCK(p1); + /* + * All other activity in this process + * is now suspended at the user boundary, + * (or other safe places if we think of any). + */ + } + /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); @@ -311,6 +317,11 @@ fork1(td, flags, procp) if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { sx_xunlock(&allproc_lock); uma_zfree(proc_zone, newproc); + if (p1->p_flag & P_KSES) { + PROC_LOCK(p1); + thread_single_end(); + PROC_UNLOCK(p1); + } tsleep(&forksleep, PUSER, "fork", hz / 2); return (EAGAIN); } @@ -325,6 +336,11 @@ fork1(td, flags, procp) if (!ok) { sx_xunlock(&allproc_lock); uma_zfree(proc_zone, newproc); + if (p1->p_flag & P_KSES) { + PROC_LOCK(p1); + thread_single_end(); + PROC_UNLOCK(p1); + } tsleep(&forksleep, PUSER, "fork", hz / 2); return (EAGAIN); } @@ -411,7 +427,7 @@ again: lastpid = trypid; p2 = newproc; - p2->p_stat = SIDL; /* protect against others */ + p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); @@ -449,7 +465,7 @@ again: * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ - td2 = thread_get(p2); + td2 = thread_alloc(); ke2 = &p2->p_kse; kg2 = &p2->p_ksegrp; @@ -459,8 +475,10 @@ again: (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); bzero(&ke2->ke_startzero, (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero)); +#if 0 /* bzero'd by the thread allocator */ bzero(&td2->td_startzero, (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); +#endif bzero(&kg2->kg_startzero, (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); @@ -482,9 +500,22 @@ again: * XXXKSE Theoretically only the running thread would get copied * Others in the kernel would be 'aborted' in the child. * i.e return E*something* + * On SMP we would have to stop them running on + * other CPUs! (set a flag in the proc that stops + * all returns to userland until completed) + * This is wrong but ok for 1:1. */ proc_linkup(p2, kg2, ke2, td2); + /* Set up the thread as an active thread (as if runnable). */ + TAILQ_REMOVE(&kg2->kg_iq, ke2, ke_kgrlist); + kg2->kg_idle_kses--; + ke2->ke_state = KES_UNQUEUED; + ke2->ke_thread = td2; + td2->td_kse = ke2; + td2->td_flags &= ~TDF_UNBOUND; /* For the rest of this syscall. */ +KASSERT((ke2->ke_kgrlist.tqe_next != ke2), ("linked to self!")); + /* note.. XXXKSE no pcb or u-area yet */ /* @@ -699,7 +730,6 @@ again: p2->p_acflag = AFORK; if ((flags & RFSTOPPED) == 0) { mtx_lock_spin(&sched_lock); - p2->p_stat = SRUN; setrunqueue(td2); mtx_unlock_spin(&sched_lock); } @@ -803,6 +833,9 @@ fork_exit(callout, arg, frame) struct proc *p = td->td_proc; td->td_kse->ke_oncpu = PCPU_GET(cpuid); + p->p_state = PRS_NORMAL; + td->td_state = TDS_RUNNING; /* Already done in switch() on 386. */ + td->td_kse->ke_state = KES_RUNNING; /* * Finish setting up thread glue. We need to initialize * the thread into a td_critnest=1 state. 
Some platforms @@ -814,7 +847,7 @@ fork_exit(callout, arg, frame) sched_lock.mtx_lock = (uintptr_t)td; sched_lock.mtx_recurse = 0; cpu_critical_fork_exit(); - CTR3(KTR_PROC, "fork_exit: new proc %p (pid %d, %s)", p, p->p_pid, + CTR3(KTR_PROC, "fork_exit: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c index 29194b7..306f2a5 100644 --- a/sys/kern/kern_idle.c +++ b/sys/kern/kern_idle.c @@ -40,6 +40,7 @@ idle_setup(void *dummy) struct pcpu *pc; #endif struct proc *p; + struct thread *td; int error; #ifdef SMP @@ -60,7 +61,10 @@ idle_setup(void *dummy) panic("idle_setup: kthread_create error %d\n", error); p->p_flag |= P_NOLOAD; - p->p_stat = SRUN; + td = FIRST_THREAD_IN_PROC(p); + td->td_state = TDS_RUNQ; + td->td_kse->ke_state = KES_ONRUNQ; + td->td_kse->ke_flags |= KEF_IDLEKSE; #ifdef SMP } #endif @@ -75,16 +79,22 @@ idle_proc(void *dummy) #ifdef DIAGNOSTIC int count; #endif + struct thread *td; + struct proc *p; + td = curthread; + p = td->td_proc; + td->td_state = TDS_RUNNING; + td->td_kse->ke_state = KES_RUNNING; for (;;) { mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC count = 0; - while (count >= 0 && procrunnable() == 0) { + while (count >= 0 && kserunnable() == 0) { #else - while (procrunnable() == 0) { + while (kserunnable() == 0) { #endif /* * This is a good place to put things to be done in @@ -103,8 +113,9 @@ idle_proc(void *dummy) } mtx_lock_spin(&sched_lock); - curproc->p_stats->p_ru.ru_nvcsw++; + p->p_stats->p_ru.ru_nvcsw++; mi_switch(); + td->td_kse->ke_state = KES_RUNNING; mtx_unlock_spin(&sched_lock); } } diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index d65dc82..fb9c092 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -201,7 +201,7 @@ ithread_create(struct ithd **ithread, int vector, int flags, td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ td->td_ksegrp->kg_pri_class = PRI_ITHD; td->td_priority = PRI_MAX_ITHD; - p->p_stat = SWAIT; + td->td_state = TDS_IWAIT; ithd->it_td = td; td->td_ithd = ithd; if (ithread != NULL) @@ -229,8 +229,7 @@ ithread_destroy(struct ithd *ithread) } ithread->it_flags |= IT_DEAD; mtx_lock_spin(&sched_lock); - if (p->p_stat == SWAIT) { - p->p_stat = SRUN; /* XXXKSE */ + if (td->td_state == TDS_IWAIT) { setrunqueue(td); } mtx_unlock_spin(&sched_lock); @@ -327,7 +326,7 @@ ok: * handler as being dead and let the ithread do the actual removal. 
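 *
 * The other side of this handshake lives in the ithread's own loop
 * (not part of this hunk); in outline, the next time the ithread runs
 * it does something like:
 *
 *	if (handler->ih_flags & IH_DEAD) {
 *		unlink the handler from the ithread's handler list;
 *		wakeup(handler);	let the remover continue
 *		continue;
 *	}
 *
 * and the removing thread sleeps until IH_DEAD has been acted on.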
*/ mtx_lock_spin(&sched_lock); - if (ithread->it_td->td_proc->p_stat != SWAIT) { + if (ithread->it_td->td_state != TDS_IWAIT) { handler->ih_flags |= IH_DEAD; /* @@ -374,8 +373,8 @@ ithread_schedule(struct ithd *ithread, int do_switch) td = ithread->it_td; p = td->td_proc; KASSERT(p != NULL, ("ithread %s has no process", ithread->it_name)); - CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d", __func__, p->p_pid, p->p_comm, - ithread->it_need); + CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d", + __func__, p->p_pid, p->p_comm, ithread->it_need); /* * Set it_need to tell the thread to keep running if it is already @@ -387,14 +386,16 @@ ithread_schedule(struct ithd *ithread, int do_switch) */ ithread->it_need = 1; mtx_lock_spin(&sched_lock); - if (p->p_stat == SWAIT) { + if (td->td_state == TDS_IWAIT) { CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, p->p_pid); - p->p_stat = SRUN; - setrunqueue(td); /* XXXKSE */ - if (do_switch && curthread->td_critnest == 1 && - curthread->td_proc->p_stat == SRUN) { + setrunqueue(td); + if (do_switch && + (curthread->td_critnest == 1)/* && + (curthread->td_state == TDS_RUNNING) XXXKSE*/) { +#if 0 /* not needed in KSE */ if (curthread != PCPU_GET(idlethread)) setrunqueue(curthread); +#endif curthread->td_proc->p_stats->p_ru.ru_nivcsw++; mi_switch(); } else { @@ -402,7 +403,7 @@ ithread_schedule(struct ithd *ithread, int do_switch) } } else { CTR4(KTR_INTR, "%s: pid %d: it_need %d, state %d", - __func__, p->p_pid, ithread->it_need, p->p_stat); + __func__, p->p_pid, ithread->it_need, p->p_state); } mtx_unlock_spin(&sched_lock); @@ -550,7 +551,7 @@ restart: */ if (ithd->it_enable != NULL) ithd->it_enable(ithd->it_vector); - p->p_stat = SWAIT; /* we're idle */ + td->td_state = TDS_IWAIT; /* we're idle */ p->p_stats->p_ru.ru_nvcsw++; CTR2(KTR_INTR, "%s: pid %d: done", __func__, p->p_pid); mi_switch(); diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c index a456a86..e8e2fea 100644 --- a/sys/kern/kern_kthread.c +++ b/sys/kern/kern_kthread.c @@ -109,8 +109,7 @@ kthread_create(void (*func)(void *), void *arg, mtx_lock_spin(&sched_lock); p2->p_sflag |= PS_INMEM; if (!(flags & RFSTOPPED)) { - p2->p_stat = SRUN; - setrunqueue(FIRST_THREAD_IN_PROC(p2)); /* XXXKSE */ + setrunqueue(FIRST_THREAD_IN_PROC(p2)); } mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index 08bca8d..c2e79d0 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -119,23 +119,20 @@ propagate_priority(struct thread *td) return; } + KASSERT(td->td_state != TDS_SURPLUS, ("Mutex owner SURPLUS")); + MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); - KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex")); + KASSERT(td->td_state != TDS_SLP, + ("sleeping thread owns a mutex")); if (td->td_priority <= pri) /* lower is higher priority */ return; - /* - * Bump this thread's priority. - */ - td->td_priority = pri; /* * If lock holder is actually running, just bump priority. */ - if (thread_running(td)) { - MPASS(td->td_proc->p_stat == SRUN - || td->td_proc->p_stat == SZOMB - || td->td_proc->p_stat == SSTOP); + if (td->td_state == TDS_RUNNING) { + td->td_priority = pri; return; } @@ -151,20 +148,26 @@ propagate_priority(struct thread *td) * If on run queue move to new run queue, and quit. * XXXKSE this gets a lot more complicated under threads * but try anyhow. + * We should have a special call to do this more efficiently. 
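 *
 * Such a call does not exist yet; what is meant is a single helper
 * (hypothetical name, not in this patch) that folds the three steps
 * done below into one run-queue operation, e.g.:
 *
 *	sched_prio_propagate(td, pri):
 *		remrunqueue(td);
 *		td->td_priority = pri;
 *		setrunqueue(td);
 *
 * so that an unbound thread's KSE could eventually be requeued in
 * place instead of a full remove/re-add of both thread and KSE.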
*/ - if (td->td_proc->p_stat == SRUN) { + if (td->td_state == TDS_RUNQ) { MPASS(td->td_blocked == NULL); remrunqueue(td); + td->td_priority = pri; setrunqueue(td); return; } + /* + * Adjust for any other cases. + */ + td->td_priority = pri; /* * If we aren't blocked on a mutex, we should be. */ - KASSERT(td->td_proc->p_stat == SMTX, ( + KASSERT(td->td_state == TDS_MTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", - td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat, + td->td_proc->p_pid, td->td_proc->p_comm, td->td_state, m->mtx_object.lo_name)); /* @@ -590,7 +593,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) */ td->td_blocked = m; td->td_mtxname = m->mtx_object.lo_name; - td->td_proc->p_stat = SMTX; + td->td_state = TDS_MTX; propagate_priority(td); if (LOCK_LOG_TEST(&m->mtx_object, opts)) @@ -727,7 +730,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) m, td1); td1->td_blocked = NULL; - td1->td_proc->p_stat = SRUN; setrunqueue(td1); if (td->td_critnest == 1 && td1->td_priority < pri) { @@ -744,7 +746,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) } } #endif - setrunqueue(td); if (LOCK_LOG_TEST(&m->mtx_object, opts)) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c index a197bc0..9dd6924 100644 --- a/sys/kern/kern_poll.c +++ b/sys/kern/kern_poll.c @@ -503,7 +503,6 @@ poll_idle(void) mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); - setrunqueue(td); td->td_proc->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index a5378d9..8b15fc2 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -111,44 +112,28 @@ procinit() uihashinit(); } -/* - * Note that we do not link to the proc's ucred here - * The thread is linked as if running but no KSE assigned - */ -static void -thread_link(struct thread *td, struct ksegrp *kg) -{ - struct proc *p = kg->kg_proc; - - td->td_proc = p; - td->td_ksegrp = kg; - td->td_last_kse = &p->p_kse; - - TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); - TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); - td->td_critnest = 0; - td->td_kse = NULL; - cpu_thread_link(td); -} - /* * KSE is linked onto the idle queue. 
*/ -static void +void kse_link(struct kse *ke, struct ksegrp *kg) { struct proc *p = kg->kg_proc; +KASSERT((ke->ke_state != KES_ONRUNQ), ("linking suspect kse on run queue")); TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses++; +KASSERT((ke->ke_state != KES_IDLE), ("already on idle queue")); + ke->ke_state = KES_IDLE; TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist); + kg->kg_idle_kses++; ke->ke_proc = p; ke->ke_ksegrp = kg; ke->ke_thread = NULL; ke->ke_oncpu = NOCPU; } -static void +void ksegrp_link(struct ksegrp *kg, struct proc *p) { @@ -159,10 +144,13 @@ ksegrp_link(struct ksegrp *kg, struct proc *p) TAILQ_INIT(&kg->kg_iq); /* all kses in ksegrp */ kg->kg_proc = p; /* the following counters are in the -zero- section and may not need clearing */ + kg->kg_numthreads = 0; kg->kg_runnable = 0; kg->kg_kses = 0; + kg->kg_idle_kses = 0; kg->kg_runq_kses = 0; /* XXXKSE change name */ /* link it in now that it's consitant */ + p->p_numksegrps++; TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); } @@ -177,30 +165,13 @@ proc_linkup(struct proc *p, struct ksegrp *kg, TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ TAILQ_INIT(&p->p_threads); /* all threads in proc */ + TAILQ_INIT(&p->p_suspended); /* Threads suspended */ ksegrp_link(kg, p); kse_link(ke, kg); thread_link(td, kg); - /* link them together for 1:1 */ - td->td_kse = ke; - ke->ke_thread = td; } -/* temporary version is ultra simple while we are in 1:1 mode */ -struct thread * -thread_get(struct proc *p) -{ - struct thread *td = &p->p_xxthread; - - return (td); -} - - -/********************* -* STUB KSE syscalls -*********************/ - -/* struct thread_wakeup_args { struct thread_mailbox *tmbx; }; */ int thread_wakeup(struct thread *td, struct thread_wakeup_args *uap) { @@ -219,7 +190,11 @@ int kse_yield(struct thread *td, struct kse_yield_args *uap) { - return(ENOSYS); + PROC_LOCK(td->td_proc); + mtx_lock_spin(&sched_lock); + thread_exit(); + /* NOTREACHED */ + return(0); } int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) @@ -228,16 +203,80 @@ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) return(ENOSYS); } - -int -kse_new(struct thread *td, struct kse_new_args *uap) +/* + * No new KSEG: first call: use current KSE, don't schedule an upcall + * All other situations, do alloate a new KSE and schedule an upcall on it. + */ /* struct kse_new_args { struct kse_mailbox *mbx; int new_grp_flag; }; */ +int +kse_new(struct thread *td, struct kse_new_args *uap) { + struct kse *newkse; + struct proc *p; + struct kse_mailbox mbx; + int err; - return (ENOSYS); + p = td->td_proc; + if ((err = copyin(uap->mbx, &mbx, sizeof(mbx)))) + return (err); + PROC_LOCK(p); + /* + * If we have no KSE mode set, just set it, and skip KSE and KSEGRP + * creation. You cannot request a new group with the first one as + * you are effectively getting one. Instead, go directly to saving + * the upcall info. + */ + if ((td->td_proc->p_flag & P_KSES) || (uap->new_grp_flag)) { + + return (EINVAL); /* XXX */ + /* + * If newgroup then create the new group. + * Check we have the resources for this. + */ + /* Copy lots of fields from the current KSEGRP. */ + /* Create the new KSE */ + /* Copy lots of fields from the current KSE. */ + } else { + /* + * We are switching to KSEs so just + * use the preallocated ones for this call. + * XXXKSE if we have to initialise any fields for KSE + * mode operation, do it here. + */ + newkse = td->td_kse; + } + /* + * Fill out the KSE-mode specific fields of the new kse. 
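 * These come straight from the mailbox the caller passed in; from the
 * UTS side the first call is expected to look roughly like this
 * (a sketch; uts_entry and uts_stack are invented identifiers):
 *
 *	struct kse_mailbox mbx;
 *
 *	mbx.kmbx_upcall = uts_entry;	UTS upcall entry point
 *	mbx.kmbx_stackbase = uts_stack;
 *	mbx.kmbx_stacksize = sizeof(uts_stack);
 *	kse_new(&mbx, 0);	first call: reuse the current KSE, set
 *				P_KSES, no upcall is scheduled
 *
 * A later call (or new_grp_flag != 0) is meant to allocate a fresh
 * KSE and schedule an upcall on it, but that path still returns
 * EINVAL above.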
+ */ + PROC_UNLOCK(p); + mtx_lock_spin(&sched_lock); + mi_switch(); /* Save current registers to PCB. */ + mtx_unlock_spin(&sched_lock); + newkse->ke_upcall = mbx.kmbx_upcall; + newkse->ke_stackbase = mbx.kmbx_stackbase; + newkse->ke_stacksize = mbx.kmbx_stacksize; + newkse->ke_mailbox = uap->mbx; + cpu_save_upcall(td, newkse); + /* Note that we are the returning syscall */ + td->td_retval[0] = 0; + td->td_retval[1] = 0; + + if ((td->td_proc->p_flag & P_KSES) || (uap->new_grp_flag)) { + thread_schedule_upcall(td, newkse); + } else { + /* + * Don't set this until we are truely ready, because + * things will start acting differently. Return to the + * calling code for the first time. Assuming we set up + * the mailboxes right, all syscalls after this will be + * asynchronous. + */ + td->td_proc->p_flag |= P_KSES; + } + return (0); } /* @@ -554,7 +593,7 @@ fixjobc(p, pgrp, entering) LIST_FOREACH(p, &p->p_children, p_sibling) { if ((hispgrp = p->p_pgrp) != pgrp && hispgrp->pg_session == mysession && - p->p_stat != SZOMB) { + p->p_state != PRS_ZOMBIE) { PGRP_LOCK(hispgrp); if (entering) hispgrp->pg_jobc++; @@ -583,7 +622,7 @@ orphanpg(pg) mtx_lock_spin(&sched_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { - if (p->p_stat == SSTOP) { + if (P_SHOULDSTOP(p)) { mtx_unlock_spin(&sched_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); @@ -674,7 +713,9 @@ fill_kinfo_proc(p, kp) kp->ki_sigcatch = p->p_procsig->ps_sigcatch; } mtx_lock_spin(&sched_lock); - if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { + if (p->p_state != PRS_NEW && + p->p_state != PRS_ZOMBIE && + p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; @@ -697,35 +738,65 @@ fill_kinfo_proc(p, kp) p->p_stats->p_cru.ru_stime.tv_usec; } td = FIRST_THREAD_IN_PROC(p); - if (td->td_wmesg != NULL) - strncpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg) - 1); - if (p->p_stat == SMTX) { - kp->ki_kiflag |= KI_MTXBLOCK; - strncpy(kp->ki_mtxname, td->td_mtxname, - sizeof(kp->ki_mtxname) - 1); + if (!(p->p_flag & P_KSES)) { + if (td->td_wmesg != NULL) { + strncpy(kp->ki_wmesg, td->td_wmesg, + sizeof(kp->ki_wmesg) - 1); + } + if (td->td_state == TDS_MTX) { + kp->ki_kiflag |= KI_MTXBLOCK; + strncpy(kp->ki_mtxname, td->td_mtxname, + sizeof(kp->ki_mtxname) - 1); + } } - kp->ki_stat = p->p_stat; + + if (p->p_state == PRS_NORMAL) { /* XXXKSE very aproximate */ + if ((td->td_state == TDS_RUNQ) || + (td->td_state == TDS_RUNNING)) { + kp->ki_stat = SRUN; + } else if (td->td_state == TDS_SLP) { + kp->ki_stat = SSLEEP; + } else if (P_SHOULDSTOP(p)) { + kp->ki_stat = SSTOP; + } else if (td->td_state == TDS_MTX) { + kp->ki_stat = SMTX; + } else { + kp->ki_stat = SWAIT; + } + } else if (p->p_state == PRS_ZOMBIE) { + kp->ki_stat = SZOMB; + } else { + kp->ki_stat = SIDL; + } + kp->ki_sflag = p->p_sflag; kp->ki_swtime = p->p_swtime; kp->ki_pid = p->p_pid; /* vvv XXXKSE */ - bintime2timeval(&p->p_runtime, &tv); - kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec; - kp->ki_pctcpu = p->p_kse.ke_pctcpu; - kp->ki_estcpu = td->td_ksegrp->kg_estcpu; - kp->ki_slptime = td->td_ksegrp->kg_slptime; - kp->ki_wchan = td->td_wchan; - kp->ki_pri.pri_level = td->td_priority; - kp->ki_pri.pri_user = td->td_ksegrp->kg_user_pri; - kp->ki_pri.pri_class = td->td_ksegrp->kg_pri_class; - kp->ki_pri.pri_native = td->td_base_pri; - kp->ki_nice = td->td_ksegrp->kg_nice; - kp->ki_rqindex = p->p_kse.ke_rqindex; - kp->ki_oncpu = p->p_kse.ke_oncpu; - kp->ki_lastcpu = td->td_lastcpu; - kp->ki_tdflags = 
td->td_flags; - kp->ki_pcb = td->td_pcb; - kp->ki_kstack = (void *)td->td_kstack; + if (!(p->p_flag & P_KSES)) { + bintime2timeval(&p->p_runtime, &tv); + kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec; + kp->ki_pctcpu = p->p_kse.ke_pctcpu; + kp->ki_estcpu = p->p_ksegrp.kg_estcpu; + kp->ki_slptime = p->p_ksegrp.kg_slptime; + kp->ki_wchan = td->td_wchan; + kp->ki_pri.pri_level = td->td_priority; + kp->ki_pri.pri_user = p->p_ksegrp.kg_user_pri; + kp->ki_pri.pri_class = p->p_ksegrp.kg_pri_class; + kp->ki_pri.pri_native = td->td_base_pri; + kp->ki_nice = p->p_ksegrp.kg_nice; + kp->ki_rqindex = p->p_kse.ke_rqindex; + kp->ki_oncpu = p->p_kse.ke_oncpu; + kp->ki_lastcpu = td->td_lastcpu; + kp->ki_tdflags = td->td_flags; + kp->ki_pcb = td->td_pcb; + kp->ki_kstack = (void *)td->td_kstack; + } else { + kp->ki_oncpu = -1; + kp->ki_lastcpu = -1; + kp->ki_tdflags = -1; + /* All the reast are 0 */ + } /* ^^^ XXXKSE */ mtx_unlock_spin(&sched_lock); sp = NULL; @@ -878,7 +949,7 @@ sysctl_kern_proc(SYSCTL_HANDLER_ARGS) /* * Skip embryonic processes. */ - if (p->p_stat == SIDL) { + if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index d2cb69d..0803cff 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -281,7 +281,6 @@ boot(int howto) DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { mtx_lock_spin(&sched_lock); - setrunqueue(curthread); curthread->td_proc->p_stats->p_ru.ru_nvcsw++; mi_switch(); /* Allow interrupt threads to run */ mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index a561a19..e8ded21 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -84,7 +84,7 @@ static int killpg1(struct thread *td, int sig, int pgid, int all); static int sig_ffs(sigset_t *set); static int sigprop(int sig); static void stop(struct proc *); - +static void tdsignal(struct thread *td, int sig, sig_t action); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); @@ -168,16 +168,18 @@ static int sigproptbl[NSIG] = { * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). + * XXXKSE the check for a pending stop is not done under KSE * * MP SAFE. */ int -cursig(struct proc *p) +cursig(struct thread *td) { + struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_NOTOWNED); - return (SIGPENDING(p) ? issignal(p) : 0); + return (SIGPENDING(p) ? issignal(td) : 0); } /* @@ -1042,7 +1044,7 @@ killpg1(td, sig, pgid, all) PROC_UNLOCK(p); continue; } - if (p->p_stat == SZOMB) { + if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } @@ -1243,12 +1245,10 @@ psignal(p, sig) register struct proc *p; register int sig; { - register int prop; register sig_t action; struct thread *td; -#ifdef SMP - struct ksegrp *kg; -#endif + register int prop; + KASSERT(_SIG_VALID(sig), ("psignal(): invalid signal %d\n", sig)); @@ -1257,7 +1257,6 @@ psignal(p, sig) KNOTE(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); - /* * If proc is traced, always give parent a chance; * if signal event is tracked by procfs, give *that* @@ -1283,29 +1282,6 @@ psignal(p, sig) action = SIG_DFL; } - /* - * bring the priority of a process up if we want it to get - * killed in this lifetime. - * XXXKSE think if a better way to do this. 
- * - * What we need to do is see if there is a thread that will - * be able to accept the signal. e.g. - * FOREACH_THREAD_IN_PROC() { - * if runnable, we're done - * else pick one at random. - * } - */ - /* XXXKSE - * For now there is one thread per proc. - * Effectively select one sucker thread.. - */ - td = FIRST_THREAD_IN_PROC(p); - mtx_lock_spin(&sched_lock); - if ((p->p_ksegrp.kg_nice > NZERO) && (action == SIG_DFL) && - (prop & SA_KILL) && ((p->p_flag & P_TRACED) == 0)) - p->p_ksegrp.kg_nice = NZERO; /* XXXKSE */ - mtx_unlock_spin(&sched_lock); - if (prop & SA_CONT) SIG_STOPSIGMASK(p->p_siglist); @@ -1316,48 +1292,125 @@ psignal(p, sig) * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ - if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && - action == SIG_DFL) + if ((prop & SA_TTYSTOP) && + (p->p_pgrp->pg_jobc == 0) && + (action == SIG_DFL)) return; SIG_CONTSIGMASK(p->p_siglist); } SIGADDSET(p->p_siglist, sig); mtx_lock_spin(&sched_lock); signotify(p); + mtx_unlock_spin(&sched_lock); /* - * Defer further processing for signals which are held, - * except that stopped processes must be continued by SIGCONT. + * Some signals have a process-wide effect and a per-thread + * component. Most processing occurs when the process next + * tries to cross the user boundary, however there are some + * times when processing needs to be done immediatly, such as + * waking up threads so that they can cross the user boundary. + * We try do the per-process part here. */ - if (action == SIG_HOLD && (!(prop & SA_CONT) || p->p_stat != SSTOP)) { - mtx_unlock_spin(&sched_lock); - return; - } - - switch (p->p_stat) { - - case SSLEEP: + if (P_SHOULDSTOP(p)) { /* - * If process is sleeping uninterruptibly - * we can't interrupt the sleep... the signal will - * be noticed when the process returns through - * trap() or syscall(). + * The process is in stopped mode. All the threads should be + * either winding down or already on the suspended queue. */ - if ((td->td_flags & TDF_SINTR) == 0) + if (p->p_flag & P_TRACED) { + /* + * The traced process is already stopped, + * so no further action is necessary. + * No signal can restart us. + */ goto out; + } + + if (sig == SIGKILL) { + /* + * SIGKILL sets process running. + * It will die elsewhere. + * All threads must be restarted. + */ + p->p_flag &= ~P_STOPPED; + goto runfast; + } + + if (prop & SA_CONT) { + /* + * If SIGCONT is default (or ignored), we continue the + * process but don't leave the signal in p_siglist as + * it has no further action. If SIGCONT is held, we + * continue the process and leave the signal in + * p_siglist. If the process catches SIGCONT, let it + * handle the signal itself. If it isn't waiting on + * an event, it goes back to run state. + * Otherwise, process goes back to sleep state. + */ + p->p_flag &= ~P_STOPPED_SGNL; + if (action == SIG_DFL) { + SIGDELSET(p->p_siglist, sig); + } else if (action == SIG_CATCH) { + /* + * The process wants to catch it so it needs + * to run at least one thread, but which one? + * It would seem that the answer would be to + * run an upcall in the next KSE to run, and + * deliver the signal that way. In a NON KSE + * process, we need to make sure that the + * single thread is runnable asap. + * XXXKSE for now however, make them all run. + */ + goto runfast; + } + /* + * The signal is not ignored or caught. + */ + mtx_lock_spin(&sched_lock); + thread_unsuspend(p); /* Checks if should do it. 
*/ + mtx_unlock_spin(&sched_lock); + goto out; + } + + if (prop & SA_STOP) { + /* + * Already stopped, don't need to stop again + * (If we did the shell could get confused). + */ + SIGDELSET(p->p_siglist, sig); + goto out; + } + /* - * Process is sleeping and traced... make it runnable - * so it can discover the signal in issignal() and stop - * for the parent. + * All other kinds of signals: + * If a thread is sleeping interruptibly, simulate a + * wakeup so that when it is continued it will be made + * runnable and can look at the signal. However, don't make + * the process runnable, leave it stopped. + * It may run a bit until it hits a thread_suspend_check(). + * + * XXXKSE I don't understand this at all. */ - if (p->p_flag & P_TRACED) - goto run; + mtx_lock_spin(&sched_lock); + FOREACH_THREAD_IN_PROC(p, td) { + if (td->td_wchan && (td->td_flags & TDF_SINTR)) { + if (td->td_flags & TDF_CVWAITQ) + cv_waitq_remove(td); + else + unsleep(td); + setrunnable(td); + } + } + mtx_unlock_spin(&sched_lock); + goto out; /* - * If SIGCONT is default (or ignored) and process is - * asleep, we are finished; the process should not - * be awakened. + * XXXKSE What about threads that are waiting on mutexes? + * Shouldn't they abort too? */ - if ((prop & SA_CONT) && action == SIG_DFL) { + } else if (p->p_state == PRS_NORMAL) { + if (prop & SA_CONT) { + /* + * Already active, don't need to start again. + */ SIGDELSET(p->p_siglist, sig); goto out; } @@ -1370,133 +1423,128 @@ psignal(p, sig) if (prop & SA_STOP) { if (action != SIG_DFL) goto runfast; + /* * If a child holding parent blocked, * stopping could cause deadlock. */ if (p->p_flag & P_PPWAIT) goto out; - mtx_unlock_spin(&sched_lock); SIGDELSET(p->p_siglist, sig); p->p_xstat = sig; PROC_LOCK(p->p_pptr); - if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) + if (!(p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP)) psignal(p->p_pptr, SIGCHLD); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); stop(p); + mtx_unlock_spin(&sched_lock); goto out; } else goto runfast; /* NOTREACHED */ + } else { + /* Not in "NORMAL" state. discard the signal. */ + SIGDELSET(p->p_siglist, sig); + goto out; + } - case SSTOP: - /* - * If traced process is already stopped, - * then no further action is necessary. - */ - if (p->p_flag & P_TRACED) - goto out; + /* + * The process is not stopped so we need to apply the signal to all the + * running threads. + */ - /* - * Kill signal always sets processes running. - */ - if (sig == SIGKILL) - goto runfast; +runfast: + FOREACH_THREAD_IN_PROC(p, td) + tdsignal(td, sig, action); + mtx_lock_spin(&sched_lock); + thread_unsuspend(p); + mtx_unlock_spin(&sched_lock); +out: + /* If we jump here, sched_lock should not be owned. */ + mtx_assert(&sched_lock, MA_NOTOWNED); +} - if (prop & SA_CONT) { - /* - * If SIGCONT is default (or ignored), we continue the - * process but don't leave the signal in p_siglist, as - * it has no further action. If SIGCONT is held, we - * continue the process and leave the signal in - * p_siglist. If the process catches SIGCONT, let it - * handle the signal itself. If it isn't waiting on - * an event, then it goes back to run state. - * Otherwise, process goes back to sleep state. - */ - if (action == SIG_DFL) - SIGDELSET(p->p_siglist, sig); - if (action == SIG_CATCH) - goto runfast; - /* - * XXXKSE - * do this for each thread. 
- */ - if (p->p_flag & P_KSES) { - mtx_assert(&sched_lock, - MA_OWNED | MA_NOTRECURSED); - FOREACH_THREAD_IN_PROC(p, td) { - if (td->td_wchan == NULL) { - setrunnable(td); /* XXXKSE */ - } else { - /* mark it as sleeping */ - } - } - } else { - p->p_flag |= P_CONTINUED; - wakeup(p->p_pptr); - if (td->td_wchan == NULL) - goto run; - p->p_stat = SSLEEP; - } - goto out; +/* + * The force of a signal has been directed against a single + * thread. We need to see what we can do about knocking it + * out of any sleep it may be in etc. + */ +static void +tdsignal(struct thread *td, int sig, sig_t action) +{ + struct proc *p = td->td_proc; + register int prop; + + prop = sigprop(sig); + + /* + * Bring the priority of a process up if we want it to get + * killed in this lifetime. + * XXXKSE we should shift the priority to the thread. + */ + mtx_lock_spin(&sched_lock); + if ((action == SIG_DFL) && (prop & SA_KILL)) { + if (td->td_priority > PUSER) { + td->td_priority = PUSER; } + } + mtx_unlock_spin(&sched_lock); - if (prop & SA_STOP) { - /* - * Already stopped, don't need to stop again. - * (If we did the shell could get confused.) - */ - SIGDELSET(p->p_siglist, sig); + /* + * Defer further processing for signals which are held, + * except that stopped processes must be continued by SIGCONT. + */ + if (action == SIG_HOLD) { + goto out; + } + mtx_lock_spin(&sched_lock); + if (td->td_state == TDS_SLP) { + /* + * If thread is sleeping uninterruptibly + * we can't interrupt the sleep... the signal will + * be noticed when the process returns through + * trap() or syscall(). + */ + if ((td->td_flags & TDF_SINTR) == 0) { + mtx_unlock_spin(&sched_lock); goto out; } - /* - * If process is sleeping interruptibly, then simulate a - * wakeup so that when it is continued, it will be made - * runnable and can look at the signal. But don't make - * the process runnable, leave it stopped. - * XXXKSE should we wake ALL blocked threads? + * Process is sleeping and traced. Make it runnable + * so it can discover the signal in issignal() and stop + * for its parent. */ - if (p->p_flag & P_KSES) { - FOREACH_THREAD_IN_PROC(p, td) { - if (td->td_wchan && (td->td_flags & TDF_SINTR)){ - if (td->td_flags & TDF_CVWAITQ) - cv_waitq_remove(td); - else - unsleep(td); /* XXXKSE */ - } - } - } else { - if (td->td_wchan && td->td_flags & TDF_SINTR) { - if (td->td_flags & TDF_CVWAITQ) - cv_waitq_remove(td); - else - unsleep(td); /* XXXKSE */ - } + if (p->p_flag & P_TRACED) { + p->p_flag &= ~P_STOPPED_TRACE; + goto run; } - goto out; + mtx_unlock_spin(&sched_lock); + /* + * If SIGCONT is default (or ignored) and process is + * asleep, we are finished; the process should not + * be awakened. + */ + if ((prop & SA_CONT) && action == SIG_DFL) { + SIGDELSET(p->p_siglist, sig); + goto out; + } + goto runfast; + /* NOTREACHED */ - default: + } else { /* - * SRUN, SIDL, SZOMB do nothing with the signal, + * Other states do nothing with the signal immediatly, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ - if (p->p_stat == SRUN) { + mtx_unlock_spin(&sched_lock); + if (td->td_state == TDS_RUNQ || + td->td_state == TDS_RUNNING) { + signotify(td->td_proc); #ifdef SMP - struct kse *ke; - struct thread *td = curthread; -/* we should only deliver to one thread.. but which one? 
*/ - FOREACH_KSEGRP_IN_PROC(p, kg) { - FOREACH_KSE_IN_GROUP(kg, ke) { - if (ke->ke_thread == td) { - continue; - } - forward_signal(ke->ke_thread); - } - } + if (td->td_state == TDS_RUNNING && td != curthread) + forward_signal(td); #endif } goto out; @@ -1506,21 +1554,17 @@ psignal(p, sig) runfast: /* * Raise priority to at least PUSER. - * XXXKSE Should we make them all run fast? - * Maybe just one would be enough? */ - - if (FIRST_THREAD_IN_PROC(p)->td_priority > PUSER) { - FIRST_THREAD_IN_PROC(p)->td_priority = PUSER; + mtx_lock_spin(&sched_lock); + if (td->td_priority > PUSER) { + td->td_priority = PUSER; } run: - /* If we jump here, sched_lock has to be owned. */ mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); - setrunnable(td); /* XXXKSE */ -out: + setrunnable(td); mtx_unlock_spin(&sched_lock); - /* Once we get here, sched_lock should not be owned. */ +out: mtx_assert(&sched_lock, MA_NOTOWNED); } @@ -1533,16 +1577,18 @@ out: * by checking the pending signal masks in cursig.) The normal call * sequence is * - * while (sig = cursig(curproc)) + * while (sig = cursig(curthread)) * postsig(sig); */ int -issignal(p) - register struct proc *p; +issignal(td) + struct thread *td; { + struct proc *p; sigset_t mask; register int sig, prop; + p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); @@ -1576,6 +1622,7 @@ issignal(p) PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); stop(p); + td->td_state = TDS_UNQUEUED; PROC_UNLOCK(p); DROP_GIANT(); p->p_stats->p_ru.ru_nivcsw++; @@ -1633,6 +1680,7 @@ issignal(p) #endif break; /* == ignore */ } +#if 0 /* * If there is a pending stop signal to process * with default action, stop here, @@ -1647,8 +1695,10 @@ issignal(p) break; /* == ignore */ p->p_xstat = sig; PROC_LOCK(p->p_pptr); - if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) + if ((p->p_pptr->p_procsig->ps_flag & + PS_NOCLDSTOP) == 0) { psignal(p->p_pptr, SIGCHLD); + } PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); stop(p); @@ -1660,7 +1710,9 @@ issignal(p) PICKUP_GIANT(); PROC_LOCK(p); break; - } else if (prop & SA_IGNORE) { + } else +#endif + if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. @@ -1706,7 +1758,7 @@ stop(p) PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); - p->p_stat = SSTOP; + p->p_flag |= P_STOPPED_SGNL; p->p_flag &= ~P_WAITED; wakeup(p->p_pptr); } diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index 5e32eee..c63091c 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -538,7 +538,6 @@ uio_yield() mtx_lock_spin(&sched_lock); DROP_GIANT(); td->td_priority = td->td_ksegrp->kg_user_pri; /* XXXKSE */ - setrunqueue(td); td->td_proc->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 2b531c0..40d3ef8 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -26,6 +26,69 @@ * $FreeBSD$ */ +/*** + +Here is the logic.. + +If there are N processors, then there are at most N KSEs (kernel +schedulable entities) working to process threads that belong to a +KSEGOUP (kg). If there are X of these KSEs actually running at the +moment in question, then there are at most M (N-X) of these KSEs on +the run queue, as running KSEs are not on the queue. + +Runnable threads are queued off the KSEGROUP in priority order. 
+If there are M or more threads runnable, the top M threads +(by priority) are 'preassigned' to the M KSEs not running. The KSEs take +their priority from those threads and are put on the run queue. + +The last thread that had a priority high enough to have a KSE associated +with it, AND IS ON THE RUN QUEUE is pointed to by +kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs +assigned as all the available KSEs are activly running, or because there +are no threads queued, that pointer is NULL. + +When a KSE is removed from the run queue to become runnable, we know +it was associated with the highest priority thread in the queue (at the head +of the queue). If it is also the last assigned we know M was 1 and must +now be 0. Since the thread is no longer queued that pointer must be +removed from it. Since we know there were no more KSEs available, +(M was 1 and is now 0) and since we are not FREEING our KSE +but using it, we know there are STILL no more KSEs available, we can prove +that the next thread in the ksegrp list will not have a KSE to assign to +it, so we can show that the pointer must be made 'invalid' (NULL). + +The pointer exists so that when a new thread is made runnable, it can +have its priority compared with the last assigned thread to see if +it should 'steal' its KSE or not.. i.e. is it 'earlier' +on the list than that thread or later.. If it's earlier, then the KSE is +removed from the last assigned (which is now not assigned a KSE) +and reassigned to the new thread, which is placed earlier in the list. +The pointer is then backed up to the previous thread (which may or may not +be the new thread). + +When a thread sleeps or is removed, the KSE becomes available and if there +are queued threads that are not assigned KSEs, the highest priority one of +them is assigned the KSE, which is then placed back on the run queue at +the approipriate place, and the kg->kg_last_assigned pointer is adjusted down +to point to it. + +The following diagram shows 2 KSEs and 3 threads from a single process. + + RUNQ: --->KSE---KSE--... (KSEs queued at priorities from threads) + \ \____ + \ \ + KSEGROUP---thread--thread--thread (queued in priority order) + \ / + \_______________/ + (last_assigned) + +The result of this scheme is that the M available KSEs are always +queued at the priorities they have inherrited from the M highest priority +threads for that KSEGROUP. If this situation changes, the KSEs are +reassigned to keep this true. + +*/ + #include #include #include @@ -44,34 +107,442 @@ CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); static struct runq runq; SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) +static void runq_readjust(struct runq *rq, struct kse *ke); +/************************************************************************ + * Functions that manipulate runnability from a thread perspective. * + ************************************************************************/ + /* - * Wrappers which implement old interface; act on global run queue. + * Select the KSE that will be run next. From that find the thread, and x + * remove it from the KSEGRP's run queue. If there is thread clustering, + * this will be what does it. 
*/ - struct thread * choosethread(void) { - return (runq_choose(&runq)->ke_thread); + struct kse *ke; + struct thread *td; + struct ksegrp *kg; + + if ((ke = runq_choose(&runq))) { + td = ke->ke_thread; + KASSERT((td->td_kse == ke), ("kse/thread mismatch")); + kg = ke->ke_ksegrp; + if (td->td_flags & TDF_UNBOUND) { + TAILQ_REMOVE(&kg->kg_runq, td, td_runq); + if (kg->kg_last_assigned == td) + if (TAILQ_PREV(td, threadqueue, td_runq) + != NULL) + printf("Yo MAMA!\n"); + kg->kg_last_assigned = TAILQ_PREV(td, + threadqueue, td_runq); + /* + * If we have started running an upcall, + * Then TDF_UNBOUND WAS set because the thread was + * created without a KSE. Now that we have one, + * and it is our time to run, we make sure + * that BOUND semantics apply for the rest of + * the journey to userland, and into the UTS. + */ +#ifdef NOTYET + if (td->td_flags & TDF_UPCALLING) + tdf->td_flags &= ~TDF_UNBOUND; +#endif + } + kg->kg_runnable--; + CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d", + td, td->td_priority); + } else { + /* Pretend the idle thread was on the run queue. */ + td = PCPU_GET(idlethread); + /* Simulate that it was on the run queue */ + td->td_state = TDS_RUNQ; + td->td_kse->ke_state = KES_UNQUEUED; + CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td); + } + thread_sanity_check(td); + return (td); +} + +/* + * Given a KSE (now surplus), either assign a new runable thread to it + * (and put it in the run queue) or put it in the ksegrp's idle KSE list. + * Assumes the kse is not linked to any threads any more. (has been cleaned). + */ +void +kse_reassign(struct kse *ke) +{ + struct ksegrp *kg; + struct thread *td; + + kg = ke->ke_ksegrp; + +KASSERT((ke->ke_state != KES_ONRUNQ), ("kse_reassigning non-free kse")); + /* + * Find the first unassigned thread + * If there is a 'last assigned' then see what's next. + * otherwise look at what is first. + */ + if ((td = kg->kg_last_assigned)) { + td = TAILQ_NEXT(td, td_runq); + } else { + td = TAILQ_FIRST(&kg->kg_runq); + } + + /* + * If we found one assign it the kse, otherwise idle the kse. + */ + if (td) { + thread_sanity_check(td); + kg->kg_last_assigned = td; + td->td_kse = ke; + ke->ke_thread = td; + runq_add(&runq, ke); + CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td); + } else { + KASSERT((ke->ke_state != KES_IDLE), ("kse already idle")); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + ke->ke_state = KES_IDLE; + ke->ke_thread = NULL; + TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist); + kg->kg_idle_kses++; + CTR1(KTR_RUNQ, "kse_reassign: ke%p idled", ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self2!")); + } } int -procrunnable(void) +kserunnable(void) { return runq_check(&runq); } +/* + * Remove a thread from its KSEGRP's run queue. + * This in turn may remove it from a KSE if it was already assigned + * to one, possibly causing a new thread to be assigned to the KSE + * and the KSE getting a new priority (unless it's a BOUND thread/KSE pair). + */ void remrunqueue(struct thread *td) { - runq_remove(&runq, td->td_kse); + struct thread *td2, *td3; + struct ksegrp *kg; + struct kse *ke; + + mtx_assert(&sched_lock, MA_OWNED); + thread_sanity_check(td); + KASSERT ((td->td_state == TDS_RUNQ), + ("remrunqueue: Bad state on run queue")); + kg = td->td_ksegrp; + ke = td->td_kse; + /* + * If it's a bound thread/KSE pair, take the shortcut. All non-KSE + * threads are BOUND. 
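 *
 * In outline (a sketch of the intent, not a copy of the code below):
 *
 *	BOUND (TDF_UNBOUND clear):
 *		runq_remove(&runq, ke);		the KSE leaves with the
 *						thread, nothing else moves
 *
 *	UNBOUND, with a KSE assigned:
 *		give the KSE to the next unassigned thread on kg_runq
 *		and advance kg_last_assigned to it; or, if there is no
 *		thread left to give it to, idle the KSE on kg_iq (and
 *		if td itself was the last assigned thread, back
 *		kg_last_assigned up one, possibly to NULL);
 *		finally take td itself off kg_runq.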
+ */ + CTR1(KTR_RUNQ, "remrunqueue: td%p", td); + td->td_state = TDS_UNQUEUED; + kg->kg_runnable--; + if ((td->td_flags & TDF_UNBOUND) == 0) { + /* Bring its kse with it, leave the thread attached */ + runq_remove(&runq, ke); + ke->ke_state = KES_UNQUEUED; + return; + } + if (ke) { + /* + * This thread has been assigned to a KSE. + * We need to dissociate it and try assign the + * KSE to the next available thread. Then, we should + * see if we need to move the KSE in the run queues. + */ + td2 = kg->kg_last_assigned; + KASSERT((td2 != NULL), ("last assigned has wrong value ")); + td->td_kse = NULL; + if ((td3 = TAILQ_NEXT(td2, td_runq))) { + KASSERT(td3 != td, ("td3 somehow matched td")); + /* + * Give the next unassigned thread to the KSE + * so the number of runnable KSEs remains + * constant. + */ + td3->td_kse = ke; + ke->ke_thread = td3; + kg->kg_last_assigned = td3; + runq_readjust(&runq, ke); + } else { + /* + * There is no unassigned thread. + * If we were the last assigned one, + * adjust the last assigned pointer back + * one, which may result in NULL. + */ + if (td == td2) { + kg->kg_last_assigned = + TAILQ_PREV(td, threadqueue, td_runq); + } + runq_remove(&runq, ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + KASSERT((ke->ke_state != KES_IDLE), + ("kse already idle")); + ke->ke_state = KES_IDLE; + ke->ke_thread = NULL; +KASSERT((TAILQ_FIRST(&kg->kg_iq) != ke), ("really bad screwup")); + TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist); + kg->kg_idle_kses++; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self2!")); + } + } + TAILQ_REMOVE(&kg->kg_runq, td, td_runq); + thread_sanity_check(td); } +#if 1 /* use the first version */ + void setrunqueue(struct thread *td) { - runq_add(&runq, td->td_kse); + struct kse *ke; + struct ksegrp *kg; + struct thread *td2; + struct thread *tda; + + CTR1(KTR_RUNQ, "setrunqueue: td%p", td); + mtx_assert(&sched_lock, MA_OWNED); + thread_sanity_check(td); + KASSERT((td->td_state != TDS_RUNQ), ("setrunqueue: bad thread state")); + td->td_state = TDS_RUNQ; + kg = td->td_ksegrp; + kg->kg_runnable++; + if ((td->td_flags & TDF_UNBOUND) == 0) { + KASSERT((td->td_kse != NULL), + ("queueing BAD thread to run queue")); + /* + * Common path optimisation: Only one of everything + * and the KSE is always already attached. + * Totally ignore the ksegrp run queue. + */ + runq_add(&runq, td->td_kse); + return; + } + /* + * Ok, so we are threading with this thread. + * We don't have a KSE, see if we can get one.. + */ + tda = kg->kg_last_assigned; + if ((ke = td->td_kse) == NULL) { + /* + * We will need a KSE, see if there is one.. + * First look for a free one, before getting desperate. + * If we can't get one, our priority is not high enough.. + * that's ok.. + */ + if (kg->kg_idle_kses) { + /* + * There is a free one so it's ours for the asking.. + */ + ke = TAILQ_FIRST(&kg->kg_iq); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self3!")); + TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); + ke->ke_state = KES_UNQUEUED; + kg->kg_idle_kses--; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self4!")); + } else if (tda && (tda->td_priority > td->td_priority)) { + /* + * None free, but there is one we can commandeer. 
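The decision being made here, whether an unbound thread entering the run queue gets a KSE right away, reduces to two tests. A sketch using the same simplified model types and an invented model_* name; in the kernel the 'commandeer' case also has to strip the KSE from the victim thread and back kg_last_assigned up by one entry.

/* Should the newly runnable thread be given a KSE immediately? */
static int
model_can_get_kse(int idle_kses, const struct xthread *last_assigned,
    const struct xthread *newtd)
{
	if (idle_kses > 0)
		return (1);	/* a free KSE sits on the ksegrp idle list */
	if (last_assigned != NULL && last_assigned->pri > newtd->pri)
		return (1);	/* commandeer the last-assigned thread's KSE */
	return (0);		/* not high enough priority; wait for one */
}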
+ */ + ke = tda->td_kse; + tda->td_kse = NULL; + ke->ke_thread = NULL; + tda = kg->kg_last_assigned = + TAILQ_PREV(tda, threadqueue, td_runq); + runq_remove(&runq, ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self5!")); + } + } else { + KASSERT(ke->ke_thread == td, ("KSE/thread mismatch")); + KASSERT(ke->ke_state != KES_IDLE, ("KSE unexpectedly idle")); + ke->ke_thread = NULL; + td->td_kse = NULL; + } + + /* + * Add the thread to the ksegrp's run queue at + * the appropriate place. + */ + TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { + if (td2->td_priority > td->td_priority) { + TAILQ_INSERT_BEFORE(td2, td, td_runq); + break; + } + } + if (td2 == NULL) { + /* We ran off the end of the TAILQ or it was empty. */ + TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq); + } + + /* + * If we have a ke to use, then put it on the run queue and + * If needed, readjust the last_assigned pointer. + */ + if (ke) { + if (tda == NULL) { + /* + * No pre-existing last assigned so whoever is first + * gets the KSE we borught in.. (may be us) + */ + td2 = TAILQ_FIRST(&kg->kg_runq); + KASSERT((td2->td_kse == NULL), + ("unexpected ke present")); + td2->td_kse = ke; + ke->ke_thread = td2; + kg->kg_last_assigned = td2; + } else if (tda->td_priority > td->td_priority) { + /* + * It's ours, grab it, but last_assigned is past us + * so don't change it. + */ + td->td_kse = ke; + ke->ke_thread = td; + } else { + /* + * We are past last_assigned, so + * put the new kse on whatever is next, + * which may or may not be us. + */ + td2 = TAILQ_NEXT(tda, td_runq); + kg->kg_last_assigned = td2; + td2->td_kse = ke; + ke->ke_thread = td2; + } + runq_add(&runq, ke); + } + thread_sanity_check(td); } +#else + +void +setrunqueue(struct thread *td) +{ + struct kse *ke; + struct ksegrp *kg; + struct thread *td2; + + CTR1(KTR_RUNQ, "setrunqueue: td%p", td); + KASSERT((td->td_state != TDS_RUNQ), ("setrunqueue: bad thread state")); + td->td_state = TDS_RUNQ; + kg = td->td_ksegrp; + kg->kg_runnable++; + if ((td->td_flags & TDF_UNBOUND) == 0) { + /* + * Common path optimisation: Only one of everything + * and the KSE is always already attached. + * Totally ignore the ksegrp run queue. + */ + runq_add(&runq, td->td_kse); + return; + } + /* + * First add the thread to the ksegrp's run queue at + * the appropriate place. + */ + TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { + if (td2->td_priority > td->td_priority) { + TAILQ_INSERT_BEFORE(td2, td, td_runq); + break; + } + } + if (td2 == NULL) { + /* We ran off the end of the TAILQ or it was empty. */ + TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq); + } + + /* + * The following could be achieved by simply doing: + * td->td_kse = NULL; kse_reassign(ke); + * but I felt that I'd try do it inline here. + * All this work may not be worth it. + */ + if ((ke = td->td_kse)) { /* XXXKSE */ + /* + * We have a KSE already. See whether we can keep it + * or if we need to give it to someone else. + * Either way it will need to be inserted into + * the runq. kse_reassign() will do this as will runq_add(). + */ + if ((kg->kg_last_assigned) && + (kg->kg_last_assigned->td_priority > td->td_priority)) { + /* + * We can definitly keep the KSE + * as the "last assignead thread" has + * less priority than we do. + * The "last assigned" pointer stays the same. + */ + runq_add(&runq, ke); + return; + + } + /* + * Give it to the correct thread, + * which may be (often is) us, but may not be. + */ + td->td_kse = NULL; + kse_reassign(ke); + return; + } + /* + * There are two cases where KSE adjustment is needed. 
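The per-ksegrp queue insertion used above is a plain priority-ordered TAILQ insert. A sketch with the same model types; the kernel version then goes on to decide which queued thread (possibly a different one) actually receives the KSE.

/* Insert td before the first queued thread with a worse priority. */
static void
model_kg_runq_insert(struct xthreadq *q, struct xthread *td)
{
	struct xthread *td2;

	TAILQ_FOREACH(td2, q, runq) {
		if (td2->pri > td->pri) {
			TAILQ_INSERT_BEFORE(td2, td, runq);
			return;
		}
	}
	TAILQ_INSERT_TAIL(q, td, runq);	/* ran off the end, or empty */
}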
+ * Usurpation of an already assigned KSE, and assignment + * of a previously IDLE KSE. + */ + if (kg->kg_idle_kses) { + /* + * If there are unassigned KSEs then we definitly + * will be assigned one from the idle KSE list. + * If we are the last, we should get the "last + * assigned" pointer set to us as well. + */ + ke = TAILQ_FIRST(&kg->kg_iq); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); + ke->ke_state = KES_UNQUEUED; + kg->kg_idle_kses--; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + ke->ke_thread = td; + td->td_kse = ke; + runq_add(&runq, ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + if (TAILQ_NEXT(td, td_runq) == NULL) { + kg->kg_last_assigned = td; + } + } else if (kg->kg_last_assigned && + (kg->kg_last_assigned->td_priority > td->td_priority)) { + /* + * If there were none last-assigned, all KSEs + * are actually out running as we speak. + * If there was a last assigned, but we didn't see it, + * we must be inserting before it, so take the KSE from + * the last assigned, and back it up one entry. Then, + * assign the KSE to the new thread and adjust its priority. + */ + td2 = kg->kg_last_assigned; + ke = td2->td_kse; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + kg->kg_last_assigned = + TAILQ_PREV(td2, threadqueue, td_runq); + td2->td_kse = NULL; + td->td_kse = ke; + ke->ke_thread = td; + runq_readjust(&runq, ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + } +} +#endif + +/************************************************************************ + * Critical section marker functions * + ************************************************************************/ /* Critical sections that prevent preemption. */ void critical_enter(void) @@ -98,6 +569,23 @@ critical_exit(void) } } + +/************************************************************************ + * SYSTEM RUN QUEUE manipulations and tests * + ************************************************************************/ +/* + * Initialize a run structure. + */ +void +runq_init(struct runq *rq) +{ + int i; + + bzero(rq, sizeof *rq); + for (i = 0; i < RQ_NQS; i++) + TAILQ_INIT(&rq->rq_queues[i]); +} + /* * Clear the status bit of the queue corresponding to priority level pri, * indicating that it is empty. @@ -156,7 +644,7 @@ runq_setbit(struct runq *rq, int pri) } /* - * Add the process to the queue specified by its priority, and set the + * Add the KSE to the queue specified by its priority, and set the * corresponding status bit. 
*/ void @@ -165,14 +653,16 @@ runq_add(struct runq *rq, struct kse *ke) struct rqhead *rqh; int pri; -#ifdef INVARIANTS - struct proc *p = ke->ke_proc; -#endif - if (ke->ke_flags & KEF_ONRUNQ) - return; mtx_assert(&sched_lock, MA_OWNED); - KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN", - p, p->p_comm)); + KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); + KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread")); + if (ke->ke_state == KES_ONRUNQ) + return; +#if defined(INVARIANTS) && defined(DIAGNOSTIC) + KASSERT(ke->ke_state != KES_ONRUNQ, + ("runq_add: kse %p (%s) already in run queue", ke, + ke->ke_proc->p_comm)); +#endif pri = ke->ke_thread->td_priority / RQ_PPQ; ke->ke_rqindex = pri; runq_setbit(rq, pri); @@ -180,7 +670,8 @@ runq_add(struct runq *rq, struct kse *ke) CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p", ke->ke_proc, ke->ke_thread->td_priority, pri, rqh); TAILQ_INSERT_TAIL(rqh, ke, ke_procq); - ke->ke_flags |= KEF_ONRUNQ; + ke->ke_ksegrp->kg_runq_kses++; + ke->ke_state = KES_ONRUNQ; } /* @@ -219,43 +710,38 @@ runq_choose(struct runq *rq) int pri; mtx_assert(&sched_lock, MA_OWNED); - if ((pri = runq_findbit(rq)) != -1) { + while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; ke = TAILQ_FIRST(rqh); KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); - KASSERT(ke->ke_proc->p_stat == SRUN, - ("runq_choose: process %d(%s) in state %d", ke->ke_proc->p_pid, - ke->ke_proc->p_comm, ke->ke_proc->p_stat)); - CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); + CTR3(KTR_RUNQ, + "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); +KASSERT(ke->ke_procq.tqe_prev != NULL, ("no prev")); +if (ke->ke_procq.tqe_next) + KASSERT(ke->ke_procq.tqe_next->ke_procq.tqe_prev != NULL, ("no next")); TAILQ_REMOVE(rqh, ke, ke_procq); + ke->ke_ksegrp->kg_runq_kses--; if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_choose: empty"); runq_clrbit(rq, pri); } - ke->ke_flags &= ~KEF_ONRUNQ; + + ke->ke_state = KES_RUNNING; + KASSERT((ke->ke_thread != NULL), + ("runq_choose: No thread on KSE")); + KASSERT((ke->ke_thread->td_kse != NULL), + ("runq_choose: No KSE on thread")); return (ke); } CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); - return (PCPU_GET(idlethread)->td_kse); + return (NULL); } /* - * Initialize a run structure. - */ -void -runq_init(struct runq *rq) -{ - int i; - - bzero(rq, sizeof *rq); - for (i = 0; i < RQ_NQS; i++) - TAILQ_INIT(&rq->rq_queues[i]); -} - -/* - * Remove the process from the queue specified by its priority, and clear the + * Remove the KSE from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. + * Caller must set ke->ke_state afterwards. 
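The status bits maintained by runq_add(), runq_remove() and runq_choose() implement a bitmap-indexed run queue: one bit per priority queue, so "pick the next KSE" becomes a find-first-set over the bitmap. Below is a single-word user-space sketch of the idea, not the kernel implementation: the kernel spreads the bits over an array of words sized so that RQB_BPW * RQB_LEN == RQ_NQS and uses an optimised bit-search, and the MODEL_* names here are invented.

#include <stdint.h>

#define	MODEL_NQS	64		/* stand-in for RQ_NQS */

static uint64_t model_status;		/* bit N set => queue N non-empty */

static void
model_setbit(int pri)
{
	model_status |= (uint64_t)1 << pri;
}

static void
model_clrbit(int pri)
{
	model_status &= ~((uint64_t)1 << pri);
}

/* Lowest-numbered (best) non-empty priority queue, or -1 if all are empty. */
static int
model_findbit(void)
{
	int pri;

	for (pri = 0; pri < MODEL_NQS; pri++)
		if (model_status & ((uint64_t)1 << pri))
			return (pri);
	return (-1);
}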
*/ void runq_remove(struct runq *rq, struct kse *ke) @@ -263,8 +749,7 @@ runq_remove(struct runq *rq, struct kse *ke) struct rqhead *rqh; int pri; - if (!(ke->ke_flags & KEF_ONRUNQ)) - return; + KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); mtx_assert(&sched_lock, MA_OWNED); pri = ke->ke_rqindex; rqh = &rq->rq_queues[pri]; @@ -276,5 +761,104 @@ runq_remove(struct runq *rq, struct kse *ke) CTR0(KTR_RUNQ, "runq_remove: empty"); runq_clrbit(rq, pri); } - ke->ke_flags &= ~KEF_ONRUNQ; + ke->ke_state = KES_UNQUEUED; + ke->ke_ksegrp->kg_runq_kses--; +} + +static void +runq_readjust(struct runq *rq, struct kse *ke) +{ + + if (ke->ke_rqindex != (ke->ke_thread->td_priority / RQ_PPQ)) { + runq_remove(rq, ke); + runq_add(rq, ke); + } +} + +void +thread_sanity_check(struct thread *td) +{ + struct proc *p; + struct ksegrp *kg; + struct kse *ke; + struct thread *td2; + unsigned int prevpri; + int saw_lastassigned; + int unassigned; + int assigned; + + p = td->td_proc; + kg = td->td_ksegrp; + ke = td->td_kse; + + if (kg != &p->p_ksegrp) { + panic ("wrong ksegrp"); + } + + if (ke) { + if (ke != &p->p_kse) { + panic("wrong kse"); + } + if (ke->ke_thread != td) { + panic("wrong thread"); + } + } + + if ((p->p_flag & P_KSES) == 0) { + if (ke == NULL) { + panic("non KSE thread lost kse"); + } + } else { + prevpri = 0; + saw_lastassigned = 0; + unassigned = 0; + assigned = 0; + TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { + if (td2->td_priority < prevpri) { + panic("thread runqueue unosorted"); + } + prevpri = td2->td_priority; + if (td2->td_kse) { + assigned++; + if (unassigned) { + panic("unassigned before assigned"); + } + if (kg->kg_last_assigned == NULL) { + panic("lastassigned corrupt"); + } + if (saw_lastassigned) { + panic("last assigned not last"); + } + if (td2->td_kse->ke_thread != td2) { + panic("mismatched kse/thread"); + } + } else { + unassigned++; + } + if (td2 == kg->kg_last_assigned) { + saw_lastassigned = 1; + if (td2->td_kse == NULL) { + panic("last assigned not assigned"); + } + } + } + if (kg->kg_last_assigned && (saw_lastassigned == 0)) { + panic("where on earth does lastassigned point?"); + } + FOREACH_THREAD_IN_GROUP(kg, td2) { + if (((td2->td_flags & TDF_UNBOUND) == 0) && + (td2->td_state == TDS_RUNQ)) { + assigned++; + if (td2->td_kse == NULL) { + panic ("BOUND thread with no KSE"); + } + } + } +#if 0 + if ((unassigned + assigned) != kg->kg_runnable) { + panic("wrong number in runnable"); + } +#endif + } } + diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index bd1a625..a2a44ff 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -277,9 +277,13 @@ schedcpu(arg) * with 16-bit int's (remember them?) * overflow takes 45 days. 
*/ - /* XXXKSE */ - /* if ((ke->ke_flags & KEF_ONRUNQ) == 0) */ - if (p->p_stat == SSLEEP || p->p_stat == SSTOP) { + /* XXXKSE **WRONG***/ + /* + * the kse slptimes are not touched in wakeup + * because the thread may not HAVE a KSE + */ + if (ke->ke_state == KES_ONRUNQ && + ke->ke_state == KES_RUNNING) { ke->ke_slptime++; } else { ke->ke_slptime = 0; @@ -321,20 +325,31 @@ schedcpu(arg) } kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); resetpriority(kg); - td = FIRST_THREAD_IN_PROC(p); - if (td->td_priority >= PUSER && - (p->p_sflag & PS_INMEM)) { - int changedqueue = - ((td->td_priority / RQ_PPQ) != - (kg->kg_user_pri / RQ_PPQ)); - - td->td_priority = kg->kg_user_pri; - FOREACH_KSE_IN_GROUP(kg, ke) { - if ((ke->ke_oncpu == NOCPU) && - (p->p_stat == SRUN) && /* XXXKSE */ - changedqueue) { - remrunqueue(ke->ke_thread); - setrunqueue(ke->ke_thread); + FOREACH_THREAD_IN_GROUP(kg, td) { + int changedqueue; + if (td->td_priority >= PUSER) { + /* + * Only change the priority + * of threads that are still at their + * user priority. + * XXXKSE This is problematic + * as we may need to re-order + * the threads on the KSEG list. + */ + changedqueue = + ((td->td_priority / RQ_PPQ) != + (kg->kg_user_pri / RQ_PPQ)); + + td->td_priority = kg->kg_user_pri; + if (changedqueue && + td->td_state == TDS_RUNQ) { + /* this could be optimised */ + remrunqueue(td); + td->td_priority = + kg->kg_user_pri; + setrunqueue(td); + } else { + td->td_priority = kg->kg_user_pri; } } } @@ -409,6 +424,7 @@ sleepinit(void) * entered before msleep returns. If priority includes the PDROP * flag the mutex is not entered before returning. */ + int msleep(ident, mtx, priority, wmesg, timo) void *ident; @@ -426,9 +442,48 @@ msleep(ident, mtx, priority, wmesg, timo) if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0); #endif + KASSERT((td->td_kse != NULL), ("msleep: NULL KSE?")); + KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse state?")); WITNESS_SLEEP(0, &mtx->mtx_object); KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL, ("sleeping without a mutex")); + /* + * If we are capable of async syscalls and there isn't already + * another one ready to return, start a new thread + * and queue it as ready to run. Note that there is danger here + * because we need to make sure that we don't sleep allocating + * the thread (recursion here might be bad). + * Hence the TDF_INMSLEEP flag. + */ + if (p->p_flag & P_KSES) { + /* Just don't bother if we are exiting + and not the exiting thread. */ + if ((p->p_flag & P_WEXIT) && catch && p->p_singlethread != td) + return (EINTR); + if (td->td_mailbox && (!(td->td_flags & TDF_INMSLEEP))) { + /* + * If we have no queued work to do, then + * upcall to the UTS to see if it has more to do. + * We don't need to upcall now, just make it and + * queue it. + */ + mtx_lock_spin(&sched_lock); + if (TAILQ_FIRST(&td->td_ksegrp->kg_runq) == NULL) { + /* Don't recurse here! 
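The TDF_INMSLEEP dance above is a re-entry guard: building and queueing the upcall thread may itself end up sleeping, so a flag on the current thread keeps this path from recursing. A minimal stand-alone sketch of the pattern, with invented model_* names standing in for the thread flag and for thread_schedule_upcall():

#define	MODEL_INMSLEEP	0x0001		/* stands in for TDF_INMSLEEP */

struct xsleeper {
	int	flags;
};

/* Stub for the work that may itself sleep (thread_schedule_upcall()). */
static void
model_schedule_upcall(struct xsleeper *s)
{
	(void)s;
}

static void
model_maybe_upcall(struct xsleeper *s)
{
	if (s->flags & MODEL_INMSLEEP)
		return;				/* already building one */
	s->flags |= MODEL_INMSLEEP;		/* forbid recursion ... */
	model_schedule_upcall(s);		/* ... around the risky part */
	s->flags &= ~MODEL_INMSLEEP;
}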
*/ + KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse stateX?")); + td->td_flags |= TDF_INMSLEEP; + thread_schedule_upcall(td, td->td_kse); + td->td_flags &= ~TDF_INMSLEEP; + KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse stateY?")); + } + mtx_unlock_spin(&sched_lock); + } + KASSERT((td->td_kse != NULL), ("msleep: NULL KSE2?")); + KASSERT((td->td_kse->ke_state == KES_RUNNING), + ("msleep: kse state2?")); + KASSERT((td->td_kse->ke_thread == td), + ("msleep: kse/thread mismatch?")); + } mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* @@ -454,7 +509,7 @@ msleep(ident, mtx, priority, wmesg, timo) } KASSERT(p != NULL, ("msleep1")); - KASSERT(ident != NULL && td->td_proc->p_stat == SRUN, ("msleep")); + KASSERT(ident != NULL && td->td_state == TDS_RUNNING, ("msleep")); td->td_wchan = ident; td->td_wmesg = wmesg; @@ -468,20 +523,23 @@ msleep(ident, mtx, priority, wmesg, timo) callout_reset(&td->td_slpcallout, timo, endtsleep, td); /* * We put ourselves on the sleep queue and start our timeout - * before calling cursig, as we could stop there, and a wakeup - * or a SIGCONT (or both) could occur while we were stopped. - * A SIGCONT would cause us to be marked as SSLEEP + * before calling thread_suspend_check, as we could stop there, and + * a wakeup or a SIGCONT (or both) could occur while we were stopped. * without resuming us, thus we must be ready for sleep * when cursig is called. If the wakeup happens while we're * stopped, td->td_wchan will be 0 upon return from cursig. */ if (catch) { - CTR3(KTR_PROC, "msleep caught: proc %p (pid %d, %s)", p, + CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); td->td_flags |= TDF_SINTR; mtx_unlock_spin(&sched_lock); PROC_LOCK(p); - sig = cursig(p); + sig = cursig(td); + if (thread_suspend_check(1)) { + sig = EINTR; + rval = EINTR; + } mtx_lock_spin(&sched_lock); PROC_UNLOCK(p); if (sig != 0) { @@ -492,13 +550,13 @@ msleep(ident, mtx, priority, wmesg, timo) } else sig = 0; if (td->td_wchan != NULL) { - td->td_proc->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; + td->td_state = TDS_SLP; mi_switch(); } - CTR3(KTR_PROC, "msleep resume: proc %p (pid %d, %s)", td, p->p_pid, + CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); - KASSERT(td->td_proc->p_stat == SRUN, ("running but not SRUN")); + KASSERT(td->td_state == TDS_RUNNING, ("running but not TDS_RUNNING")); td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_TIMEOUT) { td->td_flags &= ~TDF_TIMEOUT; @@ -524,8 +582,8 @@ msleep(ident, mtx, priority, wmesg, timo) if (rval == 0 && catch) { PROC_LOCK(p); - /* XXX: shouldn't we always be calling cursig() */ - if (sig != 0 || (sig = cursig(p))) { + /* XXX: shouldn't we always be calling cursig() */ + if (sig != 0 || (sig = cursig(td))) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else @@ -571,7 +629,7 @@ endtsleep(arg) td->td_flags &= ~TDF_TIMEOUT; setrunqueue(td); } else if (td->td_wchan != NULL) { - if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */ + if (td->td_state == TDS_SLP) /* XXXKSE */ setrunnable(td); else unsleep(td); @@ -583,6 +641,38 @@ endtsleep(arg) } /* + * Abort a thread, as if an interrupt had occured. Only abort + * interruptable waits (unfortunatly it isn't only safe to abort others). + * This is about identical to cv_abort(). + * Think about merging them? + * Also, whatever the signal code does... + */ +void +abortsleep(struct thread *td) +{ + + mtx_lock_spin(&sched_lock); + /* + * If the TDF_TIMEOUT flag is set, just leave. 
A + * timeout is scheduled anyhow. + */ + if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) { + if (td->td_wchan != NULL) { + if (td->td_state == TDS_SLP) { /* XXXKSE */ + setrunnable(td); + } else { + /* + * Probably in a suspended state.. + * um.. dunno XXXKSE + */ + unsleep(td); + } + } + } + mtx_unlock_spin(&sched_lock); +} + +/* * Remove a process from its wait queue */ void @@ -618,25 +708,24 @@ restart: if (td->td_wchan == ident) { TAILQ_REMOVE(qp, td, td_slpq); td->td_wchan = NULL; - if (td->td_proc->p_stat == SSLEEP) { + if (td->td_state == TDS_SLP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ CTR3(KTR_PROC, "wakeup: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (td->td_ksegrp->kg_slptime > 1) updatepri(td); td->td_ksegrp->kg_slptime = 0; - td->td_kse->ke_slptime = 0; - td->td_proc->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(td); maybe_resched(td); } else { +/* XXXKSE Wrong! */ td->td_state = TDS_RUNQ; p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } /* END INLINE EXPANSION */ - goto restart; } + goto restart; } } mtx_unlock_spin(&sched_lock); @@ -665,20 +754,19 @@ restart: if (td->td_wchan == ident) { TAILQ_REMOVE(qp, td, td_slpq); td->td_wchan = NULL; - if (td->td_proc->p_stat == SSLEEP) { + if (td->td_state == TDS_SLP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ - CTR3(KTR_PROC, "wakeup1: proc %p (pid %d, %s)", - p, p->p_pid, p->p_comm); + CTR3(KTR_PROC,"wakeup1: thread %p (pid %d, %s)", + td, p->p_pid, p->p_comm); if (td->td_ksegrp->kg_slptime > 1) updatepri(td); td->td_ksegrp->kg_slptime = 0; - td->td_kse->ke_slptime = 0; - td->td_proc->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(td); maybe_resched(td); break; } else { +/* XXXKSE Wrong */ td->td_state = TDS_RUNQ; p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } @@ -698,15 +786,19 @@ mi_switch() { struct bintime new_switchtime; struct thread *td = curthread; /* XXX */ - register struct proc *p = td->td_proc; /* XXX */ + struct proc *p = td->td_proc; /* XXX */ + struct kse *ke = td->td_kse; #if 0 register struct rlimit *rlim; #endif u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + KASSERT((ke->ke_state == KES_RUNNING), ("mi_switch: kse state?")); #ifdef INVARIANTS - if (p->p_stat != SMTX && p->p_stat != SRUN) + if (td->td_state != TDS_MTX && + td->td_state != TDS_RUNQ && + td->td_state != TDS_RUNNING) mtx_assert(&Giant, MA_NOTOWNED); #endif @@ -735,7 +827,8 @@ mi_switch() * * XXX drop sched_lock, pickup Giant */ - if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && + if (p->p_state != PRS_ZOMBIE && + p->p_limit->p_cpulimit != RLIM_INFINITY && p->p_runtime > p->p_limit->p_cpulimit) { rlim = &p->p_rlimit[RLIMIT_CPU]; if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) { @@ -763,17 +856,35 @@ mi_switch() */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); - CTR3(KTR_PROC, "mi_switch: old proc %p (pid %d, %s)", p, p->p_pid, + CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); sched_nest = sched_lock.mtx_recurse; - td->td_lastcpu = td->td_kse->ke_oncpu; - td->td_kse->ke_oncpu = NOCPU; - td->td_kse->ke_flags &= ~KEF_NEEDRESCHED; + td->td_lastcpu = ke->ke_oncpu; + ke->ke_oncpu = NOCPU; + ke->ke_flags &= ~KEF_NEEDRESCHED; + /* + * At the last moment: if this KSE is not on the run queue, + * it needs to be freed correctly and the thread treated accordingly. + */ + if ((td->td_state == TDS_RUNNING) && + ((ke->ke_flags & KEF_IDLEKSE) == 0)) { + /* Put us back on the run queue (kse and all). 
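The switch-out decision being made here has three outcomes: a thread that is still logically running goes back on the run queue together with its KSE; an unbound thread that is blocking and not queued donates its KSE back to the ksegrp (kse_reassign()); anything else is left alone. A tiny self-contained sketch of just that decision, with invented enum and function names:

enum xtdstate { X_RUNNING, X_ON_RUNQ, X_INHIBITED };
enum xaction  { X_REQUEUE_WITH_KSE, X_DONATE_KSE, X_LEAVE_ALONE };

static enum xaction
model_switch_out(enum xtdstate state, int unbound, int idle_kse)
{
	if (state == X_RUNNING && !idle_kse)
		return (X_REQUEUE_WITH_KSE);	/* involuntary switch: stay runnable */
	if (unbound && state != X_ON_RUNQ)
		return (X_DONATE_KSE);		/* blocking: let another thread use it */
	return (X_LEAVE_ALONE);
}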
*/ + setrunqueue(td); + } else if ((td->td_flags & TDF_UNBOUND) && + (td->td_state != TDS_RUNQ)) { /* in case of old code */ + /* + * We will not be on the run queue. + * Someone else can use the KSE if they need it. + */ + td->td_kse = NULL; + kse_reassign(ke); + } cpu_switch(); td->td_kse->ke_oncpu = PCPU_GET(cpuid); + td->td_kse->ke_state = KES_RUNNING; sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; - CTR3(KTR_PROC, "mi_switch: new proc %p (pid %d, %s)", p, p->p_pid, + CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); @@ -791,37 +902,42 @@ setrunnable(struct thread *td) struct proc *p = td->td_proc; mtx_lock_spin(&sched_lock); - switch (p->p_stat) { - case SZOMB: /* not a thread flag XXXKSE */ + switch (p->p_state) { + case PRS_ZOMBIE: panic("setrunnable(1)"); + default: + break; } - switch (td->td_proc->p_stat) { + switch (td->td_state) { case 0: - case SRUN: - case SWAIT: + case TDS_RUNNING: + case TDS_IWAIT: default: + printf("state is %d", td->td_state); panic("setrunnable(2)"); - case SSTOP: - case SSLEEP: /* e.g. when sending signals */ + case TDS_SUSPENDED: + thread_unsuspend(p); + break; + case TDS_SLP: /* e.g. when sending signals */ if (td->td_flags & TDF_CVWAITQ) cv_waitq_remove(td); else unsleep(td); - break; - - case SIDL: + case TDS_UNQUEUED: /* being put back onto the queue */ + case TDS_NEW: /* not yet had time to suspend */ + case TDS_RUNQ: /* not yet had time to suspend */ break; } - td->td_proc->p_stat = SRUN; if (td->td_ksegrp->kg_slptime > 1) updatepri(td); td->td_ksegrp->kg_slptime = 0; - td->td_kse->ke_slptime = 0; if ((p->p_sflag & PS_INMEM) == 0) { + td->td_state = TDS_RUNQ; /* XXXKSE not a good idea */ p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } else { - setrunqueue(td); + if (td->td_state != TDS_RUNQ) + setrunqueue(td); /* XXXKSE */ maybe_resched(td); } mtx_unlock_spin(&sched_lock); @@ -848,7 +964,7 @@ resetpriority(kg) kg->kg_user_pri = newpriority; } FOREACH_THREAD_IN_GROUP(kg, td) { - maybe_resched(td); + maybe_resched(td); /* XXXKSE silly */ } mtx_unlock_spin(&sched_lock); } @@ -865,20 +981,21 @@ loadav(void *arg) int i, nrun; struct loadavg *avg; struct proc *p; - struct ksegrp *kg; + struct thread *td; avg = &averunnable; sx_slock(&allproc_lock); nrun = 0; FOREACH_PROC_IN_SYSTEM(p) { - FOREACH_KSEGRP_IN_PROC(p, kg) { - switch (p->p_stat) { - case SRUN: + FOREACH_THREAD_IN_PROC(p, td) { + switch (td->td_state) { + case TDS_RUNQ: + case TDS_RUNNING: if ((p->p_flag & P_NOLOAD) != 0) goto nextproc; - /* FALLTHROUGH */ - case SIDL: - nrun++; + nrun++; /* XXXKSE */ + default: + break; } nextproc: continue; @@ -932,19 +1049,18 @@ void schedclock(td) struct thread *td; { - struct kse *ke = td->td_kse; - struct ksegrp *kg = td->td_ksegrp; + struct kse *ke; + struct ksegrp *kg; - if (td) { - ke->ke_cpticks++; - kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); - if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { - resetpriority(td->td_ksegrp); - if (td->td_priority >= PUSER) - td->td_priority = kg->kg_user_pri; - } - } else { - panic("schedclock"); + KASSERT((td != NULL), ("schedlock: null thread pointer")); + ke = td->td_kse; + kg = td->td_ksegrp; + ke->ke_cpticks++; + kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); + if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { + resetpriority(kg); + if (td->td_priority >= PUSER) + td->td_priority = kg->kg_user_pri; } } @@ -959,7 +1075,6 @@ yield(struct thread *td, struct yield_args *uap) 
mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); td->td_priority = PRI_MAX_TIMESHARE; - setrunqueue(td); kg->kg_proc->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c index c9081c3..bbe36be 100644 --- a/sys/kern/ksched.c +++ b/sys/kern/ksched.c @@ -181,7 +181,18 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched, mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, kg); - td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + FOREACH_THREAD_IN_GROUP(kg, td) { /* XXXKSE */ + if (td->td_state == TDS_RUNNING) { + td->td_kse->ke_flags |= KEF_NEEDRESCHED; + } else if (td->td_state == TDS_RUNQ) { + if (td->td_priority > kg->kg_user_pri) { + remrunqueue(td); + td->td_priority = + kg->kg_user_pri; + setrunqueue(td); + } + } + } mtx_unlock_spin(&sched_lock); } else @@ -203,7 +214,19 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched, * on the scheduling code: You must leave the * scheduling info alone. */ - td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + FOREACH_THREAD_IN_GROUP(kg, td) { + if (td->td_state == TDS_RUNNING) { + td->td_kse->ke_flags |= KEF_NEEDRESCHED; + } else if (td->td_state == TDS_RUNQ) { + if (td->td_priority > kg->kg_user_pri) { + remrunqueue(td); + td->td_priority = + kg->kg_user_pri; + setrunqueue(td); + } + } + + } mtx_unlock_spin(&sched_lock); } break; diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 9dad93b..afd4c5d 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -124,8 +124,8 @@ forward_signal(struct thread *td) * executing so that it executes ast(). */ mtx_assert(&sched_lock, MA_OWNED); - KASSERT(td->td_proc->p_stat == SRUN, - ("forward_signal: process is not SRUN")); + KASSERT(td->td_state == TDS_RUNNING, + ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 3b415de..027aa9c 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include #include #include @@ -71,13 +73,15 @@ userret(td, frame, oticks) struct kse *ke = td->td_kse; struct ksegrp *kg = td->td_ksegrp; + CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, + p->p_comm); #ifdef INVARIANTS /* Check that we called signotify() enough. */ mtx_lock(&Giant); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (SIGPENDING(p) && ((p->p_sflag & PS_NEEDSIGCHK) == 0 || - (p->p_kse.ke_flags & KEF_ASTPENDING) == 0)) + (ke->ke_flags & KEF_ASTPENDING) == 0)) printf("failed to set signal flags proprly for ast()\n"); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); @@ -100,6 +104,22 @@ userret(td, frame, oticks) } /* + * We need to check to see if we have to exit or wait due to a + * single threading requirement or some other STOP condition. + */ + PROC_LOCK(p); + thread_suspend_check(0); /* Can suspend or kill */ + PROC_UNLOCK(p); + + /* + * DO special thread processing, e.g. upcall tweaking and such + */ + if (p->p_flag & P_KSES) { + thread_userret(p, kg, ke, td, frame); + /* printf("KSE thread returned"); */ + } + + /* * Charge system time if profiling. * * XXX should move PS_PROFIL to a place that can obviously be @@ -121,8 +141,7 @@ userret(td, frame, oticks) * This function will return with preemption disabled. 
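The loops added to ksched_setscheduler() above (and the similar code in schedcpu()) follow one rule: a thread already sitting on a run queue cannot simply have its priority overwritten, because its queue slot is derived from that priority; it must be removed, updated and reinserted. A small self-contained sketch of that rule, with invented model_* names and trivial stand-in queue operations:

struct xrq_thread {
	int	on_runq;	/* models td_state == TDS_RUNQ */
	int	pri;
};

static void
model_remrunqueue(struct xrq_thread *t)
{
	t->on_runq = 0;		/* placeholder for the real dequeue */
}

static void
model_setrunqueue(struct xrq_thread *t)
{
	t->on_runq = 1;		/* placeholder for the real enqueue */
}

static void
model_change_pri(struct xrq_thread *t, int newpri)
{
	if (t->on_runq && t->pri != newpri) {
		model_remrunqueue(t);
		t->pri = newpri;
		model_setrunqueue(t);	/* reinsert at the new position */
	} else {
		t->pri = newpri;
	}
}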
*/ void -ast(framep) - struct trapframe *framep; +ast(struct trapframe *framep) { struct thread *td = curthread; struct proc *p = td->td_proc; @@ -136,6 +155,8 @@ ast(framep) int ucode; #endif + CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, + p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); #ifdef WITNESS if (witness_list(td)) @@ -164,6 +185,13 @@ ast(framep) p->p_stats->p_prof.pr_ticks = 0; } mtx_unlock_spin(&sched_lock); + /* + * XXXKSE While the fact that we owe a user profiling + * tick is stored per KSE in this code, the statistics + * themselves are still stored per process. + * This should probably change, by which I mean that + * possibly the location of both might change. + */ if (td->td_ucred != p->p_ucred) cred_update_thread(td); @@ -192,14 +220,13 @@ ast(framep) if (flags & KEF_NEEDRESCHED) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; - setrunqueue(td); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); } if (sflag & PS_NEEDSIGCHK) { PROC_LOCK(p); - while ((sig = cursig(p)) != 0) + while ((sig = cursig(td)) != 0) postsig(sig); PROC_UNLOCK(p); } diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c index 08bca8d..c2e79d0 100644 --- a/sys/kern/subr_turnstile.c +++ b/sys/kern/subr_turnstile.c @@ -119,23 +119,20 @@ propagate_priority(struct thread *td) return; } + KASSERT(td->td_state != TDS_SURPLUS, ("Mutex owner SURPLUS")); + MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); - KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex")); + KASSERT(td->td_state != TDS_SLP, + ("sleeping thread owns a mutex")); if (td->td_priority <= pri) /* lower is higher priority */ return; - /* - * Bump this thread's priority. - */ - td->td_priority = pri; /* * If lock holder is actually running, just bump priority. */ - if (thread_running(td)) { - MPASS(td->td_proc->p_stat == SRUN - || td->td_proc->p_stat == SZOMB - || td->td_proc->p_stat == SSTOP); + if (td->td_state == TDS_RUNNING) { + td->td_priority = pri; return; } @@ -151,20 +148,26 @@ propagate_priority(struct thread *td) * If on run queue move to new run queue, and quit. * XXXKSE this gets a lot more complicated under threads * but try anyhow. + * We should have a special call to do this more efficiently. */ - if (td->td_proc->p_stat == SRUN) { + if (td->td_state == TDS_RUNQ) { MPASS(td->td_blocked == NULL); remrunqueue(td); + td->td_priority = pri; setrunqueue(td); return; } + /* + * Adjust for any other cases. + */ + td->td_priority = pri; /* * If we aren't blocked on a mutex, we should be. 
*/ - KASSERT(td->td_proc->p_stat == SMTX, ( + KASSERT(td->td_state == TDS_MTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", - td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat, + td->td_proc->p_pid, td->td_proc->p_comm, td->td_state, m->mtx_object.lo_name)); /* @@ -590,7 +593,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) */ td->td_blocked = m; td->td_mtxname = m->mtx_object.lo_name; - td->td_proc->p_stat = SMTX; + td->td_state = TDS_MTX; propagate_priority(td); if (LOCK_LOG_TEST(&m->mtx_object, opts)) @@ -727,7 +730,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) m, td1); td1->td_blocked = NULL; - td1->td_proc->p_stat = SRUN; setrunqueue(td1); if (td->td_critnest == 1 && td1->td_priority < pri) { @@ -744,7 +746,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) } } #endif - setrunqueue(td); if (LOCK_LOG_TEST(&m->mtx_object, opts)) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 182221d..02b3a0d 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -225,6 +225,7 @@ static struct witness_order_list_entry order_lists[] = { #endif { "clk", &lock_class_mtx_spin }, { "mutex profiling lock", &lock_class_mtx_spin }, + { "zombie_thread_lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 1bdd913..d8fba59 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -1187,7 +1187,7 @@ selwakeup(sip) sip->si_thread = NULL; mtx_lock_spin(&sched_lock); if (td->td_wchan == (caddr_t)&selwait) { - if (td->td_proc->p_stat == SSLEEP) + if (td->td_state == TDS_SLP) setrunnable(td); else cv_waitq_remove(td); diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index dacb9d9..ab6f1e8 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -467,7 +467,7 @@ ptrace(struct thread *td, struct ptrace_args *uap) } /* not currently stopped */ - if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) { + if (!P_SHOULDSTOP(p) || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } @@ -566,10 +566,12 @@ ptrace(struct thread *td, struct ptrace_args *uap) if (proctree_locked) sx_xunlock(&proctree_lock); /* deliver or queue signal */ - if (p->p_stat == SSTOP) { + if (P_SHOULDSTOP(p)) { p->p_xstat = uap->data; mtx_lock_spin(&sched_lock); + p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SGNL); setrunnable(td2); /* XXXKSE */ + /* Need foreach kse in proc, ... make_kse_queued(). */ mtx_unlock_spin(&sched_lock); } else if (uap->data) psignal(p, uap->data); diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index d8115fb..15a5d7c 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -552,7 +552,7 @@ 381 STD BSD { int kse_new(struct kse_mailbox * mbx, \ int new_grp_flag); } 382 STD BSD { int thread_wakeup(struct thread_mailbox *tmbx); } -383 STD BSD { int kse_yield(void); } +383 MSTD BSD { int kse_yield(void); } 384 UNIMPL BSD __mac_get_proc 385 UNIMPL BSD __mac_set_proc 386 UNIMPL BSD __mac_get_fd diff --git a/sys/kern/tty.c b/sys/kern/tty.c index b9c5743..6c915e1 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -2392,17 +2392,35 @@ ttyinfo(struct tty *tp) PGRP_UNLOCK(tp->t_pgrp); td = FIRST_THREAD_IN_PROC(pick); - stmp = pick->p_stat == SRUN ? "running" : /* XXXKSE */ - pick->p_stat == SMTX ? td->td_mtxname : - td->td_wmesg ? 
td->td_wmesg : "iowait"; + if (pick->p_flag & P_KSES) { + stmp = "KSE" ; /* XXXKSE */ + } else { + if (td) { + if (td->td_state == TDS_RUNQ) { + stmp = "running"; + } else if (td->td_state == TDS_MTX) { + stmp = td->td_mtxname; + } else if (td->td_wmesg) { + stmp = td->td_wmesg; + } else { + stmp = "iowait"; + } + } else { + stmp = "threadless"; + panic("ttyinfo: no thread!?"); + } + } calcru(pick, &utime, &stime, NULL); - ltmp = pick->p_stat == SIDL || pick->p_stat == SWAIT || - pick->p_stat == SZOMB ? 0 : - pgtok(vmspace_resident_count(pick->p_vmspace)); + ltmp = ((pick->p_state == PRS_NEW) + || (td && (td->td_state == TDS_IWAIT)) + || (pick->p_state == PRS_ZOMBIE ? 0 : + pgtok(vmspace_resident_count(pick->p_vmspace)))); mtx_unlock_spin(&sched_lock); ttyprintf(tp, " cmd: %s %d [%s%s] ", pick->p_comm, - pick->p_pid, pick->p_stat == SMTX ? "*" : "", stmp); + pick->p_pid, + td->td_state == TDS_MTX ? "*" : "", + stmp); /* Print user time. */ ttyprintf(tp, "%ld.%02ldu ", @@ -2433,7 +2451,19 @@ ttyinfo(struct tty *tp) * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. */ -#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL)) +#define ISRUN(p, val) \ +do { \ + struct thread *td; \ + val = 0; \ + FOREACH_THREAD_IN_PROC(p, td) { \ + if (td->td_state == TDS_RUNQ || \ + td->td_state == TDS_RUNNING) { \ + val = 1; \ + break; \ + } \ + } \ +} while (0) + #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 #define ONLYB 1 @@ -2449,10 +2479,13 @@ proc_compare(struct proc *p1, struct proc *p2) if (p1 == NULL) return (1); + ISRUN(p1, esta); + ISRUN(p2, estb); + /* * see if at least one of them is runnable */ - switch (TESTAB(ISRUN(p1), ISRUN(p2))) { + switch (TESTAB(esta, estb)) { case ONLYA: return (0); case ONLYB: @@ -2477,7 +2510,7 @@ proc_compare(struct proc *p1, struct proc *p2) /* * weed out zombies */ - switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) { + switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { case ONLYA: return (1); case ONLYB: -- cgit v1.1
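For reference, the TESTAB()/ONLYA/ONLYB idiom used by proc_compare() packs the two runnability answers into a two-bit value so a single switch can dispatch on "only A", "only B", "both" or "neither". A stand-alone demonstration of the idiom; the M-prefixed names are invented for this sketch, and where the kernel sees both (or neither) runnable it falls through to further tie-breaking rather than printing anything.

#include <stdio.h>

#define	MTESTAB(a, b)	(((a) << 1) | (b))
#define	MONLYA		2
#define	MONLYB		1
#define	MBOTH		3

int
main(void)
{
	int a_runnable = 1, b_runnable = 0;

	switch (MTESTAB(a_runnable, b_runnable)) {
	case MONLYA:
		printf("prefer A\n");
		break;
	case MONLYB:
		printf("prefer B\n");
		break;
	case MBOTH:
		printf("both runnable: keep comparing\n");
		break;
	default:
		printf("neither runnable: keep comparing\n");
		break;
	}
	return (0);
}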