author | julian <julian@FreeBSD.org> | 2002-06-29 17:26:22 +0000
committer | julian <julian@FreeBSD.org> | 2002-06-29 17:26:22 +0000
commit | aa2dc0a5d9e7a19420c153cd414fefa8498eab71 (patch)
tree | 0a0483a267784fa8e2bf86857d8727edb5b122e9
parent | 6dbff7f2c1f8150887038aed666e11675adf0b4e (diff)
Part 1 of KSE-III
The ability to schedule multiple threads per process
(on one CPU) by making ALL system calls optionally asynchronous.
To come: ia64 and PowerPC patches, patches for gdb, and a test program (in tools).
Reviewed by: Almost everyone who counts
(at various times, peter, jhb, matt, alfred, mini, bernd,
and a cast of thousands)
NOTE: this is still beta code and contains lots of debugging stuff;
expect slight instability in signals.
75 files changed, 2765 insertions, 731 deletions
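Most of the churn below comes from splitting the old per-process `p_stat` into a per-process `p_state` (`PRS_*`) plus a per-thread `td_state` (`TDS_*`). The following user-space sketch models how the `kvm_proc.c` hunk at the top of the diff collapses the new pair back into a legacy single state for tools such as ps; the numeric enum values, the `legacy_state()` helper, and the `should_stop` argument (standing in for `P_SHOULDSTOP()`) are illustrative stand-ins, not the kernel's actual declarations.

```c
#include <stdio.h>

/* Stand-in models of the state enums introduced by this commit. */
enum prs_state { PRS_NEW, PRS_NORMAL, PRS_WAIT, PRS_ZOMBIE };
enum tds_state { TDS_NEW, TDS_UNQUEUED, TDS_IWAIT, TDS_SLP, TDS_MTX,
		 TDS_RUNQ, TDS_RUNNING, TDS_SURPLUS };

/* Legacy p_stat values that userland (e.g. libkvm consumers) still expect. */
enum legacy_stat { SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT, SMTX };

/*
 * Collapse (process state, main-thread state, stop condition) back into
 * one legacy value, in the same priority order as the kvm_proc.c hunk:
 * runnable beats sleeping beats stopped beats mutex-blocked.
 */
static enum legacy_stat
legacy_state(enum prs_state ps, enum tds_state ts, int should_stop)
{
	if (ps == PRS_ZOMBIE)
		return (SZOMB);
	if (ps != PRS_NORMAL)
		return (SIDL);
	if (ts == TDS_RUNQ || ts == TDS_RUNNING)
		return (SRUN);
	if (ts == TDS_SLP)
		return (SSLEEP);
	if (should_stop)
		return (SSTOP);
	if (ts == TDS_MTX)
		return (SMTX);
	return (SWAIT);
}

int
main(void)
{
	printf("%d\n", legacy_state(PRS_NORMAL, TDS_RUNQ, 0));	/* SRUN */
	printf("%d\n", legacy_state(PRS_NORMAL, TDS_MTX, 0));	/* SMTX */
	printf("%d\n", legacy_state(PRS_ZOMBIE, TDS_SLP, 0));	/* SZOMB */
	return (0);
}
```

Note how lossy the mapping is (the commit itself flags it "XXXKSE very aproximate"): only the main thread's state is consulted, which is why later hunks such as the linprocfs and ddb ones grow their own per-thread reporting.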
diff --git a/lib/libkvm/kvm_proc.c b/lib/libkvm/kvm_proc.c index 865377c..547792e 100644 --- a/lib/libkvm/kvm_proc.c +++ b/lib/libkvm/kvm_proc.c @@ -325,11 +325,28 @@ nopgrp: kp->ki_estcpu = proc.p_ksegrp.kg_estcpu; /* XXXKSE */ kp->ki_slptime = proc.p_kse.ke_slptime; /* XXXKSE */ kp->ki_swtime = proc.p_swtime; - kp->ki_flag = proc.p_flag; + kp->ki_flag = proc.p_flag; /* WILDLY INNACURATE XXXKSE */ kp->ki_sflag = proc.p_sflag; kp->ki_wchan = mainthread.td_wchan; /* XXXKSE */ kp->ki_traceflag = proc.p_traceflag; - kp->ki_stat = proc.p_stat; + if (proc.p_state == PRS_NORMAL) { /* XXXKSE very aproximate */ + if ((mainthread.td_state == TDS_RUNQ) || + (mainthread.td_state == TDS_RUNNING)) { + kp->ki_stat = SRUN; + } else if (mainthread.td_state == TDS_SLP) { + kp->ki_stat = SSLEEP; + } else if (P_SHOULDSTOP(&proc)) { + kp->ki_stat = SSTOP; + } else if (mainthread.td_state == TDS_MTX) { + kp->ki_stat = SMTX; + } else { + kp->ki_stat = SWAIT; + } + } else if (proc.p_state == PRS_ZOMBIE) { + kp->ki_stat = SZOMB; + } else { + kp->ki_stat = SIDL; + } kp->ki_pri.pri_class = proc.p_ksegrp.kg_pri_class; /* XXXKSE */ kp->ki_pri.pri_user = proc.p_ksegrp.kg_user_pri; /* XXXKSE */ kp->ki_pri.pri_level = mainthread.td_priority; /* XXXKSE */ diff --git a/sys/alpha/alpha/genassym.c b/sys/alpha/alpha/genassym.c index 62ff3a4..96092da 100644 --- a/sys/alpha/alpha/genassym.c +++ b/sys/alpha/alpha/genassym.c @@ -80,6 +80,8 @@ ASSYM(MTX_UNOWNED, MTX_UNOWNED); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_KSE, offsetof(struct thread, td_kse)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); +ASSYM(TD_STATE, offsetof(struct thread, td_state)); +ASSYM(TDS_RUNNING, TDS_RUNNING); ASSYM(KE_FLAGS, offsetof(struct kse, ke_flags)); diff --git a/sys/alpha/alpha/pmap.c b/sys/alpha/alpha/pmap.c index c758edb..5137f79 100644 --- a/sys/alpha/alpha/pmap.c +++ b/sys/alpha/alpha/pmap.c @@ -1151,7 +1151,12 @@ pmap_dispose_thread(td) ksobj = td->td_kstack_obj; ks = td->td_kstack; ptek = vtopte(ks); +#ifdef KSTACK_GUARD + ks -= PAGE_SIZE; + for (i = 1; i < (KSTACK_PAGES + 1); i++) { +#else for (i = 0; i < KSTACK_PAGES; i++) { +#endif m = vm_page_lookup(ksobj, i); if (m == NULL) panic("pmap_dispose_thread: kstack already missing?"); @@ -1164,14 +1169,16 @@ pmap_dispose_thread(td) } /* - * If the thread got swapped out some of its KSTACK might have gotten - * swapped. Just get rid of the object to clean up the swap use - * proactively. NOTE! might block waiting for paging I/O to complete. + * Free the space that this stack was mapped to in the kernel + * address map. */ - if (ksobj->type == OBJT_SWAP) { - td->td_kstack_obj = NULL; - vm_object_deallocate(ksobj); - } +#ifdef KSTACK_GUARD + kmem_free(kernel_map, ks, (KSTACK_PAGES + 1) * PAGE_SIZE); +#else + kmem_free(kernel_map, ks, KSTACK_PAGES * PAGE_SIZE); +#endif + td->td_kstack_obj = NULL; + vm_object_deallocate(ksobj); } /* diff --git a/sys/alpha/alpha/swtch.s b/sys/alpha/alpha/swtch.s index 34f3453..bae5227 100644 --- a/sys/alpha/alpha/swtch.s +++ b/sys/alpha/alpha/swtch.s @@ -127,6 +127,9 @@ Lcs1: LDGP(pv) mov v0, s2 /* s2 = new thread */ ldq s3, TD_MD_PCBPADDR(s2) /* s3 = new pcbpaddr */ + ldiq t0, TDS_RUNNING + stl t0, TD_STATE(s2) + /* * Check to see if we're switching to ourself. If we are, * don't bother loading the new context. 
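The trap.c changes (the alpha hunk just below, with matching i386/amd64 hunks further on) make each syscall entry of a KSE process fetch the current thread-mailbox pointer from userland with `fuword()` and use it to decide whether the thread runs bound or unbound. Here is a minimal user-space model of that check; `fake_fuword()` and `note_mailbox()` are hypothetical stand-ins for the real `fuword()` and the inline code in `syscall()`.

```c
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Minimal stand-in for the kernel structure read by the syscall hunk. */
struct kse_mailbox {
	void	*kmbx_current_thread;	/* written by the userland UTS */
};

#define	TDF_UNBOUND	0x01

/*
 * Stand-in for fuword(): the real one copies a word from a user virtual
 * address and returns -1 on fault. This model just reads local memory
 * and can't fault, so the -1 path is exercised via the stored value.
 */
static long
fake_fuword(const void *addr)
{
	long v;

	memcpy(&v, addr, sizeof(v));
	return (v);
}

/*
 * Model of the per-syscall mailbox check: if the UTS has published a
 * thread mailbox, mark the thread unbound (its syscall may complete
 * asynchronously); otherwise fall back to bound, single-threaded mode.
 */
static void
note_mailbox(struct kse_mailbox *umbx, void **td_mailbox, int *td_flags)
{
	*td_mailbox = (void *)fake_fuword((char *)umbx +
	    offsetof(struct kse_mailbox, kmbx_current_thread));
	if (*td_mailbox == NULL || *td_mailbox == (void *)-1) {
		*td_mailbox = NULL;		/* single thread it.. */
		*td_flags &= ~TDF_UNBOUND;
	} else {
		*td_flags |= TDF_UNBOUND;
	}
}

int
main(void)
{
	struct kse_mailbox mbx = { .kmbx_current_thread = (void *)0x1000 };
	void *mailbox;
	int flags = 0;

	note_mailbox(&mbx, &mailbox, &flags);
	printf("unbound: %d\n", (flags & TDF_UNBOUND) != 0);	/* 1 */

	mbx.kmbx_current_thread = NULL;		/* UTS published nothing */
	note_mailbox(&mbx, &mailbox, &flags);
	printf("unbound: %d\n", (flags & TDF_UNBOUND) != 0);	/* 0 */
	return (0);
}
```

Both failure values matter: `-1` is the fault return of the real `fuword()`, and NULL means the UTS never published a mailbox, so either way the thread is treated as bound for the duration of the syscall.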
diff --git a/sys/alpha/alpha/trap.c b/sys/alpha/alpha/trap.c index 6cdf9f4..17dcb14 100644 --- a/sys/alpha/alpha/trap.c +++ b/sys/alpha/alpha/trap.c @@ -39,6 +39,7 @@ #include <sys/sysproto.h> #include <sys/kernel.h> #include <sys/proc.h> +#include <sys/kse.h> #include <sys/exec.h> #include <sys/lock.h> #include <sys/mutex.h> @@ -299,6 +300,12 @@ trap(a0, a1, a2, entry, framep) td->td_frame = framep; if (td->td_ucred != p->p_ucred) cred_update_thread(td); + if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) { + mtx_lock_spin(&sched_lock); + PROC_LOCK(p); + thread_exit(); + /* NOTREACHED */ + } } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ KASSERT(cold || td->td_ucred != NULL, @@ -659,6 +666,23 @@ syscall(code, framep) sticks = td->td_kse->ke_sticks; if (td->td_ucred != p->p_ucred) cred_update_thread(td); + if (p->p_flag & P_KSES) { + /* + * If we are doing a syscall in a KSE environment, + * note where our mailbox is. There is always the + * possibility that we could do this lazily (in sleep()), + * but for now do it every time. + */ + td->td_mailbox = (void *)fuword((caddr_t)td->td_kse->ke_mailbox + + offsetof(struct kse_mailbox, kmbx_current_thread)); + if ((td->td_mailbox == NULL) || + (td->td_mailbox == (void *)-1)) { + td->td_mailbox = NULL; /* single thread it.. */ + td->td_flags &= ~TDF_UNBOUND; + } else { + td->td_flags |= TDF_UNBOUND; + } + } #ifdef DIAGNOSTIC alpha_fpstate_check(td); @@ -756,14 +780,14 @@ syscall(code, framep) break; } - userret(td, framep, sticks); - /* * Release Giant if we had to get it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); + userret(td, framep, sticks); + #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); diff --git a/sys/alpha/alpha/vm_machdep.c b/sys/alpha/alpha/vm_machdep.c index e57593c..80f5f03 100644 --- a/sys/alpha/alpha/vm_machdep.c +++ b/sys/alpha/alpha/vm_machdep.c @@ -240,8 +240,7 @@ cpu_set_fork_handler(td, func, arg) * from proc0. */ void -cpu_exit(td) - register struct thread *td; +cpu_exit(struct thread *td) { alpha_fpstate_drop(td); @@ -254,6 +253,141 @@ cpu_sched_exit(td) } void +cpu_thread_exit(struct thread *td) +{ + + return; +} + +void +cpu_thread_setup(struct thread *td) +{ + + td->td_pcb = + (struct pcb *)(td->td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; + td->td_frame = (struct trapframe *)((caddr_t)td->td_pcb) - 1; +} + +struct md_store { + struct pcb mds_pcb; + struct trapframe mds_frame; +}; + +void +cpu_save_upcall(struct thread *td, struct kse *newkse) +{ + + newkse->ke_mdstorage = malloc(sizeof(struct md_store), M_TEMP, + M_WAITOK); + /* Note: use of M_WAITOK means it won't fail. */ + /* set up shortcuts in MI section */ + newkse->ke_pcb = + &(((struct md_store *)(newkse->ke_mdstorage))->mds_pcb); + newkse->ke_frame = + &(((struct md_store *)(newkse->ke_mdstorage))->mds_frame); + + /* Copy the upcall pcb. Kernel mode & fp regs are here. */ + /* XXXKSE this may be un-needed */ + bcopy(td->td_pcb, newkse->ke_pcb, sizeof(struct pcb)); + + /* This copies most of the user mode register values. */ + bcopy(td->td_frame, newkse->ke_frame, sizeof(struct trapframe)); +} + +void +cpu_set_upcall(struct thread *td, void *pcb) +{ + struct pcb *pcb2; + + td->td_flags |= TDF_UPCALLING; + + /* Point the pcb to the top of the stack. */ + pcb2 = td->td_pcb; + + /* + * Copy the upcall pcb. This loads kernel regs. + * Those not loaded individually below get their default + * values here. 
+ * + * XXXKSE It might be a good idea to simply skip this as + * the values of the other registers may be unimportant. + * This would remove any requirement for knowing the KSE + * at this time (see the matching comment below for + * more analysis) (need a good safe default). + */ + bcopy(pcb, pcb2, sizeof(*pcb2)); + + /* + * Create a new fresh stack for the new thread. + * Don't forget to set this stack value into whatever supplies + * the address for the fault handlers. + * The contexts are filled in at the time we actually DO the + * upcall as only then do we know which KSE we got. + */ + td->td_frame = (struct trapframe *)((caddr_t)pcb2) - 1; + + /* + * Arrange for continuation at fork_return(), which + * will return to exception_return(). Note that the child + * process doesn't stay in the kernel for long! + */ + pcb2->pcb_hw.apcb_ksp = (u_int64_t)td->td_frame; + pcb2->pcb_context[0] = (u_int64_t)fork_return; /* s0: a0 */ + pcb2->pcb_context[1] = (u_int64_t)exception_return; /* s1: ra */ + pcb2->pcb_context[2] = (u_long)td; /* s2: a1 */ + pcb2->pcb_context[7] = (u_int64_t)fork_trampoline; /* ra: magic*/ +#ifdef SMP + /* + * We start off at a nesting level of 1 within the kernel. + */ + td->td_md.md_kernnest = 1; +#endif +} + +void +cpu_set_args(struct thread *td, struct kse *ke) +{ +/* XXX + suword((void *)(ke->ke_frame->tf_esp + sizeof(void *)), + (int)ke->ke_mailbox); +*/ +} + +void +cpu_free_kse_mdstorage(struct kse *kse) +{ + + free(kse->ke_mdstorage, M_TEMP); + kse->ke_mdstorage = NULL; + kse->ke_pcb = NULL; + kse->ke_frame = NULL; +} + +int +cpu_export_context(struct thread *td) +{ + /* XXXKSE */ +#if 0 + struct trapframe *frame; + struct thread_mailbox *tm; + struct trapframe *uframe; + int error; + + frame = td->td_frame; + tm = td->td_mailbox; + uframe = &tm->ctx.tfrm.tf_tf; + error = copyout(frame, uframe, sizeof(*frame)); + /* + * "What about the fp regs?" I hear you ask.... XXXKSE + * Don't know where gs and "onstack" come from. + * May need to fiddle a few other values too. + */ + return (error); +#endif + return (0); +} + +void cpu_wait(p) struct proc *p; { diff --git a/sys/alpha/linux/linux_machdep.c b/sys/alpha/linux/linux_machdep.c index 51d68f1..5f33c80 100644 --- a/sys/alpha/linux/linux_machdep.c +++ b/sys/alpha/linux/linux_machdep.c @@ -180,7 +180,6 @@ linux_clone(struct thread *td, struct linux_clone_args *args) * Make this runnable after we are finished with it. */ mtx_lock_spin(&sched_lock); - p2->p_stat = SRUN; setrunqueue(FIRST_THREAD_IN_PROC(p2)); mtx_unlock_spin(&sched_lock); diff --git a/sys/amd64/amd64/cpu_switch.S b/sys/amd64/amd64/cpu_switch.S index e0f9bcd..80db485 100644 --- a/sys/amd64/amd64/cpu_switch.S +++ b/sys/amd64/amd64/cpu_switch.S @@ -65,12 +65,19 @@ tlb_flush_count: .long 0 /* * cpu_throw() + * + * This is the second half of cpu_swtch(). It is used when the current + * thread is either a dummy or slated to die, and we no longer care + * about its state. */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() + * + * Save the current thread state, then select the next thread to run + * and load its state. 
*/ ENTRY(cpu_switch) @@ -166,11 +173,11 @@ sw1b: movl %eax,%ecx #ifdef INVARIANTS - movl TD_PROC(%ecx), %eax /* XXXKSE */ - cmpb $SRUN,P_STAT(%eax) + cmpb $TDS_RUNQ,TD_STATE(%ecx) jne badsw2 #endif + movl $TDS_RUNNING,TD_STATE(%ecx) movl TD_PCB(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) @@ -310,12 +317,14 @@ cpu_switch_load_gs: #ifdef INVARIANTS badsw2: + pushal pushl $sw0_2 call panic sw0_2: .asciz "cpu_switch: not TDS_RUNQ" badsw3: + pushal pushl $sw0_3 call panic diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c index f3e9f04..dcc1880 100644 --- a/sys/amd64/amd64/genassym.c +++ b/sys/amd64/amd64/genassym.c @@ -79,10 +79,10 @@ ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); -ASSYM(P_STAT, offsetof(struct proc, p_stat)); +ASSYM(P_STATE, offsetof(struct proc, p_state)); ASSYM(P_UAREA, offsetof(struct proc, p_uarea)); -/*ASSYM(TD_STAT, offsetof(struct thread, td__stat));*/ +ASSYM(TD_STATE, offsetof(struct thread, td_state)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); @@ -101,8 +101,9 @@ ASSYM(KE_FLAGS, offsetof(struct kse, ke_flags)); ASSYM(KEF_ASTPENDING, KEF_ASTPENDING); ASSYM(KEF_NEEDRESCHED, KEF_NEEDRESCHED); -ASSYM(SSLEEP, SSLEEP); -ASSYM(SRUN, SRUN); +ASSYM(TDS_SLP, TDS_SLP); +ASSYM(TDS_RUNQ, TDS_RUNQ); +ASSYM(TDS_RUNNING, TDS_RUNNING); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 2f11ee2..c73c5e1 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -799,7 +799,7 @@ cpu_idle(void) { if (cpu_idle_hlt) { disable_intr(); - if (procrunnable()) { + if (kserunnable()) { enable_intr(); } else { /* diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index e2cebaf..9e35ad7 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1100,7 +1100,12 @@ pmap_dispose_thread(td) ksobj = td->td_kstack_obj; ks = td->td_kstack; ptek = vtopte(ks); +#ifdef KSTACK_GUARD + ks -= PAGE_SIZE; + for (i = 1; i < (KSTACK_PAGES + 1); i++) { +#else for (i = 0; i < KSTACK_PAGES; i++) { +#endif m = vm_page_lookup(ksobj, i); if (m == NULL) panic("pmap_dispose_thread: kstack already missing?"); @@ -1116,16 +1121,17 @@ pmap_dispose_thread(td) #ifdef I386_CPU invltlb(); #endif - /* - * If the thread got swapped out some of its KSTACK might have gotten - * swapped. Just get rid of the object to clean up the swap use - * proactively. NOTE! might block waiting for paging I/O to complete. + * Free the space that this stack was mapped to in the kernel + * address map. */ - if (ksobj->type == OBJT_SWAP) { - td->td_kstack_obj = NULL; - vm_object_deallocate(ksobj); - } +#ifdef KSTACK_GUARD + kmem_free(kernel_map, ks, (KSTACK_PAGES + 1) * PAGE_SIZE); +#else + kmem_free(kernel_map, ks, KSTACK_PAGES * PAGE_SIZE); +#endif + vm_object_deallocate(ksobj); + td->td_kstack_obj = NULL; /* play it safe */ } /* diff --git a/sys/amd64/amd64/swtch.s b/sys/amd64/amd64/swtch.s index e0f9bcd..80db485 100644 --- a/sys/amd64/amd64/swtch.s +++ b/sys/amd64/amd64/swtch.s @@ -65,12 +65,19 @@ tlb_flush_count: .long 0 /* * cpu_throw() + * + * This is the second half of cpu_swtch(). 
It is used when the current + * thread is either a dummy or slated to die, and we no longer care + * about its state. */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() + * + * Save the current thread state, then select the next thread to run + * and load its state. */ ENTRY(cpu_switch) @@ -166,11 +173,11 @@ sw1b: movl %eax,%ecx #ifdef INVARIANTS - movl TD_PROC(%ecx), %eax /* XXXKSE */ - cmpb $SRUN,P_STAT(%eax) + cmpb $TDS_RUNQ,TD_STATE(%ecx) jne badsw2 #endif + movl $TDS_RUNNING,TD_STATE(%ecx) movl TD_PCB(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) @@ -310,12 +317,14 @@ cpu_switch_load_gs: #ifdef INVARIANTS badsw2: + pushal pushl $sw0_2 call panic sw0_2: .asciz "cpu_switch: not TDS_RUNQ" badsw3: + pushal pushl $sw0_3 call panic diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 08c75e4..8282416 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -54,6 +54,7 @@ #include <sys/bus.h> #include <sys/systm.h> #include <sys/proc.h> +#include <sys/kse.h> #include <sys/pioctl.h> #include <sys/kernel.h> #include <sys/ktr.h> @@ -267,6 +268,17 @@ trap(frame) if (td->td_ucred != p->p_ucred) cred_update_thread(td); + /* + * First check that we shouldn't just abort. + * But check if we are the single thread first! + */ + if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) { + mtx_lock_spin(&sched_lock); + PROC_LOCK(p); + thread_exit(); + /* NOTREACHED */ + } + switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; @@ -939,11 +951,30 @@ syscall(frame) mtx_unlock(&Giant); } #endif + KASSERT((td->td_kse != NULL), ("syscall: kse/thread UNLINKED")); + KASSERT((td->td_kse->ke_thread == td), ("syscall:kse/thread mismatch")); sticks = td->td_kse->ke_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); + if (p->p_flag & P_KSES) { + /* + * If we are doing a syscall in a KSE environment, + * note where our mailbox is. There is always the + * possibility that we could do this lazily (in sleep()), + * but for now do it every time. + */ + td->td_mailbox = (void *)fuword((caddr_t)td->td_kse->ke_mailbox + + offsetof(struct kse_mailbox, kmbx_current_thread)); + if ((td->td_mailbox == NULL) || + (td->td_mailbox == (void *)-1)) { + td->td_mailbox = NULL; /* single thread it.. */ + td->td_flags &= ~TDF_UNBOUND; + } else { + td->td_flags |= TDF_UNBOUND; + } + } params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; orig_tf_eflags = frame.tf_eflags; @@ -1045,6 +1076,12 @@ syscall(frame) } /* + * Release Giant if we previously set it. + */ + if ((callp->sy_narg & SYF_MPSAFE) == 0) + mtx_unlock(&Giant); + + /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { @@ -1057,12 +1094,6 @@ syscall(frame) */ userret(td, &frame, sticks); - /* - * Release Giant if we previously set it. 
- */ - if ((callp->sy_narg & SYF_MPSAFE) == 0) - mtx_unlock(&Giant); - #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c index 5dc2e14..04742c3 100644 --- a/sys/amd64/amd64/vm_machdep.c +++ b/sys/amd64/amd64/vm_machdep.c @@ -53,6 +53,7 @@ #include <sys/systm.h> #include <sys/malloc.h> #include <sys/proc.h> +#include <sys/kse.h> #include <sys/bio.h> #include <sys/buf.h> #include <sys/vnode.h> @@ -254,15 +255,26 @@ cpu_set_fork_handler(td, func, arg) } void -cpu_exit(td) - register struct thread *td; +cpu_exit(struct thread *td) +{ + struct mdproc *mdp; + + mdp = &td->td_proc->p_md; + if (mdp->md_ldt) + user_ldt_free(td); + reset_dbregs(); +} + +void +cpu_thread_exit(struct thread *td) { struct pcb *pcb = td->td_pcb; - struct mdproc *mdp = &td->td_proc->p_md; #ifdef DEV_NPX npxexit(td); #endif if (pcb->pcb_ext != 0) { + /* XXXKSE XXXSMP not SMP SAFE.. what locks do we have? */ + /* if (pcb->pcb_ext->ext_refcount-- == 1) ?? */ /* * XXX do we need to move the TSS off the allocated pages * before freeing them? (not done here) @@ -271,8 +283,6 @@ cpu_exit(td) ctob(IOPAGES + 1)); pcb->pcb_ext = 0; } - if (mdp->md_ldt) - user_ldt_free(td); if (pcb->pcb_flags & PCB_DBREGS) { /* * disable all hardware breakpoints @@ -289,6 +299,146 @@ cpu_sched_exit(td) } void +cpu_thread_setup(struct thread *td) +{ + + td->td_pcb = + (struct pcb *)(td->td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; + td->td_frame = (struct trapframe *)((caddr_t)td->td_pcb - 16) - 1; +} + +struct md_store { + struct pcb mds_pcb; + struct trapframe mds_frame; +}; + +void +cpu_save_upcall(struct thread *td, struct kse *newkse) +{ + struct trapframe *tf; + + newkse->ke_mdstorage = malloc(sizeof(struct md_store), M_TEMP, + M_WAITOK); + /* Note: use of M_WAITOK means it won't fail. */ + /* set up shortcuts in MI section */ + newkse->ke_pcb = + &(((struct md_store *)(newkse->ke_mdstorage))->mds_pcb); + newkse->ke_frame = + &(((struct md_store *)(newkse->ke_mdstorage))->mds_frame); + tf = newkse->ke_frame; + + /* Copy the upcall pcb. Kernel mode & fp regs are here. */ + /* XXXKSE this may be un-needed */ + bcopy(td->td_pcb, newkse->ke_pcb, sizeof(struct pcb)); + + /* + * This initialises most of the user mode register values + * to good values. Eventually set them explicitly to know values + */ + bcopy(td->td_frame, newkse->ke_frame, sizeof(struct trapframe)); + tf->tf_edi = 0; + tf->tf_esi = 0; /* trampoline arg */ + tf->tf_ebp = 0; + tf->tf_esp = (int)newkse->ke_stackbase + newkse->ke_stacksize - 16; + tf->tf_ebx = 0; /* trampoline arg */ + tf->tf_eip = (int)newkse->ke_upcall; +} + +void +cpu_set_upcall(struct thread *td, void *pcb) +{ + struct pcb *pcb2; + + td->td_flags |= TDF_UPCALLING; + + /* Point the pcb to the top of the stack. */ + pcb2 = td->td_pcb; + + /* + * Copy the upcall pcb. This loads kernel regs. + * Those not loaded individually below get their default + * values here. + * + * XXXKSE It might be a good idea to simply skip this as + * the values of the other registers may be unimportant. + * This would remove any requirement for knowing the KSE + * at this time (see the matching comment below for + * more analysis) (need a good safe default). + */ + bcopy(pcb, pcb2, sizeof(*pcb2)); + + /* + * Create a new fresh stack for the new thread. + * The -16 is so we can expand the trapframe if we go to vm86. + * Don't forget to set this stack value into whatever supplies + * the address for the fault handlers. 
+ * The contexts are filled in at the time we actually DO the + * upcall as only then do we know which KSE we got. + */ + td->td_frame = (struct trapframe *)((caddr_t)pcb2 - 16) - 1; + + /* + * Set registers for trampoline to user mode. Leave space for the + * return address on stack. These are the kernel mode register values. + */ + pcb2->pcb_cr3 = vtophys(vmspace_pmap(td->td_proc->p_vmspace)->pm_pdir); + pcb2->pcb_edi = 0; + pcb2->pcb_esi = (int)fork_return; /* trampoline arg */ + pcb2->pcb_ebp = 0; + pcb2->pcb_esp = (int)td->td_frame - sizeof(void *); /* trampoline arg */ + pcb2->pcb_ebx = (int)td; /* trampoline arg */ + pcb2->pcb_eip = (int)fork_trampoline; + pcb2->pcb_psl &= ~(PSL_I); /* interrupts must be disabled */ + /* + * If we didn't copy the pcb, we'd need to do the following registers: + * pcb2->pcb_dr*: cloned above. + * pcb2->pcb_savefpu: cloned above. + * pcb2->pcb_flags: cloned above. + * pcb2->pcb_onfault: cloned above (always NULL here?). + * pcb2->pcb_gs: cloned above. XXXKSE ??? + * pcb2->pcb_ext: cleared below. + */ + pcb2->pcb_ext = NULL; +} + +void +cpu_set_args(struct thread *td, struct kse *ke) +{ + suword((void *)(ke->ke_frame->tf_esp + sizeof(void *)), + (int)ke->ke_mailbox); +} + +void +cpu_free_kse_mdstorage(struct kse *kse) +{ + + free(kse->ke_mdstorage, M_TEMP); + kse->ke_mdstorage = NULL; + kse->ke_pcb = NULL; + kse->ke_frame = NULL; +} + +int +cpu_export_context(struct thread *td) +{ + struct trapframe *frame; + struct thread_mailbox *tm; + struct trapframe *uframe; + int error; + + frame = td->td_frame; + tm = td->td_mailbox; + uframe = &tm->ctx.tfrm.tf_tf; + error = copyout(frame, uframe, sizeof(*frame)); + /* + * "What about the fp regs?" I hear you ask.... XXXKSE + * Don't know where gs and "onstack" come from. + * May need to fiddle a few other values too. + */ + return (error); +} + +void cpu_wait(p) struct proc *p; { diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c index 02b858e..5129746 100644 --- a/sys/compat/linprocfs/linprocfs.c +++ b/sys/compat/linprocfs/linprocfs.c @@ -539,21 +539,6 @@ linprocfs_doprocstat(PFS_FILL_ARGS) } /* - * Map process state to descriptive letter. Note that this does not - * quite correspond to what Linux outputs, but it's close enough. - */ -static char *state_str[] = { - "? (unknown)", - "I (idle)", - "R (running)", - "S (sleeping)", - "T (stopped)", - "Z (zombie)", - "W (waiting)", - "M (mutex)" -}; - -/* * Filler function for proc/pid/status */ static int @@ -562,13 +547,53 @@ linprocfs_doprocstatus(PFS_FILL_ARGS) struct kinfo_proc kp; char *state; segsz_t lsize; + struct thread *td2; int i; mtx_lock_spin(&sched_lock); - if (p->p_stat > sizeof state_str / sizeof *state_str) - state = state_str[0]; - else - state = state_str[(int)p->p_stat]; + td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */ + + if (P_SHOULDSTOP(p)) { + state = "T (stopped)"; + } else { + switch(p->p_state) { + case PRS_NEW: + state = "I (idle)"; + break; + case PRS_NORMAL: + if (p->p_flag & P_WEXIT) { + state = "X (exiting)"; + break; + } + switch(td2->td_state) { + case TDS_SLP: + case TDS_MTX: + state = "S (sleeping)"; + break; + case TDS_RUNQ: + case TDS_RUNNING: + state = "R (running)"; + break; + case TDS_NEW: + case TDS_UNQUEUED: + case TDS_IWAIT: + case TDS_SURPLUS: + default: + state = "? (unknown)"; + break; + } + break; + case PRS_WAIT: + state = "W (waiting)"; + break; + case PRS_ZOMBIE: + state = "Z (zombie)"; + break; + default: + state = "? 
(unknown)"; + break; + } + } mtx_unlock_spin(&sched_lock); PROC_LOCK(p); diff --git a/sys/compat/svr4/svr4_misc.c b/sys/compat/svr4/svr4_misc.c index 7ef01b9..f60d62c 100644 --- a/sys/compat/svr4/svr4_misc.c +++ b/sys/compat/svr4/svr4_misc.c @@ -1168,7 +1168,7 @@ svr4_setinfo(p, st, s) if (p) { i.si_pid = p->p_pid; mtx_lock_spin(&sched_lock); - if (p->p_stat == SZOMB) { + if (p->p_state == PRS_ZOMBIE) { i.si_stime = p->p_ru->ru_stime.tv_sec; i.si_utime = p->p_ru->ru_utime.tv_sec; } @@ -1256,7 +1256,7 @@ loop: } nfound++; mtx_lock_spin(&sched_lock); - if (q->p_stat == SZOMB && + if ((q->p_state == PRS_ZOMBIE) && ((SCARG(uap, options) & (SVR4_WEXITED|SVR4_WTRAPPED)))) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(q); @@ -1372,7 +1372,8 @@ loop: nprocs--; return 0; } - if (q->p_stat == SSTOP && (q->p_flag & P_WAITED) == 0 && + /* XXXKSE this needs clarification */ + if (P_SHOULDSTOP(q) && ((q->p_flag & P_WAITED) == 0) && (q->p_flag & P_TRACED || (SCARG(uap, options) & (SVR4_WSTOPPED|SVR4_WCONTINUED)))) { mtx_unlock_spin(&sched_lock); diff --git a/sys/conf/files b/sys/conf/files index 1cff41f..9994c11 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -870,6 +870,7 @@ kern/kern_synch.c standard kern/kern_syscalls.c standard kern/kern_sysctl.c standard kern/kern_tc.c standard +kern/kern_thread.c standard kern/kern_time.c standard kern/kern_timeout.c standard kern/kern_uuid.c standard diff --git a/sys/ddb/db_ps.c b/sys/ddb/db_ps.c index 9468f63..996e4eb 100644 --- a/sys/ddb/db_ps.c +++ b/sys/ddb/db_ps.c @@ -52,6 +52,7 @@ db_ps(dummy1, dummy2, dummy3, dummy4) int nl = 0; volatile struct proc *p, *pp; volatile struct thread *td; + char *state; np = nprocs; @@ -96,23 +97,44 @@ db_ps(dummy1, dummy2, dummy3, dummy4) if (pp == NULL) pp = p; - db_printf("%5d %8p %8p %4d %5d %5d %07x %d", + + switch(p->p_state) { + case PRS_NORMAL: + if (P_SHOULDSTOP(p)) + state = "stopped"; + else + state = "Normal"; + break; + case PRS_NEW: + state = "New"; + break; + case PRS_WAIT: + state = "Wait"; + break; + case PRS_ZOMBIE: + state = "Zombie"; + break; + default: + state = "Unknown"; + break; + } + db_printf("%5d %8p %8p %4d %5d %5d %07x %s", p->p_pid, (volatile void *)p, (void *)p->p_uarea, p->p_ucred ? p->p_ucred->cr_ruid : 0, pp->p_pid, - p->p_pgrp ? p->p_pgrp->pg_id : 0, p->p_flag, p->p_stat); + p->p_pgrp ? p->p_pgrp->pg_id : 0, p->p_flag, state); if (p->p_flag & P_KSES) { db_printf("(threaded) %s\n", p->p_comm); FOREACH_THREAD_IN_PROC(p, td) { db_printf( ". . . . . . . " - ". . . . . . . . "); + ". thread %p . . . 
", td); if (td->td_wchan) { - db_printf("%6s %8p", td->td_wmesg, + db_printf("SLP %6s %8p\n", td->td_wmesg, (void *)td->td_wchan); - } else if (p->p_stat == SMTX) { - db_printf("%6s %8p", td->td_mtxname, + } else if (td->td_state == TDS_MTX) { + db_printf("MTX %6s %8p\n", td->td_mtxname, (void *)td->td_blocked); } else { - db_printf("--not blocked--"); + db_printf("--not blocked--\n"); } } } else { @@ -120,7 +142,7 @@ db_ps(dummy1, dummy2, dummy3, dummy4) if (td->td_wchan) { db_printf(" %6s %8p", td->td_wmesg, (void *)td->td_wchan); - } else if (p->p_stat == SMTX) { + } else if (td->td_state == TDS_MTX) { db_printf(" %6s %8p", td->td_mtxname, (void *)td->td_blocked); } else { diff --git a/sys/fs/procfs/procfs_ctl.c b/sys/fs/procfs/procfs_ctl.c index 0f35370..15ed718 100644 --- a/sys/fs/procfs/procfs_ctl.c +++ b/sys/fs/procfs/procfs_ctl.c @@ -62,7 +62,7 @@ * relative to process (curp) */ #define TRACE_WAIT_P(curp, p) \ - ((p)->p_stat == SSTOP && \ + (P_SHOULDSTOP(p) && \ (p)->p_pptr == (curp) && \ ((p)->p_flag & P_TRACED)) @@ -262,6 +262,7 @@ out: */ case PROCFS_CTL_RUN: PROC_UNLOCK(p); + p->p_flag &= ~P_STOPPED_SGNL; /* this uses SIGSTOP */ break; /* @@ -272,27 +273,26 @@ out: case PROCFS_CTL_WAIT: if (p->p_flag & P_TRACED) { while (error == 0 && - (p->p_stat != SSTOP) && + (P_SHOULDSTOP(p)) && (p->p_flag & P_TRACED) && (p->p_pptr == td->td_proc)) error = msleep((caddr_t) p, &p->p_mtx, PWAIT|PCATCH, "procfsx", 0); if (error == 0 && !TRACE_WAIT_P(td->td_proc, p)) error = EBUSY; - } else - while (error == 0 && p->p_stat != SSTOP) + } else { + while (error == 0 && P_SHOULDSTOP(p)) error = msleep((caddr_t) p, &p->p_mtx, PWAIT|PCATCH, "procfs", 0); + } PROC_UNLOCK(p); return (error); - default: panic("procfs_control"); } mtx_lock_spin(&sched_lock); - if (p->p_stat == SSTOP) - setrunnable(FIRST_THREAD_IN_PROC(p)); /* XXXKSE */ + thread_unsuspend(p); /* If it can run, let it do so. */ mtx_unlock_spin(&sched_lock); return (0); } @@ -349,6 +349,7 @@ procfs_doprocctl(PFS_FILL_ARGS) #endif mtx_lock_spin(&sched_lock); /* XXXKSE: */ + p->p_flag &= ~P_STOPPED_SGNL; setrunnable(FIRST_THREAD_IN_PROC(p)); mtx_unlock_spin(&sched_lock); } else diff --git a/sys/fs/procfs/procfs_dbregs.c b/sys/fs/procfs/procfs_dbregs.c index 361f34b..442521c 100644 --- a/sys/fs/procfs/procfs_dbregs.c +++ b/sys/fs/procfs/procfs_dbregs.c @@ -90,7 +90,7 @@ procfs_doprocdbregs(PFS_FILL_ARGS) if (error == 0) error = uiomove(kv, kl, uio); if (error == 0 && uio->uio_rw == UIO_WRITE) { - if (p->p_stat != SSTOP) + if (!P_SHOULDSTOP(p)) /* XXXKSE should be P_TRACED? 
*/ error = EBUSY; else /* XXXKSE: */ diff --git a/sys/fs/procfs/procfs_fpregs.c b/sys/fs/procfs/procfs_fpregs.c index afabb33..f1401f3 100644 --- a/sys/fs/procfs/procfs_fpregs.c +++ b/sys/fs/procfs/procfs_fpregs.c @@ -84,7 +84,7 @@ procfs_doprocfpregs(PFS_FILL_ARGS) if (error == 0) error = uiomove(kv, kl, uio); if (error == 0 && uio->uio_rw == UIO_WRITE) { - if (p->p_stat != SSTOP) + if (!P_SHOULDSTOP(p)) error = EBUSY; else /* XXXKSE: */ diff --git a/sys/fs/procfs/procfs_ioctl.c b/sys/fs/procfs/procfs_ioctl.c index 09aef86..9d49be9 100644 --- a/sys/fs/procfs/procfs_ioctl.c +++ b/sys/fs/procfs/procfs_ioctl.c @@ -94,9 +94,11 @@ procfs_ioctl(PFS_IOCTL_ARGS) #if 0 mtx_lock_spin(&sched_lock); p->p_step = 0; - if (p->p_stat == SSTOP) { + if (P_SHOULDSTOP(p)) { p->p_xstat = sig; - setrunnable(FIRST_THREAD_IN_PROC(p)); + p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SGNL); + FOREACH_THREAD_IN_PROC(p, td) + setrunnable(td); /* XXX Totally bogus */ mtx_unlock_spin(&sched_lock); } else { mtx_unlock_spin(&sched_lock); diff --git a/sys/fs/procfs/procfs_regs.c b/sys/fs/procfs/procfs_regs.c index 5fcb450..6cefe7e 100644 --- a/sys/fs/procfs/procfs_regs.c +++ b/sys/fs/procfs/procfs_regs.c @@ -86,7 +86,7 @@ procfs_doprocregs(PFS_FILL_ARGS) error = uiomove(kv, kl, uio); PROC_LOCK(p); if (error == 0 && uio->uio_rw == UIO_WRITE) { - if (p->p_stat != SSTOP) + if (!P_SHOULDSTOP(p)) error = EBUSY; else /* XXXKSE: */ diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index f3e9f04..dcc1880 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -79,10 +79,10 @@ ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); -ASSYM(P_STAT, offsetof(struct proc, p_stat)); +ASSYM(P_STATE, offsetof(struct proc, p_state)); ASSYM(P_UAREA, offsetof(struct proc, p_uarea)); -/*ASSYM(TD_STAT, offsetof(struct thread, td__stat));*/ +ASSYM(TD_STATE, offsetof(struct thread, td_state)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); @@ -101,8 +101,9 @@ ASSYM(KE_FLAGS, offsetof(struct kse, ke_flags)); ASSYM(KEF_ASTPENDING, KEF_ASTPENDING); ASSYM(KEF_NEEDRESCHED, KEF_NEEDRESCHED); -ASSYM(SSLEEP, SSLEEP); -ASSYM(SRUN, SRUN); +ASSYM(TDS_SLP, TDS_SLP); +ASSYM(TDS_RUNQ, TDS_RUNQ); +ASSYM(TDS_RUNNING, TDS_RUNNING); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 2f11ee2..c73c5e1 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -799,7 +799,7 @@ cpu_idle(void) { if (cpu_idle_hlt) { disable_intr(); - if (procrunnable()) { + if (kserunnable()) { enable_intr(); } else { /* diff --git a/sys/i386/i386/pmap.c b/sys/i386/i386/pmap.c index e2cebaf..9e35ad7 100644 --- a/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -1100,7 +1100,12 @@ pmap_dispose_thread(td) ksobj = td->td_kstack_obj; ks = td->td_kstack; ptek = vtopte(ks); +#ifdef KSTACK_GUARD + ks -= PAGE_SIZE; + for (i = 1; i < (KSTACK_PAGES + 1); i++) { +#else for (i = 0; i < KSTACK_PAGES; i++) { +#endif m = vm_page_lookup(ksobj, i); if (m == NULL) panic("pmap_dispose_thread: kstack already missing?"); @@ -1116,16 +1121,17 @@ pmap_dispose_thread(td) #ifdef I386_CPU invltlb(); #endif - /* - * If the thread got swapped out some of its 
KSTACK might have gotten - * swapped. Just get rid of the object to clean up the swap use - * proactively. NOTE! might block waiting for paging I/O to complete. + * Free the space that this stack was mapped to in the kernel + * address map. */ - if (ksobj->type == OBJT_SWAP) { - td->td_kstack_obj = NULL; - vm_object_deallocate(ksobj); - } +#ifdef KSTACK_GUARD + kmem_free(kernel_map, ks, (KSTACK_PAGES + 1) * PAGE_SIZE); +#else + kmem_free(kernel_map, ks, KSTACK_PAGES * PAGE_SIZE); +#endif + vm_object_deallocate(ksobj); + td->td_kstack_obj = NULL; /* play it safe */ } /* diff --git a/sys/i386/i386/swtch.s b/sys/i386/i386/swtch.s index e0f9bcd..80db485 100644 --- a/sys/i386/i386/swtch.s +++ b/sys/i386/i386/swtch.s @@ -65,12 +65,19 @@ tlb_flush_count: .long 0 /* * cpu_throw() + * + * This is the second half of cpu_swtch(). It is used when the current + * thread is either a dummy or slated to die, and we no longer care + * about its state. */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() + * + * Save the current thread state, then select the next thread to run + * and load its state. */ ENTRY(cpu_switch) @@ -166,11 +173,11 @@ sw1b: movl %eax,%ecx #ifdef INVARIANTS - movl TD_PROC(%ecx), %eax /* XXXKSE */ - cmpb $SRUN,P_STAT(%eax) + cmpb $TDS_RUNQ,TD_STATE(%ecx) jne badsw2 #endif + movl $TDS_RUNNING,TD_STATE(%ecx) movl TD_PCB(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) @@ -310,12 +317,14 @@ cpu_switch_load_gs: #ifdef INVARIANTS badsw2: + pushal pushl $sw0_2 call panic sw0_2: .asciz "cpu_switch: not TDS_RUNQ" badsw3: + pushal pushl $sw0_3 call panic diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index 08c75e4..8282416 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -54,6 +54,7 @@ #include <sys/bus.h> #include <sys/systm.h> #include <sys/proc.h> +#include <sys/kse.h> #include <sys/pioctl.h> #include <sys/kernel.h> #include <sys/ktr.h> @@ -267,6 +268,17 @@ trap(frame) if (td->td_ucred != p->p_ucred) cred_update_thread(td); + /* + * First check that we shouldn't just abort. + * But check if we are the single thread first! + */ + if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) { + mtx_lock_spin(&sched_lock); + PROC_LOCK(p); + thread_exit(); + /* NOTREACHED */ + } + switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; @@ -939,11 +951,30 @@ syscall(frame) mtx_unlock(&Giant); } #endif + KASSERT((td->td_kse != NULL), ("syscall: kse/thread UNLINKED")); + KASSERT((td->td_kse->ke_thread == td), ("syscall:kse/thread mismatch")); sticks = td->td_kse->ke_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); + if (p->p_flag & P_KSES) { + /* + * If we are doing a syscall in a KSE environment, + * note where our mailbox is. There is always the + * possibility that we could do this lazily (in sleep()), + * but for now do it every time. + */ + td->td_mailbox = (void *)fuword((caddr_t)td->td_kse->ke_mailbox + + offsetof(struct kse_mailbox, kmbx_current_thread)); + if ((td->td_mailbox == NULL) || + (td->td_mailbox == (void *)-1)) { + td->td_mailbox = NULL; /* single thread it.. */ + td->td_flags &= ~TDF_UNBOUND; + } else { + td->td_flags |= TDF_UNBOUND; + } + } params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; orig_tf_eflags = frame.tf_eflags; @@ -1045,6 +1076,12 @@ syscall(frame) } /* + * Release Giant if we previously set it. + */ + if ((callp->sy_narg & SYF_MPSAFE) == 0) + mtx_unlock(&Giant); + + /* * Traced syscall. 
*/ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { @@ -1057,12 +1094,6 @@ syscall(frame) */ userret(td, &frame, sticks); - /* - * Release Giant if we previously set it. - */ - if ((callp->sy_narg & SYF_MPSAFE) == 0) - mtx_unlock(&Giant); - #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c index 5dc2e14..04742c3 100644 --- a/sys/i386/i386/vm_machdep.c +++ b/sys/i386/i386/vm_machdep.c @@ -53,6 +53,7 @@ #include <sys/systm.h> #include <sys/malloc.h> #include <sys/proc.h> +#include <sys/kse.h> #include <sys/bio.h> #include <sys/buf.h> #include <sys/vnode.h> @@ -254,15 +255,26 @@ cpu_set_fork_handler(td, func, arg) } void -cpu_exit(td) - register struct thread *td; +cpu_exit(struct thread *td) +{ + struct mdproc *mdp; + + mdp = &td->td_proc->p_md; + if (mdp->md_ldt) + user_ldt_free(td); + reset_dbregs(); +} + +void +cpu_thread_exit(struct thread *td) { struct pcb *pcb = td->td_pcb; - struct mdproc *mdp = &td->td_proc->p_md; #ifdef DEV_NPX npxexit(td); #endif if (pcb->pcb_ext != 0) { + /* XXXKSE XXXSMP not SMP SAFE.. what locks do we have? */ + /* if (pcb->pcb_ext->ext_refcount-- == 1) ?? */ /* * XXX do we need to move the TSS off the allocated pages * before freeing them? (not done here) @@ -271,8 +283,6 @@ cpu_exit(td) ctob(IOPAGES + 1)); pcb->pcb_ext = 0; } - if (mdp->md_ldt) - user_ldt_free(td); if (pcb->pcb_flags & PCB_DBREGS) { /* * disable all hardware breakpoints @@ -289,6 +299,146 @@ cpu_sched_exit(td) } void +cpu_thread_setup(struct thread *td) +{ + + td->td_pcb = + (struct pcb *)(td->td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; + td->td_frame = (struct trapframe *)((caddr_t)td->td_pcb - 16) - 1; +} + +struct md_store { + struct pcb mds_pcb; + struct trapframe mds_frame; +}; + +void +cpu_save_upcall(struct thread *td, struct kse *newkse) +{ + struct trapframe *tf; + + newkse->ke_mdstorage = malloc(sizeof(struct md_store), M_TEMP, + M_WAITOK); + /* Note: use of M_WAITOK means it won't fail. */ + /* set up shortcuts in MI section */ + newkse->ke_pcb = + &(((struct md_store *)(newkse->ke_mdstorage))->mds_pcb); + newkse->ke_frame = + &(((struct md_store *)(newkse->ke_mdstorage))->mds_frame); + tf = newkse->ke_frame; + + /* Copy the upcall pcb. Kernel mode & fp regs are here. */ + /* XXXKSE this may be un-needed */ + bcopy(td->td_pcb, newkse->ke_pcb, sizeof(struct pcb)); + + /* + * This initialises most of the user mode register values + * to good values. Eventually set them explicitly to know values + */ + bcopy(td->td_frame, newkse->ke_frame, sizeof(struct trapframe)); + tf->tf_edi = 0; + tf->tf_esi = 0; /* trampoline arg */ + tf->tf_ebp = 0; + tf->tf_esp = (int)newkse->ke_stackbase + newkse->ke_stacksize - 16; + tf->tf_ebx = 0; /* trampoline arg */ + tf->tf_eip = (int)newkse->ke_upcall; +} + +void +cpu_set_upcall(struct thread *td, void *pcb) +{ + struct pcb *pcb2; + + td->td_flags |= TDF_UPCALLING; + + /* Point the pcb to the top of the stack. */ + pcb2 = td->td_pcb; + + /* + * Copy the upcall pcb. This loads kernel regs. + * Those not loaded individually below get their default + * values here. + * + * XXXKSE It might be a good idea to simply skip this as + * the values of the other registers may be unimportant. + * This would remove any requirement for knowing the KSE + * at this time (see the matching comment below for + * more analysis) (need a good safe default). + */ + bcopy(pcb, pcb2, sizeof(*pcb2)); + + /* + * Create a new fresh stack for the new thread. 
+ * The -16 is so we can expand the trapframe if we go to vm86. + * Don't forget to set this stack value into whatever supplies + * the address for the fault handlers. + * The contexts are filled in at the time we actually DO the + * upcall as only then do we know which KSE we got. + */ + td->td_frame = (struct trapframe *)((caddr_t)pcb2 - 16) - 1; + + /* + * Set registers for trampoline to user mode. Leave space for the + * return address on stack. These are the kernel mode register values. + */ + pcb2->pcb_cr3 = vtophys(vmspace_pmap(td->td_proc->p_vmspace)->pm_pdir); + pcb2->pcb_edi = 0; + pcb2->pcb_esi = (int)fork_return; /* trampoline arg */ + pcb2->pcb_ebp = 0; + pcb2->pcb_esp = (int)td->td_frame - sizeof(void *); /* trampoline arg */ + pcb2->pcb_ebx = (int)td; /* trampoline arg */ + pcb2->pcb_eip = (int)fork_trampoline; + pcb2->pcb_psl &= ~(PSL_I); /* interrupts must be disabled */ + /* + * If we didn't copy the pcb, we'd need to do the following registers: + * pcb2->pcb_dr*: cloned above. + * pcb2->pcb_savefpu: cloned above. + * pcb2->pcb_flags: cloned above. + * pcb2->pcb_onfault: cloned above (always NULL here?). + * pcb2->pcb_gs: cloned above. XXXKSE ??? + * pcb2->pcb_ext: cleared below. + */ + pcb2->pcb_ext = NULL; +} + +void +cpu_set_args(struct thread *td, struct kse *ke) +{ + suword((void *)(ke->ke_frame->tf_esp + sizeof(void *)), + (int)ke->ke_mailbox); +} + +void +cpu_free_kse_mdstorage(struct kse *kse) +{ + + free(kse->ke_mdstorage, M_TEMP); + kse->ke_mdstorage = NULL; + kse->ke_pcb = NULL; + kse->ke_frame = NULL; +} + +int +cpu_export_context(struct thread *td) +{ + struct trapframe *frame; + struct thread_mailbox *tm; + struct trapframe *uframe; + int error; + + frame = td->td_frame; + tm = td->td_mailbox; + uframe = &tm->ctx.tfrm.tf_tf; + error = copyout(frame, uframe, sizeof(*frame)); + /* + * "What about the fp regs?" I hear you ask.... XXXKSE + * Don't know where gs and "onstack" come from. + * May need to fiddle a few other values too. + */ + return (error); +} + +void cpu_wait(p) struct proc *p; { diff --git a/sys/i386/linux/linux_machdep.c b/sys/i386/linux/linux_machdep.c index 245c96a..0819b67 100644 --- a/sys/i386/linux/linux_machdep.c +++ b/sys/i386/linux/linux_machdep.c @@ -361,7 +361,6 @@ linux_clone(struct thread *td, struct linux_clone_args *args) * Make this runnable after we are finished with it. */ mtx_lock_spin(&sched_lock); - p2->p_stat = SRUN; setrunqueue(FIRST_THREAD_IN_PROC(p2)); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p2); diff --git a/sys/i386/linux/linux_ptrace.c b/sys/i386/linux/linux_ptrace.c index 536188b..a19dcc7 100644 --- a/sys/i386/linux/linux_ptrace.c +++ b/sys/i386/linux/linux_ptrace.c @@ -409,7 +409,7 @@ linux_ptrace(struct thread *td, struct linux_ptrace_args *uap) } /* not currently stopped */ - if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) { + if ((p->p_flag & (P_TRACED|P_WAITED)) == 0) { error = EBUSY; goto fail; } diff --git a/sys/ia64/ia64/trap.c b/sys/ia64/ia64/trap.c index e38945f..4ffdb15 100644 --- a/sys/ia64/ia64/trap.c +++ b/sys/ia64/ia64/trap.c @@ -872,14 +872,14 @@ syscall(int code, u_int64_t *args, struct trapframe *framep) break; } - userret(td, framep, sticks); - /* * Release Giant if we had to get it. 
*/ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); + userret(td, framep, sticks); + #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); @@ -1043,16 +1043,16 @@ ia32_syscall(struct trapframe *framep) } /* - * Handle reschedule and other end-of-syscall issues - */ - userret(td, framep, sticks); - - /* * Release Giant if we previously set it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); + /* + * Handle reschedule and other end-of-syscall issues + */ + userret(td, framep, sticks); + #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index d5c5656..06cc8d8 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -289,6 +289,7 @@ proc0_init(void *dummy __unused) * Initialize thread, process and pgrp structures. */ procinit(); + threadinit(); /* * Initialize sleep queue hash table @@ -322,19 +323,34 @@ proc0_init(void *dummy __unused) p->p_sysent = &aout_sysvec; #endif + /* + * proc_linkup was already done in init_i386() or alphainit() etc. + * because the earlier code needed to follow td->td_proc. Otherwise + * I would have done it here.. maybe this means this should be + * done earlier too. + */ ke = &proc0.p_kse; /* XXXKSE */ kg = &proc0.p_ksegrp; /* XXXKSE */ p->p_flag = P_SYSTEM; p->p_sflag = PS_INMEM; - p->p_stat = SRUN; - p->p_ksegrp.kg_nice = NZERO; - kg->kg_pri_class = PRI_TIMESHARE; - kg->kg_user_pri = PUSER; - td->td_priority = PVM; - td->td_base_pri = PUSER; - + p->p_state = PRS_NORMAL; + td->td_state = TDS_RUNNING; + kg->kg_nice = NZERO; + kg->kg_pri_class = PRI_TIMESHARE; + kg->kg_user_pri = PUSER; + td->td_priority = PVM; + td->td_base_pri = PUSER; + td->td_kse = ke; /* XXXKSE */ + ke->ke_oncpu = 0; + ke->ke_state = KES_RUNNING; + ke->ke_thread = td; + /* proc_linkup puts it in the idle queue, that's not what we want. 
*/ + TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); + kg->kg_idle_kses--; p->p_peers = 0; p->p_leader = p; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + bcopy("swapper", p->p_comm, sizeof ("swapper")); @@ -662,8 +678,7 @@ kick_init(const void *udata __unused) td = FIRST_THREAD_IN_PROC(initproc); mtx_lock_spin(&sched_lock); - initproc->p_stat = SRUN; - setrunqueue(FIRST_THREAD_IN_PROC(initproc)); /* XXXKSE */ + setrunqueue(td); /* XXXKSE */ mtx_unlock_spin(&sched_lock); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 425e3b7..cf8ba80 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -405,7 +405,7 @@ struct sysent sysent[] = { { 0, (sy_call_t *)kse_wakeup }, /* 380 = kse_wakeup */ { AS(kse_new_args), (sy_call_t *)kse_new }, /* 381 = kse_new */ { AS(thread_wakeup_args), (sy_call_t *)thread_wakeup }, /* 382 = thread_wakeup */ - { 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */ + { SYF_MPSAFE | 0, (sy_call_t *)kse_yield }, /* 383 = kse_yield */ { 0, (sy_call_t *)nosys }, /* 384 = __mac_get_proc */ { 0, (sy_call_t *)nosys }, /* 385 = __mac_set_proc */ { 0, (sy_call_t *)nosys }, /* 386 = __mac_get_fd */ diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c index 9d30d25..78585b2 100644 --- a/sys/kern/kern_condvar.c +++ b/sys/kern/kern_condvar.c @@ -48,7 +48,7 @@ */ #define CV_ASSERT(cvp, mp, td) do { \ KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \ - KASSERT((td)->td_proc->p_stat == SRUN, ("%s: not SRUN", __func__)); \ + KASSERT((td)->td_state == TDS_RUNNING, ("%s: not TDS_RUNNING", __func__)); \ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ KASSERT((mp) != NULL, ("%s: mp NULL", __func__)); \ mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \ @@ -80,6 +80,7 @@ #endif static void cv_timedwait_end(void *arg); +static void cv_check_upcall(struct thread *td); /* * Initialize a condition variable. Must be called before use. @@ -109,14 +110,47 @@ cv_destroy(struct cv *cvp) */ /* + * Decide if we need to queue an upcall. + * This is copied from msleep(), perhaps this should be a common function. + */ +static void +cv_check_upcall(struct thread *td) +{ + + /* + * If we are capable of async syscalls and there isn't already + * another one ready to return, start a new thread + * and queue it as ready to run. Note that there is danger here + * because we need to make sure that we don't sleep allocating + * the thread (recursion here might be bad). + * Hence the TDF_INMSLEEP flag. + */ + if ((td->td_proc->p_flag & P_KSES) && td->td_mailbox && + (td->td_flags & TDF_INMSLEEP) == 0) { + /* + * If we have no queued work to do, + * upcall to the UTS to see if it has more work. + * We don't need to upcall now, just queue it. + */ + if (TAILQ_FIRST(&td->td_ksegrp->kg_runq) == NULL) { + /* Don't recurse here! */ + td->td_flags |= TDF_INMSLEEP; + thread_schedule_upcall(td, td->td_kse); + td->td_flags &= ~TDF_INMSLEEP; + } + } +} + +/* * Switch context. 
*/ static __inline void cv_switch(struct thread *td) { - td->td_proc->p_stat = SSLEEP; + td->td_state = TDS_SLP; td->td_proc->p_stats->p_ru.ru_nvcsw++; + cv_check_upcall(td); mi_switch(); CTR3(KTR_PROC, "cv_switch: resume thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); @@ -135,7 +169,7 @@ cv_switch_catch(struct thread *td) * We put ourselves on the sleep queue and start our timeout before * calling cursig, as we could stop there, and a wakeup or a SIGCONT (or * both) could occur while we were stopped. A SIGCONT would cause us to - * be marked as SSLEEP without resuming us, thus we must be ready for + * be marked as TDS_SLP without resuming us, thus we must be ready for * sleep when cursig is called. If the wakeup happens while we're * stopped, td->td_wchan will be 0 upon return from cursig. */ @@ -143,13 +177,15 @@ cv_switch_catch(struct thread *td) mtx_unlock_spin(&sched_lock); p = td->td_proc; PROC_LOCK(p); - sig = cursig(p); /* XXXKSE */ + sig = cursig(td); /* XXXKSE */ + if (thread_suspend_check(1)) + sig = SIGSTOP; mtx_lock_spin(&sched_lock); PROC_UNLOCK(p); if (sig != 0) { if (td->td_wchan != NULL) cv_waitq_remove(td); - td->td_proc->p_stat = SRUN; + td->td_state = TDS_RUNNING; /* XXXKSE */ } else if (td->td_wchan != NULL) { cv_switch(td); } @@ -175,7 +211,6 @@ cv_waitq_add(struct cv *cvp, struct thread *td) td->td_flags |= TDF_CVWAITQ; td->td_wchan = cvp; td->td_wmesg = cvp->cv_description; - td->td_kse->ke_slptime = 0; /* XXXKSE */ td->td_ksegrp->kg_slptime = 0; /* XXXKSE */ td->td_base_pri = td->td_priority; CTR3(KTR_PROC, "cv_waitq_add: thread %p (pid %d, %s)", td, @@ -285,7 +320,7 @@ cv_wait_sig(struct cv *cvp, struct mtx *mp) PROC_LOCK(p); if (sig == 0) - sig = cursig(p); /* XXXKSE */ + sig = cursig(td); /* XXXKSE */ if (sig != 0) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; @@ -293,6 +328,8 @@ cv_wait_sig(struct cv *cvp, struct mtx *mp) rval = ERESTART; } PROC_UNLOCK(p); + if (p->p_flag & P_WEXIT) + rval = EINTR; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) @@ -363,6 +400,8 @@ cv_timedwait(struct cv *cvp, struct mtx *mp, int timo) mi_switch(); } + if (td->td_proc->p_flag & P_WEXIT) + rval = EWOULDBLOCK; mtx_unlock_spin(&sched_lock); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) @@ -436,12 +475,11 @@ cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo) td->td_proc->p_stats->p_ru.ru_nivcsw++; mi_switch(); } - mtx_unlock_spin(&sched_lock); PROC_LOCK(p); if (sig == 0) - sig = cursig(p); + sig = cursig(td); if (sig != 0) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; @@ -450,6 +488,9 @@ cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo) } PROC_UNLOCK(p); + if (p->p_flag & P_WEXIT) + rval = EINTR; + #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); @@ -477,15 +518,13 @@ cv_wakeup(struct cv *cvp) TAILQ_REMOVE(&cvp->cv_waitq, td, td_slpq); td->td_flags &= ~TDF_CVWAITQ; td->td_wchan = 0; - if (td->td_proc->p_stat == SSLEEP) { + if (td->td_state == TDS_SLP) { /* OPTIMIZED EXPANSION OF setrunnable(td); */ CTR3(KTR_PROC, "cv_signal: thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); if (td->td_ksegrp->kg_slptime > 1) /* XXXKSE */ updatepri(td); - td->td_kse->ke_slptime = 0; td->td_ksegrp->kg_slptime = 0; - td->td_proc->p_stat = SRUN; if (td->td_proc->p_sflag & PS_INMEM) { setrunqueue(td); maybe_resched(td); @@ -568,7 +607,7 @@ cv_timedwait_end(void *arg) td->td_flags &= ~TDF_TIMEOUT; setrunqueue(td); } else if (td->td_wchan != NULL) { - if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */ + if 
(td->td_state == TDS_SLP) /* XXXKSE */ setrunnable(td); else cv_waitq_remove(td); @@ -577,3 +616,27 @@ cv_timedwait_end(void *arg) td->td_flags |= TDF_TIMOFAIL; mtx_unlock_spin(&sched_lock); } + +/* + * For now only abort interruptable waits. + * The others will have to either complete on their own or have a timeout. + */ +void +cv_abort(struct thread *td) +{ + + CTR3(KTR_PROC, "cv_abort: thread %p (pid %d, %s)", td, + td->td_proc->p_pid, + td->td_proc->p_comm); + mtx_lock_spin(&sched_lock); + if ((td->td_flags & (TDF_SINTR|TDF_TIMEOUT)) == TDF_SINTR) { + if (td->td_wchan != NULL) { + if (td->td_state == TDS_SLP) + setrunnable(td); + else + cv_waitq_remove(td); + } + } + mtx_unlock_spin(&sched_lock); +} + diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index feaa123..0cd7f27 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -154,12 +154,14 @@ execve(td, uap) PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); + if ((p->p_flag & P_KSES) && thread_single(SNGLE_EXIT)) { + PROC_UNLOCK(p); + mtx_unlock(&Giant); + return (ERESTART); /* Try again later. */ + } + /* If we get here all other threads are dead. */ p->p_flag |= P_INEXEC; PROC_UNLOCK(p); - -/* XXXKSE */ -/* !!!!!!!! we need abort all the other threads of this process before we */ -/* proceed beyond his point! */ /* * Initialize part of the common data diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 63a5135..fea5438 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -145,6 +145,67 @@ exit1(td, rv) /* * XXXXKSE: MUST abort all other threads before proceeding past here. */ + PROC_LOCK(p); + if (p->p_flag & P_KSES) { + /* + * First check if some other thread got here before us.. + * if so, act apropriatly, (exit or suspend); + */ + thread_suspend_check(0); + /* + * Here is a trick.. + * We need to free up our KSE to process other threads + * so that we can safely set the UNBOUND flag + * (whether or not we have a mailbox) as we are NEVER + * going to return to the user. + * The flag will not be set yet if we are exiting + * because of a signal, pagefault, or similar + * (or even an exit(2) from the UTS). + */ + td->td_flags |= TDF_UNBOUND; + + /* + * Kill off the other threads. This requires + * Some co-operation from other parts of the kernel + * so it may not be instant. + * With this state set: + * Any thread entering the kernel from userspace will + * thread_exit() in trap(). Any thread attempting to + * sleep will return immediatly + * with EINTR or EWOULDBLOCK, which will hopefully force them + * to back out to userland, freeing resources as they go, and + * anything attempting to return to userland will thread_exit() + * from userret(). thread_exit() will unsuspend us + * when the last other thread exits. + */ + if (thread_single(SNGLE_EXIT)) { + panic ("Exit: Single threading fouled up"); + } + /* + * All other activity in this process is now stopped. + * Remove excess KSEs and KSEGRPS. XXXKSE (when we have them) + * ... + * Turn off threading support. + */ + p->p_flag &= ~P_KSES; + td->td_flags &= ~TDF_UNBOUND; + thread_single_end(); /* Don't need this any more. */ + } + /* + * With this state set: + * Any thread entering the kernel from userspace will thread_exit() + * in trap(). 
Any thread attempting to sleep will return immediatly + * with EINTR or EWOULDBLOCK, which will hopefully force them + * to back out to userland, freeing resources as they go, and + * anything attempting to return to userland will thread_exit() + * from userret(). thread_exit() will do a wakeup on p->p_numthreads + * if it transitions to 1. + */ + + p->p_flag |= P_WEXIT; + PROC_UNLOCK(p); + if (td->td_kse->ke_mdstorage) + cpu_free_kse_mdstorage(td->td_kse); /* Are we a task leader? */ PROC_LOCK(p); @@ -185,7 +246,6 @@ exit1(td, rv) */ PROC_LOCK(p); p->p_flag &= ~(P_TRACED | P_PPWAIT); - p->p_flag |= P_WEXIT; SIGEMPTYSET(p->p_siglist); PROC_UNLOCK(p); if (timevalisset(&p->p_realtimer.it_value)) @@ -434,22 +494,24 @@ exit1(td, rv) /* * We have to wait until after releasing all locks before - * changing p_stat. If we block on a mutex then we will be + * changing p_state. If we block on a mutex then we will be * back at SRUN when we resume and our parent will never * harvest us. */ - p->p_stat = SZOMB; + p->p_state = PRS_ZOMBIE; wakeup(p->p_pptr); PROC_UNLOCK(p->p_pptr); - PROC_UNLOCK(p); - cnt.v_swtch++; binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); - cpu_sched_exit(td); - cpu_throw(); + cpu_sched_exit(td); /* XXXKSE check if this should be in thread_exit */ + /* + * Make sure this thread is discarded from the zombie. + * This will also release this thread's reference to the ucred. + */ + thread_exit(); panic("exit1"); } @@ -504,6 +566,8 @@ wait1(td, uap, compat) register int nfound; register struct proc *p, *q, *t; int status, error; + struct kse *ke; + struct ksegrp *kg; q = td->td_proc; if (uap->pid == 0) { @@ -540,7 +604,7 @@ loop: } nfound++; - if (p->p_stat == SZOMB) { + if (p->p_state == PRS_ZOMBIE) { /* * charge childs scheduling cpu usage to parent * XXXKSE assume only one thread & kse & ksegrp @@ -656,6 +720,21 @@ loop: } /* + * There should only be one KSE/KSEGRP but + * do it right anyhow. + */ + FOREACH_KSEGRP_IN_PROC(p, kg) { + FOREACH_KSE_IN_GROUP(kg, ke) { + /* Free the KSE spare thread. */ + if (ke->ke_tdspare != NULL) { + thread_free(ke->ke_tdspare); + p->p_kse.ke_tdspare = NULL; + } + } + } + thread_reap(); /* check for zombie threads */ + + /* * Give vm and machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. @@ -669,7 +748,7 @@ loop: mtx_unlock(&Giant); return (0); } - if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && + if (P_SHOULDSTOP(p) && ((p->p_flag & P_WAITED) == 0) && (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 016653b..eac0267 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -212,23 +212,6 @@ sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); -#if 0 -void -kse_init(struct kse *kse1, struct kse *kse2) -{ -} - -void -thread_init(struct thread *thread1, struct thread *thread2) -{ -} - -void -ksegrp_init(struct ksegrp *ksegrp1, struct ksegrp *ksegrp2) -{ -} -#endif - int fork1(td, flags, procp) struct thread *td; /* parent proc */ @@ -296,6 +279,29 @@ fork1(td, flags, procp) return (0); } + if (p1->p_flag & P_KSES) { + /* + * Idle the other threads for a second. + * Since the user space is copied, it must remain stable. 
+ * In addition, all threads (from the user perspective) + * need to either be suspended or in the kernel, + * where they will try restart in the parent and will + * be aborted in the child. + */ + PROC_LOCK(p1); + if (thread_single(SNGLE_NO_EXIT)) { + /* Abort.. someone else is single threading before us */ + PROC_UNLOCK(p1); + return (ERESTART); + } + PROC_UNLOCK(p1); + /* + * All other activity in this process + * is now suspended at the user boundary, + * (or other safe places if we think of any). + */ + } + /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); @@ -311,6 +317,11 @@ fork1(td, flags, procp) if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { sx_xunlock(&allproc_lock); uma_zfree(proc_zone, newproc); + if (p1->p_flag & P_KSES) { + PROC_LOCK(p1); + thread_single_end(); + PROC_UNLOCK(p1); + } tsleep(&forksleep, PUSER, "fork", hz / 2); return (EAGAIN); } @@ -325,6 +336,11 @@ fork1(td, flags, procp) if (!ok) { sx_xunlock(&allproc_lock); uma_zfree(proc_zone, newproc); + if (p1->p_flag & P_KSES) { + PROC_LOCK(p1); + thread_single_end(); + PROC_UNLOCK(p1); + } tsleep(&forksleep, PUSER, "fork", hz / 2); return (EAGAIN); } @@ -411,7 +427,7 @@ again: lastpid = trypid; p2 = newproc; - p2->p_stat = SIDL; /* protect against others */ + p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); @@ -449,7 +465,7 @@ again: * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ - td2 = thread_get(p2); + td2 = thread_alloc(); ke2 = &p2->p_kse; kg2 = &p2->p_ksegrp; @@ -459,8 +475,10 @@ again: (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); bzero(&ke2->ke_startzero, (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero)); +#if 0 /* bzero'd by the thread allocator */ bzero(&td2->td_startzero, (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); +#endif bzero(&kg2->kg_startzero, (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); @@ -482,9 +500,22 @@ again: * XXXKSE Theoretically only the running thread would get copied * Others in the kernel would be 'aborted' in the child. * i.e return E*something* + * On SMP we would have to stop them running on + * other CPUs! (set a flag in the proc that stops + * all returns to userland until completed) + * This is wrong but ok for 1:1. */ proc_linkup(p2, kg2, ke2, td2); + /* Set up the thread as an active thread (as if runnable). */ + TAILQ_REMOVE(&kg2->kg_iq, ke2, ke_kgrlist); + kg2->kg_idle_kses--; + ke2->ke_state = KES_UNQUEUED; + ke2->ke_thread = td2; + td2->td_kse = ke2; + td2->td_flags &= ~TDF_UNBOUND; /* For the rest of this syscall. */ +KASSERT((ke2->ke_kgrlist.tqe_next != ke2), ("linked to self!")); + /* note.. XXXKSE no pcb or u-area yet */ /* @@ -699,7 +730,6 @@ again: p2->p_acflag = AFORK; if ((flags & RFSTOPPED) == 0) { mtx_lock_spin(&sched_lock); - p2->p_stat = SRUN; setrunqueue(td2); mtx_unlock_spin(&sched_lock); } @@ -803,6 +833,9 @@ fork_exit(callout, arg, frame) struct proc *p = td->td_proc; td->td_kse->ke_oncpu = PCPU_GET(cpuid); + p->p_state = PRS_NORMAL; + td->td_state = TDS_RUNNING; /* Already done in switch() on 386. */ + td->td_kse->ke_state = KES_RUNNING; /* * Finish setting up thread glue. We need to initialize * the thread into a td_critnest=1 state. 
Some platforms @@ -814,7 +847,7 @@ fork_exit(callout, arg, frame) sched_lock.mtx_lock = (uintptr_t)td; sched_lock.mtx_recurse = 0; cpu_critical_fork_exit(); - CTR3(KTR_PROC, "fork_exit: new proc %p (pid %d, %s)", p, p->p_pid, + CTR3(KTR_PROC, "fork_exit: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c index 29194b7..306f2a5 100644 --- a/sys/kern/kern_idle.c +++ b/sys/kern/kern_idle.c @@ -40,6 +40,7 @@ idle_setup(void *dummy) struct pcpu *pc; #endif struct proc *p; + struct thread *td; int error; #ifdef SMP @@ -60,7 +61,10 @@ idle_setup(void *dummy) panic("idle_setup: kthread_create error %d\n", error); p->p_flag |= P_NOLOAD; - p->p_stat = SRUN; + td = FIRST_THREAD_IN_PROC(p); + td->td_state = TDS_RUNQ; + td->td_kse->ke_state = KES_ONRUNQ; + td->td_kse->ke_flags |= KEF_IDLEKSE; #ifdef SMP } #endif @@ -75,16 +79,22 @@ idle_proc(void *dummy) #ifdef DIAGNOSTIC int count; #endif + struct thread *td; + struct proc *p; + td = curthread; + p = td->td_proc; + td->td_state = TDS_RUNNING; + td->td_kse->ke_state = KES_RUNNING; for (;;) { mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC count = 0; - while (count >= 0 && procrunnable() == 0) { + while (count >= 0 && kserunnable() == 0) { #else - while (procrunnable() == 0) { + while (kserunnable() == 0) { #endif /* * This is a good place to put things to be done in @@ -103,8 +113,9 @@ idle_proc(void *dummy) } mtx_lock_spin(&sched_lock); - curproc->p_stats->p_ru.ru_nvcsw++; + p->p_stats->p_ru.ru_nvcsw++; mi_switch(); + td->td_kse->ke_state = KES_RUNNING; mtx_unlock_spin(&sched_lock); } } diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index d65dc82..fb9c092 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -201,7 +201,7 @@ ithread_create(struct ithd **ithread, int vector, int flags, td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ td->td_ksegrp->kg_pri_class = PRI_ITHD; td->td_priority = PRI_MAX_ITHD; - p->p_stat = SWAIT; + td->td_state = TDS_IWAIT; ithd->it_td = td; td->td_ithd = ithd; if (ithread != NULL) @@ -229,8 +229,7 @@ ithread_destroy(struct ithd *ithread) } ithread->it_flags |= IT_DEAD; mtx_lock_spin(&sched_lock); - if (p->p_stat == SWAIT) { - p->p_stat = SRUN; /* XXXKSE */ + if (td->td_state == TDS_IWAIT) { setrunqueue(td); } mtx_unlock_spin(&sched_lock); @@ -327,7 +326,7 @@ ok: * handler as being dead and let the ithread do the actual removal. 
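 * (The ithread may be running that very handler at this instant; * it is only safe to unlink a handler once the ithread has gone * back to TDS_IWAIT, so a busy ithread reaps dead handlers itself.)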
*/ mtx_lock_spin(&sched_lock); - if (ithread->it_td->td_proc->p_stat != SWAIT) { + if (ithread->it_td->td_state != TDS_IWAIT) { handler->ih_flags |= IH_DEAD; /* @@ -374,8 +373,8 @@ ithread_schedule(struct ithd *ithread, int do_switch) td = ithread->it_td; p = td->td_proc; KASSERT(p != NULL, ("ithread %s has no process", ithread->it_name)); - CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d", __func__, p->p_pid, p->p_comm, - ithread->it_need); + CTR4(KTR_INTR, "%s: pid %d: (%s) need = %d", + __func__, p->p_pid, p->p_comm, ithread->it_need); /* * Set it_need to tell the thread to keep running if it is already @@ -387,14 +386,16 @@ ithread_schedule(struct ithd *ithread, int do_switch) */ ithread->it_need = 1; mtx_lock_spin(&sched_lock); - if (p->p_stat == SWAIT) { + if (td->td_state == TDS_IWAIT) { CTR2(KTR_INTR, "%s: setrunqueue %d", __func__, p->p_pid); - p->p_stat = SRUN; - setrunqueue(td); /* XXXKSE */ - if (do_switch && curthread->td_critnest == 1 && - curthread->td_proc->p_stat == SRUN) { + setrunqueue(td); + if (do_switch && + (curthread->td_critnest == 1)/* && + (curthread->td_state == TDS_RUNNING) XXXKSE*/) { +#if 0 /* not needed in KSE */ if (curthread != PCPU_GET(idlethread)) setrunqueue(curthread); +#endif curthread->td_proc->p_stats->p_ru.ru_nivcsw++; mi_switch(); } else { @@ -402,7 +403,7 @@ ithread_schedule(struct ithd *ithread, int do_switch) } } else { CTR4(KTR_INTR, "%s: pid %d: it_need %d, state %d", - __func__, p->p_pid, ithread->it_need, p->p_stat); + __func__, p->p_pid, ithread->it_need, p->p_state); } mtx_unlock_spin(&sched_lock); @@ -550,7 +551,7 @@ restart: */ if (ithd->it_enable != NULL) ithd->it_enable(ithd->it_vector); - p->p_stat = SWAIT; /* we're idle */ + td->td_state = TDS_IWAIT; /* we're idle */ p->p_stats->p_ru.ru_nvcsw++; CTR2(KTR_INTR, "%s: pid %d: done", __func__, p->p_pid); mi_switch(); diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c index a456a86..e8e2fea 100644 --- a/sys/kern/kern_kthread.c +++ b/sys/kern/kern_kthread.c @@ -109,8 +109,7 @@ kthread_create(void (*func)(void *), void *arg, mtx_lock_spin(&sched_lock); p2->p_sflag |= PS_INMEM; if (!(flags & RFSTOPPED)) { - p2->p_stat = SRUN; - setrunqueue(FIRST_THREAD_IN_PROC(p2)); /* XXXKSE */ + setrunqueue(FIRST_THREAD_IN_PROC(p2)); } mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index 08bca8d..c2e79d0 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -119,23 +119,20 @@ propagate_priority(struct thread *td) return; } + KASSERT(td->td_state != TDS_SURPLUS, ("Mutex owner SURPLUS")); + MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); - KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex")); + KASSERT(td->td_state != TDS_SLP, + ("sleeping thread owns a mutex")); if (td->td_priority <= pri) /* lower is higher priority */ return; - /* - * Bump this thread's priority. - */ - td->td_priority = pri; /* * If lock holder is actually running, just bump priority. */ - if (thread_running(td)) { - MPASS(td->td_proc->p_stat == SRUN - || td->td_proc->p_stat == SZOMB - || td->td_proc->p_stat == SSTOP); + if (td->td_state == TDS_RUNNING) { + td->td_priority = pri; return; } @@ -151,20 +148,26 @@ propagate_priority(struct thread *td) * If on run queue move to new run queue, and quit. * XXXKSE this gets a lot more complicated under threads * but try anyhow. + * We should have a special call to do this more efficiently. 
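+ * A minimal sketch of what such a call might look like, i.e. a
+ * helper that requeues a runnable thread after a priority change
+ * (purely illustrative; no such helper exists in this patch):
+ *
+ *	static void
+ *	sched_requeue_prio(struct thread *td, int pri)
+ *	{
+ *		mtx_assert(&sched_lock, MA_OWNED);
+ *		remrunqueue(td);
+ *		td->td_priority = pri;
+ *		setrunqueue(td);
+ *	}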
*/ - if (td->td_proc->p_stat == SRUN) { + if (td->td_state == TDS_RUNQ) { MPASS(td->td_blocked == NULL); remrunqueue(td); + td->td_priority = pri; setrunqueue(td); return; } + /* + * Adjust for any other cases. + */ + td->td_priority = pri; /* * If we aren't blocked on a mutex, we should be. */ - KASSERT(td->td_proc->p_stat == SMTX, ( + KASSERT(td->td_state == TDS_MTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", - td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat, + td->td_proc->p_pid, td->td_proc->p_comm, td->td_state, m->mtx_object.lo_name)); /* @@ -590,7 +593,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) */ td->td_blocked = m; td->td_mtxname = m->mtx_object.lo_name; - td->td_proc->p_stat = SMTX; + td->td_state = TDS_MTX; propagate_priority(td); if (LOCK_LOG_TEST(&m->mtx_object, opts)) @@ -727,7 +730,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) m, td1); td1->td_blocked = NULL; - td1->td_proc->p_stat = SRUN; setrunqueue(td1); if (td->td_critnest == 1 && td1->td_priority < pri) { @@ -744,7 +746,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) } } #endif - setrunqueue(td); if (LOCK_LOG_TEST(&m->mtx_object, opts)) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c index a197bc0..9dd6924 100644 --- a/sys/kern/kern_poll.c +++ b/sys/kern/kern_poll.c @@ -503,7 +503,6 @@ poll_idle(void) mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); - setrunqueue(td); td->td_proc->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index a5378d9..8b15fc2 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -44,6 +44,7 @@ #include <sys/mutex.h> #include <sys/proc.h> #include <sys/sysproto.h> +#include <sys/kse.h> #include <sys/sysctl.h> #include <sys/filedesc.h> #include <sys/tty.h> @@ -111,44 +112,28 @@ procinit() uihashinit(); } -/* - * Note that we do not link to the proc's ucred here - * The thread is linked as if running but no KSE assigned - */ -static void -thread_link(struct thread *td, struct ksegrp *kg) -{ - struct proc *p = kg->kg_proc; - - td->td_proc = p; - td->td_ksegrp = kg; - td->td_last_kse = &p->p_kse; - - TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); - TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); - td->td_critnest = 0; - td->td_kse = NULL; - cpu_thread_link(td); -} - /* * KSE is linked onto the idle queue. 
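 * For illustration, proc_linkup() below wires up the embedded 1:1
 * objects in exactly this order:
 *
 *	ksegrp_link(kg, p);	-- group onto p_ksegrps, counters cleared
 *	kse_link(ke, kg);	-- KSE onto kg_kseq and the kg_iq idle queue
 *	thread_link(td, kg);	-- thread onto p_threads and kg_threads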
*/ -static void +void kse_link(struct kse *ke, struct ksegrp *kg) { struct proc *p = kg->kg_proc; +KASSERT((ke->ke_state != KES_ONRUNQ), ("linking suspect kse on run queue")); TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses++; +KASSERT((ke->ke_state != KES_IDLE), ("already on idle queue")); + ke->ke_state = KES_IDLE; TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist); + kg->kg_idle_kses++; ke->ke_proc = p; ke->ke_ksegrp = kg; ke->ke_thread = NULL; ke->ke_oncpu = NOCPU; } -static void +void ksegrp_link(struct ksegrp *kg, struct proc *p) { @@ -159,10 +144,13 @@ ksegrp_link(struct ksegrp *kg, struct proc *p) TAILQ_INIT(&kg->kg_iq); /* all kses in ksegrp */ kg->kg_proc = p; /* the following counters are in the -zero- section and may not need clearing */ + kg->kg_numthreads = 0; kg->kg_runnable = 0; kg->kg_kses = 0; + kg->kg_idle_kses = 0; kg->kg_runq_kses = 0; /* XXXKSE change name */ /* link it in now that it's consistent */ + p->p_numksegrps++; TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); } @@ -177,30 +165,13 @@ proc_linkup(struct proc *p, struct ksegrp *kg, TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ TAILQ_INIT(&p->p_threads); /* all threads in proc */ + TAILQ_INIT(&p->p_suspended); /* Threads suspended */ ksegrp_link(kg, p); kse_link(ke, kg); thread_link(td, kg); - /* link them together for 1:1 */ - td->td_kse = ke; - ke->ke_thread = td; } -/* temporary version is ultra simple while we are in 1:1 mode */ -struct thread * -thread_get(struct proc *p) -{ - struct thread *td = &p->p_xxthread; - - return (td); } - - -/********************* -* STUB KSE syscalls -*********************/ - -/* struct thread_wakeup_args { struct thread_mailbox *tmbx; }; */ int thread_wakeup(struct thread *td, struct thread_wakeup_args *uap) { @@ -219,7 +190,11 @@ int kse_yield(struct thread *td, struct kse_yield_args *uap) { - return(ENOSYS); + PROC_LOCK(td->td_proc); + mtx_lock_spin(&sched_lock); + thread_exit(); + /* NOTREACHED */ + return(0); } int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) { @@ -228,16 +203,80 @@ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) return(ENOSYS); } - -int -kse_new(struct thread *td, struct kse_new_args *uap) +/* + * No new KSEG: on the first call we just use the current KSE and + * don't schedule an upcall. In all other situations, allocate a new + * KSE and schedule an upcall on it. + */ /* struct kse_new_args { struct kse_mailbox *mbx; int new_grp_flag; }; */ +int +kse_new(struct thread *td, struct kse_new_args *uap) { + struct kse *newkse; + struct proc *p; + struct kse_mailbox mbx; + int err; - return (ENOSYS); + p = td->td_proc; + if ((err = copyin(uap->mbx, &mbx, sizeof(mbx)))) + return (err); + PROC_LOCK(p); + /* + * If we have no KSE mode set, just set it, and skip KSE and KSEGRP + * creation. You cannot request a new group with the first one as + * you are effectively getting one. Instead, go directly to saving + * the upcall info. + */ + if ((td->td_proc->p_flag & P_KSES) || (uap->new_grp_flag)) { + + return (EINVAL); /* XXX */ + /* + * If newgroup then create the new group. + * Check we have the resources for this. + */ + /* Copy lots of fields from the current KSEGRP. */ + /* Create the new KSE */ + /* Copy lots of fields from the current KSE. */ + } else { + /* + * We are switching to KSEs so just + * use the preallocated ones for this call. + * XXXKSE if we have to initialise any fields for KSE + * mode operation, do it here. + */ + newkse = td->td_kse; + } + /* + * Fill out the KSE-mode specific fields of the new kse. 
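+ * For illustration, the calling side would look roughly like this
+ * (hypothetical userland sketch; only the mailbox field names used
+ * in the copyin above are assumed):
+ *
+ *	struct kse_mailbox mbx;
+ *
+ *	mbx.kmbx_upcall = uts_entry;		-- UTS entry point
+ *	mbx.kmbx_stackbase = uts_stack;
+ *	mbx.kmbx_stacksize = sizeof(uts_stack);
+ *	kse_new(&mbx, 0);	-- first call: the current KSE is reused,
+ *				-- no upcall is scheduled, and P_KSES is
+ *				-- set on the way out.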
+ */ + PROC_UNLOCK(p); + mtx_lock_spin(&sched_lock); + mi_switch(); /* Save current registers to PCB. */ + mtx_unlock_spin(&sched_lock); + newkse->ke_upcall = mbx.kmbx_upcall; + newkse->ke_stackbase = mbx.kmbx_stackbase; + newkse->ke_stacksize = mbx.kmbx_stacksize; + newkse->ke_mailbox = uap->mbx; + cpu_save_upcall(td, newkse); + /* Note that we are the returning syscall */ + td->td_retval[0] = 0; + td->td_retval[1] = 0; + + if ((td->td_proc->p_flag & P_KSES) || (uap->new_grp_flag)) { + thread_schedule_upcall(td, newkse); + } else { + /* + * Don't set this until we are truely ready, because + * things will start acting differently. Return to the + * calling code for the first time. Assuming we set up + * the mailboxes right, all syscalls after this will be + * asynchronous. + */ + td->td_proc->p_flag |= P_KSES; + } + return (0); } /* @@ -554,7 +593,7 @@ fixjobc(p, pgrp, entering) LIST_FOREACH(p, &p->p_children, p_sibling) { if ((hispgrp = p->p_pgrp) != pgrp && hispgrp->pg_session == mysession && - p->p_stat != SZOMB) { + p->p_state != PRS_ZOMBIE) { PGRP_LOCK(hispgrp); if (entering) hispgrp->pg_jobc++; @@ -583,7 +622,7 @@ orphanpg(pg) mtx_lock_spin(&sched_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { - if (p->p_stat == SSTOP) { + if (P_SHOULDSTOP(p)) { mtx_unlock_spin(&sched_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); @@ -674,7 +713,9 @@ fill_kinfo_proc(p, kp) kp->ki_sigcatch = p->p_procsig->ps_sigcatch; } mtx_lock_spin(&sched_lock); - if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { + if (p->p_state != PRS_NEW && + p->p_state != PRS_ZOMBIE && + p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; @@ -697,35 +738,65 @@ fill_kinfo_proc(p, kp) p->p_stats->p_cru.ru_stime.tv_usec; } td = FIRST_THREAD_IN_PROC(p); - if (td->td_wmesg != NULL) - strncpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg) - 1); - if (p->p_stat == SMTX) { - kp->ki_kiflag |= KI_MTXBLOCK; - strncpy(kp->ki_mtxname, td->td_mtxname, - sizeof(kp->ki_mtxname) - 1); + if (!(p->p_flag & P_KSES)) { + if (td->td_wmesg != NULL) { + strncpy(kp->ki_wmesg, td->td_wmesg, + sizeof(kp->ki_wmesg) - 1); + } + if (td->td_state == TDS_MTX) { + kp->ki_kiflag |= KI_MTXBLOCK; + strncpy(kp->ki_mtxname, td->td_mtxname, + sizeof(kp->ki_mtxname) - 1); + } } - kp->ki_stat = p->p_stat; + + if (p->p_state == PRS_NORMAL) { /* XXXKSE very aproximate */ + if ((td->td_state == TDS_RUNQ) || + (td->td_state == TDS_RUNNING)) { + kp->ki_stat = SRUN; + } else if (td->td_state == TDS_SLP) { + kp->ki_stat = SSLEEP; + } else if (P_SHOULDSTOP(p)) { + kp->ki_stat = SSTOP; + } else if (td->td_state == TDS_MTX) { + kp->ki_stat = SMTX; + } else { + kp->ki_stat = SWAIT; + } + } else if (p->p_state == PRS_ZOMBIE) { + kp->ki_stat = SZOMB; + } else { + kp->ki_stat = SIDL; + } + kp->ki_sflag = p->p_sflag; kp->ki_swtime = p->p_swtime; kp->ki_pid = p->p_pid; /* vvv XXXKSE */ - bintime2timeval(&p->p_runtime, &tv); - kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec; - kp->ki_pctcpu = p->p_kse.ke_pctcpu; - kp->ki_estcpu = td->td_ksegrp->kg_estcpu; - kp->ki_slptime = td->td_ksegrp->kg_slptime; - kp->ki_wchan = td->td_wchan; - kp->ki_pri.pri_level = td->td_priority; - kp->ki_pri.pri_user = td->td_ksegrp->kg_user_pri; - kp->ki_pri.pri_class = td->td_ksegrp->kg_pri_class; - kp->ki_pri.pri_native = td->td_base_pri; - kp->ki_nice = td->td_ksegrp->kg_nice; - kp->ki_rqindex = p->p_kse.ke_rqindex; - kp->ki_oncpu = p->p_kse.ke_oncpu; - kp->ki_lastcpu = td->td_lastcpu; - kp->ki_tdflags = 
td->td_flags; - kp->ki_pcb = td->td_pcb; - kp->ki_kstack = (void *)td->td_kstack; + if (!(p->p_flag & P_KSES)) { + bintime2timeval(&p->p_runtime, &tv); + kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec; + kp->ki_pctcpu = p->p_kse.ke_pctcpu; + kp->ki_estcpu = p->p_ksegrp.kg_estcpu; + kp->ki_slptime = p->p_ksegrp.kg_slptime; + kp->ki_wchan = td->td_wchan; + kp->ki_pri.pri_level = td->td_priority; + kp->ki_pri.pri_user = p->p_ksegrp.kg_user_pri; + kp->ki_pri.pri_class = p->p_ksegrp.kg_pri_class; + kp->ki_pri.pri_native = td->td_base_pri; + kp->ki_nice = p->p_ksegrp.kg_nice; + kp->ki_rqindex = p->p_kse.ke_rqindex; + kp->ki_oncpu = p->p_kse.ke_oncpu; + kp->ki_lastcpu = td->td_lastcpu; + kp->ki_tdflags = td->td_flags; + kp->ki_pcb = td->td_pcb; + kp->ki_kstack = (void *)td->td_kstack; + } else { + kp->ki_oncpu = -1; + kp->ki_lastcpu = -1; + kp->ki_tdflags = -1; + /* All the reast are 0 */ + } /* ^^^ XXXKSE */ mtx_unlock_spin(&sched_lock); sp = NULL; @@ -878,7 +949,7 @@ sysctl_kern_proc(SYSCTL_HANDLER_ARGS) /* * Skip embryonic processes. */ - if (p->p_stat == SIDL) { + if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index d2cb69d..0803cff 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -281,7 +281,6 @@ boot(int howto) DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { mtx_lock_spin(&sched_lock); - setrunqueue(curthread); curthread->td_proc->p_stats->p_ru.ru_nvcsw++; mi_switch(); /* Allow interrupt threads to run */ mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index a561a19..e8ded21 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -84,7 +84,7 @@ static int killpg1(struct thread *td, int sig, int pgid, int all); static int sig_ffs(sigset_t *set); static int sigprop(int sig); static void stop(struct proc *); - +static void tdsignal(struct thread *td, int sig, sig_t action); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); @@ -168,16 +168,18 @@ static int sigproptbl[NSIG] = { * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). + * XXXKSE the check for a pending stop is not done under KSE * * MP SAFE. */ int -cursig(struct proc *p) +cursig(struct thread *td) { + struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_NOTOWNED); - return (SIGPENDING(p) ? issignal(p) : 0); + return (SIGPENDING(p) ? issignal(td) : 0); } /* @@ -1042,7 +1044,7 @@ killpg1(td, sig, pgid, all) PROC_UNLOCK(p); continue; } - if (p->p_stat == SZOMB) { + if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } @@ -1243,12 +1245,10 @@ psignal(p, sig) register struct proc *p; register int sig; { - register int prop; register sig_t action; struct thread *td; -#ifdef SMP - struct ksegrp *kg; -#endif + register int prop; + KASSERT(_SIG_VALID(sig), ("psignal(): invalid signal %d\n", sig)); @@ -1257,7 +1257,6 @@ psignal(p, sig) KNOTE(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); - /* * If proc is traced, always give parent a chance; * if signal event is tracked by procfs, give *that* @@ -1283,29 +1282,6 @@ psignal(p, sig) action = SIG_DFL; } - /* - * bring the priority of a process up if we want it to get - * killed in this lifetime. - * XXXKSE think if a better way to do this. 
- * - * What we need to do is see if there is a thread that will - * be able to accept the signal. e.g. - * FOREACH_THREAD_IN_PROC() { - * if runnable, we're done - * else pick one at random. - * } - */ - /* XXXKSE - * For now there is one thread per proc. - * Effectively select one sucker thread.. - */ - td = FIRST_THREAD_IN_PROC(p); - mtx_lock_spin(&sched_lock); - if ((p->p_ksegrp.kg_nice > NZERO) && (action == SIG_DFL) && - (prop & SA_KILL) && ((p->p_flag & P_TRACED) == 0)) - p->p_ksegrp.kg_nice = NZERO; /* XXXKSE */ - mtx_unlock_spin(&sched_lock); - if (prop & SA_CONT) SIG_STOPSIGMASK(p->p_siglist); @@ -1316,48 +1292,125 @@ psignal(p, sig) * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ - if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && - action == SIG_DFL) + if ((prop & SA_TTYSTOP) && + (p->p_pgrp->pg_jobc == 0) && + (action == SIG_DFL)) return; SIG_CONTSIGMASK(p->p_siglist); } SIGADDSET(p->p_siglist, sig); mtx_lock_spin(&sched_lock); signotify(p); + mtx_unlock_spin(&sched_lock); /* - * Defer further processing for signals which are held, - * except that stopped processes must be continued by SIGCONT. + * Some signals have a process-wide effect and a per-thread + * component. Most processing occurs when the process next + * tries to cross the user boundary, however there are some + * times when processing needs to be done immediatly, such as + * waking up threads so that they can cross the user boundary. + * We try do the per-process part here. */ - if (action == SIG_HOLD && (!(prop & SA_CONT) || p->p_stat != SSTOP)) { - mtx_unlock_spin(&sched_lock); - return; - } - - switch (p->p_stat) { - - case SSLEEP: + if (P_SHOULDSTOP(p)) { /* - * If process is sleeping uninterruptibly - * we can't interrupt the sleep... the signal will - * be noticed when the process returns through - * trap() or syscall(). + * The process is in stopped mode. All the threads should be + * either winding down or already on the suspended queue. */ - if ((td->td_flags & TDF_SINTR) == 0) + if (p->p_flag & P_TRACED) { + /* + * The traced process is already stopped, + * so no further action is necessary. + * No signal can restart us. + */ goto out; + } + + if (sig == SIGKILL) { + /* + * SIGKILL sets process running. + * It will die elsewhere. + * All threads must be restarted. + */ + p->p_flag &= ~P_STOPPED; + goto runfast; + } + + if (prop & SA_CONT) { + /* + * If SIGCONT is default (or ignored), we continue the + * process but don't leave the signal in p_siglist as + * it has no further action. If SIGCONT is held, we + * continue the process and leave the signal in + * p_siglist. If the process catches SIGCONT, let it + * handle the signal itself. If it isn't waiting on + * an event, it goes back to run state. + * Otherwise, process goes back to sleep state. + */ + p->p_flag &= ~P_STOPPED_SGNL; + if (action == SIG_DFL) { + SIGDELSET(p->p_siglist, sig); + } else if (action == SIG_CATCH) { + /* + * The process wants to catch it so it needs + * to run at least one thread, but which one? + * It would seem that the answer would be to + * run an upcall in the next KSE to run, and + * deliver the signal that way. In a NON KSE + * process, we need to make sure that the + * single thread is runnable asap. + * XXXKSE for now however, make them all run. + */ + goto runfast; + } + /* + * The signal is not ignored or caught. + */ + mtx_lock_spin(&sched_lock); + thread_unsuspend(p); /* Checks if should do it. 
*/ + mtx_unlock_spin(&sched_lock); + goto out; + } + + if (prop & SA_STOP) { + /* + * Already stopped, don't need to stop again + * (If we did the shell could get confused). + */ + SIGDELSET(p->p_siglist, sig); + goto out; + } + /* - * Process is sleeping and traced... make it runnable - * so it can discover the signal in issignal() and stop - * for the parent. + * All other kinds of signals: + * If a thread is sleeping interruptibly, simulate a + * wakeup so that when it is continued it will be made + * runnable and can look at the signal. However, don't make + * the process runnable, leave it stopped. + * It may run a bit until it hits a thread_suspend_check(). + * + * XXXKSE I don't understand this at all. */ - if (p->p_flag & P_TRACED) - goto run; + mtx_lock_spin(&sched_lock); + FOREACH_THREAD_IN_PROC(p, td) { + if (td->td_wchan && (td->td_flags & TDF_SINTR)) { + if (td->td_flags & TDF_CVWAITQ) + cv_waitq_remove(td); + else + unsleep(td); + setrunnable(td); + } + } + mtx_unlock_spin(&sched_lock); + goto out; /* - * If SIGCONT is default (or ignored) and process is - * asleep, we are finished; the process should not - * be awakened. + * XXXKSE What about threads that are waiting on mutexes? + * Shouldn't they abort too? */ - if ((prop & SA_CONT) && action == SIG_DFL) { + } else if (p->p_state == PRS_NORMAL) { + if (prop & SA_CONT) { + /* + * Already active, don't need to start again. + */ SIGDELSET(p->p_siglist, sig); goto out; } @@ -1370,133 +1423,128 @@ psignal(p, sig) if (prop & SA_STOP) { if (action != SIG_DFL) goto runfast; + /* * If a child holding parent blocked, * stopping could cause deadlock. */ if (p->p_flag & P_PPWAIT) goto out; - mtx_unlock_spin(&sched_lock); SIGDELSET(p->p_siglist, sig); p->p_xstat = sig; PROC_LOCK(p->p_pptr); - if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) + if (!(p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP)) psignal(p->p_pptr, SIGCHLD); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); stop(p); + mtx_unlock_spin(&sched_lock); goto out; } else goto runfast; /* NOTREACHED */ + } else { + /* Not in "NORMAL" state. discard the signal. */ + SIGDELSET(p->p_siglist, sig); + goto out; + } - case SSTOP: - /* - * If traced process is already stopped, - * then no further action is necessary. - */ - if (p->p_flag & P_TRACED) - goto out; + /* + * The process is not stopped so we need to apply the signal to all the + * running threads. + */ - /* - * Kill signal always sets processes running. - */ - if (sig == SIGKILL) - goto runfast; +runfast: + FOREACH_THREAD_IN_PROC(p, td) + tdsignal(td, sig, action); + mtx_lock_spin(&sched_lock); + thread_unsuspend(p); + mtx_unlock_spin(&sched_lock); +out: + /* If we jump here, sched_lock should not be owned. */ + mtx_assert(&sched_lock, MA_NOTOWNED); +} - if (prop & SA_CONT) { - /* - * If SIGCONT is default (or ignored), we continue the - * process but don't leave the signal in p_siglist, as - * it has no further action. If SIGCONT is held, we - * continue the process and leave the signal in - * p_siglist. If the process catches SIGCONT, let it - * handle the signal itself. If it isn't waiting on - * an event, then it goes back to run state. - * Otherwise, process goes back to sleep state. - */ - if (action == SIG_DFL) - SIGDELSET(p->p_siglist, sig); - if (action == SIG_CATCH) - goto runfast; - /* - * XXXKSE - * do this for each thread. 
- */ - if (p->p_flag & P_KSES) { - mtx_assert(&sched_lock, - MA_OWNED | MA_NOTRECURSED); - FOREACH_THREAD_IN_PROC(p, td) { - if (td->td_wchan == NULL) { - setrunnable(td); /* XXXKSE */ - } else { - /* mark it as sleeping */ - } - } - } else { - p->p_flag |= P_CONTINUED; - wakeup(p->p_pptr); - if (td->td_wchan == NULL) - goto run; - p->p_stat = SSLEEP; - } - goto out; +/* + * The force of a signal has been directed against a single + * thread. We need to see what we can do about knocking it + * out of any sleep it may be in etc. + */ +static void +tdsignal(struct thread *td, int sig, sig_t action) +{ + struct proc *p = td->td_proc; + register int prop; + + prop = sigprop(sig); + + /* + * Bring the priority of a process up if we want it to get + * killed in this lifetime. + * XXXKSE we should shift the priority to the thread. + */ + mtx_lock_spin(&sched_lock); + if ((action == SIG_DFL) && (prop & SA_KILL)) { + if (td->td_priority > PUSER) { + td->td_priority = PUSER; } + } + mtx_unlock_spin(&sched_lock); - if (prop & SA_STOP) { - /* - * Already stopped, don't need to stop again. - * (If we did the shell could get confused.) - */ - SIGDELSET(p->p_siglist, sig); + /* + * Defer further processing for signals which are held, + * except that stopped processes must be continued by SIGCONT. + */ + if (action == SIG_HOLD) { + goto out; + } + mtx_lock_spin(&sched_lock); + if (td->td_state == TDS_SLP) { + /* + * If thread is sleeping uninterruptibly + * we can't interrupt the sleep... the signal will + * be noticed when the process returns through + * trap() or syscall(). + */ + if ((td->td_flags & TDF_SINTR) == 0) { + mtx_unlock_spin(&sched_lock); goto out; } - /* - * If process is sleeping interruptibly, then simulate a - * wakeup so that when it is continued, it will be made - * runnable and can look at the signal. But don't make - * the process runnable, leave it stopped. - * XXXKSE should we wake ALL blocked threads? + * Process is sleeping and traced. Make it runnable + * so it can discover the signal in issignal() and stop + * for its parent. */ - if (p->p_flag & P_KSES) { - FOREACH_THREAD_IN_PROC(p, td) { - if (td->td_wchan && (td->td_flags & TDF_SINTR)){ - if (td->td_flags & TDF_CVWAITQ) - cv_waitq_remove(td); - else - unsleep(td); /* XXXKSE */ - } - } - } else { - if (td->td_wchan && td->td_flags & TDF_SINTR) { - if (td->td_flags & TDF_CVWAITQ) - cv_waitq_remove(td); - else - unsleep(td); /* XXXKSE */ - } + if (p->p_flag & P_TRACED) { + p->p_flag &= ~P_STOPPED_TRACE; + goto run; } - goto out; + mtx_unlock_spin(&sched_lock); + /* + * If SIGCONT is default (or ignored) and process is + * asleep, we are finished; the process should not + * be awakened. + */ + if ((prop & SA_CONT) && action == SIG_DFL) { + SIGDELSET(p->p_siglist, sig); + goto out; + } + goto runfast; + /* NOTREACHED */ - default: + } else { /* - * SRUN, SIDL, SZOMB do nothing with the signal, + * Other states do nothing with the signal immediatly, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ - if (p->p_stat == SRUN) { + mtx_unlock_spin(&sched_lock); + if (td->td_state == TDS_RUNQ || + td->td_state == TDS_RUNNING) { + signotify(td->td_proc); #ifdef SMP - struct kse *ke; - struct thread *td = curthread; -/* we should only deliver to one thread.. but which one? 
*/ - FOREACH_KSEGRP_IN_PROC(p, kg) { - FOREACH_KSE_IN_GROUP(kg, ke) { - if (ke->ke_thread == td) { - continue; - } - forward_signal(ke->ke_thread); - } - } + if (td->td_state == TDS_RUNNING && td != curthread) + forward_signal(td); #endif } goto out; @@ -1506,21 +1554,17 @@ psignal(p, sig) runfast: /* * Raise priority to at least PUSER. - * XXXKSE Should we make them all run fast? - * Maybe just one would be enough? */ - - if (FIRST_THREAD_IN_PROC(p)->td_priority > PUSER) { - FIRST_THREAD_IN_PROC(p)->td_priority = PUSER; + mtx_lock_spin(&sched_lock); + if (td->td_priority > PUSER) { + td->td_priority = PUSER; } run: - /* If we jump here, sched_lock has to be owned. */ mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); - setrunnable(td); /* XXXKSE */ -out: + setrunnable(td); mtx_unlock_spin(&sched_lock); - /* Once we get here, sched_lock should not be owned. */ +out: mtx_assert(&sched_lock, MA_NOTOWNED); } @@ -1533,16 +1577,18 @@ out: * by checking the pending signal masks in cursig.) The normal call * sequence is * - * while (sig = cursig(curproc)) + * while (sig = cursig(curthread)) * postsig(sig); */ int -issignal(p) - register struct proc *p; +issignal(td) + struct thread *td; { + struct proc *p; sigset_t mask; register int sig, prop; + p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); @@ -1576,6 +1622,7 @@ issignal(p) PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); stop(p); + td->td_state = TDS_UNQUEUED; PROC_UNLOCK(p); DROP_GIANT(); p->p_stats->p_ru.ru_nivcsw++; @@ -1633,6 +1680,7 @@ issignal(p) #endif break; /* == ignore */ } +#if 0 /* * If there is a pending stop signal to process * with default action, stop here, @@ -1647,8 +1695,10 @@ issignal(p) break; /* == ignore */ p->p_xstat = sig; PROC_LOCK(p->p_pptr); - if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) + if ((p->p_pptr->p_procsig->ps_flag & + PS_NOCLDSTOP) == 0) { psignal(p->p_pptr, SIGCHLD); + } PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); stop(p); @@ -1660,7 +1710,9 @@ issignal(p) PICKUP_GIANT(); PROC_LOCK(p); break; - } else if (prop & SA_IGNORE) { + } else +#endif + if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. @@ -1706,7 +1758,7 @@ stop(p) PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); - p->p_stat = SSTOP; + p->p_flag |= P_STOPPED_SGNL; p->p_flag &= ~P_WAITED; wakeup(p->p_pptr); } diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index 5e32eee..c63091c 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -538,7 +538,6 @@ uio_yield() mtx_lock_spin(&sched_lock); DROP_GIANT(); td->td_priority = td->td_ksegrp->kg_user_pri; /* XXXKSE */ - setrunqueue(td); td->td_proc->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c index 2b531c0..40d3ef8 100644 --- a/sys/kern/kern_switch.c +++ b/sys/kern/kern_switch.c @@ -26,6 +26,69 @@ * $FreeBSD$ */ +/*** + +Here is the logic.. + +If there are N processors, then there are at most N KSEs (kernel +schedulable entities) working to process threads that belong to a +KSEGROUP (kg). If there are X of these KSEs actually running at the +moment in question, then there are at most M (= N - X) of these KSEs on +the run queue, as running KSEs are not on the queue. + +Runnable threads are queued off the KSEGROUP in priority order. 
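+For example, on a 2-CPU system (N = 2) with one KSE of the group +actually running (X = 1), at most M = 1 KSE from that KSEGROUP can be +waiting on the run queue at any instant.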
+If there are M or more threads runnable, the top M threads +(by priority) are 'preassigned' to the M KSEs not running. The KSEs take +their priority from those threads and are put on the run queue. + +The last thread that had a priority high enough to have a KSE associated +with it, AND IS ON THE RUN QUEUE, is pointed to by +kg->kg_last_assigned. If no thread queued off the KSEGROUP has a KSE +assigned, either because all the available KSEs are actively running or because there +are no threads queued, that pointer is NULL. + +When a KSE is removed from the run queue to be run, we know +it was associated with the highest priority thread in the queue (at the head +of the queue). If it is also the last assigned, we know M was 1 and must +now be 0. Since the thread is no longer queued, that pointer must be +removed from it. Since we know there were no more KSEs available, +(M was 1 and is now 0) and since we are not FREEING our KSE +but using it, we know there are STILL no more KSEs available, we can prove +that the next thread in the ksegrp list will not have a KSE to assign to +it, so we can show that the pointer must be made 'invalid' (NULL). + +The pointer exists so that when a new thread is made runnable, it can +have its priority compared with the last assigned thread to see if +it should 'steal' its KSE or not.. i.e. is it 'earlier' +on the list than that thread or later.. If it's earlier, then the KSE is +removed from the last assigned (which is now not assigned a KSE) +and reassigned to the new thread, which is placed earlier in the list. +The pointer is then backed up to the previous thread (which may or may not +be the new thread). + +When a thread sleeps or is removed, the KSE becomes available and if there +are queued threads that are not assigned KSEs, the highest priority one of +them is assigned the KSE, which is then placed back on the run queue at +the appropriate place, and the kg->kg_last_assigned pointer is adjusted down +to point to it. + +The following diagram shows 2 KSEs and 3 threads from a single process.
+
+ RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
+              \    \____
+               \        \
+ KSEGROUP---thread--thread--thread    (queued in priority order)
+        \                 /
+         \_______________/
+          (last_assigned)
+
+The result of this scheme is that the M available KSEs are always +queued at the priorities they have inherited from the M highest priority +threads for that KSEGROUP. If this situation changes, the KSEs are +reassigned to keep this true. + +*/ + #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> @@ -44,34 +107,442 @@ CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); static struct runq runq; SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) +static void runq_readjust(struct runq *rq, struct kse *ke); +/************************************************************************ + * Functions that manipulate runnability from a thread perspective. * + ************************************************************************/ + /* - * Wrappers which implement old interface; act on global run queue. + * Select the KSE that will be run next. From that find the thread, and + * remove it from the KSEGRP's run queue. If there is thread clustering, + * this will be what does it. 
*/ - struct thread * choosethread(void) { - return (runq_choose(&runq)->ke_thread); + struct kse *ke; + struct thread *td; + struct ksegrp *kg; + + if ((ke = runq_choose(&runq))) { + td = ke->ke_thread; + KASSERT((td->td_kse == ke), ("kse/thread mismatch")); + kg = ke->ke_ksegrp; + if (td->td_flags & TDF_UNBOUND) { + TAILQ_REMOVE(&kg->kg_runq, td, td_runq); + if (kg->kg_last_assigned == td) + if (TAILQ_PREV(td, threadqueue, td_runq) + != NULL) + printf("Yo MAMA!\n"); + kg->kg_last_assigned = TAILQ_PREV(td, + threadqueue, td_runq); + /* + * If we have started running an upcall, + * Then TDF_UNBOUND WAS set because the thread was + * created without a KSE. Now that we have one, + * and it is our time to run, we make sure + * that BOUND semantics apply for the rest of + * the journey to userland, and into the UTS. + */ +#ifdef NOTYET + if (td->td_flags & TDF_UPCALLING) + tdf->td_flags &= ~TDF_UNBOUND; +#endif + } + kg->kg_runnable--; + CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d", + td, td->td_priority); + } else { + /* Pretend the idle thread was on the run queue. */ + td = PCPU_GET(idlethread); + /* Simulate that it was on the run queue */ + td->td_state = TDS_RUNQ; + td->td_kse->ke_state = KES_UNQUEUED; + CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td); + } + thread_sanity_check(td); + return (td); +} + +/* + * Given a KSE (now surplus), either assign a new runable thread to it + * (and put it in the run queue) or put it in the ksegrp's idle KSE list. + * Assumes the kse is not linked to any threads any more. (has been cleaned). + */ +void +kse_reassign(struct kse *ke) +{ + struct ksegrp *kg; + struct thread *td; + + kg = ke->ke_ksegrp; + +KASSERT((ke->ke_state != KES_ONRUNQ), ("kse_reassigning non-free kse")); + /* + * Find the first unassigned thread + * If there is a 'last assigned' then see what's next. + * otherwise look at what is first. + */ + if ((td = kg->kg_last_assigned)) { + td = TAILQ_NEXT(td, td_runq); + } else { + td = TAILQ_FIRST(&kg->kg_runq); + } + + /* + * If we found one assign it the kse, otherwise idle the kse. + */ + if (td) { + thread_sanity_check(td); + kg->kg_last_assigned = td; + td->td_kse = ke; + ke->ke_thread = td; + runq_add(&runq, ke); + CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td); + } else { + KASSERT((ke->ke_state != KES_IDLE), ("kse already idle")); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + ke->ke_state = KES_IDLE; + ke->ke_thread = NULL; + TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist); + kg->kg_idle_kses++; + CTR1(KTR_RUNQ, "kse_reassign: ke%p idled", ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self2!")); + } } int -procrunnable(void) +kserunnable(void) { return runq_check(&runq); } +/* + * Remove a thread from its KSEGRP's run queue. + * This in turn may remove it from a KSE if it was already assigned + * to one, possibly causing a new thread to be assigned to the KSE + * and the KSE getting a new priority (unless it's a BOUND thread/KSE pair). + */ void remrunqueue(struct thread *td) { - runq_remove(&runq, td->td_kse); + struct thread *td2, *td3; + struct ksegrp *kg; + struct kse *ke; + + mtx_assert(&sched_lock, MA_OWNED); + thread_sanity_check(td); + KASSERT ((td->td_state == TDS_RUNQ), + ("remrunqueue: Bad state on run queue")); + kg = td->td_ksegrp; + ke = td->td_kse; + /* + * If it's a bound thread/KSE pair, take the shortcut. All non-KSE + * threads are BOUND. 
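+ * (Bound means td->td_kse is permanently attached, so the KSE simply + * leaves the system run queue together with its thread and none of + * the last_assigned bookkeeping below is needed.)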
+ */ + CTR1(KTR_RUNQ, "remrunqueue: td%p", td); + td->td_state = TDS_UNQUEUED; + kg->kg_runnable--; + if ((td->td_flags & TDF_UNBOUND) == 0) { + /* Bring its kse with it, leave the thread attached */ + runq_remove(&runq, ke); + ke->ke_state = KES_UNQUEUED; + return; + } + if (ke) { + /* + * This thread has been assigned to a KSE. + * We need to dissociate it and try assign the + * KSE to the next available thread. Then, we should + * see if we need to move the KSE in the run queues. + */ + td2 = kg->kg_last_assigned; + KASSERT((td2 != NULL), ("last assigned has wrong value ")); + td->td_kse = NULL; + if ((td3 = TAILQ_NEXT(td2, td_runq))) { + KASSERT(td3 != td, ("td3 somehow matched td")); + /* + * Give the next unassigned thread to the KSE + * so the number of runnable KSEs remains + * constant. + */ + td3->td_kse = ke; + ke->ke_thread = td3; + kg->kg_last_assigned = td3; + runq_readjust(&runq, ke); + } else { + /* + * There is no unassigned thread. + * If we were the last assigned one, + * adjust the last assigned pointer back + * one, which may result in NULL. + */ + if (td == td2) { + kg->kg_last_assigned = + TAILQ_PREV(td, threadqueue, td_runq); + } + runq_remove(&runq, ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + KASSERT((ke->ke_state != KES_IDLE), + ("kse already idle")); + ke->ke_state = KES_IDLE; + ke->ke_thread = NULL; +KASSERT((TAILQ_FIRST(&kg->kg_iq) != ke), ("really bad screwup")); + TAILQ_INSERT_HEAD(&kg->kg_iq, ke, ke_kgrlist); + kg->kg_idle_kses++; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self2!")); + } + } + TAILQ_REMOVE(&kg->kg_runq, td, td_runq); + thread_sanity_check(td); } +#if 1 /* use the first version */ + void setrunqueue(struct thread *td) { - runq_add(&runq, td->td_kse); + struct kse *ke; + struct ksegrp *kg; + struct thread *td2; + struct thread *tda; + + CTR1(KTR_RUNQ, "setrunqueue: td%p", td); + mtx_assert(&sched_lock, MA_OWNED); + thread_sanity_check(td); + KASSERT((td->td_state != TDS_RUNQ), ("setrunqueue: bad thread state")); + td->td_state = TDS_RUNQ; + kg = td->td_ksegrp; + kg->kg_runnable++; + if ((td->td_flags & TDF_UNBOUND) == 0) { + KASSERT((td->td_kse != NULL), + ("queueing BAD thread to run queue")); + /* + * Common path optimisation: Only one of everything + * and the KSE is always already attached. + * Totally ignore the ksegrp run queue. + */ + runq_add(&runq, td->td_kse); + return; + } + /* + * Ok, so we are threading with this thread. + * We don't have a KSE, see if we can get one.. + */ + tda = kg->kg_last_assigned; + if ((ke = td->td_kse) == NULL) { + /* + * We will need a KSE, see if there is one.. + * First look for a free one, before getting desperate. + * If we can't get one, our priority is not high enough.. + * that's ok.. + */ + if (kg->kg_idle_kses) { + /* + * There is a free one so it's ours for the asking.. + */ + ke = TAILQ_FIRST(&kg->kg_iq); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self3!")); + TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); + ke->ke_state = KES_UNQUEUED; + kg->kg_idle_kses--; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self4!")); + } else if (tda && (tda->td_priority > td->td_priority)) { + /* + * None free, but there is one we can commandeer. 
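+ * (For example: if the last assigned thread sits at priority 140 + * and we are at priority 120, we take its KSE, it drops back to + * waiting for one, and kg_last_assigned backs up one entry.)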
+ */ + ke = tda->td_kse; + tda->td_kse = NULL; + ke->ke_thread = NULL; + tda = kg->kg_last_assigned = + TAILQ_PREV(tda, threadqueue, td_runq); + runq_remove(&runq, ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self5!")); + } + } else { + KASSERT(ke->ke_thread == td, ("KSE/thread mismatch")); + KASSERT(ke->ke_state != KES_IDLE, ("KSE unexpectedly idle")); + ke->ke_thread = NULL; + td->td_kse = NULL; + } + + /* + * Add the thread to the ksegrp's run queue at + * the appropriate place. + */ + TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { + if (td2->td_priority > td->td_priority) { + TAILQ_INSERT_BEFORE(td2, td, td_runq); + break; + } + } + if (td2 == NULL) { + /* We ran off the end of the TAILQ or it was empty. */ + TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq); + } + + /* + * If we have a ke to use, then put it on the run queue and + * If needed, readjust the last_assigned pointer. + */ + if (ke) { + if (tda == NULL) { + /* + * No pre-existing last assigned so whoever is first + * gets the KSE we borught in.. (may be us) + */ + td2 = TAILQ_FIRST(&kg->kg_runq); + KASSERT((td2->td_kse == NULL), + ("unexpected ke present")); + td2->td_kse = ke; + ke->ke_thread = td2; + kg->kg_last_assigned = td2; + } else if (tda->td_priority > td->td_priority) { + /* + * It's ours, grab it, but last_assigned is past us + * so don't change it. + */ + td->td_kse = ke; + ke->ke_thread = td; + } else { + /* + * We are past last_assigned, so + * put the new kse on whatever is next, + * which may or may not be us. + */ + td2 = TAILQ_NEXT(tda, td_runq); + kg->kg_last_assigned = td2; + td2->td_kse = ke; + ke->ke_thread = td2; + } + runq_add(&runq, ke); + } + thread_sanity_check(td); } +#else + +void +setrunqueue(struct thread *td) +{ + struct kse *ke; + struct ksegrp *kg; + struct thread *td2; + + CTR1(KTR_RUNQ, "setrunqueue: td%p", td); + KASSERT((td->td_state != TDS_RUNQ), ("setrunqueue: bad thread state")); + td->td_state = TDS_RUNQ; + kg = td->td_ksegrp; + kg->kg_runnable++; + if ((td->td_flags & TDF_UNBOUND) == 0) { + /* + * Common path optimisation: Only one of everything + * and the KSE is always already attached. + * Totally ignore the ksegrp run queue. + */ + runq_add(&runq, td->td_kse); + return; + } + /* + * First add the thread to the ksegrp's run queue at + * the appropriate place. + */ + TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { + if (td2->td_priority > td->td_priority) { + TAILQ_INSERT_BEFORE(td2, td, td_runq); + break; + } + } + if (td2 == NULL) { + /* We ran off the end of the TAILQ or it was empty. */ + TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq); + } + + /* + * The following could be achieved by simply doing: + * td->td_kse = NULL; kse_reassign(ke); + * but I felt that I'd try do it inline here. + * All this work may not be worth it. + */ + if ((ke = td->td_kse)) { /* XXXKSE */ + /* + * We have a KSE already. See whether we can keep it + * or if we need to give it to someone else. + * Either way it will need to be inserted into + * the runq. kse_reassign() will do this as will runq_add(). + */ + if ((kg->kg_last_assigned) && + (kg->kg_last_assigned->td_priority > td->td_priority)) { + /* + * We can definitly keep the KSE + * as the "last assignead thread" has + * less priority than we do. + * The "last assigned" pointer stays the same. + */ + runq_add(&runq, ke); + return; + + } + /* + * Give it to the correct thread, + * which may be (often is) us, but may not be. + */ + td->td_kse = NULL; + kse_reassign(ke); + return; + } + /* + * There are two cases where KSE adjustment is needed. 
+ * Usurpation of an already assigned KSE, and assignment + * of a previously IDLE KSE. + */ + if (kg->kg_idle_kses) { + /* + * If there are unassigned KSEs then we definitly + * will be assigned one from the idle KSE list. + * If we are the last, we should get the "last + * assigned" pointer set to us as well. + */ + ke = TAILQ_FIRST(&kg->kg_iq); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); + ke->ke_state = KES_UNQUEUED; + kg->kg_idle_kses--; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + ke->ke_thread = td; + td->td_kse = ke; + runq_add(&runq, ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + if (TAILQ_NEXT(td, td_runq) == NULL) { + kg->kg_last_assigned = td; + } + } else if (kg->kg_last_assigned && + (kg->kg_last_assigned->td_priority > td->td_priority)) { + /* + * If there were none last-assigned, all KSEs + * are actually out running as we speak. + * If there was a last assigned, but we didn't see it, + * we must be inserting before it, so take the KSE from + * the last assigned, and back it up one entry. Then, + * assign the KSE to the new thread and adjust its priority. + */ + td2 = kg->kg_last_assigned; + ke = td2->td_kse; +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + kg->kg_last_assigned = + TAILQ_PREV(td2, threadqueue, td_runq); + td2->td_kse = NULL; + td->td_kse = ke; + ke->ke_thread = td; + runq_readjust(&runq, ke); +KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!")); + } +} +#endif + +/************************************************************************ + * Critical section marker functions * + ************************************************************************/ /* Critical sections that prevent preemption. */ void critical_enter(void) @@ -98,6 +569,23 @@ critical_exit(void) } } + +/************************************************************************ + * SYSTEM RUN QUEUE manipulations and tests * + ************************************************************************/ +/* + * Initialize a run structure. + */ +void +runq_init(struct runq *rq) +{ + int i; + + bzero(rq, sizeof *rq); + for (i = 0; i < RQ_NQS; i++) + TAILQ_INIT(&rq->rq_queues[i]); +} + /* * Clear the status bit of the queue corresponding to priority level pri, * indicating that it is empty. @@ -156,7 +644,7 @@ runq_setbit(struct runq *rq, int pri) } /* - * Add the process to the queue specified by its priority, and set the + * Add the KSE to the queue specified by its priority, and set the * corresponding status bit. 
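 * (Roughly, with the constants from <sys/runq.h>, where RQ_PPQ
 * priority levels share each of the RQ_NQS queues:
 *
 *	pri = ke->ke_thread->td_priority / RQ_PPQ;
 *	rqb_bits[pri / RQB_BPW] |= (rqb_word_t)1 << (pri % RQB_BPW);
 *
 * the second line being roughly what runq_setbit() does.)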
*/ void @@ -165,14 +653,16 @@ runq_add(struct runq *rq, struct kse *ke) struct rqhead *rqh; int pri; -#ifdef INVARIANTS - struct proc *p = ke->ke_proc; -#endif - if (ke->ke_flags & KEF_ONRUNQ) - return; mtx_assert(&sched_lock, MA_OWNED); - KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN", - p, p->p_comm)); + KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); + KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread")); + if (ke->ke_state == KES_ONRUNQ) + return; +#if defined(INVARIANTS) && defined(DIAGNOSTIC) + KASSERT(ke->ke_state != KES_ONRUNQ, + ("runq_add: kse %p (%s) already in run queue", ke, + ke->ke_proc->p_comm)); +#endif pri = ke->ke_thread->td_priority / RQ_PPQ; ke->ke_rqindex = pri; runq_setbit(rq, pri); @@ -180,7 +670,8 @@ runq_add(struct runq *rq, struct kse *ke) CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p", ke->ke_proc, ke->ke_thread->td_priority, pri, rqh); TAILQ_INSERT_TAIL(rqh, ke, ke_procq); - ke->ke_flags |= KEF_ONRUNQ; + ke->ke_ksegrp->kg_runq_kses++; + ke->ke_state = KES_ONRUNQ; } /* @@ -219,43 +710,38 @@ runq_choose(struct runq *rq) int pri; mtx_assert(&sched_lock, MA_OWNED); - if ((pri = runq_findbit(rq)) != -1) { + while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; ke = TAILQ_FIRST(rqh); KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); - KASSERT(ke->ke_proc->p_stat == SRUN, - ("runq_choose: process %d(%s) in state %d", ke->ke_proc->p_pid, - ke->ke_proc->p_comm, ke->ke_proc->p_stat)); - CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); + CTR3(KTR_RUNQ, + "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); +KASSERT(ke->ke_procq.tqe_prev != NULL, ("no prev")); +if (ke->ke_procq.tqe_next) + KASSERT(ke->ke_procq.tqe_next->ke_procq.tqe_prev != NULL, ("no next")); TAILQ_REMOVE(rqh, ke, ke_procq); + ke->ke_ksegrp->kg_runq_kses--; if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_choose: empty"); runq_clrbit(rq, pri); } - ke->ke_flags &= ~KEF_ONRUNQ; + + ke->ke_state = KES_RUNNING; + KASSERT((ke->ke_thread != NULL), + ("runq_choose: No thread on KSE")); + KASSERT((ke->ke_thread->td_kse != NULL), + ("runq_choose: No KSE on thread")); return (ke); } CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); - return (PCPU_GET(idlethread)->td_kse); + return (NULL); } /* - * Initialize a run structure. - */ -void -runq_init(struct runq *rq) -{ - int i; - - bzero(rq, sizeof *rq); - for (i = 0; i < RQ_NQS; i++) - TAILQ_INIT(&rq->rq_queues[i]); -} - -/* - * Remove the process from the queue specified by its priority, and clear the + * Remove the KSE from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. + * Caller must set ke->ke_state afterwards. 
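+ * (runq_remove() itself leaves the KSE in KES_UNQUEUED; the caller + * then moves it on to KES_IDLE, KES_RUNNING or wherever it belongs.)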
*/ void runq_remove(struct runq *rq, struct kse *ke) @@ -263,8 +749,7 @@ runq_remove(struct runq *rq, struct kse *ke) struct rqhead *rqh; int pri; - if (!(ke->ke_flags & KEF_ONRUNQ)) - return; + KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); mtx_assert(&sched_lock, MA_OWNED); pri = ke->ke_rqindex; rqh = &rq->rq_queues[pri]; @@ -276,5 +761,104 @@ runq_remove(struct runq *rq, struct kse *ke) CTR0(KTR_RUNQ, "runq_remove: empty"); runq_clrbit(rq, pri); } - ke->ke_flags &= ~KEF_ONRUNQ; + ke->ke_state = KES_UNQUEUED; + ke->ke_ksegrp->kg_runq_kses--; +} + +static void +runq_readjust(struct runq *rq, struct kse *ke) +{ + + if (ke->ke_rqindex != (ke->ke_thread->td_priority / RQ_PPQ)) { + runq_remove(rq, ke); + runq_add(rq, ke); + } +} + +void +thread_sanity_check(struct thread *td) +{ + struct proc *p; + struct ksegrp *kg; + struct kse *ke; + struct thread *td2; + unsigned int prevpri; + int saw_lastassigned; + int unassigned; + int assigned; + + p = td->td_proc; + kg = td->td_ksegrp; + ke = td->td_kse; + + if (kg != &p->p_ksegrp) { + panic ("wrong ksegrp"); + } + + if (ke) { + if (ke != &p->p_kse) { + panic("wrong kse"); + } + if (ke->ke_thread != td) { + panic("wrong thread"); + } + } + + if ((p->p_flag & P_KSES) == 0) { + if (ke == NULL) { + panic("non KSE thread lost kse"); + } + } else { + prevpri = 0; + saw_lastassigned = 0; + unassigned = 0; + assigned = 0; + TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { + if (td2->td_priority < prevpri) { + panic("thread runqueue unosorted"); + } + prevpri = td2->td_priority; + if (td2->td_kse) { + assigned++; + if (unassigned) { + panic("unassigned before assigned"); + } + if (kg->kg_last_assigned == NULL) { + panic("lastassigned corrupt"); + } + if (saw_lastassigned) { + panic("last assigned not last"); + } + if (td2->td_kse->ke_thread != td2) { + panic("mismatched kse/thread"); + } + } else { + unassigned++; + } + if (td2 == kg->kg_last_assigned) { + saw_lastassigned = 1; + if (td2->td_kse == NULL) { + panic("last assigned not assigned"); + } + } + } + if (kg->kg_last_assigned && (saw_lastassigned == 0)) { + panic("where on earth does lastassigned point?"); + } + FOREACH_THREAD_IN_GROUP(kg, td2) { + if (((td2->td_flags & TDF_UNBOUND) == 0) && + (td2->td_state == TDS_RUNQ)) { + assigned++; + if (td2->td_kse == NULL) { + panic ("BOUND thread with no KSE"); + } + } + } +#if 0 + if ((unassigned + assigned) != kg->kg_runnable) { + panic("wrong number in runnable"); + } +#endif + } } + diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index bd1a625..a2a44ff 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -277,9 +277,13 @@ schedcpu(arg) * with 16-bit int's (remember them?) * overflow takes 45 days. 
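 * (Concretely, the estimate is aged once a second as
 *
 *	kg_estcpu = (2*load / (2*load + 1)) * kg_estcpu
 *
 * which is what decay_cpu() below computes in fixed point, with
 * loadfac = loadfactor(averunnable.ldavg[0]); those macro names come
 * from the surrounding file and are not shown in this hunk.)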
*/ - /* XXXKSE */ - /* if ((ke->ke_flags & KEF_ONRUNQ) == 0) */ - if (p->p_stat == SSLEEP || p->p_stat == SSTOP) { + /* XXXKSE **WRONG***/ + /* + * the kse slptimes are not touched in wakeup + * because the thread may not HAVE a KSE + */ + if (ke->ke_state == KES_ONRUNQ && + ke->ke_state == KES_RUNNING) { ke->ke_slptime++; } else { ke->ke_slptime = 0; @@ -321,20 +325,31 @@ schedcpu(arg) } kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); resetpriority(kg); - td = FIRST_THREAD_IN_PROC(p); - if (td->td_priority >= PUSER && - (p->p_sflag & PS_INMEM)) { - int changedqueue = - ((td->td_priority / RQ_PPQ) != - (kg->kg_user_pri / RQ_PPQ)); - - td->td_priority = kg->kg_user_pri; - FOREACH_KSE_IN_GROUP(kg, ke) { - if ((ke->ke_oncpu == NOCPU) && - (p->p_stat == SRUN) && /* XXXKSE */ - changedqueue) { - remrunqueue(ke->ke_thread); - setrunqueue(ke->ke_thread); + FOREACH_THREAD_IN_GROUP(kg, td) { + int changedqueue; + if (td->td_priority >= PUSER) { + /* + * Only change the priority + * of threads that are still at their + * user priority. + * XXXKSE This is problematic + * as we may need to re-order + * the threads on the KSEG list. + */ + changedqueue = + ((td->td_priority / RQ_PPQ) != + (kg->kg_user_pri / RQ_PPQ)); + + td->td_priority = kg->kg_user_pri; + if (changedqueue && + td->td_state == TDS_RUNQ) { + /* this could be optimised */ + remrunqueue(td); + td->td_priority = + kg->kg_user_pri; + setrunqueue(td); + } else { + td->td_priority = kg->kg_user_pri; } } } @@ -409,6 +424,7 @@ sleepinit(void) * entered before msleep returns. If priority includes the PDROP * flag the mutex is not entered before returning. */ + int msleep(ident, mtx, priority, wmesg, timo) void *ident; @@ -426,9 +442,48 @@ msleep(ident, mtx, priority, wmesg, timo) if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0); #endif + KASSERT((td->td_kse != NULL), ("msleep: NULL KSE?")); + KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse state?")); WITNESS_SLEEP(0, &mtx->mtx_object); KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL, ("sleeping without a mutex")); + /* + * If we are capable of async syscalls and there isn't already + * another one ready to return, start a new thread + * and queue it as ready to run. Note that there is danger here + * because we need to make sure that we don't sleep allocating + * the thread (recursion here might be bad). + * Hence the TDF_INMSLEEP flag. + */ + if (p->p_flag & P_KSES) { + /* Just don't bother if we are exiting + and not the exiting thread. */ + if ((p->p_flag & P_WEXIT) && catch && p->p_singlethread != td) + return (EINTR); + if (td->td_mailbox && (!(td->td_flags & TDF_INMSLEEP))) { + /* + * If we have no queued work to do, then + * upcall to the UTS to see if it has more to do. + * We don't need to upcall now, just make it and + * queue it. + */ + mtx_lock_spin(&sched_lock); + if (TAILQ_FIRST(&td->td_ksegrp->kg_runq) == NULL) { + /* Don't recurse here! 
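The guard being set up in this block, reduced to a standalone sketch: thread_schedule_upcall() could itself wind up sleeping, so TDF_INMSLEEP makes the upcall branch non-reentrant. All sk_* names below are invented for the sketch.

	#define SK_INMSLEEP	0x1	/* TDF_INMSLEEP analogue */

	struct sk_thread {
		int	flags;
	};

	void
	sk_schedule_upcall(struct sk_thread *td)
	{
		(void)td;	/* stub; the real code may allocate and sleep */
	}

	void
	sk_maybe_upcall(struct sk_thread *td)
	{
		if ((td->flags & SK_INMSLEEP) == 0) {
			td->flags |= SK_INMSLEEP;	/* close the door... */
			sk_schedule_upcall(td);		/* ...so this cannot recurse */
			td->flags &= ~SK_INMSLEEP;
		}
	}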
*/ + KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse stateX?")); + td->td_flags |= TDF_INMSLEEP; + thread_schedule_upcall(td, td->td_kse); + td->td_flags &= ~TDF_INMSLEEP; + KASSERT((td->td_kse->ke_state == KES_RUNNING), ("msleep: kse stateY?")); + } + mtx_unlock_spin(&sched_lock); + } + KASSERT((td->td_kse != NULL), ("msleep: NULL KSE2?")); + KASSERT((td->td_kse->ke_state == KES_RUNNING), + ("msleep: kse state2?")); + KASSERT((td->td_kse->ke_thread == td), + ("msleep: kse/thread mismatch?")); + } mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* @@ -454,7 +509,7 @@ msleep(ident, mtx, priority, wmesg, timo) } KASSERT(p != NULL, ("msleep1")); - KASSERT(ident != NULL && td->td_proc->p_stat == SRUN, ("msleep")); + KASSERT(ident != NULL && td->td_state == TDS_RUNNING, ("msleep")); td->td_wchan = ident; td->td_wmesg = wmesg; @@ -468,20 +523,23 @@ msleep(ident, mtx, priority, wmesg, timo) callout_reset(&td->td_slpcallout, timo, endtsleep, td); /* * We put ourselves on the sleep queue and start our timeout - * before calling cursig, as we could stop there, and a wakeup - * or a SIGCONT (or both) could occur while we were stopped. - * A SIGCONT would cause us to be marked as SSLEEP + * before calling thread_suspend_check, as we could stop there, and + * a wakeup or a SIGCONT (or both) could occur while we were stopped. * without resuming us, thus we must be ready for sleep * when cursig is called. If the wakeup happens while we're * stopped, td->td_wchan will be 0 upon return from cursig. */ if (catch) { - CTR3(KTR_PROC, "msleep caught: proc %p (pid %d, %s)", p, + CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); td->td_flags |= TDF_SINTR; mtx_unlock_spin(&sched_lock); PROC_LOCK(p); - sig = cursig(p); + sig = cursig(td); + if (thread_suspend_check(1)) { + sig = EINTR; + rval = EINTR; + } mtx_lock_spin(&sched_lock); PROC_UNLOCK(p); if (sig != 0) { @@ -492,13 +550,13 @@ msleep(ident, mtx, priority, wmesg, timo) } else sig = 0; if (td->td_wchan != NULL) { - td->td_proc->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; + td->td_state = TDS_SLP; mi_switch(); } - CTR3(KTR_PROC, "msleep resume: proc %p (pid %d, %s)", td, p->p_pid, + CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); - KASSERT(td->td_proc->p_stat == SRUN, ("running but not SRUN")); + KASSERT(td->td_state == TDS_RUNNING, ("running but not TDS_RUNNING")); td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_TIMEOUT) { td->td_flags &= ~TDF_TIMEOUT; @@ -524,8 +582,8 @@ msleep(ident, mtx, priority, wmesg, timo) if (rval == 0 && catch) { PROC_LOCK(p); - /* XXX: shouldn't we always be calling cursig() */ - if (sig != 0 || (sig = cursig(p))) { + /* XXX: shouldn't we always be calling cursig() */ + if (sig != 0 || (sig = cursig(td))) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else @@ -571,7 +629,7 @@ endtsleep(arg) td->td_flags &= ~TDF_TIMEOUT; setrunqueue(td); } else if (td->td_wchan != NULL) { - if (td->td_proc->p_stat == SSLEEP) /* XXXKSE */ + if (td->td_state == TDS_SLP) /* XXXKSE */ setrunnable(td); else unsleep(td); @@ -583,6 +641,38 @@ endtsleep(arg) } /* + * Abort a thread, as if an interrupt had occured. Only abort + * interruptable waits (unfortunatly it isn't only safe to abort others). + * This is about identical to cv_abort(). + * Think about merging them? + * Also, whatever the signal code does... + */ +void +abortsleep(struct thread *td) +{ + + mtx_lock_spin(&sched_lock); + /* + * If the TDF_TIMEOUT flag is set, just leave. 
A + * timeout is scheduled anyhow. + */ + if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) { + if (td->td_wchan != NULL) { + if (td->td_state == TDS_SLP) { /* XXXKSE */ + setrunnable(td); + } else { + /* + * Probably in a suspended state.. + * um.. dunno XXXKSE + */ + unsleep(td); + } + } + } + mtx_unlock_spin(&sched_lock); +} + +/* * Remove a process from its wait queue */ void @@ -618,25 +708,24 @@ restart: if (td->td_wchan == ident) { TAILQ_REMOVE(qp, td, td_slpq); td->td_wchan = NULL; - if (td->td_proc->p_stat == SSLEEP) { + if (td->td_state == TDS_SLP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ CTR3(KTR_PROC, "wakeup: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (td->td_ksegrp->kg_slptime > 1) updatepri(td); td->td_ksegrp->kg_slptime = 0; - td->td_kse->ke_slptime = 0; - td->td_proc->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(td); maybe_resched(td); } else { +/* XXXKSE Wrong! */ td->td_state = TDS_RUNQ; p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } /* END INLINE EXPANSION */ - goto restart; } + goto restart; } } mtx_unlock_spin(&sched_lock); @@ -665,20 +754,19 @@ restart: if (td->td_wchan == ident) { TAILQ_REMOVE(qp, td, td_slpq); td->td_wchan = NULL; - if (td->td_proc->p_stat == SSLEEP) { + if (td->td_state == TDS_SLP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ - CTR3(KTR_PROC, "wakeup1: proc %p (pid %d, %s)", - p, p->p_pid, p->p_comm); + CTR3(KTR_PROC,"wakeup1: thread %p (pid %d, %s)", + td, p->p_pid, p->p_comm); if (td->td_ksegrp->kg_slptime > 1) updatepri(td); td->td_ksegrp->kg_slptime = 0; - td->td_kse->ke_slptime = 0; - td->td_proc->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(td); maybe_resched(td); break; } else { +/* XXXKSE Wrong */ td->td_state = TDS_RUNQ; p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } @@ -698,15 +786,19 @@ mi_switch() { struct bintime new_switchtime; struct thread *td = curthread; /* XXX */ - register struct proc *p = td->td_proc; /* XXX */ + struct proc *p = td->td_proc; /* XXX */ + struct kse *ke = td->td_kse; #if 0 register struct rlimit *rlim; #endif u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); + KASSERT((ke->ke_state == KES_RUNNING), ("mi_switch: kse state?")); #ifdef INVARIANTS - if (p->p_stat != SMTX && p->p_stat != SRUN) + if (td->td_state != TDS_MTX && + td->td_state != TDS_RUNQ && + td->td_state != TDS_RUNNING) mtx_assert(&Giant, MA_NOTOWNED); #endif @@ -735,7 +827,8 @@ mi_switch() * * XXX drop sched_lock, pickup Giant */ - if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && + if (p->p_state != PRS_ZOMBIE && + p->p_limit->p_cpulimit != RLIM_INFINITY && p->p_runtime > p->p_limit->p_cpulimit) { rlim = &p->p_rlimit[RLIMIT_CPU]; if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) { @@ -763,17 +856,35 @@ mi_switch() */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); - CTR3(KTR_PROC, "mi_switch: old proc %p (pid %d, %s)", p, p->p_pid, + CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); sched_nest = sched_lock.mtx_recurse; - td->td_lastcpu = td->td_kse->ke_oncpu; - td->td_kse->ke_oncpu = NOCPU; - td->td_kse->ke_flags &= ~KEF_NEEDRESCHED; + td->td_lastcpu = ke->ke_oncpu; + ke->ke_oncpu = NOCPU; + ke->ke_flags &= ~KEF_NEEDRESCHED; + /* + * At the last moment: if this KSE is not on the run queue, + * it needs to be freed correctly and the thread treated accordingly. + */ + if ((td->td_state == TDS_RUNNING) && + ((ke->ke_flags & KEF_IDLEKSE) == 0)) { + /* Put us back on the run queue (kse and all). 
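The decision made in this block, as a self-contained sketch (invented names; the KEF_IDLEKSE special case is omitted): a thread that is still running goes back on the run queue together with its KSE, while a blocked unbound thread donates the KSE back to its group.

	#include <stddef.h>

	enum sk_tdstate { SK_RUNNING, SK_RUNQ, SK_SLP };

	struct sk_kse;
	struct sk_td {
		enum sk_tdstate	 state;
		int		 unbound;	/* TDF_UNBOUND analogue */
		struct sk_kse	*kse;
	};

	void	sk_setrunqueue(struct sk_td *td);	/* stand-ins for the */
	void	sk_kse_reassign(struct sk_kse *ke);	/* kernel primitives */

	void
	sk_prepare_switch(struct sk_td *td)
	{
		struct sk_kse *ke;

		if (td->state == SK_RUNNING) {
			/* Still runnable: requeue thread and KSE together. */
			sk_setrunqueue(td);
		} else if (td->unbound && td->state != SK_RUNQ) {
			/* Blocked and unbound: let another thread use the KSE. */
			ke = td->kse;
			td->kse = NULL;
			sk_kse_reassign(ke);
		}
	}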
*/ + setrunqueue(td); + } else if ((td->td_flags & TDF_UNBOUND) && + (td->td_state != TDS_RUNQ)) { /* in case of old code */ + /* + * We will not be on the run queue. + * Someone else can use the KSE if they need it. + */ + td->td_kse = NULL; + kse_reassign(ke); + } cpu_switch(); td->td_kse->ke_oncpu = PCPU_GET(cpuid); + td->td_kse->ke_state = KES_RUNNING; sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; - CTR3(KTR_PROC, "mi_switch: new proc %p (pid %d, %s)", p, p->p_pid, + CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); @@ -791,37 +902,42 @@ setrunnable(struct thread *td) struct proc *p = td->td_proc; mtx_lock_spin(&sched_lock); - switch (p->p_stat) { - case SZOMB: /* not a thread flag XXXKSE */ + switch (p->p_state) { + case PRS_ZOMBIE: panic("setrunnable(1)"); + default: + break; } - switch (td->td_proc->p_stat) { + switch (td->td_state) { case 0: - case SRUN: - case SWAIT: + case TDS_RUNNING: + case TDS_IWAIT: default: + printf("state is %d", td->td_state); panic("setrunnable(2)"); - case SSTOP: - case SSLEEP: /* e.g. when sending signals */ + case TDS_SUSPENDED: + thread_unsuspend(p); + break; + case TDS_SLP: /* e.g. when sending signals */ if (td->td_flags & TDF_CVWAITQ) cv_waitq_remove(td); else unsleep(td); - break; - - case SIDL: + case TDS_UNQUEUED: /* being put back onto the queue */ + case TDS_NEW: /* not yet had time to suspend */ + case TDS_RUNQ: /* not yet had time to suspend */ break; } - td->td_proc->p_stat = SRUN; if (td->td_ksegrp->kg_slptime > 1) updatepri(td); td->td_ksegrp->kg_slptime = 0; - td->td_kse->ke_slptime = 0; if ((p->p_sflag & PS_INMEM) == 0) { + td->td_state = TDS_RUNQ; /* XXXKSE not a good idea */ p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } else { - setrunqueue(td); + if (td->td_state != TDS_RUNQ) + setrunqueue(td); /* XXXKSE */ maybe_resched(td); } mtx_unlock_spin(&sched_lock); @@ -848,7 +964,7 @@ resetpriority(kg) kg->kg_user_pri = newpriority; } FOREACH_THREAD_IN_GROUP(kg, td) { - maybe_resched(td); + maybe_resched(td); /* XXXKSE silly */ } mtx_unlock_spin(&sched_lock); } @@ -865,20 +981,21 @@ loadav(void *arg) int i, nrun; struct loadavg *avg; struct proc *p; - struct ksegrp *kg; + struct thread *td; avg = &averunnable; sx_slock(&allproc_lock); nrun = 0; FOREACH_PROC_IN_SYSTEM(p) { - FOREACH_KSEGRP_IN_PROC(p, kg) { - switch (p->p_stat) { - case SRUN: + FOREACH_THREAD_IN_PROC(p, td) { + switch (td->td_state) { + case TDS_RUNQ: + case TDS_RUNNING: if ((p->p_flag & P_NOLOAD) != 0) goto nextproc; - /* FALLTHROUGH */ - case SIDL: - nrun++; + nrun++; /* XXXKSE */ + default: + break; } nextproc: continue; @@ -932,19 +1049,18 @@ void schedclock(td) struct thread *td; { - struct kse *ke = td->td_kse; - struct ksegrp *kg = td->td_ksegrp; + struct kse *ke; + struct ksegrp *kg; - if (td) { - ke->ke_cpticks++; - kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); - if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { - resetpriority(td->td_ksegrp); - if (td->td_priority >= PUSER) - td->td_priority = kg->kg_user_pri; - } - } else { - panic("schedclock"); + KASSERT((td != NULL), ("schedlock: null thread pointer")); + ke = td->td_kse; + kg = td->td_ksegrp; + ke->ke_cpticks++; + kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); + if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { + resetpriority(kg); + if (td->td_priority >= PUSER) + td->td_priority = kg->kg_user_pri; } } @@ -959,7 +1075,6 @@ yield(struct thread *td, struct yield_args *uap) 
mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); td->td_priority = PRI_MAX_TIMESHARE; - setrunqueue(td); kg->kg_proc->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c index c9081c3..bbe36be 100644 --- a/sys/kern/ksched.c +++ b/sys/kern/ksched.c @@ -181,7 +181,18 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched, mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, kg); - td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + FOREACH_THREAD_IN_GROUP(kg, td) { /* XXXKSE */ + if (td->td_state == TDS_RUNNING) { + td->td_kse->ke_flags |= KEF_NEEDRESCHED; + } else if (td->td_state == TDS_RUNQ) { + if (td->td_priority > kg->kg_user_pri) { + remrunqueue(td); + td->td_priority = + kg->kg_user_pri; + setrunqueue(td); + } + } + } mtx_unlock_spin(&sched_lock); } else @@ -203,7 +214,19 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched, * on the scheduling code: You must leave the * scheduling info alone. */ - td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + FOREACH_THREAD_IN_GROUP(kg, td) { + if (td->td_state == TDS_RUNNING) { + td->td_kse->ke_flags |= KEF_NEEDRESCHED; + } else if (td->td_state == TDS_RUNQ) { + if (td->td_priority > kg->kg_user_pri) { + remrunqueue(td); + td->td_priority = + kg->kg_user_pri; + setrunqueue(td); + } + } + + } mtx_unlock_spin(&sched_lock); } break; diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c index 9dad93b..afd4c5d 100644 --- a/sys/kern/subr_smp.c +++ b/sys/kern/subr_smp.c @@ -124,8 +124,8 @@ forward_signal(struct thread *td) * executing so that it executes ast(). */ mtx_assert(&sched_lock, MA_OWNED); - KASSERT(td->td_proc->p_stat == SRUN, - ("forward_signal: process is not SRUN")); + KASSERT(td->td_state == TDS_RUNNING, + ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 3b415de..027aa9c 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -48,6 +48,8 @@ #include <sys/lock.h> #include <sys/mutex.h> #include <sys/proc.h> +#include <sys/kse.h> +#include <sys/ktr.h> #include <sys/resourcevar.h> #include <sys/signalvar.h> #include <sys/systm.h> @@ -71,13 +73,15 @@ userret(td, frame, oticks) struct kse *ke = td->td_kse; struct ksegrp *kg = td->td_ksegrp; + CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, + p->p_comm); #ifdef INVARIANTS /* Check that we called signotify() enough. */ mtx_lock(&Giant); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (SIGPENDING(p) && ((p->p_sflag & PS_NEEDSIGCHK) == 0 || - (p->p_kse.ke_flags & KEF_ASTPENDING) == 0)) + (ke->ke_flags & KEF_ASTPENDING) == 0)) printf("failed to set signal flags proprly for ast()\n"); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); @@ -100,6 +104,22 @@ userret(td, frame, oticks) } /* + * We need to check to see if we have to exit or wait due to a + * single threading requirement or some other STOP condition. + */ + PROC_LOCK(p); + thread_suspend_check(0); /* Can suspend or kill */ + PROC_UNLOCK(p); + + /* + * DO special thread processing, e.g. upcall tweaking and such + */ + if (p->p_flag & P_KSES) { + thread_userret(p, kg, ke, td, frame); + /* printf("KSE thread returned"); */ + } + + /* * Charge system time if profiling. * * XXX should move PS_PROFIL to a place that can obviously be @@ -121,8 +141,7 @@ userret(td, frame, oticks) * This function will return with preemption disabled. 
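Order of the new return-to-userland steps in userret() above, sketched with stub declarations standing in for thread_suspend_check() and thread_userret(); the flag value matches P_KSES as defined in this patch.

	struct ur_td {
		int	p_flag;		/* owning process's flags */
	};
	#define UR_P_KSES	0x08000	/* P_KSES analogue */

	void	ur_thread_suspend_check(struct ur_td *td);	/* may park us */
	void	ur_thread_userret(struct ur_td *td);		/* upcall fixup */

	void
	ur_userret(struct ur_td *td)
	{
		/* First honor stop/single-threading requests... */
		ur_thread_suspend_check(td);
		/* ...then do the upcall bookkeeping only KSE processes need. */
		if (td->p_flag & UR_P_KSES)
			ur_thread_userret(td);
	}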
*/ void -ast(framep) - struct trapframe *framep; +ast(struct trapframe *framep) { struct thread *td = curthread; struct proc *p = td->td_proc; @@ -136,6 +155,8 @@ ast(framep) int ucode; #endif + CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, + p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); #ifdef WITNESS if (witness_list(td)) @@ -164,6 +185,13 @@ ast(framep) p->p_stats->p_prof.pr_ticks = 0; } mtx_unlock_spin(&sched_lock); + /* + * XXXKSE While the fact that we owe a user profiling + * tick is stored per KSE in this code, the statistics + * themselves are still stored per process. + * This should probably change, by which I mean that + * possibly the location of both might change. + */ if (td->td_ucred != p->p_ucred) cred_update_thread(td); @@ -192,14 +220,13 @@ ast(framep) if (flags & KEF_NEEDRESCHED) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; - setrunqueue(td); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); } if (sflag & PS_NEEDSIGCHK) { PROC_LOCK(p); - while ((sig = cursig(p)) != 0) + while ((sig = cursig(td)) != 0) postsig(sig); PROC_UNLOCK(p); } diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c index 08bca8d..c2e79d0 100644 --- a/sys/kern/subr_turnstile.c +++ b/sys/kern/subr_turnstile.c @@ -119,23 +119,20 @@ propagate_priority(struct thread *td) return; } + KASSERT(td->td_state != TDS_SURPLUS, ("Mutex owner SURPLUS")); + MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); - KASSERT(td->td_proc->p_stat != SSLEEP, ("sleeping thread owns a mutex")); + KASSERT(td->td_state != TDS_SLP, + ("sleeping thread owns a mutex")); if (td->td_priority <= pri) /* lower is higher priority */ return; - /* - * Bump this thread's priority. - */ - td->td_priority = pri; /* * If lock holder is actually running, just bump priority. */ - if (thread_running(td)) { - MPASS(td->td_proc->p_stat == SRUN - || td->td_proc->p_stat == SZOMB - || td->td_proc->p_stat == SSTOP); + if (td->td_state == TDS_RUNNING) { + td->td_priority = pri; return; } @@ -151,20 +148,26 @@ propagate_priority(struct thread *td) * If on run queue move to new run queue, and quit. * XXXKSE this gets a lot more complicated under threads * but try anyhow. + * We should have a special call to do this more efficiently. */ - if (td->td_proc->p_stat == SRUN) { + if (td->td_state == TDS_RUNQ) { MPASS(td->td_blocked == NULL); remrunqueue(td); + td->td_priority = pri; setrunqueue(td); return; } + /* + * Adjust for any other cases. + */ + td->td_priority = pri; /* * If we aren't blocked on a mutex, we should be. 
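The walk that propagate_priority() above performs one step of, flattened into a sketch: follow the chain of blocked lock owners, lending our priority to each, until an owner is running or already at least as urgent (lower value means more urgent). Run-queue moves and turnstile plumbing are elided; types are invented.

	#include <stddef.h>

	struct pp_td {
		int		 priority;		/* lower = more urgent */
		struct pp_td	*blocks_on_owner;	/* NULL when not blocked */
	};

	void
	pp_propagate(struct pp_td *owner, int pri)
	{
		while (owner != NULL && owner->priority > pri) {
			owner->priority = pri;		/* lend the priority */
			owner = owner->blocks_on_owner;	/* follow the chain */
		}
	}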
*/ - KASSERT(td->td_proc->p_stat == SMTX, ( + KASSERT(td->td_state == TDS_MTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", - td->td_proc->p_pid, td->td_proc->p_comm, td->td_proc->p_stat, + td->td_proc->p_pid, td->td_proc->p_comm, td->td_state, m->mtx_object.lo_name)); /* @@ -590,7 +593,7 @@ _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) */ td->td_blocked = m; td->td_mtxname = m->mtx_object.lo_name; - td->td_proc->p_stat = SMTX; + td->td_state = TDS_MTX; propagate_priority(td); if (LOCK_LOG_TEST(&m->mtx_object, opts)) @@ -727,7 +730,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) m, td1); td1->td_blocked = NULL; - td1->td_proc->p_stat = SRUN; setrunqueue(td1); if (td->td_critnest == 1 && td1->td_priority < pri) { @@ -744,7 +746,6 @@ _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) } } #endif - setrunqueue(td); if (LOCK_LOG_TEST(&m->mtx_object, opts)) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 182221d..02b3a0d 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -225,6 +225,7 @@ static struct witness_order_list_entry order_lists[] = { #endif { "clk", &lock_class_mtx_spin }, { "mutex profiling lock", &lock_class_mtx_spin }, + { "zombie_thread_lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 1bdd913..d8fba59 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -1187,7 +1187,7 @@ selwakeup(sip) sip->si_thread = NULL; mtx_lock_spin(&sched_lock); if (td->td_wchan == (caddr_t)&selwait) { - if (td->td_proc->p_stat == SSLEEP) + if (td->td_state == TDS_SLP) setrunnable(td); else cv_waitq_remove(td); diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index dacb9d9..ab6f1e8 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -467,7 +467,7 @@ ptrace(struct thread *td, struct ptrace_args *uap) } /* not currently stopped */ - if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) { + if (!P_SHOULDSTOP(p) || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } @@ -566,10 +566,12 @@ ptrace(struct thread *td, struct ptrace_args *uap) if (proctree_locked) sx_xunlock(&proctree_lock); /* deliver or queue signal */ - if (p->p_stat == SSTOP) { + if (P_SHOULDSTOP(p)) { p->p_xstat = uap->data; mtx_lock_spin(&sched_lock); + p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SGNL); setrunnable(td2); /* XXXKSE */ + /* Need foreach kse in proc, ... make_kse_queued(). */ mtx_unlock_spin(&sched_lock); } else if (uap->data) psignal(p, uap->data); diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index d8115fb..15a5d7c 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -552,7 +552,7 @@ 381 STD BSD { int kse_new(struct kse_mailbox * mbx, \ int new_grp_flag); } 382 STD BSD { int thread_wakeup(struct thread_mailbox *tmbx); } -383 STD BSD { int kse_yield(void); } +383 MSTD BSD { int kse_yield(void); } 384 UNIMPL BSD __mac_get_proc 385 UNIMPL BSD __mac_set_proc 386 UNIMPL BSD __mac_get_fd diff --git a/sys/kern/tty.c b/sys/kern/tty.c index b9c5743..6c915e1 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -2392,17 +2392,35 @@ ttyinfo(struct tty *tp) PGRP_UNLOCK(tp->t_pgrp); td = FIRST_THREAD_IN_PROC(pick); - stmp = pick->p_stat == SRUN ? "running" : /* XXXKSE */ - pick->p_stat == SMTX ? td->td_mtxname : - td->td_wmesg ? 
td->td_wmesg : "iowait"; + if (pick->p_flag & P_KSES) { + stmp = "KSE" ; /* XXXKSE */ + } else { + if (td) { + if (td->td_state == TDS_RUNQ) { + stmp = "running"; + } else if (td->td_state == TDS_MTX) { + stmp = td->td_mtxname; + } else if (td->td_wmesg) { + stmp = td->td_wmesg; + } else { + stmp = "iowait"; + } + } else { + stmp = "threadless"; + panic("ttyinfo: no thread!?"); + } + } calcru(pick, &utime, &stime, NULL); - ltmp = pick->p_stat == SIDL || pick->p_stat == SWAIT || - pick->p_stat == SZOMB ? 0 : - pgtok(vmspace_resident_count(pick->p_vmspace)); + ltmp = ((pick->p_state == PRS_NEW) + || (td && (td->td_state == TDS_IWAIT)) + || (pick->p_state == PRS_ZOMBIE ? 0 : + pgtok(vmspace_resident_count(pick->p_vmspace)))); mtx_unlock_spin(&sched_lock); ttyprintf(tp, " cmd: %s %d [%s%s] ", pick->p_comm, - pick->p_pid, pick->p_stat == SMTX ? "*" : "", stmp); + pick->p_pid, + td->td_state == TDS_MTX ? "*" : "", + stmp); /* Print user time. */ ttyprintf(tp, "%ld.%02ldu ", @@ -2433,7 +2451,19 @@ ttyinfo(struct tty *tp) * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. */ -#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL)) +#define ISRUN(p, val) \ +do { \ + struct thread *td; \ + val = 0; \ + FOREACH_THREAD_IN_PROC(p, td) { \ + if (td->td_state == TDS_RUNQ || \ + td->td_state == TDS_RUNNING) { \ + val = 1; \ + break; \ + } \ + } \ +} while (0) + #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 #define ONLYB 1 @@ -2449,10 +2479,13 @@ proc_compare(struct proc *p1, struct proc *p2) if (p1 == NULL) return (1); + ISRUN(p1, esta); + ISRUN(p2, estb); + /* * see if at least one of them is runnable */ - switch (TESTAB(ISRUN(p1), ISRUN(p2))) { + switch (TESTAB(esta, estb)) { case ONLYA: return (0); case ONLYB: @@ -2477,7 +2510,7 @@ proc_compare(struct proc *p1, struct proc *p2) /* * weed out zombies */ - switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) { + switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { case ONLYA: return (1); case ONLYB: diff --git a/sys/posix4/ksched.c b/sys/posix4/ksched.c index c9081c3..bbe36be 100644 --- a/sys/posix4/ksched.c +++ b/sys/posix4/ksched.c @@ -181,7 +181,18 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched, mtx_lock_spin(&sched_lock); rtp_to_pri(&rtp, kg); - td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + FOREACH_THREAD_IN_GROUP(kg, td) { /* XXXKSE */ + if (td->td_state == TDS_RUNNING) { + td->td_kse->ke_flags |= KEF_NEEDRESCHED; + } else if (td->td_state == TDS_RUNQ) { + if (td->td_priority > kg->kg_user_pri) { + remrunqueue(td); + td->td_priority = + kg->kg_user_pri; + setrunqueue(td); + } + } + } mtx_unlock_spin(&sched_lock); } else @@ -203,7 +214,19 @@ int ksched_setscheduler(register_t *ret, struct ksched *ksched, * on the scheduling code: You must leave the * scheduling info alone. 
*/ - td->td_last_kse->ke_flags |= KEF_NEEDRESCHED; /* XXXKSE */ + FOREACH_THREAD_IN_GROUP(kg, td) { + if (td->td_state == TDS_RUNNING) { + td->td_kse->ke_flags |= KEF_NEEDRESCHED; + } else if (td->td_state == TDS_RUNQ) { + if (td->td_priority > kg->kg_user_pri) { + remrunqueue(td); + td->td_priority = + kg->kg_user_pri; + setrunqueue(td); + } + } + + } mtx_unlock_spin(&sched_lock); } break; diff --git a/sys/sparc64/sparc64/genassym.c b/sys/sparc64/sparc64/genassym.c index 4f47a75..eee4abc 100644 --- a/sys/sparc64/sparc64/genassym.c +++ b/sys/sparc64/sparc64/genassym.c @@ -232,6 +232,8 @@ ASSYM(TD_KSE, offsetof(struct thread, td_kse)); ASSYM(TD_KSTACK, offsetof(struct thread, td_kstack)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); +ASSYM(TD_STATE, offsetof(struct thread, td_state)); +ASSYM(TDS_RUNNING, TDS_RUNNING); ASSYM(PCB_SIZEOF, sizeof(struct pcb)); ASSYM(PCB_FPSTATE, offsetof(struct pcb, pcb_fpstate)); diff --git a/sys/sparc64/sparc64/swtch.S b/sys/sparc64/sparc64/swtch.S index 429e961..a8a753a 100644 --- a/sys/sparc64/sparc64/swtch.S +++ b/sys/sparc64/sparc64/swtch.S @@ -109,6 +109,9 @@ ENTRY(cpu_switch) stx %o0, [PCPU(CURTHREAD)] stx %o1, [PCPU(CURPCB)] + mov TDS_RUNNING, %o2 + stw %o2, [%o0 + TD_STATE] + SET(sched_lock, %o3, %o2) stx %o0, [%o2 + MTX_LOCK] diff --git a/sys/sparc64/sparc64/swtch.s b/sys/sparc64/sparc64/swtch.s index 429e961..a8a753a 100644 --- a/sys/sparc64/sparc64/swtch.s +++ b/sys/sparc64/sparc64/swtch.s @@ -109,6 +109,9 @@ ENTRY(cpu_switch) stx %o0, [PCPU(CURTHREAD)] stx %o1, [PCPU(CURPCB)] + mov TDS_RUNNING, %o2 + stw %o2, [%o0 + TD_STATE] + SET(sched_lock, %o3, %o2) stx %o0, [%o2 + MTX_LOCK] diff --git a/sys/sparc64/sparc64/trap.c b/sys/sparc64/sparc64/trap.c index 61e3b44..f39d2f6 100644 --- a/sys/sparc64/sparc64/trap.c +++ b/sys/sparc64/sparc64/trap.c @@ -49,6 +49,7 @@ #include <sys/bus.h> #include <sys/interrupt.h> #include <sys/ktr.h> +#include <sys/kse.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/systm.h> @@ -190,6 +191,11 @@ trap(struct trapframe *tf) td->td_frame = tf; if (td->td_ucred != p->p_ucred) cred_update_thread(td); + if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) { + mtx_lock_spin(&sched_lock); + thread_exit(); + /* NOTREACHED */ + } } else { sticks = 0; if ((type & ~T_KERNEL) != T_BREAKPOINT) @@ -528,6 +534,23 @@ syscall(struct trapframe *tf) td->td_frame = tf; if (td->td_ucred != p->p_ucred) cred_update_thread(td); + if (p->p_flag & P_KSES) { + /* + * If we are doing a syscall in a KSE environment, + * note where our mailbox is. There is always the + * possibility that we could do this lazily (in sleep()), + * but for now do it every time. + */ + td->td_mailbox = (void *)fuword((caddr_t)td->td_kse->ke_mailbox + + offsetof(struct kse_mailbox, kmbx_current_thread)); + if ((td->td_mailbox == NULL) || + (td->td_mailbox == (void *)-1)) { + td->td_mailbox = NULL; /* single thread it.. */ + td->td_flags &= ~TDF_UNBOUND; + } else { + td->td_flags |= TDF_UNBOUND; + } + } code = tf->tf_global[1]; /* @@ -634,17 +657,17 @@ syscall(struct trapframe *tf) } /* - * Handle reschedule and other end-of-syscall issues - */ - userret(td, tf, sticks); - - /* * Release Giant if we had to get it. Don't use mtx_owned(), * we want to catch broken syscalls. 
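The mailbox probe added to syscall() above, as a standalone sketch. mk_copyin_word() stands in for fuword(); a NULL or faulting read of the userland current-thread slot demotes the thread to bound (single-threaded) operation for this syscall. The flag value matches TDF_UNBOUND as defined in this patch.

	#include <stddef.h>

	#define MK_UNBOUND	0x000001	/* TDF_UNBOUND analogue */

	struct mk_td {
		void	*mailbox;
		int	 flags;
	};

	void	*mk_copyin_word(const void *uaddr);	/* (void *)-1 on fault */

	void
	mk_note_mailbox(struct mk_td *td, const void *cur_thread_slot)
	{
		td->mailbox = mk_copyin_word(cur_thread_slot);
		if (td->mailbox == NULL || td->mailbox == (void *)-1) {
			td->mailbox = NULL;	/* single-thread it */
			td->flags &= ~MK_UNBOUND;
		} else
			td->flags |= MK_UNBOUND;
	}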
*/ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); + /* + * Handle reschedule and other end-of-syscall issues + */ + userret(td, tf, sticks); + #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); diff --git a/sys/sparc64/sparc64/vm_machdep.c b/sys/sparc64/sparc64/vm_machdep.c index a896754..8282e93 100644 --- a/sys/sparc64/sparc64/vm_machdep.c +++ b/sys/sparc64/sparc64/vm_machdep.c @@ -108,6 +108,42 @@ cpu_sched_exit(struct thread *td) } } +void +cpu_thread_exit(struct thread *td) +{ +} + +void +cpu_thread_setup(struct thread *td) +{ +} + +void +cpu_save_upcall(struct thread *td, struct kse *newkse) +{ +} + +void +cpu_set_upcall(struct thread *td, void *pcb) +{ +} + +void +cpu_set_args(struct thread *td, struct kse *ke) +{ +} + +void +cpu_free_kse_mdstorage(struct kse *ke) +{ +} + +int +cpu_export_context(struct thread *td) +{ + return (0); +} + /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child diff --git a/sys/sys/condvar.h b/sys/sys/condvar.h index 0050255..cf6a6c6 100644 --- a/sys/sys/condvar.h +++ b/sys/sys/condvar.h @@ -62,6 +62,7 @@ void cv_signal(struct cv *cvp); void cv_broadcast(struct cv *cvp); void cv_waitq_remove(struct thread *td); +void cv_abort(struct thread *td); #define cv_waitq_empty(cvp) (TAILQ_EMPTY(&(cvp)->cv_waitq)) #define cv_wmesg(cvp) ((cvp)->cv_description) diff --git a/sys/sys/proc.h b/sys/sys/proc.h index a4f29de..2c198c8 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -249,12 +249,13 @@ They would be given priorities calculated from the KSEG. * This is what is put to sleep and reactivated. * The first KSE available in the correct group will run this thread. * If several are available, use the one on the same CPU as last time. + * When waing to be run, threads are hung off the KSEGRP in priority order. + * with N runnable and queued KSEs in the KSEGRP, the first N threads + * are linked to them. Other threads are not yet assigned. */ struct thread { struct proc *td_proc; /* Associated process. */ struct ksegrp *td_ksegrp; /* Associated KSEG. */ - struct kse *td_last_kse; /* Where it wants to be if possible. */ - struct kse *td_kse; /* Current KSE if running. */ TAILQ_ENTRY(thread) td_plist; /* All threads in this proc */ TAILQ_ENTRY(thread) td_kglist; /* All threads in this ksegrp */ @@ -267,6 +268,8 @@ struct thread { #define td_startzero td_flags int td_flags; /* (j) TDF_* flags. */ + struct kse *td_last_kse; /* Where it wants to be if possible. */ + struct kse *td_kse; /* Current KSE if running. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ void *td_wchan; /* (j) Sleep address. */ const char *td_wmesg; /* (j) Reason for sleep. */ @@ -280,6 +283,8 @@ struct thread { LIST_HEAD(, mtx) td_contested; /* (j) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ + void *td_mailbox; /* the userland mailbox address */ + struct ucred *td_ucred; /* (k) Reference to credentials. */ #define td_endzero td_md #define td_startcopy td_endzero @@ -290,14 +295,44 @@ struct thread { u_char td_priority; /* (j) Thread active priority. */ #define td_endcopy td_pcb - struct ucred *td_ucred; /* (k) Reference to credentials. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. 
*/ + enum { + TDS_NEW = 0x20, + TDS_UNQUEUED, + TDS_SLP, + TDS_MTX, + TDS_RUNQ, + TDS_RUNNING, + TDS_SUSPENDED, /* would have liked to have run */ + TDS_IWAIT, + TDS_SURPLUS + } td_state; struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* Kernel VA of kstack. */ u_int td_critnest; /* (k) Critical section nest level. */ }; +/* flags kept in td_flags */ +#define TDF_UNBOUND 0x000001 /* may give away the kse, uses the kg runq */ +#define TDF_SINTR 0x000008 /* Sleep is interruptible. */ +#define TDF_TIMEOUT 0x000010 /* Timing out during sleep. */ +#define TDF_SELECT 0x000040 /* Selecting; wakeup/waiting danger. */ +#define TDF_CVWAITQ 0x000080 /* Thread is on a cv_waitq (not slpq). */ +#define TDF_UPCALLING 0x000100 /* This thread is doing an upcall. */ +#define TDF_INMSLEEP 0x000400 /* Don't recurse in msleep() */ +#define TDF_TIMOFAIL 0x001000 /* Timeout from sleep after we were awake. */ +#define TDF_DEADLKTREAT 0x800000 /* Lock aquisition - deadlock treatment. */ + +/* + * Traps for young players: + * The main thread flag that controls whether a thread acts as a threaded + * or unthreaded thread is the TDF_UNBOUND flag. + * UPCALLS run with the UNBOUND flags clear, after they are first scheduled. + * i.e. they bind themselves to whatever thread thay are first scheduled with. + * You may see BOUND threads in KSE processes but you should never see + * UNBOUND threads in non KSE processes. + */ /* * The schedulable entity that can be given a context to run. @@ -309,14 +344,14 @@ struct thread { struct kse { struct proc *ke_proc; /* Associated process. */ struct ksegrp *ke_ksegrp; /* Associated KSEG. */ - struct thread *ke_thread; /* Associated thread, if running. */ TAILQ_ENTRY(kse) ke_kglist; /* Queue of all KSEs in ke_ksegrp. */ TAILQ_ENTRY(kse) ke_kgrlist; /* Queue of all KSEs in this state. */ TAILQ_ENTRY(kse) ke_procq; /* (j) Run queue. */ - TAILQ_HEAD(, thread) ke_runq; /* (td_runq) RUNNABLE bound to KSE. */ #define ke_startzero ke_flags int ke_flags; /* (j) KEF_* flags. */ + struct thread *ke_thread; /* Active associated thread. */ + struct thread *ke_bound; /* Thread bound to this KSE (*) */ /*u_int ke_estcpu; */ /* (j) Time averaged val of cpticks. */ int ke_cpticks; /* (j) Ticks of cpu time. */ fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ @@ -329,15 +364,45 @@ struct kse { u_char ke_oncpu; /* (j) Which cpu we are on. */ u_int ke_slptime; /* (j) Time since last idle. */ char ke_rqindex; /* (j) Run queue index. */ -#define ke_endzero ke_priority + enum { + KES_IDLE = 0x10, + KES_ONRUNQ, + KES_UNQUEUED, /* in transit */ + KES_RUNNING + } ke_state; /* (j) S* process status. */ + void *ke_mailbox; /* the userland mailbox address */ + struct thread *ke_tdspare; /* spare thread for upcalls */ +#define ke_endzero ke_dummy #define ke_startcopy ke_endzero - u_char ke_priority; /* (j) Process priority. */ - u_char ke_usrpri; /* (j) User pri from cpu & nice. 
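The bound/unbound rule from the note above, restated as two illustrative predicates; the flag values are the ones this header defines, the types are stand-ins.

	#define BP_TDF_UNBOUND	0x000001
	#define BP_P_KSES	0x08000

	struct bp_proc { int p_flag; };
	struct bp_td  { int td_flags; struct bp_proc *proc; };

	/* Unbound: may give its KSE away; queued on the KSEG run queue. */
	int
	bp_unbound(struct bp_td *td)
	{
		return ((td->td_flags & BP_TDF_UNBOUND) != 0);
	}

	/* Invariant from the note: unbound threads only in KSE processes. */
	int
	bp_invariant(struct bp_td *td)
	{
		return (!bp_unbound(td) || (td->proc->p_flag & BP_P_KSES) != 0);
	}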
*/ -#define ke_endcopy ke_end - - int ke_end; /* dummy entry */ + u_char ke_dummy; /* */ +#define ke_endcopy ke_mdstorage + + void *ke_upcall; + void *ke_stackbase; + u_long ke_stacksize; + void *ke_mdstorage; /* where we store the pcb and frame */ + struct pcb *ke_pcb; /* the pcb saved for the upcalls */ + struct trapframe *ke_frame; /* the upcall trapframe */ + void *mdkse; /* eventually you load from this in */ + /* switch for our extension PCB x86 */ }; +/* flags kept in ke_flags */ +#define KEF_OWEUPC 0x00002 /* Owe process an addupc() call at next ast. */ +#define KEF_IDLEKSE 0x00004 /* A 'Per CPU idle process'.. has one thread */ +#define KEF_LOANED 0x00004 /* On loan from the bound thread to another */ +#define KEF_ASTPENDING 0x00400 /* KSE has a pending ast. */ +#define KEF_NEEDRESCHED 0x00800 /* Process needs to yield. */ + +/* + * (*) A bound KSE with a bound thread in a KSE process may be lent to + * Other threads, as long as those threads do not leave the kernel. + * The other threads must be either exiting, or be unbound with a valid + * mailbox so that they can save their state there rather than going + * to user space. While this happens the real bound thread is still linked + * to the kse via the ke_bound field, and the KSE has its "KEF_LOANED + * flag set. + */ /* * Kernel-scheduled entity group (KSEG). The scheduler considers each KSEG to @@ -348,27 +413,29 @@ struct ksegrp { struct proc *kg_proc; /* Process that contains this KSEG. */ TAILQ_ENTRY(ksegrp) kg_ksegrp; /* Queue of KSEGs in kg_proc. */ TAILQ_HEAD(, kse) kg_kseq; /* (ke_kglist) All KSEs. */ - TAILQ_HEAD(, kse) kg_rq; /* (ke_kgrlist) Runnable KSEs. */ TAILQ_HEAD(, kse) kg_iq; /* (ke_kgrlist) Idle KSEs. */ TAILQ_HEAD(, thread) kg_threads;/* (td_kglist) All threads. */ - TAILQ_HEAD(, thread) kg_runq; /* (td_runq) Unbound RUNNABLE threads */ + TAILQ_HEAD(, thread) kg_runq; /* (td_runq) waiting RUNNABLE threads */ TAILQ_HEAD(, thread) kg_slpq; /* (td_runq) NONRUNNABLE threads. */ #define kg_startzero kg_estcpu u_int kg_estcpu; /* Sum of the same field in KSEs. */ u_int kg_slptime; /* (j) How long completely blocked. */ + struct thread *kg_last_assigned; /* Last thread assigned to a KSE */ + int kg_numthreads; /* Num threads in total */ + int kg_runnable; /* Num runnable threads on queue. */ + int kg_kses; /* Num KSEs in group. */ + int kg_runq_kses; /* Num KSEs on runq. */ + int kg_idle_kses; /* num KSEs idle */ #define kg_endzero kg_pri_class #define kg_startcopy kg_endzero u_char kg_pri_class; /* (j) Scheduling class. */ u_char kg_user_pri; /* (j) User pri from estcpu and nice. */ char kg_nice; /* (j?/k?) Process "nice" value. */ - struct rtprio kg_rtprio; /* (j) Realtime priority. */ -#define kg_endcopy kg_runnable - - int kg_runnable; /* Num runnable threads on queue. */ - int kg_runq_kses; /* Num KSEs on runq. */ - int kg_kses; /* Num KSEs in group. */ +/* struct rtprio kg_rtprio; */ /* (j) Realtime priority. */ +#define kg_endcopy kg_dummy + int kg_dummy; }; /* @@ -379,6 +446,7 @@ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, ksegrp) p_ksegrps; /* (kg_ksegrp) All KSEGs. */ TAILQ_HEAD(, thread) p_threads; /* (td_plist) Threads. (shortcut) */ + TAILQ_HEAD(, thread) p_suspended; /* (td_runq) suspended threads */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Ptr to open files structure. */ /* Accumulated stats for all KSEs? 
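How kg_last_assigned is meant to work, per the struct thread comment earlier in this header, sketched as handing a freed KSE to the next waiting thread: the group run queue is priority ordered, the first N threads hold the N queued KSEs, and kg_last_assigned marks the end of that prefix. Names are invented, and the real kse_reassign() also touches the system run queue.

	#include <stddef.h>

	struct la_kse;
	struct la_td {
		struct la_td	*next;	/* priority-ordered group run queue */
		struct la_kse	*kse;	/* NULL while waiting unassigned */
	};
	struct la_ksegrp {
		struct la_td	*runq;
		struct la_td	*last_assigned;
	};

	void
	la_reassign(struct la_ksegrp *kg, struct la_kse *ke)
	{
		struct la_td *td;

		/* First unassigned thread sits just past the assigned prefix. */
		td = (kg->last_assigned != NULL) ?
		    kg->last_assigned->next : kg->runq;
		if (td != NULL) {
			td->kse = ke;		/* extend the prefix */
			kg->last_assigned = td;
		}
		/* else: no runnable thread wants it; the KSE would go idle. */
	}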
*/ @@ -389,7 +457,6 @@ struct proc { struct ksegrp p_ksegrp; struct kse p_kse; - struct thread p_xxthread; /* * The following don't make too much sense.. @@ -397,8 +464,12 @@ struct proc { */ int p_flag; /* (c) P_* flags. */ int p_sflag; /* (j) PS_* flags. */ - int p_stat; /* (j) S* process status. */ - + enum { + PRS_NEW = 0, /* In creation */ + PRS_NORMAL, /* KSEs can be run */ + PRS_WAIT, /* Waiting on interrupt ? */ + PRS_ZOMBIE + } p_state; /* (j) S* process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ @@ -431,6 +502,10 @@ struct proc { u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ void *p_aioinfo; /* (c) ASYNC I/O info. */ + int p_numthreads; /* (?) number of threads */ + int p_numksegrps; /* (?) number of ksegrps */ + struct thread *p_singlethread;/* If single threading this is it */ + int p_suspcount; /* # waiting threads in suspended mode*/ /* End area that is zeroed on creation. */ #define p_startcopy p_sigmask @@ -467,13 +542,6 @@ struct proc { #define NOCPU 0xff /* For p_oncpu when we aren't on a CPU. */ /* Status values (p_stat). */ -#define SIDL 1 /* Process being created by fork. */ -#define SRUN 2 /* Currently runnable. */ -#define SSLEEP 3 /* Sleeping on an address. */ -#define SSTOP 4 /* Process debugging or suspension. */ -#define SZOMB 5 /* Awaiting collection by parent. */ -#define SWAIT 6 /* Waiting for interrupt. */ -#define SMTX 7 /* Blocked on a mutex. */ /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ @@ -483,13 +551,21 @@ struct proc { #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ -#define P_TRACED 0x00800 /* Debugged process being traced. */ -#define P_WAITED 0x01000 /* Debugging process has waited for child. */ +#define P_WAITED 0x01000 /* Someone is waiting for us */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_KSES 0x08000 /* Process is using KSEs. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ +/* flags that control how threads may be suspended for some reason */ +#define P_STOPPED_SGNL 0x10000 /* Stopped due to SIGSTOP/SIGTSTP */ +#define P_STOPPED_TRACE 0x20000 /* Stopped because of tracing */ +#define P_STOPPED_SNGL 0x40000 /* Only one thread can continue (not to user) */ +#define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait */ +#define P_TRACED 0x00800 /* Debugged process being traced. */ +#define P_STOPPED (P_STOPPED_SGNL|P_STOPPED_SNGL|P_STOPPED_TRACE) +#define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) + /* Should be moved to machine-dependent areas. */ #define P_UNUSED100000 0x100000 #define P_COWINPROGRESS 0x400000 /* Snapshot copy-on-write in progress. */ @@ -508,21 +584,14 @@ struct proc { #define PS_SWAPPING 0x00200 /* Process is being swapped. */ #define PS_NEEDSIGCHK 0x02000 /* Process may need signal delivery. */ -/* flags kept in td_flags */ -#define TDF_ONRUNQ 0x00001 /* This KE is on a run queue */ -#define TDF_SINTR 0x00008 /* Sleep is interruptible. */ -#define TDF_TIMEOUT 0x00010 /* Timing out during sleep. */ -#define TDF_SELECT 0x00040 /* Selecting; wakeup/waiting danger. 
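The stop machinery above folds three distinct stop reasons into a single test; a compact restatement, with values copied from the defines above and names prefixed to mark them as illustrative:

	#define XP_STOPPED_SGNL		0x10000	/* SIGSTOP/SIGTSTP */
	#define XP_STOPPED_TRACE	0x20000	/* ptrace() */
	#define XP_STOPPED_SNGL		0x40000	/* single-threading */
	#define XP_STOPPED \
		(XP_STOPPED_SGNL | XP_STOPPED_SNGL | XP_STOPPED_TRACE)

	/* P_SHOULDSTOP(): any one reason is enough to hold the threads. */
	int
	xp_should_stop(int p_flag)
	{
		return ((p_flag & XP_STOPPED) != 0);
	}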
*/ -#define TDF_CVWAITQ 0x00080 /* Thread is on a cv_waitq (not slpq). */ -#define TDF_TIMOFAIL 0x01000 /* Timeout from sleep after we were awake. */ -#define TDF_DEADLKTREAT 0x800000 /* Lock aquisition - deadlock treatment. */ - -/* flags kept in ke_flags */ -#define KEF_ONRUNQ 0x00001 /* This KE is on a run queue */ -#define KEF_OWEUPC 0x00002 /* Owe process an addupc() call at next ast. */ -#define KEF_ASTPENDING 0x00400 /* KSE has a pending ast. */ -#define KEF_NEEDRESCHED 0x00800 /* Process needs to yield. */ - +/* used only in legacy conversion code */ +#define SIDL 1 /* Process being created by fork. */ +#define SRUN 2 /* Currently runnable. */ +#define SSLEEP 3 /* Sleeping on an address. */ +#define SSTOP 4 /* Process debugging or suspension. */ +#define SZOMB 5 /* Awaiting collection by parent. */ +#define SWAIT 6 /* Waiting for interrupt. */ +#define SMTX 7 /* Blocked on a mutex. */ #define P_MAGIC 0xbeefface @@ -728,6 +797,7 @@ void pargs_drop(struct pargs *pa); void pargs_free(struct pargs *pa); void pargs_hold(struct pargs *pa); void procinit(void); +void threadinit(void); void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td); void proc_reparent(struct proc *child, struct proc *newparent); @@ -758,7 +828,38 @@ void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); void cpu_wait(struct proc *); int cpu_coredump(struct thread *, struct vnode *, struct ucred *); -struct thread *thread_get(struct proc *); + +/* New in KSE. */ +struct thread *thread_alloc(void); +void thread_free(struct thread *td); +int cpu_export_context(struct thread *td); +void cpu_free_kse_mdstorage(struct kse *kse); +void cpu_save_upcall(struct thread *td, struct kse *newkse); +void cpu_set_args(struct thread *, struct kse *); +void cpu_set_upcall(struct thread *td, void *pcb); +void cpu_thread_exit(struct thread *); +void cpu_thread_setup(struct thread *td); +void kse_reassign(struct kse *ke); +void kse_link(struct kse *ke, struct ksegrp *kg); +void ksegrp_link(struct ksegrp *kg, struct proc *p); +int kserunnable(void); +void make_kse_runnable(struct kse *ke); +void thread_exit(void) __dead2; +int thread_export_context(struct thread *td); +void thread_link(struct thread *td, struct ksegrp *kg); +void thread_reap(void); +struct thread *thread_schedule_upcall(struct thread *td, struct kse *ke); +int thread_single(int how); +#define SNGLE_NO_EXIT 0 /* values for 'how' */ +#define SNGLE_EXIT 1 +void thread_single_end(void); +void thread_stash(struct thread *td); +int thread_suspend_check(int how); +void thread_unsuspend(struct proc *p); +int thread_userret(struct proc *p, struct ksegrp *kg, struct kse *ke, + struct thread *td, struct trapframe *frame); + +void thread_sanity_check(struct thread *td); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ diff --git a/sys/sys/queue.h b/sys/sys/queue.h index 5209f4e..ffddc86 100644 --- a/sys/sys/queue.h +++ b/sys/sys/queue.h @@ -102,6 +102,36 @@ * _REMOVE + + + + * */ +#define QUEUE_MACRO_DEBUG 1 +#ifdef QUEUE_MACRO_DEBUG +struct qm_trace { + char * lastfile; + int lastline; + char * prevfile; + int prevline; +}; + +#define TRACEBUF struct qm_trace trace; + +#define QMD_TRACE_HEAD(head) do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ +} while (0) + +#define QMD_TRACE_ELEM(elem) do { \ + (elem)->trace.prevline = 
(elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ +} while (0) + +#else +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define TRACEBUF +#endif /* QUEUE_MACRO_DEBUG */ /* * Singly-linked List declarations. @@ -329,6 +359,7 @@ struct { \ struct name { \ struct type *tqh_first; /* first element */ \ struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ } #define TAILQ_HEAD_INITIALIZER(head) \ @@ -338,6 +369,7 @@ struct name { \ struct { \ struct type *tqe_next; /* next element */ \ struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ } /* @@ -349,6 +381,8 @@ struct { \ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ (head1)->tqh_last = (head2)->tqh_last; \ TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_HEAD(head2); \ } \ } while (0) @@ -369,16 +403,21 @@ struct { \ #define TAILQ_INIT(head) do { \ TAILQ_FIRST((head)) = NULL; \ (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ } while (0) #define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ TAILQ_NEXT((elm), field)->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ - else \ + else { \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ TAILQ_NEXT((listelm), field) = (elm); \ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ } while (0) #define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ @@ -386,6 +425,8 @@ struct { \ TAILQ_NEXT((elm), field) = (listelm); \ *(listelm)->field.tqe_prev = (elm); \ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&listelm->field); \ } while (0) #define TAILQ_INSERT_HEAD(head, elm, field) do { \ @@ -396,6 +437,8 @@ struct { \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ TAILQ_FIRST((head)) = (elm); \ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ } while (0) #define TAILQ_INSERT_TAIL(head, elm, field) do { \ @@ -403,6 +446,8 @@ struct { \ (elm)->field.tqe_prev = (head)->tqh_last; \ *(head)->tqh_last = (elm); \ (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ } while (0) #define TAILQ_LAST(head, headname) \ @@ -417,9 +462,13 @@ struct { \ if ((TAILQ_NEXT((elm), field)) != NULL) \ TAILQ_NEXT((elm), field)->field.tqe_prev = \ (elm)->field.tqe_prev; \ - else \ + else { \ (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + (elm)->field.tqe_next = (void *)-1; \ + QMD_TRACE_ELEM(&(elm)->field); \ } while (0) diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h index 6302d03..a8a68fc 100644 --- a/sys/sys/signalvar.h +++ b/sys/sys/signalvar.h @@ -234,10 +234,10 @@ extern struct mtx sigio_lock; /* * Machine-independent functions: */ -int cursig(struct proc *p); +int cursig(struct thread *td); void execsigs(struct proc *p); void gsignal(int pgid, int sig); -int issignal(struct proc *p); +int issignal(struct thread *p); void killproc(struct proc *p, char *why); void pgsigio(struct sigio **, int signum, int checkctty); void pgsignal(struct pgrp *pgrp, int sig, int checkctty); diff --git a/sys/sys/systm.h b/sys/sys/systm.h index ccba626..134700b 100644 --- a/sys/sys/systm.h +++ 
b/sys/sys/systm.h @@ -309,6 +309,7 @@ extern watchdog_tickle_fn wdog_tickler; */ int msleep(void *chan, struct mtx *mtx, int pri, const char *wmesg, int timo); +void abortsleep(struct thread *td); #define tsleep(chan, pri, wmesg, timo) msleep(chan, NULL, pri, wmesg, timo) void wakeup(void *chan); void wakeup_one(void *chan); diff --git a/sys/sys/ucred.h b/sys/sys/ucred.h index 3025eb4..565bd41 100644 --- a/sys/sys/ucred.h +++ b/sys/sys/ucred.h @@ -44,15 +44,15 @@ * Only the suser() or suser_cred() function should be used for this. */ struct ucred { - u_int cr_ref; /* reference count */ + u_int cr_ref; /* reference count */ #define cr_startcopy cr_uid - uid_t cr_uid; /* effective user id */ - uid_t cr_ruid; /* real user id */ - uid_t cr_svuid; /* saved user id */ - short cr_ngroups; /* number of groups */ - gid_t cr_groups[NGROUPS]; /* groups */ - gid_t cr_rgid; /* real group id */ - gid_t cr_svgid; /* saved user id */ + uid_t cr_uid; /* effective user id */ + uid_t cr_ruid; /* real user id */ + uid_t cr_svuid; /* saved user id */ + short cr_ngroups; /* number of groups */ + gid_t cr_groups[NGROUPS]; /* groups */ + gid_t cr_rgid; /* real group id */ + gid_t cr_svgid; /* saved user id */ struct uidinfo *cr_uidinfo; /* per euid resource consumption */ struct uidinfo *cr_ruidinfo; /* per ruid resource consumption */ struct prison *cr_prison; /* jail(4) */ diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h index e09d549..cf6dc39 100644 --- a/sys/vm/uma_int.h +++ b/sys/vm/uma_int.h @@ -109,7 +109,7 @@ #define UMA_SLAB_MASK (PAGE_SIZE - 1) /* Mask to get back to the page */ #define UMA_SLAB_SHIFT PAGE_SHIFT /* Number of bits PAGE_MASK */ -#define UMA_BOOT_PAGES 15 /* Number of pages allocated for startup */ +#define UMA_BOOT_PAGES 30 /* Number of pages allocated for startup */ #define UMA_WORKING_TIME 20 /* Seconds worth of items to keep */ diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c index 6c48cbc..25aa48e 100644 --- a/sys/vm/vm_glue.c +++ b/sys/vm/vm_glue.c @@ -299,8 +299,11 @@ vm_waitproc(p) GIANT_REQUIRED; cpu_wait(p); pmap_dispose_proc(p); /* drop per-process resources */ - FOREACH_THREAD_IN_PROC(p, td) +/* XXXKSE by here there should not be any threads left! */ + FOREACH_THREAD_IN_PROC(p, td) { + panic("vm_waitproc: Survivor thread!"); pmap_dispose_thread(td); + } vmspace_exitfree(p); /* and clean-out the vmspace */ } @@ -355,7 +358,7 @@ faultin(p) PROC_LOCK(p); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC (p, td) - if (td->td_proc->p_stat == SRUN) /* XXXKSE */ + if (td->td_state == TDS_RUNQ) /* XXXKSE */ setrunqueue(td); p->p_sflag |= PS_INMEM; @@ -371,7 +374,7 @@ faultin(p) * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. * - * XXXKSE - KSEGRP with highest priority counts.. + * XXXKSE - process with the thread with highest priority counts.. * * Giant is still held at this point, to be released in tsleep. 
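The swap-in selection loop rewritten below scores candidates per runnable thread rather than per process. A reduced sketch of the scoring follows (swapped-out time plus the thread's group sleep time; the nice and slept-long adjustments of the real loop are elided, and all types are stand-ins):

	#include <stddef.h>

	struct si_td {
		int		 on_runq;
		int		 kg_slptime;	/* its ksegrp's sleep time */
		struct si_td	*next;
	};
	struct si_proc {
		int		 swtime;	/* time spent swapped out */
		struct si_td	*threads;
		struct si_proc	*next;
	};

	struct si_proc *
	si_pick(struct si_proc *allproc)
	{
		struct si_proc *p, *pick = NULL;
		struct si_td *td;
		int pri, best = -1;

		for (p = allproc; p != NULL; p = p->next)
			for (td = p->threads; td != NULL; td = td->next)
				if (td->on_runq &&
				    (pri = p->swtime + td->kg_slptime) > best) {
					best = pri;
					pick = p;
				}
		return (pick);	/* best-scoring process wins the swap-in */
	}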
*/ @@ -381,6 +384,7 @@ scheduler(dummy) void *dummy; { struct proc *p; + struct thread *td; int pri; struct proc *pp; int ppri; @@ -399,11 +403,14 @@ loop: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { struct ksegrp *kg; + if (p->p_sflag & (PS_INMEM | PS_SWAPPING)) { + continue; + } mtx_lock_spin(&sched_lock); - if (p->p_stat == SRUN - && (p->p_sflag & (PS_INMEM | PS_SWAPPING)) == 0) { - /* Find the minimum sleeptime for the process */ - FOREACH_KSEGRP_IN_PROC(p, kg) { + FOREACH_THREAD_IN_PROC(p, td) { + /* Only consider runnable threads */ + if (td->td_state == TDS_RUNQ) { + kg = td->td_ksegrp; pri = p->p_swtime + kg->kg_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { pri -= kg->kg_nice * 8; @@ -438,6 +445,7 @@ loop: /* * We would like to bring someone in. (only if there is space). + * [What checks the space? ] */ PROC_LOCK(p); faultin(p); @@ -478,6 +486,7 @@ swapout_procs(action) int action; { struct proc *p; + struct thread *td; struct ksegrp *kg; struct proc *outp, *outp2; int outpri, outpri2; @@ -489,13 +498,13 @@ int action; outpri = outpri2 = INT_MIN; retry: sx_slock(&allproc_lock); - LIST_FOREACH(p, &allproc, p_list) { + FOREACH_PROC_IN_SYSTEM(p) { struct vmspace *vm; int minslptime = 100000; PROC_LOCK(p); if (p->p_lock != 0 || - (p->p_flag & (P_TRACED|P_SYSTEM|P_WEXIT)) != 0) { + (p->p_flag & (P_STOPPED_SNGL|P_TRACED|P_SYSTEM|P_WEXIT)) != 0) { PROC_UNLOCK(p); continue; } @@ -512,14 +521,15 @@ retry: continue; } - switch (p->p_stat) { + switch (p->p_state) { default: + /* Don't swap out processes in any sort + * of 'special' state. */ mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); continue; - case SSLEEP: - case SSTOP: + case PRS_NORMAL: /* * do not swapout a realtime process * Check all the thread groups.. @@ -537,13 +547,18 @@ retry: * Also guarantee swap_idle_threshold1 * time in memory. 
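The per-thread veto added just below: previously only the first thread's priority was consulted, now any thread with priority better than PSOCK keeps the whole process resident. As a predicate with stand-in types (psock is passed in rather than assumed):

	#include <stddef.h>

	struct so_td {
		int		 priority;	/* lower = more urgent */
		struct so_td	*next;
	};

	int
	so_may_swap_out(struct so_td *threads, int psock)
	{
		struct so_td *td;

		for (td = threads; td != NULL; td = td->next)
			if (td->priority < psock)
				return (0);	/* one urgent thread pins all */
		return (1);
	}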
*/ - if (((FIRST_THREAD_IN_PROC(p)->td_priority) < PSOCK) || - (kg->kg_slptime < swap_idle_threshold1)) { + if (kg->kg_slptime < swap_idle_threshold1) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); goto nextproc; } - + FOREACH_THREAD_IN_PROC(p, td) { + if ((td->td_priority) < PSOCK) { + mtx_unlock_spin(&sched_lock); + PROC_UNLOCK(p); + goto nextproc; + } + } /* * If the system is under memory stress, * or if we are swapping @@ -624,14 +639,13 @@ swapout(p) p->p_sflag |= PS_SWAPPING; PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC (p, td) - if (td->td_proc->p_stat == SRUN) /* XXXKSE */ + if (td->td_state == TDS_RUNQ) /* XXXKSE */ remrunqueue(td); /* XXXKSE */ mtx_unlock_spin(&sched_lock); pmap_swapout_proc(p); FOREACH_THREAD_IN_PROC(p, td) pmap_swapout_thread(td); - mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPING; p->p_swtime = 0; diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index 935979ae..a1b8adb 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -81,6 +81,7 @@ SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD, static int vmtotal(SYSCTL_HANDLER_ARGS) { +/* XXXKSE almost completely broken */ struct proc *p; struct vmtotal total, *totalp; vm_map_entry_t entry; @@ -88,6 +89,7 @@ vmtotal(SYSCTL_HANDLER_ARGS) vm_map_t map; int paging; struct ksegrp *kg; + struct thread *td; totalp = &total; bzero(totalp, sizeof *totalp); @@ -107,44 +109,49 @@ vmtotal(SYSCTL_HANDLER_ARGS) if (p->p_flag & P_SYSTEM) continue; mtx_lock_spin(&sched_lock); - switch (p->p_stat) { - case 0: + switch (p->p_state) { + case PRS_NEW: + if (p->p_sflag & PS_INMEM) + totalp->t_rq++; + else + totalp->t_sw++; mtx_unlock_spin(&sched_lock); continue; - - case SMTX: - case SSLEEP: - case SSTOP: - kg = &p->p_ksegrp; /* XXXKSE */ - if (p->p_sflag & PS_INMEM) { - if (FIRST_THREAD_IN_PROC(p)->td_priority - <= PZERO) - totalp->t_dw++; - else if (kg->kg_slptime < maxslp) - totalp->t_sl++; - } else if (kg->kg_slptime < maxslp) - totalp->t_sw++; - if (kg->kg_slptime >= maxslp) { - mtx_unlock_spin(&sched_lock); - continue; - } break; + default: + FOREACH_THREAD_IN_PROC(p, td) { + switch (td->td_state) { + case TDS_MTX: + case TDS_SLP: + kg = td->td_ksegrp; /* XXXKSE */ + if (p->p_sflag & PS_INMEM) { + if (td->td_priority <= PZERO) + totalp->t_dw++; + else if (kg->kg_slptime + < maxslp) + totalp->t_sl++; + } else if (kg->kg_slptime < maxslp) + totalp->t_sw++; + if (kg->kg_slptime >= maxslp) { + continue; + } + break; - case SWAIT: - totalp->t_sl++; - continue; + case TDS_RUNQ: + case TDS_RUNNING: + if (p->p_sflag & PS_INMEM) + totalp->t_rq++; + else + totalp->t_sw++; + continue; - case SRUN: - case SIDL: - if (p->p_sflag & PS_INMEM) - totalp->t_rq++; - else - totalp->t_sw++; - if (p->p_stat == SIDL) { - mtx_unlock_spin(&sched_lock); - continue; + case TDS_IWAIT: + totalp->t_sl++; + continue; + default: + break; + } } - break; } mtx_unlock_spin(&sched_lock); /* diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 5708d8d..2e5bd07 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -642,6 +642,7 @@ vm_pageout_scan(int pass) int vnodes_skipped = 0; int maxlaunder; int s; + struct thread *td; GIANT_REQUIRED; /* @@ -1123,7 +1124,8 @@ rescan0: bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); - LIST_FOREACH(p, &allproc, p_list) { + FOREACH_PROC_IN_SYSTEM(p) { + int breakout; /* * If this process is already locked, skip it. */ @@ -1139,10 +1141,19 @@ rescan0: } /* * if the process is in a non-running type state, - * don't touch it. + * don't touch it. Check all the threads individually. 
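The "breakout" test used by both vm_pageout_scan() and vm_daemon() below, reduced to a predicate: a process is only eligible if every one of its threads is in an ordinary run or sleep state. Stand-in types again:

	#include <stddef.h>

	enum bo_state { BO_RUNQ, BO_RUNNING, BO_SLP, BO_OTHER };

	struct bo_td {
		enum bo_state	 state;
		struct bo_td	*next;
	};

	int
	bo_all_normal(struct bo_td *threads)
	{
		struct bo_td *td;

		for (td = threads; td != NULL; td = td->next)
			if (td->state != BO_RUNQ && td->state != BO_RUNNING &&
			    td->state != BO_SLP)
				return (0);	/* breakout = 1 in the diff */
		return (1);
	}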
 */
 	mtx_lock_spin(&sched_lock);
-	if (p->p_stat != SRUN && p->p_stat != SSLEEP) {
+	breakout = 0;
+	FOREACH_THREAD_IN_PROC(p, td) {
+		if (td->td_state != TDS_RUNQ &&
+		    td->td_state != TDS_RUNNING &&
+		    td->td_state != TDS_SLP) {
+			breakout = 1;
+			break;
+		}
+	}
+	if (breakout) {
 		mtx_unlock_spin(&sched_lock);
 		PROC_UNLOCK(p);
 		continue;
@@ -1445,6 +1456,8 @@ static void
 vm_daemon()
 {
 	struct proc *p;
+	int breakout;
+	struct thread *td;
 
 	mtx_lock(&Giant);
 	while (TRUE) {
@@ -1473,7 +1486,16 @@ vm_daemon()
 		 * don't touch it.
 		 */
 		mtx_lock_spin(&sched_lock);
-		if (p->p_stat != SRUN && p->p_stat != SSLEEP) {
+		breakout = 0;
+		FOREACH_THREAD_IN_PROC(p, td) {
+			if (td->td_state != TDS_RUNQ &&
+			    td->td_state != TDS_RUNNING &&
+			    td->td_state != TDS_SLP) {
+				breakout = 1;
+				break;
+			}
+		}
+		if (breakout) {
 			mtx_unlock_spin(&sched_lock);
 			continue;
 		}
diff --git a/sys/vm/vm_zeroidle.c b/sys/vm/vm_zeroidle.c
index 99ace6e..d7ab1ce 100644
--- a/sys/vm/vm_zeroidle.c
+++ b/sys/vm/vm_zeroidle.c
@@ -127,7 +127,6 @@ vm_pagezero(void)
 		pages += vm_page_zero_idle();
 		if (pages > idlezero_maxrun) {
 			mtx_lock_spin(&sched_lock);
-			setrunqueue(td);
 			td->td_proc->p_stats->p_ru.ru_nvcsw++;
 			mi_switch();
 			mtx_unlock_spin(&sched_lock);
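A pattern that recurs throughout this patch (yield(), ast(), _mtx_unlock_sleep(), vm_pagezero() above): callers no longer setrunqueue() themselves before mi_switch(); the switch path requeues a still-running thread or reassigns its KSE. Caller side, sketched with invented names:

	void	yk_mi_switch(void);	/* requeues/reassigns, as shown earlier */

	void
	yk_voluntary_switch(void)
	{
		/* Before: yk_setrunqueue(curthread); yk_mi_switch(); */
		yk_mi_switch();		/* after: the switch code does the rest */
	}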