-rw-r--r--   sys/kern/kern_switch.c    92
-rw-r--r--   sys/kern/sched_4bsd.c    160
-rw-r--r--   sys/kern/sched_core.c     59
-rw-r--r--   sys/kern/sched_ule.c     175
-rw-r--r--   sys/sys/mutex.h            9
-rw-r--r--   sys/sys/proc.h           113
-rw-r--r--   sys/sys/sched.h           16
7 files changed, 460 insertions(+), 164 deletions(-)
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
index 1ccf64c..13bba12 100644
--- a/sys/kern/kern_switch.c
+++ b/sys/kern/kern_switch.c
@@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#endif
+#include <machine/cpu.h>
+
/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define KTR_CRITICAL KTR_SCHED
@@ -77,6 +79,49 @@ static int kern_sched_preemption = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
&kern_sched_preemption, 0, "Kernel preemption enabled");
+#ifdef SCHED_STATS
+long switch_preempt;
+long switch_owepreempt;
+long switch_turnstile;
+long switch_sleepq;
+long switch_sleepqtimo;
+long switch_relinquish;
+long switch_needresched;
+static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, "");
+static int
+sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int val;
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val == 0)
+ return (0);
+ switch_preempt = 0;
+ switch_owepreempt = 0;
+ switch_turnstile = 0;
+ switch_sleepq = 0;
+ switch_sleepqtimo = 0;
+ switch_relinquish = 0;
+ switch_needresched = 0;
+
+ return (0);
+}
+
+SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
+ 0, sysctl_stats_reset, "I", "Reset scheduler statistics");
+#endif
+
/************************************************************************
* Functions that manipulate runnability from a thread perspective. *
************************************************************************/
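The hunk above adds a set of SCHED_STATS counters plus a write-only sysctl that clears them when a non-zero value is written. The following standalone sketch is not kernel code; it only models the same idea in userland with C11 atomics, using invented names (STAT_INC, stats_reset) in the spirit of SCHED_STAT_INC and the reset handler:

/* Userland model of the SCHED_STATS counters; hypothetical, not the kernel API. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long stat_preempt;
static atomic_long stat_owepreempt;

/* Analogous to SCHED_STAT_INC(): one atomic add, no lock required. */
#define	STAT_INC(var)	atomic_fetch_add_explicit(&(var), 1, memory_order_relaxed)

/* Analogous to the sysctl reset handler: only act on a non-zero request. */
static void
stats_reset(long request)
{
	if (request == 0)
		return;
	atomic_store(&stat_preempt, 0);
	atomic_store(&stat_owepreempt, 0);
}

int
main(void)
{
	STAT_INC(stat_preempt);
	STAT_INC(stat_preempt);
	STAT_INC(stat_owepreempt);
	printf("preempt=%ld owepreempt=%ld\n",
	    atomic_load(&stat_preempt), atomic_load(&stat_owepreempt));
	stats_reset(1);
	printf("after reset: preempt=%ld\n", atomic_load(&stat_preempt));
	return (0);
}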
@@ -142,13 +187,13 @@ critical_exit(void)
#ifdef PREEMPTION
if (td->td_critnest == 1) {
td->td_critnest = 0;
- mtx_assert(&sched_lock, MA_NOTOWNED);
if (td->td_owepreempt) {
td->td_critnest = 1;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_critnest--;
+ SCHED_STAT_INC(switch_owepreempt);
mi_switch(SW_INVOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
} else
#endif
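The critical_exit() change above keeps the same deferred-preemption dance but takes the per-thread lock instead of the global sched_lock. As a rough userland illustration (simplified, with an invented struct thread_model; the real code also re-raises td_critnest around the switch), the nesting counter and the owepreempt flag interact like this:

/* Sketch of the critical-section nesting + deferred preemption idea. */
#include <stdio.h>

struct thread_model {
	int	td_critnest;	/* critical section nesting depth */
	int	td_owepreempt;	/* a preemption was deferred */
};

static void
critical_enter(struct thread_model *td)
{
	td->td_critnest++;
}

static void
critical_exit(struct thread_model *td)
{
	if (td->td_critnest == 1 && td->td_owepreempt) {
		/* Leaving the outermost section: pay the deferred switch. */
		td->td_owepreempt = 0;
		printf("deferred preemption handled\n");
	}
	td->td_critnest--;
}

int
main(void)
{
	struct thread_model td = { 0, 0 };

	critical_enter(&td);
	td.td_owepreempt = 1;	/* a higher-priority thread became runnable */
	critical_exit(&td);	/* the switch happens only now */
	return (0);
}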
@@ -173,7 +218,6 @@ maybe_preempt(struct thread *td)
int cpri, pri;
#endif
- mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
/*
* The new thread should not preempt the current thread if any of the
@@ -199,6 +243,7 @@ maybe_preempt(struct thread *td)
* to the new thread.
*/
ctd = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT ((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd),
("thread has no (or wrong) sched-private part."));
KASSERT((td->td_inhibitors == 0),
@@ -219,15 +264,25 @@ maybe_preempt(struct thread *td)
ctd->td_owepreempt = 1;
return (0);
}
-
/*
* Thread is runnable but not yet put on system run queue.
*/
+ MPASS(ctd->td_lock == &sched_lock);
+ MPASS(td->td_lock == &sched_lock);
MPASS(TD_ON_RUNQ(td));
TD_SET_RUNNING(td);
CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
td->td_proc->p_pid, td->td_proc->p_comm);
+ SCHED_STAT_INC(switch_preempt);
mi_switch(SW_INVOL|SW_PREEMPT, td);
+ /*
+ * td's lock pointer may have changed. We have to return with it
+ * locked.
+ */
+ spinlock_enter();
+ thread_unlock(ctd);
+ thread_lock(td);
+ spinlock_exit();
return (1);
#else
return (0);
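maybe_preempt() must return with td's lock held, but after mi_switch() the caller is running again holding its own thread lock, and td's lock pointer may now name a different mutex. A minimal userland sketch of that hand-off (hypothetical types; pthread mutexes standing in for spin mutexes, with no attempt to model spinlock_enter()):

/*
 * Sketch: drop the current thread's lock and pick up td's. In the kernel a
 * spinlock_enter()/spinlock_exit() pair brackets this so nothing can
 * interleave between the two operations.
 */
#include <pthread.h>

struct thread_model {
	pthread_mutex_t	*td_lock;	/* may be redirected at any time */
};

static void
thread_lock(struct thread_model *td)
{
	pthread_mutex_lock(td->td_lock);
}

static void
thread_unlock(struct thread_model *td)
{
	pthread_mutex_unlock(td->td_lock);
}

static void
relock_after_switch(struct thread_model *ctd, struct thread_model *td)
{
	thread_unlock(ctd);
	thread_lock(td);
}

int
main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
	struct thread_model ctd = { &a }, td = { &b };

	thread_lock(&ctd);
	relock_after_switch(&ctd, &td);	/* now holds b, not a */
	thread_unlock(&td);
	return (0);
}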
@@ -442,7 +497,6 @@ runq_choose(struct runq *rq)
struct td_sched *ts;
int pri;
- mtx_assert(&sched_lock, MA_OWNED);
while ((pri = runq_findbit(rq)) != -1) {
rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
@@ -484,7 +538,6 @@ runq_choose_from(struct runq *rq, u_char idx)
struct td_sched *ts;
int pri;
- mtx_assert(&sched_lock, MA_OWNED);
if ((pri = runq_findbit_from(rq, idx)) != -1) {
rqh = &rq->rq_queues[pri];
ts = TAILQ_FIRST(rqh);
@@ -519,9 +572,20 @@ runq_remove_idx(struct runq *rq, struct td_sched *ts, u_char *idx)
KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM,
("runq_remove_idx: process swapped out"));
pri = ts->ts_rqindex;
+ KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
rqh = &rq->rq_queues[pri];
CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p",
ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
+ {
+ struct td_sched *nts;
+
+ TAILQ_FOREACH(nts, rqh, ts_procq)
+ if (nts == ts)
+ break;
+ if (ts != nts)
+ panic("runq_remove_idx: ts %p not on rqindex %d",
+ ts, pri);
+ }
TAILQ_REMOVE(rqh, ts, ts_procq);
if (TAILQ_EMPTY(rqh)) {
CTR0(KTR_RUNQ, "runq_remove_idx: empty");
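The new KASSERT and the TAILQ_FOREACH walk above are sanity checks that the td_sched really sits on the run queue its index claims. The same membership-check-before-removal pattern, written as a standalone program against the <sys/queue.h> tail-queue macros (names are illustrative only), looks like:

/* Standalone demonstration of verifying list membership before removal. */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int			value;
	TAILQ_ENTRY(item)	link;
};

TAILQ_HEAD(itemq, item);

/* Remove 'it' from 'head', aborting if it is not actually queued there. */
static void
checked_remove(struct itemq *head, struct item *it)
{
	struct item *nit;

	TAILQ_FOREACH(nit, head, link)
		if (nit == it)
			break;
	if (nit != it) {
		fprintf(stderr, "item %p not on queue\n", (void *)it);
		abort();
	}
	TAILQ_REMOVE(head, it, link);
}

int
main(void)
{
	struct itemq head = TAILQ_HEAD_INITIALIZER(head);
	struct item a = { 1 }, b = { 2 };

	TAILQ_INSERT_TAIL(&head, &a, link);
	TAILQ_INSERT_TAIL(&head, &b, link);
	checked_remove(&head, &a);
	printf("first is now %d\n", TAILQ_FIRST(&head)->value);
	return (0);
}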
@@ -589,18 +653,4 @@ sched_set_concurrency(struct proc *p, int concurrency)
{
}
-/*
- * Called from thread_exit() for all exiting thread
- *
- * Not to be confused with sched_exit_thread()
- * that is only called from thread_exit() for threads exiting
- * without the rest of the process exiting because it is also called from
- * sched_exit() and we wouldn't want to call it twice.
- * XXX This can probably be fixed.
- */
-void
-sched_thread_exit(struct thread *td)
-{
-}
-
#endif /* KERN_SWITCH_INCLUDE */
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index 66952ec..a4b1e08 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -248,7 +248,7 @@ static void
maybe_resched(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority < curthread->td_priority)
curthread->td_flags |= TDF_NEEDRESCHED;
}
@@ -377,10 +377,7 @@ schedcpu(void)
realstathz = stathz ? stathz : hz;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
- /*
- * Prevent state changes and protect run queue.
- */
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
/*
* Increment time in/out of memory. We ignore overflow; with
* 16-bit int's (remember them?) overflow takes 45 days.
@@ -388,6 +385,7 @@ schedcpu(void)
p->p_swtime++;
FOREACH_THREAD_IN_PROC(p, td) {
awake = 0;
+ thread_lock(td);
ts = td->td_sched;
/*
* Increment sleep time (if sleeping). We
@@ -456,13 +454,16 @@ XXX this is broken
td->td_slptime = 0;
} else
td->td_slptime++;
- if (td->td_slptime > 1)
+ if (td->td_slptime > 1) {
+ thread_unlock(td);
continue;
+ }
td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
resetpriority(td);
resetpriority_thread(td);
+ thread_unlock(td);
} /* end of thread loop */
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
} /* end of process loop */
sx_sunlock(&allproc_lock);
}
@@ -575,6 +576,7 @@ schedinit(void)
*/
proc0.p_sched = NULL; /* XXX */
thread0.td_sched = &td_sched0;
+ thread0.td_lock = &sched_lock;
td_sched0.ts_thread = &thread0;
}
@@ -615,7 +617,7 @@ sched_clock(struct thread *td)
{
struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
ts->ts_cpticks++;
@@ -635,22 +637,23 @@ sched_exit(struct proc *p, struct thread *td)
CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
td, td->td_proc->p_comm, td->td_priority);
-
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}
void
sched_exit_thread(struct thread *td, struct thread *child)
{
- struct proc *childproc = child->td_proc;
CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
- child, childproc->p_comm, child->td_priority);
+ child, child->td_proc->p_comm, child->td_priority);
+ thread_lock(td);
td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
- childproc->p_estcpu = ESTCPULIM(childproc->p_estcpu +
- child->td_estcpu);
+ thread_unlock(td);
+ mtx_lock_spin(&sched_lock);
if ((child->td_proc->p_flag & P_NOLOAD) == 0)
sched_load_rem();
+ mtx_unlock_spin(&sched_lock);
}
void
@@ -663,6 +666,7 @@ void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
childtd->td_estcpu = td->td_estcpu;
+ childtd->td_lock = &sched_lock;
sched_newthread(childtd);
}
@@ -672,18 +676,20 @@ sched_nice(struct proc *p, int nice)
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
p->p_nice = nice;
FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
resetpriority(td);
resetpriority_thread(td);
+ thread_unlock(td);
}
}
void
sched_class(struct thread *td, int class)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
td->td_pri_class = class;
}
@@ -697,7 +703,7 @@ sched_priority(struct thread *td, u_char prio)
td, td->td_proc->p_comm, td->td_priority, prio, curthread,
curthread->td_proc->p_comm);
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority == prio)
return;
td->td_priority = prio;
@@ -818,7 +824,7 @@ void
sched_sleep(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
td->td_slptime = 0;
}
@@ -831,26 +837,18 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
ts = td->td_sched;
p = td->td_proc;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * Switch to the sched lock to fix things up and pick
+ * a new thread.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_unlock(td);
+ }
if ((p->p_flag & P_NOLOAD) == 0)
sched_load_rem();
-#if 0
- /*
- * We are volunteering to switch out so we get to nominate
- * a successor for the rest of our quantum
- * First try another thread in our process
- *
- * this is too expensive to do without per process run queues
- * so skip it for now.
- * XXX keep this comment as a marker.
- */
- if (sched_followon &&
- (p->p_flag & P_HADTHREADS) &&
- (flags & SW_VOL) &&
- newtd == NULL)
- newtd = mumble();
-#endif
if (newtd)
newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);
@@ -896,6 +894,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
} else {
newtd = choosethread();
}
+ MPASS(newtd->td_lock == &sched_lock);
if (td != newtd) {
#ifdef HWPMC_HOOKS
@@ -904,7 +903,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
#endif
/* I feel sleepy */
- cpu_switch(td, newtd);
+ cpu_switch(td, newtd, td->td_lock);
/*
* Where am I? What year is it?
* We are in the same thread that went to sleep above,
@@ -932,12 +931,13 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
#endif
sched_lock.mtx_lock = (uintptr_t)td;
td->td_oncpu = PCPU_GET(cpuid);
+ MPASS(td->td_lock == &sched_lock);
}
void
sched_wakeup(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_slptime > 1) {
updatepri(td);
resetpriority(td);
@@ -1079,7 +1079,7 @@ sched_add(struct thread *td, int flags)
int single_cpu = 0;
ts = td->td_sched;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT((td->td_inhibitors == 0),
("sched_add: trying to run inhibited thread"));
KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
@@ -1089,6 +1089,14 @@ sched_add(struct thread *td, int flags)
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
TD_SET_RUNQ(td);
if (td->td_pinned != 0) {
@@ -1140,7 +1148,7 @@ sched_add(struct thread *td, int flags)
{
struct td_sched *ts;
ts = td->td_sched;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT((td->td_inhibitors == 0),
("sched_add: trying to run inhibited thread"));
KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
@@ -1150,6 +1158,14 @@ sched_add(struct thread *td, int flags)
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
TD_SET_RUNQ(td);
CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
ts->ts_runq = &runq;
@@ -1207,6 +1223,7 @@ sched_choose(void)
struct td_sched *ts;
struct runq *rq;
+ mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
struct td_sched *kecpu;
@@ -1256,10 +1273,10 @@ sched_userret(struct thread *td)
KASSERT((td->td_flags & TDF_BORROWING) == 0,
("thread with borrowed priority returning to userland"));
if (td->td_priority != td->td_user_pri) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_priority = td->td_user_pri;
td->td_base_pri = td->td_user_pri;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
}
@@ -1268,7 +1285,7 @@ sched_bind(struct thread *td, int cpu)
{
struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(TD_IS_RUNNING(td),
("sched_bind: cannot bind non-running thread"));
@@ -1287,25 +1304,26 @@ sched_bind(struct thread *td, int cpu)
void
sched_unbind(struct thread* td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
td->td_sched->ts_flags &= ~TSF_BOUND;
}
int
sched_is_bound(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
return (td->td_sched->ts_flags & TSF_BOUND);
}
void
sched_relinquish(struct thread *td)
{
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (td->td_pri_class == PRI_TIMESHARE)
sched_prio(td, PRI_MAX_TIMESHARE);
+ SCHED_STAT_INC(switch_relinquish);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
int
@@ -1363,5 +1381,57 @@ sched_idletd(void *dummy)
}
}
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ } else {
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ cpu_throw(td, choosethread()); /* doesn't return */
+}
+
+void
+sched_fork_exit(struct thread *ctd)
+{
+ struct thread *td;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ ctd->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)ctd;
+ THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED);
+ /*
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((td = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(td);
+ }
+ thread_unlock(ctd);
+}
+
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
diff --git a/sys/kern/sched_core.c b/sys/kern/sched_core.c
index b0994f8..4cec09b 100644
--- a/sys/kern/sched_core.c
+++ b/sys/kern/sched_core.c
@@ -784,6 +784,7 @@ schedinit(void)
*/
proc0.p_sched = NULL; /* XXX */
thread0.td_sched = &kse0;
+ thread0.td_lock = &sched_lock;
kse0.ts_thread = &thread0;
kse0.ts_slice = 100;
}
@@ -1018,7 +1019,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
- cpu_switch(td, newtd);
+ cpu_switch(td, newtd, td->td_lock);
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
@@ -1110,6 +1111,7 @@ sched_fork_thread(struct thread *td, struct thread *child)
ts = td->td_sched;
ts2 = child->td_sched;
+ child->td_lock = td->td_lock;
ts2->ts_slptime = ts2->ts_slptime * CHILD_WEIGHT / 100;
if (child->td_pri_class == PRI_TIMESHARE)
sched_user_prio(child, sched_calc_pri(ts2));
@@ -1142,7 +1144,8 @@ sched_class(struct thread *td, int class)
void
sched_exit(struct proc *p, struct thread *childtd)
{
- mtx_assert(&sched_lock, MA_OWNED);
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
sched_exit_thread(FIRST_THREAD_IN_PROC(p), childtd);
}
@@ -1747,5 +1750,57 @@ sched_idletd(void *dummy)
}
}
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ } else {
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ cpu_throw(td, choosethread()); /* doesn't return */
+}
+
+void
+sched_fork_exit(struct thread *ctd)
+{
+ struct thread *td;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ ctd->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)ctd;
+ THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED);
+ /*
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((td = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(td);
+ }
+ thread_unlock(ctd);
+}
+
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 4f4cf41..30761fb 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -229,6 +229,7 @@ static int ipi_thresh = PRI_MIN_KERN;
static int steal_htt = 1;
static int steal_busy = 1;
static int busy_thresh = 4;
+static int topology = 0;
/*
* One thread queue per processor.
@@ -434,7 +435,7 @@ tdq_load_add(struct tdq *tdq, struct td_sched *ts)
mtx_assert(&sched_lock, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class);
tdq->tdq_load++;
- CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
+ CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load);
if (class != PRI_ITHD &&
(ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
@@ -997,7 +998,7 @@ sched_setup(void *dummy)
tdq = &tdq_cpu[i];
tdq_setup(&tdq_cpu[i]);
}
- if (1) {
+ if (smp_topology == NULL) {
struct tdq_group *tdg;
struct tdq *tdq;
int cpus;
@@ -1027,6 +1028,7 @@ sched_setup(void *dummy)
struct cpu_group *cg;
int j;
+ topology = 1;
for (i = 0; i < smp_topology->ct_count; i++) {
cg = &smp_topology->ct_group[i];
tdg = &tdq_groups[i];
@@ -1248,6 +1250,7 @@ schedinit(void)
*/
proc0.p_sched = NULL; /* XXX */
thread0.td_sched = &td_sched0;
+ thread0.td_lock = &sched_lock;
td_sched0.ts_ltick = ticks;
td_sched0.ts_ftick = ticks;
td_sched0.ts_thread = &thread0;
@@ -1296,7 +1299,7 @@ sched_thread_priority(struct thread *td, u_char prio)
td, td->td_proc->p_comm, td->td_priority, prio, curthread,
curthread->td_proc->p_comm);
ts = td->td_sched;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority == prio)
return;
@@ -1307,9 +1310,10 @@ sched_thread_priority(struct thread *td, u_char prio)
* queue. This could be optimized to not re-add in some
* cases.
*/
+ MPASS(td->td_lock == &sched_lock);
sched_rem(td);
td->td_priority = prio;
- sched_add(td, SRQ_BORROWING);
+ sched_add(td, SRQ_BORROWING|SRQ_OURSELF);
} else
td->td_priority = prio;
}
@@ -1427,7 +1431,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
struct td_sched *ts;
int preempt;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
preempt = flags & SW_PREEMPT;
tdq = TDQ_SELF();
@@ -1440,24 +1444,33 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
* If the thread has been assigned it may be in the process of switching
* to the new cpu. This is the case in sched_bind().
*/
+ /*
+ * Switch to the sched lock to fix things up and pick
+ * a new thread.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_unlock(td);
+ }
if (TD_IS_IDLETHREAD(td)) {
+ MPASS(td->td_lock == &sched_lock);
TD_SET_CAN_RUN(td);
- } else {
+ } else if (TD_IS_RUNNING(td)) {
+ /*
+ * Don't allow the thread to migrate
+ * from a preemption.
+ */
tdq_load_rem(tdq, ts);
- if (TD_IS_RUNNING(td)) {
- /*
- * Don't allow the thread to migrate
- * from a preemption.
- */
- if (preempt)
- sched_pin_td(td);
- sched_add(td, preempt ?
- SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
- SRQ_OURSELF|SRQ_YIELDING);
- if (preempt)
- sched_unpin_td(td);
- }
- }
+ if (preempt)
+ sched_pin_td(td);
+ sched_add(td, preempt ?
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+ SRQ_OURSELF|SRQ_YIELDING);
+ if (preempt)
+ sched_unpin_td(td);
+ } else
+ tdq_load_rem(tdq, ts);
+ mtx_assert(&sched_lock, MA_OWNED);
if (newtd != NULL) {
/*
* If we bring in a thread account for it as if it had been
@@ -1473,7 +1486,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
- cpu_switch(td, newtd);
+ cpu_switch(td, newtd, td->td_lock);
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
@@ -1481,6 +1494,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
}
sched_lock.mtx_lock = (uintptr_t)td;
td->td_oncpu = PCPU_GET(cpuid);
+ MPASS(td->td_lock == &sched_lock);
}
void
@@ -1489,12 +1503,14 @@ sched_nice(struct proc *p, int nice)
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
p->p_nice = nice;
FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
sched_priority(td);
sched_prio(td, td->td_base_user_pri);
+ thread_unlock(td);
}
}
@@ -1502,7 +1518,7 @@ void
sched_sleep(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
td->td_sched->ts_slptime = ticks;
}
@@ -1513,7 +1529,7 @@ sched_wakeup(struct thread *td)
struct td_sched *ts;
int slptime;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
/*
* If we slept for more than a tick update our interactivity and
@@ -1542,7 +1558,7 @@ sched_wakeup(struct thread *td)
void
sched_fork(struct thread *td, struct thread *child)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
sched_fork_thread(td, child);
/*
* Penalize the parent and child for forking.
@@ -1563,7 +1579,9 @@ sched_fork_thread(struct thread *td, struct thread *child)
/*
* Initialize child.
*/
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
sched_newthread(child);
+ child->td_lock = &sched_lock;
ts = td->td_sched;
ts2 = child->td_sched;
ts2->ts_cpu = ts->ts_cpu;
@@ -1588,7 +1606,7 @@ void
sched_class(struct thread *td, int class)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_pri_class == class)
return;
@@ -1627,6 +1645,7 @@ sched_exit(struct proc *p, struct thread *child)
CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
child, child->td_proc->p_comm, child->td_priority);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
td = FIRST_THREAD_IN_PROC(p);
sched_exit_thread(td, child);
}
@@ -1638,7 +1657,9 @@ sched_exit_thread(struct thread *td, struct thread *child)
CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
child, child->td_proc->p_comm, child->td_priority);
+ thread_lock(child);
tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
+ thread_unlock(child);
#ifdef KSE
/*
* KSE forks and exits so often that this penalty causes short-lived
@@ -1653,9 +1674,11 @@ sched_exit_thread(struct thread *td, struct thread *child)
* sleep time as a penalty to the parent. This causes shells that
* launch expensive things to mark their children as expensive.
*/
+ thread_lock(td);
td->td_sched->skg_runtime += child->td_sched->skg_runtime;
sched_interact_update(td);
sched_priority(td);
+ thread_unlock(td);
}
void
@@ -1673,10 +1696,10 @@ sched_userret(struct thread *td)
KASSERT((td->td_flags & TDF_BORROWING) == 0,
("thread with borrowed priority returning to userland"));
if (td->td_priority != td->td_user_pri) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_priority = td->td_user_pri;
td->td_base_pri = td->td_user_pri;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
}
@@ -1805,9 +1828,22 @@ sched_preempt(struct thread *td)
*/
MPASS(TD_ON_RUNQ(td));
TD_SET_RUNNING(td);
+ MPASS(ctd->td_lock == &sched_lock);
+ MPASS(td->td_lock == &sched_lock);
CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
td->td_proc->p_pid, td->td_proc->p_comm);
+ /*
+ * We enter the switch with two runnable threads that both have
+ * the same lock. When we return td may be sleeping so we need
+ * to switch locks to make sure he's locked correctly.
+ */
+ SCHED_STAT_INC(switch_preempt);
mi_switch(SW_INVOL|SW_PREEMPT, td);
+ spinlock_enter();
+ thread_unlock(ctd);
+ thread_lock(td);
+ spinlock_exit();
+
return (1);
}
@@ -1824,7 +1860,7 @@ sched_add(struct thread *td, int flags)
#endif
ts = td->td_sched;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
@@ -1834,8 +1870,15 @@ sched_add(struct thread *td, int flags)
("sched_add: bad thread state"));
KASSERT(td->td_proc->p_sflag & PS_INMEM,
("sched_add: process swapped out"));
- KASSERT(ts->ts_runq == NULL,
- ("sched_add: thread %p is still assigned to a run queue", td));
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
TD_SET_RUNQ(td);
tdq = TDQ_SELF();
class = PRI_BASE(td->td_pri_class);
@@ -1920,7 +1963,7 @@ sched_rem(struct thread *td)
CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
KASSERT(TD_ON_RUNQ(td),
("sched_rem: thread not on run queue"));
@@ -1942,7 +1985,7 @@ sched_pctcpu(struct thread *td)
if (ts == NULL)
return (0);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (ts->ts_ticks) {
int rtick;
@@ -1952,7 +1995,7 @@ sched_pctcpu(struct thread *td)
pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
}
td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
return (pctcpu);
}
@@ -1962,7 +2005,7 @@ sched_bind(struct thread *td, int cpu)
{
struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
if (ts->ts_flags & TSF_BOUND)
sched_unbind(td);
@@ -1982,7 +2025,7 @@ sched_unbind(struct thread *td)
{
struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
if ((ts->ts_flags & TSF_BOUND) == 0)
return;
@@ -1995,18 +2038,19 @@ sched_unbind(struct thread *td)
int
sched_is_bound(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
return (td->td_sched->ts_flags & TSF_BOUND);
}
void
sched_relinquish(struct thread *td)
{
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (td->td_pri_class == PRI_TIMESHARE)
sched_prio(td, PRI_MAX_TIMESHARE);
+ SCHED_STAT_INC(switch_relinquish);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
int
@@ -2071,6 +2115,58 @@ sched_idletd(void *dummy)
cpu_idle();
}
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ } else {
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ cpu_throw(td, choosethread()); /* doesn't return */
+}
+
+void
+sched_fork_exit(struct thread *ctd)
+{
+ struct thread *td;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ ctd->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)ctd;
+ THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED);
+ /*
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((td = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(td);
+ }
+ thread_unlock(ctd);
+}
+
static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
"Scheduler name");
@@ -2093,6 +2189,7 @@ SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, "");
#endif
/* ps compat */
diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h
index caa1311..d18061a 100644
--- a/sys/sys/mutex.h
+++ b/sys/sys/mutex.h
@@ -125,6 +125,14 @@ void _mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file,
#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
void _mtx_assert(struct mtx *m, int what, const char *file, int line);
#endif
+void _thread_lock_flags(struct thread *, int, const char *, int);
+
+#define thread_lock(tdp) \
+ _thread_lock_flags((tdp), 0, __FILE__, __LINE__)
+#define thread_lock_flags(tdp, opt) \
+ _thread_lock_flags((tdp), (opt), __FILE__, __LINE__)
+#define thread_unlock(tdp) \
+ mtx_unlock_spin(__DEVOLATILE(struct mtx *, (tdp)->td_lock))
/*
* We define our machine-independent (unoptimized) mutex micro-operations
@@ -349,6 +357,7 @@ extern struct mtx_pool *mtxpool_sleep;
*/
extern struct mtx sched_lock;
extern struct mtx Giant;
+extern struct mtx blocked_lock;
/*
* Giant lock manipulation and clean exit macros.
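The thread_lock()/thread_unlock() macros added above work by indirection: every thread carries a pointer to whichever mutex currently protects it, and the schedulers retarget that pointer (thread_lock_set()) when a thread moves onto a run queue. The block below is only a compact userland model of that idea, using pthread mutexes and invented names (thread_model, sched_lock_model); it assumes, as the kernel routine does, that the caller of the retarget step already holds both the old and the new lock:

/* Userland model of a per-thread lock pointer that can be retargeted. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sched_lock_model = PTHREAD_MUTEX_INITIALIZER;

struct thread_model {
	pthread_mutex_t	*td_lock;	/* lock currently protecting this thread */
	pthread_mutex_t	td_own;		/* its private lock */
};

#define	thread_lock(td)		pthread_mutex_lock((td)->td_lock)
#define	thread_unlock(td)	pthread_mutex_unlock((td)->td_lock)

/*
 * Retarget td's lock: the caller holds both the old lock and 'new'; we
 * swap the pointer and drop the old lock (cf. thread_lock_set()).
 */
static void
thread_lock_set(struct thread_model *td, pthread_mutex_t *new)
{
	pthread_mutex_t *old;

	old = td->td_lock;
	td->td_lock = new;
	pthread_mutex_unlock(old);
}

int
main(void)
{
	struct thread_model td;

	pthread_mutex_init(&td.td_own, NULL);
	td.td_lock = &td.td_own;

	thread_lock(&td);			/* locks td_own */
	pthread_mutex_lock(&sched_lock_model);	/* take the new lock first */
	thread_lock_set(&td, &sched_lock_model);/* td now protected by it */
	thread_unlock(&td);			/* unlocks sched_lock_model */
	printf("lock migrated\n");
	return (0);
}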
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index a73d2d5..acde39d 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -134,7 +134,7 @@ struct pargs {
* g - process group mtx
* h - callout_lock mtx
* i - by curproc or the master session mtx
- * j - locked by sched_lock mtx
+ * j - locked by proc slock
* k - only accessed by curthread
* k*- only accessed by curthread and from an interrupt
* l - the attaching proc or attaching proc parent
@@ -144,6 +144,7 @@ struct pargs {
* p - select lock (sellock)
* q - td_contested lock
* r - p_peers lock
+ * t - thread lock
* x - created at fork, only changes during single threading in exec
* z - zombie threads lock
*
@@ -195,32 +196,19 @@ struct mqueue_notifier;
* other than CPU cycles, which are parceled out to the threads.
*/
-/***************
- * Threads are the unit of execution
- With a single run queue used by all processors:
-
- RUNQ: --->THREAD---THREAD--... SLEEPQ:[]---THREAD---THREAD---THREAD
- []---THREAD
- []
- []---THREAD---THREAD
-
-With PER-CPU run queues:
-it gets more complicated.
- *
- *****************/
-
/*
* Kernel runnable context (thread).
* This is what is put to sleep and reactivated.
* Thread context. Processes may have multiple threads.
*/
struct thread {
+ volatile struct mtx *td_lock; /* replaces sched lock */
struct proc *td_proc; /* (*) Associated process. */
TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */
/* The two queues below should someday be merged. */
- TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. */
- TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. */
+ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */
+ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */
TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */
struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
@@ -232,20 +220,20 @@ struct thread {
/* Cleared during fork1() or thread_schedule_upcall(). */
#define td_startzero td_flags
- int td_flags; /* (j) TDF_* flags. */
- int td_inhibitors; /* (j) Why can not run. */
+ int td_flags; /* (t) TDF_* flags. */
+ int td_inhibitors; /* (t) Why can not run. */
int td_pflags; /* (k) Private thread (TDP_*) flags. */
int td_dupfd; /* (k) Ret value from fdopen. XXX */
- int td_sqqueue; /* (j) Sleepqueue queue blocked on. */
- void *td_wchan; /* (j) Sleep address. */
- const char *td_wmesg; /* (j) Reason for sleep. */
- u_char td_lastcpu; /* (j) Last cpu we were on. */
- u_char td_oncpu; /* (j) Which cpu we are on. */
+ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */
+ void *td_wchan; /* (t) Sleep address. */
+ const char *td_wmesg; /* (t) Reason for sleep. */
+ u_char td_lastcpu; /* (t) Last cpu we were on. */
+ u_char td_oncpu; /* (t) Which cpu we are on. */
volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */
short td_locks; /* (k) Count of non-spin locks. */
- u_char td_tsqueue; /* (j) Turnstile queue blocked on. */
- struct turnstile *td_blocked; /* (j) Lock thread is blocked on. */
- const char *td_lockname; /* (j) Name of lock blocked on. */
+ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */
+ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */
+ const char *td_lockname; /* (t) Name of lock blocked on. */
LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */
struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */
int td_intr_nesting_level; /* (k) Interrupt recursion. */
@@ -253,18 +241,18 @@ struct thread {
struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */
struct ucred *td_ucred; /* (k) Reference to credentials. */
struct thread *td_standin; /* (k + a) Use this for an upcall. */
- struct kse_upcall *td_upcall; /* (k + j) Upcall structure. */
- u_int td_estcpu; /* (j) Sum of the same field in KSEs. */
- u_int td_slptime; /* (j) How long completely blocked. */
- struct rusage td_ru; /* (j) rusage information */
- uint64_t td_runtime; /* (j) How many cpu ticks we've run. */
- u_int td_pticks; /* (j) Statclock hits for profiling */
- u_int td_sticks; /* (j) Statclock hits in system mode. */
- u_int td_iticks; /* (j) Statclock hits in intr mode. */
- u_int td_uticks; /* (j) Statclock hits in user mode. */
+ struct kse_upcall *td_upcall; /* (k + t) Upcall structure. */
+ u_int td_estcpu; /* (t) estimated cpu utilization */
+ u_int td_slptime; /* (t) How long completely blocked. */
+ struct rusage td_ru; /* (t) rusage information */
+ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */
+ u_int td_pticks; /* (t) Statclock hits for profiling */
+ u_int td_sticks; /* (t) Statclock hits in system mode. */
+ u_int td_iticks; /* (t) Statclock hits in intr mode. */
+ u_int td_uticks; /* (t) Statclock hits in user mode. */
u_int td_uuticks; /* (k) Statclock hits (usr), for UTS. */
u_int td_usticks; /* (k) Statclock hits (sys), for UTS. */
- int td_intrval; /* (j) Return value of TDF_INTERRUPT. */
+ int td_intrval; /* (t) Return value of TDF_INTERRUPT. */
sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */
sigset_t td_sigmask; /* (c) Current signal mask. */
volatile u_int td_generation; /* (k) For detection of preemption */
@@ -278,11 +266,11 @@ struct thread {
/* Copied during fork1() or thread_sched_upcall(). */
#define td_startcopy td_endzero
- u_char td_base_pri; /* (j) Thread base kernel priority. */
- u_char td_priority; /* (j) Thread active priority. */
- u_char td_pri_class; /* (j) Scheduling class. */
- u_char td_user_pri; /* (j) User pri from estcpu and nice. */
- u_char td_base_user_pri; /* (j) Base user pri */
+ u_char td_base_pri; /* (t) Thread base kernel priority. */
+ u_char td_priority; /* (t) Thread active priority. */
+ u_char td_pri_class; /* (t) Scheduling class. */
+ u_char td_user_pri; /* (t) User pri from estcpu and nice. */
+ u_char td_base_user_pri; /* (t) Base user pri */
#define td_endcopy td_pcb
/*
@@ -296,7 +284,7 @@ struct thread {
TDS_CAN_RUN,
TDS_RUNQ,
TDS_RUNNING
- } td_state;
+ } td_state; /* (t) thread state */
register_t td_retval[2]; /* (k) Syscall aux returns. */
struct callout td_slpcallout; /* (h) Callout for sleep. */
struct trapframe *td_frame; /* (k) */
@@ -313,6 +301,16 @@ struct thread {
int td_syscalls; /* per-thread syscall count (used by NFS :)) */
};
+struct mtx *thread_lock_block(struct thread *);
+void thread_lock_unblock(struct thread *, struct mtx *);
+void thread_lock_set(struct thread *, struct mtx *);
+#define THREAD_LOCK_ASSERT(td, type) \
+do { \
+ struct mtx *__m = __DEVOLATILE(struct mtx *, (td)->td_lock); \
+ if (__m != &blocked_lock) \
+ mtx_assert(__m, (type)); \
+} while (0)
+
/*
* Flags kept in td_flags:
* To change these you MUST have the scheduler lock.
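THREAD_LOCK_ASSERT() above deliberately skips the ownership check when td_lock points at blocked_lock, the transient value used while a thread is handed between queues. A small illustration of that sentinel-and-assert pattern follows; the wrapper type, the owner field, and plain assert() standing in for mtx_assert() are all assumptions for the sketch, not kernel interfaces:

/* Sketch of asserting ownership only when the lock pointer is "real". */
#include <assert.h>
#include <pthread.h>

struct lock_model {
	pthread_mutex_t	mtx;
	pthread_t	owner;		/* valid only while held */
	int		held;
};

/* Sentinel: a thread in transit points here instead of a real lock. */
static struct lock_model blocked_lock_model;

#define	LOCK_ASSERT_OWNED(lk)						\
	do {								\
		if ((lk) != &blocked_lock_model)			\
			assert((lk)->held &&				\
			    pthread_equal((lk)->owner, pthread_self()));\
	} while (0)

static void
lock_acquire(struct lock_model *lk)
{
	pthread_mutex_lock(&lk->mtx);
	lk->owner = pthread_self();
	lk->held = 1;
}

static void
lock_release(struct lock_model *lk)
{
	lk->held = 0;
	pthread_mutex_unlock(&lk->mtx);
}

int
main(void)
{
	struct lock_model lk;
	struct lock_model *td_lock = &lk;

	pthread_mutex_init(&lk.mtx, NULL);
	lk.held = 0;

	lock_acquire(td_lock);
	LOCK_ASSERT_OWNED(td_lock);		/* real lock: checked */
	lock_release(td_lock);

	td_lock = &blocked_lock_model;		/* thread "in transit" */
	LOCK_ASSERT_OWNED(td_lock);		/* sentinel: check skipped */
	return (0);
}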
@@ -324,22 +322,22 @@ struct thread {
#define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */
#define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */
#define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */
-#define TDF_TSNOBLOCK 0x00000100 /* Don't block on a turnstile due to race. */
+#define TDF_UNUSEDx100 0x00000100 /* --available-- */
#define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */
#define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */
#define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */
#define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */
#define TDF_INTERRUPT 0x00002000 /* Thread is marked as interrupted. */
#define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */
-#define TDF_UNUSED15 0x00008000 /* --available -- */
+#define TDF_UNUSED15 0x00008000 /* --available-- */
#define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */
#define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */
#define TDF_XSIG 0x00040000 /* Thread is exchanging signal under trace */
#define TDF_UNUSED19 0x00080000 /* Thread is sleeping on a umtx. */
#define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */
#define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */
-#define TDF_UNUSED22 0x00400000 /* --available -- */
-#define TDF_UNUSED23 0x00800000 /* --available -- */
+#define TDF_UNUSED22 0x00400000 /* --available-- */
+#define TDF_UNUSED23 0x00800000 /* --available-- */
#define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */
#define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */
#define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */
@@ -482,7 +480,8 @@ struct rusage_ext {
*/
struct proc {
LIST_ENTRY(proc) p_list; /* (d) List of all processes. */
- TAILQ_HEAD(, thread) p_threads; /* (j)(td_plist) Threads. (shortcut) */
+ TAILQ_HEAD(, thread) p_threads; /* (j) all threads. */
+ struct mtx p_slock; /* process spin lock */
struct ucred *p_ucred; /* (c) Process owner's identity. */
struct filedesc *p_fd; /* (b) Open files. */
struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */
@@ -491,7 +490,7 @@ struct proc {
struct plimit *p_limit; /* (c) Process limits. */
struct callout p_limco; /* (c) Limit callout handle */
struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */
- TAILQ_HEAD(, kse_upcall) p_upcalls; /* All upcalls in the proc. */
+ TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */
/*
* The following don't make too much sense.
@@ -504,7 +503,6 @@ struct proc {
PRS_NORMAL, /* threads can be run. */
PRS_ZOMBIE
} p_state; /* (j/c) S* process status. */
-
pid_t p_pid; /* (b) Process identifier. */
LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */
LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */
@@ -542,14 +540,12 @@ struct proc {
struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */
struct kaioinfo *p_aioinfo; /* (c) ASYNC I/O info. */
struct thread *p_singlethread;/* (c + j) If single threading this is it */
- int p_suspcount; /* (c) Num threads in suspended mode. */
+ int p_suspcount; /* (j) Num threads in suspended mode. */
struct thread *p_xthread; /* (c) Trap thread */
int p_boundary_count;/* (c) Num threads at user boundary */
int p_pendingcnt; /* how many signals are pending */
struct itimers *p_itimers; /* (c) POSIX interval timers. */
/* from ksegrp */
- u_int p_estcpu; /* (j) Sum of the field in threads. */
- u_int p_slptime; /* (j) How long completely blocked. */
int p_numupcalls; /* (j) Num upcalls. */
int p_upsleeps; /* (c) Num threads in kse_release(). */
struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */
@@ -592,6 +588,9 @@ struct proc {
#define NOCPU 0xff /* For when we aren't on a CPU. */
+#define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock)
+#define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock)
+#define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type))
/* These flags are kept in p_flag. */
#define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */
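The PROC_SLOCK()/PROC_SUNLOCK() macros added above simply wrap a spin mutex embedded in struct proc. The equivalent shape in a userland model (assuming POSIX spin locks, which are not the kernel primitive, and an invented proc_model type) would be:

/* Userland shape of the per-process spin-lock wrappers. */
#include <pthread.h>
#include <stdio.h>

struct proc_model {
	pthread_spinlock_t	p_slock;	/* protects scheduling state */
	int			p_suspcount;	/* example field guarded by it */
};

#define	PROC_SLOCK(p)	pthread_spin_lock(&(p)->p_slock)
#define	PROC_SUNLOCK(p)	pthread_spin_unlock(&(p)->p_slock)

int
main(void)
{
	struct proc_model p;

	pthread_spin_init(&p.p_slock, PTHREAD_PROCESS_PRIVATE);
	p.p_suspcount = 0;

	PROC_SLOCK(&p);
	p.p_suspcount++;	/* (j)-class fields change under the slock */
	PROC_SUNLOCK(&p);

	printf("suspcount=%d\n", p.p_suspcount);
	pthread_spin_destroy(&p.p_slock);
	return (0);
}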
@@ -626,7 +625,7 @@ struct proc {
#define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
#define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED)
-/* These flags are kept in p_sflag and are protected with sched_lock. */
+/* These flags are kept in p_sflag and are protected with proc slock. */
#define PS_INMEM 0x00001 /* Loaded into memory. */
#define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */
#define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */
@@ -861,8 +860,8 @@ void stopevent(struct proc *, u_int, u_int);
void threadinit(void);
void cpu_idle(void);
extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */
-void cpu_switch(struct thread *old, struct thread *new);
-void cpu_throw(struct thread *old, struct thread *new) __dead2;
+void cpu_switch(struct thread *, struct thread *, struct mtx *);
+void cpu_throw(struct thread *, struct thread *) __dead2;
void unsleep(struct thread *);
void userret(struct thread *, struct trapframe *);
@@ -872,6 +871,7 @@ void cpu_fork(struct thread *, struct proc *, struct thread *, int);
void cpu_set_fork_handler(struct thread *, void (*)(void *), void *);
/* New in KSE. */
+void kse_unlink(struct thread *);
void kse_GC(void);
void kseinit(void);
void cpu_set_upcall(struct thread *td, struct thread *td0);
@@ -900,6 +900,7 @@ void childproc_stopped(struct proc *child, int reason);
void childproc_continued(struct proc *child);
void childproc_exited(struct proc *child);
int thread_suspend_check(int how);
+void thread_suspend_switch(struct thread *);
void thread_suspend_one(struct thread *td);
struct thread *thread_switchout(struct thread *td, int flags,
struct thread *newtd);
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
index 1342906..0dcf369 100644
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -81,6 +81,7 @@ int sched_runnable(void);
*/
void sched_exit(struct proc *p, struct thread *childtd);
void sched_fork(struct thread *td, struct thread *childtd);
+void sched_fork_exit(struct thread *td);
/*
* KSE Groups contain scheduling priority information. They record the
@@ -101,6 +102,7 @@ fixpt_t sched_pctcpu(struct thread *td);
void sched_prio(struct thread *td, u_char prio);
void sched_sleep(struct thread *td);
void sched_switch(struct thread *td, struct thread *newtd, int flags);
+void sched_throw(struct thread *td);
void sched_unlend_prio(struct thread *td, u_char prio);
void sched_unlend_user_prio(struct thread *td, u_char pri);
void sched_user_prio(struct thread *td, u_char prio);
@@ -155,6 +157,19 @@ sched_unpin(void)
#define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */
#define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */
+/* Switch stats. */
+#ifdef SCHED_STATS
+extern long switch_preempt;
+extern long switch_owepreempt;
+extern long switch_turnstile;
+extern long switch_sleepq;
+extern long switch_sleepqtimo;
+extern long switch_relinquish;
+extern long switch_needresched;
+#define SCHED_STAT_INC(var) atomic_add_long(&(var), 1)
+#else
+#define SCHED_STAT_INC(var)
+#endif
/* temporarily here */
void schedinit(void);
@@ -162,7 +177,6 @@ void sched_init_concurrency(struct proc *p);
void sched_set_concurrency(struct proc *p, int cuncurrency);
void sched_schedinit(void);
void sched_newproc(struct proc *p, struct thread *td);
-void sched_thread_exit(struct thread *td);
void sched_newthread(struct thread *td);
#endif /* _KERNEL */