author     jeff <jeff@FreeBSD.org>    2007-06-04 23:50:30 +0000
committer  jeff <jeff@FreeBSD.org>    2007-06-04 23:50:30 +0000
commit     186ae07cb61840670b6b7bc387b690bef2c2e262
tree       e1f8264072afbc05d59439c37c9d1a06178296ad /sys
parent     9bd4fdf7ce811d83f0305cacc5990ec339df9f13
Commit 1/14 of sched_lock decomposition.
- Move all scheduler locking into the schedulers, using a technique similar
  to Solaris's container locking.
- A per-process spinlock is now used to protect the queue of threads,
  thread count, suspension count, p_sflags, and other process-related
  scheduling fields.
- The new thread lock is actually a pointer to the spinlock of the
  container that currently owns the thread.  The container may be a
  turnstile, sleepqueue, or run queue.  (A minimal sketch of this pattern
  follows the list.)
- thread_lock() is now used to protect access to thread-related scheduling
  fields.  thread_unlock() releases the lock and thread_lock_set()
  implements the transition from one lock to another.
- A new "blocked_lock" is used in cases where it is not safe to hold the
  actual thread's lock yet we must still prevent access to the thread.
- sched_throw() and sched_fork_exit() are introduced to allow the
  schedulers to fix up locking at these points.
- Add some minor infrastructure for optionally exporting scheduler
  statistics that were invaluable in solving performance problems with
  this patch.  Generally these statistics allow you to differentiate
  between different causes of context switches.

Tested by:      kris, current@
Tested on:      i386, amd64, ULE, 4BSD, libthr, libkse, PREEMPTION, etc.
Discussed with: kris, attilio, kmacy, jhb, julian, bde (small parts each)
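For readers unfamiliar with the container-locking idea, the userspace sketch
below illustrates the per-thread lock-pointer pattern described above.  It is
not the FreeBSD KPI: toy_thread, toy_thread_lock(), toy_thread_lock_set(), and
the two container locks are hypothetical names, pthread mutexes stand in for
kernel spin locks, and the real thread_lock_set() additionally deals with
blocked_lock and spinlock nesting.

/*
 * Minimal sketch (assumed names, userspace): each thread carries a pointer
 * to the lock of the container that currently owns it, and that pointer is
 * retargeted as the thread moves between containers.
 * Build with: cc -o toy toy.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>

struct toy_thread {
	pthread_mutex_t	*lock;	/* lock of the owning container */
	int		 prio;
};

/* Two "containers", e.g. a run queue and a sleep queue, each with a lock. */
static pthread_mutex_t runq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sleepq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Analogue of thread_lock(): lock whatever currently protects the thread. */
static void
toy_thread_lock(struct toy_thread *td)
{

	pthread_mutex_lock(td->lock);
}

/* Analogue of thread_unlock(). */
static void
toy_thread_unlock(struct toy_thread *td)
{

	pthread_mutex_unlock(td->lock);
}

/*
 * Analogue of thread_lock_set(): retarget the thread's lock pointer to a
 * new container's lock.  Both the old and the new lock are held on entry;
 * the old one is dropped only after the pointer has been switched, so
 * there is no window in which the thread is unprotected.
 */
static void
toy_thread_lock_set(struct toy_thread *td, pthread_mutex_t *newlock)
{
	pthread_mutex_t *old;

	old = td->lock;
	td->lock = newlock;
	pthread_mutex_unlock(old);
}

int
main(void)
{
	struct toy_thread td = { .lock = &sleepq_lock, .prio = 120 };

	/* "Wake" the thread: lock it through its current (sleep queue) lock. */
	toy_thread_lock(&td);
	/* Hand it to the run queue, as sched_add() does with sched_lock. */
	pthread_mutex_lock(&runq_lock);
	toy_thread_lock_set(&td, &runq_lock);
	td.prio = 100;			/* now protected by runq_lock */
	toy_thread_unlock(&td);		/* releases runq_lock */
	printf("prio %d is now protected by the run queue lock\n", td.prio);
	return (0);
}

Because the lock is always reached through the thread, whichever subsystem
currently owns the thread can protect it with its own spin lock, which is
the property that lets the rest of this decomposition series move away from
a single global sched_lock.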
Diffstat (limited to 'sys')
-rw-r--r--  sys/kern/kern_switch.c    92
-rw-r--r--  sys/kern/sched_4bsd.c    160
-rw-r--r--  sys/kern/sched_core.c     59
-rw-r--r--  sys/kern/sched_ule.c     175
-rw-r--r--  sys/sys/mutex.h            9
-rw-r--r--  sys/sys/proc.h           113
-rw-r--r--  sys/sys/sched.h           16
7 files changed, 460 insertions, 164 deletions
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
index 1ccf64c..13bba12 100644
--- a/sys/kern/kern_switch.c
+++ b/sys/kern/kern_switch.c
@@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#endif
+#include <machine/cpu.h>
+
/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define KTR_CRITICAL KTR_SCHED
@@ -77,6 +79,49 @@ static int kern_sched_preemption = 0;
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
&kern_sched_preemption, 0, "Kernel preemption enabled");
+#ifdef SCHED_STATS
+long switch_preempt;
+long switch_owepreempt;
+long switch_turnstile;
+long switch_sleepq;
+long switch_sleepqtimo;
+long switch_relinquish;
+long switch_needresched;
+static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, "");
+static int
+sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int val;
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val == 0)
+ return (0);
+ switch_preempt = 0;
+ switch_owepreempt = 0;
+ switch_turnstile = 0;
+ switch_sleepq = 0;
+ switch_sleepqtimo = 0;
+ switch_relinquish = 0;
+ switch_needresched = 0;
+
+ return (0);
+}
+
+SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
+ 0, sysctl_stats_reset, "I", "Reset scheduler statistics");
+#endif
+
/************************************************************************
* Functions that manipulate runnability from a thread perspective. *
************************************************************************/
@@ -142,13 +187,13 @@ critical_exit(void)
#ifdef PREEMPTION
if (td->td_critnest == 1) {
td->td_critnest = 0;
- mtx_assert(&sched_lock, MA_NOTOWNED);
if (td->td_owepreempt) {
td->td_critnest = 1;
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_critnest--;
+ SCHED_STAT_INC(switch_owepreempt);
mi_switch(SW_INVOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
} else
#endif
@@ -173,7 +218,6 @@ maybe_preempt(struct thread *td)
int cpri, pri;
#endif
- mtx_assert(&sched_lock, MA_OWNED);
#ifdef PREEMPTION
/*
* The new thread should not preempt the current thread if any of the
@@ -199,6 +243,7 @@ maybe_preempt(struct thread *td)
* to the new thread.
*/
ctd = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT ((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd),
("thread has no (or wrong) sched-private part."));
KASSERT((td->td_inhibitors == 0),
@@ -219,15 +264,25 @@ maybe_preempt(struct thread *td)
ctd->td_owepreempt = 1;
return (0);
}
-
/*
* Thread is runnable but not yet put on system run queue.
*/
+ MPASS(ctd->td_lock == &sched_lock);
+ MPASS(td->td_lock == &sched_lock);
MPASS(TD_ON_RUNQ(td));
TD_SET_RUNNING(td);
CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
td->td_proc->p_pid, td->td_proc->p_comm);
+ SCHED_STAT_INC(switch_preempt);
mi_switch(SW_INVOL|SW_PREEMPT, td);
+ /*
+ * td's lock pointer may have changed. We have to return with it
+ * locked.
+ */
+ spinlock_enter();
+ thread_unlock(ctd);
+ thread_lock(td);
+ spinlock_exit();
return (1);
#else
return (0);
@@ -442,7 +497,6 @@ runq_choose(struct runq *rq)
struct td_sched *ts;
int pri;
- mtx_assert(&sched_lock, MA_OWNED);
while ((pri = runq_findbit(rq)) != -1) {
rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
@@ -484,7 +538,6 @@ runq_choose_from(struct runq *rq, u_char idx)
struct td_sched *ts;
int pri;
- mtx_assert(&sched_lock, MA_OWNED);
if ((pri = runq_findbit_from(rq, idx)) != -1) {
rqh = &rq->rq_queues[pri];
ts = TAILQ_FIRST(rqh);
@@ -519,9 +572,20 @@ runq_remove_idx(struct runq *rq, struct td_sched *ts, u_char *idx)
KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM,
("runq_remove_idx: process swapped out"));
pri = ts->ts_rqindex;
+ KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
rqh = &rq->rq_queues[pri];
CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p",
ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
+ {
+ struct td_sched *nts;
+
+ TAILQ_FOREACH(nts, rqh, ts_procq)
+ if (nts == ts)
+ break;
+ if (ts != nts)
+ panic("runq_remove_idx: ts %p not on rqindex %d",
+ ts, pri);
+ }
TAILQ_REMOVE(rqh, ts, ts_procq);
if (TAILQ_EMPTY(rqh)) {
CTR0(KTR_RUNQ, "runq_remove_idx: empty");
@@ -589,18 +653,4 @@ sched_set_concurrency(struct proc *p, int concurrency)
{
}
-/*
- * Called from thread_exit() for all exiting thread
- *
- * Not to be confused with sched_exit_thread()
- * that is only called from thread_exit() for threads exiting
- * without the rest of the process exiting because it is also called from
- * sched_exit() and we wouldn't want to call it twice.
- * XXX This can probably be fixed.
- */
-void
-sched_thread_exit(struct thread *td)
-{
-}
-
#endif /* KERN_SWITCH_INCLUDE */
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index 66952ec..a4b1e08 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -248,7 +248,7 @@ static void
maybe_resched(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority < curthread->td_priority)
curthread->td_flags |= TDF_NEEDRESCHED;
}
@@ -377,10 +377,7 @@ schedcpu(void)
realstathz = stathz ? stathz : hz;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
- /*
- * Prevent state changes and protect run queue.
- */
- mtx_lock_spin(&sched_lock);
+ PROC_SLOCK(p);
/*
* Increment time in/out of memory. We ignore overflow; with
* 16-bit int's (remember them?) overflow takes 45 days.
@@ -388,6 +385,7 @@ schedcpu(void)
p->p_swtime++;
FOREACH_THREAD_IN_PROC(p, td) {
awake = 0;
+ thread_lock(td);
ts = td->td_sched;
/*
* Increment sleep time (if sleeping). We
@@ -456,13 +454,16 @@ XXX this is broken
td->td_slptime = 0;
} else
td->td_slptime++;
- if (td->td_slptime > 1)
+ if (td->td_slptime > 1) {
+ thread_unlock(td);
continue;
+ }
td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
resetpriority(td);
resetpriority_thread(td);
+ thread_unlock(td);
} /* end of thread loop */
- mtx_unlock_spin(&sched_lock);
+ PROC_SUNLOCK(p);
} /* end of process loop */
sx_sunlock(&allproc_lock);
}
@@ -575,6 +576,7 @@ schedinit(void)
*/
proc0.p_sched = NULL; /* XXX */
thread0.td_sched = &td_sched0;
+ thread0.td_lock = &sched_lock;
td_sched0.ts_thread = &thread0;
}
@@ -615,7 +617,7 @@ sched_clock(struct thread *td)
{
struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
ts->ts_cpticks++;
@@ -635,22 +637,23 @@ sched_exit(struct proc *p, struct thread *td)
CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
td, td->td_proc->p_comm, td->td_priority);
-
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
}
void
sched_exit_thread(struct thread *td, struct thread *child)
{
- struct proc *childproc = child->td_proc;
CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
- child, childproc->p_comm, child->td_priority);
+ child, child->td_proc->p_comm, child->td_priority);
+ thread_lock(td);
td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
- childproc->p_estcpu = ESTCPULIM(childproc->p_estcpu +
- child->td_estcpu);
+ thread_unlock(td);
+ mtx_lock_spin(&sched_lock);
if ((child->td_proc->p_flag & P_NOLOAD) == 0)
sched_load_rem();
+ mtx_unlock_spin(&sched_lock);
}
void
@@ -663,6 +666,7 @@ void
sched_fork_thread(struct thread *td, struct thread *childtd)
{
childtd->td_estcpu = td->td_estcpu;
+ childtd->td_lock = &sched_lock;
sched_newthread(childtd);
}
@@ -672,18 +676,20 @@ sched_nice(struct proc *p, int nice)
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
p->p_nice = nice;
FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
resetpriority(td);
resetpriority_thread(td);
+ thread_unlock(td);
}
}
void
sched_class(struct thread *td, int class)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
td->td_pri_class = class;
}
@@ -697,7 +703,7 @@ sched_priority(struct thread *td, u_char prio)
td, td->td_proc->p_comm, td->td_priority, prio, curthread,
curthread->td_proc->p_comm);
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority == prio)
return;
td->td_priority = prio;
@@ -818,7 +824,7 @@ void
sched_sleep(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
td->td_slptime = 0;
}
@@ -831,26 +837,18 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
ts = td->td_sched;
p = td->td_proc;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * Switch to the sched lock to fix things up and pick
+ * a new thread.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_unlock(td);
+ }
if ((p->p_flag & P_NOLOAD) == 0)
sched_load_rem();
-#if 0
- /*
- * We are volunteering to switch out so we get to nominate
- * a successor for the rest of our quantum
- * First try another thread in our process
- *
- * this is too expensive to do without per process run queues
- * so skip it for now.
- * XXX keep this comment as a marker.
- */
- if (sched_followon &&
- (p->p_flag & P_HADTHREADS) &&
- (flags & SW_VOL) &&
- newtd == NULL)
- newtd = mumble();
-#endif
if (newtd)
newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);
@@ -896,6 +894,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
} else {
newtd = choosethread();
}
+ MPASS(newtd->td_lock == &sched_lock);
if (td != newtd) {
#ifdef HWPMC_HOOKS
@@ -904,7 +903,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
#endif
/* I feel sleepy */
- cpu_switch(td, newtd);
+ cpu_switch(td, newtd, td->td_lock);
/*
* Where am I? What year is it?
* We are in the same thread that went to sleep above,
@@ -932,12 +931,13 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
#endif
sched_lock.mtx_lock = (uintptr_t)td;
td->td_oncpu = PCPU_GET(cpuid);
+ MPASS(td->td_lock == &sched_lock);
}
void
sched_wakeup(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_slptime > 1) {
updatepri(td);
resetpriority(td);
@@ -1079,7 +1079,7 @@ sched_add(struct thread *td, int flags)
int single_cpu = 0;
ts = td->td_sched;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT((td->td_inhibitors == 0),
("sched_add: trying to run inhibited thread"));
KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
@@ -1089,6 +1089,14 @@ sched_add(struct thread *td, int flags)
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
TD_SET_RUNQ(td);
if (td->td_pinned != 0) {
@@ -1140,7 +1148,7 @@ sched_add(struct thread *td, int flags)
{
struct td_sched *ts;
ts = td->td_sched;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT((td->td_inhibitors == 0),
("sched_add: trying to run inhibited thread"));
KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
@@ -1150,6 +1158,14 @@ sched_add(struct thread *td, int flags)
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
TD_SET_RUNQ(td);
CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
ts->ts_runq = &runq;
@@ -1207,6 +1223,7 @@ sched_choose(void)
struct td_sched *ts;
struct runq *rq;
+ mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
struct td_sched *kecpu;
@@ -1256,10 +1273,10 @@ sched_userret(struct thread *td)
KASSERT((td->td_flags & TDF_BORROWING) == 0,
("thread with borrowed priority returning to userland"));
if (td->td_priority != td->td_user_pri) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_priority = td->td_user_pri;
td->td_base_pri = td->td_user_pri;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
}
@@ -1268,7 +1285,7 @@ sched_bind(struct thread *td, int cpu)
{
struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(TD_IS_RUNNING(td),
("sched_bind: cannot bind non-running thread"));
@@ -1287,25 +1304,26 @@ sched_bind(struct thread *td, int cpu)
void
sched_unbind(struct thread* td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
td->td_sched->ts_flags &= ~TSF_BOUND;
}
int
sched_is_bound(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
return (td->td_sched->ts_flags & TSF_BOUND);
}
void
sched_relinquish(struct thread *td)
{
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (td->td_pri_class == PRI_TIMESHARE)
sched_prio(td, PRI_MAX_TIMESHARE);
+ SCHED_STAT_INC(switch_relinquish);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
int
@@ -1363,5 +1381,57 @@ sched_idletd(void *dummy)
}
}
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ } else {
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ cpu_throw(td, choosethread()); /* doesn't return */
+}
+
+void
+sched_fork_exit(struct thread *ctd)
+{
+ struct thread *td;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ ctd->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)ctd;
+ THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED);
+ /*
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((td = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(td);
+ }
+ thread_unlock(ctd);
+}
+
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
diff --git a/sys/kern/sched_core.c b/sys/kern/sched_core.c
index b0994f8..4cec09b 100644
--- a/sys/kern/sched_core.c
+++ b/sys/kern/sched_core.c
@@ -784,6 +784,7 @@ schedinit(void)
*/
proc0.p_sched = NULL; /* XXX */
thread0.td_sched = &kse0;
+ thread0.td_lock = &sched_lock;
kse0.ts_thread = &thread0;
kse0.ts_slice = 100;
}
@@ -1018,7 +1019,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
- cpu_switch(td, newtd);
+ cpu_switch(td, newtd, td->td_lock);
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
@@ -1110,6 +1111,7 @@ sched_fork_thread(struct thread *td, struct thread *child)
ts = td->td_sched;
ts2 = child->td_sched;
+ child->td_lock = td->td_lock;
ts2->ts_slptime = ts2->ts_slptime * CHILD_WEIGHT / 100;
if (child->td_pri_class == PRI_TIMESHARE)
sched_user_prio(child, sched_calc_pri(ts2));
@@ -1142,7 +1144,8 @@ sched_class(struct thread *td, int class)
void
sched_exit(struct proc *p, struct thread *childtd)
{
- mtx_assert(&sched_lock, MA_OWNED);
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
sched_exit_thread(FIRST_THREAD_IN_PROC(p), childtd);
}
@@ -1747,5 +1750,57 @@ sched_idletd(void *dummy)
}
}
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ } else {
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ cpu_throw(td, choosethread()); /* doesn't return */
+}
+
+void
+sched_fork_exit(struct thread *ctd)
+{
+ struct thread *td;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ ctd->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)ctd;
+ THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED);
+ /*
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((td = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(td);
+ }
+ thread_unlock(ctd);
+}
+
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 4f4cf41..30761fb 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -229,6 +229,7 @@ static int ipi_thresh = PRI_MIN_KERN;
static int steal_htt = 1;
static int steal_busy = 1;
static int busy_thresh = 4;
+static int topology = 0;
/*
* One thread queue per processor.
@@ -434,7 +435,7 @@ tdq_load_add(struct tdq *tdq, struct td_sched *ts)
mtx_assert(&sched_lock, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class);
tdq->tdq_load++;
- CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
+ CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load);
if (class != PRI_ITHD &&
(ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
@@ -997,7 +998,7 @@ sched_setup(void *dummy)
tdq = &tdq_cpu[i];
tdq_setup(&tdq_cpu[i]);
}
- if (1) {
+ if (smp_topology == NULL) {
struct tdq_group *tdg;
struct tdq *tdq;
int cpus;
@@ -1027,6 +1028,7 @@ sched_setup(void *dummy)
struct cpu_group *cg;
int j;
+ topology = 1;
for (i = 0; i < smp_topology->ct_count; i++) {
cg = &smp_topology->ct_group[i];
tdg = &tdq_groups[i];
@@ -1248,6 +1250,7 @@ schedinit(void)
*/
proc0.p_sched = NULL; /* XXX */
thread0.td_sched = &td_sched0;
+ thread0.td_lock = &sched_lock;
td_sched0.ts_ltick = ticks;
td_sched0.ts_ftick = ticks;
td_sched0.ts_thread = &thread0;
@@ -1296,7 +1299,7 @@ sched_thread_priority(struct thread *td, u_char prio)
td, td->td_proc->p_comm, td->td_priority, prio, curthread,
curthread->td_proc->p_comm);
ts = td->td_sched;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_priority == prio)
return;
@@ -1307,9 +1310,10 @@ sched_thread_priority(struct thread *td, u_char prio)
* queue. This could be optimized to not re-add in some
* cases.
*/
+ MPASS(td->td_lock == &sched_lock);
sched_rem(td);
td->td_priority = prio;
- sched_add(td, SRQ_BORROWING);
+ sched_add(td, SRQ_BORROWING|SRQ_OURSELF);
} else
td->td_priority = prio;
}
@@ -1427,7 +1431,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
struct td_sched *ts;
int preempt;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
preempt = flags & SW_PREEMPT;
tdq = TDQ_SELF();
@@ -1440,24 +1444,33 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
* If the thread has been assigned it may be in the process of switching
* to the new cpu. This is the case in sched_bind().
*/
+ /*
+ * Switch to the sched lock to fix things up and pick
+ * a new thread.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_unlock(td);
+ }
if (TD_IS_IDLETHREAD(td)) {
+ MPASS(td->td_lock == &sched_lock);
TD_SET_CAN_RUN(td);
- } else {
+ } else if (TD_IS_RUNNING(td)) {
+ /*
+ * Don't allow the thread to migrate
+ * from a preemption.
+ */
tdq_load_rem(tdq, ts);
- if (TD_IS_RUNNING(td)) {
- /*
- * Don't allow the thread to migrate
- * from a preemption.
- */
- if (preempt)
- sched_pin_td(td);
- sched_add(td, preempt ?
- SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
- SRQ_OURSELF|SRQ_YIELDING);
- if (preempt)
- sched_unpin_td(td);
- }
- }
+ if (preempt)
+ sched_pin_td(td);
+ sched_add(td, preempt ?
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+ SRQ_OURSELF|SRQ_YIELDING);
+ if (preempt)
+ sched_unpin_td(td);
+ } else
+ tdq_load_rem(tdq, ts);
+ mtx_assert(&sched_lock, MA_OWNED);
if (newtd != NULL) {
/*
* If we bring in a thread account for it as if it had been
@@ -1473,7 +1486,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
- cpu_switch(td, newtd);
+ cpu_switch(td, newtd, td->td_lock);
#ifdef HWPMC_HOOKS
if (PMC_PROC_IS_USING_PMCS(td->td_proc))
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
@@ -1481,6 +1494,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
}
sched_lock.mtx_lock = (uintptr_t)td;
td->td_oncpu = PCPU_GET(cpuid);
+ MPASS(td->td_lock == &sched_lock);
}
void
@@ -1489,12 +1503,14 @@ sched_nice(struct proc *p, int nice)
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
- mtx_assert(&sched_lock, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
p->p_nice = nice;
FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
sched_priority(td);
sched_prio(td, td->td_base_user_pri);
+ thread_unlock(td);
}
}
@@ -1502,7 +1518,7 @@ void
sched_sleep(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
td->td_sched->ts_slptime = ticks;
}
@@ -1513,7 +1529,7 @@ sched_wakeup(struct thread *td)
struct td_sched *ts;
int slptime;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
/*
* If we slept for more than a tick update our interactivity and
@@ -1542,7 +1558,7 @@ sched_wakeup(struct thread *td)
void
sched_fork(struct thread *td, struct thread *child)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
sched_fork_thread(td, child);
/*
* Penalize the parent and child for forking.
@@ -1563,7 +1579,9 @@ sched_fork_thread(struct thread *td, struct thread *child)
/*
* Initialize child.
*/
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
sched_newthread(child);
+ child->td_lock = &sched_lock;
ts = td->td_sched;
ts2 = child->td_sched;
ts2->ts_cpu = ts->ts_cpu;
@@ -1588,7 +1606,7 @@ void
sched_class(struct thread *td, int class)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
if (td->td_pri_class == class)
return;
@@ -1627,6 +1645,7 @@ sched_exit(struct proc *p, struct thread *child)
CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
child, child->td_proc->p_comm, child->td_priority);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
td = FIRST_THREAD_IN_PROC(p);
sched_exit_thread(td, child);
}
@@ -1638,7 +1657,9 @@ sched_exit_thread(struct thread *td, struct thread *child)
CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
child, child->td_proc->p_comm, child->td_priority);
+ thread_lock(child);
tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
+ thread_unlock(child);
#ifdef KSE
/*
* KSE forks and exits so often that this penalty causes short-lived
@@ -1653,9 +1674,11 @@ sched_exit_thread(struct thread *td, struct thread *child)
* sleep time as a penalty to the parent. This causes shells that
* launch expensive things to mark their children as expensive.
*/
+ thread_lock(td);
td->td_sched->skg_runtime += child->td_sched->skg_runtime;
sched_interact_update(td);
sched_priority(td);
+ thread_unlock(td);
}
void
@@ -1673,10 +1696,10 @@ sched_userret(struct thread *td)
KASSERT((td->td_flags & TDF_BORROWING) == 0,
("thread with borrowed priority returning to userland"));
if (td->td_priority != td->td_user_pri) {
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
td->td_priority = td->td_user_pri;
td->td_base_pri = td->td_user_pri;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
}
@@ -1805,9 +1828,22 @@ sched_preempt(struct thread *td)
*/
MPASS(TD_ON_RUNQ(td));
TD_SET_RUNNING(td);
+ MPASS(ctd->td_lock == &sched_lock);
+ MPASS(td->td_lock == &sched_lock);
CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
td->td_proc->p_pid, td->td_proc->p_comm);
+ /*
+ * We enter the switch with two runnable threads that both have
+ * the same lock. When we return td may be sleeping so we need
+ * to switch locks to make sure he's locked correctly.
+ */
+ SCHED_STAT_INC(switch_preempt);
mi_switch(SW_INVOL|SW_PREEMPT, td);
+ spinlock_enter();
+ thread_unlock(ctd);
+ thread_lock(td);
+ spinlock_exit();
+
return (1);
}
@@ -1824,7 +1860,7 @@ sched_add(struct thread *td, int flags)
#endif
ts = td->td_sched;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
@@ -1834,8 +1870,15 @@ sched_add(struct thread *td, int flags)
("sched_add: bad thread state"));
KASSERT(td->td_proc->p_sflag & PS_INMEM,
("sched_add: process swapped out"));
- KASSERT(ts->ts_runq == NULL,
- ("sched_add: thread %p is still assigned to a run queue", td));
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
TD_SET_RUNQ(td);
tdq = TDQ_SELF();
class = PRI_BASE(td->td_pri_class);
@@ -1920,7 +1963,7 @@ sched_rem(struct thread *td)
CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
KASSERT(TD_ON_RUNQ(td),
("sched_rem: thread not on run queue"));
@@ -1942,7 +1985,7 @@ sched_pctcpu(struct thread *td)
if (ts == NULL)
return (0);
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (ts->ts_ticks) {
int rtick;
@@ -1952,7 +1995,7 @@ sched_pctcpu(struct thread *td)
pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
}
td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
return (pctcpu);
}
@@ -1962,7 +2005,7 @@ sched_bind(struct thread *td, int cpu)
{
struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
if (ts->ts_flags & TSF_BOUND)
sched_unbind(td);
@@ -1982,7 +2025,7 @@ sched_unbind(struct thread *td)
{
struct td_sched *ts;
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
ts = td->td_sched;
if ((ts->ts_flags & TSF_BOUND) == 0)
return;
@@ -1995,18 +2038,19 @@ sched_unbind(struct thread *td)
int
sched_is_bound(struct thread *td)
{
- mtx_assert(&sched_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
return (td->td_sched->ts_flags & TSF_BOUND);
}
void
sched_relinquish(struct thread *td)
{
- mtx_lock_spin(&sched_lock);
+ thread_lock(td);
if (td->td_pri_class == PRI_TIMESHARE)
sched_prio(td, PRI_MAX_TIMESHARE);
+ SCHED_STAT_INC(switch_relinquish);
mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
+ thread_unlock(td);
}
int
@@ -2071,6 +2115,58 @@ sched_idletd(void *dummy)
cpu_idle();
}
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ } else {
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ cpu_throw(td, choosethread()); /* doesn't return */
+}
+
+void
+sched_fork_exit(struct thread *ctd)
+{
+ struct thread *td;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ ctd->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)ctd;
+ THREAD_LOCK_ASSERT(ctd, MA_OWNED | MA_NOTRECURSED);
+ /*
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((td = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(td);
+ }
+ thread_unlock(ctd);
+}
+
static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
"Scheduler name");
@@ -2093,6 +2189,7 @@ SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, "");
#endif
/* ps compat */
diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h
index caa1311..d18061a 100644
--- a/sys/sys/mutex.h
+++ b/sys/sys/mutex.h
@@ -125,6 +125,14 @@ void _mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file,
#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
void _mtx_assert(struct mtx *m, int what, const char *file, int line);
#endif
+void _thread_lock_flags(struct thread *, int, const char *, int);
+
+#define thread_lock(tdp) \
+ _thread_lock_flags((tdp), 0, __FILE__, __LINE__)
+#define thread_lock_flags(tdp, opt) \
+ _thread_lock_flags((tdp), (opt), __FILE__, __LINE__)
+#define thread_unlock(tdp) \
+ mtx_unlock_spin(__DEVOLATILE(struct mtx *, (tdp)->td_lock))
/*
* We define our machine-independent (unoptimized) mutex micro-operations
@@ -349,6 +357,7 @@ extern struct mtx_pool *mtxpool_sleep;
*/
extern struct mtx sched_lock;
extern struct mtx Giant;
+extern struct mtx blocked_lock;
/*
* Giant lock manipulation and clean exit macros.
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index a73d2d5..acde39d 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -134,7 +134,7 @@ struct pargs {
* g - process group mtx
* h - callout_lock mtx
* i - by curproc or the master session mtx
- * j - locked by sched_lock mtx
+ * j - locked by proc slock
* k - only accessed by curthread
* k*- only accessed by curthread and from an interrupt
* l - the attaching proc or attaching proc parent
@@ -144,6 +144,7 @@ struct pargs {
* p - select lock (sellock)
* q - td_contested lock
* r - p_peers lock
+ * t - thread lock
* x - created at fork, only changes during single threading in exec
* z - zombie threads lock
*
@@ -195,32 +196,19 @@ struct mqueue_notifier;
* other than CPU cycles, which are parceled out to the threads.
*/
-/***************
- * Threads are the unit of execution
- With a single run queue used by all processors:
-
- RUNQ: --->THREAD---THREAD--... SLEEPQ:[]---THREAD---THREAD---THREAD
- []---THREAD
- []
- []---THREAD---THREAD
-
-With PER-CPU run queues:
-it gets more complicated.
- *
- *****************/
-
/*
* Kernel runnable context (thread).
* This is what is put to sleep and reactivated.
* Thread context. Processes may have multiple threads.
*/
struct thread {
+ volatile struct mtx *td_lock; /* replaces sched lock */
struct proc *td_proc; /* (*) Associated process. */
TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */
/* The two queues below should someday be merged. */
- TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. */
- TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. */
+ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */
+ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */
TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */
struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
@@ -232,20 +220,20 @@ struct thread {
/* Cleared during fork1() or thread_schedule_upcall(). */
#define td_startzero td_flags
- int td_flags; /* (j) TDF_* flags. */
- int td_inhibitors; /* (j) Why can not run. */
+ int td_flags; /* (t) TDF_* flags. */
+ int td_inhibitors; /* (t) Why can not run. */
int td_pflags; /* (k) Private thread (TDP_*) flags. */
int td_dupfd; /* (k) Ret value from fdopen. XXX */
- int td_sqqueue; /* (j) Sleepqueue queue blocked on. */
- void *td_wchan; /* (j) Sleep address. */
- const char *td_wmesg; /* (j) Reason for sleep. */
- u_char td_lastcpu; /* (j) Last cpu we were on. */
- u_char td_oncpu; /* (j) Which cpu we are on. */
+ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */
+ void *td_wchan; /* (t) Sleep address. */
+ const char *td_wmesg; /* (t) Reason for sleep. */
+ u_char td_lastcpu; /* (t) Last cpu we were on. */
+ u_char td_oncpu; /* (t) Which cpu we are on. */
volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */
short td_locks; /* (k) Count of non-spin locks. */
- u_char td_tsqueue; /* (j) Turnstile queue blocked on. */
- struct turnstile *td_blocked; /* (j) Lock thread is blocked on. */
- const char *td_lockname; /* (j) Name of lock blocked on. */
+ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */
+ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */
+ const char *td_lockname; /* (t) Name of lock blocked on. */
LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */
struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */
int td_intr_nesting_level; /* (k) Interrupt recursion. */
@@ -253,18 +241,18 @@ struct thread {
struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */
struct ucred *td_ucred; /* (k) Reference to credentials. */
struct thread *td_standin; /* (k + a) Use this for an upcall. */
- struct kse_upcall *td_upcall; /* (k + j) Upcall structure. */
- u_int td_estcpu; /* (j) Sum of the same field in KSEs. */
- u_int td_slptime; /* (j) How long completely blocked. */
- struct rusage td_ru; /* (j) rusage information */
- uint64_t td_runtime; /* (j) How many cpu ticks we've run. */
- u_int td_pticks; /* (j) Statclock hits for profiling */
- u_int td_sticks; /* (j) Statclock hits in system mode. */
- u_int td_iticks; /* (j) Statclock hits in intr mode. */
- u_int td_uticks; /* (j) Statclock hits in user mode. */
+ struct kse_upcall *td_upcall; /* (k + t) Upcall structure. */
+ u_int td_estcpu; /* (t) estimated cpu utilization */
+ u_int td_slptime; /* (t) How long completely blocked. */
+ struct rusage td_ru; /* (t) rusage information */
+ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */
+ u_int td_pticks; /* (t) Statclock hits for profiling */
+ u_int td_sticks; /* (t) Statclock hits in system mode. */
+ u_int td_iticks; /* (t) Statclock hits in intr mode. */
+ u_int td_uticks; /* (t) Statclock hits in user mode. */
u_int td_uuticks; /* (k) Statclock hits (usr), for UTS. */
u_int td_usticks; /* (k) Statclock hits (sys), for UTS. */
- int td_intrval; /* (j) Return value of TDF_INTERRUPT. */
+ int td_intrval; /* (t) Return value of TDF_INTERRUPT. */
sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */
sigset_t td_sigmask; /* (c) Current signal mask. */
volatile u_int td_generation; /* (k) For detection of preemption */
@@ -278,11 +266,11 @@ struct thread {
/* Copied during fork1() or thread_sched_upcall(). */
#define td_startcopy td_endzero
- u_char td_base_pri; /* (j) Thread base kernel priority. */
- u_char td_priority; /* (j) Thread active priority. */
- u_char td_pri_class; /* (j) Scheduling class. */
- u_char td_user_pri; /* (j) User pri from estcpu and nice. */
- u_char td_base_user_pri; /* (j) Base user pri */
+ u_char td_base_pri; /* (t) Thread base kernel priority. */
+ u_char td_priority; /* (t) Thread active priority. */
+ u_char td_pri_class; /* (t) Scheduling class. */
+ u_char td_user_pri; /* (t) User pri from estcpu and nice. */
+ u_char td_base_user_pri; /* (t) Base user pri */
#define td_endcopy td_pcb
/*
@@ -296,7 +284,7 @@ struct thread {
TDS_CAN_RUN,
TDS_RUNQ,
TDS_RUNNING
- } td_state;
+ } td_state; /* (t) thread state */
register_t td_retval[2]; /* (k) Syscall aux returns. */
struct callout td_slpcallout; /* (h) Callout for sleep. */
struct trapframe *td_frame; /* (k) */
@@ -313,6 +301,16 @@ struct thread {
int td_syscalls; /* per-thread syscall count (used by NFS :)) */
};
+struct mtx *thread_lock_block(struct thread *);
+void thread_lock_unblock(struct thread *, struct mtx *);
+void thread_lock_set(struct thread *, struct mtx *);
+#define THREAD_LOCK_ASSERT(td, type) \
+do { \
+ struct mtx *__m = __DEVOLATILE(struct mtx *, (td)->td_lock); \
+ if (__m != &blocked_lock) \
+ mtx_assert(__m, (type)); \
+} while (0)
+
/*
* Flags kept in td_flags:
* To change these you MUST have the scheduler lock.
@@ -324,22 +322,22 @@ struct thread {
#define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */
#define TDF_SELECT 0x00000040 /* Selecting; wakeup/waiting danger. */
#define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */
-#define TDF_TSNOBLOCK 0x00000100 /* Don't block on a turnstile due to race. */
+#define TDF_UNUSEDx100 0x00000100 /* --available-- */
#define TDF_UBORROWING 0x00000200 /* Thread is borrowing user pri. */
#define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */
#define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */
#define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */
#define TDF_INTERRUPT 0x00002000 /* Thread is marked as interrupted. */
#define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */
-#define TDF_UNUSED15 0x00008000 /* --available -- */
+#define TDF_UNUSED15 0x00008000 /* --available-- */
#define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */
#define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */
#define TDF_XSIG 0x00040000 /* Thread is exchanging signal under trace */
#define TDF_UNUSED19 0x00080000 /* Thread is sleeping on a umtx. */
#define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */
#define TDF_DBSUSPEND 0x00200000 /* Thread is suspended by debugger */
-#define TDF_UNUSED22 0x00400000 /* --available -- */
-#define TDF_UNUSED23 0x00800000 /* --available -- */
+#define TDF_UNUSED22 0x00400000 /* --available-- */
+#define TDF_UNUSED23 0x00800000 /* --available-- */
#define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */
#define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */
#define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */
@@ -482,7 +480,8 @@ struct rusage_ext {
*/
struct proc {
LIST_ENTRY(proc) p_list; /* (d) List of all processes. */
- TAILQ_HEAD(, thread) p_threads; /* (j)(td_plist) Threads. (shortcut) */
+ TAILQ_HEAD(, thread) p_threads; /* (j) all threads. */
+ struct mtx p_slock; /* process spin lock */
struct ucred *p_ucred; /* (c) Process owner's identity. */
struct filedesc *p_fd; /* (b) Open files. */
struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */
@@ -491,7 +490,7 @@ struct proc {
struct plimit *p_limit; /* (c) Process limits. */
struct callout p_limco; /* (c) Limit callout handle */
struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */
- TAILQ_HEAD(, kse_upcall) p_upcalls; /* All upcalls in the proc. */
+ TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */
/*
* The following don't make too much sense.
@@ -504,7 +503,6 @@ struct proc {
PRS_NORMAL, /* threads can be run. */
PRS_ZOMBIE
} p_state; /* (j/c) S* process status. */
-
pid_t p_pid; /* (b) Process identifier. */
LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */
LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */
@@ -542,14 +540,12 @@ struct proc {
struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */
struct kaioinfo *p_aioinfo; /* (c) ASYNC I/O info. */
struct thread *p_singlethread;/* (c + j) If single threading this is it */
- int p_suspcount; /* (c) Num threads in suspended mode. */
+ int p_suspcount; /* (j) Num threads in suspended mode. */
struct thread *p_xthread; /* (c) Trap thread */
int p_boundary_count;/* (c) Num threads at user boundary */
int p_pendingcnt; /* how many signals are pending */
struct itimers *p_itimers; /* (c) POSIX interval timers. */
/* from ksegrp */
- u_int p_estcpu; /* (j) Sum of the field in threads. */
- u_int p_slptime; /* (j) How long completely blocked. */
int p_numupcalls; /* (j) Num upcalls. */
int p_upsleeps; /* (c) Num threads in kse_release(). */
struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */
@@ -592,6 +588,9 @@ struct proc {
#define NOCPU 0xff /* For when we aren't on a CPU. */
+#define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock)
+#define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock)
+#define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type))
/* These flags are kept in p_flag. */
#define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */
@@ -626,7 +625,7 @@ struct proc {
#define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
#define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED)
-/* These flags are kept in p_sflag and are protected with sched_lock. */
+/* These flags are kept in p_sflag and are protected with proc slock. */
#define PS_INMEM 0x00001 /* Loaded into memory. */
#define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */
#define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */
@@ -861,8 +860,8 @@ void stopevent(struct proc *, u_int, u_int);
void threadinit(void);
void cpu_idle(void);
extern void (*cpu_idle_hook)(void); /* Hook to machdep CPU idler. */
-void cpu_switch(struct thread *old, struct thread *new);
-void cpu_throw(struct thread *old, struct thread *new) __dead2;
+void cpu_switch(struct thread *, struct thread *, struct mtx *);
+void cpu_throw(struct thread *, struct thread *) __dead2;
void unsleep(struct thread *);
void userret(struct thread *, struct trapframe *);
@@ -872,6 +871,7 @@ void cpu_fork(struct thread *, struct proc *, struct thread *, int);
void cpu_set_fork_handler(struct thread *, void (*)(void *), void *);
/* New in KSE. */
+void kse_unlink(struct thread *);
void kse_GC(void);
void kseinit(void);
void cpu_set_upcall(struct thread *td, struct thread *td0);
@@ -900,6 +900,7 @@ void childproc_stopped(struct proc *child, int reason);
void childproc_continued(struct proc *child);
void childproc_exited(struct proc *child);
int thread_suspend_check(int how);
+void thread_suspend_switch(struct thread *);
void thread_suspend_one(struct thread *td);
struct thread *thread_switchout(struct thread *td, int flags,
struct thread *newtd);
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
index 1342906..0dcf369 100644
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -81,6 +81,7 @@ int sched_runnable(void);
*/
void sched_exit(struct proc *p, struct thread *childtd);
void sched_fork(struct thread *td, struct thread *childtd);
+void sched_fork_exit(struct thread *td);
/*
* KSE Groups contain scheduling priority information. They record the
@@ -101,6 +102,7 @@ fixpt_t sched_pctcpu(struct thread *td);
void sched_prio(struct thread *td, u_char prio);
void sched_sleep(struct thread *td);
void sched_switch(struct thread *td, struct thread *newtd, int flags);
+void sched_throw(struct thread *td);
void sched_unlend_prio(struct thread *td, u_char prio);
void sched_unlend_user_prio(struct thread *td, u_char pri);
void sched_user_prio(struct thread *td, u_char prio);
@@ -155,6 +157,19 @@ sched_unpin(void)
#define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */
#define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */
+/* Switch stats. */
+#ifdef SCHED_STATS
+extern long switch_preempt;
+extern long switch_owepreempt;
+extern long switch_turnstile;
+extern long switch_sleepq;
+extern long switch_sleepqtimo;
+extern long switch_relinquish;
+extern long switch_needresched;
+#define SCHED_STAT_INC(var) atomic_add_long(&(var), 1)
+#else
+#define SCHED_STAT_INC(var)
+#endif
/* temporarily here */
void schedinit(void);
@@ -162,7 +177,6 @@ void sched_init_concurrency(struct proc *p);
void sched_set_concurrency(struct proc *p, int cuncurrency);
void sched_schedinit(void);
void sched_newproc(struct proc *p, struct thread *td);
-void sched_thread_exit(struct thread *td);
void sched_newthread(struct thread *td);
#endif /* _KERNEL */