author     jeff <jeff@FreeBSD.org>                    2004-12-26 22:56:08 +0000
committer  jeff <jeff@FreeBSD.org>                    2004-12-26 22:56:08 +0000
commit     ceca9b8f9ea53d9a923c535b211495acd5caa11c (patch)
tree       22e02db61d98da077ce85e395b076fdd97651df6 /sys/kern
parent     b864ac486cc4d57ca87dac803405abb87b4a8f86 (diff)
- Fix a long-standing problem where an ithread would not honor sched_pin().
- Remove the sched_add wrapper that used sched_add_internal() as a backend.
Its only purpose was to interpret one flag and turn it into an int. Do
the right thing and interpret the flag in sched_add() instead.
- Pass the flag argument to sched_add() to kseq_runq_add() so that we can
get the SRQ_PREEMPT optimization too.
- Add a KEF_INTERNAL flag. If KEF_INTERNAL is set we don't adjust the SLOT
  counts; otherwise the slot counts are adjusted as soon as we enter
  sched_add() or sched_rem() rather than when the thread is actually placed
  on the run queue. This greatly simplifies the handling of slots (see the
  first sketch after this list).
- Remove the explicit prevention of migration for ithreads on non-x86
platforms. This was never shown to have any real benefit.
- Remove the unused class argument to KSE_CAN_MIGRATE().
- Add ktr points for thread migration events.
- Fix a long-standing bug on platforms that don't initialize the cpu
  topology. The ksg_maxid variable was never correctly set on these
  platforms, which caused the long-term load balancer to never inspect
  more than the first group or processor (see the second sketch after
  this list).
- Fix another bug that prevented the long-term load balancer from working
  properly. If stathz != hz we can't expect sched_clock() to be called on
  the exact tick count that we're anticipating (see the third sketch after
  this list).
- Rearrange sched_switch() a bit to reduce indentation levels.
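
As a rough illustration of the sched_add() changes above, here is a toy,
stand-alone C model (toy_sched_add, the slots counter, and the SRQ_YIELDING
value are made up for illustration; only KEF_INTERNAL's 0x0040 comes from the
diff below). It shows the two ideas: the SRQ_YIELDING flag is interpreted
directly instead of being folded into an int for a sched_add_internal()
wrapper, and a slot is charged only when the add is not an internal re-add.

#include <stdio.h>

/* Illustrative flag values; only KEF_INTERNAL's value matches the diff. */
#define SRQ_YIELDING    0x0001
#define KEF_INTERNAL    0x0040

struct toy_kse {
    int ke_flags;
    int slots;          /* stand-in for the ksegrp SLOT count */
};

/*
 * Condensed model of the new sched_add(): SRQ_YIELDING is interpreted
 * here rather than in a wrapper, and the SLOT count is charged only
 * when the add is not an internal migration re-add.
 */
static void
toy_sched_add(struct toy_kse *ke, int flags)
{
    int preemptive = !(flags & SRQ_YIELDING);

    if ((ke->ke_flags & KEF_INTERNAL) == 0)
        ke->slots++;                    /* SLOT_USE() in the kernel */
    ke->ke_flags &= ~KEF_INTERNAL;

    /* ... queue selection and kseq_runq_add(..., flags) elided ... */
    printf("preemptive=%d slots=%d\n", preemptive, ke->slots);
}

int
main(void)
{
    struct toy_kse ke = { 0, 0 };

    toy_sched_add(&ke, 0);              /* ordinary add: takes a slot */
    ke.ke_flags |= KEF_INTERNAL;
    toy_sched_add(&ke, SRQ_YIELDING);   /* internal re-add: no new slot */
    return (0);
}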
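
Second, a toy version of the default-topology setup loop, showing why
ksg_maxid must be derived from the number of present CPUs for the long-term
balancer to look past group 0 (cpu_present here stands in for the kernel's
CPU_ABSENT() test; everything else is illustrative):

#include <stdio.h>

#define MAXCPU 8

/* Toy stand-in for CPU_ABSENT(): CPUs 2 and 5-7 are absent here. */
static const int cpu_present[MAXCPU] = { 1, 1, 0, 1, 1, 0, 0, 0 };

int
main(void)
{
    int cpus, i, ksg_maxid;

    /*
     * Mirrors the fixed loop below: count only present CPUs while
     * building one group per CPU, then record the highest group index
     * so the long-term balancer walks every group, not just group 0.
     */
    for (cpus = 0, i = 0; i < MAXCPU; i++) {
        if (!cpu_present[i])
            continue;
        /* ... initialize kseq_cpu[cpus] and kseq_groups[cpus] here ... */
        cpus++;
    }
    ksg_maxid = cpus - 1;
    printf("%d CPUs present, ksg_maxid = %d\n", cpus, ksg_maxid);
    return (0);
}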
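
Third, a toy demonstration of why the balancer check had to move from == to
>= when sched_clock() does not run on every tick (the tick numbers and the
re-arm interval are arbitrary; the kernel re-arms with a randomized hz-based
interval, as the diff shows):

#include <stdio.h>

/*
 * Toy model of the re-arm bug: the scheduler clock only observes every
 * fourth tick here, so an exact "==" test against the armed tick can be
 * skipped forever, while ">=" always catches up eventually.
 */
int
main(void)
{
    int bal_tick = 103;     /* tick at which we intended to balance */
    int hit_eq = 0, hit_ge = 0;
    int ticks;

    for (ticks = 100; ticks < 140; ticks += 4) {
        if (ticks == bal_tick)
            hit_eq++;
        if (ticks >= bal_tick) {
            hit_ge++;
            bal_tick = ticks + 10;      /* re-arm, as the new code does */
        }
    }
    printf("'==' balanced %d times, '>=' balanced %d times\n",
        hit_eq, hit_ge);
    return (0);
}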
Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/sched_ule.c   267
1 file changed, 140 insertions(+), 127 deletions(-)
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index a5eac7d..bd3c93a 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -128,12 +128,13 @@ struct kse {
 #define ke_assign       ke_procq.tqe_next
 
-#define KEF_ASSIGNED    KEF_SCHED0      /* Thread is being migrated. */
-#define KEF_BOUND       KEF_SCHED1      /* Thread can not migrate. */
-#define KEF_XFERABLE    KEF_SCHED2      /* Thread was added as transferable. */
-#define KEF_HOLD        KEF_SCHED3      /* Thread is temporarily bound. */
-#define KEF_REMOVED     KEF_SCHED4      /* Thread was removed while ASSIGNED */
-#define KEF_PRIOELEV    KEF_SCHED5      /* Thread has had its prio elevated. */
+#define KEF_ASSIGNED    0x0001          /* Thread is being migrated. */
+#define KEF_BOUND       0x0002          /* Thread can not migrate. */
+#define KEF_XFERABLE    0x0004          /* Thread was added as transferable. */
+#define KEF_HOLD        0x0008          /* Thread is temporarily bound. */
+#define KEF_REMOVED     0x0010          /* Thread was removed while ASSIGNED */
+#define KEF_PRIOELEV    0x0020          /* Thread has had its prio elevated. */
+#define KEF_INTERNAL    0x0040
 
 struct kg_sched {
         struct thread   *skg_last_assigned; /* (j) Last thread assigned to */
@@ -297,6 +298,7 @@ static struct kseq kseq_cpu[MAXCPU];
 static struct kseq_group kseq_groups[MAXCPU];
 static int bal_tick;
 static int gbal_tick;
+static int balance_groups;
 
 #define KSEQ_SELF()     (&kseq_cpu[PCPU_GET(cpuid)])
 #define KSEQ_CPU(x)     (&kseq_cpu[(x)])
@@ -311,7 +313,6 @@ static struct kseq kseq_cpu;
 static void slot_fill(struct ksegrp *kg);
 static struct kse *sched_choose(void);          /* XXX Should be thread * */
-static void sched_add_internal(struct thread *td, int preemptive);
 static void sched_slice(struct kse *ke);
 static void sched_priority(struct ksegrp *kg);
 static int sched_interact_score(struct ksegrp *kg);
 
@@ -324,7 +325,7 @@ static struct kse * kseq_choose(struct kseq *kseq);
 static void kseq_setup(struct kseq *kseq);
 static void kseq_load_add(struct kseq *kseq, struct kse *ke);
 static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
-static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
+static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke, int);
 static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
 static void kseq_nice_add(struct kseq *kseq, int nice);
 static void kseq_nice_rem(struct kseq *kseq, int nice);
@@ -341,19 +342,8 @@ static int kseq_idled(struct kseq *kseq);
 static void kseq_notify(struct kse *ke, int cpu);
 static void kseq_assign(struct kseq *);
 static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
-/*
- * On P4 Xeons the round-robin interrupt delivery is broken.  As a result of
- * this, we can't pin interrupts to the cpu that they were delivered to,
- * otherwise all ithreads only run on CPU 0.
- */
-#ifdef __i386__
-#define KSE_CAN_MIGRATE(ke, class) \
+#define KSE_CAN_MIGRATE(ke) \
     ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
-#else /* !__i386__ */
-#define KSE_CAN_MIGRATE(ke, class) \
-    ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \
-    ((ke)->ke_flags & KEF_BOUND) == 0)
-#endif /* !__i386__ */
 #endif
 
 void
@@ -379,16 +369,16 @@ kseq_print(int cpu)
 }
 
 static __inline void
-kseq_runq_add(struct kseq *kseq, struct kse *ke)
+kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags)
 {
 #ifdef SMP
-        if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
+        if (KSE_CAN_MIGRATE(ke)) {
                 kseq->ksq_transferable++;
                 kseq->ksq_group->ksg_transferable++;
                 ke->ke_flags |= KEF_XFERABLE;
         }
 #endif
-        runq_add(ke->ke_runq, ke, 0);
+        runq_add(ke->ke_runq, ke, flags);
 }
 
 static __inline void
@@ -509,8 +499,9 @@ sched_balance(void)
         int cnt;
         int i;
 
+        bal_tick = ticks + (random() % (hz * 2));
         if (smp_started == 0)
-                goto out;
+                return;
         low = high = NULL;
         i = random() % (ksg_maxid + 1);
         for (cnt = 0; cnt <= ksg_maxid; cnt++) {
@@ -530,8 +521,6 @@ sched_balance(void)
         if (low != NULL && high != NULL && high != low)
                 sched_balance_pair(LIST_FIRST(&high->ksg_members),
                     LIST_FIRST(&low->ksg_members));
-out:
-        bal_tick = ticks + (random() % (hz * 2));
 }
 
 static void
@@ -539,11 +528,11 @@ sched_balance_groups(void)
 {
         int i;
 
+        gbal_tick = ticks + (random() % (hz * 2));
         mtx_assert(&sched_lock, MA_OWNED);
         if (smp_started)
                 for (i = 0; i <= ksg_maxid; i++)
                         sched_balance_group(KSEQ_GROUP(i));
-        gbal_tick = ticks + (random() % (hz * 2));
 }
 
 static void
@@ -665,7 +654,8 @@ kseq_idled(struct kseq *kseq)
                         kseq_runq_rem(steal, ke);
                         kseq_load_rem(steal, ke);
                         ke->ke_cpu = PCPU_GET(cpuid);
-                        sched_add_internal(ke->ke_thread, 0);
+                        ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
+                        sched_add(ke->ke_thread, SRQ_YIELDING);
                         return (0);
                 }
         }
@@ -692,9 +682,11 @@ kseq_assign(struct kseq *kseq)
         } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
         for (; ke != NULL; ke = nke) {
                 nke = ke->ke_assign;
+                kseq->ksq_group->ksg_load--;
+                kseq->ksq_load--;
                 ke->ke_flags &= ~KEF_ASSIGNED;
-                SLOT_RELEASE(ke->ke_thread->td_ksegrp);
-                sched_add_internal(ke->ke_thread, 0);
+                ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
+                sched_add(ke->ke_thread, SRQ_YIELDING);
         }
 }
 
@@ -704,15 +696,21 @@ kseq_notify(struct kse *ke, int cpu)
         struct kseq *kseq;
         struct thread *td;
         struct pcpu *pcpu;
+        int class;
         int prio;
 
+        kseq = KSEQ_CPU(cpu);
+        /* XXX */
+        class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
+        if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
+            (kseq_idle & kseq->ksq_group->ksg_mask))
+                atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
+        kseq->ksq_group->ksg_load++;
+        kseq->ksq_load++;
         ke->ke_cpu = cpu;
         ke->ke_flags |= KEF_ASSIGNED;
-        SLOT_USE(ke->ke_thread->td_ksegrp);
         prio = ke->ke_thread->td_priority;
 
-        kseq = KSEQ_CPU(cpu);
-
         /*
          * Place a KSE on another cpu's queue and force a resched.
         */
@@ -753,8 +751,7 @@ runq_steal(struct runq *rq)
                                 continue;
                         rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
                         TAILQ_FOREACH(ke, rqh, ke_procq) {
-                                if (KSE_CAN_MIGRATE(ke,
-                                    PRI_BASE(ke->ke_ksegrp->kg_pri_class)))
+                                if (KSE_CAN_MIGRATE(ke))
                                         return (ke);
                         }
                 }
@@ -783,8 +780,11 @@ kseq_steal(struct kseq *kseq, int stealidle)
 int
 kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
 {
+        struct kseq_group *nksg;
         struct kseq_group *ksg;
+        struct kseq *old;
         int cpu;
+        int idx;
 
         if (smp_started == 0)
                 return (0);
@@ -800,35 +800,63 @@ kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
          * some CPUs may idle.  Too low and there will be excess migration
         * and context switches.
         */
+        old = KSEQ_CPU(ke->ke_cpu);
+        nksg = old->ksq_group;
         ksg = kseq->ksq_group;
-        if (ksg->ksg_load > ksg->ksg_cpus && kseq_idle) {
-                ksg = KSEQ_CPU(ke->ke_cpu)->ksq_group;
-                if (kseq_idle & ksg->ksg_mask) {
-                        cpu = ffs(ksg->ksg_idlemask);
-                        if (cpu)
+        if (kseq_idle) {
+                if (kseq_idle & nksg->ksg_mask) {
+                        cpu = ffs(nksg->ksg_idlemask);
+                        if (cpu) {
+                                CTR2(KTR_SCHED,
+                                    "kseq_transfer: %p found old cpu %X "
+                                    "in idlemask.", ke, cpu);
                                 goto migrate;
+                        }
                 }
                 /*
                  * Multiple cpus could find this bit simultaneously
                  * but the race shouldn't be terrible.
                  */
                 cpu = ffs(kseq_idle);
-                if (cpu)
+                if (cpu) {
+                        CTR2(KTR_SCHED, "kseq_transfer: %p found %X "
+                            "in idlemask.", ke, cpu);
+                        goto migrate;
+                }
+        }
+        idx = 0;
+#if 0
+        if (old->ksq_load < kseq->ksq_load) {
+                cpu = ke->ke_cpu + 1;
+                CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X "
+                    "load less than ours.", ke, cpu);
+                goto migrate;
+        }
+        /*
+         * No new CPU was found, look for one with less load.
+         */
+        for (idx = 0; idx <= ksg_maxid; idx++) {
+                nksg = KSEQ_GROUP(idx);
+                if (nksg->ksg_load /*+ (nksg->ksg_cpus * 2)*/ < ksg->ksg_load) {
+                        cpu = ffs(nksg->ksg_cpumask);
+                        CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less "
+                            "than ours.", ke, cpu);
                        goto migrate;
+                }
         }
+#endif
         /*
          * If another cpu in this group has idled, assign a thread over
          * to them after checking to see if there are idled groups.
          */
-        ksg = kseq->ksq_group;
         if (ksg->ksg_idlemask) {
                 cpu = ffs(ksg->ksg_idlemask);
-                if (cpu)
+                if (cpu) {
+                        CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in "
+                            "group.", ke, cpu);
                         goto migrate;
+                }
         }
-        /*
-         * No new CPU was found.
-         */
         return (0);
 migrate:
         /*
@@ -906,7 +934,6 @@ static void
 sched_setup(void *dummy)
 {
 #ifdef SMP
-        int balance_groups;
         int i;
 #endif
 
@@ -928,10 +955,13 @@ sched_setup(void *dummy)
         if (smp_topology == NULL) {
                 struct kseq_group *ksg;
                 struct kseq *ksq;
+                int cpus;
 
-                for (i = 0; i < MAXCPU; i++) {
-                        ksq = &kseq_cpu[i];
-                        ksg = &kseq_groups[i];
+                for (cpus = 0, i = 0; i < MAXCPU; i++) {
+                        if (CPU_ABSENT(i))
+                                continue;
+                        ksq = &kseq_cpu[cpus];
+                        ksg = &kseq_groups[cpus];
                         /*
                          * Setup a kseq group with one member.
                          */
@@ -944,7 +974,9 @@ sched_setup(void *dummy)
                         ksg->ksg_transferable = 0;
                         LIST_INIT(&ksg->ksg_members);
                         LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
+                        cpus++;
                 }
+                ksg_maxid = cpus - 1;
         } else {
                 struct kseq_group *ksg;
                 struct cpu_group *cg;
@@ -1230,6 +1262,7 @@ sched_prio(struct thread *td, u_char prio)
                  */
                 ke->ke_flags |= KEF_HOLD;
                 adjustrunqueue(td, prio);
+                ke->ke_flags &= ~KEF_HOLD;
         } else
                 td->td_priority = prio;
 }
@@ -1237,11 +1270,13 @@ sched_prio(struct thread *td, u_char prio)
 void
 sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
+        struct kseq *ksq;
         struct kse *ke;
 
         mtx_assert(&sched_lock, MA_OWNED);
 
         ke = td->td_kse;
+        ksq = KSEQ_SELF();
         td->td_lastcpu = td->td_oncpu;
         td->td_oncpu = NOCPU;
 
@@ -1252,37 +1287,33 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
          * If the KSE has been assigned it may be in the process of switching
         * to the new cpu.  This is the case in sched_bind().
         */
-        if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
-                if (td == PCPU_GET(idlethread)) {
-                        TD_SET_CAN_RUN(td);
-                } else {
-                        /* We are ending our run so make our slot available again */
-                        SLOT_RELEASE(td->td_ksegrp);
-                        if (TD_IS_RUNNING(td)) {
-                                kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
-                                /*
-                                 * Don't allow the thread to migrate
-                                 * from a preemption.
-                                 */
-                                ke->ke_flags |= KEF_HOLD;
-                                setrunqueue(td, SRQ_OURSELF|SRQ_YIELDING);
-                        } else {
-                                if (ke->ke_runq) {
-                                        kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
-                                } else if ((td->td_flags & TDF_IDLETD) == 0)
-                                        kdb_backtrace();
-                                /*
-                                 * We will not be on the run queue.
-                                 * So we must be sleeping or similar.
-                                 * Don't use the slot if we will need it
-                                 * for newtd.
-                                 */
-                                if ((td->td_proc->p_flag & P_HADTHREADS) &&
-                                    (newtd == NULL ||
-                                    newtd->td_ksegrp != td->td_ksegrp))
-                                        slot_fill(td->td_ksegrp);
-                        }
-                }
+        if (td == PCPU_GET(idlethread)) {
+                TD_SET_CAN_RUN(td);
+        } else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
+                /* We are ending our run so make our slot available again */
+                SLOT_RELEASE(td->td_ksegrp);
+                if (ke->ke_runq == NULL)
+                        panic("Thread not on runq.");
+                kseq_load_rem(ksq, ke);
+                if (TD_IS_RUNNING(td)) {
+                        /*
+                         * Don't allow the thread to migrate
+                         * from a preemption.
+                         */
+                        ke->ke_flags |= KEF_HOLD;
+                        setrunqueue(td, (flags & SW_PREEMPT) ?
+                            SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+                            SRQ_OURSELF|SRQ_YIELDING);
+                        ke->ke_flags &= ~KEF_HOLD;
+                } else if ((td->td_proc->p_flag & P_HADTHREADS) &&
+                    (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp))
+                        /*
+                         * We will not be on the run queue.
+                         * So we must be sleeping or similar.
+                         * Don't use the slot if we will need it
+                         * for newtd.
+                         */
+                        slot_fill(td->td_ksegrp);
         }
         if (newtd != NULL) {
                 /*
@@ -1291,6 +1322,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
                  * run queue and then chosen.
                  */
                 newtd->td_kse->ke_flags |= KEF_DIDRUN;
+                newtd->td_kse->ke_runq = ksq->ksq_curr;
                 SLOT_USE(newtd->td_ksegrp);
                 TD_SET_RUNNING(newtd);
                 kseq_load_add(KSEQ_SELF(), newtd->td_kse);
@@ -1450,11 +1482,11 @@ sched_class(struct ksegrp *kg, int class)
                  * class.
                  */
                 if (ke->ke_state == KES_ONRUNQ) {
-                        if (KSE_CAN_MIGRATE(ke, oclass)) {
+                        if (KSE_CAN_MIGRATE(ke)) {
                                 kseq->ksq_transferable--;
                                 kseq->ksq_group->ksg_transferable--;
                         }
-                        if (KSE_CAN_MIGRATE(ke, nclass)) {
+                        if (KSE_CAN_MIGRATE(ke)) {
                                 kseq->ksq_transferable++;
                                 kseq->ksq_group->ksg_transferable++;
                         }
@@ -1510,9 +1542,9 @@ sched_clock(struct thread *td)
         mtx_assert(&sched_lock, MA_OWNED);
         kseq = KSEQ_SELF();
 #ifdef SMP
-        if (ticks == bal_tick)
+        if (ticks >= bal_tick)
                 sched_balance();
-        if (ticks == gbal_tick)
+        if (ticks >= gbal_tick && balance_groups)
                 sched_balance_groups();
         /*
          * We could have been assigned a non real-time thread without an
@@ -1665,44 +1697,38 @@ restart:
 void
 sched_add(struct thread *td, int flags)
 {
-
-        /* let jeff work out how to map the flags better */
-        /* I'm open to suggestions */
-        if (flags & SRQ_YIELDING)
-                /*
-                 * Preempting during switching can be bad JUJU
-                 * especially for KSE processes
-                 */
-                sched_add_internal(td, 0);
-        else
-                sched_add_internal(td, 1);
-}
-
-static void
-sched_add_internal(struct thread *td, int preemptive)
-{
         struct kseq *kseq;
         struct ksegrp *kg;
         struct kse *ke;
-#ifdef SMP
+        int preemptive;
         int canmigrate;
-#endif
         int class;
 
+        if (td == NULL) {
+                mtx_unlock_spin(&sched_lock);
+                panic("wtf");
+        }
         CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
             td, td->td_proc->p_comm, td->td_priority, curthread,
             curthread->td_proc->p_comm);
         mtx_assert(&sched_lock, MA_OWNED);
         ke = td->td_kse;
         kg = td->td_ksegrp;
+        canmigrate = 1;
+        preemptive = !(flags & SRQ_YIELDING);
+        class = PRI_BASE(kg->kg_pri_class);
+        kseq = KSEQ_SELF();
+        if ((ke->ke_flags & KEF_INTERNAL) == 0)
+                SLOT_USE(td->td_ksegrp);
+        ke->ke_flags &= ~KEF_INTERNAL;
+#ifdef SMP
         if (ke->ke_flags & KEF_ASSIGNED) {
-                if (ke->ke_flags & KEF_REMOVED) {
-                        SLOT_USE(ke->ke_ksegrp);
+                if (ke->ke_flags & KEF_REMOVED)
                         ke->ke_flags &= ~KEF_REMOVED;
-                }
                 return;
         }
-        kseq = KSEQ_SELF();
+        canmigrate = KSE_CAN_MIGRATE(ke);
+#endif
         KASSERT(ke->ke_state != KES_ONRUNQ,
             ("sched_add: kse %p (%s) already in run queue", ke,
             ke->ke_proc->p_comm));
@@ -1710,14 +1736,13 @@ sched_add_internal(struct thread *td, int preemptive)
             ("sched_add: process swapped out"));
         KASSERT(ke->ke_runq == NULL,
             ("sched_add: KSE %p is still assigned to a run queue", ke));
-
-        class = PRI_BASE(kg->kg_pri_class);
         switch (class) {
         case PRI_ITHD:
         case PRI_REALTIME:
                 ke->ke_runq = kseq->ksq_curr;
                 ke->ke_slice = SCHED_SLICE_MAX;
-                ke->ke_cpu = PCPU_GET(cpuid);
+                if (canmigrate)
+                        ke->ke_cpu = PCPU_GET(cpuid);
                 break;
         case PRI_TIMESHARE:
                 if (SCHED_CURR(kg, ke))
@@ -1744,7 +1769,6 @@ sched_add_internal(struct thread *td, int preemptive)
          * Don't migrate running threads here.  Force the long term balancer
          * to do it.
          */
-        canmigrate = KSE_CAN_MIGRATE(ke, class);
         if (ke->ke_flags & KEF_HOLD) {
                 ke->ke_flags &= ~KEF_HOLD;
                 canmigrate = 0;
@@ -1774,23 +1798,19 @@ sched_add_internal(struct thread *td, int preemptive)
                  * Now remove ourselves from the group specific idle mask.
                  */
                 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
-        } else if (kseq->ksq_load > 1 && canmigrate)
+        } else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
                 if (kseq_transfer(kseq, ke, class))
                         return;
         ke->ke_cpu = PCPU_GET(cpuid);
 #endif
 
-        /*
-         * XXX With preemption this is not necessary.
-         */
         if (td->td_priority < curthread->td_priority &&
             ke->ke_runq == kseq->ksq_curr)
                 curthread->td_flags |= TDF_NEEDRESCHED;
         if (preemptive && maybe_preempt(td))
                 return;
-        SLOT_USE(td->td_ksegrp);
         ke->ke_state = KES_ONRUNQ;
-        kseq_runq_add(kseq, ke);
+        kseq_runq_add(kseq, ke, flags);
         kseq_load_add(kseq, ke);
 }
 
@@ -1800,26 +1820,19 @@ sched_rem(struct thread *td)
         struct kseq *kseq;
         struct kse *ke;
 
-        mtx_assert(&sched_lock, MA_OWNED);
-        ke = td->td_kse;
         CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
             td, td->td_proc->p_comm, td->td_priority, curthread,
             curthread->td_proc->p_comm);
-        /*
-         * It is safe to just return here because sched_rem() is only ever
-         * used in places where we're immediately going to add the
-         * kse back on again.  In that case it'll be added with the correct
-         * thread and priority when the caller drops the sched_lock.
-         */
+        mtx_assert(&sched_lock, MA_OWNED);
+        ke = td->td_kse;
+        SLOT_RELEASE(td->td_ksegrp);
         if (ke->ke_flags & KEF_ASSIGNED) {
-                SLOT_RELEASE(td->td_ksegrp);
                 ke->ke_flags |= KEF_REMOVED;
                 return;
         }
         KASSERT((ke->ke_state == KES_ONRUNQ),
             ("sched_rem: KSE not on run queue"));
-        SLOT_RELEASE(td->td_ksegrp);
 
         ke->ke_state = KES_THREAD;
         kseq = KSEQ_CPU(ke->ke_cpu);
         kseq_runq_rem(kseq, ke);