-rw-r--r--  sys/kern/sched_ule.c  300
1 file changed, 222 insertions(+), 78 deletions(-)
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index c8c4618..dfef1e0 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
#endif
#include <machine/cpu.h>
+#include <machine/smp.h>
#define KTR_ULE KTR_NFS
@@ -101,6 +102,9 @@ struct ke_sched {
#define ke_ftick ke_sched->ske_ftick
#define ke_ticks ke_sched->ske_ticks
#define ke_cpu ke_sched->ske_cpu
+#define ke_assign ke_procq.tqe_next
+
+#define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */
struct kg_sched {
int skg_slptime; /* Number of ticks we vol. slept */
@@ -211,8 +215,9 @@ struct kseq {
short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
short ksq_nicemin; /* Least nice. */
#ifdef SMP
- int ksq_cpus; /* Count of CPUs in this kseq. */
unsigned int ksq_rslices; /* Slices on run queue */
+ int ksq_cpus; /* Count of CPUs in this kseq. */
+ struct kse *ksq_assigned; /* KSEs assigned by another CPU. */
#endif
};
@@ -220,12 +225,13 @@ struct kseq {
* One kse queue per processor.
*/
#ifdef SMP
-struct kseq kseq_cpu[MAXCPU];
-struct kseq *kseq_idmap[MAXCPU];
+static int kseq_idle;
+static struct kseq kseq_cpu[MAXCPU];
+static struct kseq *kseq_idmap[MAXCPU];
#define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)])
#define KSEQ_CPU(x) (kseq_idmap[(x)])
#else
-struct kseq kseq_cpu;
+static struct kseq kseq_cpu;
#define KSEQ_SELF() (&kseq_cpu)
#define KSEQ_CPU(x) (&kseq_cpu)
#endif
@@ -234,11 +240,10 @@ static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
-void sched_pctcpu_update(struct kse *ke);
-int sched_pickcpu(void);
+static void sched_pctcpu_update(struct kse *ke);
/* Operations on per processor queues */
-static struct kse * kseq_choose(struct kseq *kseq, int steal);
+static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
@@ -246,9 +251,17 @@ static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
-struct kseq * kseq_load_highest(void);
-void kseq_balance(void *arg);
-void kseq_move(struct kseq *from, int cpu);
+#if 0
+static int sched_pickcpu(void);
+#endif
+static struct kse *runq_steal(struct runq *rq);
+static struct kseq *kseq_load_highest(void);
+static void kseq_balance(void *arg);
+static void kseq_move(struct kseq *from, int cpu);
+static int kseq_find(void);
+static void kseq_notify(struct kse *ke, int cpu);
+static void kseq_assign(struct kseq *);
+static struct kse *kseq_steal(struct kseq *kseq);
#endif
void
@@ -359,7 +372,7 @@ kseq_nice_rem(struct kseq *kseq, int nice)
* any approach and so the semi random algorithm below may work as well as any.
*
*/
-void
+static void
kseq_balance(void *arg)
{
struct kseq *kseq;
@@ -396,6 +409,8 @@ kseq_balance(void *arg)
kseq = KSEQ_CPU(high_cpu);
+ high_load = kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME];
/*
* Nothing to do.
*/
@@ -422,7 +437,7 @@ out:
return;
}
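
The kseq_balance() changes above keep the shape the earlier comment describes: periodically find the most and least loaded CPUs (the load is now the sum of the per-class counters) and migrate a single KSE when the gap is worth it. A stand-alone sketch of that idea, in plain C with purely illustrative loads and CPU count, not the kernel code:

/*
 * Illustrative balancer: move one task from the busiest to the least
 * busy CPU when the imbalance exceeds one task.
 */
#include <stdio.h>

#define NCPU	4

static int load[NCPU] = { 5, 1, 2, 0 };

static void
balance_once(void)
{
	int cpu, high = 0, low = 0;

	for (cpu = 1; cpu < NCPU; cpu++) {
		if (load[cpu] > load[high])
			high = cpu;
		if (load[cpu] < load[low])
			low = cpu;
	}
	/* Nothing to do if the imbalance is within one task. */
	if (load[high] - load[low] <= 1)
		return;
	load[high]--;			/* kseq_move(high, low) analogue */
	load[low]++;
	printf("moved one task from cpu %d to cpu %d\n", high, low);
}

int
main(void)
{
	balance_once();
	return (0);
}
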
-struct kseq *
+static struct kseq *
kseq_load_highest(void)
{
struct kseq *kseq;
@@ -445,18 +460,19 @@ kseq_load_highest(void)
}
kseq = KSEQ_CPU(cpu);
- if (load > kseq->ksq_cpus)
+ if ((kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME]) > kseq->ksq_cpus)
return (kseq);
return (NULL);
}
-void
+static void
kseq_move(struct kseq *from, int cpu)
{
struct kse *ke;
- ke = kseq_choose(from, 1);
+ ke = kseq_steal(from);
runq_remove(ke->ke_runq, ke);
ke->ke_state = KES_THREAD;
kseq_rem(from, ke);
@@ -464,16 +480,126 @@ kseq_move(struct kseq *from, int cpu)
ke->ke_cpu = cpu;
sched_add(ke->ke_thread);
}
-#endif
+
+static int
+kseq_find(void)
+{
+ struct kseq *high;
+
+ if (!smp_started)
+ return (0);
+ if (kseq_idle & PCPU_GET(cpumask))
+ return (0);
+ /*
+ * Find the cpu with the highest load and steal one proc.
+ */
+ if ((high = kseq_load_highest()) == NULL ||
+ high == KSEQ_SELF()) {
+ /*
+ * If we couldn't find one, set ourselves in the
+ * idle map.
+ */
+ atomic_set_int(&kseq_idle, PCPU_GET(cpumask));
+ return (0);
+ }
+ /*
+ * Remove this kse from this kseq and runq and then requeue
+ * on the current processor. We now have a load of one!
+ */
+ kseq_move(high, PCPU_GET(cpuid));
+
+ return (1);
+}
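
kseq_find() is the pull side of the new idle-CPU protocol: a CPU with no local work first checks whether it has already advertised itself in the shared kseq_idle mask, then tries to steal from the most loaded CPU, and only if that fails sets its bit with atomic_set_int(). A stand-alone sketch of the same decision using C11 atomics; the names, loads, and return convention are illustrative, not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int idle_mask;	/* kseq_idle analogue */

/*
 * Return 1 if the caller found a CPU worth stealing from, 0 if it
 * advertised itself as idle (or already had).
 */
static int
find_work(unsigned int my_bit, int busiest_load, int busiest_cpus)
{
	if (atomic_load(&idle_mask) & my_bit)
		return (0);
	if (busiest_load <= busiest_cpus) {
		/* Nobody is overloaded: publish ourselves as idle. */
		atomic_fetch_or(&idle_mask, my_bit);
		return (0);
	}
	/* The real code would kseq_move() one KSE to this CPU here. */
	return (1);
}

int
main(void)
{
	printf("stole work: %d\n", find_work(1u << 0, 3, 1));
	printf("stole work: %d\n", find_work(1u << 1, 1, 1));
	return (0);
}
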
+
+static void
+kseq_assign(struct kseq *kseq)
+{
+ struct kse *nke;
+ struct kse *ke;
+
+ do {
+ ke = kseq->ksq_assigned;
+ } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
+ for (; ke != NULL; ke = nke) {
+ nke = ke->ke_assign;
+ ke->ke_flags &= ~KEF_ASSIGNED;
+ sched_add(ke->ke_thread);
+ }
+}
+
+static void
+kseq_notify(struct kse *ke, int cpu)
+{
+ struct kseq *kseq;
+ struct thread *td;
+ struct pcpu *pcpu;
+
+ ke->ke_flags |= KEF_ASSIGNED;
+
+ kseq = KSEQ_CPU(cpu);
+
+ /*
+ * Place a KSE on another cpu's queue and force a resched.
+ */
+ do {
+ ke->ke_assign = kseq->ksq_assigned;
+ } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
+ pcpu = pcpu_find(cpu);
+ td = pcpu->pc_curthread;
+ if (ke->ke_thread->td_priority < td->td_priority ||
+ td == pcpu->pc_idlethread) {
+ td->td_flags |= TDF_NEEDRESCHED;
+ ipi_selected(1 << cpu, IPI_AST);
+ }
+}
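
kseq_notify() and kseq_assign() together form a lock-free, per-CPU hand-off list: producers push a KSE onto the target CPU's ksq_assigned list with a compare-and-swap loop, and the target CPU later swaps the whole list out and re-queues each entry under sched_lock. A minimal stand-alone analogue using C11 atomics; struct task, notify(), and assign() are illustrative names, and a single list head stands in for the per-CPU heads:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct task {
	struct task *next;		/* plays the role of ke_assign */
	int id;
};

static _Atomic(struct task *) assigned = NULL;

/* Producer side (kseq_notify analogue): CAS-push onto the list. */
static void
notify(struct task *t)
{
	struct task *head;

	do {
		head = atomic_load(&assigned);
		t->next = head;
	} while (!atomic_compare_exchange_weak(&assigned, &head, t));
}

/* Consumer side (kseq_assign analogue): take the whole list, then walk it. */
static void
assign(void)
{
	struct task *t, *next;

	t = atomic_exchange(&assigned, NULL);
	for (; t != NULL; t = next) {
		next = t->next;
		printf("would sched_add() task %d\n", t->id);
	}
}

int
main(void)
{
	struct task a = { .next = NULL, .id = 1 };
	struct task b = { .next = NULL, .id = 2 };

	notify(&a);
	notify(&b);
	assign();
	return (0);
}

The kernel version uses atomic_cmpset_ptr() rather than C11 atomics, and kseq_notify() additionally marks the remote curthread with TDF_NEEDRESCHED and sends IPI_AST when the pushed thread should preempt it.
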
+
+static struct kse *
+runq_steal(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct rqbits *rqb;
+ struct kse *ke;
+ int word;
+ int bit;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ rqb = &rq->rq_status;
+ for (word = 0; word < RQB_LEN; word++) {
+ if (rqb->rqb_bits[word] == 0)
+ continue;
+ for (bit = 0; bit < RQB_BPW; bit++) {
+ if ((rqb->rqb_bits[word] & (1 << bit)) == 0)
+ continue;
+ rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
+ TAILQ_FOREACH(ke, rqh, ke_procq) {
+ if (PRI_BASE(ke->ke_ksegrp->kg_pri_class) !=
+ PRI_ITHD)
+ return (ke);
+ }
+ }
+ }
+ return (NULL);
+}
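
runq_steal() walks the run queue's status bitmap word by word and bit by bit, mapping each set bit back to a queue index with bit + (word << RQB_L2BPW), and returns the first non-interrupt KSE it finds. The index arithmetic is the part worth seeing in isolation; in this stand-alone sketch the RQB_* values are assumed, and the per-queue TAILQ walk that skips PRI_ITHD entries is omitted:

#include <stdio.h>

#define RQB_LEN		2		/* words in the status array (assumed) */
#define RQB_BPW		32		/* bits per word (assumed) */
#define RQB_L2BPW	5		/* log2(RQB_BPW) */

static unsigned int rqb_bits[RQB_LEN];

/* Return the index of the first non-empty queue, or -1 if all are empty. */
static int
first_nonempty_queue(void)
{
	int word, bit;

	for (word = 0; word < RQB_LEN; word++) {
		if (rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb_bits[word] & (1u << bit)) == 0)
				continue;
			/* Same index math as the patch. */
			return (bit + (word << RQB_L2BPW));
		}
	}
	return (-1);
}

int
main(void)
{
	rqb_bits[1] |= 1u << 3;		/* mark queue 35 non-empty */
	printf("first non-empty queue: %d\n", first_nonempty_queue());
	return (0);
}
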
+
+static struct kse *
+kseq_steal(struct kseq *kseq)
+{
+ struct kse *ke;
+
+ if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
+ return (ke);
+ if ((ke = runq_steal(kseq->ksq_next)) != NULL)
+ return (ke);
+ return (runq_steal(&kseq->ksq_idle));
+}
+#endif /* SMP */
/*
- * Pick the highest priority task we have and return it. If steal is 1 we
- * will return kses that have been denied slices due to their nice being too
- * low. In the future we should prohibit stealing interrupt threads as well.
+ * Pick the highest priority task we have and return it.
*/
-struct kse *
-kseq_choose(struct kseq *kseq, int steal)
+static struct kse *
+kseq_choose(struct kseq *kseq)
{
struct kse *ke;
struct runq *swap;
@@ -499,7 +625,7 @@ kseq_choose(struct kseq *kseq, int steal)
* TIMESHARE kse group and its nice was too far out
* of the range that receives slices.
*/
- if (ke->ke_slice == 0 && steal == 0) {
+ if (ke->ke_slice == 0) {
runq_remove(ke->ke_runq, ke);
sched_slice(ke);
ke->ke_runq = kseq->ksq_next;
@@ -529,6 +655,7 @@ kseq_setup(struct kseq *kseq)
kseq->ksq_load = 0;
#ifdef SMP
kseq->ksq_rslices = 0;
+ kseq->ksq_assigned = NULL;
#endif
}
@@ -716,7 +843,7 @@ sched_rr_interval(void)
return (SCHED_SLICE_MAX);
}
-void
+static void
sched_pctcpu_update(struct kse *ke)
{
/*
@@ -737,7 +864,7 @@ sched_pctcpu_update(struct kse *ke)
ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}
-#ifdef SMP
+#if 0
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
@@ -767,12 +894,6 @@ sched_pickcpu(void)
CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
return (cpu);
}
-#else
-int
-sched_pickcpu(void)
-{
- return (0);
-}
#endif
void
@@ -789,10 +910,8 @@ sched_prio(struct thread *td, u_char prio)
* queue. We still call adjustrunqueue below in case kse
* needs to fix things up.
*/
- if (ke && ((td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
- prio < td->td_ksegrp->kg_user_pri) ||
- (td->td_ksegrp->kg_pri_class == PRI_IDLE &&
- prio < PRI_MIN_IDLE))) {
+ if (ke && (ke->ke_flags & KEF_ASSIGNED) == 0 &&
+ ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
runq_remove(ke->ke_runq, ke);
ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
runq_add(ke->ke_runq, ke);
@@ -917,8 +1036,6 @@ sched_wakeup(struct thread *td)
td->td_slptime = 0;
}
setrunqueue(td);
- if (td->td_priority < curthread->td_priority)
- curthread->td_flags |= TDF_NEEDRESCHED;
}
/*
@@ -1119,30 +1236,16 @@ sched_runnable(void)
mtx_lock_spin(&sched_lock);
kseq = KSEQ_SELF();
-
+#ifdef SMP
+ if (kseq->ksq_assigned)
+ kseq_assign(kseq);
+#endif
if ((curthread->td_flags & TDF_IDLETD) != 0) {
if (kseq->ksq_load > 0)
goto out;
} else
if (kseq->ksq_load - 1 > 0)
goto out;
-#ifdef SMP
- /*
- * For SMP we may steal other processor's KSEs. Just search until we
- * verify that at least on other cpu has a runnable task.
- */
- if (smp_started) {
- int i;
-
- for (i = 0; i < mp_maxid; i++) {
- if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
- continue;
- kseq = KSEQ_CPU(i);
- if (kseq->ksq_load > kseq->ksq_cpus)
- goto out;
- }
- }
-#endif
load = 0;
out:
mtx_unlock_spin(&sched_lock);
@@ -1170,12 +1273,19 @@ sched_choose(void)
struct kse *ke;
mtx_assert(&sched_lock, MA_OWNED);
+ kseq = KSEQ_SELF();
#ifdef SMP
retry:
+ if (kseq->ksq_assigned)
+ kseq_assign(kseq);
#endif
- kseq = KSEQ_SELF();
- ke = kseq_choose(kseq, 0);
+ ke = kseq_choose(kseq);
if (ke) {
+#ifdef SMP
+ if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
+ if (kseq_find())
+ goto retry;
+#endif
runq_remove(ke->ke_runq, ke);
ke->ke_state = KES_THREAD;
@@ -1186,23 +1296,9 @@ retry:
}
return (ke);
}
-
#ifdef SMP
- if (smp_started) {
- /*
- * Find the cpu with the highest load and steal one proc.
- */
- if ((kseq = kseq_load_highest()) == NULL)
- return (NULL);
-
- /*
- * Remove this kse from this kseq and runq and then requeue
- * on the current processor. Then we will dequeue it
- * normally above.
- */
- kseq_move(kseq, PCPU_GET(cpuid));
+ if (kseq_find())
goto retry;
- }
#endif
return (NULL);
@@ -1214,10 +1310,14 @@ sched_add(struct thread *td)
struct kseq *kseq;
struct ksegrp *kg;
struct kse *ke;
+ int class;
+ mtx_assert(&sched_lock, MA_OWNED);
ke = td->td_kse;
kg = td->td_ksegrp;
- mtx_assert(&sched_lock, MA_OWNED);
+ if (ke->ke_flags & KEF_ASSIGNED)
+ return;
+ kseq = KSEQ_SELF();
KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
KASSERT((ke->ke_thread->td_kse != NULL),
("sched_add: No KSE on thread"));
@@ -1229,24 +1329,33 @@ sched_add(struct thread *td)
KASSERT(ke->ke_runq == NULL,
("sched_add: KSE %p is still assigned to a run queue", ke));
-
- switch (PRI_BASE(kg->kg_pri_class)) {
+ class = PRI_BASE(kg->kg_pri_class);
+ switch (class) {
case PRI_ITHD:
case PRI_REALTIME:
- kseq = KSEQ_SELF();
ke->ke_runq = kseq->ksq_curr;
ke->ke_slice = SCHED_SLICE_MAX;
ke->ke_cpu = PCPU_GET(cpuid);
break;
case PRI_TIMESHARE:
- kseq = KSEQ_CPU(ke->ke_cpu);
+#ifdef SMP
+ if (ke->ke_cpu != PCPU_GET(cpuid)) {
+ kseq_notify(ke, ke->ke_cpu);
+ return;
+ }
+#endif
if (SCHED_CURR(kg, ke))
ke->ke_runq = kseq->ksq_curr;
else
ke->ke_runq = kseq->ksq_next;
break;
case PRI_IDLE:
- kseq = KSEQ_CPU(ke->ke_cpu);
+#ifdef SMP
+ if (ke->ke_cpu != PCPU_GET(cpuid)) {
+ kseq_notify(ke, ke->ke_cpu);
+ return;
+ }
+#endif
/*
* This is for priority prop.
*/
@@ -1260,6 +1369,34 @@ sched_add(struct thread *td)
panic("Unknown pri class.\n");
break;
}
+#ifdef SMP
+ /*
+ * If there are any idle processors, give them our extra load.
+ */
+ if (kseq_idle && class != PRI_ITHD &&
+ (kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME]) >= kseq->ksq_cpus) {
+ int cpu;
+
+ /*
+ * Multiple cpus could find this bit simultaneously but the
+ * race shouldn't be terrible.
+ */
+ cpu = ffs(kseq_idle);
+ if (cpu) {
+ cpu--;
+ atomic_clear_int(&kseq_idle, 1 << cpu);
+ ke->ke_cpu = cpu;
+ ke->ke_runq = NULL;
+ kseq_notify(ke, cpu);
+ return;
+ }
+ }
+ if (class == PRI_TIMESHARE || class == PRI_REALTIME)
+ atomic_clear_int(&kseq_idle, PCPU_GET(cpumask));
+#endif
+ if (td->td_priority < curthread->td_priority)
+ curthread->td_flags |= TDF_NEEDRESCHED;
ke->ke_ksegrp->kg_runq_kses++;
ke->ke_state = KES_ONRUNQ;
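
The sched_add() hunk above is the push side of the idle-CPU protocol: when the local queue already holds at least one runnable KSE per CPU and some other CPU has set its bit in kseq_idle, the new KSE is handed to the lowest-numbered idle CPU instead of being queued locally. A stand-alone sketch of that selection, using C11 atomics and ffs() from <strings.h>; names here are illustrative:

#include <stdatomic.h>
#include <strings.h>		/* ffs() */
#include <stdio.h>

static _Atomic unsigned int idle_mask;	/* kseq_idle analogue */

/* Return the CPU chosen for the new work, or -1 to keep it local. */
static int
pick_idle_cpu(void)
{
	unsigned int mask;
	int cpu;

	mask = atomic_load(&idle_mask);
	cpu = ffs((int)mask);		/* 1-based index of lowest set bit */
	if (cpu == 0)
		return (-1);
	cpu--;
	/*
	 * Two CPUs may race for the same bit; as the patch comment notes,
	 * that race is tolerated, so a plain atomic clear is enough.
	 */
	atomic_fetch_and(&idle_mask, ~(1u << cpu));
	return (cpu);
}

int
main(void)
{
	atomic_store(&idle_mask, 1u << 2);	/* pretend CPU 2 is idle */
	printf("hand off to cpu %d\n", pick_idle_cpu());
	return (0);
}

In the kernel, the chosen KSE then goes through kseq_notify() so the target CPU picks it up from its ksq_assigned list.
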
@@ -1275,7 +1412,14 @@ sched_rem(struct thread *td)
struct kse *ke;
ke = td->td_kse;
-
+ /*
+ * It is safe to just return here because sched_rem() is only ever
+ * used in places where we're immediately going to add the
+ * kse back on again. In that case it'll be added with the correct
+ * thread and priority when the caller drops the sched_lock.
+ */
+ if (ke->ke_flags & KEF_ASSIGNED)
+ return;
mtx_assert(&sched_lock, MA_OWNED);
KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));