-rw-r--r--  sys/kern/sched_ule.c  300
1 file changed, 222 insertions(+), 78 deletions(-)
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index c8c4618..dfef1e0 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
#endif
#include <machine/cpu.h>
+#include <machine/smp.h>
#define KTR_ULE KTR_NFS
@@ -101,6 +102,9 @@ struct ke_sched {
#define ke_ftick ke_sched->ske_ftick
#define ke_ticks ke_sched->ske_ticks
#define ke_cpu ke_sched->ske_cpu
+#define ke_assign ke_procq.tqe_next
+
+#define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */
struct kg_sched {
int skg_slptime; /* Number of ticks we vol. slept */
@@ -211,8 +215,9 @@ struct kseq {
short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
short ksq_nicemin; /* Least nice. */
#ifdef SMP
- int ksq_cpus; /* Count of CPUs in this kseq. */
unsigned int ksq_rslices; /* Slices on run queue */
+ int ksq_cpus; /* Count of CPUs in this kseq. */
+ struct kse *ksq_assigned; /* KSEs assigned by another CPU. */
#endif
};
@@ -220,12 +225,13 @@ struct kseq {
* One kse queue per processor.
*/
#ifdef SMP
-struct kseq kseq_cpu[MAXCPU];
-struct kseq *kseq_idmap[MAXCPU];
+static int kseq_idle;
+static struct kseq kseq_cpu[MAXCPU];
+static struct kseq *kseq_idmap[MAXCPU];
#define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)])
#define KSEQ_CPU(x) (kseq_idmap[(x)])
#else
-struct kseq kseq_cpu;
+static struct kseq kseq_cpu;
#define KSEQ_SELF() (&kseq_cpu)
#define KSEQ_CPU(x) (&kseq_cpu)
#endif
@@ -234,11 +240,10 @@ static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
-void sched_pctcpu_update(struct kse *ke);
-int sched_pickcpu(void);
+static void sched_pctcpu_update(struct kse *ke);
/* Operations on per processor queues */
-static struct kse * kseq_choose(struct kseq *kseq, int steal);
+static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
@@ -246,9 +251,17 @@ static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
-struct kseq * kseq_load_highest(void);
-void kseq_balance(void *arg);
-void kseq_move(struct kseq *from, int cpu);
+#if 0
+static int sched_pickcpu(void);
+#endif
+static struct kse *runq_steal(struct runq *rq);
+static struct kseq *kseq_load_highest(void);
+static void kseq_balance(void *arg);
+static void kseq_move(struct kseq *from, int cpu);
+static int kseq_find(void);
+static void kseq_notify(struct kse *ke, int cpu);
+static void kseq_assign(struct kseq *);
+static struct kse *kseq_steal(struct kseq *kseq);
#endif
void
@@ -359,7 +372,7 @@ kseq_nice_rem(struct kseq *kseq, int nice)
* any approach and so the semi random algorithm below may work as well as any.
*
*/
-void
+static void
kseq_balance(void *arg)
{
struct kseq *kseq;
@@ -396,6 +409,8 @@ kseq_balance(void *arg)
kseq = KSEQ_CPU(high_cpu);
+ high_load = kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME];
/*
* Nothing to do.
*/
@@ -422,7 +437,7 @@ out:
return;
}
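
The kseq_balance() changes above keep the shape the earlier comment describes: periodically find the most and least loaded CPUs (the load is now the sum of the per-class counters) and migrate a single KSE when the gap is worth it. A stand-alone sketch of that idea, in plain C with purely illustrative loads and CPU count, not the kernel code:

/*
 * Illustrative balancer: move one task from the busiest to the least
 * busy CPU when the imbalance exceeds one task.
 */
#include <stdio.h>

#define NCPU	4

static int load[NCPU] = { 5, 1, 2, 0 };

static void
balance_once(void)
{
	int cpu, high = 0, low = 0;

	for (cpu = 1; cpu < NCPU; cpu++) {
		if (load[cpu] > load[high])
			high = cpu;
		if (load[cpu] < load[low])
			low = cpu;
	}
	/* Nothing to do if the imbalance is within one task. */
	if (load[high] - load[low] <= 1)
		return;
	load[high]--;			/* kseq_move(high, low) analogue */
	load[low]++;
	printf("moved one task from cpu %d to cpu %d\n", high, low);
}

int
main(void)
{
	balance_once();
	return (0);
}
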
-struct kseq *
+static struct kseq *
kseq_load_highest(void)
{
struct kseq *kseq;
@@ -445,18 +460,19 @@ kseq_load_highest(void)
}
kseq = KSEQ_CPU(cpu);
- if (load > kseq->ksq_cpus)
+ if ((kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME]) > kseq->ksq_cpus)
return (kseq);
return (NULL);
}
-void
+static void
kseq_move(struct kseq *from, int cpu)
{
struct kse *ke;
- ke = kseq_choose(from, 1);
+ ke = kseq_steal(from);
runq_remove(ke->ke_runq, ke);
ke->ke_state = KES_THREAD;
kseq_rem(from, ke);
@@ -464,16 +480,126 @@ kseq_move(struct kseq *from, int cpu)
ke->ke_cpu = cpu;
sched_add(ke->ke_thread);
}
-#endif
+
+static int
+kseq_find(void)
+{
+ struct kseq *high;
+
+ if (!smp_started)
+ return (0);
+ if (kseq_idle & PCPU_GET(cpumask))
+ return (0);
+ /*
+ * Find the cpu with the highest load and steal one proc.
+ */
+ if ((high = kseq_load_highest()) == NULL ||
+ high == KSEQ_SELF()) {
+ /*
+ * If we couldn't find one, set ourselves in the
+ * idle map.
+ */
+ atomic_set_int(&kseq_idle, PCPU_GET(cpumask));
+ return (0);
+ }
+ /*
+ * Remove this kse from this kseq and runq and then requeue
+ * on the current processor. We now have a load of one!
+ */
+ kseq_move(high, PCPU_GET(cpuid));
+
+ return (1);
+}
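
kseq_find() is the pull side of the new idle-CPU protocol: a CPU with no local work first checks whether it has already advertised itself in the shared kseq_idle mask, then tries to steal from the most loaded CPU, and only if that fails sets its bit with atomic_set_int(). A stand-alone sketch of the same decision using C11 atomics; the names, loads, and return convention are illustrative, not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int idle_mask;	/* kseq_idle analogue */

/*
 * Return 1 if the caller found a CPU worth stealing from, 0 if it
 * advertised itself as idle (or already had).
 */
static int
find_work(unsigned int my_bit, int busiest_load, int busiest_cpus)
{
	if (atomic_load(&idle_mask) & my_bit)
		return (0);
	if (busiest_load <= busiest_cpus) {
		/* Nobody is overloaded: publish ourselves as idle. */
		atomic_fetch_or(&idle_mask, my_bit);
		return (0);
	}
	/* The real code would kseq_move() one KSE to this CPU here. */
	return (1);
}

int
main(void)
{
	printf("stole work: %d\n", find_work(1u << 0, 3, 1));
	printf("stole work: %d\n", find_work(1u << 1, 1, 1));
	return (0);
}
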
+
+static void
+kseq_assign(struct kseq *kseq)
+{
+ struct kse *nke;
+ struct kse *ke;
+
+ do {
+ ke = kseq->ksq_assigned;
+ } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
+ for (; ke != NULL; ke = nke) {
+ nke = ke->ke_assign;
+ ke->ke_flags &= ~KEF_ASSIGNED;
+ sched_add(ke->ke_thread);
+ }
+}
+
+static void
+kseq_notify(struct kse *ke, int cpu)
+{
+ struct kseq *kseq;
+ struct thread *td;
+ struct pcpu *pcpu;
+
+ ke->ke_flags |= KEF_ASSIGNED;
+
+ kseq = KSEQ_CPU(cpu);
+
+ /*
+ * Place a KSE on another cpu's queue and force a resched.
+ */
+ do {
+ ke->ke_assign = kseq->ksq_assigned;
+ } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
+ pcpu = pcpu_find(cpu);
+ td = pcpu->pc_curthread;
+ if (ke->ke_thread->td_priority < td->td_priority ||
+ td == pcpu->pc_idlethread) {
+ td->td_flags |= TDF_NEEDRESCHED;
+ ipi_selected(1 << cpu, IPI_AST);
+ }
+}
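
kseq_notify() and kseq_assign() together form a lock-free, per-CPU hand-off list: producers push a KSE onto the target CPU's ksq_assigned list with a compare-and-swap loop, and the target CPU later swaps the whole list out and re-queues each entry under sched_lock. A minimal stand-alone analogue using C11 atomics; struct task, notify(), and assign() are illustrative names, and a single list head stands in for the per-CPU heads:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct task {
	struct task *next;		/* plays the role of ke_assign */
	int id;
};

static _Atomic(struct task *) assigned = NULL;

/* Producer side (kseq_notify analogue): CAS-push onto the list. */
static void
notify(struct task *t)
{
	struct task *head;

	do {
		head = atomic_load(&assigned);
		t->next = head;
	} while (!atomic_compare_exchange_weak(&assigned, &head, t));
}

/* Consumer side (kseq_assign analogue): take the whole list, then walk it. */
static void
assign(void)
{
	struct task *t, *next;

	t = atomic_exchange(&assigned, NULL);
	for (; t != NULL; t = next) {
		next = t->next;
		printf("would sched_add() task %d\n", t->id);
	}
}

int
main(void)
{
	struct task a = { .next = NULL, .id = 1 };
	struct task b = { .next = NULL, .id = 2 };

	notify(&a);
	notify(&b);
	assign();
	return (0);
}

The kernel version uses atomic_cmpset_ptr() rather than C11 atomics, and kseq_notify() additionally marks the remote curthread with TDF_NEEDRESCHED and sends IPI_AST when the pushed thread should preempt it.
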
+
+static struct kse *
+runq_steal(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct rqbits *rqb;
+ struct kse *ke;
+ int word;
+ int bit;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ rqb = &rq->rq_status;
+ for (word = 0; word < RQB_LEN; word++) {
+ if (rqb->rqb_bits[word] == 0)
+ continue;
+ for (bit = 0; bit < RQB_BPW; bit++) {
+ if ((rqb->rqb_bits[word] & (1 << bit)) == 0)
+ continue;
+ rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
+ TAILQ_FOREACH(ke, rqh, ke_procq) {
+ if (PRI_BASE(ke->ke_ksegrp->kg_pri_class) !=
+ PRI_ITHD)
+ return (ke);
+ }
+ }
+ }
+ return (NULL);
+}
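
runq_steal() walks the run queue's status bitmap word by word and bit by bit, mapping each set bit back to a queue index with bit + (word << RQB_L2BPW), and returns the first non-interrupt KSE it finds. The index arithmetic is the part worth seeing in isolation; in this stand-alone sketch the RQB_* values are assumed, and the per-queue TAILQ walk that skips PRI_ITHD entries is omitted:

#include <stdio.h>

#define RQB_LEN		2		/* words in the status array (assumed) */
#define RQB_BPW		32		/* bits per word (assumed) */
#define RQB_L2BPW	5		/* log2(RQB_BPW) */

static unsigned int rqb_bits[RQB_LEN];

/* Return the index of the first non-empty queue, or -1 if all are empty. */
static int
first_nonempty_queue(void)
{
	int word, bit;

	for (word = 0; word < RQB_LEN; word++) {
		if (rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb_bits[word] & (1u << bit)) == 0)
				continue;
			/* Same index math as the patch. */
			return (bit + (word << RQB_L2BPW));
		}
	}
	return (-1);
}

int
main(void)
{
	rqb_bits[1] |= 1u << 3;		/* mark queue 35 non-empty */
	printf("first non-empty queue: %d\n", first_nonempty_queue());
	return (0);
}
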
+
+static struct kse *
+kseq_steal(struct kseq *kseq)
+{
+ struct kse *ke;
+
+ if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
+ return (ke);
+ if ((ke = runq_steal(kseq->ksq_next)) != NULL)
+ return (ke);
+ return (runq_steal(&kseq->ksq_idle));
+}
+#endif /* SMP */
/*
- * Pick the highest priority task we have and return it. If steal is 1 we
- * will return kses that have been denied slices due to their nice being too
- * low. In the future we should prohibit stealing interrupt threads as well.
+ * Pick the highest priority task we have and return it.
*/
-struct kse *
-kseq_choose(struct kseq *kseq, int steal)
+static struct kse *
+kseq_choose(struct kseq *kseq)
{
struct kse *ke;
struct runq *swap;
@@ -499,7 +625,7 @@ kseq_choose(struct kseq *kseq, int steal)
* TIMESHARE kse group and its nice was too far out
* of the range that receives slices.
*/
- if (ke->ke_slice == 0 && steal == 0) {
+ if (ke->ke_slice == 0) {
runq_remove(ke->ke_runq, ke);
sched_slice(ke);
ke->ke_runq = kseq->ksq_next;
@@ -529,6 +655,7 @@ kseq_setup(struct kseq *kseq)
kseq->ksq_load = 0;
#ifdef SMP
kseq->ksq_rslices = 0;
+ kseq->ksq_assigned = NULL;
#endif
}
@@ -716,7 +843,7 @@ sched_rr_interval(void)
return (SCHED_SLICE_MAX);
}
-void
+static void
sched_pctcpu_update(struct kse *ke)
{
/*
@@ -737,7 +864,7 @@ sched_pctcpu_update(struct kse *ke)
ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}
-#ifdef SMP
+#if 0
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
@@ -767,12 +894,6 @@ sched_pickcpu(void)
CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
return (cpu);
}
-#else
-int
-sched_pickcpu(void)
-{
- return (0);
-}
#endif
void
@@ -789,10 +910,8 @@ sched_prio(struct thread *td, u_char prio)
* queue. We still call adjustrunqueue below in case kse
* needs to fix things up.
*/
- if (ke && ((td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
- prio < td->td_ksegrp->kg_user_pri) ||
- (td->td_ksegrp->kg_pri_class == PRI_IDLE &&
- prio < PRI_MIN_IDLE))) {
+ if (ke && (ke->ke_flags & KEF_ASSIGNED) == 0 &&
+ ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
runq_remove(ke->ke_runq, ke);
ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
runq_add(ke->ke_runq, ke);
@@ -917,8 +1036,6 @@ sched_wakeup(struct thread *td)
td->td_slptime = 0;
}
setrunqueue(td);
- if (td->td_priority < curthread->td_priority)
- curthread->td_flags |= TDF_NEEDRESCHED;
}
/*
@@ -1119,30 +1236,16 @@ sched_runnable(void)
mtx_lock_spin(&sched_lock);
kseq = KSEQ_SELF();
-
+#ifdef SMP
+ if (kseq->ksq_assigned)
+ kseq_assign(kseq);
+#endif
if ((curthread->td_flags & TDF_IDLETD) != 0) {
if (kseq->ksq_load > 0)
goto out;
} else
if (kseq->ksq_load - 1 > 0)
goto out;
-#ifdef SMP
- /*
- * For SMP we may steal other processor's KSEs. Just search until we
- * verify that at least on other cpu has a runnable task.
- */
- if (smp_started) {
- int i;
-
- for (i = 0; i < mp_maxid; i++) {
- if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
- continue;
- kseq = KSEQ_CPU(i);
- if (kseq->ksq_load > kseq->ksq_cpus)
- goto out;
- }
- }
-#endif
load = 0;
out:
mtx_unlock_spin(&sched_lock);
@@ -1170,12 +1273,19 @@ sched_choose(void)
struct kse *ke;
mtx_assert(&sched_lock, MA_OWNED);
+ kseq = KSEQ_SELF();
#ifdef SMP
retry:
+ if (kseq->ksq_assigned)
+ kseq_assign(kseq);
#endif
- kseq = KSEQ_SELF();
- ke = kseq_choose(kseq, 0);
+ ke = kseq_choose(kseq);
if (ke) {
+#ifdef SMP
+ if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
+ if (kseq_find())
+ goto retry;
+#endif
runq_remove(ke->ke_runq, ke);
ke->ke_state = KES_THREAD;
@@ -1186,23 +1296,9 @@ retry:
}
return (ke);
}
-
#ifdef SMP
- if (smp_started) {
- /*
- * Find the cpu with the highest load and steal one proc.
- */
- if ((kseq = kseq_load_highest()) == NULL)
- return (NULL);
-
- /*
- * Remove this kse from this kseq and runq and then requeue
- * on the current processor. Then we will dequeue it
- * normally above.
- */
- kseq_move(kseq, PCPU_GET(cpuid));
+ if (kseq_find())
goto retry;
- }
#endif
return (NULL);
@@ -1214,10 +1310,14 @@ sched_add(struct thread *td)
struct kseq *kseq;
struct ksegrp *kg;
struct kse *ke;
+ int class;
+ mtx_assert(&sched_lock, MA_OWNED);
ke = td->td_kse;
kg = td->td_ksegrp;
- mtx_assert(&sched_lock, MA_OWNED);
+ if (ke->ke_flags & KEF_ASSIGNED)
+ return;
+ kseq = KSEQ_SELF();
KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
KASSERT((ke->ke_thread->td_kse != NULL),
("sched_add: No KSE on thread"));
@@ -1229,24 +1329,33 @@ sched_add(struct thread *td)
KASSERT(ke->ke_runq == NULL,
("sched_add: KSE %p is still assigned to a run queue", ke));
-
- switch (PRI_BASE(kg->kg_pri_class)) {
+ class = PRI_BASE(kg->kg_pri_class);
+ switch (class) {
case PRI_ITHD:
case PRI_REALTIME:
- kseq = KSEQ_SELF();
ke->ke_runq = kseq->ksq_curr;
ke->ke_slice = SCHED_SLICE_MAX;
ke->ke_cpu = PCPU_GET(cpuid);
break;
case PRI_TIMESHARE:
- kseq = KSEQ_CPU(ke->ke_cpu);
+#ifdef SMP
+ if (ke->ke_cpu != PCPU_GET(cpuid)) {
+ kseq_notify(ke, ke->ke_cpu);
+ return;
+ }
+#endif
if (SCHED_CURR(kg, ke))
ke->ke_runq = kseq->ksq_curr;
else
ke->ke_runq = kseq->ksq_next;
break;
case PRI_IDLE:
- kseq = KSEQ_CPU(ke->ke_cpu);
+#ifdef SMP
+ if (ke->ke_cpu != PCPU_GET(cpuid)) {
+ kseq_notify(ke, ke->ke_cpu);
+ return;
+ }
+#endif
/*
* This is for priority prop.
*/
@@ -1260,6 +1369,34 @@ sched_add(struct thread *td)
panic("Unknown pri class.\n");
break;
}
+#ifdef SMP
+ /*
+ * If there are any idle processors, give them our extra load.
+ */
+ if (kseq_idle && class != PRI_ITHD &&
+ (kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME]) >= kseq->ksq_cpus) {
+ int cpu;
+
+ /*
+ * Multiple cpus could find this bit simultaneously but the
+ * race shouldn't be terrible.
+ */
+ cpu = ffs(kseq_idle);
+ if (cpu) {
+ cpu--;
+ atomic_clear_int(&kseq_idle, 1 << cpu);
+ ke->ke_cpu = cpu;
+ ke->ke_runq = NULL;
+ kseq_notify(ke, cpu);
+ return;
+ }
+ }
+ if (class == PRI_TIMESHARE || class == PRI_REALTIME)
+ atomic_clear_int(&kseq_idle, PCPU_GET(cpumask));
+#endif
+ if (td->td_priority < curthread->td_priority)
+ curthread->td_flags |= TDF_NEEDRESCHED;
ke->ke_ksegrp->kg_runq_kses++;
ke->ke_state = KES_ONRUNQ;
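
The sched_add() hunk above is the push side of the idle-CPU protocol: when the local queue already holds at least one runnable KSE per CPU and some other CPU has set its bit in kseq_idle, the new KSE is handed to the lowest-numbered idle CPU instead of being queued locally. A stand-alone sketch of that selection, using C11 atomics and ffs() from <strings.h>; names here are illustrative:

#include <stdatomic.h>
#include <strings.h>		/* ffs() */
#include <stdio.h>

static _Atomic unsigned int idle_mask;	/* kseq_idle analogue */

/* Return the CPU chosen for the new work, or -1 to keep it local. */
static int
pick_idle_cpu(void)
{
	unsigned int mask;
	int cpu;

	mask = atomic_load(&idle_mask);
	cpu = ffs((int)mask);		/* 1-based index of lowest set bit */
	if (cpu == 0)
		return (-1);
	cpu--;
	/*
	 * Two CPUs may race for the same bit; as the patch comment notes,
	 * that race is tolerated, so a plain atomic clear is enough.
	 */
	atomic_fetch_and(&idle_mask, ~(1u << cpu));
	return (cpu);
}

int
main(void)
{
	atomic_store(&idle_mask, 1u << 2);	/* pretend CPU 2 is idle */
	printf("hand off to cpu %d\n", pick_idle_cpu());
	return (0);
}

In the kernel, the chosen KSE then goes through kseq_notify() so the target CPU picks it up from its ksq_assigned list.
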
@@ -1275,7 +1412,14 @@ sched_rem(struct thread *td)
struct kse *ke;
ke = td->td_kse;
-
+ /*
+ * It is safe to just return here because sched_rem() is only ever
+ * used in places where we're immediately going to add the
+ * kse back on again. In that case it'll be added with the correct
+ * thread and priority when the caller drops the sched_lock.
+ */
+ if (ke->ke_flags & KEF_ASSIGNED)
+ return;
mtx_assert(&sched_lock, MA_OWNED);
KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));