author	jeff <jeff@FreeBSD.org>	2003-10-31 11:16:04 +0000
committer	jeff <jeff@FreeBSD.org>	2003-10-31 11:16:04 +0000
commit	5cf7d9c84d187ea5b2a7e6839042f2b9347a88ab (patch)
tree	8a0fe94282f5bf361720f80769d1a02cc5b9c1ce /sys/kern
parent	b3642312228b79855c006513d0d8dd934eb866cc (diff)
download	FreeBSD-src-5cf7d9c84d187ea5b2a7e6839042f2b9347a88ab.zip
	FreeBSD-src-5cf7d9c84d187ea5b2a7e6839042f2b9347a88ab.tar.gz
- Add static to local functions and data where it was missing.
- Add an IPI based mechanism for migrating kses.  This mechanism is
  broken down into several components.  This is intended to reduce cache
  thrashing by eliminating most cases where one cpu touches another's
  run queues.
 - kseq_notify() appends a kse to a lockless singly linked list and
   conditionally sends an IPI to the target processor.  Right now this is
   protected by sched_lock but at some point I'd like to get rid of the
   global lock.  This is why I used something more complicated than a
   standard queue.  (A minimal sketch of this lockless list pattern
   follows the message.)
 - kseq_assign() processes our list of kses that have been assigned to us
   by other processors.  This simply calls sched_add() for each item on
   the list after clearing the new KEF_ASSIGNED flag.  This flag is used
   to indicate that we have been appended to the assigned queue but not
   added to the run queue yet.
 - In sched_add(), instead of adding a KSE to another processor's queue
   we use kseq_notify() so that we don't touch their queue.  Also in
   sched_add(), if KEF_ASSIGNED is already set return immediately.  This
   can happen if a thread is removed and readded so that the priority is
   recorded properly.
 - In sched_rem() return immediately if KEF_ASSIGNED is set.  All callers
   immediately readd simply to adjust priorities etc.
 - In sched_choose(), if we're running an IDLE task or the per cpu idle
   thread, set our cpumask bit in 'kseq_idle' so that other processors may
   know that we are idle.  Before this, make a single pass through the
   run queues of other processors so that we may find work more
   immediately if it is available.
 - In sched_runnable(), don't scan each processor's run queue; they will
   IPI us if they have work for us to do.
 - In sched_add(), if we're adding a thread that can be migrated and we
   have plenty of work to do, try to migrate the thread to an idle kseq.
- Simplify the logic in sched_prio() and take the KEF_ASSIGNED flag into
  consideration.
- No longer use kseq_choose() to steal threads; it can lose its last
  argument.
- Create a new function runq_steal() which operates like runq_choose()
  but skips threads based on some criteria.  Currently it will not steal
  PRI_ITHD threads.  In the future this will be used for CPU binding.
- Create a kseq_steal() that checks each run queue with runq_steal(), and
  use kseq_steal() in the places where we used kseq_choose() to steal
  with before.
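The heart of the migration mechanism is the lockless singly linked
"assigned" list manipulated by kseq_notify() and kseq_assign() in the diff
below.  The stand-alone sketch that follows only illustrates that
push/drain pattern, under stated assumptions: C11 <stdatomic.h> stands in
for the kernel's atomic_cmpset_ptr(), and the kse_sketch/kseq_sketch types,
the notify_sketch()/assign_sketch() helpers, and the printf() stand-in for
sched_add() are hypothetical names, not part of the commit.

#include <stdatomic.h>
#include <stdio.h>

struct kse_sketch {
	int			 id;		/* stand-in for the real KSE */
	struct kse_sketch	*assign;	/* next entry on the assigned list */
};

struct kseq_sketch {
	_Atomic(struct kse_sketch *) assigned;	/* head of the lockless list */
};

/*
 * Producer side: push one entry onto the target cpu's assigned list,
 * as kseq_notify() does before conditionally sending the IPI.
 */
static void
notify_sketch(struct kseq_sketch *kseq, struct kse_sketch *ke)
{
	struct kse_sketch *head;

	do {
		head = atomic_load(&kseq->assigned);
		ke->assign = head;
	} while (!atomic_compare_exchange_weak(&kseq->assigned, &head, ke));
	/* The real code would set TDF_NEEDRESCHED and send IPI_AST here. */
}

/*
 * Consumer side: detach the whole list in one shot and hand every entry
 * back to the scheduler, as kseq_assign() does with sched_add().
 */
static void
assign_sketch(struct kseq_sketch *kseq)
{
	struct kse_sketch *ke, *next;

	ke = atomic_exchange(&kseq->assigned, (struct kse_sketch *)NULL);
	for (; ke != NULL; ke = next) {
		next = ke->assign;
		/* KEF_ASSIGNED would be cleared and sched_add() called here. */
		printf("would sched_add() kse %d\n", ke->id);
	}
}

int
main(void)
{
	struct kseq_sketch kseq = { NULL };
	struct kse_sketch a = { 1, NULL };
	struct kse_sketch b = { 2, NULL };

	notify_sketch(&kseq, &a);
	notify_sketch(&kseq, &b);
	assign_sketch(&kseq);	/* prints kse 2, then kse 1 (LIFO) */
	return (0);
}

The consumer drains the whole list with a single atomic exchange (the
committed code loops on atomic_cmpset_ptr() to the same effect), so entries
come off in LIFO order relative to how they were notified; the IPI only
tells the target cpu that its assigned list is worth draining.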
Diffstat (limited to 'sys/kern')
-rw-r--r--	sys/kern/sched_ule.c | 300
1 file changed, 222 insertions, 78 deletions
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index c8c4618..dfef1e0 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
#endif
#include <machine/cpu.h>
+#include <machine/smp.h>
#define KTR_ULE KTR_NFS
@@ -101,6 +102,9 @@ struct ke_sched {
#define ke_ftick ke_sched->ske_ftick
#define ke_ticks ke_sched->ske_ticks
#define ke_cpu ke_sched->ske_cpu
+#define ke_assign ke_procq.tqe_next
+
+#define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */
struct kg_sched {
int skg_slptime; /* Number of ticks we vol. slept */
@@ -211,8 +215,9 @@ struct kseq {
short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
short ksq_nicemin; /* Least nice. */
#ifdef SMP
- int ksq_cpus; /* Count of CPUs in this kseq. */
unsigned int ksq_rslices; /* Slices on run queue */
+ int ksq_cpus; /* Count of CPUs in this kseq. */
+ struct kse *ksq_assigned; /* KSEs assigned by another CPU. */
#endif
};
@@ -220,12 +225,13 @@ struct kseq {
* One kse queue per processor.
*/
#ifdef SMP
-struct kseq kseq_cpu[MAXCPU];
-struct kseq *kseq_idmap[MAXCPU];
+static int kseq_idle;
+static struct kseq kseq_cpu[MAXCPU];
+static struct kseq *kseq_idmap[MAXCPU];
#define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)])
#define KSEQ_CPU(x) (kseq_idmap[(x)])
#else
-struct kseq kseq_cpu;
+static struct kseq kseq_cpu;
#define KSEQ_SELF() (&kseq_cpu)
#define KSEQ_CPU(x) (&kseq_cpu)
#endif
@@ -234,11 +240,10 @@ static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
-void sched_pctcpu_update(struct kse *ke);
-int sched_pickcpu(void);
+static void sched_pctcpu_update(struct kse *ke);
/* Operations on per processor queues */
-static struct kse * kseq_choose(struct kseq *kseq, int steal);
+static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
@@ -246,9 +251,17 @@ static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
-struct kseq * kseq_load_highest(void);
-void kseq_balance(void *arg);
-void kseq_move(struct kseq *from, int cpu);
+#if 0
+static int sched_pickcpu(void);
+#endif
+static struct kse *runq_steal(struct runq *rq);
+static struct kseq *kseq_load_highest(void);
+static void kseq_balance(void *arg);
+static void kseq_move(struct kseq *from, int cpu);
+static int kseq_find(void);
+static void kseq_notify(struct kse *ke, int cpu);
+static void kseq_assign(struct kseq *);
+static struct kse *kseq_steal(struct kseq *kseq);
#endif
void
@@ -359,7 +372,7 @@ kseq_nice_rem(struct kseq *kseq, int nice)
* any approach and so the semi random algorithm below may work as well as any.
*
*/
-void
+static void
kseq_balance(void *arg)
{
struct kseq *kseq;
@@ -396,6 +409,8 @@ kseq_balance(void *arg)
kseq = KSEQ_CPU(high_cpu);
+ high_load = kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME];
/*
* Nothing to do.
*/
@@ -422,7 +437,7 @@ out:
return;
}
-struct kseq *
+static struct kseq *
kseq_load_highest(void)
{
struct kseq *kseq;
@@ -445,18 +460,19 @@ kseq_load_highest(void)
}
kseq = KSEQ_CPU(cpu);
- if (load > kseq->ksq_cpus)
+ if ((kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME]) > kseq->ksq_cpus)
return (kseq);
return (NULL);
}
-void
+static void
kseq_move(struct kseq *from, int cpu)
{
struct kse *ke;
- ke = kseq_choose(from, 1);
+ ke = kseq_steal(from);
runq_remove(ke->ke_runq, ke);
ke->ke_state = KES_THREAD;
kseq_rem(from, ke);
@@ -464,16 +480,126 @@ kseq_move(struct kseq *from, int cpu)
ke->ke_cpu = cpu;
sched_add(ke->ke_thread);
}
-#endif
+
+static int
+kseq_find(void)
+{
+ struct kseq *high;
+
+ if (!smp_started)
+ return (0);
+ if (kseq_idle & PCPU_GET(cpumask))
+ return (0);
+ /*
+ * Find the cpu with the highest load and steal one proc.
+ */
+ if ((high = kseq_load_highest()) == NULL ||
+ high == KSEQ_SELF()) {
+ /*
+ * If we couldn't find one, set ourselves in the
+ * idle map.
+ */
+ atomic_set_int(&kseq_idle, PCPU_GET(cpumask));
+ return (0);
+ }
+ /*
+ * Remove this kse from this kseq and runq and then requeue
+ * on the current processor. We now have a load of one!
+ */
+ kseq_move(high, PCPU_GET(cpuid));
+
+ return (1);
+}
+
+static void
+kseq_assign(struct kseq *kseq)
+{
+ struct kse *nke;
+ struct kse *ke;
+
+ do {
+ ke = kseq->ksq_assigned;
+ } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
+ for (; ke != NULL; ke = nke) {
+ nke = ke->ke_assign;
+ ke->ke_flags &= ~KEF_ASSIGNED;
+ sched_add(ke->ke_thread);
+ }
+}
+
+static void
+kseq_notify(struct kse *ke, int cpu)
+{
+ struct kseq *kseq;
+ struct thread *td;
+ struct pcpu *pcpu;
+
+ ke->ke_flags |= KEF_ASSIGNED;
+
+ kseq = KSEQ_CPU(cpu);
+
+ /*
+ * Place a KSE on another cpu's queue and force a resched.
+ */
+ do {
+ ke->ke_assign = kseq->ksq_assigned;
+ } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
+ pcpu = pcpu_find(cpu);
+ td = pcpu->pc_curthread;
+ if (ke->ke_thread->td_priority < td->td_priority ||
+ td == pcpu->pc_idlethread) {
+ td->td_flags |= TDF_NEEDRESCHED;
+ ipi_selected(1 << cpu, IPI_AST);
+ }
+}
+
+static struct kse *
+runq_steal(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct rqbits *rqb;
+ struct kse *ke;
+ int word;
+ int bit;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ rqb = &rq->rq_status;
+ for (word = 0; word < RQB_LEN; word++) {
+ if (rqb->rqb_bits[word] == 0)
+ continue;
+ for (bit = 0; bit < RQB_BPW; bit++) {
+ if ((rqb->rqb_bits[word] & (1 << bit)) == 0)
+ continue;
+ rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
+ TAILQ_FOREACH(ke, rqh, ke_procq) {
+ if (PRI_BASE(ke->ke_ksegrp->kg_pri_class) !=
+ PRI_ITHD)
+ return (ke);
+ }
+ }
+ }
+ return (NULL);
+}
+
+static struct kse *
+kseq_steal(struct kseq *kseq)
+{
+ struct kse *ke;
+
+ if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
+ return (ke);
+ if ((ke = runq_steal(kseq->ksq_next)) != NULL)
+ return (ke);
+ return (runq_steal(&kseq->ksq_idle));
+}
+#endif /* SMP */
/*
- * Pick the highest priority task we have and return it. If steal is 1 we
- * will return kses that have been denied slices due to their nice being too
- * low. In the future we should prohibit stealing interrupt threads as well.
+ * Pick the highest priority task we have and return it.
*/
-struct kse *
-kseq_choose(struct kseq *kseq, int steal)
+static struct kse *
+kseq_choose(struct kseq *kseq)
{
struct kse *ke;
struct runq *swap;
@@ -499,7 +625,7 @@ kseq_choose(struct kseq *kseq, int steal)
* TIMESHARE kse group and its nice was too far out
* of the range that receives slices.
*/
- if (ke->ke_slice == 0 && steal == 0) {
+ if (ke->ke_slice == 0) {
runq_remove(ke->ke_runq, ke);
sched_slice(ke);
ke->ke_runq = kseq->ksq_next;
@@ -529,6 +655,7 @@ kseq_setup(struct kseq *kseq)
kseq->ksq_load = 0;
#ifdef SMP
kseq->ksq_rslices = 0;
+ kseq->ksq_assigned = NULL;
#endif
}
@@ -716,7 +843,7 @@ sched_rr_interval(void)
return (SCHED_SLICE_MAX);
}
-void
+static void
sched_pctcpu_update(struct kse *ke)
{
/*
@@ -737,7 +864,7 @@ sched_pctcpu_update(struct kse *ke)
ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}
-#ifdef SMP
+#if 0
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
@@ -767,12 +894,6 @@ sched_pickcpu(void)
CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
return (cpu);
}
-#else
-int
-sched_pickcpu(void)
-{
- return (0);
-}
#endif
void
@@ -789,10 +910,8 @@ sched_prio(struct thread *td, u_char prio)
* queue. We still call adjustrunqueue below in case kse
* needs to fix things up.
*/
- if (ke && ((td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
- prio < td->td_ksegrp->kg_user_pri) ||
- (td->td_ksegrp->kg_pri_class == PRI_IDLE &&
- prio < PRI_MIN_IDLE))) {
+ if (ke && (ke->ke_flags & KEF_ASSIGNED) == 0 &&
+ ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
runq_remove(ke->ke_runq, ke);
ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
runq_add(ke->ke_runq, ke);
@@ -917,8 +1036,6 @@ sched_wakeup(struct thread *td)
td->td_slptime = 0;
}
setrunqueue(td);
- if (td->td_priority < curthread->td_priority)
- curthread->td_flags |= TDF_NEEDRESCHED;
}
/*
@@ -1119,30 +1236,16 @@ sched_runnable(void)
mtx_lock_spin(&sched_lock);
kseq = KSEQ_SELF();
-
+#ifdef SMP
+ if (kseq->ksq_assigned)
+ kseq_assign(kseq);
+#endif
if ((curthread->td_flags & TDF_IDLETD) != 0) {
if (kseq->ksq_load > 0)
goto out;
} else
if (kseq->ksq_load - 1 > 0)
goto out;
-#ifdef SMP
- /*
- * For SMP we may steal other processor's KSEs. Just search until we
- * verify that at least on other cpu has a runnable task.
- */
- if (smp_started) {
- int i;
-
- for (i = 0; i < mp_maxid; i++) {
- if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
- continue;
- kseq = KSEQ_CPU(i);
- if (kseq->ksq_load > kseq->ksq_cpus)
- goto out;
- }
- }
-#endif
load = 0;
out:
mtx_unlock_spin(&sched_lock);
@@ -1170,12 +1273,19 @@ sched_choose(void)
struct kse *ke;
mtx_assert(&sched_lock, MA_OWNED);
+ kseq = KSEQ_SELF();
#ifdef SMP
retry:
+ if (kseq->ksq_assigned)
+ kseq_assign(kseq);
#endif
- kseq = KSEQ_SELF();
- ke = kseq_choose(kseq, 0);
+ ke = kseq_choose(kseq);
if (ke) {
+#ifdef SMP
+ if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
+ if (kseq_find())
+ goto retry;
+#endif
runq_remove(ke->ke_runq, ke);
ke->ke_state = KES_THREAD;
@@ -1186,23 +1296,9 @@ retry:
}
return (ke);
}
-
#ifdef SMP
- if (smp_started) {
- /*
- * Find the cpu with the highest load and steal one proc.
- */
- if ((kseq = kseq_load_highest()) == NULL)
- return (NULL);
-
- /*
- * Remove this kse from this kseq and runq and then requeue
- * on the current processor. Then we will dequeue it
- * normally above.
- */
- kseq_move(kseq, PCPU_GET(cpuid));
+ if (kseq_find())
goto retry;
- }
#endif
return (NULL);
@@ -1214,10 +1310,14 @@ sched_add(struct thread *td)
struct kseq *kseq;
struct ksegrp *kg;
struct kse *ke;
+ int class;
+ mtx_assert(&sched_lock, MA_OWNED);
ke = td->td_kse;
kg = td->td_ksegrp;
- mtx_assert(&sched_lock, MA_OWNED);
+ if (ke->ke_flags & KEF_ASSIGNED)
+ return;
+ kseq = KSEQ_SELF();
KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
KASSERT((ke->ke_thread->td_kse != NULL),
("sched_add: No KSE on thread"));
@@ -1229,24 +1329,33 @@ sched_add(struct thread *td)
KASSERT(ke->ke_runq == NULL,
("sched_add: KSE %p is still assigned to a run queue", ke));
-
- switch (PRI_BASE(kg->kg_pri_class)) {
+ class = PRI_BASE(kg->kg_pri_class);
+ switch (class) {
case PRI_ITHD:
case PRI_REALTIME:
- kseq = KSEQ_SELF();
ke->ke_runq = kseq->ksq_curr;
ke->ke_slice = SCHED_SLICE_MAX;
ke->ke_cpu = PCPU_GET(cpuid);
break;
case PRI_TIMESHARE:
- kseq = KSEQ_CPU(ke->ke_cpu);
+#ifdef SMP
+ if (ke->ke_cpu != PCPU_GET(cpuid)) {
+ kseq_notify(ke, ke->ke_cpu);
+ return;
+ }
+#endif
if (SCHED_CURR(kg, ke))
ke->ke_runq = kseq->ksq_curr;
else
ke->ke_runq = kseq->ksq_next;
break;
case PRI_IDLE:
- kseq = KSEQ_CPU(ke->ke_cpu);
+#ifdef SMP
+ if (ke->ke_cpu != PCPU_GET(cpuid)) {
+ kseq_notify(ke, ke->ke_cpu);
+ return;
+ }
+#endif
/*
* This is for priority prop.
*/
@@ -1260,6 +1369,34 @@ sched_add(struct thread *td)
panic("Unknown pri class.\n");
break;
}
+#ifdef SMP
+ /*
+ * If there are any idle processors, give them our extra load.
+ */
+ if (kseq_idle && class != PRI_ITHD &&
+ (kseq->ksq_loads[PRI_IDLE] + kseq->ksq_loads[PRI_TIMESHARE] +
+ kseq->ksq_loads[PRI_REALTIME]) >= kseq->ksq_cpus) {
+ int cpu;
+
+ /*
+ * Multiple cpus could find this bit simultaneously but the
+ * race shouldn't be terrible.
+ */
+ cpu = ffs(kseq_idle);
+ if (cpu) {
+ cpu--;
+ atomic_clear_int(&kseq_idle, 1 << cpu);
+ ke->ke_cpu = cpu;
+ ke->ke_runq = NULL;
+ kseq_notify(ke, cpu);
+ return;
+ }
+ }
+ if (class == PRI_TIMESHARE || class == PRI_REALTIME)
+ atomic_clear_int(&kseq_idle, PCPU_GET(cpumask));
+#endif
+ if (td->td_priority < curthread->td_priority)
+ curthread->td_flags |= TDF_NEEDRESCHED;
ke->ke_ksegrp->kg_runq_kses++;
ke->ke_state = KES_ONRUNQ;
@@ -1275,7 +1412,14 @@ sched_rem(struct thread *td)
struct kse *ke;
ke = td->td_kse;
-
+ /*
+ * It is safe to just return here because sched_rem() is only ever
+ * used in places where we're immediately going to add the
+ * kse back on again. In that case it'll be added with the correct
+ * thread and priority when the caller drops the sched_lock.
+ */
+ if (ke->ke_flags & KEF_ASSIGNED)
+ return;
mtx_assert(&sched_lock, MA_OWNED);
KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));