author      julian <julian@FreeBSD.org>  2004-09-01 06:42:02 +0000
committer   julian <julian@FreeBSD.org>  2004-09-01 06:42:02 +0000
commit      8354ba9e3ae8db8ce805e66f768b37de9f172dce (patch)
tree        93ac3864ee5c297785ca19470bbb03069c0240ae
parent      80bf38e921816f690ba642ba76e42b17f5141d66 (diff)
download    FreeBSD-src-8354ba9e3ae8db8ce805e66f768b37de9f172dce.zip
            FreeBSD-src-8354ba9e3ae8db8ce805e66f768b37de9f172dce.tar.gz
Give the 4bsd scheduler the ability to wake up idle processors
when there is new work to be done.

MFC after:	5 days
-rw-r--r--  sys/amd64/amd64/mp_machdep.c  |   2
-rw-r--r--  sys/i386/i386/mp_machdep.c    |   2
-rw-r--r--  sys/i386/include/param.h      |   2
-rw-r--r--  sys/kern/kern_idle.c          |  18
-rw-r--r--  sys/kern/kern_switch.c        |  34
-rw-r--r--  sys/kern/sched_4bsd.c         |  77
-rw-r--r--  sys/kern/subr_smp.c           | 136
-rw-r--r--  sys/sys/smp.h                 |   7
8 files changed, 251 insertions, 27 deletions
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 793a56e..07855ce 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -113,7 +113,6 @@ extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
*/
static u_int logical_cpus;
-static u_int logical_cpus_mask;
/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;
@@ -138,7 +137,6 @@ static int start_all_aps(void);
static int start_ap(int apic_id);
static void release_aps(void *dummy);
-static int hlt_cpus_mask;
static int hlt_logical_cpus;
static struct sysctl_ctx_list logical_cpu_clist;
static u_int bootMP_size;
diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c
index 20a3daf..915e3ad 100644
--- a/sys/i386/i386/mp_machdep.c
+++ b/sys/i386/i386/mp_machdep.c
@@ -183,7 +183,6 @@ volatile int smp_tlb_wait;
*/
static u_int logical_cpus;
-static u_int logical_cpus_mask;
/* used to hold the AP's until we are ready to release them */
static struct mtx ap_boot_mtx;
@@ -209,7 +208,6 @@ static void install_ap_tramp(void);
static int start_ap(int apic_id);
static void release_aps(void *dummy);
-static int hlt_cpus_mask;
static int hlt_logical_cpus;
static struct sysctl_ctx_list logical_cpu_clist;
diff --git a/sys/i386/include/param.h b/sys/i386/include/param.h
index bb9c7a7..90541bf 100644
--- a/sys/i386/include/param.h
+++ b/sys/i386/include/param.h
@@ -98,9 +98,7 @@
#define PDRMASK (NBPDR-1)
/* PREEMPTION exposes scheduler bugs that need to be fixed. */
-#if 0
#define PREEMPTION
-#endif
#define IOPAGES 2 /* pages of i/o permission bitmap */
diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c
index 8af741c..d3d891f 100644
--- a/sys/kern/kern_idle.c
+++ b/sys/kern/kern_idle.c
@@ -36,6 +36,9 @@ __FBSDID("$FreeBSD$");
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/unistd.h>
+#ifdef SMP
+#include <sys/smp.h>
+#endif
static void idle_setup(void *dummy);
SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL)
@@ -96,9 +99,18 @@ idle_proc(void *dummy)
{
struct proc *p;
struct thread *td;
+#ifdef SMP
+ cpumask_t mycpu;
+#endif
td = curthread;
p = td->td_proc;
+#ifdef SMP
+ mycpu = PCPU_GET(cpumask);
+ mtx_lock_spin(&sched_lock);
+ idle_cpus_mask |= mycpu;
+ mtx_unlock_spin(&sched_lock);
+#endif
for (;;) {
mtx_assert(&Giant, MA_NOTOWNED);
@@ -106,7 +118,13 @@ idle_proc(void *dummy)
cpu_idle();
mtx_lock_spin(&sched_lock);
+#ifdef SMP
+ idle_cpus_mask &= ~mycpu;
+#endif
mi_switch(SW_VOL, NULL);
+#ifdef SMP
+ idle_cpus_mask |= mycpu;
+#endif
mtx_unlock_spin(&sched_lock);
}
}
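
The kern_idle.c change above has each idle thread advertise its CPU in idle_cpus_mask while it is idle and clear its bit just before switching away in mi_switch(). Below is a minimal userland sketch of that bookkeeping only, not the kernel code; the CPU ids, the cpu_mask() helper, and the main() driver are assumptions made so the example stands alone.

/*
 * Sketch: maintaining a per-CPU "idle" bitmask like idle_cpus_mask.
 */
#include <stdio.h>

typedef unsigned int cpumask_t;

static cpumask_t idle_cpus_mask;	/* one bit per idle CPU */

static cpumask_t
cpu_mask(int cpuid)
{
	return ((cpumask_t)1 << cpuid);	/* this CPU's bit, like pc_cpumask */
}

static void
enter_idle(int cpuid)
{
	idle_cpus_mask |= cpu_mask(cpuid);	/* advertise: this CPU is idle */
}

static void
leave_idle(int cpuid)
{
	idle_cpus_mask &= ~cpu_mask(cpuid);	/* about to run real work */
}

int
main(void)
{
	enter_idle(0);
	enter_idle(2);
	printf("idle mask: %#x\n", idle_cpus_mask);	/* 0x5: CPUs 0 and 2 idle */
	leave_idle(0);
	printf("idle mask: %#x\n", idle_cpus_mask);	/* 0x4: only CPU 2 idle */
	return (0);
}
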
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
index 54d0025..e36128f 100644
--- a/sys/kern/kern_switch.c
+++ b/sys/kern/kern_switch.c
@@ -89,6 +89,7 @@ reassigned to keep this true.
__FBSDID("$FreeBSD$");
#include "opt_full_preemption.h"
+#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -104,6 +105,10 @@ __FBSDID("$FreeBSD$");
#include <sys/smp.h>
#endif
#include <machine/critical.h>
+#if defined(SMP) && defined(SCHED_4BSD)
+#include <sys/sysctl.h>
+#endif
+
CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
@@ -686,6 +691,12 @@ runq_check(struct runq *rq)
return (0);
}
+#if defined(SMP) && defined(SCHED_4BSD)
+int runq_fuzz = 1;
+SYSCTL_DECL(_kern_sched);
+SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
+#endif
+
/*
* Find the highest priority process on the run queue.
*/
@@ -699,7 +710,28 @@ runq_choose(struct runq *rq)
mtx_assert(&sched_lock, MA_OWNED);
while ((pri = runq_findbit(rq)) != -1) {
rqh = &rq->rq_queues[pri];
- ke = TAILQ_FIRST(rqh);
+#if defined(SMP) && defined(SCHED_4BSD)
+ /* fuzz == 1 is normal.. 0 or less are ignored */
+ if (runq_fuzz > 1) {
+ /*
+ * In the first couple of entries, check if
+ * there is one for our CPU as a preference.
+ */
+ int count = runq_fuzz;
+ int cpu = PCPU_GET(cpuid);
+ struct kse *ke2;
+ ke2 = ke = TAILQ_FIRST(rqh);
+
+ while (count-- && ke2) {
+ if (ke->ke_thread->td_lastcpu == cpu) {
+ ke = ke2;
+ break;
+ }
+ ke2 = TAILQ_NEXT(ke2, ke_procq);
+ }
+ } else
+#endif
+ ke = TAILQ_FIRST(rqh);
KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
CTR3(KTR_RUNQ,
"runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index ae8046a..043514b 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -698,6 +698,10 @@ void
sched_add(struct thread *td, int flags)
{
struct kse *ke;
+#ifdef SMP
+ int forwarded = 0;
+ int cpu;
+#endif
ke = td->td_kse;
mtx_assert(&sched_lock, MA_OWNED);
@@ -711,33 +715,70 @@ sched_add(struct thread *td, int flags)
("sched_add: process swapped out"));
#ifdef SMP
- /*
- * Only try to preempt if the thread is unpinned or pinned to the
- * current CPU.
- */
- if (KSE_CAN_MIGRATE(ke) || ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)])
-#endif
- /*
- * Don't try preempt if we are already switching.
- * all hell might break loose.
- */
- if ((flags & SRQ_YIELDING) == 0)
- if (maybe_preempt(td))
- return;
-
-#ifdef SMP
if (KSE_CAN_MIGRATE(ke)) {
- CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
+ CTR2(KTR_RUNQ,
+ "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
+ cpu = NOCPU;
ke->ke_runq = &runq;
} else {
- CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p)to pcpu runq", ke, td);
if (!SKE_RUNQ_PCPU(ke))
- ke->ke_runq = &runq_pcpu[PCPU_GET(cpuid)];
+ ke->ke_runq = &runq_pcpu[(cpu = PCPU_GET(cpuid))];
+ else
+ cpu = td->td_lastcpu;
+ CTR3(KTR_RUNQ,
+ "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
}
#else
CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td);
ke->ke_runq = &runq;
+
#endif
+ /*
+ * If we are yielding (on the way out anyhow)
+ * or the thread being saved is US,
+ * then don't try to be smart about preemption
+ * or kicking off another CPU,
+ * as it won't help and may hinder.
+ * In the YIELDING case, we are about to run whoever is
+ * being put in the queue anyhow, and in the
+ * OURSELF case, we are putting ourselves on the run queue,
+ * which also only happens when we are about to yield.
+ */
+ if((flags & SRQ_YIELDING) == 0) {
+#ifdef SMP
+ cpumask_t me = PCPU_GET(cpumask);
+ int idle = idle_cpus_mask & me;
+ /*
+ * Only try to kick off another CPU if
+ * the thread is unpinned
+ * or pinned to another cpu,
+ * and there are other available and idle CPUs.
+ * if we are idle, then skip straight to preemption.
+ */
+ if ( (! idle) &&
+ (idle_cpus_mask & ~(hlt_cpus_mask | me)) &&
+ ( KSE_CAN_MIGRATE(ke) ||
+ ke->ke_runq != &runq_pcpu[PCPU_GET(cpuid)])) {
+ forwarded = forward_wakeup(cpu);
+ }
+ /*
+ * If we failed to kick off another cpu, then look to
+ * see if we should preempt this CPU. Only allow this
+ * if it is not pinned or IS pinned to this CPU.
+ * If we are the idle thread, we also try to preempt,
+ * as it will be quicker and, being idle, we won't
+ * lose by doing so.
+ */
+ if ((!forwarded) &&
+ (ke->ke_runq == &runq ||
+ ke->ke_runq == &runq_pcpu[PCPU_GET(cpuid)]))
+#endif
+
+ {
+ if (maybe_preempt(td))
+ return;
+ }
+ }
if ((td->td_proc->p_flag & P_NOLOAD) == 0)
sched_tdcnt++;
runq_add(ke->ke_runq, ke);
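
The rewritten sched_add() above only tries to kick another CPU when the enqueuing thread is not yielding, is not itself the idle thread, at least one other non-halted CPU is idle, and the new thread may run somewhere else; otherwise it falls back to local preemption via maybe_preempt(). Here is a simplified, hedged sketch of that decision only, not the kernel code; the flag parameters, enum names, and example masks are assumptions.

/*
 * Sketch: where should a newly runnable thread cause activity?
 */
#include <stdio.h>

typedef unsigned int cpumask_t;

enum action { RUN_LOCALLY_MAYBE_PREEMPT, KICK_IDLE_CPU, JUST_QUEUE };

static enum action
wakeup_policy(cpumask_t idle_mask, cpumask_t hlt_mask, cpumask_t me,
    int yielding, int can_run_elsewhere)
{
	int i_am_idle = (idle_mask & me) != 0;
	cpumask_t other_idle = idle_mask & ~(hlt_mask | me);

	if (yielding)
		return (JUST_QUEUE);		/* we are switching anyway */
	if (!i_am_idle && other_idle != 0 && can_run_elsewhere)
		return (KICK_IDLE_CPU);		/* forward the wakeup (IPI) */
	return (RUN_LOCALLY_MAYBE_PREEMPT);	/* fall back to preemption here */
}

int
main(void)
{
	/* CPU 0 (mask 0x1) adds work while CPU 2 (bit 0x4) sits idle. */
	enum action a = wakeup_policy(0x4, 0x0, 0x1, 0, 1);

	printf("%s\n", a == KICK_IDLE_CPU ? "kick idle cpu" : "handle locally");
	return (0);
}
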
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index f1b8499..c354c2d 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -49,9 +49,15 @@ __FBSDID("$FreeBSD$");
#include <machine/smp.h>
+#include "opt_sched.h"
+
#ifdef SMP
volatile cpumask_t stopped_cpus;
volatile cpumask_t started_cpus;
+cpumask_t all_cpus;
+cpumask_t idle_cpus_mask;
+cpumask_t hlt_cpus_mask;
+cpumask_t logical_cpus_mask;
void (*cpustop_restartfunc)(void);
#endif
@@ -62,7 +68,6 @@ int mp_maxcpus = MAXCPU;
struct cpu_top *smp_topology;
volatile int smp_started;
-cpumask_t all_cpus;
u_int mp_maxid;
SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD, NULL, "Kernel SMP");
@@ -96,6 +101,46 @@ SYSCTL_INT(_kern_smp, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW,
&forward_roundrobin_enabled, 0,
"Forwarding of roundrobin to all other CPUs");
+#ifdef SCHED_4BSD
+/* Enable forwarding of wakeups to all other cpus */
+SYSCTL_NODE(_kern_smp, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");
+
+static int forward_wakeup_enabled = 0;
+SYSCTL_INT(_kern_smp_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
+ &forward_wakeup_enabled, 0,
+ "Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeups_requested = 0;
+SYSCTL_INT(_kern_smp_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
+ &forward_wakeups_requested, 0,
+ "Requests for Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeups_delivered = 0;
+SYSCTL_INT(_kern_smp_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
+ &forward_wakeups_delivered, 0,
+ "Completed Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeup_use_mask = 0;
+SYSCTL_INT(_kern_smp_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
+ &forward_wakeup_use_mask, 0,
+ "Use the mask of idle cpus");
+
+static int forward_wakeup_use_loop = 0;
+SYSCTL_INT(_kern_smp_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
+ &forward_wakeup_use_loop, 0,
+ "Use a loop to find idle cpus");
+
+static int forward_wakeup_use_single = 0;
+SYSCTL_INT(_kern_smp_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
+ &forward_wakeup_use_single, 0,
+ "Only signal one idle cpu");
+
+static int forward_wakeup_use_htt = 0;
+SYSCTL_INT(_kern_smp_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
+ &forward_wakeup_use_htt, 0,
+ "account for htt");
+
+#endif /* SCHED_4BSD */
/* Variables needed for SMP rendezvous. */
static void (*smp_rv_setup_func)(void *arg);
static void (*smp_rv_action_func)(void *arg);
@@ -203,6 +248,95 @@ forward_roundrobin(void)
ipi_selected(map, IPI_AST);
}
+#ifdef SCHED_4BSD
+/* enable HTT_2 if you have a 2-way HTT cpu.*/
+int
+forward_wakeup(int cpunum)
+{
+ cpumask_t map, me, dontuse;
+ cpumask_t map2;
+ struct pcpu *pc;
+ cpumask_t id, map3;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ CTR0(KTR_SMP, "forward_wakeup()");
+
+ if ((!forward_wakeup_enabled) ||
+ (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
+ return (0);
+ if (!smp_started || cold || panicstr)
+ return (0);
+
+ forward_wakeups_requested++;
+
+/*
+ * check the idle mask we received against what we calculated before
+ * in the old version.
+ */
+ me = PCPU_GET(cpumask);
+ /*
+ * don't bother if we should be doing it ourself..
+ */
+ if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
+ return (0);
+
+ dontuse = me | stopped_cpus | hlt_cpus_mask;
+ map3 = 0;
+ if (forward_wakeup_use_loop) {
+ SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
+ id = pc->pc_cpumask;
+ if ( (id & dontuse) == 0 &&
+ pc->pc_curthread == pc->pc_idlethread) {
+ map3 |= id;
+ }
+ }
+ }
+
+ if (forward_wakeup_use_mask) {
+ map = 0;
+ map = idle_cpus_mask & ~dontuse;
+
+ /* If they are both on, compare and use loop if different */
+ if (forward_wakeup_use_loop) {
+ if (map != map3) {
+ printf("map (%02X) != map3 (%02X)\n",
+ map, map3);
+ map = map3;
+ }
+ }
+ } else {
+ map = map3;
+ }
+ /* If we only allow a specific CPU, then mask off all the others */
+ if (cpunum != NOCPU) {
+ KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
+ map &= (1 << cpunum);
+ } else {
+ /* Try to choose an idle die. */
+ if (forward_wakeup_use_htt) {
+ map2 = (map & (map >> 1)) & 0x5555;
+ if (map2) {
+ map = map2;
+ }
+ }
+
+ /* set only one bit */
+ if (forward_wakeup_use_single) {
+ map = map & ((~map) + 1);
+ }
+ }
+ if (map) {
+ forward_wakeups_delivered++;
+ ipi_selected(map, IPI_AST);
+ return (1);
+ }
+ if (cpunum == NOCPU)
+ printf("forward_wakeup: Idle processor not found\n");
+ return (0);
+}
+#endif /* SCHED_4BSD */
+
/*
* When called the executing CPU will send an IPI to all other CPUs
* requesting that they halt execution.
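
forward_wakeup() above builds a candidate mask of idle, non-halted CPUs and then narrows it with a couple of bit tricks: the "htt2" option keeps only packages whose two hyperthread siblings are both idle, and "onecpu" isolates the lowest set bit so that only a single CPU is signalled. A small hedged sketch of just that mask arithmetic follows; it is not kernel code, and the example mask value is made up.

/*
 * Sketch: the mask arithmetic used when choosing which CPUs to IPI.
 */
#include <stdio.h>

typedef unsigned int cpumask_t;

int
main(void)
{
	cpumask_t map = 0x0b;	/* CPUs 0, 1 and 3 are candidates */

	/*
	 * "htt2": keep only the even bits whose odd sibling is also set,
	 * i.e. 2-way HTT packages where both logical CPUs are idle.
	 */
	cpumask_t both_siblings = (map & (map >> 1)) & 0x5555;

	/* "onecpu": isolate the lowest set bit -- signal a single CPU. */
	cpumask_t lowest = map & (~map + 1);

	printf("map=%#x siblings=%#x lowest=%#x\n", map, both_siblings, lowest);
	/* prints: map=0xb siblings=0x1 lowest=0x1 */
	return (0);
}
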
diff --git a/sys/sys/smp.h b/sys/sys/smp.h
index 12bfbba..af9176f 100644
--- a/sys/sys/smp.h
+++ b/sys/sys/smp.h
@@ -51,11 +51,15 @@ extern volatile cpumask_t started_cpus;
extern volatile cpumask_t stopped_cpus;
#endif /* SMP */
-extern cpumask_t all_cpus;
extern u_int mp_maxid;
extern int mp_ncpus;
extern volatile int smp_started;
+extern cpumask_t all_cpus;
+extern cpumask_t idle_cpus_mask;
+extern cpumask_t hlt_cpus_mask;
+extern cpumask_t logical_cpus_mask;
+
/*
* Macro allowing us to determine whether a CPU is absent at any given
* time, thus permitting us to configure sparse maps of cpuid-dependent
@@ -92,6 +96,7 @@ void cpu_mp_start(void);
void forward_signal(struct thread *);
void forward_roundrobin(void);
+int forward_wakeup(int cpunum);
int restart_cpus(cpumask_t);
int stop_cpus(cpumask_t);
void smp_rendezvous_action(void);