Diffstat (limited to 'sys/kern/sched_ule.c')
-rw-r--r--  sys/kern/sched_ule.c  330
1 file changed, 182 insertions(+), 148 deletions(-)
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 2065b9f..126c135 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -258,7 +258,6 @@ struct cpu_group *cpu_top; /* CPU topology */
static int rebalance = 1;
static int balance_interval = 128; /* Default set in sched_initticks(). */
static int affinity;
-static int steal_htt = 1;
static int steal_idle = 1;
static int steal_thresh = 2;
@@ -268,6 +267,7 @@ static int steal_thresh = 2;
static struct tdq tdq_cpu[MAXCPU];
static struct tdq *balance_tdq;
static int balance_ticks;
+static DPCPU_DEFINE(uint32_t, randomval);
#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
#define TDQ_CPU(x) (&tdq_cpu[(x)])
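
The per-CPU randomval above feeds a small linear congruential generator that cpu_search() uses later in this diff to break ties between equally loaded CPUs. A minimal userland sketch of that generator (plain static storage standing in for the kernel's DPCPU variable; not the kernel code itself):

#include <stdint.h>
#include <stdio.h>

static uint32_t randomval;              /* kept per CPU (DPCPU) in the kernel */

static int
tiebreak_noise(void)
{
        /* Classic 32-bit LCG step; only the top 6 bits are used. */
        randomval = randomval * 69069 + 5;
        return (randomval >> 26);       /* 0..63 */
}

int
main(void)
{
        for (int i = 0; i < 8; i++)
                printf("%d\n", tiebreak_noise());
        return (0);
}
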
@@ -553,9 +553,11 @@ tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
#ifdef SMP
struct cpu_search {
cpuset_t cs_mask;
- u_int cs_load;
- u_int cs_cpu;
- int cs_limit; /* Min priority for low min load for high. */
+ u_int cs_prefer;
+ int cs_pri; /* Min priority for low. */
+ int cs_limit; /* Max load for low, min load for high. */
+ int cs_cpu;
+ int cs_load;
};
#define CPU_SEARCH_LOWEST 0x1
@@ -566,44 +568,14 @@ struct cpu_search {
for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \
if (CPU_ISSET(cpu, &mask))
-static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
+static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high, const int match);
-int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
-int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
-int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high);
/*
- * This routine compares according to the match argument and should be
- * reduced in actual instantiations via constant propagation and dead code
- * elimination.
- */
-static __inline int
-cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
- const int match)
-{
- struct tdq *tdq;
-
- tdq = TDQ_CPU(cpu);
- if (match & CPU_SEARCH_LOWEST)
- if (CPU_ISSET(cpu, &low->cs_mask) &&
- tdq->tdq_load < low->cs_load &&
- tdq->tdq_lowpri > low->cs_limit) {
- low->cs_cpu = cpu;
- low->cs_load = tdq->tdq_load;
- }
- if (match & CPU_SEARCH_HIGHEST)
- if (CPU_ISSET(cpu, &high->cs_mask) &&
- tdq->tdq_load >= high->cs_limit &&
- tdq->tdq_load > high->cs_load &&
- tdq->tdq_transferable) {
- high->cs_cpu = cpu;
- high->cs_load = tdq->tdq_load;
- }
- return (tdq->tdq_load);
-}
-
-/*
* Search the tree of cpu_groups for the lowest or highest loaded cpu
* according to the match argument. This routine actually compares the
* load on all paths through the tree and finds the least loaded cpu on
@@ -615,33 +587,42 @@ cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
* also recursive to the depth of the tree.
*/
static __inline int
-cpu_search(struct cpu_group *cg, struct cpu_search *low,
+cpu_search(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high, const int match)
{
- int total;
+ struct cpu_search lgroup;
+ struct cpu_search hgroup;
+ cpuset_t cpumask;
+ struct cpu_group *child;
+ struct tdq *tdq;
+ int cpu, i, hload, lload, load, total, rnd;
total = 0;
- if (cg->cg_children) {
- struct cpu_search lgroup;
- struct cpu_search hgroup;
- struct cpu_group *child;
- u_int lload;
- int hload;
- int load;
- int i;
-
- lload = -1;
+ cpumask = cg->cg_mask;
+ if (match & CPU_SEARCH_LOWEST) {
+ lload = INT_MAX;
+ low->cs_load = INT_MAX;
+ lgroup = *low;
+ }
+ if (match & CPU_SEARCH_HIGHEST) {
hload = -1;
- for (i = 0; i < cg->cg_children; i++) {
+ high->cs_load = -1;
+ hgroup = *high;
+ }
+
+ /* Iterate through the child CPU groups and then remaining CPUs. */
+ for (i = 0, cpu = 0; i <= cg->cg_children; ) {
+ if (i >= cg->cg_children) {
+ while (cpu <= mp_maxid && !CPU_ISSET(cpu, &cpumask))
+ cpu++;
+ if (cpu > mp_maxid)
+ break;
+ child = NULL;
+ } else
child = &cg->cg_child[i];
- if (match & CPU_SEARCH_LOWEST) {
- lgroup = *low;
- lgroup.cs_load = -1;
- }
- if (match & CPU_SEARCH_HIGHEST) {
- hgroup = *high;
- lgroup.cs_load = 0;
- }
+
+ if (child) { /* Handle child CPU group. */
+ CPU_NAND(&cpumask, &child->cg_mask);
switch (match) {
case CPU_SEARCH_LOWEST:
load = cpu_search_lowest(child, &lgroup);
@@ -653,23 +634,52 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low,
load = cpu_search_both(child, &lgroup, &hgroup);
break;
}
- total += load;
- if (match & CPU_SEARCH_LOWEST)
- if (load < lload || low->cs_cpu == -1) {
- *low = lgroup;
- lload = load;
+ } else { /* Handle child CPU. */
+ tdq = TDQ_CPU(cpu);
+ load = tdq->tdq_load * 256;
+ rnd = DPCPU_SET(randomval,
+ DPCPU_GET(randomval) * 69069 + 5) >> 26;
+ if (match & CPU_SEARCH_LOWEST) {
+ if (cpu == low->cs_prefer)
+ load -= 64;
+ /* If the CPU is allowed, record its load as a candidate. */
+ if (CPU_ISSET(cpu, &lgroup.cs_mask) &&
+ tdq->tdq_lowpri > lgroup.cs_pri &&
+ tdq->tdq_load <= lgroup.cs_limit) {
+ lgroup.cs_cpu = cpu;
+ lgroup.cs_load = load - rnd;
}
- if (match & CPU_SEARCH_HIGHEST)
- if (load > hload || high->cs_cpu == -1) {
- hload = load;
- *high = hgroup;
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if (CPU_ISSET(cpu, &hgroup.cs_mask) &&
+ tdq->tdq_load >= hgroup.cs_limit &&
+ tdq->tdq_transferable) {
+ hgroup.cs_cpu = cpu;
+ hgroup.cs_load = load - rnd;
}
}
- } else {
- int cpu;
-
- CPUSET_FOREACH(cpu, cg->cg_mask)
- total += cpu_compare(cpu, low, high, match);
+ total += load;
+
+ /* We have the data for this child (group or CPU); compare it. */
+ if (match & CPU_SEARCH_LOWEST) {
+ if ((load < lload) ||
+ (load == lload && lgroup.cs_load < low->cs_load)) {
+ lload = load;
+ low->cs_cpu = lgroup.cs_cpu;
+ low->cs_load = lgroup.cs_load;
+ }
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if ((load > hload) ||
+ (load == hload && hgroup.cs_load > high->cs_load)) {
+ hload = load;
+ high->cs_cpu = hgroup.cs_cpu;
+ high->cs_load = hgroup.cs_load;
+ }
+ if (child)
+ i++;
+ else
+ cpu++;
}
return (total);
}
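
The leaf-CPU weighting introduced in this hunk scales the run-queue length by 256 so that the 64-point preference bonus and the 0..63 random jitter can reorder equally loaded CPUs without ever outweighing one real thread of load. A small sketch with made-up inputs (weighted_load() is illustrative, not a kernel function):

#include <stdio.h>

static int
weighted_load(int nthreads, int preferred, int rnd)
{
        int load = nthreads * 256;      /* one thread == 256 points */

        if (preferred)
                load -= 64;             /* mild stickiness to the preferred CPU */
        return (load - rnd);            /* rnd in 0..63 only reorders ties */
}

int
main(void)
{
        /* Equal real load: preference decides, jitter cannot override it. */
        printf("preferred: %d\n", weighted_load(1, 1, 10));
        printf("other:     %d\n", weighted_load(1, 0, 63));
        /* One extra thread always dominates the adjustments. */
        printf("busier:    %d\n", weighted_load(2, 1, 63));
        return (0);
}
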
@@ -679,19 +689,19 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low,
* optimization.
*/
int
-cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
+cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
{
return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
}
int
-cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
+cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
{
return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
}
int
-cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high)
{
return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
@@ -703,14 +713,16 @@ cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
* acceptable.
*/
static inline int
-sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
+sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+ int prefer)
{
struct cpu_search low;
low.cs_cpu = -1;
- low.cs_load = -1;
+ low.cs_prefer = prefer;
low.cs_mask = mask;
- low.cs_limit = pri;
+ low.cs_pri = pri;
+ low.cs_limit = maxload;
cpu_search_lowest(cg, &low);
return low.cs_cpu;
}
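
The new sched_lowest() arguments act as filters at each leaf CPU: pri is the priority the placed thread must be able to preempt, maxload caps the acceptable queue length, and prefer only biases the weighting. A simplified userland illustration of the eligibility test (struct and function names here are hypothetical):

#include <stdbool.h>
#include <stdio.h>

struct cpu_stat {
        int load;       /* runnable threads on the CPU */
        int lowpri;     /* lowest (best) priority currently running */
};

/*
 * A CPU is a candidate only if the thread being placed would preempt
 * what runs there (lowpri > pri) and the queue is at or below maxload.
 */
static bool
cpu_is_candidate(const struct cpu_stat *c, int pri, int maxload)
{
        return (c->lowpri > pri && c->load <= maxload);
}

int
main(void)
{
        struct cpu_stat busy = { 3, 100 }, idle = { 0, 255 };

        printf("busy: %d, idle: %d\n",
            cpu_is_candidate(&busy, 120, 2), cpu_is_candidate(&idle, 120, 2));
        return (0);
}
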
@@ -719,12 +731,11 @@ sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
* Find the cpu with the highest load via the highest loaded path.
*/
static inline int
-sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
+sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
{
struct cpu_search high;
high.cs_cpu = -1;
- high.cs_load = 0;
high.cs_mask = mask;
high.cs_limit = minload;
cpu_search_highest(cg, &high);
@@ -735,17 +746,17 @@ sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
* Simultaneously find the highest and lowest loaded cpu reachable via
* cg.
*/
-static inline void
-sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
+static inline void
+sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
{
struct cpu_search high;
struct cpu_search low;
low.cs_cpu = -1;
- low.cs_limit = -1;
- low.cs_load = -1;
+ low.cs_prefer = -1;
+ low.cs_pri = -1;
+ low.cs_limit = INT_MAX;
low.cs_mask = mask;
- high.cs_load = 0;
high.cs_cpu = -1;
high.cs_limit = -1;
high.cs_mask = mask;
@@ -758,30 +769,45 @@ sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
static void
sched_balance_group(struct cpu_group *cg)
{
- cpuset_t mask;
- int high;
- int low;
- int i;
+ cpuset_t hmask, lmask;
+ int high, low, anylow;
- CPU_FILL(&mask);
+ CPU_FILL(&hmask);
for (;;) {
- sched_both(cg, mask, &low, &high);
- if (low == high || low == -1 || high == -1)
+ high = sched_highest(cg, hmask, 1);
+ /* Stop if no CPU is left with transferable threads. */
+ if (high == -1)
break;
- if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
+ CPU_CLR(high, &hmask);
+ CPU_COPY(&hmask, &lmask);
+ /* Stop if no CPU is left to receive threads. */
+ if (CPU_EMPTY(&lmask))
break;
- /*
- * If we failed to move any threads determine which cpu
- * to kick out of the set and try again.
- */
- if (TDQ_CPU(high)->tdq_transferable == 0)
- CPU_CLR(high, &mask);
- else
- CPU_CLR(low, &mask);
+ anylow = 1;
+nextlow:
+ low = sched_lowest(cg, lmask, -1,
+ TDQ_CPU(high)->tdq_load - 1, high);
+ /* Stop if the full search found no less loaded CPU. */
+ if (anylow && low == -1)
+ break;
+ /* Go to the next high if this narrowed search found none. */
+ if (low == -1)
+ continue;
+ /* Transfer thread from high to low. */
+ if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) {
+ /* A CPU that received a thread can no longer be a donor. */
+ CPU_CLR(low, &hmask);
+ } else {
+ /*
+ * If the move failed, then high has no threads
+ * that can run on this low. Drop low from the
+ * low mask and look for a different one.
+ */
+ CPU_CLR(low, &lmask);
+ anylow = 0;
+ goto nextlow;
+ }
}
-
- for (i = 0; i < cg->cg_children; i++)
- sched_balance_group(&cg->cg_child[i]);
}
static void
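
The rewritten balancer pairs the most loaded donor with progressively less loaded receivers, dropping a receiver from consideration only when a move into it fails. A simplified userland sketch with made-up loads and a try_move() stub standing in for sched_balance_pair(); it omits the kernel loop's early-exit details:

#include <stdbool.h>
#include <stdio.h>

#define NCPU    4

static int load[NCPU] = { 5, 1, 3, 0 };

/* Stand-in for sched_balance_pair(): may fail to find a movable thread. */
static bool
try_move(int from, int to)
{
        if (load[from] <= load[to])
                return (false);
        load[from]--;
        load[to]++;
        return (true);
}

int
main(void)
{
        bool donor[NCPU], receiver[NCPU];
        int cpu, high, low;

        for (cpu = 0; cpu < NCPU; cpu++)
                donor[cpu] = true;
        for (;;) {
                /* Most loaded CPU that may still donate (sched_highest). */
                high = -1;
                for (cpu = 0; cpu < NCPU; cpu++)
                        if (donor[cpu] && load[cpu] >= 1 &&
                            (high == -1 || load[cpu] > load[high]))
                                high = cpu;
                if (high == -1)
                        break;
                donor[high] = false;
                for (cpu = 0; cpu < NCPU; cpu++)
                        receiver[cpu] = donor[cpu];
                /* Least loaded receiver strictly below the donor. */
                for (;;) {
                        low = -1;
                        for (cpu = 0; cpu < NCPU; cpu++)
                                if (receiver[cpu] &&
                                    load[cpu] <= load[high] - 1 &&
                                    (low == -1 || load[cpu] < load[low]))
                                        low = cpu;
                        if (low == -1)
                                break;
                        if (try_move(high, low)) {
                                /* A receiver can no longer donate. */
                                donor[low] = false;
                                break;
                        }
                        /* Nothing movable fits this receiver; try another. */
                        receiver[low] = false;
                }
        }
        for (cpu = 0; cpu < NCPU; cpu++)
                printf("cpu%d: %d threads\n", cpu, load[cpu]);
        return (0);
}
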
@@ -834,32 +860,17 @@ tdq_unlock_pair(struct tdq *one, struct tdq *two)
static int
sched_balance_pair(struct tdq *high, struct tdq *low)
{
- int transferable;
- int high_load;
- int low_load;
int moved;
- int move;
int cpu;
- int diff;
- int i;
tdq_lock_pair(high, low);
- transferable = high->tdq_transferable;
- high_load = high->tdq_load;
- low_load = low->tdq_load;
moved = 0;
/*
* Determine what the imbalance is and then adjust that to how many
* threads we actually have to give up (transferable).
*/
- if (transferable != 0) {
- diff = high_load - low_load;
- move = diff / 2;
- if (diff & 0x1)
- move++;
- move = min(move, transferable);
- for (i = 0; i < move; i++)
- moved += tdq_move(high, low);
+ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
+ (moved = tdq_move(high, low)) > 0) {
/*
* In case the target isn't the current cpu IPI it to force a
* reschedule with the new workload.
@@ -1003,8 +1014,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start)
{
struct rqbits *rqb;
struct rqhead *rqh;
- struct thread *td;
- int first;
+ struct thread *td, *first;
int bit;
int pri;
int i;
@@ -1012,7 +1022,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start)
rqb = &rq->rq_status;
bit = start & (RQB_BPW -1);
pri = 0;
- first = 0;
+ first = NULL;
again:
for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
if (rqb->rqb_bits[i] == 0)
@@ -1031,7 +1041,7 @@ again:
if (first && THREAD_CAN_MIGRATE(td) &&
THREAD_CAN_SCHED(td, cpu))
return (td);
- first = 1;
+ first = td;
}
}
if (start != 0) {
@@ -1039,6 +1049,9 @@ again:
goto again;
}
+ if (first && THREAD_CAN_MIGRATE(first) &&
+ THREAD_CAN_SCHED(first, cpu))
+ return (first);
return (NULL);
}
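
The change to runq_steal_from() remembers the first thread seen instead of merely flagging it, so that thread can still be stolen as a last resort when nothing later on the queue qualifies. A simplified sketch of the idea on a plain linked list (it does not model the per-priority scan of the real runq):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct item {
        struct item *next;
        bool eligible;          /* stands in for CAN_MIGRATE/CAN_SCHED */
};

static struct item *
steal_one(struct item *head)
{
        struct item *it, *first = NULL;

        for (it = head; it != NULL; it = it->next) {
                /* Prefer any eligible item after the first one seen. */
                if (first != NULL && it->eligible)
                        return (it);
                if (first == NULL)
                        first = it;     /* remember, do not take yet */
        }
        /* Last resort: the remembered first item, if it qualifies. */
        if (first != NULL && first->eligible)
                return (first);
        return (NULL);
}

int
main(void)
{
        struct item c = { NULL, false }, b = { &c, false }, a = { &b, true };

        printf("stole head as fallback: %d\n", steal_one(&a) == &a);
        return (0);
}
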
@@ -1140,13 +1153,11 @@ SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
static int
sched_pickcpu(struct thread *td, int flags)
{
- struct cpu_group *cg;
+ struct cpu_group *cg, *ccg;
struct td_sched *ts;
struct tdq *tdq;
cpuset_t mask;
- int self;
- int pri;
- int cpu;
+ int cpu, pri, self;
self = PCPU_GET(cpuid);
ts = td->td_sched;
@@ -1161,45 +1172,70 @@ sched_pickcpu(struct thread *td, int flags)
* Prefer to run interrupt threads on the processors that generate
* the interrupt.
*/
+ pri = td->td_priority;
if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
curthread->td_intr_nesting_level && ts->ts_cpu != self) {
SCHED_STAT_INC(pickcpu_intrbind);
ts->ts_cpu = self;
+ if (TDQ_CPU(self)->tdq_lowpri > pri) {
+ SCHED_STAT_INC(pickcpu_affinity);
+ return (ts->ts_cpu);
+ }
}
/*
* If the thread can run on the last cpu and the affinity has not
* expired or it is idle run it there.
*/
- pri = td->td_priority;
tdq = TDQ_CPU(ts->ts_cpu);
- if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
- if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
+ cg = tdq->tdq_cg;
+ if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
+ tdq->tdq_lowpri >= PRI_MIN_IDLE &&
+ SCHED_AFFINITY(ts, CG_SHARE_L2)) {
+ if (cg->cg_flags & CG_FLAG_THREAD) {
+ CPUSET_FOREACH(cpu, cg->cg_mask) {
+ if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
+ break;
+ }
+ } else
+ cpu = INT_MAX;
+ if (cpu > mp_maxid) {
SCHED_STAT_INC(pickcpu_idle_affinity);
return (ts->ts_cpu);
}
- if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
- SCHED_STAT_INC(pickcpu_affinity);
- return (ts->ts_cpu);
- }
}
/*
- * Search for the highest level in the tree that still has affinity.
+ * Search for the last level cache CPU group in the tree.
+ * Skip caches with expired affinity time and SMT groups.
+ * Affinity to higher level caches will be handled less aggressively.
*/
- cg = NULL;
- for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
- if (SCHED_AFFINITY(ts, cg->cg_level))
- break;
+ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
+ if (cg->cg_flags & CG_FLAG_THREAD)
+ continue;
+ if (!SCHED_AFFINITY(ts, cg->cg_level))
+ continue;
+ ccg = cg;
+ }
+ if (ccg != NULL)
+ cg = ccg;
cpu = -1;
+ /* Search the group for the least loaded idle CPU we can run on now. */
mask = td->td_cpuset->cs_mask;
- if (cg)
- cpu = sched_lowest(cg, mask, pri);
+ if (cg != NULL && cg != cpu_top &&
+ CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0)
+ cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE),
+ INT_MAX, ts->ts_cpu);
+ /* Search globally for the least loaded CPU we can run on now. */
+ if (cpu == -1)
+ cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
+ /* Search globally for the least loaded CPU. */
if (cpu == -1)
- cpu = sched_lowest(cpu_top, mask, -1);
+ cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
/*
* Compare the lowest loaded cpu to current cpu.
*/
if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
- TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
+ TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+ TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) {
SCHED_STAT_INC(pickcpu_local);
cpu = self;
} else
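
The selection logic above now cascades through three searches of decreasing strictness: an idle CPU in the last-level cache group, then any CPU the thread could run on immediately, then the least loaded CPU overall. A minimal sketch of that cascade with hypothetical stand-ins for the sched_lowest() calls (return values are made up; -1 means "no match"):

#include <stdio.h>

static int search_cache_group(void)     { return (-1); } /* no idle cache peer */
static int search_global_runnable(void) { return (2); }
static int search_global_any(void)      { return (0); }

static int
pick_cpu(void)
{
        int cpu;

        if ((cpu = search_cache_group()) != -1)
                return (cpu);
        if ((cpu = search_global_runnable()) != -1)
                return (cpu);
        return (search_global_any());
}

int
main(void)
{
        printf("picked cpu %d\n", pick_cpu());
        return (0);
}
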
@@ -2750,8 +2786,6 @@ SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
&balance_interval, 0,
"Average frequency in stathz ticks to run the long-term balancer");
-SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
- "Steals work from another hyper-threaded core on idle");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
"Attempts to steal work from other cores before idling");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,