Diffstat (limited to 'sys/kern/sched_ule.c')
-rw-r--r--  sys/kern/sched_ule.c  330
1 file changed, 182 insertions(+), 148 deletions(-)
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 2065b9f..126c135 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -258,7 +258,6 @@ struct cpu_group *cpu_top; /* CPU topology */
static int rebalance = 1;
static int balance_interval = 128; /* Default set in sched_initticks(). */
static int affinity;
-static int steal_htt = 1;
static int steal_idle = 1;
static int steal_thresh = 2;
@@ -268,6 +267,7 @@ static int steal_thresh = 2;
static struct tdq tdq_cpu[MAXCPU];
static struct tdq *balance_tdq;
static int balance_ticks;
+static DPCPU_DEFINE(uint32_t, randomval);
#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
#define TDQ_CPU(x) (&tdq_cpu[(x)])
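
The per-CPU randomval above feeds a small linear congruential generator that cpu_search() uses later in this diff to break ties between equally loaded CPUs. A minimal userland sketch of that generator (plain static storage standing in for the kernel's DPCPU variable; not the kernel code itself):

#include <stdint.h>
#include <stdio.h>

static uint32_t randomval;              /* kept per CPU (DPCPU) in the kernel */

static int
tiebreak_noise(void)
{
        /* Classic 32-bit LCG step; only the top 6 bits are used. */
        randomval = randomval * 69069 + 5;
        return (randomval >> 26);       /* 0..63 */
}

int
main(void)
{
        for (int i = 0; i < 8; i++)
                printf("%d\n", tiebreak_noise());
        return (0);
}
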
@@ -553,9 +553,11 @@ tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
#ifdef SMP
struct cpu_search {
cpuset_t cs_mask;
- u_int cs_load;
- u_int cs_cpu;
- int cs_limit; /* Min priority for low min load for high. */
+ u_int cs_prefer;
+ int cs_pri; /* Min priority for low. */
+ int cs_limit; /* Max load for low, min load for high. */
+ int cs_cpu;
+ int cs_load;
};
#define CPU_SEARCH_LOWEST 0x1
@@ -566,44 +568,14 @@ struct cpu_search {
for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \
if (CPU_ISSET(cpu, &mask))
-static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
+static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high, const int match);
-int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
-int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
-int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high);
/*
- * This routine compares according to the match argument and should be
- * reduced in actual instantiations via constant propagation and dead code
- * elimination.
- */
-static __inline int
-cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
- const int match)
-{
- struct tdq *tdq;
-
- tdq = TDQ_CPU(cpu);
- if (match & CPU_SEARCH_LOWEST)
- if (CPU_ISSET(cpu, &low->cs_mask) &&
- tdq->tdq_load < low->cs_load &&
- tdq->tdq_lowpri > low->cs_limit) {
- low->cs_cpu = cpu;
- low->cs_load = tdq->tdq_load;
- }
- if (match & CPU_SEARCH_HIGHEST)
- if (CPU_ISSET(cpu, &high->cs_mask) &&
- tdq->tdq_load >= high->cs_limit &&
- tdq->tdq_load > high->cs_load &&
- tdq->tdq_transferable) {
- high->cs_cpu = cpu;
- high->cs_load = tdq->tdq_load;
- }
- return (tdq->tdq_load);
-}
-
-/*
* Search the tree of cpu_groups for the lowest or highest loaded cpu
* according to the match argument. This routine actually compares the
* load on all paths through the tree and finds the least loaded cpu on
@@ -615,33 +587,42 @@ cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
* also recursive to the depth of the tree.
*/
static __inline int
-cpu_search(struct cpu_group *cg, struct cpu_search *low,
+cpu_search(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high, const int match)
{
- int total;
+ struct cpu_search lgroup;
+ struct cpu_search hgroup;
+ cpuset_t cpumask;
+ struct cpu_group *child;
+ struct tdq *tdq;
+ int cpu, i, hload, lload, load, total, rnd;
total = 0;
- if (cg->cg_children) {
- struct cpu_search lgroup;
- struct cpu_search hgroup;
- struct cpu_group *child;
- u_int lload;
- int hload;
- int load;
- int i;
-
- lload = -1;
+ cpumask = cg->cg_mask;
+ if (match & CPU_SEARCH_LOWEST) {
+ lload = INT_MAX;
+ low->cs_load = INT_MAX;
+ lgroup = *low;
+ }
+ if (match & CPU_SEARCH_HIGHEST) {
hload = -1;
- for (i = 0; i < cg->cg_children; i++) {
+ high->cs_load = -1;
+ hgroup = *high;
+ }
+
+ /* Iterate through the child CPU groups and then remaining CPUs. */
+ for (i = 0, cpu = 0; i <= cg->cg_children; ) {
+ if (i >= cg->cg_children) {
+ while (cpu <= mp_maxid && !CPU_ISSET(cpu, &cpumask))
+ cpu++;
+ if (cpu > mp_maxid)
+ break;
+ child = NULL;
+ } else
child = &cg->cg_child[i];
- if (match & CPU_SEARCH_LOWEST) {
- lgroup = *low;
- lgroup.cs_load = -1;
- }
- if (match & CPU_SEARCH_HIGHEST) {
- hgroup = *high;
- lgroup.cs_load = 0;
- }
+
+ if (child) { /* Handle child CPU group. */
+ CPU_NAND(&cpumask, &child->cg_mask);
switch (match) {
case CPU_SEARCH_LOWEST:
load = cpu_search_lowest(child, &lgroup);
@@ -653,23 +634,52 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low,
load = cpu_search_both(child, &lgroup, &hgroup);
break;
}
- total += load;
- if (match & CPU_SEARCH_LOWEST)
- if (load < lload || low->cs_cpu == -1) {
- *low = lgroup;
- lload = load;
+ } else { /* Handle child CPU. */
+ tdq = TDQ_CPU(cpu);
+ load = tdq->tdq_load * 256;
+ rnd = DPCPU_SET(randomval,
+ DPCPU_GET(randomval) * 69069 + 5) >> 26;
+ if (match & CPU_SEARCH_LOWEST) {
+ if (cpu == low->cs_prefer)
+ load -= 64;
+ /* If the CPU is allowed, record its load as a candidate. */
+ if (CPU_ISSET(cpu, &lgroup.cs_mask) &&
+ tdq->tdq_lowpri > lgroup.cs_pri &&
+ tdq->tdq_load <= lgroup.cs_limit) {
+ lgroup.cs_cpu = cpu;
+ lgroup.cs_load = load - rnd;
}
- if (match & CPU_SEARCH_HIGHEST)
- if (load > hload || high->cs_cpu == -1) {
- hload = load;
- *high = hgroup;
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if (CPU_ISSET(cpu, &hgroup.cs_mask) &&
+ tdq->tdq_load >= hgroup.cs_limit &&
+ tdq->tdq_transferable) {
+ hgroup.cs_cpu = cpu;
+ hgroup.cs_load = load - rnd;
}
}
- } else {
- int cpu;
-
- CPUSET_FOREACH(cpu, cg->cg_mask)
- total += cpu_compare(cpu, low, high, match);
+ total += load;
+
+ /* We have the data for this child (group or CPU); compare it. */
+ if (match & CPU_SEARCH_LOWEST) {
+ if ((load < lload) ||
+ (load == lload && lgroup.cs_load < low->cs_load)) {
+ lload = load;
+ low->cs_cpu = lgroup.cs_cpu;
+ low->cs_load = lgroup.cs_load;
+ }
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if ((load > hload) ||
+ (load == hload && hgroup.cs_load > high->cs_load)) {
+ hload = load;
+ high->cs_cpu = hgroup.cs_cpu;
+ high->cs_load = hgroup.cs_load;
+ }
+ if (child)
+ i++;
+ else
+ cpu++;
}
return (total);
}
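
The leaf-CPU weighting introduced in this hunk scales the run-queue length by 256 so that the 64-point preference bonus and the 0..63 random jitter can reorder equally loaded CPUs without ever outweighing one real thread of load. A small sketch with made-up inputs (weighted_load() is illustrative, not a kernel function):

#include <stdio.h>

static int
weighted_load(int nthreads, int preferred, int rnd)
{
        int load = nthreads * 256;      /* one thread == 256 points */

        if (preferred)
                load -= 64;             /* mild stickiness to the preferred CPU */
        return (load - rnd);            /* rnd in 0..63 only reorders ties */
}

int
main(void)
{
        /* Equal real load: preference decides, jitter cannot override it. */
        printf("preferred: %d\n", weighted_load(1, 1, 10));
        printf("other:     %d\n", weighted_load(1, 0, 63));
        /* One extra thread always dominates the adjustments. */
        printf("busier:    %d\n", weighted_load(2, 1, 63));
        return (0);
}
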
@@ -679,19 +689,19 @@ cpu_search(struct cpu_group *cg, struct cpu_search *low,
* optimization.
*/
int
-cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
+cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
{
return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
}
int
-cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
+cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
{
return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
}
int
-cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
struct cpu_search *high)
{
return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
@@ -703,14 +713,16 @@ cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
* acceptable.
*/
static inline int
-sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
+sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+ int prefer)
{
struct cpu_search low;
low.cs_cpu = -1;
- low.cs_load = -1;
+ low.cs_prefer = prefer;
low.cs_mask = mask;
- low.cs_limit = pri;
+ low.cs_pri = pri;
+ low.cs_limit = maxload;
cpu_search_lowest(cg, &low);
return low.cs_cpu;
}
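
The new sched_lowest() arguments act as filters at each leaf CPU: pri is the priority the placed thread must be able to preempt, maxload caps the acceptable queue length, and prefer only biases the weighting. A simplified userland illustration of the eligibility test (struct and function names here are hypothetical):

#include <stdbool.h>
#include <stdio.h>

struct cpu_stat {
        int load;       /* runnable threads on the CPU */
        int lowpri;     /* lowest (best) priority currently running */
};

/*
 * A CPU is a candidate only if the thread being placed would preempt
 * what runs there (lowpri > pri) and the queue is at or below maxload.
 */
static bool
cpu_is_candidate(const struct cpu_stat *c, int pri, int maxload)
{
        return (c->lowpri > pri && c->load <= maxload);
}

int
main(void)
{
        struct cpu_stat busy = { 3, 100 }, idle = { 0, 255 };

        printf("busy: %d, idle: %d\n",
            cpu_is_candidate(&busy, 120, 2), cpu_is_candidate(&idle, 120, 2));
        return (0);
}
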
@@ -719,12 +731,11 @@ sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
* Find the cpu with the highest load via the highest loaded path.
*/
static inline int
-sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
+sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
{
struct cpu_search high;
high.cs_cpu = -1;
- high.cs_load = 0;
high.cs_mask = mask;
high.cs_limit = minload;
cpu_search_highest(cg, &high);
@@ -735,17 +746,17 @@ sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
* Simultaneously find the highest and lowest loaded cpu reachable via
* cg.
*/
-static inline void
-sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
+static inline void
+sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
{
struct cpu_search high;
struct cpu_search low;
low.cs_cpu = -1;
- low.cs_limit = -1;
- low.cs_load = -1;
+ low.cs_prefer = -1;
+ low.cs_pri = -1;
+ low.cs_limit = INT_MAX;
low.cs_mask = mask;
- high.cs_load = 0;
high.cs_cpu = -1;
high.cs_limit = -1;
high.cs_mask = mask;
@@ -758,30 +769,45 @@ sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
static void
sched_balance_group(struct cpu_group *cg)
{
- cpuset_t mask;
- int high;
- int low;
- int i;
+ cpuset_t hmask, lmask;
+ int high, low, anylow;
- CPU_FILL(&mask);
+ CPU_FILL(&hmask);
for (;;) {
- sched_both(cg, mask, &low, &high);
- if (low == high || low == -1 || high == -1)
+ high = sched_highest(cg, hmask, 1);
+ /* Stop if no CPU is left with transferable threads. */
+ if (high == -1)
break;
- if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
+ CPU_CLR(high, &hmask);
+ CPU_COPY(&hmask, &lmask);
+ /* Stop if no CPU is left to receive threads. */
+ if (CPU_EMPTY(&lmask))
break;
- /*
- * If we failed to move any threads determine which cpu
- * to kick out of the set and try again.
- */
- if (TDQ_CPU(high)->tdq_transferable == 0)
- CPU_CLR(high, &mask);
- else
- CPU_CLR(low, &mask);
+ anylow = 1;
+nextlow:
+ low = sched_lowest(cg, lmask, -1,
+ TDQ_CPU(high)->tdq_load - 1, high);
+ /* Stop if the full search found no less loaded CPU. */
+ if (anylow && low == -1)
+ break;
+ /* Go to the next high if this narrowed search found none. */
+ if (low == -1)
+ continue;
+ /* Transfer thread from high to low. */
+ if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) {
+ /* A CPU that received a thread can no longer be a donor. */
+ CPU_CLR(low, &hmask);
+ } else {
+ /*
+ * If the move failed, then high has no threads
+ * that can run on this low. Drop low from the
+ * low mask and look for a different one.
+ */
+ CPU_CLR(low, &lmask);
+ anylow = 0;
+ goto nextlow;
+ }
}
-
- for (i = 0; i < cg->cg_children; i++)
- sched_balance_group(&cg->cg_child[i]);
}
static void
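
The rewritten balancer pairs the most loaded donor with progressively less loaded receivers, dropping a receiver from consideration only when a move into it fails. A simplified userland sketch with made-up loads and a try_move() stub standing in for sched_balance_pair(); it omits the kernel loop's early-exit details:

#include <stdbool.h>
#include <stdio.h>

#define NCPU    4

static int load[NCPU] = { 5, 1, 3, 0 };

/* Stand-in for sched_balance_pair(): may fail to find a movable thread. */
static bool
try_move(int from, int to)
{
        if (load[from] <= load[to])
                return (false);
        load[from]--;
        load[to]++;
        return (true);
}

int
main(void)
{
        bool donor[NCPU], receiver[NCPU];
        int cpu, high, low;

        for (cpu = 0; cpu < NCPU; cpu++)
                donor[cpu] = true;
        for (;;) {
                /* Most loaded CPU that may still donate (sched_highest). */
                high = -1;
                for (cpu = 0; cpu < NCPU; cpu++)
                        if (donor[cpu] && load[cpu] >= 1 &&
                            (high == -1 || load[cpu] > load[high]))
                                high = cpu;
                if (high == -1)
                        break;
                donor[high] = false;
                for (cpu = 0; cpu < NCPU; cpu++)
                        receiver[cpu] = donor[cpu];
                /* Least loaded receiver strictly below the donor. */
                for (;;) {
                        low = -1;
                        for (cpu = 0; cpu < NCPU; cpu++)
                                if (receiver[cpu] &&
                                    load[cpu] <= load[high] - 1 &&
                                    (low == -1 || load[cpu] < load[low]))
                                        low = cpu;
                        if (low == -1)
                                break;
                        if (try_move(high, low)) {
                                /* A receiver can no longer donate. */
                                donor[low] = false;
                                break;
                        }
                        /* Nothing movable fits this receiver; try another. */
                        receiver[low] = false;
                }
        }
        for (cpu = 0; cpu < NCPU; cpu++)
                printf("cpu%d: %d threads\n", cpu, load[cpu]);
        return (0);
}
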
@@ -834,32 +860,17 @@ tdq_unlock_pair(struct tdq *one, struct tdq *two)
static int
sched_balance_pair(struct tdq *high, struct tdq *low)
{
- int transferable;
- int high_load;
- int low_load;
int moved;
- int move;
int cpu;
- int diff;
- int i;
tdq_lock_pair(high, low);
- transferable = high->tdq_transferable;
- high_load = high->tdq_load;
- low_load = low->tdq_load;
moved = 0;
/*
* Determine what the imbalance is and then adjust that to how many
* threads we actually have to give up (transferable).
*/
- if (transferable != 0) {
- diff = high_load - low_load;
- move = diff / 2;
- if (diff & 0x1)
- move++;
- move = min(move, transferable);
- for (i = 0; i < move; i++)
- moved += tdq_move(high, low);
+ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
+ (moved = tdq_move(high, low)) > 0) {
/*
* In case the target isn't the current cpu IPI it to force a
* reschedule with the new workload.
@@ -1003,8 +1014,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start)
{
struct rqbits *rqb;
struct rqhead *rqh;
- struct thread *td;
- int first;
+ struct thread *td, *first;
int bit;
int pri;
int i;
@@ -1012,7 +1022,7 @@ runq_steal_from(struct runq *rq, int cpu, u_char start)
rqb = &rq->rq_status;
bit = start & (RQB_BPW -1);
pri = 0;
- first = 0;
+ first = NULL;
again:
for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
if (rqb->rqb_bits[i] == 0)
@@ -1031,7 +1041,7 @@ again:
if (first && THREAD_CAN_MIGRATE(td) &&
THREAD_CAN_SCHED(td, cpu))
return (td);
- first = 1;
+ first = td;
}
}
if (start != 0) {
@@ -1039,6 +1049,9 @@ again:
goto again;
}
+ if (first && THREAD_CAN_MIGRATE(first) &&
+ THREAD_CAN_SCHED(first, cpu))
+ return (first);
return (NULL);
}
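
The change to runq_steal_from() remembers the first thread seen instead of merely flagging it, so that thread can still be stolen as a last resort when nothing later on the queue qualifies. A simplified sketch of the idea on a plain linked list (it does not model the per-priority scan of the real runq):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct item {
        struct item *next;
        bool eligible;          /* stands in for CAN_MIGRATE/CAN_SCHED */
};

static struct item *
steal_one(struct item *head)
{
        struct item *it, *first = NULL;

        for (it = head; it != NULL; it = it->next) {
                /* Prefer any eligible item after the first one seen. */
                if (first != NULL && it->eligible)
                        return (it);
                if (first == NULL)
                        first = it;     /* remember, do not take yet */
        }
        /* Last resort: the remembered first item, if it qualifies. */
        if (first != NULL && first->eligible)
                return (first);
        return (NULL);
}

int
main(void)
{
        struct item c = { NULL, false }, b = { &c, false }, a = { &b, true };

        printf("stole head as fallback: %d\n", steal_one(&a) == &a);
        return (0);
}
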
@@ -1140,13 +1153,11 @@ SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
static int
sched_pickcpu(struct thread *td, int flags)
{
- struct cpu_group *cg;
+ struct cpu_group *cg, *ccg;
struct td_sched *ts;
struct tdq *tdq;
cpuset_t mask;
- int self;
- int pri;
- int cpu;
+ int cpu, pri, self;
self = PCPU_GET(cpuid);
ts = td->td_sched;
@@ -1161,45 +1172,70 @@ sched_pickcpu(struct thread *td, int flags)
* Prefer to run interrupt threads on the processors that generate
* the interrupt.
*/
+ pri = td->td_priority;
if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
curthread->td_intr_nesting_level && ts->ts_cpu != self) {
SCHED_STAT_INC(pickcpu_intrbind);
ts->ts_cpu = self;
+ if (TDQ_CPU(self)->tdq_lowpri > pri) {
+ SCHED_STAT_INC(pickcpu_affinity);
+ return (ts->ts_cpu);
+ }
}
/*
* If the thread can run on the last cpu and the affinity has not
* expired or it is idle run it there.
*/
- pri = td->td_priority;
tdq = TDQ_CPU(ts->ts_cpu);
- if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
- if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
+ cg = tdq->tdq_cg;
+ if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
+ tdq->tdq_lowpri >= PRI_MIN_IDLE &&
+ SCHED_AFFINITY(ts, CG_SHARE_L2)) {
+ if (cg->cg_flags & CG_FLAG_THREAD) {
+ CPUSET_FOREACH(cpu, cg->cg_mask) {
+ if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
+ break;
+ }
+ } else
+ cpu = INT_MAX;
+ if (cpu > mp_maxid) {
SCHED_STAT_INC(pickcpu_idle_affinity);
return (ts->ts_cpu);
}
- if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
- SCHED_STAT_INC(pickcpu_affinity);
- return (ts->ts_cpu);
- }
}
/*
- * Search for the highest level in the tree that still has affinity.
+ * Search for the last level cache CPU group in the tree.
+ * Skip caches with expired affinity time and SMT groups.
+ * Affinity to higher level caches will be handled less aggressively.
*/
- cg = NULL;
- for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
- if (SCHED_AFFINITY(ts, cg->cg_level))
- break;
+ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
+ if (cg->cg_flags & CG_FLAG_THREAD)
+ continue;
+ if (!SCHED_AFFINITY(ts, cg->cg_level))
+ continue;
+ ccg = cg;
+ }
+ if (ccg != NULL)
+ cg = ccg;
cpu = -1;
+ /* Search the group for the least loaded idle CPU we can run on now. */
mask = td->td_cpuset->cs_mask;
- if (cg)
- cpu = sched_lowest(cg, mask, pri);
+ if (cg != NULL && cg != cpu_top &&
+ CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0)
+ cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE),
+ INT_MAX, ts->ts_cpu);
+ /* Search globally for the least loaded CPU we can run on now. */
+ if (cpu == -1)
+ cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
+ /* Search globally for the least loaded CPU. */
if (cpu == -1)
- cpu = sched_lowest(cpu_top, mask, -1);
+ cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
/*
* Compare the lowest loaded cpu to current cpu.
*/
if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
- TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
+ TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+ TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) {
SCHED_STAT_INC(pickcpu_local);
cpu = self;
} else
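
The selection logic above now cascades through three searches of decreasing strictness: an idle CPU in the last-level cache group, then any CPU the thread could run on immediately, then the least loaded CPU overall. A minimal sketch of that cascade with hypothetical stand-ins for the sched_lowest() calls (return values are made up; -1 means "no match"):

#include <stdio.h>

static int search_cache_group(void)     { return (-1); } /* no idle cache peer */
static int search_global_runnable(void) { return (2); }
static int search_global_any(void)      { return (0); }

static int
pick_cpu(void)
{
        int cpu;

        if ((cpu = search_cache_group()) != -1)
                return (cpu);
        if ((cpu = search_global_runnable()) != -1)
                return (cpu);
        return (search_global_any());
}

int
main(void)
{
        printf("picked cpu %d\n", pick_cpu());
        return (0);
}
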
@@ -2750,8 +2786,6 @@ SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
&balance_interval, 0,
"Average frequency in stathz ticks to run the long-term balancer");
-SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
- "Steals work from another hyper-threaded core on idle");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
"Attempts to steal work from other cores before idling");
SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,