author     jeff <jeff@FreeBSD.org>    2007-01-19 21:56:08 +0000
committer  jeff <jeff@FreeBSD.org>    2007-01-19 21:56:08 +0000
commit     a5cccc05cbfa3300d92aad172b2c031250f812df (patch)
tree       13f9cc281dab6a18057702f0afe7b15becb84744
parent     c8eb91c205bbb209a1d0c28f1011f4a9d3f20ab3 (diff)
Major revamp of ULE's cpu load balancing:
- Switch back to direct modification of remote CPU run queues; the old
  indirect assignment mechanism added a lot of complexity for questionable
  gain. It's easy enough to reimplement if it's shown to help on huge
  machines.
- Re-implement the old tdq_transfer() call as tdq_pickidle(). Change
  sched_add() so we have selectable CPU choosers, and simplify the logic a
  bit there.
- Implement tdq_pickpri() as the new default CPU chooser. The algorithm is
  similar to Solaris's in that it tries to always run the threads with the
  best priorities. It is slightly more complex than Solaris's because we
  also tend to favor the local CPU over other CPUs, which can add a bit of
  latency but also potentially enables cache sharing between the waking
  thread and the woken thread. (A simplified model of its fallback scan
  appears just before the diff below.)
- Add a number of tunables that can be used to measure the effects of
  different load-balancing strategies. Most of these will go away once the
  algorithm is more settled.
- Add a new mechanism to steal threads from busy CPUs when we idle. This is
  enabled with kern.sched.steal_busy and kern.sched.busy_thresh. The
  threshold is the required length of a tdq's run queue before another CPU
  is able to steal its runnable threads. This prevents most of the queue
  imbalances that contribute to long latencies. (See the sketch immediately
  following this message.)
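The busy-steal mechanism is the most self-contained of these changes, so here
is a minimal userland model of it. This is a sketch under assumed names
(cpu_queue, busy_mask, BUSY_THRESH, pick_steal_target), not the kernel code;
the real logic lives in the tdq_runq_add()/tdq_runq_rem() and tdq_idled()
hunks of the diff below. What it demonstrates: crossing the threshold
publishes the CPU in a shared bitmask, and an idling CPU finds a victim with
a single ffs() over that mask.

/*
 * Toy userland model of the kern.sched.busy_thresh / tdq_busy idea.
 * All names here are illustrative; the real code operates on struct tdq
 * in sys/kern/sched_ule.c.
 */
#include <stdio.h>
#include <strings.h>			/* ffs() */

#define NCPU		4
#define BUSY_THRESH	4		/* default of kern.sched.busy_thresh */

struct cpu_queue {
	int transferable;		/* runnable threads allowed to migrate */
};

static struct cpu_queue queues[NCPU];
static unsigned int busy_mask;		/* bit N set => CPU N's queue is busy */

/* A transferable thread was enqueued on 'cpu' (cf. tdq_runq_add()). */
static void
queue_add(int cpu)
{
	queues[cpu].transferable++;
	if (queues[cpu].transferable >= BUSY_THRESH)
		busy_mask |= 1u << cpu;		/* atomic_set_int() in the kernel */
}

/* A transferable thread was dequeued from 'cpu' (cf. tdq_runq_rem()). */
static void
queue_rem(int cpu)
{
	queues[cpu].transferable--;
	if (queues[cpu].transferable < BUSY_THRESH)
		busy_mask &= ~(1u << cpu);	/* atomic_clear_int() in the kernel */
}

/* An idling CPU looks for a busy peer to steal from; -1 means none. */
static int
pick_steal_target(void)
{
	int bit = ffs(busy_mask);

	return (bit == 0 ? -1 : bit - 1);
}

int
main(void)
{
	int i;

	for (i = 0; i < 5; i++)		/* CPU 2 builds up a backlog */
		queue_add(2);
	printf("steal target: %d\n", pick_steal_target());	/* 2 */
	for (i = 0; i < 3; i++)
		queue_rem(2);
	printf("steal target: %d\n", pick_steal_target());	/* -1 */
	return (0);
}

Publishing busyness in a single word keeps the idle path cheap, one ffs()
over the mask, at the price of the mask being briefly stale; the kernel side
tolerates that because tdq_idled() re-checks the victim's tdq_transferable
count before stealing.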
-rw-r--r--  sys/kern/sched_ule.c  | 527
1 file changed, 290 insertions, 237 deletions
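For the tdq_pickpri() policy referenced above, the hardest part to read in
diff form is the final fallback scan, which picks the CPU running the
lowest-priority thread and breaks ties by run-queue load. The sketch below
models only that scan, using made-up arrays (cur_pri, cur_load); the kernel
walks pcpu and tdq structures instead, and tries the last CPU, affinity, the
local CPU, and idle groups before falling back to this loop.

/*
 * Simplified model of the tdq_pickpri() fallback scan: among all CPUs,
 * prefer the one running the least important (numerically highest priority)
 * thread, breaking ties by the lightest run-queue load.
 */
#include <stdio.h>

#define NCPU 4

static int cur_pri[NCPU]  = { 20, 120, 120, 90 };	/* priority of curthread */
static int cur_load[NCPU] = {  3,   2,   5,  1 };	/* tdq_load per CPU */

static int
pick_lowest_pri_cpu(void)
{
	int cpu, lowcpu = 0, lowpri = 0, lowload = 0;

	for (cpu = 0; cpu < NCPU; cpu++) {
		/* Higher numeric priority means less important in FreeBSD. */
		if (cur_pri[cpu] < lowpri)
			continue;
		if (lowpri && lowpri == cur_pri[cpu] && cur_load[cpu] > lowload)
			continue;
		lowpri = cur_pri[cpu];
		lowcpu = cpu;
		lowload = cur_load[cpu];
	}
	return (lowcpu);
}

int
main(void)
{
	/* CPUs 1 and 2 both run pri-120 threads; CPU 1 wins on lower load. */
	printf("chosen cpu: %d\n", pick_lowest_pri_cpu());	/* 1 */
	return (0);
}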
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index b96338a..363ba41 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -80,17 +80,17 @@ struct td_sched {
int ts_ltick; /* Last tick that we were running on */
int ts_ftick; /* First tick that we were running on */
int ts_ticks; /* Tick count */
+#ifdef SMP
+ int ts_rltick; /* Real last tick, for affinity. */
+#endif
/* originally from kg_sched */
int skg_slptime; /* Number of ticks we vol. slept */
int skg_runtime; /* Number of ticks we were running */
};
-#define ts_assign ts_procq.tqe_next
/* flags kept in ts_flags */
-#define TSF_ASSIGNED 0x0001 /* Thread is being migrated. */
-#define TSF_BOUND 0x0002 /* Thread can not migrate. */
-#define TSF_XFERABLE 0x0004 /* Thread was added as transferable. */
-#define TSF_REMOVED 0x0008 /* Thread was removed while ASSIGNED */
+#define TSF_BOUND 0x0001 /* Thread can not migrate. */
+#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */
#define TSF_DIDRUN 0x2000 /* Thread actually ran. */
static struct td_sched td_sched0;
@@ -163,7 +163,6 @@ static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
static int sched_slice;
-static int sched_rebalance = 1;
/*
* tdq - per processor runqs and statistics.
@@ -175,16 +174,18 @@ struct tdq {
int tdq_idx; /* Current insert index. */
int tdq_ridx; /* Current removal index. */
int tdq_load; /* Aggregate load. */
+ int tdq_flags; /* Thread queue flags */
#ifdef SMP
int tdq_transferable;
LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */
struct tdq_group *tdq_group; /* Our processor group. */
- volatile struct td_sched *tdq_assigned; /* assigned by another CPU. */
#else
int tdq_sysload; /* For loadavg, !ITHD load. */
#endif
};
+#define TDQF_BUSY 0x0001 /* Queue is marked as busy */
+
#ifdef SMP
/*
* tdq groups are groups of processors which can cheaply share threads. When
@@ -203,13 +204,30 @@ struct tdq_group {
int tdg_transferable; /* Transferable load of this group. */
LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */
};
-#endif
+
+#define SCHED_AFFINITY_DEFAULT (hz / 100)
+#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity)
+
+/*
+ * Run-time tunables.
+ */
+static int rebalance = 1;
+static int pick_pri = 1;
+static int affinity;
+static int tryself = 1;
+static int tryselfidle = 1;
+static int ipi_ast = 0;
+static int ipi_preempt = 1;
+static int ipi_thresh = PRI_MIN_KERN;
+static int steal_htt = 1;
+static int steal_busy = 1;
+static int busy_thresh = 4;
/*
* One thread queue per processor.
*/
-#ifdef SMP
-static cpumask_t tdq_idle;
+static volatile cpumask_t tdq_idle;
+static volatile cpumask_t tdq_busy;
static int tdg_maxid;
static struct tdq tdq_cpu[MAXCPU];
static struct tdq_group tdq_groups[MAXCPU];
@@ -248,21 +266,20 @@ static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
#ifdef SMP
-static int tdq_transfer(struct tdq *, struct td_sched *, int);
+static int tdq_pickidle(struct tdq *, struct td_sched *);
+static int tdq_pickpri(struct tdq *, struct td_sched *, int);
static struct td_sched *runq_steal(struct runq *);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct tdq_group *);
static void sched_balance_pair(struct tdq *, struct tdq *);
-static void sched_smp_tick(void);
+static void sched_smp_tick(struct thread *);
static void tdq_move(struct tdq *, int);
static int tdq_idled(struct tdq *);
-static void tdq_notify(struct td_sched *, int);
-static void tdq_assign(struct tdq *);
+static void tdq_notify(struct td_sched *);
static struct td_sched *tdq_steal(struct tdq *, int);
-#define THREAD_CAN_MIGRATE(td) \
- ((td)->td_pinned == 0 && (td)->td_pri_class != PRI_ITHD)
+#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
#endif
static void sched_setup(void *dummy);
@@ -337,6 +354,11 @@ tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
tdq->tdq_transferable++;
tdq->tdq_group->tdg_transferable++;
ts->ts_flags |= TSF_XFERABLE;
+ if (tdq->tdq_transferable >= busy_thresh &&
+ (tdq->tdq_flags & TDQF_BUSY) == 0) {
+ tdq->tdq_flags |= TDQF_BUSY;
+ atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq));
+ }
}
#endif
if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -376,6 +398,11 @@ tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
tdq->tdq_transferable--;
tdq->tdq_group->tdg_transferable--;
ts->ts_flags &= ~TSF_XFERABLE;
+ if (tdq->tdq_transferable < busy_thresh &&
+ (tdq->tdq_flags & TDQF_BUSY)) {
+ atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq));
+ tdq->tdq_flags &= ~TDQF_BUSY;
+ }
}
#endif
if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -402,7 +429,8 @@ tdq_load_add(struct tdq *tdq, struct td_sched *ts)
class = PRI_BASE(ts->ts_thread->td_pri_class);
tdq->tdq_load++;
CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
- if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
+ if (class != PRI_ITHD &&
+ (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
tdq->tdq_group->tdg_load++;
#else
@@ -416,7 +444,8 @@ tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
int class;
mtx_assert(&sched_lock, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class);
- if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
+ if (class != PRI_ITHD &&
+ (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
tdq->tdq_group->tdg_load--;
#else
@@ -429,23 +458,18 @@ tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
#ifdef SMP
static void
-sched_smp_tick(void)
+sched_smp_tick(struct thread *td)
{
struct tdq *tdq;
tdq = TDQ_SELF();
- if (sched_rebalance) {
+ if (rebalance) {
if (ticks >= bal_tick)
sched_balance();
if (ticks >= gbal_tick && balance_groups)
sched_balance_groups();
}
- /*
- * We could have been assigned a non real-time thread without an
- * IPI.
- */
- if (tdq->tdq_assigned)
- tdq_assign(tdq); /* Potentially sets NEEDRESCHED */
+ td->td_sched->ts_rltick = ticks;
}
/*
@@ -599,10 +623,11 @@ tdq_move(struct tdq *from, int cpu)
}
if (tdq == to)
return;
- ts->ts_state = TSS_THREAD;
- tdq_runq_rem(tdq, ts);
- tdq_load_rem(tdq, ts);
- tdq_notify(ts, cpu);
+ sched_rem(ts->ts_thread);
+ ts->ts_cpu = cpu;
+ sched_pin_td(ts->ts_thread);
+ sched_add(ts->ts_thread, SRQ_YIELDING);
+ sched_unpin_td(ts->ts_thread);
}
static int
@@ -617,21 +642,34 @@ tdq_idled(struct tdq *tdq)
* If we're in a cpu group, try and steal threads from another cpu in
* the group before idling.
*/
- if (tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
+ if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
if (steal == tdq || steal->tdq_transferable == 0)
continue;
ts = tdq_steal(steal, 0);
+ if (ts)
+ goto steal;
+ }
+ }
+ if (steal_busy) {
+ while (tdq_busy) {
+ int cpu;
+
+ cpu = ffs(tdq_busy);
+ if (cpu == 0)
+ break;
+ cpu--;
+ steal = TDQ_CPU(cpu);
+ if (steal->tdq_transferable == 0)
+ continue;
+ ts = tdq_steal(steal, 1);
if (ts == NULL)
continue;
- ts->ts_state = TSS_THREAD;
- tdq_runq_rem(steal, ts);
- tdq_load_rem(steal, ts);
- ts->ts_cpu = PCPU_GET(cpuid);
- sched_pin_td(ts->ts_thread);
- sched_add(ts->ts_thread, SRQ_YIELDING);
- sched_unpin_td(ts->ts_thread);
- return (0);
+ CTR5(KTR_SCHED,
+ "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X",
+ ts->ts_thread, ts->ts_thread->td_proc->p_comm,
+ ts->ts_thread->td_priority, cpu, tdq_busy);
+ goto steal;
}
}
/*
@@ -640,79 +678,51 @@ tdq_idled(struct tdq *tdq)
* back and forth between two idle cores on seperate physical CPUs.
*/
tdg->tdg_idlemask |= PCPU_GET(cpumask);
- if (tdg->tdg_idlemask != tdg->tdg_cpumask)
- return (1);
- atomic_set_int(&tdq_idle, tdg->tdg_mask);
+ if (tdg->tdg_idlemask == tdg->tdg_cpumask)
+ atomic_set_int(&tdq_idle, tdg->tdg_mask);
return (1);
-}
-
-static void
-tdq_assign(struct tdq *tdq)
-{
- struct td_sched *nts;
- struct td_sched *ts;
+steal:
+ sched_rem(ts->ts_thread);
+ ts->ts_cpu = PCPU_GET(cpuid);
+ sched_pin_td(ts->ts_thread);
+ sched_add(ts->ts_thread, SRQ_YIELDING);
+ sched_unpin_td(ts->ts_thread);
- do {
- *(volatile struct td_sched **)&ts = tdq->tdq_assigned;
- } while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
- (uintptr_t)ts, (uintptr_t)NULL));
- for (; ts != NULL; ts = nts) {
- nts = ts->ts_assign;
- tdq->tdq_group->tdg_load--;
- tdq->tdq_load--;
- ts->ts_flags &= ~TSF_ASSIGNED;
- if (ts->ts_flags & TSF_REMOVED) {
- ts->ts_flags &= ~TSF_REMOVED;
- continue;
- }
- sched_pin_td(ts->ts_thread);
- sched_add(ts->ts_thread, SRQ_YIELDING);
- sched_unpin_td(ts->ts_thread);
- }
+ return (0);
}
static void
-tdq_notify(struct td_sched *ts, int cpu)
+tdq_notify(struct td_sched *ts)
{
- struct tdq *tdq;
struct thread *td;
struct pcpu *pcpu;
- int class;
int prio;
+ int cpu;
- tdq = TDQ_CPU(cpu);
- class = PRI_BASE(ts->ts_thread->td_pri_class);
- if ((class != PRI_IDLE && class != PRI_ITHD)
- && (tdq_idle & tdq->tdq_group->tdg_mask))
- atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
- tdq->tdq_group->tdg_load++;
- tdq->tdq_load++;
- ts->ts_cpu = cpu;
- ts->ts_flags |= TSF_ASSIGNED;
prio = ts->ts_thread->td_priority;
-
+ cpu = ts->ts_cpu;
+ pcpu = pcpu_find(cpu);
+ td = pcpu->pc_curthread;
/*
- * Place a thread on another cpu's queue and force a resched.
+ * IPI if we exceed the threshold or if the target cpu is running an
+ * idle thread.
*/
- do {
- *(volatile struct td_sched **)&ts->ts_assign = tdq->tdq_assigned;
- } while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
- (uintptr_t)ts->ts_assign, (uintptr_t)ts));
- /* Only ipi for realtime/ithd priorities */
- if (ts->ts_thread->td_priority > PRI_MIN_KERN)
+ if (prio > ipi_thresh && td->td_priority < PRI_MIN_IDLE)
return;
/*
- * Without sched_lock we could lose a race where we set NEEDRESCHED
- * on a thread that is switched out before the IPI is delivered. This
- * would lead us to miss the resched. This will be a problem once
- * sched_lock is pushed down.
+ * IPI only if our priority is better than the running thread and
+ * the running thread is not the per cpu idle thread. The
+ * idlethread finds new work via sched_runnable().
*/
- pcpu = pcpu_find(cpu);
- td = pcpu->pc_curthread;
- if (ts->ts_thread->td_priority < td->td_priority) {
+ if (td == pcpu->pc_idlethread)
+ return;
+ if (prio > td->td_priority)
+ return;
+ if (ipi_ast) {
td->td_flags |= TDF_NEEDRESCHED;
ipi_selected(1 << cpu, IPI_AST);
- }
+ } else if (ipi_preempt)
+ ipi_selected(1 << cpu, IPI_PREEMPT);
}
static struct td_sched *
@@ -762,95 +772,134 @@ tdq_steal(struct tdq *tdq, int stealidle)
}
int
-tdq_transfer(struct tdq *tdq, struct td_sched *ts, int class)
+tdq_pickidle(struct tdq *tdq, struct td_sched *ts)
{
- struct tdq_group *ntdg;
struct tdq_group *tdg;
- struct tdq *old;
+ int self;
int cpu;
- int idx;
+ self = PCPU_GET(cpuid);
if (smp_started == 0)
- return (0);
- cpu = 0;
+ return (self);
/*
- * If our load exceeds a certain threshold we should attempt to
- * reassign this thread. The first candidate is the cpu that
- * originally ran the thread. If it is idle, assign it there,
- * otherwise, pick an idle cpu.
- *
- * The threshold at which we start to reassign has a large impact
- * on the overall performance of the system. Tuned too high and
- * some CPUs may idle. Too low and there will be excess migration
- * and context switches.
+ * If the current CPU has idled, just run it here.
*/
- old = TDQ_CPU(ts->ts_cpu);
- ntdg = old->tdq_group;
- tdg = tdq->tdq_group;
- if (tdq_idle) {
- if (tdq_idle & ntdg->tdg_mask) {
- cpu = ffs(ntdg->tdg_idlemask);
- if (cpu) {
- CTR2(KTR_SCHED,
- "tdq_transfer: %p found old cpu %X "
- "in idlemask.", ts, cpu);
- goto migrate;
- }
- }
- /*
- * Multiple cpus could find this bit simultaneously
- * but the race shouldn't be terrible.
- */
- cpu = ffs(tdq_idle);
- if (cpu) {
- CTR2(KTR_SCHED, "tdq_transfer: %p found %X "
- "in idlemask.", ts, cpu);
- goto migrate;
- }
+ if ((tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0)
+ return (self);
+ /*
+ * Try the last group we ran on.
+ */
+ tdg = TDQ_CPU(ts->ts_cpu)->tdq_group;
+ cpu = ffs(tdg->tdg_idlemask);
+ if (cpu)
+ return (cpu - 1);
+ /*
+ * Search for an idle group.
+ */
+ cpu = ffs(tdq_idle);
+ if (cpu)
+ return (cpu - 1);
+ /*
+ * XXX If there are no idle groups, check for an idle core.
+ */
+ /*
+ * No idle CPUs?
+ */
+ return (self);
+}
+
+static int
+tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags)
+{
+ struct pcpu *pcpu;
+ int lowpri;
+ int lowcpu;
+ int lowload;
+ int load;
+ int self;
+ int pri;
+ int cpu;
+
+ self = PCPU_GET(cpuid);
+ if (smp_started == 0)
+ return (self);
+
+ pri = ts->ts_thread->td_priority;
+ /*
+ * Regardless of affinity, if the last cpu is idle send it there.
+ */
+ pcpu = pcpu_find(ts->ts_cpu);
+ if (pcpu->pc_curthread->td_priority > PRI_MIN_IDLE) {
+ CTR5(KTR_SCHED,
+ "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
+ ts->ts_cpu, ts->ts_rltick, ticks, pri,
+ pcpu->pc_curthread->td_priority);
+ return (ts->ts_cpu);
}
- idx = 0;
-#if 0
- if (old->tdq_load < tdq->tdq_load) {
- cpu = ts->ts_cpu + 1;
- CTR2(KTR_SCHED, "tdq_transfer: %p old cpu %X "
- "load less than ours.", ts, cpu);
- goto migrate;
+ /*
+ * If we have affinity, try to place it on the cpu we last ran on.
+ */
+ if (SCHED_AFFINITY(ts) && pcpu->pc_curthread->td_priority > pri) {
+ CTR5(KTR_SCHED,
+ "affinity for %d, ltick %d ticks %d pri %d curthread %d",
+ ts->ts_cpu, ts->ts_rltick, ticks, pri,
+ pcpu->pc_curthread->td_priority);
+ return (ts->ts_cpu);
}
/*
- * No new CPU was found, look for one with less load.
+ * Try ourself first; If we're running something lower priority this
+ * may have some locality with the waking thread and execute faster
+ * here.
*/
- for (idx = 0; idx <= tdg_maxid; idx++) {
- ntdg = TDQ_GROUP(idx);
- if (ntdg->tdg_load /*+ (ntdg->tdg_cpus * 2)*/ < tdg->tdg_load) {
- cpu = ffs(ntdg->tdg_cpumask);
- CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X load less "
- "than ours.", ts, cpu);
- goto migrate;
+ if (tryself) {
+ /*
+ * If we're being awoken by an interrupt thread or the waker
+ * is going right to sleep run here as well.
+ */
+ if ((TDQ_SELF()->tdq_load == 1) && (flags & SRQ_YIELDING ||
+ curthread->td_pri_class == PRI_ITHD)) {
+ CTR2(KTR_SCHED, "tryself load %d flags %d",
+ TDQ_SELF()->tdq_load, flags);
+ return (self);
}
}
-#endif
/*
- * If another cpu in this group has idled, assign a thread over
- * to them after checking to see if there are idled groups.
+ * Look for an idle group.
*/
- if (tdg->tdg_idlemask) {
- cpu = ffs(tdg->tdg_idlemask);
- if (cpu) {
- CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X idle in "
- "group.", ts, cpu);
- goto migrate;
- }
+ CTR1(KTR_SCHED, "tdq_idle %X", tdq_idle);
+ cpu = ffs(tdq_idle);
+ if (cpu)
+ return (cpu - 1);
+ if (tryselfidle && pri < curthread->td_priority) {
+ CTR1(KTR_SCHED, "tryself %d",
+ curthread->td_priority);
+ return (self);
}
- return (0);
-migrate:
/*
- * Now that we've found an idle CPU, migrate the thread.
+ * Now search for the cpu running the lowest priority thread with
+ * the least load.
*/
- cpu--;
- ts->ts_runq = NULL;
- tdq_notify(ts, cpu);
+ lowload = 0;
+ lowpri = lowcpu = 0;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (CPU_ABSENT(cpu))
+ continue;
+ pcpu = pcpu_find(cpu);
+ pri = pcpu->pc_curthread->td_priority;
+ CTR4(KTR_SCHED,
+ "cpu %d pri %d lowcpu %d lowpri %d",
+ cpu, pri, lowcpu, lowpri);
+ if (pri < lowpri)
+ continue;
+ load = TDQ_CPU(cpu)->tdq_load;
+ if (lowpri && lowpri == pri && load > lowload)
+ continue;
+ lowpri = pri;
+ lowcpu = cpu;
+ lowload = load;
+ }
- return (1);
+ return (lowcpu);
}
#endif /* SMP */
@@ -926,7 +975,6 @@ sched_setup(void *dummy)
struct tdq *tdq;
tdq = &tdq_cpu[i];
- tdq->tdq_assigned = NULL;
tdq_setup(&tdq_cpu[i]);
}
if (smp_topology == NULL) {
@@ -1023,6 +1071,9 @@ sched_initticks(void *dummy)
*/
if (tickincr == 0)
tickincr = 1;
+#ifdef SMP
+ affinity = SCHED_AFFINITY_DEFAULT;
+#endif
mtx_unlock_spin(&sched_lock);
}
@@ -1231,16 +1282,10 @@ sched_thread_priority(struct thread *td, u_char prio)
* propagation, we may have to move ourselves to a new
* queue. This could be optimized to not re-add in some
* cases.
- *
- * Hold this td_sched on this cpu so that sched_prio() doesn't
- * cause excessive migration. We only want migration to
- * happen as the result of a wakeup.
*/
- sched_pin_td(td);
sched_rem(td);
td->td_priority = prio;
sched_add(td, SRQ_BORROWING);
- sched_unpin_td(td);
} else
td->td_priority = prio;
}
@@ -1356,9 +1401,11 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
{
struct tdq *tdq;
struct td_sched *ts;
+ int preempt;
mtx_assert(&sched_lock, MA_OWNED);
+ preempt = flags & SW_PREEMPT;
tdq = TDQ_SELF();
ts = td->td_sched;
td->td_lastcpu = td->td_oncpu;
@@ -1371,19 +1418,20 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
*/
if (td == PCPU_GET(idlethread)) {
TD_SET_CAN_RUN(td);
- } else if ((ts->ts_flags & TSF_ASSIGNED) == 0) {
- /* We are ending our run so make our slot available again */
+ } else {
tdq_load_rem(tdq, ts);
if (TD_IS_RUNNING(td)) {
/*
* Don't allow the thread to migrate
* from a preemption.
*/
- sched_pin_td(td);
- setrunqueue(td, (flags & SW_PREEMPT) ?
+ if (preempt)
+ sched_pin_td(td);
+ setrunqueue(td, preempt ?
SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
SRQ_OURSELF|SRQ_YIELDING);
- sched_unpin_td(td);
+ if (preempt)
+ sched_unpin_td(td);
}
}
if (newtd != NULL) {
@@ -1614,7 +1662,7 @@ sched_clock(struct thread *td)
mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
- sched_smp_tick();
+ sched_smp_tick(td);
#endif
tdq = TDQ_SELF();
/*
@@ -1656,9 +1704,6 @@ sched_clock(struct thread *td)
* We're out of time, recompute priorities and requeue.
*/
sched_priority(td);
- tdq_load_rem(tdq, ts);
- ts->ts_slice = sched_slice;
- tdq_load_add(tdq, ts);
td->td_flags |= TDF_NEEDRESCHED;
}
@@ -1672,11 +1717,8 @@ sched_runnable(void)
tdq = TDQ_SELF();
#ifdef SMP
- if (tdq->tdq_assigned) {
- mtx_lock_spin(&sched_lock);
- tdq_assign(tdq);
- mtx_unlock_spin(&sched_lock);
- }
+ if (tdq_busy)
+ goto out;
#endif
if ((curthread->td_flags & TDF_IDLETD) != 0) {
if (tdq->tdq_load > 0)
@@ -1699,8 +1741,6 @@ sched_choose(void)
tdq = TDQ_SELF();
#ifdef SMP
restart:
- if (tdq->tdq_assigned)
- tdq_assign(tdq);
#endif
ts = tdq_choose(tdq);
if (ts) {
@@ -1726,8 +1766,11 @@ sched_add(struct thread *td, int flags)
struct tdq *tdq;
struct td_sched *ts;
int preemptive;
- int canmigrate;
int class;
+#ifdef SMP
+ int cpuid;
+ int cpumask;
+#endif
CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
td, td->td_proc->p_comm, td->td_priority, curthread,
@@ -1737,15 +1780,6 @@ sched_add(struct thread *td, int flags)
ts = td->td_sched;
class = PRI_BASE(td->td_pri_class);
preemptive = !(flags & SRQ_YIELDING);
- canmigrate = 1;
-#ifdef SMP
- if (ts->ts_flags & TSF_ASSIGNED) {
- if (ts->ts_flags & TSF_REMOVED)
- ts->ts_flags &= ~TSF_REMOVED;
- return;
- }
- canmigrate = THREAD_CAN_MIGRATE(td);
-#endif
KASSERT(ts->ts_state != TSS_ONRUNQ,
("sched_add: thread %p (%s) already in run queue", td,
td->td_proc->p_comm));
@@ -1754,42 +1788,38 @@ sched_add(struct thread *td, int flags)
KASSERT(ts->ts_runq == NULL,
("sched_add: thread %p is still assigned to a run queue", td));
/*
- * Set the slice and pick the run queue.
+ * Recalculate the priority before we select the target cpu or
+ * run-queue.
*/
- if (ts->ts_slice == 0)
- ts->ts_slice = sched_slice;
if (class == PRI_TIMESHARE)
sched_priority(td);
- if (td->td_priority <= PRI_MAX_REALTIME) {
- ts->ts_runq = &tdq->tdq_realtime;
- /*
- * If the thread is not artificially pinned and it's in
- * the realtime queue we directly dispatch it on this cpu
- * for minimum latency. Interrupt handlers may also have
- * to complete on the cpu that dispatched them.
- */
- if (td->td_pinned == 0 && class == PRI_ITHD)
- ts->ts_cpu = PCPU_GET(cpuid);
- } else if (td->td_priority <= PRI_MAX_TIMESHARE)
- ts->ts_runq = &tdq->tdq_timeshare;
- else
- ts->ts_runq = &tdq->tdq_idle;
-
#ifdef SMP
+ cpuid = PCPU_GET(cpuid);
/*
- * If this thread is pinned or bound, notify the target cpu.
+ * Pick the destination cpu and if it isn't ours transfer to the
+ * target cpu.
*/
- if (!canmigrate && ts->ts_cpu != PCPU_GET(cpuid) ) {
- ts->ts_runq = NULL;
- tdq_notify(ts, ts->ts_cpu);
- return;
- }
+ if (THREAD_CAN_MIGRATE(td)) {
+ if (td->td_priority <= PRI_MAX_ITHD) {
+ CTR2(KTR_SCHED, "ithd %d < %d", td->td_priority, PRI_MAX_ITHD);
+ ts->ts_cpu = cpuid;
+ }
+ if (pick_pri)
+ ts->ts_cpu = tdq_pickpri(tdq, ts, flags);
+ else
+ ts->ts_cpu = tdq_pickidle(tdq, ts);
+ } else
+ CTR1(KTR_SCHED, "pinned %d", td->td_pinned);
+ if (ts->ts_cpu != cpuid)
+ preemptive = 0;
+ tdq = TDQ_CPU(ts->ts_cpu);
+ cpumask = 1 << ts->ts_cpu;
/*
* If we had been idle, clear our bit in the group and potentially
- * the global bitmap. If not, see if we should transfer this thread.
+ * the global bitmap.
*/
if ((class != PRI_IDLE && class != PRI_ITHD) &&
- (tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) {
+ (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
/*
* Check to see if our group is unidling, and if so, remove it
* from the global idle mask.
@@ -1800,20 +1830,34 @@ sched_add(struct thread *td, int flags)
/*
* Now remove ourselves from the group specific idle mask.
*/
- tdq->tdq_group->tdg_idlemask &= ~PCPU_GET(cpumask);
- } else if (canmigrate && tdq->tdq_load > 1)
- if (tdq_transfer(tdq, ts, class))
- return;
- ts->ts_cpu = PCPU_GET(cpuid);
+ tdq->tdq_group->tdg_idlemask &= ~cpumask;
+ }
#endif
- if (td->td_priority < curthread->td_priority)
- curthread->td_flags |= TDF_NEEDRESCHED;
+ /*
+ * Set the slice and pick the run queue.
+ */
+ if (ts->ts_slice == 0)
+ ts->ts_slice = sched_slice;
+ if (td->td_priority <= PRI_MAX_REALTIME)
+ ts->ts_runq = &tdq->tdq_realtime;
+ else if (td->td_priority <= PRI_MAX_TIMESHARE)
+ ts->ts_runq = &tdq->tdq_timeshare;
+ else
+ ts->ts_runq = &tdq->tdq_idle;
if (preemptive && maybe_preempt(td))
return;
ts->ts_state = TSS_ONRUNQ;
tdq_runq_add(tdq, ts, flags);
tdq_load_add(tdq, ts);
+#ifdef SMP
+ if (ts->ts_cpu != cpuid) {
+ tdq_notify(ts);
+ return;
+ }
+#endif
+ if (td->td_priority < curthread->td_priority)
+ curthread->td_flags |= TDF_NEEDRESCHED;
}
void
@@ -1827,10 +1871,6 @@ sched_rem(struct thread *td)
curthread->td_proc->p_comm);
mtx_assert(&sched_lock, MA_OWNED);
ts = td->td_sched;
- if (ts->ts_flags & TSF_ASSIGNED) {
- ts->ts_flags |= TSF_REMOVED;
- return;
- }
KASSERT((ts->ts_state == TSS_ONRUNQ),
("sched_rem: thread not on run queue"));
@@ -1881,8 +1921,6 @@ sched_bind(struct thread *td, int cpu)
return;
/* sched_rem without the runq_remove */
ts->ts_state = TSS_THREAD;
- tdq_load_rem(TDQ_CPU(ts->ts_cpu), ts);
- tdq_notify(ts, cpu);
/* When we return from mi_switch we'll be on the correct cpu. */
mi_switch(SW_VOL, NULL);
sched_pin();
@@ -1962,7 +2000,22 @@ SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &sched_rebalance, 0, "");
+#ifdef SMP
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_affinity, CTLFLAG_RW,
+ &affinity, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryself, CTLFLAG_RW,
+ &tryself, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryselfidle, CTLFLAG_RW,
+ &tryselfidle, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, ipi_preempt, CTLFLAG_RW, &ipi_preempt, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, ipi_ast, CTLFLAG_RW, &ipi_ast, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0, "");
+#endif
/* ps compat */
static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
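Since the new tunables above are registered with SYSCTL_INT() under #ifdef
SMP, they can be inspected and changed from userland while experimenting, as
the commit message suggests. Below is a minimal sketch using sysctlbyname(3);
the OID name comes from the hunk above, everything else is illustrative, and
the write requires root on an SMP kernel running ULE with this change.

/* Read and toggle one of the experimental ULE tunables from userland. */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int val, newval;
	size_t len = sizeof(val);

	/* kern.sched.steal_busy is one of the SYSCTL_INT()s added above. */
	if (sysctlbyname("kern.sched.steal_busy", &val, &len, NULL, 0) == -1)
		err(1, "sysctlbyname(read)");
	printf("kern.sched.steal_busy = %d\n", val);

	newval = !val;		/* flip it for the next benchmark run */
	if (sysctlbyname("kern.sched.steal_busy", NULL, NULL,
	    &newval, sizeof(newval)) == -1)
		err(1, "sysctlbyname(write)");
	return (0);
}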