path: root/sys/kern
author:    jeff <jeff@FreeBSD.org>    2007-01-04 08:56:25 +0000
committer: jeff <jeff@FreeBSD.org>    2007-01-04 08:56:25 +0000
commit:    2c3282f28a13d8e226a5704915a0d5269af392d1 (patch)
tree:      23f084ec8ed5e3eb4d0371f701c5dc06e92b9ef4 /sys/kern
parent:    78c3275ce133e0cbb8114344ee970ae68e421a9e (diff)
ULE 2.0:
- Remove the double queue mechanism for timeshare threads.  It was slow due
  to excess cache lines in play, caused suboptimal scheduling behavior with
  niced and other non-interactive processes, complicated priority lending,
  etc.
- Use a circular queue with a floating starting index for timeshare threads.
  This enforces fairness by moving the insertion point closer to threads with
  worse priorities over time.
- Give interactive timeshare threads real-time user-space priorities and
  place them on the realtime/ithd queue.
- Select non-interactive timeshare thread priorities based on their cpu
  utilization over the last 10 seconds combined with the nice value.  This
  gives us more sane priorities and behavior in a loaded system as compared
  to the old method of using the interactivity score.  The interactivity
  score quickly hit a ceiling if threads were non-interactive and penalized
  new hog threads.
- Use one slice size for all threads.  The slice is not currently dynamically
  set to adjust the scheduling behavior of different threads.
- Add some new sysctls for scheduling parameters.

Bug fixes/Clean up:
- Fix zeroing of td_sched after initialization in sched_fork_thread() caused
  by the recent ksegrp removal.
- Fix KSE interactivity issues related to frequent forking and exiting of kse
  threads.  We simply disable the penalty for thread creation and exit for
  kse threads.
- Clean up the cpu estimator by using tickincr here as well.  Keep ticks and
  ltick/ftick in the same frequency.  Previously ticks were stathz and the
  others were hz.
- Lots of new and updated comments.
- Many, many others.

Tested on:	up x86/amd64, 8way amd64
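As a reading aid, the floating-index timeshare queue described above can be
illustrated with a small stand-alone sketch.  This is a hypothetical
user-space mock-up, not code from this commit: the constants stand in for the
kernel's PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE and RQ_NQS, and it only mirrors
the insertion-index arithmetic that tdq_runq_add() introduces in the diff
below (TS_RQ_PPQ plus the tdq_idx rotation).

/*
 * Hypothetical user-space sketch of the floating-index timeshare queue.
 * The constants are assumed stand-ins; only the index arithmetic is drawn
 * from the commit.
 */
#include <stdio.h>

#define PRI_MIN_TS	160		/* assumed stand-in value */
#define PRI_MAX_TS	223		/* assumed stand-in value */
#define RQ_NQS		64		/* assumed number of run queues */
#define TS_RQ_PPQ	(((PRI_MAX_TS - PRI_MIN_TS) + 1) / RQ_NQS)

/*
 * Map a timeshare priority onto a queue index relative to the floating
 * removal index.  Worse (numerically larger) priorities land farther from
 * the current index; as the index advances, an already-queued thread drifts
 * toward the removal point and must eventually run.
 */
static int
ts_queue_index(int pri, int tdq_idx)
{
	int qidx;

	qidx = (pri - PRI_MIN_TS) / TS_RQ_PPQ;	/* scale into RQ_NQS buckets */
	return ((qidx + tdq_idx) % RQ_NQS);	/* rotate by the floating index */
}

int
main(void)
{
	int idx;

	/* Show how the insertion bucket rotates with the floating index. */
	for (idx = 0; idx < 4; idx++)
		printf("pri %d, tdq_idx %d -> queue %d\n",
		    PRI_MIN_TS + 32, idx, ts_queue_index(PRI_MIN_TS + 32, idx));
	return (0);
}

Because the removal index only moves forward, a thread inserted far from
tdq_idx cannot be passed over indefinitely; each advance brings its bucket
one step closer to the head of the queue.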
Diffstat (limited to 'sys/kern')
-rw-r--r--    sys/kern/sched_ule.c    740
1 file changed, 330 insertions(+), 410 deletions(-)
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 40a3ad8..1a26e59 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2002-2006, Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -60,31 +60,6 @@ __FBSDID("$FreeBSD$");
#include <machine/cpu.h>
#include <machine/smp.h>
-/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
-/* XXX This is bogus compatability crap for ps */
-static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
-SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
-
-static void sched_setup(void *dummy);
-SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
-
-static void sched_initticks(void *dummy);
-SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
-
-static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
-
-SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
- "Scheduler name");
-
-static int slice_min = 1;
-SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
-
-static int slice_max = 10;
-SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
-
-int realstathz;
-int tickincr = 1 << 10;
-
/*
* Thread scheduler specific section.
*/
@@ -95,7 +70,7 @@ struct td_sched {
fixpt_t ts_pctcpu; /* (j) %cpu during p_swtime. */
u_char ts_rqindex; /* (j) Run queue index. */
enum {
- TSS_THREAD = 0x0, /* slaved to thread state */
+ TSS_THREAD,
TSS_ONRUNQ
} ts_state; /* (j) thread sched specific status. */
int ts_slptime;
@@ -119,31 +94,53 @@ struct td_sched {
#define TSF_HOLD 0x0008 /* Thread is temporarily bound. */
#define TSF_REMOVED 0x0010 /* Thread was removed while ASSIGNED */
#define TSF_INTERNAL 0x0020 /* Thread added due to migration. */
-#define TSF_PREEMPTED 0x0040 /* Thread was preempted */
#define TSF_DIDRUN 0x2000 /* Thread actually ran. */
#define TSF_EXIT 0x4000 /* Thread is being killed. */
static struct td_sched td_sched0;
/*
- * The priority is primarily determined by the interactivity score. Thus, we
- * give lower(better) priorities to threads that use less CPU. The nice
- * value is then directly added to this to allow nice to have some effect
- * on latency.
+ * Cpu percentage computation macros and defines.
+ *
+ * SCHED_TICK_SECS: Number of seconds to average the cpu usage across.
+ * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across.
+ * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results.
+ * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count.
+ * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks.
+ */
+#define SCHED_TICK_SECS 10
+#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS)
+#define SCHED_TICK_SHIFT 10
+#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
+#define SCHED_TICK_TOTAL(ts) ((ts)->ts_ltick - (ts)->ts_ftick)
+
+/*
+ * These macros determine priorities for non-interactive threads. They are
+ * assigned a priority based on their recent cpu utilization as expressed
+ * by the ratio of ticks to the tick total. NHALF priorities at the start
+ * and end of the MIN to MAX timeshare range are only reachable with negative
+ * or positive nice respectively.
*
- * PRI_RANGE: Total priority range for timeshare threads.
+ * PRI_RANGE: Priority range for utilization dependent priorities.
* PRI_NRESV: Number of nice values.
- * PRI_BASE: The start of the dynamic range.
+ * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total.
+ * PRI_NICE: Determines the part of the priority inherited from nice.
*/
-#define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
-#define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1)
+#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN)
#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
-#define SCHED_PRI_BASE (PRI_MIN_TIMESHARE)
-#define SCHED_PRI_INTERACT(score) \
- ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
+#define SCHED_PRI_MIN (PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
+#define SCHED_PRI_MAX (PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
+#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
+#define SCHED_PRI_TICKS(ts) \
+ (SCHED_TICK_HZ((ts)) / \
+ (max(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
+#define SCHED_PRI_NICE(nice) (nice)
/*
- * These determine the interactivity of a process.
+ * These determine the interactivity of a process. Interactivity differs from
+ * cpu utilization in that it expresses the voluntary time slept vs time ran
+ * while cpu utilization includes all time not running. This more accurately
+ * models the intent of the thread.
*
* SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
* before throttling back.
@@ -151,64 +148,34 @@ static struct td_sched td_sched0;
* INTERACT_MAX: Maximum interactivity value. Smaller is better.
* INTERACT_THRESH: Threshhold for placement on the current runq.
*/
-#define SCHED_SLP_RUN_MAX ((hz * 5) << 10)
-#define SCHED_SLP_RUN_FORK ((hz / 2) << 10)
+#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT)
+#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT)
#define SCHED_INTERACT_MAX (100)
#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
#define SCHED_INTERACT_THRESH (30)
/*
- * These parameters and macros determine the size of the time slice that is
- * granted to each thread.
- *
- * SLICE_MIN: Minimum time slice granted, in units of ticks.
- * SLICE_MAX: Maximum time slice granted.
- * SLICE_RANGE: Range of available time slices scaled by hz.
- * SLICE_SCALE: The number slices granted per val in the range of [0, max].
- * SLICE_NICE: Determine the amount of slice granted to a scaled nice.
- * SLICE_NTHRESH: The nice cutoff point for slice assignment.
+ * tickincr: Converts a stathz tick into a hz domain scaled by
+ * the shift factor. Without the shift the error rate
+ * due to rounding would be unacceptably high.
+ * realstathz: stathz is sometimes 0 and run off of hz.
+ * sched_slice: Runtime of each thread before rescheduling.
*/
-#define SCHED_SLICE_MIN (slice_min)
-#define SCHED_SLICE_MAX (slice_max)
-#define SCHED_SLICE_INTERACTIVE (slice_max)
-#define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1)
-#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
-#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
-#define SCHED_SLICE_NICE(nice) \
- (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
-
-/*
- * This macro determines whether or not the thread belongs on the current or
- * next run queue.
- */
-#define SCHED_INTERACTIVE(td) \
- (sched_interact_score(td) < SCHED_INTERACT_THRESH)
-#define SCHED_CURR(td, ts) \
- ((ts->ts_thread->td_flags & TDF_BORROWING) || \
- (ts->ts_flags & TSF_PREEMPTED) || SCHED_INTERACTIVE(td))
-
-/*
- * Cpu percentage computation macros and defines.
- *
- * SCHED_CPU_TIME: Number of seconds to average the cpu usage across.
- * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across.
- */
-
-#define SCHED_CPU_TIME 10
-#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)
+static int sched_interact = SCHED_INTERACT_THRESH;
+static int realstathz;
+static int tickincr;
+static int sched_slice;
/*
* tdq - per processor runqs and statistics.
*/
struct tdq {
struct runq tdq_idle; /* Queue of IDLE threads. */
- struct runq tdq_timeshare[2]; /* Run queues for !IDLE. */
- struct runq *tdq_next; /* Next timeshare queue. */
- struct runq *tdq_curr; /* Current queue. */
+ struct runq tdq_timeshare; /* timeshare run queue. */
+ struct runq tdq_realtime; /* real-time run queue. */
+ int tdq_idx; /* Current removal index. */
int tdq_load_timeshare; /* Load for timeshare. */
int tdq_load; /* Aggregate load. */
- short tdq_nice[SCHED_PRI_NRESV]; /* threadss in each nice bin. */
- short tdq_nicemin; /* Least nice. */
#ifdef SMP
int tdq_transferable;
LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */
@@ -262,8 +229,7 @@ static struct tdq tdq_cpu;
#define TDQ_CPU(x) (&tdq_cpu)
#endif
-static struct td_sched *sched_choose(void); /* XXX Should be thread * */
-static void sched_slice(struct td_sched *);
+static struct td_sched *sched_choose(void); /* XXX Should be thread * */
static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
@@ -278,9 +244,8 @@ static void tdq_load_add(struct tdq *, struct td_sched *);
static void tdq_load_rem(struct tdq *, struct td_sched *);
static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
-static void tdq_nice_add(struct tdq *, int);
-static void tdq_nice_rem(struct tdq *, int);
void tdq_print(int cpu);
+static void runq_print(struct runq *rq);
#ifdef SMP
static int tdq_transfer(struct tdq *, struct td_sched *, int);
static struct td_sched *runq_steal(struct runq *);
@@ -293,45 +258,91 @@ static int tdq_idled(struct tdq *);
static void tdq_notify(struct td_sched *, int);
static void tdq_assign(struct tdq *);
static struct td_sched *tdq_steal(struct tdq *, int);
-#define THREAD_CAN_MIGRATE(ts) \
- ((ts)->ts_thread->td_pinned == 0 && ((ts)->ts_flags & TSF_BOUND) == 0)
+#define THREAD_CAN_MIGRATE(td) \
+ ((td)->td_pinned == 0 && (td)->td_pri_class != PRI_ITHD)
#endif
+static void sched_setup(void *dummy);
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
+
+static void sched_initticks(void *dummy);
+SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
+
+static void
+runq_print(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct td_sched *ts;
+ int pri;
+ int j;
+ int i;
+
+ for (i = 0; i < RQB_LEN; i++) {
+ printf("\t\trunq bits %d 0x%zx\n",
+ i, rq->rq_status.rqb_bits[i]);
+ for (j = 0; j < RQB_BPW; j++)
+ if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
+ pri = j + (i << RQB_L2BPW);
+ rqh = &rq->rq_queues[pri];
+ TAILQ_FOREACH(ts, rqh, ts_procq) {
+ printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
+ ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
+ }
+ }
+ }
+}
+
void
tdq_print(int cpu)
{
struct tdq *tdq;
- int i;
tdq = TDQ_CPU(cpu);
printf("tdq:\n");
printf("\tload: %d\n", tdq->tdq_load);
printf("\tload TIMESHARE: %d\n", tdq->tdq_load_timeshare);
+ printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
+ printf("\trealtime runq:\n");
+ runq_print(&tdq->tdq_realtime);
+ printf("\ttimeshare runq:\n");
+ runq_print(&tdq->tdq_timeshare);
+ printf("\tidle runq:\n");
+ runq_print(&tdq->tdq_idle);
#ifdef SMP
printf("\tload transferable: %d\n", tdq->tdq_transferable);
#endif
- printf("\tnicemin:\t%d\n", tdq->tdq_nicemin);
- printf("\tnice counts:\n");
- for (i = 0; i < SCHED_PRI_NRESV; i++)
- if (tdq->tdq_nice[i])
- printf("\t\t%d = %d\n",
- i - SCHED_PRI_NHALF, tdq->tdq_nice[i]);
}
static __inline void
tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
{
#ifdef SMP
- if (THREAD_CAN_MIGRATE(ts)) {
+ if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
tdq->tdq_transferable++;
tdq->tdq_group->tdg_transferable++;
ts->ts_flags |= TSF_XFERABLE;
}
#endif
- if (ts->ts_flags & TSF_PREEMPTED)
- flags |= SRQ_PREEMPTED;
- runq_add(ts->ts_runq, ts, flags);
+ if (ts->ts_runq == &tdq->tdq_timeshare) {
+ int pri;
+
+ pri = ts->ts_thread->td_priority;
+ KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
+ ("Invalid priority %d on timeshare runq", pri));
+ /*
+ * This queue contains only priorities between MIN and MAX
+ * realtime. Use the whole queue to represent these values.
+ */
+#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
+ if ((flags & SRQ_BORROWING) == 0) {
+ pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
+ pri = (pri + tdq->tdq_idx) % RQ_NQS;
+ } else
+ pri = tdq->tdq_idx;
+ runq_add_pri(ts->ts_runq, ts, pri, flags);
+ } else
+ runq_add(ts->ts_runq, ts, flags);
}
static __inline void
@@ -344,7 +355,10 @@ tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
ts->ts_flags &= ~TSF_XFERABLE;
}
#endif
- runq_remove(ts->ts_runq, ts);
+ if (ts->ts_runq == &tdq->tdq_timeshare)
+ runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_idx);
+ else
+ runq_remove(ts->ts_runq, ts);
}
static void
@@ -363,8 +377,6 @@ tdq_load_add(struct tdq *tdq, struct td_sched *ts)
#else
tdq->tdq_sysload++;
#endif
- if (ts->ts_thread->td_pri_class == PRI_TIMESHARE)
- tdq_nice_add(tdq, ts->ts_thread->td_proc->p_nice);
}
static void
@@ -384,46 +396,6 @@ tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
tdq->tdq_load--;
CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
ts->ts_runq = NULL;
- if (ts->ts_thread->td_pri_class == PRI_TIMESHARE)
- tdq_nice_rem(tdq, ts->ts_thread->td_proc->p_nice);
-}
-
-static void
-tdq_nice_add(struct tdq *tdq, int nice)
-{
- mtx_assert(&sched_lock, MA_OWNED);
- /* Normalize to zero. */
- tdq->tdq_nice[nice + SCHED_PRI_NHALF]++;
- if (nice < tdq->tdq_nicemin || tdq->tdq_load_timeshare == 1)
- tdq->tdq_nicemin = nice;
-}
-
-static void
-tdq_nice_rem(struct tdq *tdq, int nice)
-{
- int n;
-
- mtx_assert(&sched_lock, MA_OWNED);
- /* Normalize to zero. */
- n = nice + SCHED_PRI_NHALF;
- tdq->tdq_nice[n]--;
- KASSERT(tdq->tdq_nice[n] >= 0, ("Negative nice count."));
-
- /*
- * If this wasn't the smallest nice value or there are more in
- * this bucket we can just return. Otherwise we have to recalculate
- * the smallest nice.
- */
- if (nice != tdq->tdq_nicemin ||
- tdq->tdq_nice[n] != 0 ||
- tdq->tdq_load_timeshare == 0)
- return;
-
- for (; n < SCHED_PRI_NRESV; n++)
- if (tdq->tdq_nice[n]) {
- tdq->tdq_nicemin = n - SCHED_PRI_NHALF;
- return;
- }
}
#ifdef SMP
@@ -658,10 +630,9 @@ tdq_notify(struct td_sched *ts, int cpu)
int prio;
tdq = TDQ_CPU(cpu);
- /* XXX */
class = PRI_BASE(ts->ts_thread->td_pri_class);
- if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
- (tdq_idle & tdq->tdq_group->tdg_mask))
+ if ((class != PRI_IDLE && class != PRI_ITHD)
+ && (tdq_idle & tdq->tdq_group->tdg_mask))
atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
tdq->tdq_group->tdg_load++;
tdq->tdq_load++;
@@ -710,7 +681,7 @@ runq_steal(struct runq *rq)
continue;
rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
TAILQ_FOREACH(ts, rqh, ts_procq) {
- if (THREAD_CAN_MIGRATE(ts))
+ if (THREAD_CAN_MIGRATE(ts->ts_thread))
return (ts);
}
}
@@ -726,10 +697,11 @@ tdq_steal(struct tdq *tdq, int stealidle)
/*
* Steal from next first to try to get a non-interactive task that
* may not have run for a while.
+ * XXX Need to effect steal order for timeshare threads.
*/
- if ((ts = runq_steal(tdq->tdq_next)) != NULL)
+ if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
return (ts);
- if ((ts = runq_steal(tdq->tdq_curr)) != NULL)
+ if ((ts = runq_steal(&tdq->tdq_timeshare)) != NULL)
return (ts);
if (stealidle)
return (runq_steal(&tdq->tdq_idle));
@@ -837,56 +809,43 @@ migrate:
static struct td_sched *
tdq_choose(struct tdq *tdq)
{
- struct runq *swap;
struct td_sched *ts;
- int nice;
mtx_assert(&sched_lock, MA_OWNED);
- swap = NULL;
- for (;;) {
- ts = runq_choose(tdq->tdq_curr);
- if (ts == NULL) {
- /*
- * We already swapped once and didn't get anywhere.
- */
- if (swap)
- break;
- swap = tdq->tdq_curr;
- tdq->tdq_curr = tdq->tdq_next;
- tdq->tdq_next = swap;
- continue;
- }
- /*
- * If we encounter a slice of 0 the td_sched is in a
- * TIMESHARE td_sched group and its nice was too far out
- * of the range that receives slices.
- */
- nice = ts->ts_thread->td_proc->p_nice + (0 - tdq->tdq_nicemin);
-#if 0
- if (ts->ts_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
- ts->ts_thread->td_proc->p_nice != 0)) {
- runq_remove(ts->ts_runq, ts);
- sched_slice(ts);
- ts->ts_runq = tdq->tdq_next;
- runq_add(ts->ts_runq, ts, 0);
- continue;
- }
-#endif
+ ts = runq_choose(&tdq->tdq_realtime);
+ if (ts != NULL) {
+ KASSERT(ts->ts_thread->td_priority <= PRI_MAX_REALTIME,
+ ("tdq_choose: Invalid priority on realtime queue %d",
+ ts->ts_thread->td_priority));
+ return (ts);
+ }
+ ts = runq_choose_from(&tdq->tdq_timeshare, &tdq->tdq_idx);
+ if (ts != NULL) {
+ KASSERT(ts->ts_thread->td_priority <= PRI_MAX_TIMESHARE &&
+ ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
+ ("tdq_choose: Invalid priority on timeshare queue %d",
+ ts->ts_thread->td_priority));
+ return (ts);
+ }
+
+ ts = runq_choose(&tdq->tdq_idle);
+ if (ts != NULL) {
+ KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
+ ("tdq_choose: Invalid priority on idle queue %d",
+ ts->ts_thread->td_priority));
return (ts);
}
- return (runq_choose(&tdq->tdq_idle));
+ return (NULL);
}
static void
tdq_setup(struct tdq *tdq)
{
- runq_init(&tdq->tdq_timeshare[0]);
- runq_init(&tdq->tdq_timeshare[1]);
+ runq_init(&tdq->tdq_realtime);
+ runq_init(&tdq->tdq_timeshare);
runq_init(&tdq->tdq_idle);
- tdq->tdq_curr = &tdq->tdq_timeshare[0];
- tdq->tdq_next = &tdq->tdq_timeshare[1];
tdq->tdq_load = 0;
tdq->tdq_load_timeshare = 0;
}
@@ -903,8 +862,8 @@ sched_setup(void *dummy)
* in case which sched_clock() called before sched_initticks().
*/
realstathz = hz;
- slice_min = (hz/100); /* 10ms */
- slice_max = (hz/7); /* ~140ms */
+ sched_slice = (realstathz/7); /* 140ms */
+ tickincr = 1 << SCHED_TICK_SHIFT;
#ifdef SMP
balance_groups = 0;
@@ -999,13 +958,16 @@ sched_initticks(void *dummy)
{
mtx_lock_spin(&sched_lock);
realstathz = stathz ? stathz : hz;
- slice_min = (realstathz/100); /* 10ms */
- slice_max = (realstathz/7); /* ~140ms */
+ sched_slice = (realstathz/7); /* ~140ms */
- tickincr = (hz << 10) / realstathz;
/*
- * XXX This does not work for values of stathz that are much
- * larger than hz.
+ * tickincr is shifted out by 10 to avoid rounding errors due to
+ * hz not being evenly divisible by stathz.
+ */
+ tickincr = (hz << SCHED_TICK_SHIFT) / realstathz;
+ /*
+ * This does not work for values of stathz that are more than
+ * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen.
*/
if (tickincr == 0)
tickincr = 1;
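As a rough check of the arithmetic in this hunk, here is a hypothetical
user-space sketch of the tickincr conversion; the hz and stathz values are
assumed examples, not taken from this commit.

/*
 * Hypothetical sketch of the tickincr calculation above.  The shift keeps
 * precision when hz is not evenly divisible by stathz: one stathz tick is
 * credited as tickincr units in the shifted hz domain.
 */
#include <stdio.h>

#define SCHED_TICK_SHIFT	10

int
main(void)
{
	int hz = 1000;			/* assumed example */
	int stathz = 128;		/* assumed example */
	int realstathz = stathz ? stathz : hz;
	int tickincr = (hz << SCHED_TICK_SHIFT) / realstathz;

	if (tickincr == 0)		/* guard for very large stathz */
		tickincr = 1;
	printf("tickincr = %d (%.4f hz ticks per stathz tick)\n",
	    tickincr, (double)tickincr / (1 << SCHED_TICK_SHIFT));
	return (0);
}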
@@ -1020,81 +982,36 @@ sched_initticks(void *dummy)
static void
sched_priority(struct thread *td)
{
+ int score;
int pri;
if (td->td_pri_class != PRI_TIMESHARE)
return;
-
- pri = SCHED_PRI_INTERACT(sched_interact_score(td));
- pri += SCHED_PRI_BASE;
- pri += td->td_proc->p_nice;
-
- if (pri > PRI_MAX_TIMESHARE)
- pri = PRI_MAX_TIMESHARE;
- else if (pri < PRI_MIN_TIMESHARE)
- pri = PRI_MIN_TIMESHARE;
-
- sched_user_prio(td, pri);
-
- return;
-}
-
-/*
- * Calculate a time slice based on the properties of the process
- * and the runq that we're on. This is only for PRI_TIMESHARE threads.
- */
-static void
-sched_slice(struct td_sched *ts)
-{
- struct tdq *tdq;
- struct thread *td;
-
- td = ts->ts_thread;
- tdq = TDQ_CPU(ts->ts_cpu);
-
- if (td->td_flags & TDF_BORROWING) {
- ts->ts_slice = SCHED_SLICE_MIN;
- return;
- }
-
/*
- * Rationale:
- * Threads in interactive procs get a minimal slice so that we
- * quickly notice if it abuses its advantage.
- *
- * Threads in non-interactive procs are assigned a slice that is
- * based on the procs nice value relative to the least nice procs
- * on the run queue for this cpu.
- *
- * If the thread is less nice than all others it gets the maximum
- * slice and other threads will adjust their slice relative to
- * this when they first expire.
+ * If the score is interactive we place the thread in the realtime
+ * queue with a priority that is less than kernel and interrupt
+ * priorities. These threads are not subject to nice restrictions.
*
- * There is 20 point window that starts relative to the least
- * nice td_sched on the run queue. Slice size is determined by
- * the td_sched distance from the last nice thread.
- *
- * If the td_sched is outside of the window it will get no slice
- * and will be reevaluated each time it is selected on the
- * run queue. The exception to this is nice 0 procs when
- * a nice -20 is running. They are always granted a minimum
- * slice.
+ * Scores greater than this are placed on the normal realtime queue
+ * where the priority is partially decided by the most recent cpu
+ * utilization and the rest is decided by nice value.
*/
- if (!SCHED_INTERACTIVE(td)) {
- int nice;
-
- nice = td->td_proc->p_nice + (0 - tdq->tdq_nicemin);
- if (tdq->tdq_load_timeshare == 0 ||
- td->td_proc->p_nice < tdq->tdq_nicemin)
- ts->ts_slice = SCHED_SLICE_MAX;
- else if (nice <= SCHED_SLICE_NTHRESH)
- ts->ts_slice = SCHED_SLICE_NICE(nice);
- else if (td->td_proc->p_nice == 0)
- ts->ts_slice = SCHED_SLICE_MIN;
- else
- ts->ts_slice = SCHED_SLICE_MIN; /* 0 */
- } else
- ts->ts_slice = SCHED_SLICE_INTERACTIVE;
+ score = sched_interact_score(td);
+ if (score < sched_interact) {
+ pri = PRI_MIN_REALTIME;
+ pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
+ * score;
+ KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
+ ("sched_priority: invalid interactive priority %d", pri));
+ } else {
+ pri = SCHED_PRI_MIN;
+ if (td->td_sched->ts_ticks)
+ pri += SCHED_PRI_TICKS(td->td_sched);
+ pri += SCHED_PRI_NICE(td->td_proc->p_nice);
+ KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
+ ("sched_priority: invalid priority %d", pri));
+ }
+ sched_user_prio(td, pri);
return;
}
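A minimal stand-alone sketch of the non-interactive branch above, assuming
example values for the priority-range constants; it only reproduces the
SCHED_PRI_TICKS and SCHED_PRI_NICE arithmetic for a hypothetical thread.

/*
 * Hypothetical sketch of the utilization-based priority computed above for
 * non-interactive timeshare threads.  The range constants are assumed
 * stand-ins; ts_ticks is kept in the shifted hz domain as in the commit.
 */
#include <stdio.h>

#define SCHED_TICK_SHIFT	10
#define PRI_MIN_UTIL		180	/* assumed stand-in for SCHED_PRI_MIN */
#define PRI_RANGE_UTIL		24	/* assumed stand-in for SCHED_PRI_RANGE */

static int
util_priority(int ts_ticks, int ts_ltick, int ts_ftick, int nice)
{
	int tick_hz = ts_ticks >> SCHED_TICK_SHIFT;	/* SCHED_TICK_HZ */
	int total = ts_ltick - ts_ftick;		/* SCHED_TICK_TOTAL */
	int pri;

	if (total < PRI_RANGE_UTIL)			/* max() in the macro */
		total = PRI_RANGE_UTIL;
	/* SCHED_PRI_TICKS: scale recent cpu use into the priority band. */
	pri = PRI_MIN_UTIL + tick_hz / (total / PRI_RANGE_UTIL);
	/* SCHED_PRI_NICE: nice is added directly. */
	return (pri + nice);
}

int
main(void)
{
	int hz = 1000;		/* assumed example */

	/* A thread that ran ~50% of a 10 second window, at nice 0 and +5. */
	printf("nice 0: pri %d\n",
	    util_priority((5 * hz) << SCHED_TICK_SHIFT, 10 * hz, 0, 0));
	printf("nice 5: pri %d\n",
	    util_priority((5 * hz) << SCHED_TICK_SHIFT, 10 * hz, 0, 5));
	return (0);
}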
@@ -1163,19 +1080,19 @@ sched_interact_score(struct thread *td)
}
/*
- * Very early in the boot some setup of scheduler-specific
- * parts of proc0 and of soem scheduler resources needs to be done.
- * Called from:
- * proc0_init()
+ * Called from proc0_init() to bootstrap the scheduler.
*/
void
schedinit(void)
{
+
/*
* Set up the scheduler specific parts of proc0.
*/
proc0.p_sched = NULL; /* XXX */
thread0.td_sched = &td_sched0;
+ td_sched0.ts_ltick = ticks;
+ td_sched0.ts_ftick = ticks - 1;
td_sched0.ts_thread = &thread0;
td_sched0.ts_state = TSS_THREAD;
}
@@ -1183,36 +1100,35 @@ schedinit(void)
/*
* This is only somewhat accurate since given many processes of the same
* priority they will switch when their slices run out, which will be
- * at most SCHED_SLICE_MAX.
+ * at most sched_slice stathz ticks.
*/
int
sched_rr_interval(void)
{
- return (SCHED_SLICE_MAX);
+
+ /* Convert sched_slice to hz */
+ return (hz/(realstathz/sched_slice));
}
static void
sched_pctcpu_update(struct td_sched *ts)
{
+
+ if (ts->ts_ticks == 0)
+ return;
/*
* Adjust counters and watermark for pctcpu calc.
*/
- if (ts->ts_ltick > ticks - SCHED_CPU_TICKS) {
- /*
- * Shift the tick count out so that the divide doesn't
- * round away our results.
- */
- ts->ts_ticks <<= 10;
+ if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
- SCHED_CPU_TICKS;
- ts->ts_ticks >>= 10;
- } else
+ SCHED_TICK_TARG;
+ else
ts->ts_ticks = 0;
ts->ts_ltick = ticks;
- ts->ts_ftick = ts->ts_ltick - SCHED_CPU_TICKS;
+ ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
}
-void
+static void
sched_thread_priority(struct thread *td, u_char prio)
{
struct td_sched *ts;
@@ -1224,27 +1140,22 @@ sched_thread_priority(struct thread *td, u_char prio)
mtx_assert(&sched_lock, MA_OWNED);
if (td->td_priority == prio)
return;
+
if (TD_ON_RUNQ(td)) {
/*
* If the priority has been elevated due to priority
* propagation, we may have to move ourselves to a new
- * queue. We still call adjustrunqueue below in case kse
- * needs to fix things up.
- */
- if (prio < td->td_priority && ts->ts_runq != NULL &&
- (ts->ts_flags & TSF_ASSIGNED) == 0 &&
- ts->ts_runq != TDQ_CPU(ts->ts_cpu)->tdq_curr) {
- runq_remove(ts->ts_runq, ts);
- ts->ts_runq = TDQ_CPU(ts->ts_cpu)->tdq_curr;
- runq_add(ts->ts_runq, ts, 0);
- }
- /*
+ * queue. This could be optimized to not re-add in some
+ * cases.
+ *
* Hold this td_sched on this cpu so that sched_prio() doesn't
* cause excessive migration. We only want migration to
* happen as the result of a wakeup.
*/
ts->ts_flags |= TSF_HOLD;
- adjustrunqueue(td, prio);
+ sched_rem(td);
+ td->td_priority = prio;
+ sched_add(td, SRQ_BORROWING);
ts->ts_flags &= ~TSF_HOLD;
} else
td->td_priority = prio;
@@ -1364,14 +1275,12 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
mtx_assert(&sched_lock, MA_OWNED);
- ts = td->td_sched;
tdq = TDQ_SELF();
-
+ ts = td->td_sched;
td->td_lastcpu = td->td_oncpu;
td->td_oncpu = NOCPU;
td->td_flags &= ~TDF_NEEDRESCHED;
td->td_owepreempt = 0;
-
/*
* If the thread has been assigned it may be in the process of switching
* to the new cpu. This is the case in sched_bind().
@@ -1399,7 +1308,6 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
* added to the run queue and then chosen.
*/
newtd->td_sched->ts_flags |= TSF_DIDRUN;
- newtd->td_sched->ts_runq = tdq->tdq_curr;
TD_SET_RUNNING(newtd);
tdq_load_add(TDQ_SELF(), newtd->td_sched);
} else
@@ -1416,44 +1324,29 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
}
-
sched_lock.mtx_lock = (uintptr_t)td;
-
td->td_oncpu = PCPU_GET(cpuid);
}
void
sched_nice(struct proc *p, int nice)
{
- struct td_sched *ts;
struct thread *td;
- struct tdq *tdq;
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&sched_lock, MA_OWNED);
- /*
- * We need to adjust the nice counts for running threads.
- */
- FOREACH_THREAD_IN_PROC(p, td) {
- if (td->td_pri_class == PRI_TIMESHARE) {
- ts = td->td_sched;
- if (ts->ts_runq == NULL)
- continue;
- tdq = TDQ_CPU(ts->ts_cpu);
- tdq_nice_rem(tdq, p->p_nice);
- tdq_nice_add(tdq, nice);
- }
- }
+
p->p_nice = nice;
FOREACH_THREAD_IN_PROC(p, td) {
sched_priority(td);
- td->td_flags |= TDF_NEEDRESCHED;
+ sched_prio(td, td->td_base_user_pri);
}
}
void
sched_sleep(struct thread *td)
{
+
mtx_assert(&sched_lock, MA_OWNED);
td->td_sched->ts_slptime = ticks;
@@ -1462,16 +1355,20 @@ sched_sleep(struct thread *td)
void
sched_wakeup(struct thread *td)
{
+ int slptime;
+
mtx_assert(&sched_lock, MA_OWNED);
/*
- * Let the procs know how long we slept for. This is because process
- * interactivity behavior is modeled in the procs.
+ * If we slept for more than a tick update our interactivity and
+ * priority.
*/
- if (td->td_sched->ts_slptime) {
+ slptime = td->td_sched->ts_slptime;
+ td->td_sched->ts_slptime = 0;
+ if (slptime && slptime != ticks) {
int hzticks;
- hzticks = (ticks - td->td_sched->ts_slptime) << 10;
+ hzticks = (ticks - slptime) << SCHED_TICK_SHIFT;
if (hzticks >= SCHED_SLP_RUN_MAX) {
td->td_sched->skg_slptime = SCHED_SLP_RUN_MAX;
td->td_sched->skg_runtime = 1;
@@ -1479,9 +1376,9 @@ sched_wakeup(struct thread *td)
td->td_sched->skg_slptime += hzticks;
sched_interact_update(td);
}
+ if (ticks - (hz / 10) > td->td_sched->ts_ltick)
+ sched_pctcpu_update(td->td_sched);
sched_priority(td);
- sched_slice(td->td_sched);
- td->td_sched->ts_slptime = 0;
}
setrunqueue(td, SRQ_BORING);
}
@@ -1495,6 +1392,14 @@ sched_fork(struct thread *td, struct thread *child)
{
mtx_assert(&sched_lock, MA_OWNED);
sched_fork_thread(td, child);
+ /*
+ * Penalize the parent and child for forking.
+ */
+ sched_interact_fork(child);
+ sched_priority(child);
+ td->td_sched->skg_runtime += tickincr;
+ sched_interact_update(td);
+ sched_priority(td);
}
void
@@ -1503,26 +1408,28 @@ sched_fork_thread(struct thread *td, struct thread *child)
struct td_sched *ts;
struct td_sched *ts2;
- child->td_sched->skg_slptime = td->td_sched->skg_slptime;
- child->td_sched->skg_runtime = td->td_sched->skg_runtime;
- child->td_user_pri = td->td_user_pri;
- child->td_base_user_pri = td->td_base_user_pri;
- sched_interact_fork(child);
- td->td_sched->skg_runtime += tickincr;
- sched_interact_update(td);
-
+ /*
+ * Initialize child.
+ */
sched_newthread(child);
-
ts = td->td_sched;
ts2 = child->td_sched;
- ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */
ts2->ts_cpu = ts->ts_cpu;
ts2->ts_runq = NULL;
-
- /* Grab our parents cpu estimation information. */
+ /*
+ * Grab our parents cpu estimation information and priority.
+ */
ts2->ts_ticks = ts->ts_ticks;
ts2->ts_ltick = ts->ts_ltick;
ts2->ts_ftick = ts->ts_ftick;
+ child->td_user_pri = td->td_user_pri;
+ child->td_base_user_pri = td->td_base_user_pri;
+ /*
+ * And update interactivity score.
+ */
+ ts2->skg_slptime = ts->skg_slptime;
+ ts2->skg_runtime = ts->skg_runtime;
+ ts2->ts_slice = 1; /* Attempt to quickly learn interactivity. */
}
void
@@ -1540,10 +1447,8 @@ sched_class(struct thread *td, int class)
nclass = PRI_BASE(class);
oclass = PRI_BASE(td->td_pri_class);
ts = td->td_sched;
- if (!((ts->ts_state != TSS_ONRUNQ &&
- ts->ts_state != TSS_THREAD) || ts->ts_runq == NULL)) {
+ if (ts->ts_state == TSS_ONRUNQ || td->td_state == TDS_RUNNING) {
tdq = TDQ_CPU(ts->ts_cpu);
-
#ifdef SMP
/*
* On SMP if we're on the RUNQ we must adjust the transferable
@@ -1551,24 +1456,20 @@ sched_class(struct thread *td, int class)
* class.
*/
if (ts->ts_state == TSS_ONRUNQ) {
- if (THREAD_CAN_MIGRATE(ts)) {
+ if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
tdq->tdq_transferable--;
tdq->tdq_group->tdg_transferable--;
}
- if (THREAD_CAN_MIGRATE(ts)) {
+ if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
tdq->tdq_transferable++;
tdq->tdq_group->tdg_transferable++;
}
}
#endif
- if (oclass == PRI_TIMESHARE) {
+ if (oclass == PRI_TIMESHARE)
tdq->tdq_load_timeshare--;
- tdq_nice_rem(tdq, td->td_proc->p_nice);
- }
- if (nclass == PRI_TIMESHARE) {
+ if (nclass == PRI_TIMESHARE)
tdq->tdq_load_timeshare++;
- tdq_nice_add(tdq, td->td_proc->p_nice);
- }
}
td->td_pri_class = class;
@@ -1580,22 +1481,40 @@ sched_class(struct thread *td, int class)
void
sched_exit(struct proc *p, struct thread *child)
{
+ struct thread *td;
CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
child, child->td_proc->p_comm, child->td_priority);
- sched_exit_thread(FIRST_THREAD_IN_PROC(p), child);
+ td = FIRST_THREAD_IN_PROC(p);
+ sched_exit_thread(td, child);
}
void
sched_exit_thread(struct thread *td, struct thread *child)
{
+
CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
- child, childproc->p_comm, child->td_priority);
+ child, child->td_proc->p_comm, child->td_priority);
+ tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
+#ifdef KSE
+ /*
+ * KSE forks and exits so often that this penalty causes short-lived
+ * threads to always be non-interactive. This causes mozilla to
+ * crawl under load.
+ */
+ if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
+ return;
+#endif
+ /*
+ * Give the child's runtime to the parent without returning the
+ * sleep time as a penalty to the parent. This causes shells that
+ * launch expensive things to mark their children as expensive.
+ */
td->td_sched->skg_runtime += child->td_sched->skg_runtime;
sched_interact_update(td);
- tdq_load_rem(TDQ_CPU(child->td_sched->ts_cpu), child->td_sched);
+ sched_priority(td);
}
void
@@ -1627,7 +1546,9 @@ sched_clock(struct thread *td)
struct td_sched *ts;
mtx_assert(&sched_lock, MA_OWNED);
+
tdq = TDQ_SELF();
+ ts = td->td_sched;
#ifdef SMP
if (ticks >= bal_tick)
sched_balance();
@@ -1640,18 +1561,15 @@ sched_clock(struct thread *td)
if (tdq->tdq_assigned)
tdq_assign(tdq); /* Potentially sets NEEDRESCHED */
#endif
- ts = td->td_sched;
-
/* Adjust ticks for pctcpu */
- ts->ts_ticks++;
+ ts->ts_ticks += tickincr;
ts->ts_ltick = ticks;
-
- /* Go up to one second beyond our max and then trim back down */
- if (ts->ts_ftick + SCHED_CPU_TICKS + hz < ts->ts_ltick)
+ /*
+ * Update if we've exceeded our desired tick threshhold by over one
+ * second.
+ */
+ if (ts->ts_ftick + SCHED_TICK_TARG + hz < ts->ts_ltick)
sched_pctcpu_update(ts);
-
- if (td->td_flags & TDF_IDLETD)
- return;
/*
* We only do slicing code for TIMESHARE threads.
*/
@@ -1674,11 +1592,7 @@ sched_clock(struct thread *td)
*/
tdq_load_rem(tdq, ts);
sched_priority(td);
- sched_slice(ts);
- if (SCHED_CURR(td, ts))
- ts->ts_runq = tdq->tdq_curr;
- else
- ts->ts_runq = tdq->tdq_next;
+ ts->ts_slice = sched_slice;
tdq_load_add(tdq, ts);
td->td_flags |= TDF_NEEDRESCHED;
}
@@ -1726,13 +1640,12 @@ restart:
ts = tdq_choose(tdq);
if (ts) {
#ifdef SMP
- if (ts->ts_thread->td_pri_class == PRI_IDLE)
+ if (ts->ts_thread->td_priority <= PRI_MIN_IDLE)
if (tdq_idled(tdq) == 0)
goto restart;
#endif
tdq_runq_rem(tdq, ts);
ts->ts_state = TSS_THREAD;
- ts->ts_flags &= ~TSF_PREEMPTED;
return (ts);
}
#ifdef SMP
@@ -1755,19 +1668,19 @@ sched_add(struct thread *td, int flags)
td, td->td_proc->p_comm, td->td_priority, curthread,
curthread->td_proc->p_comm);
mtx_assert(&sched_lock, MA_OWNED);
- ts = td->td_sched;
- canmigrate = 1;
- preemptive = !(flags & SRQ_YIELDING);
- class = PRI_BASE(td->td_pri_class);
tdq = TDQ_SELF();
+ ts = td->td_sched;
ts->ts_flags &= ~TSF_INTERNAL;
+ class = PRI_BASE(td->td_pri_class);
+ preemptive = !(flags & SRQ_YIELDING);
+ canmigrate = 1;
#ifdef SMP
if (ts->ts_flags & TSF_ASSIGNED) {
if (ts->ts_flags & TSF_REMOVED)
ts->ts_flags &= ~TSF_REMOVED;
return;
}
- canmigrate = THREAD_CAN_MIGRATE(ts);
+ canmigrate = THREAD_CAN_MIGRATE(td);
/*
* Don't migrate running threads here. Force the long term balancer
* to do it.
@@ -1784,36 +1697,26 @@ sched_add(struct thread *td, int flags)
("sched_add: process swapped out"));
KASSERT(ts->ts_runq == NULL,
("sched_add: thread %p is still assigned to a run queue", td));
- if (flags & SRQ_PREEMPTED)
- ts->ts_flags |= TSF_PREEMPTED;
- switch (class) {
- case PRI_ITHD:
- case PRI_REALTIME:
- ts->ts_runq = tdq->tdq_curr;
- ts->ts_slice = SCHED_SLICE_MAX;
- if (canmigrate)
- ts->ts_cpu = PCPU_GET(cpuid);
- break;
- case PRI_TIMESHARE:
- if (SCHED_CURR(td, ts))
- ts->ts_runq = tdq->tdq_curr;
- else
- ts->ts_runq = tdq->tdq_next;
- break;
- case PRI_IDLE:
+ /*
+ * Set the slice and pick the run queue.
+ */
+ if (ts->ts_slice == 0)
+ ts->ts_slice = sched_slice;
+ if (td->td_priority <= PRI_MAX_REALTIME) {
+ ts->ts_runq = &tdq->tdq_realtime;
/*
- * This is for priority prop.
+ * If the thread is not artificially pinned and it's in
+ * the realtime queue we directly dispatch it on this cpu
+ * for minimum latency. Interrupt handlers may also have
+ * to complete on the cpu that dispatched them.
*/
- if (ts->ts_thread->td_priority < PRI_MIN_IDLE)
- ts->ts_runq = tdq->tdq_curr;
- else
- ts->ts_runq = &tdq->tdq_idle;
- ts->ts_slice = SCHED_SLICE_MIN;
- break;
- default:
- panic("Unknown pri class.");
- break;
- }
+ if (td->td_pinned == 0)
+ ts->ts_cpu = PCPU_GET(cpuid);
+ } else if (td->td_priority <= PRI_MAX_TIMESHARE)
+ ts->ts_runq = &tdq->tdq_timeshare;
+ else
+ ts->ts_runq = &tdq->tdq_idle;
+
#ifdef SMP
/*
* If this thread is pinned or bound, notify the target cpu.
@@ -1827,7 +1730,7 @@ sched_add(struct thread *td, int flags)
* If we had been idle, clear our bit in the group and potentially
* the global bitmap. If not, see if we should transfer this thread.
*/
- if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
+ if ((class != PRI_IDLE && class != PRI_ITHD) &&
(tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) {
/*
* Check to see if our group is unidling, and if so, remove it
@@ -1840,13 +1743,12 @@ sched_add(struct thread *td, int flags)
* Now remove ourselves from the group specific idle mask.
*/
tdq->tdq_group->tdg_idlemask &= ~PCPU_GET(cpumask);
- } else if (canmigrate && tdq->tdq_load > 1 && class != PRI_ITHD)
+ } else if (canmigrate && tdq->tdq_load > 1)
if (tdq_transfer(tdq, ts, class))
return;
ts->ts_cpu = PCPU_GET(cpuid);
#endif
- if (td->td_priority < curthread->td_priority &&
- ts->ts_runq == tdq->tdq_curr)
+ if (td->td_priority < curthread->td_priority)
curthread->td_flags |= TDF_NEEDRESCHED;
if (preemptive && maybe_preempt(td))
return;
@@ -1867,7 +1769,6 @@ sched_rem(struct thread *td)
curthread->td_proc->p_comm);
mtx_assert(&sched_lock, MA_OWNED);
ts = td->td_sched;
- ts->ts_flags &= ~TSF_PREEMPTED;
if (ts->ts_flags & TSF_ASSIGNED) {
ts->ts_flags |= TSF_REMOVED;
return;
@@ -1896,19 +1797,12 @@ sched_pctcpu(struct thread *td)
if (ts->ts_ticks) {
int rtick;
- /*
- * Don't update more frequently than twice a second. Allowing
- * this causes the cpu usage to decay away too quickly due to
- * rounding errors.
- */
- if (ts->ts_ftick + SCHED_CPU_TICKS < ts->ts_ltick ||
- ts->ts_ltick < (ticks - (hz / 2)))
+ if (ticks - (hz / 10) > td->td_sched->ts_ltick)
sched_pctcpu_update(ts);
/* How many rtick per second ? */
- rtick = min(ts->ts_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
- pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
+ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
+ pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
}
-
td->td_proc->p_swtime = ts->ts_ltick - ts->ts_ftick;
mtx_unlock_spin(&sched_lock);
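For reference, a hypothetical user-space sketch of the %cpu conversion in the
hunk above; the fixed-point constants mirror the kernel's FSHIFT/FSCALE, and
hz and the sample tick count are assumed examples.

/*
 * Hypothetical sketch of the fixed-point %cpu conversion shown above.
 */
#include <stdio.h>

#define FSHIFT			11
#define FSCALE			(1 << FSHIFT)
#define SCHED_TICK_SHIFT	10
#define SCHED_TICK_SECS		10

int
main(void)
{
	int hz = 1000;					/* assumed example */
	int ts_ticks = (5 * hz) << SCHED_TICK_SHIFT;	/* ~50% of a 10s window */
	int rtick;
	int pctcpu;

	/* hz ticks consumed per second of the averaging window. */
	rtick = (ts_ticks >> SCHED_TICK_SHIFT) / SCHED_TICK_SECS;
	if (rtick > hz)
		rtick = hz;
	/* Scale into the 0..FSCALE fixed-point range reported to ps(1). */
	pctcpu = (FSCALE * ((FSCALE * rtick) / hz)) >> FSHIFT;
	printf("pctcpu = %d/%d (%.1f%%)\n", pctcpu, FSCALE,
	    100.0 * pctcpu / FSCALE);
	return (0);
}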
@@ -1922,6 +1816,8 @@ sched_bind(struct thread *td, int cpu)
mtx_assert(&sched_lock, MA_OWNED);
ts = td->td_sched;
+ KASSERT((ts->ts_flags & TSF_BOUND) == 0,
+ ("sched_bind: thread %p already bound.", td));
ts->ts_flags |= TSF_BOUND;
#ifdef SMP
if (PCPU_GET(cpuid) == cpu)
@@ -1932,14 +1828,24 @@ sched_bind(struct thread *td, int cpu)
tdq_notify(ts, cpu);
/* When we return from mi_switch we'll be on the correct cpu. */
mi_switch(SW_VOL, NULL);
+ sched_pin();
#endif
}
void
sched_unbind(struct thread *td)
{
+ struct td_sched *ts;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ ts = td->td_sched;
+ KASSERT(ts->ts_flags & TSF_BOUND,
+ ("sched_unbind: thread %p not bound.", td));
mtx_assert(&sched_lock, MA_OWNED);
- td->td_sched->ts_flags &= ~TSF_BOUND;
+ ts->ts_flags &= ~TSF_BOUND;
+#ifdef SMP
+ sched_unpin();
+#endif
}
int
@@ -1991,5 +1897,19 @@ void
sched_tick(void)
{
}
+
+static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
+ "Scheduler name");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
+
+/* ps compat */
+static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
+SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
+
+
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"