Diffstat (limited to 'sys/kern/kern_timeout.c')
-rw-r--r-- | sys/kern/kern_timeout.c | 760 |
1 file changed, 522 insertions, 238 deletions
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c index a217db1..0787c01 100644 --- a/sys/kern/kern_timeout.c +++ b/sys/kern/kern_timeout.c @@ -37,7 +37,11 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_callout_profiling.h" #include "opt_kdtrace.h" +#if defined(__arm__) +#include "opt_timer.h" +#endif #include <sys/param.h> #include <sys/systm.h> @@ -59,6 +63,10 @@ __FBSDID("$FreeBSD$"); #include <machine/cpu.h> #endif +#ifndef NO_EVENTTIMERS +DPCPU_DECLARE(sbintime_t, hardclocktime); +#endif + SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE(callout_execute, kernel, , callout_start, callout-start); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_start, 0, @@ -67,6 +75,7 @@ SDT_PROBE_DEFINE(callout_execute, kernel, , callout_end, callout-end); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0, "struct callout *"); +#ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); @@ -79,6 +88,19 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); +static int avg_depth_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, + "Average number of direct callouts examined per callout_process call. " + "Units = 1/1000"); +static int avg_lockcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, + &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " + "callout_process call. Units = 1/1000"); +static int avg_mpcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, + 0, "Average number of MP direct callouts made per callout_process call. " + "Units = 1/1000"); +#endif /* * TODO: * allocate more timeout table slots when table overflows. @@ -86,58 +108,63 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, u_int callwheelsize, callwheelmask; /* - * The callout cpu migration entity represents informations necessary for - * describing the migrating callout to the new callout cpu. + * The callout cpu exec entities represent informations necessary for + * describing the state of callouts currently running on the CPU and the ones + * necessary for migrating callouts to the new callout cpu. In particular, + * the first entry of the array cc_exec_entity holds informations for callout + * running in SWI thread context, while the second one holds informations + * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ -struct cc_mig_ent { +struct cc_exec { + struct callout *cc_next; + struct callout *cc_curr; #ifdef SMP - void (*ce_migration_func)(void *); - void *ce_migration_arg; - int ce_migration_cpu; - int ce_migration_ticks; + void (*ce_migration_func)(void *); + void *ce_migration_arg; + int ce_migration_cpu; + sbintime_t ce_migration_time; #endif + boolean_t cc_cancel; + boolean_t cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. - * In particular: - * cc_ticks is incremented once per tick in callout_cpu(). 
- * It tracks the global 'ticks' but in a way that the individual - * threads should not worry about races in the order in which - * hardclock() and hardclock_cpu() run on the various CPUs. - * cc_softclock is advanced in callout_cpu() to point to the - * first entry in cc_callwheel that may need handling. In turn, - * a softclock() is scheduled so it can serve the various entries i - * such that cc_softclock <= i <= cc_ticks . - * XXX maybe cc_softclock and cc_ticks should be volatile ? - * - * cc_ticks is also used in callout_reset_cpu() to determine - * when the callout should be served. */ struct callout_cpu { struct mtx_padalign cc_lock; - struct cc_mig_ent cc_migrating_entity; + struct cc_exec cc_exec_entity[2]; struct callout *cc_callout; - struct callout_tailq *cc_callwheel; - struct callout_list cc_callfree; - struct callout *cc_next; - struct callout *cc_curr; + struct callout_list *cc_callwheel; + struct callout_tailq cc_expireq; + struct callout_slist cc_callfree; + sbintime_t cc_firstevent; + sbintime_t cc_lastscan; void *cc_cookie; - int cc_ticks; - int cc_softticks; - int cc_cancel; - int cc_waiting; - int cc_firsttick; + u_int cc_bucket; }; +#define cc_exec_curr cc_exec_entity[0].cc_curr +#define cc_exec_next cc_exec_entity[0].cc_next +#define cc_exec_cancel cc_exec_entity[0].cc_cancel +#define cc_exec_waiting cc_exec_entity[0].cc_waiting +#define cc_exec_curr_dir cc_exec_entity[1].cc_curr +#define cc_exec_next_dir cc_exec_entity[1].cc_next +#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel +#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting + #ifdef SMP -#define cc_migration_func cc_migrating_entity.ce_migration_func -#define cc_migration_arg cc_migrating_entity.ce_migration_arg -#define cc_migration_cpu cc_migrating_entity.ce_migration_cpu -#define cc_migration_ticks cc_migrating_entity.ce_migration_ticks +#define cc_migration_func cc_exec_entity[0].ce_migration_func +#define cc_migration_arg cc_exec_entity[0].ce_migration_arg +#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu +#define cc_migration_time cc_exec_entity[0].ce_migration_time +#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func +#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg +#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu +#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU @@ -153,39 +180,48 @@ struct callout_cpu cc_cpu; #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; -void (*callout_new_inserted)(int cpu, int ticks) = NULL; + +static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, +#ifdef CALLOUT_PROFILING + int *mpcalls, int *lockcalls, int *gcalls, +#endif + int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is curr_callout. - * If curr_callout is non-NULL, threads waiting in + * cc_curr - If a callout is in progress, it is cc_curr. + * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. - * cc_cancel - Changing to 1 with both callout_lock and c_lock held + * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. 
* The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after - * c_lock is successfully acquired. + * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when - * curr_callout is non-NULL. + * cc_curr is non-NULL. */ /* - * Resets the migration entity tied to a specific callout cpu. + * Resets the execution entity tied to a specific callout cpu. */ static void -cc_cme_cleanup(struct callout_cpu *cc) +cc_cce_cleanup(struct callout_cpu *cc, int direct) { + cc->cc_exec_entity[direct].cc_curr = NULL; + cc->cc_exec_entity[direct].cc_next = NULL; + cc->cc_exec_entity[direct].cc_cancel = FALSE; + cc->cc_exec_entity[direct].cc_waiting = FALSE; #ifdef SMP - cc->cc_migration_cpu = CPUBLOCK; - cc->cc_migration_ticks = 0; - cc->cc_migration_func = NULL; - cc->cc_migration_arg = NULL; + cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; + cc->cc_exec_entity[direct].ce_migration_time = 0; + cc->cc_exec_entity[direct].ce_migration_func = NULL; + cc->cc_exec_entity[direct].ce_migration_arg = NULL; #endif } @@ -193,11 +229,11 @@ cc_cme_cleanup(struct callout_cpu *cc) * Checks if migration is requested by a specific callout cpu. */ static int -cc_cme_migrating(struct callout_cpu *cc) +cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP - return (cc->cc_migration_cpu != CPUBLOCK); + return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); #else return (0); #endif @@ -225,7 +261,7 @@ kern_timeout_callwheel_alloc(caddr_t v) cc->cc_callout = (struct callout *)v; v = (caddr_t)(cc->cc_callout + ncallout); - cc->cc_callwheel = (struct callout_tailq *)v; + cc->cc_callwheel = (struct callout_list *)v; v = (caddr_t)(cc->cc_callwheel + callwheelsize); return(v); } @@ -238,10 +274,12 @@ callout_cpu_init(struct callout_cpu *cc) mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); - for (i = 0; i < callwheelsize; i++) { - TAILQ_INIT(&cc->cc_callwheel[i]); - } - cc_cme_cleanup(cc); + for (i = 0; i < callwheelsize; i++) + LIST_INIT(&cc->cc_callwheel[i]); + TAILQ_INIT(&cc->cc_expireq); + cc->cc_firstevent = INT64_MAX; + for (i = 0; i < 2; i++) + cc_cce_cleanup(cc, i); if (cc->cc_callout == NULL) return; for (i = 0; i < ncallout; i++) { @@ -320,7 +358,7 @@ start_softclock(void *dummy) panic("died while creating standard software ithreads"); cc->cc_callout = NULL; /* Only cpu0 handles timeout(). 
*/ cc->cc_callwheel = malloc( - sizeof(struct callout_tailq) * callwheelsize, M_CALLOUT, + sizeof(struct callout_list) * callwheelsize, M_CALLOUT, M_WAITOK); callout_cpu_init(cc); } @@ -329,64 +367,148 @@ start_softclock(void *dummy) SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); +#define CC_HASH_SHIFT 8 + +static inline u_int +callout_hash(sbintime_t sbt) +{ + + return (sbt >> (32 - CC_HASH_SHIFT)); +} + +static inline u_int +callout_get_bucket(sbintime_t sbt) +{ + + return (callout_hash(sbt) & callwheelmask); +} + void -callout_tick(void) +callout_process(sbintime_t now) { + struct callout *tmp, *tmpn; struct callout_cpu *cc; - int need_softclock; - int bucket; + struct callout_list *sc; + sbintime_t first, last, max, tmp_max; + uint32_t lookahead; + u_int firstb, lastb, nowb; +#ifdef CALLOUT_PROFILING + int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; +#endif - /* - * Process callouts at a very low cpu priority, so we don't keep the - * relatively high clock interrupt priority any longer than necessary. - */ - need_softclock = 0; cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - cc->cc_firsttick = cc->cc_ticks = ticks; - for (; (cc->cc_softticks - cc->cc_ticks) <= 0; cc->cc_softticks++) { - bucket = cc->cc_softticks & callwheelmask; - if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) { - need_softclock = 1; - break; - } + + /* Compute the buckets of the last scan and present times. */ + firstb = callout_hash(cc->cc_lastscan); + cc->cc_lastscan = now; + nowb = callout_hash(now); + + /* Compute the last bucket and minimum time of the bucket after it. */ + if (nowb == firstb) + lookahead = (SBT_1S / 16); + else if (nowb - firstb == 1) + lookahead = (SBT_1S / 8); + else + lookahead = (SBT_1S / 2); + first = last = now; + first += (lookahead / 2); + last += lookahead; + last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); + lastb = callout_hash(last) - 1; + max = last; + + /* + * Check if we wrapped around the entire wheel from the last scan. + * In case, we need to scan entirely the wheel for pending callouts. + */ + if (lastb - firstb >= callwheelsize) { + lastb = firstb + callwheelsize - 1; + if (nowb - firstb >= callwheelsize) + nowb = lastb; } + + /* Iterate callwheel from firstb to nowb and then up to lastb. */ + do { + sc = &cc->cc_callwheel[firstb & callwheelmask]; + tmp = LIST_FIRST(sc); + while (tmp != NULL) { + /* Run the callout if present time within allowed. */ + if (tmp->c_time <= now) { + /* + * Consumer told us the callout may be run + * directly from hardware interrupt context. + */ + if (tmp->c_flags & CALLOUT_DIRECT) { +#ifdef CALLOUT_PROFILING + ++depth_dir; +#endif + cc->cc_exec_next_dir = + LIST_NEXT(tmp, c_links.le); + cc->cc_bucket = firstb & callwheelmask; + LIST_REMOVE(tmp, c_links.le); + softclock_call_cc(tmp, cc, +#ifdef CALLOUT_PROFILING + &mpcalls_dir, &lockcalls_dir, NULL, +#endif + 1); + tmp = cc->cc_exec_next_dir; + } else { + tmpn = LIST_NEXT(tmp, c_links.le); + LIST_REMOVE(tmp, c_links.le); + TAILQ_INSERT_TAIL(&cc->cc_expireq, + tmp, c_links.tqe); + tmp->c_flags |= CALLOUT_PROCESSED; + tmp = tmpn; + } + continue; + } + /* Skip events from distant future. */ + if (tmp->c_time >= max) + goto next; + /* + * Event minimal time is bigger than present maximal + * time, so it cannot be aggregated. + */ + if (tmp->c_time > last) { + lastb = nowb; + goto next; + } + /* Update first and last time, respecting this event. 
*/ + if (tmp->c_time < first) + first = tmp->c_time; + tmp_max = tmp->c_time + tmp->c_precision; + if (tmp_max < last) + last = tmp_max; +next: + tmp = LIST_NEXT(tmp, c_links.le); + } + /* Proceed with the next bucket. */ + firstb++; + /* + * Stop if we looked after present time and found + * some event we can't execute at now. + * Stop if we looked far enough into the future. + */ + } while (((int)(firstb - lastb)) <= 0); + cc->cc_firstevent = last; +#ifndef NO_EVENTTIMERS + cpu_new_callout(curcpu, last, first); +#endif +#ifdef CALLOUT_PROFILING + avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; + avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; + avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; +#endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ - if (need_softclock) + if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } -int -callout_tickstofirst(int limit) -{ - struct callout_cpu *cc; - struct callout *c; - struct callout_tailq *sc; - int curticks; - int skip = 1; - - cc = CC_SELF(); - mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - curticks = cc->cc_ticks; - while( skip < ncallout && skip < limit ) { - sc = &cc->cc_callwheel[ (curticks+skip) & callwheelmask ]; - /* search scanning ticks */ - TAILQ_FOREACH( c, sc, c_links.tqe ){ - if (c->c_time - curticks <= ncallout) - goto out; - } - skip++; - } -out: - cc->cc_firsttick = curticks + skip; - mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); - return (skip); -} - static struct callout_cpu * callout_lock(struct callout *c) { @@ -412,26 +534,41 @@ callout_lock(struct callout *c) } static void -callout_cc_add(struct callout *c, struct callout_cpu *cc, int to_ticks, - void (*func)(void *), void *arg, int cpu) +callout_cc_add(struct callout *c, struct callout_cpu *cc, + sbintime_t sbt, sbintime_t precision, void (*func)(void *), + void *arg, int cpu, int flags) { + int bucket; CC_LOCK_ASSERT(cc); - - if (to_ticks <= 0) - to_ticks = 1; + if (sbt < cc->cc_lastscan) + sbt = cc->cc_lastscan; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + if (flags & C_DIRECT_EXEC) + c->c_flags |= CALLOUT_DIRECT; + c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; - c->c_time = ticks + to_ticks; - TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask], - c, c_links.tqe); - if ((c->c_time - cc->cc_firsttick) < 0 && - callout_new_inserted != NULL) { - cc->cc_firsttick = c->c_time; - (*callout_new_inserted)(cpu, - to_ticks + (ticks - cc->cc_ticks)); + c->c_time = sbt; + c->c_precision = precision; + bucket = callout_get_bucket(c->c_time); + CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", + c, (int)(c->c_precision >> 32), + (u_int)(c->c_precision & 0xffffffff)); + LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); + if (cc->cc_bucket == bucket) + cc->cc_exec_next_dir = c; +#ifndef NO_EVENTTIMERS + /* + * Inform the eventtimers(4) subsystem there's a new callout + * that has been inserted, but only if really required. 
+ */ + sbt = c->c_time + c->c_precision; + if (sbt < cc->cc_firstevent) { + cc->cc_firstevent = sbt; + cpu_new_callout(cpu, sbt, c->c_time); } +#endif } static void @@ -445,8 +582,11 @@ callout_cc_del(struct callout *c, struct callout_cpu *cc) } static void -softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, - int *lockcalls, int *gcalls) +softclock_call_cc(struct callout *c, struct callout_cpu *cc, +#ifdef CALLOUT_PROFILING + int *mpcalls, int *lockcalls, int *gcalls, +#endif + int direct) { void (*c_func)(void *); void *c_arg; @@ -457,12 +597,13 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; - int new_cpu, new_ticks; + int flags, new_cpu; + sbintime_t new_time; #endif -#ifdef DIAGNOSTIC - struct bintime bt1, bt2; +#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) + sbintime_t bt1, bt2; struct timespec ts2; - static uint64_t maxdt = 36893488147419102LL; /* 2 msec */ + static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif @@ -479,8 +620,8 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; - cc->cc_curr = c; - cc->cc_cancel = 0; + cc->cc_exec_entity[direct].cc_curr = c; + cc->cc_exec_entity[direct].cc_cancel = FALSE; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, sharedlock); @@ -488,29 +629,34 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, * The callout may have been cancelled * while we switched locks. */ - if (cc->cc_cancel) { + if (cc->cc_exec_entity[direct].cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ - cc->cc_cancel = 1; - + cc->cc_exec_entity[direct].cc_cancel = TRUE; if (c_lock == &Giant.lock_object) { +#ifdef CALLOUT_PROFILING (*gcalls)++; - CTR3(KTR_CALLOUT, "callout %p func %p arg %p", +#endif + CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { +#ifdef CALLOUT_PROFILING (*lockcalls)++; +#endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { +#ifdef CALLOUT_PROFILING (*mpcalls)++; - CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p", +#endif + CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } #ifdef DIAGNOSTIC - binuptime(&bt1); + sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0); @@ -518,16 +664,16 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0); THREAD_SLEEPING_OK(); #ifdef DIAGNOSTIC - binuptime(&bt2); - bintime_sub(&bt2, &bt1); - if (bt2.frac > maxdt) { - if (lastfunc != c_func || bt2.frac > maxdt * 2) { - bintime2timespec(&bt2, &ts2); + bt2 = sbinuptime(); + bt2 -= bt1; + if (bt2 > maxdt) { + if (lastfunc != c_func || bt2 > maxdt * 2) { + ts2 = sbttots(bt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } - maxdt = bt2.frac; + maxdt = bt2; lastfunc = c_func; } #endif @@ -536,17 +682,17 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, class->lc_unlock(c_lock); skip: CC_LOCK(cc); - KASSERT(cc->cc_curr == c, ("mishandled cc_curr")); - cc->cc_curr = NULL; - if (cc->cc_waiting) { + KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); + cc->cc_exec_entity[direct].cc_curr = NULL; 
+ if (cc->cc_exec_entity[direct].cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ - if (cc_cme_migrating(cc)) { - cc_cme_cleanup(cc); + if (cc_cce_migrating(cc, direct)) { + cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not @@ -554,11 +700,11 @@ skip: */ c->c_flags &= ~CALLOUT_DFRMIGRATION; } - cc->cc_waiting = 0; + cc->cc_exec_entity[direct].cc_waiting = FALSE; CC_UNLOCK(cc); - wakeup(&cc->cc_waiting); + wakeup(&cc->cc_exec_entity[direct].cc_waiting); CC_LOCK(cc); - } else if (cc_cme_migrating(cc)) { + } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP @@ -566,11 +712,11 @@ skip: * If the callout was scheduled for * migration just perform it now. */ - new_cpu = cc->cc_migration_cpu; - new_ticks = cc->cc_migration_ticks; - new_func = cc->cc_migration_func; - new_arg = cc->cc_migration_arg; - cc_cme_cleanup(cc); + new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; + new_time = cc->cc_exec_entity[direct].ce_migration_time; + new_func = cc->cc_exec_entity[direct].ce_migration_func; + new_arg = cc->cc_exec_entity[direct].ce_migration_arg; + cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed @@ -588,8 +734,9 @@ skip: c->c_flags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); - callout_cc_add(c, new_cc, new_ticks, new_func, new_arg, - new_cpu); + flags = (direct) ? C_DIRECT_EXEC : 0; + callout_cc_add(c, new_cc, new_time, c->c_precision, new_func, + new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else @@ -632,63 +779,29 @@ softclock(void *arg) { struct callout_cpu *cc; struct callout *c; - struct callout_tailq *bucket; - int curticks; - int steps; /* #steps since we last allowed interrupts */ - int depth; - int mpcalls; - int lockcalls; - int gcalls; - -#ifndef MAX_SOFTCLOCK_STEPS -#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ -#endif /* MAX_SOFTCLOCK_STEPS */ - - mpcalls = 0; - lockcalls = 0; - gcalls = 0; - depth = 0; - steps = 0; +#ifdef CALLOUT_PROFILING + int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; +#endif + cc = (struct callout_cpu *)arg; CC_LOCK(cc); - while (cc->cc_softticks - 1 != cc->cc_ticks) { - /* - * cc_softticks may be modified by hard clock, so cache - * it while we work on a given bucket. - */ - curticks = cc->cc_softticks; - cc->cc_softticks++; - bucket = &cc->cc_callwheel[curticks & callwheelmask]; - c = TAILQ_FIRST(bucket); - while (c != NULL) { - depth++; - if (c->c_time != curticks) { - c = TAILQ_NEXT(c, c_links.tqe); - ++steps; - if (steps >= MAX_SOFTCLOCK_STEPS) { - cc->cc_next = c; - /* Give interrupts a chance. 
*/ - CC_UNLOCK(cc); - ; /* nothing */ - CC_LOCK(cc); - c = cc->cc_next; - steps = 0; - } - } else { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(bucket, c, c_links.tqe); - softclock_call_cc(c, cc, &mpcalls, - &lockcalls, &gcalls); - steps = 0; - c = cc->cc_next; - } - } + while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); + softclock_call_cc(c, cc, +#ifdef CALLOUT_PROFILING + &mpcalls, &lockcalls, &gcalls, +#endif + 0); +#ifdef CALLOUT_PROFILING + ++depth; +#endif } +#ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; - cc->cc_next = NULL; +#endif CC_UNLOCK(cc); } @@ -778,28 +891,71 @@ callout_handle_init(struct callout_handle *handle) * callout_deactivate() - marks the callout as having been serviced */ int -callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), - void *arg, int cpu) +callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, + void (*ftn)(void *), void *arg, int cpu, int flags) { + sbintime_t to_sbt, pr; struct callout_cpu *cc; - int cancelled = 0; + int cancelled, direct; + cancelled = 0; + if (flags & C_ABSOLUTE) { + to_sbt = sbt; + } else { + if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) + sbt = tick_sbt; + if ((flags & C_HARDCLOCK) || +#ifdef NO_EVENTTIMERS + sbt >= sbt_timethreshold) { + to_sbt = getsbinuptime(); + + /* Add safety belt for the case of hz > 1000. */ + to_sbt += tc_tick_sbt - tick_sbt; +#else + sbt >= sbt_tickthreshold) { + /* + * Obtain the time of the last hardclock() call on + * this CPU directly from the kern_clocksource.c. + * This value is per-CPU, but it is equal for all + * active ones. + */ +#ifdef __LP64__ + to_sbt = DPCPU_GET(hardclocktime); +#else + spinlock_enter(); + to_sbt = DPCPU_GET(hardclocktime); + spinlock_exit(); +#endif +#endif + if ((flags & C_HARDCLOCK) == 0) + to_sbt += tick_sbt; + } else + to_sbt = sbinuptime(); + to_sbt += sbt; + pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : + sbt >> C_PRELGET(flags)); + if (pr > precision) + precision = pr; + } /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; + direct = (c->c_flags & CALLOUT_DIRECT) != 0; + KASSERT(!direct || c->c_lock == NULL, + ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); - if (cc->cc_curr == c) { + if (cc->cc_exec_entity[direct].cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ - if (c->c_lock != NULL && !cc->cc_cancel) - cancelled = cc->cc_cancel = 1; - if (cc->cc_waiting) { + if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel) + cancelled = cc->cc_exec_entity[direct].cc_cancel = TRUE; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. 
@@ -812,12 +968,12 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), } } if (c->c_flags & CALLOUT_PENDING) { - if (cc->cc_next == c) { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - } - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); - + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); + LIST_REMOVE(c, c_links.le); + } else + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } @@ -829,15 +985,17 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), * to a more appropriate moment. */ if (c->c_cpu != cpu) { - if (cc->cc_curr == c) { - cc->cc_migration_cpu = cpu; - cc->cc_migration_ticks = to_ticks; - cc->cc_migration_func = ftn; - cc->cc_migration_arg = arg; + if (cc->cc_exec_entity[direct].cc_curr == c) { + cc->cc_exec_entity[direct].ce_migration_cpu = cpu; + cc->cc_exec_entity[direct].ce_migration_time + = to_sbt; + cc->cc_exec_entity[direct].ce_migration_func = ftn; + cc->cc_exec_entity[direct].ce_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; - CTR5(KTR_CALLOUT, - "migration of %p func %p arg %p in %d to %u deferred", - c, c->c_func, c->c_arg, to_ticks, cpu); + CTR6(KTR_CALLOUT, + "migration of %p func %p arg %p in %d.%08x to %u deferred", + c, c->c_func, c->c_arg, (int)(to_sbt >> 32), + (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } @@ -845,9 +1003,10 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), } #endif - callout_cc_add(c, cc, to_ticks, ftn, arg, cpu); - CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d", - cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks); + callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); + CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", + cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), + (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); @@ -875,7 +1034,7 @@ _callout_stop_safe(c, safe) { struct callout_cpu *cc, *old_cc; struct lock_class *class; - int use_lock, sq_locked; + int direct, sq_locked, use_lock; /* * Some old subsystems don't hold Giant while running a callout_stop(), @@ -891,7 +1050,7 @@ _callout_stop_safe(c, safe) } } else use_lock = 0; - + direct = (c->c_flags & CALLOUT_DIRECT) != 0; sq_locked = 0; old_cc = NULL; again: @@ -905,7 +1064,7 @@ again: if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); - sleepq_release(&old_cc->cc_waiting); + sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); sq_locked = 0; old_cc = NULL; goto again; @@ -926,12 +1085,13 @@ again: * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ - if (cc->cc_curr != c) { + if (cc->cc_exec_entity[direct].cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release( + &cc->cc_exec_entity[direct].cc_waiting); return (0); } @@ -942,8 +1102,7 @@ again: * just wait for the current invocation to * finish. 
*/ - while (cc->cc_curr == c) { - + while (cc->cc_exec_entity[direct].cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid @@ -963,7 +1122,8 @@ again: */ if (!sq_locked) { CC_UNLOCK(cc); - sleepq_lock(&cc->cc_waiting); + sleepq_lock( + &cc->cc_exec_entity[direct].cc_waiting); sq_locked = 1; old_cc = cc; goto again; @@ -975,13 +1135,16 @@ again: * will be packed up, just let softclock() * take care of it. */ - cc->cc_waiting = 1; + cc->cc_exec_entity[direct].cc_waiting = TRUE; DROP_GIANT(); CC_UNLOCK(cc); - sleepq_add(&cc->cc_waiting, + sleepq_add( + &cc->cc_exec_entity[direct].cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); - sleepq_wait(&cc->cc_waiting, 0); + sleepq_wait( + &cc->cc_exec_entity[direct].cc_waiting, + 0); sq_locked = 0; old_cc = NULL; @@ -989,7 +1152,8 @@ again: PICKUP_GIANT(); CC_LOCK(cc); } - } else if (use_lock && !cc->cc_cancel) { + } else if (use_lock && + !cc->cc_exec_entity[direct].cc_cancel) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout @@ -997,10 +1161,10 @@ again: * lock, the callout will be skipped in * softclock(). */ - cc->cc_cancel = 1; + cc->cc_exec_entity[direct].cc_cancel = TRUE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - KASSERT(!cc_cme_migrating(cc), + KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); @@ -1019,16 +1183,18 @@ again: return (0); } if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - if (cc->cc_next == c) - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); + LIST_REMOVE(c, c_links.le); + } else + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); callout_cc_del(c, cc); CC_UNLOCK(cc); @@ -1135,3 +1301,121 @@ adjust_timeout_calltodo(time_change) return; } #endif /* APM_FIXUP_CALLTODO */ + +static int +flssbt(sbintime_t sbt) +{ + + sbt += (uint64_t)sbt >> 1; + if (sizeof(long) >= sizeof(sbintime_t)) + return (flsl(sbt)); + if (sbt >= SBT_1S) + return (flsl(((uint64_t)sbt) >> 32) + 32); + return (flsl(sbt)); +} + +/* + * Dump immediate statistic snapshot of the scheduled callouts. 
+ */ +static int +sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) +{ + struct callout *tmp; + struct callout_cpu *cc; + struct callout_list *sc; + sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; + int ct[64], cpr[64], ccpbk[32]; + int error, val, i, count, tcum, pcum, maxc, c, medc; +#ifdef SMP + int cpu; +#endif + + val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + count = maxc = 0; + st = spr = maxt = maxpr = 0; + bzero(ccpbk, sizeof(ccpbk)); + bzero(ct, sizeof(ct)); + bzero(cpr, sizeof(cpr)); + now = sbinuptime(); +#ifdef SMP + CPU_FOREACH(cpu) { + cc = CC_CPU(cpu); +#else + cc = CC_CPU(timeout_cpu); +#endif + CC_LOCK(cc); + for (i = 0; i < callwheelsize; i++) { + sc = &cc->cc_callwheel[i]; + c = 0; + LIST_FOREACH(tmp, sc, c_links.le) { + c++; + t = tmp->c_time - now; + if (t < 0) + t = 0; + st += t / SBT_1US; + spr += tmp->c_precision / SBT_1US; + if (t > maxt) + maxt = t; + if (tmp->c_precision > maxpr) + maxpr = tmp->c_precision; + ct[flssbt(t)]++; + cpr[flssbt(tmp->c_precision)]++; + } + if (c > maxc) + maxc = c; + ccpbk[fls(c + c / 2)]++; + count += c; + } + CC_UNLOCK(cc); +#ifdef SMP + } +#endif + + for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) + tcum += ct[i]; + medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; + for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) + pcum += cpr[i]; + medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; + for (i = 0, c = 0; i < 32 && c < count / 2; i++) + c += ccpbk[i]; + medc = (i >= 2) ? (1 << (i - 2)) : 0; + + printf("Scheduled callouts statistic snapshot:\n"); + printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", + count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); + printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", + medc, + count / callwheelsize / mp_ncpus, + (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, + maxc); + printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", + medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, + (st / count) / 1000000, (st / count) % 1000000, + maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); + printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", + medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, + (spr / count) / 1000000, (spr / count) % 1000000, + maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); + printf(" Distribution: \tbuckets\t time\t tcum\t" + " prec\t pcum\n"); + for (i = 0, tcum = pcum = 0; i < 64; i++) { + if (ct[i] == 0 && cpr[i] == 0) + continue; + t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0; + tcum += ct[i]; + pcum += cpr[i]; + printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", + t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, + i - 1 - (32 - CC_HASH_SHIFT), + ct[i], tcum, cpr[i], pcum); + } + return (error); +} +SYSCTL_PROC(_kern, OID_AUTO, callout_stat, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + 0, 0, sysctl_kern_callout_stat, "I", + "Dump immediate statistic snapshot of the scheduled callouts"); |
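Notes on the new wheel indexing (not part of the patch): sbintime_t is a 64-bit 32.32 fixed-point count of seconds, so shifting right by (32 - CC_HASH_SHIFT) yields time in 1/2^CC_HASH_SHIFT-second units; with CC_HASH_SHIFT = 8 one callwheel bucket spans about 3.9 ms. Below is a minimal user-space sketch of callout_hash()/callout_get_bucket() as they appear in the diff; the callwheelsize/callwheelmask values are made-up stand-ins for the kernel's power-of-two sizing.

#include <stdint.h>
#include <stdio.h>

typedef int64_t sbintime_t;             /* 32.32 fixed-point seconds */
#define SBT_1S          ((sbintime_t)1 << 32)
#define SBT_1MS         (SBT_1S / 1000)
#define CC_HASH_SHIFT   8               /* bucket width = 1/256 s */

static unsigned callwheelsize = 512;    /* stand-in: power of two */
static unsigned callwheelmask = 512 - 1;

static unsigned
callout_hash(sbintime_t sbt)
{
        /* Seconds * 256 plus the top 8 bits of the fraction. */
        return ((unsigned)(sbt >> (32 - CC_HASH_SHIFT)));
}

static unsigned
callout_get_bucket(sbintime_t sbt)
{
        return (callout_hash(sbt) & callwheelmask);
}

int
main(void)
{
        sbintime_t t = 5 * SBT_1S + 20 * SBT_1MS;       /* 5.020 s */

        printf("hash %u, bucket %u of %u\n", callout_hash(t),
            callout_get_bucket(t), callwheelsize);
        return (0);
}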
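The callout_process() loop above coalesces future events into one event-timer reprogramming per scan; the width of the aggregation window depends on how far the wheel has advanced since the last scan. The following is a condensed restatement of that window computation as a standalone helper, using the same constants and arithmetic as the hunk; packaging it as a function is the only thing added.

/*
 * Compute the aggregation window [*firstp, *lastp] used when scanning
 * the wheel: events whose [c_time, c_time + c_precision] interval
 * intersects the window can share a single hardware timer interrupt.
 * Builds on the callout_hash() sketch above.
 */
static void
callout_window(sbintime_t lastscan, sbintime_t now,
    sbintime_t *firstp, sbintime_t *lastp)
{
        sbintime_t lookahead;
        unsigned firstb, nowb;

        firstb = callout_hash(lastscan);
        nowb = callout_hash(now);

        /* The further behind the last scan is, the wider the window. */
        if (nowb == firstb)
                lookahead = SBT_1S / 16;
        else if (nowb - firstb == 1)
                lookahead = SBT_1S / 8;
        else
                lookahead = SBT_1S / 2;

        *firstp = now + lookahead / 2;
        /* Upper bound rounded down to a bucket boundary, as in the diff. */
        *lastp = (now + lookahead) &
            (0xffffffffffffffffULL << (32 - CC_HASH_SHIFT));
}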
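Finally, a hedged consumer-side sketch of the new interface. Only callout_reset_sbt_on() and the C_ABSOLUTE/C_HARDCLOCK/C_DIRECT_EXEC flags are visible in this file; SBT_1MS and the <sys/callout.h>/<sys/time.h> glue (including the C_PREL() tolerance encoding read back here through C_PRELGET()) are assumed from the companion changes in the same calloutng series.

/* Kernel-side sketch; assumes headers from the same patch series. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/pcpu.h>
#include <sys/time.h>

static struct callout my_co;

static void
my_timeout(void *arg)
{
        /* ... periodic or one-shot work ... */
}

static void
my_start(void)
{
        callout_init(&my_co, 1);        /* MP-safe: run without Giant */

        /*
         * Fire roughly 50 ms from now, allowing up to 5 ms of slack so
         * the wakeup can be coalesced with nearby events; flags = 0,
         * so the handler runs from the per-CPU SWI, not C_DIRECT_EXEC.
         */
        callout_reset_sbt_on(&my_co, 50 * SBT_1MS, 5 * SBT_1MS,
            my_timeout, NULL, curcpu, 0);
}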