author    | davide <davide@FreeBSD.org> | 2013-03-04 11:09:56 +0000
committer | davide <davide@FreeBSD.org> | 2013-03-04 11:09:56 +0000
commit    | 431035cf16837066ffdc5abc7e48a56cc1dfed5d (patch)
tree      | 0c872a90fed688dcf8a42c76624fcacd6e02d513 /sys
parent    | e52f997818951d22a197789d4aa9c32ab77ab70a (diff)
- Make callout(9) tickless, relying on eventtimers(4) as the backend for
precise time event generation. This greatly improves the granularity of
callouts, which are no longer constrained to wait for the next tick before
being scheduled.
- Extend the callout KPI by introducing a set of callout_reset_sbt* functions,
which take an sbintime_t as the timeout argument. The new KPI also offers a
way for consumers to specify the precision tolerance they allow, so that the
callout code can coalesce events, reduce the number of interrupts, and
potentially avoid scheduling a SWI thread. (A usage sketch follows this list.)
- Introduce support for dispatching callouts directly from hardware
interrupt context by specifying an additional flag. This feature should be
used carefully, since interrupt context comes with limitations
(e.g. no sleepable locks can be held). (See the second sketch after this
list.)
- Enhance the mechanisms for gathering information about the callwheel,
introducing a new sysctl to obtain stats.
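
As a rough usage sketch of the new KPI from the second item above (a hedged
illustration only: callout_reset_sbt_on() and the SBT_* constants are taken
from the patch itself, while foo_co/foo_timeout/foo_arm are hypothetical
consumer names):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/kernel.h>
    #include <sys/callout.h>
    #include <sys/time.h>

    /* Assumed initialized elsewhere with callout_init(&foo_co, 1). */
    static struct callout foo_co;

    static void
    foo_timeout(void *arg)
    {
            /* Periodic work; runs from the callout SWI thread by default. */
    }

    /*
     * Arm the callout to fire in ~50 ms while allowing up to 5 ms of slop,
     * so the callwheel may coalesce it with neighbouring events and avoid
     * extra timer interrupts.
     */
    static void
    foo_arm(int cpu)
    {
            callout_reset_sbt_on(&foo_co, 50 * SBT_1MS, 5 * SBT_1MS,
                foo_timeout, NULL, cpu, 0);
    }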
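
A similar hedged sketch of the direct-dispatch mode from the third item
(C_DIRECT_EXEC is the flag added by this patch, which also asserts that a
direct callout has no lock attached; fast_co/fast_handler/fast_arm are
hypothetical names):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/callout.h>
    #include <sys/time.h>

    /* callout_init(&fast_co, 1): no lock may be attached to a direct callout. */
    static struct callout fast_co;

    static void
    fast_handler(void *arg)
    {
            /* Runs from hardware interrupt context: keep it short, never sleep. */
    }

    static void
    fast_arm(int cpu)
    {
            /* ~100 us from now, 20 us tolerance, run straight from the timer interrupt. */
            callout_reset_sbt_on(&fast_co, 100 * SBT_1US, 20 * SBT_1US,
                fast_handler, NULL, cpu, C_DIRECT_EXEC);
    }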
This change breaks the KBI: struct callout fields have changed. In
particular, the 'int ticks' field (4 bytes) has been replaced with an
'sbintime_t' (8 bytes), and another 'sbintime_t' field was added to hold the
precision (see the note on sbintime_t below).
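
As background for the field change above: sbintime_t is a 64-bit signed
fixed-point count of seconds in 32.32 format, which is why the KTR traces in
this patch print times as "%d.%08x" using the value shifted right by 32 for
the seconds and the low 32 bits for the fraction. A minimal sketch of the
conversions the patch relies on (SBT_1S, SBT_1MS and the tick_sbt computation
appear in the diff; the function itself is illustrative only):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/kernel.h>         /* hz */
    #include <sys/time.h>           /* sbintime_t, SBT_1S, SBT_1MS */

    static void
    sbintime_sketch(void)
    {
            sbintime_t per_tick = SBT_1S / hz;  /* how kern_tc.c now computes tick_sbt */
            sbintime_t two_ms = 2 * SBT_1MS;    /* the new maxdt threshold in softclock_call_cc() */
            int32_t secs = (int32_t)(per_tick >> 32);           /* whole seconds */
            uint32_t frac = (uint32_t)(per_tick & 0xffffffff);  /* units of 1/2^32 s */

            printf("tick = %d.%08x sbt, 2 ms = %jd sbt\n", secs, frac,
                (intmax_t)two_ms);
    }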
Together with: mav
Reviewed by: attilio, bde, luigi, phk
Sponsored by: Google Summer of Code 2012, iXsystems inc.
Tested by: flo (amd64, sparc64), marius (sparc64), ian (arm),
markj (amd64), mav, Fabian Keil
Diffstat (limited to 'sys')
-rw-r--r--  sys/conf/NOTES              |   5
-rw-r--r--  sys/conf/options            |   1
-rw-r--r--  sys/kern/kern_clock.c       |   3
-rw-r--r--  sys/kern/kern_clocksource.c | 468
-rw-r--r--  sys/kern/kern_tc.c          |  59
-rw-r--r--  sys/kern/kern_timeout.c     | 760
-rw-r--r--  sys/kern/subr_param.c       |   8
-rw-r--r--  sys/netinet/tcp_timer.c     |  18
-rw-r--r--  sys/sys/_callout.h          |   7
-rw-r--r--  sys/sys/callout.h           |  24
-rw-r--r--  sys/sys/systm.h             |   1
-rw-r--r--  sys/sys/time.h              |  44
12 files changed, 893 insertions, 505 deletions
diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 5d26093..27c3380 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -259,6 +259,8 @@ options SX_NOINLINE # SMP Debugging Options: # +# CALLOUT_PROFILING enables rudimentary profiling of the callwheel data +# structure used as backend in callout(9). # PREEMPTION allows the threads that are in the kernel to be preempted by # higher priority [interrupt] threads. It helps with interactivity # and allows interrupt threads to run sooner rather than waiting. @@ -297,6 +299,9 @@ options LOCK_PROFILING options MPROF_BUFFERS="1536" options MPROF_HASH_SIZE="1543" +# Profiling for the callout(9) backend. +options CALLOUT_PROFILING + # Profiling for internal hash tables. options SLEEPQUEUE_PROFILING options TURNSTILE_PROFILING diff --git a/sys/conf/options b/sys/conf/options index ab5d153..75d0c97 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -68,6 +68,7 @@ TEXTDUMP_VERBOSE opt_ddb.h ADAPTIVE_LOCKMGRS ALQ AUDIT opt_global.h +CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_compat.h diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 55a2bff..4439ccb 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -460,7 +460,7 @@ hardclock_cpu(int usermode) if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); + callout_process(sbinuptime()); } /* @@ -550,7 +550,6 @@ hardclock_cnt(int cnt, int usermode) if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); /* We are in charge to handle this tick duty. */ if (newticks > 0) { /* Dangerous and no need to call these things concurrently. */ diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c index 10732d9..c2bebbe 100644 --- a/sys/kern/kern_clocksource.c +++ b/sys/kern/kern_clocksource.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2010-2012 Alexander Motin <mav@FreeBSD.org> + * Copyright (c) 2010-2013 Alexander Motin <mav@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/kdb.h> #include <sys/ktr.h> @@ -63,17 +64,14 @@ int cpu_can_deep_sleep = 0; /* C3 state is available. */ int cpu_disable_deep_sleep = 0; /* Timer dies in C3. */ static void setuptimer(void); -static void loadtimer(struct bintime *now, int first); +static void loadtimer(sbintime_t now, int first); static int doconfigtimer(void); static void configtimer(int start); static int round_freq(struct eventtimer *et, int freq); -static void getnextcpuevent(struct bintime *event, int idle); -static void getnextevent(struct bintime *event); -static int handleevents(struct bintime *now, int fake); -#ifdef SMP -static void cpu_new_callout(int cpu, int ticks); -#endif +static sbintime_t getnextcpuevent(int idle); +static sbintime_t getnextevent(void); +static int handleevents(sbintime_t now, int fake); static struct mtx et_hw_mtx; @@ -94,13 +92,11 @@ static struct mtx et_hw_mtx; } static struct eventtimer *timer = NULL; -static struct bintime timerperiod; /* Timer period for periodic mode. */ -static struct bintime hardperiod; /* hardclock() events period. */ -static struct bintime statperiod; /* statclock() events period. */ -static struct bintime profperiod; /* profclock() events period. 
*/ -static struct bintime nexttick; /* Next global timer tick time. */ -static struct bintime nexthard; /* Next global hardlock() event. */ -static u_int busy = 0; /* Reconfiguration is in progress. */ +static sbintime_t timerperiod; /* Timer period for periodic mode. */ +static sbintime_t statperiod; /* statclock() events period. */ +static sbintime_t profperiod; /* profclock() events period. */ +static sbintime_t nexttick; /* Next global timer tick time. */ +static u_int busy = 1; /* Reconfiguration is in progress. */ static int profiling = 0; /* Profiling events enabled. */ static char timername[32]; /* Wanted timer. */ @@ -116,11 +112,6 @@ TUNABLE_INT("kern.eventtimer.idletick", &idletick); SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RW, &idletick, 0, "Run periodic events when idle"); -static u_int activetick = 1; /* Run all periodic events when active. */ -TUNABLE_INT("kern.eventtimer.activetick", &activetick); -SYSCTL_UINT(_kern_eventtimer, OID_AUTO, activetick, CTLFLAG_RW, &activetick, - 0, "Run all periodic events when active"); - static int periodic = 0; /* Periodic or one-shot mode. */ static int want_periodic = 0; /* What mode to prefer. */ TUNABLE_INT("kern.eventtimer.periodic", &want_periodic); @@ -129,31 +120,23 @@ struct pcpu_state { struct mtx et_hw_mtx; /* Per-CPU timer mutex. */ u_int action; /* Reconfiguration requests. */ u_int handle; /* Immediate handle resuests. */ - struct bintime now; /* Last tick time. */ - struct bintime nextevent; /* Next scheduled event on this CPU. */ - struct bintime nexttick; /* Next timer tick time. */ - struct bintime nexthard; /* Next hardlock() event. */ - struct bintime nextstat; /* Next statclock() event. */ - struct bintime nextprof; /* Next profclock() event. */ + sbintime_t now; /* Last tick time. */ + sbintime_t nextevent; /* Next scheduled event on this CPU. */ + sbintime_t nexttick; /* Next timer tick time. */ + sbintime_t nexthard; /* Next hardlock() event. */ + sbintime_t nextstat; /* Next statclock() event. */ + sbintime_t nextprof; /* Next profclock() event. */ + sbintime_t nextcall; /* Next callout event. */ + sbintime_t nextcallopt; /* Next optional callout event. */ #ifdef KDTRACE_HOOKS - struct bintime nextcyc; /* Next OpenSolaris cyclics event. */ + sbintime_t nextcyc; /* Next OpenSolaris cyclics event. */ #endif int ipi; /* This CPU needs IPI. */ int idle; /* This CPU is in idle mode. */ }; static DPCPU_DEFINE(struct pcpu_state, timerstate); - -#define FREQ2BT(freq, bt) \ -{ \ - (bt)->sec = 0; \ - (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ -} -#define BT2FREQ(bt) \ - (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ - ((bt)->frac >> 1)) - -#define SBT2FREQ(sbt) ((SBT_1S + ((sbt) >> 1)) / (sbt)) +DPCPU_DEFINE(sbintime_t, hardclocktime); /* * Timer broadcast IPI handler. @@ -161,7 +144,7 @@ static DPCPU_DEFINE(struct pcpu_state, timerstate); int hardclockintr(void) { - struct bintime now; + sbintime_t now; struct pcpu_state *state; int done; @@ -169,10 +152,9 @@ hardclockintr(void) return (FILTER_HANDLED); state = DPCPU_PTR(timerstate); now = state->now; - CTR4(KTR_SPARE2, "ipi at %d: now %d.%08x%08x", - curcpu, now.sec, (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); - done = handleevents(&now, 0); + CTR3(KTR_SPARE2, "ipi at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); + done = handleevents(now, 0); return (done ? 
FILTER_HANDLED : FILTER_STRAY); } @@ -180,48 +162,43 @@ hardclockintr(void) * Handle all events for specified time on this CPU */ static int -handleevents(struct bintime *now, int fake) +handleevents(sbintime_t now, int fake) { - struct bintime t; + sbintime_t t, *hct; struct trapframe *frame; struct pcpu_state *state; - uintfptr_t pc; int usermode; int done, runs; - CTR4(KTR_SPARE2, "handle at %d: now %d.%08x%08x", - curcpu, now->sec, (u_int)(now->frac >> 32), - (u_int)(now->frac & 0xffffffff)); + CTR3(KTR_SPARE2, "handle at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = 0; if (fake) { frame = NULL; usermode = 0; - pc = 0; } else { frame = curthread->td_intr_frame; usermode = TRAPF_USERMODE(frame); - pc = TRAPF_PC(frame); } state = DPCPU_PTR(timerstate); runs = 0; - while (bintime_cmp(now, &state->nexthard, >=)) { - bintime_addx(&state->nexthard, hardperiod.frac); + while (now >= state->nexthard) { + state->nexthard += tick_sbt; runs++; } if (runs) { - if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 && - bintime_cmp(&state->nexthard, &nexthard, >)) - nexthard = state->nexthard; + hct = DPCPU_PTR(hardclocktime); + *hct = state->nexthard - tick_sbt; if (fake < 2) { hardclock_cnt(runs, usermode); done = 1; } } runs = 0; - while (bintime_cmp(now, &state->nextstat, >=)) { - bintime_addx(&state->nextstat, statperiod.frac); + while (now >= state->nextstat) { + state->nextstat += statperiod; runs++; } if (runs && fake < 2) { @@ -230,31 +207,29 @@ handleevents(struct bintime *now, int fake) } if (profiling) { runs = 0; - while (bintime_cmp(now, &state->nextprof, >=)) { - bintime_addx(&state->nextprof, profperiod.frac); + while (now >= state->nextprof) { + state->nextprof += profperiod; runs++; } if (runs && !fake) { - profclock_cnt(runs, usermode, pc); + profclock_cnt(runs, usermode, TRAPF_PC(frame)); done = 1; } } else state->nextprof = state->nextstat; + if (now >= state->nextcallopt) { + state->nextcall = state->nextcallopt = INT64_MAX; + callout_process(now); + } #ifdef KDTRACE_HOOKS - if (fake == 0 && cyclic_clock_func != NULL && - state->nextcyc.sec != -1 && - bintime_cmp(now, &state->nextcyc, >=)) { - state->nextcyc.sec = -1; + if (fake == 0 && now >= state->nextcyc && cyclic_clock_func != NULL) { + state->nextcyc = INT64_MAX; (*cyclic_clock_func)(frame); } #endif - getnextcpuevent(&t, 0); - if (fake == 2) { - state->nextevent = t; - return (done); - } + t = getnextcpuevent(0); ET_HW_LOCK(state); if (!busy) { state->idle = 0; @@ -268,84 +243,81 @@ handleevents(struct bintime *now, int fake) /* * Schedule binuptime of the next event on current CPU. */ -static void -getnextcpuevent(struct bintime *event, int idle) +static sbintime_t +getnextcpuevent(int idle) { - struct bintime tmp; + sbintime_t event; struct pcpu_state *state; - int skip; + u_int hardfreq; state = DPCPU_PTR(timerstate); - /* Handle hardclock() events. */ - *event = state->nexthard; - if (idle || (!activetick && !profiling && - (timer->et_flags & ET_FLAGS_PERCPU) == 0)) { - skip = idle ? 4 : (stathz / 2); - if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > skip) - skip = tc_min_ticktock_freq; - skip = callout_tickstofirst(hz / skip) - 1; - CTR2(KTR_SPARE2, "skip at %d: %d", curcpu, skip); - tmp = hardperiod; - bintime_mul(&tmp, skip); - bintime_add(event, &tmp); - } + /* Handle hardclock() events, skipping some if CPU is idle. 
*/ + event = state->nexthard; + if (idle) { + hardfreq = (u_int)hz / 2; + if (tc_min_ticktock_freq > 2 +#ifdef SMP + && curcpu == CPU_FIRST() +#endif + ) + hardfreq = hz / tc_min_ticktock_freq; + if (hardfreq > 1) + event += tick_sbt * (hardfreq - 1); + } + /* Handle callout events. */ + if (event > state->nextcall) + event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ - if (bintime_cmp(event, &state->nextstat, >)) - *event = state->nextstat; - if (profiling && bintime_cmp(event, &state->nextprof, >)) - *event = state->nextprof; + if (event > state->nextstat) + event = state->nextstat; + if (profiling && event > state->nextprof) + event = state->nextprof; } #ifdef KDTRACE_HOOKS - if (state->nextcyc.sec != -1 && bintime_cmp(event, &state->nextcyc, >)) - *event = state->nextcyc; + if (event > state->nextcyc) + event = state->nextcyc; #endif + return (event); } /* * Schedule binuptime of the next event on all CPUs. */ -static void -getnextevent(struct bintime *event) +static sbintime_t +getnextevent(void) { struct pcpu_state *state; + sbintime_t event; #ifdef SMP int cpu; #endif - int c, nonidle; + int c; state = DPCPU_PTR(timerstate); - *event = state->nextevent; - c = curcpu; - nonidle = !state->idle; - if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { + event = state->nextevent; + c = -1; #ifdef SMP - if (smp_started) { - CPU_FOREACH(cpu) { - if (curcpu == cpu) - continue; - state = DPCPU_ID_PTR(cpu, timerstate); - nonidle += !state->idle; - if (bintime_cmp(event, &state->nextevent, >)) { - *event = state->nextevent; - c = cpu; - } + if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { + CPU_FOREACH(cpu) { + state = DPCPU_ID_PTR(cpu, timerstate); + if (event > state->nextevent) { + event = state->nextevent; + c = cpu; } } -#endif - if (nonidle != 0 && bintime_cmp(event, &nexthard, >)) - *event = nexthard; } - CTR5(KTR_SPARE2, "next at %d: next %d.%08x%08x by %d", - curcpu, event->sec, (u_int)(event->frac >> 32), - (u_int)(event->frac & 0xffffffff), c); +#endif + CTR4(KTR_SPARE2, "next at %d: next %d.%08x by %d", + curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c); + return (event); } /* Hardware timer callback function. */ static void timercb(struct eventtimer *et, void *arg) { - struct bintime now; - struct bintime *next; + sbintime_t now; + sbintime_t *next; struct pcpu_state *state; #ifdef SMP int cpu, bcast; @@ -360,16 +332,14 @@ timercb(struct eventtimer *et, void *arg) next = &state->nexttick; } else next = &nexttick; - binuptime(&now); - if (periodic) { - *next = now; - bintime_addx(next, timerperiod.frac); /* Next tick in 1 period. */ - } else - next->sec = -1; /* Next tick is not scheduled yet. */ + now = sbinuptime(); + if (periodic) + *next = now + timerperiod; + else + *next = -1; /* Next tick is not scheduled yet. */ state->now = now; - CTR4(KTR_SPARE2, "intr at %d: now %d.%08x%08x", - curcpu, (int)(now.sec), (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); + CTR3(KTR_SPARE2, "intr at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); #ifdef SMP /* Prepare broadcasting to other CPUs for non-per-CPU timers. 
*/ @@ -379,8 +349,8 @@ timercb(struct eventtimer *et, void *arg) state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); state->now = now; - if (bintime_cmp(&now, &state->nextevent, >=)) { - state->nextevent.sec++; + if (now >= state->nextevent) { + state->nextevent += SBT_1S; if (curcpu != cpu) { state->ipi = 1; bcast = 1; @@ -392,7 +362,7 @@ timercb(struct eventtimer *et, void *arg) #endif /* Handle events for this time on this CPU. */ - handleevents(&now, 0); + handleevents(now, 0); #ifdef SMP /* Broadcast interrupt to other CPUs for non-per-CPU timers. */ @@ -414,11 +384,11 @@ timercb(struct eventtimer *et, void *arg) * Load new value into hardware timer. */ static void -loadtimer(struct bintime *now, int start) +loadtimer(sbintime_t now, int start) { struct pcpu_state *state; - struct bintime new; - struct bintime *next; + sbintime_t new; + sbintime_t *next; uint64_t tmp; int eq; @@ -433,30 +403,24 @@ loadtimer(struct bintime *now, int start) * Try to start all periodic timers aligned * to period to make events synchronous. */ - tmp = ((uint64_t)now->sec << 36) + (now->frac >> 28); - tmp = (tmp % (timerperiod.frac >> 28)) << 28; - new.sec = 0; - new.frac = timerperiod.frac - tmp; - if (new.frac < tmp) /* Left less then passed. */ - bintime_addx(&new, timerperiod.frac); + tmp = now % timerperiod; + new = timerperiod - tmp; + if (new < tmp) /* Left less then passed. */ + new += timerperiod; CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x", - curcpu, now->sec, (u_int)(now->frac >> 32), - new.sec, (u_int)(new.frac >> 32)); - *next = new; - bintime_add(next, now); - et_start(timer, bttosbt(new), bttosbt(timerperiod)); + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff), + (int)(new >> 32), (u_int)(new & 0xffffffff)); + *next = new + now; + et_start(timer, new, timerperiod); } } else { - getnextevent(&new); - eq = bintime_cmp(&new, next, ==); - CTR5(KTR_SPARE2, "load at %d: next %d.%08x%08x eq %d", - curcpu, new.sec, (u_int)(new.frac >> 32), - (u_int)(new.frac & 0xffffffff), - eq); + new = getnextevent(); + eq = (new == *next); + CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d", + curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq); if (!eq) { *next = new; - bintime_sub(&new, now); - et_start(timer, bttosbt(new), 0); + et_start(timer, new - now, 0); } } } @@ -478,7 +442,7 @@ setuptimer(void) while (freq < (profiling ? profhz : stathz)) freq += hz; freq = round_freq(timer, freq); - FREQ2BT(freq, &timerperiod); + timerperiod = SBT_1S / freq; } /* @@ -487,15 +451,15 @@ setuptimer(void) static int doconfigtimer(void) { - struct bintime now; + sbintime_t now; struct pcpu_state *state; state = DPCPU_PTR(timerstate); switch (atomic_load_acq_int(&state->action)) { case 1: - binuptime(&now); + now = sbinuptime(); ET_HW_LOCK(state); - loadtimer(&now, 1); + loadtimer(now, 1); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); @@ -509,8 +473,8 @@ doconfigtimer(void) return (1); } if (atomic_readandclear_int(&state->handle) && !busy) { - binuptime(&now); - handleevents(&now, 0); + now = sbinuptime(); + handleevents(now, 0); return (1); } return (0); @@ -523,40 +487,45 @@ doconfigtimer(void) static void configtimer(int start) { - struct bintime now, next; + sbintime_t now, next; struct pcpu_state *state; int cpu; if (start) { setuptimer(); - binuptime(&now); - } + now = sbinuptime(); + } else + now = 0; critical_enter(); ET_HW_LOCK(DPCPU_PTR(timerstate)); if (start) { /* Initialize time machine parameters. 
*/ - next = now; - bintime_addx(&next, timerperiod.frac); + next = now + timerperiod; if (periodic) nexttick = next; else - nexttick.sec = -1; + nexttick = -1; CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); state->now = now; - state->nextevent = next; + if (!smp_started && cpu != CPU_FIRST()) + state->nextevent = INT64_MAX; + else + state->nextevent = next; if (periodic) state->nexttick = next; else - state->nexttick.sec = -1; + state->nexttick = -1; state->nexthard = next; state->nextstat = next; state->nextprof = next; + state->nextcall = next; + state->nextcallopt = next; hardclock_sync(cpu); } busy = 0; /* Start global timer or per-CPU timer of this CPU. */ - loadtimer(&now, 1); + loadtimer(now, 1); } else { busy = 1; /* Stop global timer or per-CPU timer of this CPU. */ @@ -629,12 +598,11 @@ cpu_initclocks_bsp(void) state = DPCPU_ID_PTR(cpu, timerstate); mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); #ifdef KDTRACE_HOOKS - state->nextcyc.sec = -1; + state->nextcyc = INT64_MAX; #endif + state->nextcall = INT64_MAX; + state->nextcallopt = INT64_MAX; } -#ifdef SMP - callout_new_inserted = cpu_new_callout; -#endif periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) @@ -698,9 +666,10 @@ cpu_initclocks_bsp(void) profhz = round_freq(timer, stathz * 64); } tick = 1000000 / hz; - FREQ2BT(hz, &hardperiod); - FREQ2BT(stathz, &statperiod); - FREQ2BT(profhz, &profperiod); + tick_sbt = SBT_1S / hz; + tick_bt = sbttobt(tick_sbt); + statperiod = SBT_1S / stathz; + profperiod = SBT_1S / profhz; ET_LOCK(); configtimer(1); ET_UNLOCK(); @@ -712,18 +681,22 @@ cpu_initclocks_bsp(void) void cpu_initclocks_ap(void) { - struct bintime now; + sbintime_t now; struct pcpu_state *state; + struct thread *td; state = DPCPU_PTR(timerstate); - binuptime(&now); + now = sbinuptime(); ET_HW_LOCK(state); state->now = now; hardclock_sync(curcpu); - handleevents(&state->now, 2); - if (timer->et_flags & ET_FLAGS_PERCPU) - loadtimer(&now, 1); + spinlock_enter(); ET_HW_UNLOCK(state); + td = curthread; + td->td_intr_nesting_level++; + handleevents(state->now, 2); + td->td_intr_nesting_level--; + spinlock_exit(); } /* @@ -772,7 +745,7 @@ cpu_stopprofclock(void) sbintime_t cpu_idleclock(void) { - struct bintime now, t; + sbintime_t now, t; struct pcpu_state *state; if (idletick || busy || @@ -786,19 +759,17 @@ cpu_idleclock(void) if (periodic) now = state->now; else - binuptime(&now); - CTR4(KTR_SPARE2, "idle at %d: now %d.%08x%08x", - curcpu, now.sec, (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); - getnextcpuevent(&t, 1); + now = sbinuptime(); + CTR3(KTR_SPARE2, "idle at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); + t = getnextcpuevent(1); ET_HW_LOCK(state); state->idle = 1; state->nextevent = t; if (!periodic) - loadtimer(&now, 0); + loadtimer(now, 0); ET_HW_UNLOCK(state); - bintime_sub(&t, &now); - return (MAX(bttosbt(t), 0)); + return (MAX(t - now, 0)); } /* @@ -807,7 +778,7 @@ cpu_idleclock(void) void cpu_activeclock(void) { - struct bintime now; + sbintime_t now; struct pcpu_state *state; struct thread *td; @@ -817,101 +788,98 @@ cpu_activeclock(void) if (periodic) now = state->now; else - binuptime(&now); - CTR4(KTR_SPARE2, "active at %d: now %d.%08x%08x", - curcpu, now.sec, (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); + now = sbinuptime(); + CTR3(KTR_SPARE2, "active at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); spinlock_enter(); td = curthread; 
td->td_intr_nesting_level++; - handleevents(&now, 1); + handleevents(now, 1); td->td_intr_nesting_level--; spinlock_exit(); } #ifdef KDTRACE_HOOKS void -clocksource_cyc_set(const struct bintime *t) +clocksource_cyc_set(const struct bintime *bt) { - struct bintime now; + sbintime_t now, t; struct pcpu_state *state; + /* Do not touch anything if somebody reconfiguring timers. */ + if (busy) + return; + t = bttosbt(*bt); state = DPCPU_PTR(timerstate); if (periodic) now = state->now; else - binuptime(&now); + now = sbinuptime(); - CTR4(KTR_SPARE2, "set_cyc at %d: now %d.%08x%08x", - curcpu, now.sec, (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); - CTR4(KTR_SPARE2, "set_cyc at %d: t %d.%08x%08x", - curcpu, t->sec, (u_int)(t->frac >> 32), - (u_int)(t->frac & 0xffffffff)); + CTR5(KTR_SPARE2, "set_cyc at %d: now %d.%08x t %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff), + (int)(t >> 32), (u_int)(t & 0xffffffff)); ET_HW_LOCK(state); - if (bintime_cmp(t, &state->nextcyc, ==)) { - ET_HW_UNLOCK(state); - return; - } - state->nextcyc = *t; - if (bintime_cmp(&state->nextcyc, &state->nextevent, >=)) { - ET_HW_UNLOCK(state); - return; - } - state->nextevent = state->nextcyc; + if (t == state->nextcyc) + goto done; + state->nextcyc = t; + if (t >= state->nextevent) + goto done; + state->nextevent = t; if (!periodic) - loadtimer(&now, 0); + loadtimer(now, 0); +done: ET_HW_UNLOCK(state); } #endif -#ifdef SMP -static void -cpu_new_callout(int cpu, int ticks) +void +cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt) { - struct bintime tmp; struct pcpu_state *state; - CTR3(KTR_SPARE2, "new co at %d: on %d in %d", - curcpu, cpu, ticks); + /* Do not touch anything if somebody reconfiguring timers. */ + if (busy) + return; + CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x", + curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff), + (int)(bt >> 32), (u_int)(bt & 0xffffffff)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); - if (state->idle == 0 || busy) { - ET_HW_UNLOCK(state); - return; - } + /* - * If timer is periodic - just update next event time for target CPU. - * If timer is global - there is chance it is already programmed. + * If there is callout time already set earlier -- do nothing. + * This check may appear redundant because we check already in + * callout_process() but this double check guarantees we're safe + * with respect to race conditions between interrupts execution + * and scheduling. */ - if (periodic || (timer->et_flags & ET_FLAGS_PERCPU) == 0) { - tmp = hardperiod; - bintime_mul(&tmp, ticks - 1); - bintime_add(&tmp, &state->nexthard); - if (bintime_cmp(&tmp, &state->nextevent, <)) - state->nextevent = tmp; - if (periodic || - bintime_cmp(&state->nextevent, &nexttick, >=)) { - ET_HW_UNLOCK(state); - return; - } + state->nextcallopt = bt_opt; + if (bt >= state->nextcall) + goto done; + state->nextcall = bt; + /* If there is some other event set earlier -- do nothing. */ + if (bt >= state->nextevent) + goto done; + state->nextevent = bt; + /* If timer is periodic -- there is nothing to reprogram. */ + if (periodic) + goto done; + /* If timer is global or of the current CPU -- reprogram it. */ + if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { + loadtimer(sbinuptime(), 0); +done: + ET_HW_UNLOCK(state); + return; } - /* - * Otherwise we have to wake that CPU up, as we can't get present - * bintime to reprogram global timer from here. If timer is per-CPU, - * we by definition can't do it from here. 
- */ + /* Otherwise make other CPU to reprogram it. */ + state->handle = 1; ET_HW_UNLOCK(state); - if (timer->et_flags & ET_FLAGS_PERCPU) { - state->handle = 1; - ipi_cpu(cpu, IPI_HARDCLOCK); - } else { - if (!cpu_idle_wakeup(cpu)) - ipi_cpu(cpu, IPI_AST); - } -} +#ifdef SMP + ipi_cpu(cpu, IPI_HARDCLOCK); #endif +} /* * Report or change the active event timers hardware. diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index 6e1f486..9fe7ebe 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -22,6 +22,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kernel.h> +#include <sys/limits.h> #ifdef FFCLOCK #include <sys/lock.h> #include <sys/mutex.h> @@ -119,6 +120,21 @@ static int timestepwarnings; SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW, ×tepwarnings, 0, "Log time steps"); +struct bintime bt_timethreshold; +struct bintime bt_tickthreshold; +sbintime_t sbt_timethreshold; +sbintime_t sbt_tickthreshold; +struct bintime tc_tick_bt; +sbintime_t tc_tick_sbt; +int tc_precexp; +int tc_timepercentage = TC_DEFAULTPERC; +TUNABLE_INT("kern.timecounter.alloweddeviation", &tc_timepercentage); +static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, + sysctl_kern_timecounter_adjprecision, "I", + "Allowed time interval deviation in percents"); + static void tc_windup(void); static void cpu_tick_calibrate(int); @@ -1746,10 +1762,47 @@ tc_ticktock(int cnt) tc_windup(); } +static void __inline +tc_adjprecision(void) +{ + int t; + + if (tc_timepercentage > 0) { + t = (99 + tc_timepercentage) / tc_timepercentage; + tc_precexp = fls(t + (t >> 1)) - 1; + FREQ2BT(hz / tc_tick, &bt_timethreshold); + FREQ2BT(hz, &bt_tickthreshold); + bintime_shift(&bt_timethreshold, tc_precexp); + bintime_shift(&bt_tickthreshold, tc_precexp); + } else { + tc_precexp = 31; + bt_timethreshold.sec = INT_MAX; + bt_timethreshold.frac = ~(uint64_t)0; + bt_tickthreshold = bt_timethreshold; + } + sbt_timethreshold = bttosbt(bt_timethreshold); + sbt_tickthreshold = bttosbt(bt_tickthreshold); +} + +static int +sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = tc_timepercentage; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + tc_timepercentage = val; + tc_adjprecision(); + return (0); +} + static void inittimecounter(void *dummy) { u_int p; + int tick_rate; /* * Set the initial timeout to @@ -1763,6 +1816,12 @@ inittimecounter(void *dummy) tc_tick = (hz + 500) / 1000; else tc_tick = 1; + tc_adjprecision(); + FREQ2BT(hz, &tick_bt); + tick_sbt = bttosbt(tick_bt); + tick_rate = hz / tc_tick; + FREQ2BT(tick_rate, &tc_tick_bt); + tc_tick_sbt = bttosbt(tc_tick_bt); p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c index a217db1..0787c01 100644 --- a/sys/kern/kern_timeout.c +++ b/sys/kern/kern_timeout.c @@ -37,7 +37,11 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_callout_profiling.h" #include "opt_kdtrace.h" +#if defined(__arm__) +#include "opt_timer.h" +#endif #include <sys/param.h> #include <sys/systm.h> @@ -59,6 +63,10 @@ __FBSDID("$FreeBSD$"); #include <machine/cpu.h> #endif +#ifndef NO_EVENTTIMERS +DPCPU_DECLARE(sbintime_t, hardclocktime); +#endif + SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE(callout_execute, kernel, , callout_start, 
callout-start); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_start, 0, @@ -67,6 +75,7 @@ SDT_PROBE_DEFINE(callout_execute, kernel, , callout_end, callout-end); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0, "struct callout *"); +#ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); @@ -79,6 +88,19 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); +static int avg_depth_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, + "Average number of direct callouts examined per callout_process call. " + "Units = 1/1000"); +static int avg_lockcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, + &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " + "callout_process call. Units = 1/1000"); +static int avg_mpcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, + 0, "Average number of MP direct callouts made per callout_process call. " + "Units = 1/1000"); +#endif /* * TODO: * allocate more timeout table slots when table overflows. @@ -86,58 +108,63 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, u_int callwheelsize, callwheelmask; /* - * The callout cpu migration entity represents informations necessary for - * describing the migrating callout to the new callout cpu. + * The callout cpu exec entities represent informations necessary for + * describing the state of callouts currently running on the CPU and the ones + * necessary for migrating callouts to the new callout cpu. In particular, + * the first entry of the array cc_exec_entity holds informations for callout + * running in SWI thread context, while the second one holds informations + * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ -struct cc_mig_ent { +struct cc_exec { + struct callout *cc_next; + struct callout *cc_curr; #ifdef SMP - void (*ce_migration_func)(void *); - void *ce_migration_arg; - int ce_migration_cpu; - int ce_migration_ticks; + void (*ce_migration_func)(void *); + void *ce_migration_arg; + int ce_migration_cpu; + sbintime_t ce_migration_time; #endif + boolean_t cc_cancel; + boolean_t cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. - * In particular: - * cc_ticks is incremented once per tick in callout_cpu(). - * It tracks the global 'ticks' but in a way that the individual - * threads should not worry about races in the order in which - * hardclock() and hardclock_cpu() run on the various CPUs. - * cc_softclock is advanced in callout_cpu() to point to the - * first entry in cc_callwheel that may need handling. In turn, - * a softclock() is scheduled so it can serve the various entries i - * such that cc_softclock <= i <= cc_ticks . - * XXX maybe cc_softclock and cc_ticks should be volatile ? - * - * cc_ticks is also used in callout_reset_cpu() to determine - * when the callout should be served. 
*/ struct callout_cpu { struct mtx_padalign cc_lock; - struct cc_mig_ent cc_migrating_entity; + struct cc_exec cc_exec_entity[2]; struct callout *cc_callout; - struct callout_tailq *cc_callwheel; - struct callout_list cc_callfree; - struct callout *cc_next; - struct callout *cc_curr; + struct callout_list *cc_callwheel; + struct callout_tailq cc_expireq; + struct callout_slist cc_callfree; + sbintime_t cc_firstevent; + sbintime_t cc_lastscan; void *cc_cookie; - int cc_ticks; - int cc_softticks; - int cc_cancel; - int cc_waiting; - int cc_firsttick; + u_int cc_bucket; }; +#define cc_exec_curr cc_exec_entity[0].cc_curr +#define cc_exec_next cc_exec_entity[0].cc_next +#define cc_exec_cancel cc_exec_entity[0].cc_cancel +#define cc_exec_waiting cc_exec_entity[0].cc_waiting +#define cc_exec_curr_dir cc_exec_entity[1].cc_curr +#define cc_exec_next_dir cc_exec_entity[1].cc_next +#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel +#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting + #ifdef SMP -#define cc_migration_func cc_migrating_entity.ce_migration_func -#define cc_migration_arg cc_migrating_entity.ce_migration_arg -#define cc_migration_cpu cc_migrating_entity.ce_migration_cpu -#define cc_migration_ticks cc_migrating_entity.ce_migration_ticks +#define cc_migration_func cc_exec_entity[0].ce_migration_func +#define cc_migration_arg cc_exec_entity[0].ce_migration_arg +#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu +#define cc_migration_time cc_exec_entity[0].ce_migration_time +#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func +#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg +#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu +#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU @@ -153,39 +180,48 @@ struct callout_cpu cc_cpu; #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; -void (*callout_new_inserted)(int cpu, int ticks) = NULL; + +static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, +#ifdef CALLOUT_PROFILING + int *mpcalls, int *lockcalls, int *gcalls, +#endif + int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is curr_callout. - * If curr_callout is non-NULL, threads waiting in + * cc_curr - If a callout is in progress, it is cc_curr. + * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. - * cc_cancel - Changing to 1 with both callout_lock and c_lock held + * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after - * c_lock is successfully acquired. + * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when - * curr_callout is non-NULL. + * cc_curr is non-NULL. */ /* - * Resets the migration entity tied to a specific callout cpu. + * Resets the execution entity tied to a specific callout cpu. 
*/ static void -cc_cme_cleanup(struct callout_cpu *cc) +cc_cce_cleanup(struct callout_cpu *cc, int direct) { + cc->cc_exec_entity[direct].cc_curr = NULL; + cc->cc_exec_entity[direct].cc_next = NULL; + cc->cc_exec_entity[direct].cc_cancel = FALSE; + cc->cc_exec_entity[direct].cc_waiting = FALSE; #ifdef SMP - cc->cc_migration_cpu = CPUBLOCK; - cc->cc_migration_ticks = 0; - cc->cc_migration_func = NULL; - cc->cc_migration_arg = NULL; + cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; + cc->cc_exec_entity[direct].ce_migration_time = 0; + cc->cc_exec_entity[direct].ce_migration_func = NULL; + cc->cc_exec_entity[direct].ce_migration_arg = NULL; #endif } @@ -193,11 +229,11 @@ cc_cme_cleanup(struct callout_cpu *cc) * Checks if migration is requested by a specific callout cpu. */ static int -cc_cme_migrating(struct callout_cpu *cc) +cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP - return (cc->cc_migration_cpu != CPUBLOCK); + return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); #else return (0); #endif @@ -225,7 +261,7 @@ kern_timeout_callwheel_alloc(caddr_t v) cc->cc_callout = (struct callout *)v; v = (caddr_t)(cc->cc_callout + ncallout); - cc->cc_callwheel = (struct callout_tailq *)v; + cc->cc_callwheel = (struct callout_list *)v; v = (caddr_t)(cc->cc_callwheel + callwheelsize); return(v); } @@ -238,10 +274,12 @@ callout_cpu_init(struct callout_cpu *cc) mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); - for (i = 0; i < callwheelsize; i++) { - TAILQ_INIT(&cc->cc_callwheel[i]); - } - cc_cme_cleanup(cc); + for (i = 0; i < callwheelsize; i++) + LIST_INIT(&cc->cc_callwheel[i]); + TAILQ_INIT(&cc->cc_expireq); + cc->cc_firstevent = INT64_MAX; + for (i = 0; i < 2; i++) + cc_cce_cleanup(cc, i); if (cc->cc_callout == NULL) return; for (i = 0; i < ncallout; i++) { @@ -320,7 +358,7 @@ start_softclock(void *dummy) panic("died while creating standard software ithreads"); cc->cc_callout = NULL; /* Only cpu0 handles timeout(). */ cc->cc_callwheel = malloc( - sizeof(struct callout_tailq) * callwheelsize, M_CALLOUT, + sizeof(struct callout_list) * callwheelsize, M_CALLOUT, M_WAITOK); callout_cpu_init(cc); } @@ -329,64 +367,148 @@ start_softclock(void *dummy) SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); +#define CC_HASH_SHIFT 8 + +static inline u_int +callout_hash(sbintime_t sbt) +{ + + return (sbt >> (32 - CC_HASH_SHIFT)); +} + +static inline u_int +callout_get_bucket(sbintime_t sbt) +{ + + return (callout_hash(sbt) & callwheelmask); +} + void -callout_tick(void) +callout_process(sbintime_t now) { + struct callout *tmp, *tmpn; struct callout_cpu *cc; - int need_softclock; - int bucket; + struct callout_list *sc; + sbintime_t first, last, max, tmp_max; + uint32_t lookahead; + u_int firstb, lastb, nowb; +#ifdef CALLOUT_PROFILING + int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; +#endif - /* - * Process callouts at a very low cpu priority, so we don't keep the - * relatively high clock interrupt priority any longer than necessary. - */ - need_softclock = 0; cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - cc->cc_firsttick = cc->cc_ticks = ticks; - for (; (cc->cc_softticks - cc->cc_ticks) <= 0; cc->cc_softticks++) { - bucket = cc->cc_softticks & callwheelmask; - if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) { - need_softclock = 1; - break; - } + + /* Compute the buckets of the last scan and present times. 
*/ + firstb = callout_hash(cc->cc_lastscan); + cc->cc_lastscan = now; + nowb = callout_hash(now); + + /* Compute the last bucket and minimum time of the bucket after it. */ + if (nowb == firstb) + lookahead = (SBT_1S / 16); + else if (nowb - firstb == 1) + lookahead = (SBT_1S / 8); + else + lookahead = (SBT_1S / 2); + first = last = now; + first += (lookahead / 2); + last += lookahead; + last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); + lastb = callout_hash(last) - 1; + max = last; + + /* + * Check if we wrapped around the entire wheel from the last scan. + * In case, we need to scan entirely the wheel for pending callouts. + */ + if (lastb - firstb >= callwheelsize) { + lastb = firstb + callwheelsize - 1; + if (nowb - firstb >= callwheelsize) + nowb = lastb; } + + /* Iterate callwheel from firstb to nowb and then up to lastb. */ + do { + sc = &cc->cc_callwheel[firstb & callwheelmask]; + tmp = LIST_FIRST(sc); + while (tmp != NULL) { + /* Run the callout if present time within allowed. */ + if (tmp->c_time <= now) { + /* + * Consumer told us the callout may be run + * directly from hardware interrupt context. + */ + if (tmp->c_flags & CALLOUT_DIRECT) { +#ifdef CALLOUT_PROFILING + ++depth_dir; +#endif + cc->cc_exec_next_dir = + LIST_NEXT(tmp, c_links.le); + cc->cc_bucket = firstb & callwheelmask; + LIST_REMOVE(tmp, c_links.le); + softclock_call_cc(tmp, cc, +#ifdef CALLOUT_PROFILING + &mpcalls_dir, &lockcalls_dir, NULL, +#endif + 1); + tmp = cc->cc_exec_next_dir; + } else { + tmpn = LIST_NEXT(tmp, c_links.le); + LIST_REMOVE(tmp, c_links.le); + TAILQ_INSERT_TAIL(&cc->cc_expireq, + tmp, c_links.tqe); + tmp->c_flags |= CALLOUT_PROCESSED; + tmp = tmpn; + } + continue; + } + /* Skip events from distant future. */ + if (tmp->c_time >= max) + goto next; + /* + * Event minimal time is bigger than present maximal + * time, so it cannot be aggregated. + */ + if (tmp->c_time > last) { + lastb = nowb; + goto next; + } + /* Update first and last time, respecting this event. */ + if (tmp->c_time < first) + first = tmp->c_time; + tmp_max = tmp->c_time + tmp->c_precision; + if (tmp_max < last) + last = tmp_max; +next: + tmp = LIST_NEXT(tmp, c_links.le); + } + /* Proceed with the next bucket. */ + firstb++; + /* + * Stop if we looked after present time and found + * some event we can't execute at now. + * Stop if we looked far enough into the future. + */ + } while (((int)(firstb - lastb)) <= 0); + cc->cc_firstevent = last; +#ifndef NO_EVENTTIMERS + cpu_new_callout(curcpu, last, first); +#endif +#ifdef CALLOUT_PROFILING + avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; + avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; + avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; +#endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. 
*/ - if (need_softclock) + if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } -int -callout_tickstofirst(int limit) -{ - struct callout_cpu *cc; - struct callout *c; - struct callout_tailq *sc; - int curticks; - int skip = 1; - - cc = CC_SELF(); - mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - curticks = cc->cc_ticks; - while( skip < ncallout && skip < limit ) { - sc = &cc->cc_callwheel[ (curticks+skip) & callwheelmask ]; - /* search scanning ticks */ - TAILQ_FOREACH( c, sc, c_links.tqe ){ - if (c->c_time - curticks <= ncallout) - goto out; - } - skip++; - } -out: - cc->cc_firsttick = curticks + skip; - mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); - return (skip); -} - static struct callout_cpu * callout_lock(struct callout *c) { @@ -412,26 +534,41 @@ callout_lock(struct callout *c) } static void -callout_cc_add(struct callout *c, struct callout_cpu *cc, int to_ticks, - void (*func)(void *), void *arg, int cpu) +callout_cc_add(struct callout *c, struct callout_cpu *cc, + sbintime_t sbt, sbintime_t precision, void (*func)(void *), + void *arg, int cpu, int flags) { + int bucket; CC_LOCK_ASSERT(cc); - - if (to_ticks <= 0) - to_ticks = 1; + if (sbt < cc->cc_lastscan) + sbt = cc->cc_lastscan; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + if (flags & C_DIRECT_EXEC) + c->c_flags |= CALLOUT_DIRECT; + c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; - c->c_time = ticks + to_ticks; - TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask], - c, c_links.tqe); - if ((c->c_time - cc->cc_firsttick) < 0 && - callout_new_inserted != NULL) { - cc->cc_firsttick = c->c_time; - (*callout_new_inserted)(cpu, - to_ticks + (ticks - cc->cc_ticks)); + c->c_time = sbt; + c->c_precision = precision; + bucket = callout_get_bucket(c->c_time); + CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", + c, (int)(c->c_precision >> 32), + (u_int)(c->c_precision & 0xffffffff)); + LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); + if (cc->cc_bucket == bucket) + cc->cc_exec_next_dir = c; +#ifndef NO_EVENTTIMERS + /* + * Inform the eventtimers(4) subsystem there's a new callout + * that has been inserted, but only if really required. 
+ */ + sbt = c->c_time + c->c_precision; + if (sbt < cc->cc_firstevent) { + cc->cc_firstevent = sbt; + cpu_new_callout(cpu, sbt, c->c_time); } +#endif } static void @@ -445,8 +582,11 @@ callout_cc_del(struct callout *c, struct callout_cpu *cc) } static void -softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, - int *lockcalls, int *gcalls) +softclock_call_cc(struct callout *c, struct callout_cpu *cc, +#ifdef CALLOUT_PROFILING + int *mpcalls, int *lockcalls, int *gcalls, +#endif + int direct) { void (*c_func)(void *); void *c_arg; @@ -457,12 +597,13 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; - int new_cpu, new_ticks; + int flags, new_cpu; + sbintime_t new_time; #endif -#ifdef DIAGNOSTIC - struct bintime bt1, bt2; +#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) + sbintime_t bt1, bt2; struct timespec ts2; - static uint64_t maxdt = 36893488147419102LL; /* 2 msec */ + static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif @@ -479,8 +620,8 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; - cc->cc_curr = c; - cc->cc_cancel = 0; + cc->cc_exec_entity[direct].cc_curr = c; + cc->cc_exec_entity[direct].cc_cancel = FALSE; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, sharedlock); @@ -488,29 +629,34 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, * The callout may have been cancelled * while we switched locks. */ - if (cc->cc_cancel) { + if (cc->cc_exec_entity[direct].cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ - cc->cc_cancel = 1; - + cc->cc_exec_entity[direct].cc_cancel = TRUE; if (c_lock == &Giant.lock_object) { +#ifdef CALLOUT_PROFILING (*gcalls)++; - CTR3(KTR_CALLOUT, "callout %p func %p arg %p", +#endif + CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { +#ifdef CALLOUT_PROFILING (*lockcalls)++; +#endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { +#ifdef CALLOUT_PROFILING (*mpcalls)++; - CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p", +#endif + CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } #ifdef DIAGNOSTIC - binuptime(&bt1); + sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0); @@ -518,16 +664,16 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0); THREAD_SLEEPING_OK(); #ifdef DIAGNOSTIC - binuptime(&bt2); - bintime_sub(&bt2, &bt1); - if (bt2.frac > maxdt) { - if (lastfunc != c_func || bt2.frac > maxdt * 2) { - bintime2timespec(&bt2, &ts2); + bt2 = sbinuptime(); + bt2 -= bt1; + if (bt2 > maxdt) { + if (lastfunc != c_func || bt2 > maxdt * 2) { + ts2 = sbttots(bt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } - maxdt = bt2.frac; + maxdt = bt2; lastfunc = c_func; } #endif @@ -536,17 +682,17 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, class->lc_unlock(c_lock); skip: CC_LOCK(cc); - KASSERT(cc->cc_curr == c, ("mishandled cc_curr")); - cc->cc_curr = NULL; - if (cc->cc_waiting) { + KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); + cc->cc_exec_entity[direct].cc_curr = NULL; 
+ if (cc->cc_exec_entity[direct].cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ - if (cc_cme_migrating(cc)) { - cc_cme_cleanup(cc); + if (cc_cce_migrating(cc, direct)) { + cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not @@ -554,11 +700,11 @@ skip: */ c->c_flags &= ~CALLOUT_DFRMIGRATION; } - cc->cc_waiting = 0; + cc->cc_exec_entity[direct].cc_waiting = FALSE; CC_UNLOCK(cc); - wakeup(&cc->cc_waiting); + wakeup(&cc->cc_exec_entity[direct].cc_waiting); CC_LOCK(cc); - } else if (cc_cme_migrating(cc)) { + } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP @@ -566,11 +712,11 @@ skip: * If the callout was scheduled for * migration just perform it now. */ - new_cpu = cc->cc_migration_cpu; - new_ticks = cc->cc_migration_ticks; - new_func = cc->cc_migration_func; - new_arg = cc->cc_migration_arg; - cc_cme_cleanup(cc); + new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; + new_time = cc->cc_exec_entity[direct].ce_migration_time; + new_func = cc->cc_exec_entity[direct].ce_migration_func; + new_arg = cc->cc_exec_entity[direct].ce_migration_arg; + cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed @@ -588,8 +734,9 @@ skip: c->c_flags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); - callout_cc_add(c, new_cc, new_ticks, new_func, new_arg, - new_cpu); + flags = (direct) ? C_DIRECT_EXEC : 0; + callout_cc_add(c, new_cc, new_time, c->c_precision, new_func, + new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else @@ -632,63 +779,29 @@ softclock(void *arg) { struct callout_cpu *cc; struct callout *c; - struct callout_tailq *bucket; - int curticks; - int steps; /* #steps since we last allowed interrupts */ - int depth; - int mpcalls; - int lockcalls; - int gcalls; - -#ifndef MAX_SOFTCLOCK_STEPS -#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ -#endif /* MAX_SOFTCLOCK_STEPS */ - - mpcalls = 0; - lockcalls = 0; - gcalls = 0; - depth = 0; - steps = 0; +#ifdef CALLOUT_PROFILING + int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; +#endif + cc = (struct callout_cpu *)arg; CC_LOCK(cc); - while (cc->cc_softticks - 1 != cc->cc_ticks) { - /* - * cc_softticks may be modified by hard clock, so cache - * it while we work on a given bucket. - */ - curticks = cc->cc_softticks; - cc->cc_softticks++; - bucket = &cc->cc_callwheel[curticks & callwheelmask]; - c = TAILQ_FIRST(bucket); - while (c != NULL) { - depth++; - if (c->c_time != curticks) { - c = TAILQ_NEXT(c, c_links.tqe); - ++steps; - if (steps >= MAX_SOFTCLOCK_STEPS) { - cc->cc_next = c; - /* Give interrupts a chance. 
*/ - CC_UNLOCK(cc); - ; /* nothing */ - CC_LOCK(cc); - c = cc->cc_next; - steps = 0; - } - } else { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(bucket, c, c_links.tqe); - softclock_call_cc(c, cc, &mpcalls, - &lockcalls, &gcalls); - steps = 0; - c = cc->cc_next; - } - } + while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); + softclock_call_cc(c, cc, +#ifdef CALLOUT_PROFILING + &mpcalls, &lockcalls, &gcalls, +#endif + 0); +#ifdef CALLOUT_PROFILING + ++depth; +#endif } +#ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; - cc->cc_next = NULL; +#endif CC_UNLOCK(cc); } @@ -778,28 +891,71 @@ callout_handle_init(struct callout_handle *handle) * callout_deactivate() - marks the callout as having been serviced */ int -callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), - void *arg, int cpu) +callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, + void (*ftn)(void *), void *arg, int cpu, int flags) { + sbintime_t to_sbt, pr; struct callout_cpu *cc; - int cancelled = 0; + int cancelled, direct; + cancelled = 0; + if (flags & C_ABSOLUTE) { + to_sbt = sbt; + } else { + if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) + sbt = tick_sbt; + if ((flags & C_HARDCLOCK) || +#ifdef NO_EVENTTIMERS + sbt >= sbt_timethreshold) { + to_sbt = getsbinuptime(); + + /* Add safety belt for the case of hz > 1000. */ + to_sbt += tc_tick_sbt - tick_sbt; +#else + sbt >= sbt_tickthreshold) { + /* + * Obtain the time of the last hardclock() call on + * this CPU directly from the kern_clocksource.c. + * This value is per-CPU, but it is equal for all + * active ones. + */ +#ifdef __LP64__ + to_sbt = DPCPU_GET(hardclocktime); +#else + spinlock_enter(); + to_sbt = DPCPU_GET(hardclocktime); + spinlock_exit(); +#endif +#endif + if ((flags & C_HARDCLOCK) == 0) + to_sbt += tick_sbt; + } else + to_sbt = sbinuptime(); + to_sbt += sbt; + pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : + sbt >> C_PRELGET(flags)); + if (pr > precision) + precision = pr; + } /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; + direct = (c->c_flags & CALLOUT_DIRECT) != 0; + KASSERT(!direct || c->c_lock == NULL, + ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); - if (cc->cc_curr == c) { + if (cc->cc_exec_entity[direct].cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ - if (c->c_lock != NULL && !cc->cc_cancel) - cancelled = cc->cc_cancel = 1; - if (cc->cc_waiting) { + if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel) + cancelled = cc->cc_exec_entity[direct].cc_cancel = TRUE; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. 
@@ -812,12 +968,12 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), } } if (c->c_flags & CALLOUT_PENDING) { - if (cc->cc_next == c) { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - } - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); - + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); + LIST_REMOVE(c, c_links.le); + } else + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } @@ -829,15 +985,17 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), * to a more appropriate moment. */ if (c->c_cpu != cpu) { - if (cc->cc_curr == c) { - cc->cc_migration_cpu = cpu; - cc->cc_migration_ticks = to_ticks; - cc->cc_migration_func = ftn; - cc->cc_migration_arg = arg; + if (cc->cc_exec_entity[direct].cc_curr == c) { + cc->cc_exec_entity[direct].ce_migration_cpu = cpu; + cc->cc_exec_entity[direct].ce_migration_time + = to_sbt; + cc->cc_exec_entity[direct].ce_migration_func = ftn; + cc->cc_exec_entity[direct].ce_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; - CTR5(KTR_CALLOUT, - "migration of %p func %p arg %p in %d to %u deferred", - c, c->c_func, c->c_arg, to_ticks, cpu); + CTR6(KTR_CALLOUT, + "migration of %p func %p arg %p in %d.%08x to %u deferred", + c, c->c_func, c->c_arg, (int)(to_sbt >> 32), + (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } @@ -845,9 +1003,10 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), } #endif - callout_cc_add(c, cc, to_ticks, ftn, arg, cpu); - CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d", - cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks); + callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); + CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", + cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), + (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); @@ -875,7 +1034,7 @@ _callout_stop_safe(c, safe) { struct callout_cpu *cc, *old_cc; struct lock_class *class; - int use_lock, sq_locked; + int direct, sq_locked, use_lock; /* * Some old subsystems don't hold Giant while running a callout_stop(), @@ -891,7 +1050,7 @@ _callout_stop_safe(c, safe) } } else use_lock = 0; - + direct = (c->c_flags & CALLOUT_DIRECT) != 0; sq_locked = 0; old_cc = NULL; again: @@ -905,7 +1064,7 @@ again: if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); - sleepq_release(&old_cc->cc_waiting); + sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); sq_locked = 0; old_cc = NULL; goto again; @@ -926,12 +1085,13 @@ again: * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ - if (cc->cc_curr != c) { + if (cc->cc_exec_entity[direct].cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release( + &cc->cc_exec_entity[direct].cc_waiting); return (0); } @@ -942,8 +1102,7 @@ again: * just wait for the current invocation to * finish. 
*/ - while (cc->cc_curr == c) { - + while (cc->cc_exec_entity[direct].cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid @@ -963,7 +1122,8 @@ again: */ if (!sq_locked) { CC_UNLOCK(cc); - sleepq_lock(&cc->cc_waiting); + sleepq_lock( + &cc->cc_exec_entity[direct].cc_waiting); sq_locked = 1; old_cc = cc; goto again; @@ -975,13 +1135,16 @@ again: * will be packed up, just let softclock() * take care of it. */ - cc->cc_waiting = 1; + cc->cc_exec_entity[direct].cc_waiting = TRUE; DROP_GIANT(); CC_UNLOCK(cc); - sleepq_add(&cc->cc_waiting, + sleepq_add( + &cc->cc_exec_entity[direct].cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); - sleepq_wait(&cc->cc_waiting, 0); + sleepq_wait( + &cc->cc_exec_entity[direct].cc_waiting, + 0); sq_locked = 0; old_cc = NULL; @@ -989,7 +1152,8 @@ again: PICKUP_GIANT(); CC_LOCK(cc); } - } else if (use_lock && !cc->cc_cancel) { + } else if (use_lock && + !cc->cc_exec_entity[direct].cc_cancel) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout @@ -997,10 +1161,10 @@ again: * lock, the callout will be skipped in * softclock(). */ - cc->cc_cancel = 1; + cc->cc_exec_entity[direct].cc_cancel = TRUE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - KASSERT(!cc_cme_migrating(cc), + KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); @@ -1019,16 +1183,18 @@ again: return (0); } if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - if (cc->cc_next == c) - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); + LIST_REMOVE(c, c_links.le); + } else + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); callout_cc_del(c, cc); CC_UNLOCK(cc); @@ -1135,3 +1301,121 @@ adjust_timeout_calltodo(time_change) return; } #endif /* APM_FIXUP_CALLTODO */ + +static int +flssbt(sbintime_t sbt) +{ + + sbt += (uint64_t)sbt >> 1; + if (sizeof(long) >= sizeof(sbintime_t)) + return (flsl(sbt)); + if (sbt >= SBT_1S) + return (flsl(((uint64_t)sbt) >> 32) + 32); + return (flsl(sbt)); +} + +/* + * Dump immediate statistic snapshot of the scheduled callouts. 
+ */ +static int +sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) +{ + struct callout *tmp; + struct callout_cpu *cc; + struct callout_list *sc; + sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; + int ct[64], cpr[64], ccpbk[32]; + int error, val, i, count, tcum, pcum, maxc, c, medc; +#ifdef SMP + int cpu; +#endif + + val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + count = maxc = 0; + st = spr = maxt = maxpr = 0; + bzero(ccpbk, sizeof(ccpbk)); + bzero(ct, sizeof(ct)); + bzero(cpr, sizeof(cpr)); + now = sbinuptime(); +#ifdef SMP + CPU_FOREACH(cpu) { + cc = CC_CPU(cpu); +#else + cc = CC_CPU(timeout_cpu); +#endif + CC_LOCK(cc); + for (i = 0; i < callwheelsize; i++) { + sc = &cc->cc_callwheel[i]; + c = 0; + LIST_FOREACH(tmp, sc, c_links.le) { + c++; + t = tmp->c_time - now; + if (t < 0) + t = 0; + st += t / SBT_1US; + spr += tmp->c_precision / SBT_1US; + if (t > maxt) + maxt = t; + if (tmp->c_precision > maxpr) + maxpr = tmp->c_precision; + ct[flssbt(t)]++; + cpr[flssbt(tmp->c_precision)]++; + } + if (c > maxc) + maxc = c; + ccpbk[fls(c + c / 2)]++; + count += c; + } + CC_UNLOCK(cc); +#ifdef SMP + } +#endif + + for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) + tcum += ct[i]; + medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; + for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) + pcum += cpr[i]; + medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; + for (i = 0, c = 0; i < 32 && c < count / 2; i++) + c += ccpbk[i]; + medc = (i >= 2) ? (1 << (i - 2)) : 0; + + printf("Scheduled callouts statistic snapshot:\n"); + printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", + count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); + printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", + medc, + count / callwheelsize / mp_ncpus, + (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, + maxc); + printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", + medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, + (st / count) / 1000000, (st / count) % 1000000, + maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); + printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", + medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, + (spr / count) / 1000000, (spr / count) % 1000000, + maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); + printf(" Distribution: \tbuckets\t time\t tcum\t" + " prec\t pcum\n"); + for (i = 0, tcum = pcum = 0; i < 64; i++) { + if (ct[i] == 0 && cpr[i] == 0) + continue; + t = (i != 0) ? 
(((sbintime_t)1) << (i - 1)) : 0; + tcum += ct[i]; + pcum += cpr[i]; + printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", + t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, + i - 1 - (32 - CC_HASH_SHIFT), + ct[i], tcum, cpr[i], pcum); + } + return (error); +} +SYSCTL_PROC(_kern, OID_AUTO, callout_stat, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + 0, 0, sysctl_kern_callout_stat, "I", + "Dump immediate statistic snapshot of the scheduled callouts"); diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index f36c769..0a3580b 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -81,8 +81,10 @@ __FBSDID("$FreeBSD$"); static int sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS); -int hz; -int tick; +int hz; /* system clock's frequency */ +int tick; /* usec per tick (1000000 / hz) */ +struct bintime tick_bt; /* bintime per tick (1s / hz) */ +sbintime_t tick_sbt; int maxusers; /* base tunable */ int maxproc; /* maximum # of processes */ int maxprocperuid; /* max # of procs per user */ @@ -221,6 +223,8 @@ init_param1(void) if (hz == -1) hz = vm_guest > VM_GUEST_NO ? HZ_VM : HZ; tick = 1000000 / hz; + tick_sbt = SBT_1S / hz; + tick_bt = sbttobt(tick_sbt); #ifdef VM_SWZONE_SIZE_MAX maxswzone = VM_SWZONE_SIZE_MAX; diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 48444c1..bde7503 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -719,20 +719,24 @@ tcp_timer_active(struct tcpcb *tp, int timer_type) #define ticks_to_msecs(t) (1000*(t) / hz) void -tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) +tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, + struct xtcp_timer *xtimer) { - bzero(xtimer, sizeof(struct xtcp_timer)); + sbintime_t now; + + bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; + now = getsbinuptime(); if (callout_active(&timer->tt_delack)) - xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks); + xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) - xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks); + xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) - xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks); + xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) - xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks); + xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) - xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks); + xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } diff --git a/sys/sys/_callout.h b/sys/sys/_callout.h index b8c3ce9..e186aec 100644 --- a/sys/sys/_callout.h +++ b/sys/sys/_callout.h @@ -42,15 +42,18 @@ struct lock_object; -SLIST_HEAD(callout_list, callout); +LIST_HEAD(callout_list, callout); +SLIST_HEAD(callout_slist, callout); TAILQ_HEAD(callout_tailq, callout); struct callout { union { + LIST_ENTRY(callout) le; SLIST_ENTRY(callout) sle; TAILQ_ENTRY(callout) tqe; } c_links; - int c_time; /* ticks to the event */ + sbintime_t c_time; /* ticks to the event */ + sbintime_t c_precision; /* delta allowed wrt opt */ void *c_arg; /* function argument */ void (*c_func)(void *); /* function to call */ struct lock_object *c_lock; /* lock to handle */ diff --git a/sys/sys/callout.h b/sys/sys/callout.h index 95b9a32..7a4dec9 100644 --- 
a/sys/sys/callout.h
+++ b/sys/sys/callout.h
@@ -47,6 +47,16 @@
 #define	CALLOUT_RETURNUNLOCKED	0x0010 /* handler returns with mtx unlocked */
 #define	CALLOUT_SHAREDLOCK	0x0020 /* callout lock held in shared mode */
 #define	CALLOUT_DFRMIGRATION	0x0040 /* callout in deferred migration mode */
+#define	CALLOUT_PROCESSED	0x0080 /* callout in wheel or processing list? */
+#define	CALLOUT_DIRECT		0x0100 /* allow exec from hw int context */
+
+#define	C_DIRECT_EXEC		0x0001 /* direct execution of callout */
+#define	C_PRELBITS		7
+#define	C_PRELRANGE		((1 << C_PRELBITS) - 1)
+#define	C_PREL(x)		(((x) + 1) << 1)
+#define	C_PRELGET(x)		(int)((((x) >> 1) & C_PRELRANGE) - 1)
+#define	C_HARDCLOCK		0x0100 /* align to hardclock() calls */
+#define	C_ABSOLUTE		0x0200 /* event time is absolute. */
 
 struct callout_handle {
 	struct callout *callout;
@@ -67,7 +77,15 @@ void	_callout_init_lock(struct callout *, struct lock_object *, int);
 	_callout_init_lock((c), ((rw) != NULL) ? &(rw)->lock_object :	\
 	    NULL, (flags))
 #define	callout_pending(c)	((c)->c_flags & CALLOUT_PENDING)
-int	callout_reset_on(struct callout *, int, void (*)(void *), void *, int);
+int	callout_reset_sbt_on(struct callout *, sbintime_t, sbintime_t,
+	    void (*)(void *), void *, int, int);
+#define	callout_reset_sbt(c, sbt, pr, fn, arg, flags)			\
+    callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), (c)->c_cpu, flags)
+#define	callout_reset_sbt_curcpu(c, sbt, pr, fn, arg, flags)		\
+    callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), PCPU_GET(cpuid), flags)
+#define	callout_reset_on(c, to_ticks, fn, arg, cpu)			\
+    callout_reset_sbt_on((c), (tick_sbt * (to_ticks)), 0, (fn), (arg),	\
+        (cpu), C_HARDCLOCK)
 #define	callout_reset(c, on_tick, fn, arg)				\
     callout_reset_on((c), (on_tick), (fn), (arg), (c)->c_cpu)
 #define	callout_reset_curcpu(c, on_tick, fn, arg)			\
@@ -78,9 +96,7 @@ int	callout_schedule_on(struct callout *, int, int);
     callout_schedule_on((c), (on_tick), PCPU_GET(cpuid))
 #define	callout_stop(c)		_callout_stop_safe(c, 0)
 int	_callout_stop_safe(struct callout *, int);
-void	callout_tick(void);
-int	callout_tickstofirst(int limit);
-extern void (*callout_new_inserted)(int cpu, int ticks);
+void	callout_process(sbintime_t now);
 
 #endif
diff --git a/sys/sys/systm.h b/sys/sys/systm.h
index 11a9cf9..85fa1c8 100644
--- a/sys/sys/systm.h
+++ b/sys/sys/systm.h
@@ -269,6 +269,7 @@ void	cpu_startprofclock(void);
 void	cpu_stopprofclock(void);
 sbintime_t	cpu_idleclock(void);
 void	cpu_activeclock(void);
+void	cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt);
 extern int	cpu_can_deep_sleep;
 extern int	cpu_disable_deep_sleep;
diff --git a/sys/sys/time.h b/sys/sys/time.h
index a735ad8..761a123 100644
--- a/sys/sys/time.h
+++ b/sys/sys/time.h
@@ -102,6 +102,21 @@ bintime_mul(struct bintime *bt, u_int x)
 	bt->frac = (p2 << 32) | (p1 & 0xffffffffull);
 }
 
+static __inline void
+bintime_shift(struct bintime *bt, int exp)
+{
+
+	if (exp > 0) {
+		bt->sec <<= exp;
+		bt->sec |= bt->frac >> (64 - exp);
+		bt->frac <<= exp;
+	} else if (exp < 0) {
+		bt->frac >>= -exp;
+		bt->frac |= (uint64_t)bt->sec << (64 + exp);
+		bt->sec >>= -exp;
+	}
+}
+
 #define	bintime_clear(a)	((a)->sec = (a)->frac = 0)
 #define	bintime_isset(a)	((a)->sec || (a)->frac)
 #define	bintime_cmp(a, b, cmp)						\
@@ -357,6 +372,16 @@ extern volatile time_t	time_second;
 extern volatile time_t	time_uptime;
 extern struct bintime boottimebin;
 extern struct timeval boottime;
+extern struct bintime tc_tick_bt;
+extern sbintime_t tc_tick_sbt;
+extern struct bintime tick_bt;
+extern sbintime_t tick_sbt;
+extern int tc_precexp;
+extern int tc_timepercentage;
+extern struct bintime bt_timethreshold;
+extern struct bintime bt_tickthreshold;
+extern sbintime_t sbt_timethreshold;
+extern sbintime_t sbt_tickthreshold;
 
 /*
  * Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
@@ -421,6 +446,25 @@ int	ratecheck(struct timeval *, const struct timeval *);
 void	timevaladd(struct timeval *t1, const struct timeval *t2);
 void	timevalsub(struct timeval *t1, const struct timeval *t2);
 int	tvtohz(struct timeval *tv);
+
+#define	TC_DEFAULTPERC		5
+
+#define	BT2FREQ(bt)							\
+	(((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) /		\
+	    ((bt)->frac >> 1))
+
+#define	SBT2FREQ(sbt)	((SBT_1S + ((sbt) >> 1)) / (sbt))
+
+#define	FREQ2BT(freq, bt)						\
+{									\
+	(bt)->sec = 0;							\
+	(bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1;	\
+}
+
+#define	TIMESEL(sbt, sbt2)						\
+	(((sbt2) >= sbt_timethreshold) ?				\
+	    ((*(sbt) = getsbinuptime()), 1) : ((*(sbt) = sbinuptime()), 0))
+
 #else /* !_KERNEL */
 
 #include <time.h>
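
The notes below are illustrations, not part of the commit. First, a minimal sketch of a consumer of the new KPI; the softc, lock and handler names (example_softc, example_timeout) are hypothetical. The interval is given as an sbintime_t together with an explicit precision tolerance, so the callout code may coalesce the wakeup with neighbouring events instead of programming a dedicated eventtimer interrupt for it.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>

/* Hypothetical consumer, for illustration only. */
struct example_softc {
	struct mtx	sc_mtx;
	struct callout	sc_callout;	/* callout_init_mtx(&sc_callout, &sc_mtx, 0) */
};

static void
example_timeout(void *arg)
{
	struct example_softc *sc = arg;

	/*
	 * The callout was initialized with callout_init_mtx(), so the
	 * handler runs with sc_mtx held.
	 */
	mtx_assert(&sc->sc_mtx, MA_OWNED);

	/* ... periodic work ... */

	/*
	 * Re-arm for ~50 ms and tolerate up to 5 ms of slack.  The
	 * effective precision becomes the larger of this value and the
	 * default sbt >> tc_precexp computed in callout_reset_sbt_on().
	 */
	callout_reset_sbt(&sc->sc_callout, 50 * SBT_1MS, 5 * SBT_1MS,
	    example_timeout, sc, 0);
}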
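
The tick-based wrappers shown above are now thin shims over the same entry point: callout_reset(c, hz / 4, fn, arg) expands through callout_reset_on() into callout_reset_sbt_on(c, tick_sbt * (hz / 4), 0, fn, arg, c->c_cpu, C_HARDCLOCK), i.e. roughly 250 ms aligned to the hardclock() period. A native caller can instead express its tolerance relative to the interval with C_PREL(); a sketch reusing the hypothetical example_softc from the previous note:

static void
example_start(struct example_softc *sc)
{

	mtx_lock(&sc->sc_mtx);
	/*
	 * One-second timeout with a relative tolerance of sbt >> 5
	 * (about 31 ms): C_PREL(5) packs the shift into the flags word
	 * and callout_reset_sbt_on() recovers it with C_PRELGET().
	 */
	callout_reset_sbt(&sc->sc_callout, SBT_1S, 0, example_timeout, sc,
	    C_PREL(5));
	mtx_unlock(&sc->sc_mtx);
}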
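
The KTR traces and the statistics handler print an sbintime_t as "%d.%08x": the upper 32 bits are whole seconds, the lower 32 bits a binary fraction of a second. A standalone sketch of that decomposition; sbintime_t and the SBT_* constants are redeclared locally (assuming only the 32.32 layout) so the snippet compiles outside the kernel:

#include <stdint.h>
#include <stdio.h>

typedef int64_t sbintime_t;		/* 32.32 fixed point, as in sys/time.h */
#define	SBT_1S	((sbintime_t)1 << 32)
#define	SBT_1MS	(SBT_1S / 1000)

int
main(void)
{
	sbintime_t t = 50 * SBT_1MS;
	int sec = (int)(t >> 32);		/* whole seconds */
	uint32_t frac = (uint32_t)t;		/* binary fraction */
	/* Same conversion the stats code uses: frac * 1000000 >> 32. */
	uint64_t usec = (uint64_t)frac * 1000000 >> 32;

	/* Prints "0.049999 s"; truncation in SBT_1S / 1000 costs ~1 us. */
	printf("%d.%06ju s\n", sec, (uintmax_t)usec);
	return (0);
}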
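
_callout_stop_safe() keeps its existing contract, now tracked per execution entity so that a handler dispatched directly from interrupt context is accounted separately from the softclock one: callout_stop() only cancels a pending callout, while callout_drain() (the safe == 1 case) also sleeps until a running handler has returned. The usual teardown order, again with the hypothetical softc:

static void
example_detach(struct example_softc *sc)
{

	mtx_lock(&sc->sc_mtx);
	callout_stop(&sc->sc_callout);	/* cancel it if still pending */
	mtx_unlock(&sc->sc_mtx);
	/*
	 * callout_drain() may sleep on the per-entity cc_waiting channel
	 * shown above, so it must not be called with the callout's own
	 * lock held.
	 */
	callout_drain(&sc->sc_callout);
	mtx_destroy(&sc->sc_mtx);
}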
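
subr_param.c now precomputes tick_sbt = SBT_1S / hz (about 4294967, i.e. one millisecond in 32.32 format, for hz = 1000) and tick_bt alongside the usual tick, and tcp_timer_to_xtimer() reports remaining time by subtracting the current uptime from the absolute c_time rather than diffing tick counts. A sketch of the same arithmetic for a generic callout, reading c_time directly as the TCP code does (example_remaining_ms is hypothetical):

/* Milliseconds until an armed callout fires, as in tcp_timer_to_xtimer(). */
static int
example_remaining_ms(struct callout *c)
{
	sbintime_t now;

	if (!callout_active(c))
		return (0);
	now = getsbinuptime();
	return ((int)((c->c_time - now) / SBT_1MS));
}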
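
The new period/frequency macros in sys/sys/time.h are reciprocals around 2^63, so a rate converts to a period and back without drift for typical HZ-range values. A sketch, assuming the pre-existing bttosbt() bintime-to-sbintime conversion from sys/time.h (not shown in this diff):

static void
example_freq_round_trip(void)
{
	struct bintime bt;
	sbintime_t period;

	FREQ2BT(1000, &bt);		/* 1 kHz -> ~1 ms period as a bintime */
	period = bttosbt(bt);		/* ~SBT_1S / 1000 */
	KASSERT(SBT2FREQ(period) == 1000,
	    ("FREQ2BT/SBT2FREQ round trip failed"));
}

TIMESEL() then compares the requested interval against sbt_timethreshold to decide whether the cheap getsbinuptime() (updated once per tc_tick) is accurate enough, or whether a full sbinuptime() read of the timecounter hardware is warranted.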
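
Finally, the SYSCTL_PROC registered at the end of kern_timeout.c is trigger-on-write: for example, sysctl kern.callout_stat=1 prints the snapshot to the console. The rows are power-of-two histograms: flssbt() scales a value by 1.5 and takes the index of its highest set bit, so an entry due in 50 ms (roughly 2^27.7 in sbintime units) lands in the bucket printed as 0.062500 s in the leftmost time column; the time and prec columns count entries per bucket, tcum and pcum are their running totals, and the 2**n column expresses the same bucket as an exponent relative to the callwheel hash shift.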