-rw-r--r--   sys/conf/NOTES               |    5
-rw-r--r--   sys/conf/options             |    1
-rw-r--r--   sys/kern/kern_clock.c        |    3
-rw-r--r--   sys/kern/kern_clocksource.c  |  468
-rw-r--r--   sys/kern/kern_tc.c           |   59
-rw-r--r--   sys/kern/kern_timeout.c      |  760
-rw-r--r--   sys/kern/subr_param.c        |    8
-rw-r--r--   sys/netinet/tcp_timer.c      |   18
-rw-r--r--   sys/sys/_callout.h           |    7
-rw-r--r--   sys/sys/callout.h            |   24
-rw-r--r--   sys/sys/systm.h              |    1
-rw-r--r--   sys/sys/time.h               |   44
12 files changed, 893 insertions, 505 deletions
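
For context on how the converted interface is meant to be consumed: the kern_timeout.c hunks below replace callout_reset_on(c, to_ticks, ftn, arg, cpu) with callout_reset_sbt_on(c, sbt, precision, ftn, arg, cpu, flags), taking a 64-bit sbintime_t expiration plus an allowed precision so callout_process() can aggregate nearby events. The fragment that follows is a minimal, hypothetical consumer sketch and is not part of this commit: the foo_* names, the 50 ms / 6 ms values, and the hard-coded CPU are made up for illustration, and the callout_reset_sbt() convenience wrappers from sys/sys/callout.h are not visible in this truncated diff, so the _on variant is called directly.

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/kernel.h>
    #include <sys/lock.h>
    #include <sys/mutex.h>
    #include <sys/time.h>
    #include <sys/callout.h>

    struct foo_softc {
    	struct callout	fs_callout;
    	struct mtx	fs_mtx;
    };

    /*
     * Callout handler.  fs_mtx is held on entry because the callout was
     * initialized with callout_init_mtx().
     */
    static void
    foo_timeout(void *arg)
    {
    	struct foo_softc *sc = arg;

    	/* Re-arm ~50 ms out, allowing ~6 ms of coalescing slack. */
    	callout_reset_sbt_on(&sc->fs_callout, 50 * SBT_1MS, 6 * SBT_1MS,
    	    foo_timeout, sc, 0 /* CPU 0, chosen arbitrarily */, 0);
    }

    static void
    foo_start(struct foo_softc *sc)
    {

    	mtx_init(&sc->fs_mtx, "foo timer", NULL, MTX_DEF);
    	callout_init_mtx(&sc->fs_callout, &sc->fs_mtx, 0);
    	mtx_lock(&sc->fs_mtx);
    	/* Relative timeout (flags == 0); precision lets the callwheel
    	 * coalesce this event with neighbours. */
    	callout_reset_sbt_on(&sc->fs_callout, 50 * SBT_1MS, 6 * SBT_1MS,
    	    foo_timeout, sc, 0, 0);
    	mtx_unlock(&sc->fs_mtx);
    }

Note that, per the KASSERT added in callout_reset_sbt_on(), a callout flagged C_DIRECT_EXEC (run directly from hard interrupt context by callout_process()) must not have a lock attached, which is why this sketch uses an ordinary mutex-backed callout without that flag.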
diff --git a/sys/conf/NOTES b/sys/conf/NOTES index 5d26093..27c3380 100644 --- a/sys/conf/NOTES +++ b/sys/conf/NOTES @@ -259,6 +259,8 @@ options SX_NOINLINE # SMP Debugging Options: # +# CALLOUT_PROFILING enables rudimentary profiling of the callwheel data +# structure used as backend in callout(9). # PREEMPTION allows the threads that are in the kernel to be preempted by # higher priority [interrupt] threads. It helps with interactivity # and allows interrupt threads to run sooner rather than waiting. @@ -297,6 +299,9 @@ options LOCK_PROFILING options MPROF_BUFFERS="1536" options MPROF_HASH_SIZE="1543" +# Profiling for the callout(9) backend. +options CALLOUT_PROFILING + # Profiling for internal hash tables. options SLEEPQUEUE_PROFILING options TURNSTILE_PROFILING diff --git a/sys/conf/options b/sys/conf/options index ab5d153..75d0c97 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -68,6 +68,7 @@ TEXTDUMP_VERBOSE opt_ddb.h ADAPTIVE_LOCKMGRS ALQ AUDIT opt_global.h +CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h COMPAT_43 opt_compat.h diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 55a2bff..4439ccb 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -460,7 +460,7 @@ hardclock_cpu(int usermode) if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); + callout_process(sbinuptime()); } /* @@ -550,7 +550,6 @@ hardclock_cnt(int cnt, int usermode) if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); /* We are in charge to handle this tick duty. */ if (newticks > 0) { /* Dangerous and no need to call these things concurrently. */ diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c index 10732d9..c2bebbe 100644 --- a/sys/kern/kern_clocksource.c +++ b/sys/kern/kern_clocksource.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2010-2012 Alexander Motin <mav@FreeBSD.org> + * Copyright (c) 2010-2013 Alexander Motin <mav@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/bus.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/kdb.h> #include <sys/ktr.h> @@ -63,17 +64,14 @@ int cpu_can_deep_sleep = 0; /* C3 state is available. */ int cpu_disable_deep_sleep = 0; /* Timer dies in C3. */ static void setuptimer(void); -static void loadtimer(struct bintime *now, int first); +static void loadtimer(sbintime_t now, int first); static int doconfigtimer(void); static void configtimer(int start); static int round_freq(struct eventtimer *et, int freq); -static void getnextcpuevent(struct bintime *event, int idle); -static void getnextevent(struct bintime *event); -static int handleevents(struct bintime *now, int fake); -#ifdef SMP -static void cpu_new_callout(int cpu, int ticks); -#endif +static sbintime_t getnextcpuevent(int idle); +static sbintime_t getnextevent(void); +static int handleevents(sbintime_t now, int fake); static struct mtx et_hw_mtx; @@ -94,13 +92,11 @@ static struct mtx et_hw_mtx; } static struct eventtimer *timer = NULL; -static struct bintime timerperiod; /* Timer period for periodic mode. */ -static struct bintime hardperiod; /* hardclock() events period. */ -static struct bintime statperiod; /* statclock() events period. */ -static struct bintime profperiod; /* profclock() events period. 
*/ -static struct bintime nexttick; /* Next global timer tick time. */ -static struct bintime nexthard; /* Next global hardlock() event. */ -static u_int busy = 0; /* Reconfiguration is in progress. */ +static sbintime_t timerperiod; /* Timer period for periodic mode. */ +static sbintime_t statperiod; /* statclock() events period. */ +static sbintime_t profperiod; /* profclock() events period. */ +static sbintime_t nexttick; /* Next global timer tick time. */ +static u_int busy = 1; /* Reconfiguration is in progress. */ static int profiling = 0; /* Profiling events enabled. */ static char timername[32]; /* Wanted timer. */ @@ -116,11 +112,6 @@ TUNABLE_INT("kern.eventtimer.idletick", &idletick); SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RW, &idletick, 0, "Run periodic events when idle"); -static u_int activetick = 1; /* Run all periodic events when active. */ -TUNABLE_INT("kern.eventtimer.activetick", &activetick); -SYSCTL_UINT(_kern_eventtimer, OID_AUTO, activetick, CTLFLAG_RW, &activetick, - 0, "Run all periodic events when active"); - static int periodic = 0; /* Periodic or one-shot mode. */ static int want_periodic = 0; /* What mode to prefer. */ TUNABLE_INT("kern.eventtimer.periodic", &want_periodic); @@ -129,31 +120,23 @@ struct pcpu_state { struct mtx et_hw_mtx; /* Per-CPU timer mutex. */ u_int action; /* Reconfiguration requests. */ u_int handle; /* Immediate handle resuests. */ - struct bintime now; /* Last tick time. */ - struct bintime nextevent; /* Next scheduled event on this CPU. */ - struct bintime nexttick; /* Next timer tick time. */ - struct bintime nexthard; /* Next hardlock() event. */ - struct bintime nextstat; /* Next statclock() event. */ - struct bintime nextprof; /* Next profclock() event. */ + sbintime_t now; /* Last tick time. */ + sbintime_t nextevent; /* Next scheduled event on this CPU. */ + sbintime_t nexttick; /* Next timer tick time. */ + sbintime_t nexthard; /* Next hardlock() event. */ + sbintime_t nextstat; /* Next statclock() event. */ + sbintime_t nextprof; /* Next profclock() event. */ + sbintime_t nextcall; /* Next callout event. */ + sbintime_t nextcallopt; /* Next optional callout event. */ #ifdef KDTRACE_HOOKS - struct bintime nextcyc; /* Next OpenSolaris cyclics event. */ + sbintime_t nextcyc; /* Next OpenSolaris cyclics event. */ #endif int ipi; /* This CPU needs IPI. */ int idle; /* This CPU is in idle mode. */ }; static DPCPU_DEFINE(struct pcpu_state, timerstate); - -#define FREQ2BT(freq, bt) \ -{ \ - (bt)->sec = 0; \ - (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ -} -#define BT2FREQ(bt) \ - (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ - ((bt)->frac >> 1)) - -#define SBT2FREQ(sbt) ((SBT_1S + ((sbt) >> 1)) / (sbt)) +DPCPU_DEFINE(sbintime_t, hardclocktime); /* * Timer broadcast IPI handler. @@ -161,7 +144,7 @@ static DPCPU_DEFINE(struct pcpu_state, timerstate); int hardclockintr(void) { - struct bintime now; + sbintime_t now; struct pcpu_state *state; int done; @@ -169,10 +152,9 @@ hardclockintr(void) return (FILTER_HANDLED); state = DPCPU_PTR(timerstate); now = state->now; - CTR4(KTR_SPARE2, "ipi at %d: now %d.%08x%08x", - curcpu, now.sec, (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); - done = handleevents(&now, 0); + CTR3(KTR_SPARE2, "ipi at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); + done = handleevents(now, 0); return (done ? 
FILTER_HANDLED : FILTER_STRAY); } @@ -180,48 +162,43 @@ hardclockintr(void) * Handle all events for specified time on this CPU */ static int -handleevents(struct bintime *now, int fake) +handleevents(sbintime_t now, int fake) { - struct bintime t; + sbintime_t t, *hct; struct trapframe *frame; struct pcpu_state *state; - uintfptr_t pc; int usermode; int done, runs; - CTR4(KTR_SPARE2, "handle at %d: now %d.%08x%08x", - curcpu, now->sec, (u_int)(now->frac >> 32), - (u_int)(now->frac & 0xffffffff)); + CTR3(KTR_SPARE2, "handle at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = 0; if (fake) { frame = NULL; usermode = 0; - pc = 0; } else { frame = curthread->td_intr_frame; usermode = TRAPF_USERMODE(frame); - pc = TRAPF_PC(frame); } state = DPCPU_PTR(timerstate); runs = 0; - while (bintime_cmp(now, &state->nexthard, >=)) { - bintime_addx(&state->nexthard, hardperiod.frac); + while (now >= state->nexthard) { + state->nexthard += tick_sbt; runs++; } if (runs) { - if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 && - bintime_cmp(&state->nexthard, &nexthard, >)) - nexthard = state->nexthard; + hct = DPCPU_PTR(hardclocktime); + *hct = state->nexthard - tick_sbt; if (fake < 2) { hardclock_cnt(runs, usermode); done = 1; } } runs = 0; - while (bintime_cmp(now, &state->nextstat, >=)) { - bintime_addx(&state->nextstat, statperiod.frac); + while (now >= state->nextstat) { + state->nextstat += statperiod; runs++; } if (runs && fake < 2) { @@ -230,31 +207,29 @@ handleevents(struct bintime *now, int fake) } if (profiling) { runs = 0; - while (bintime_cmp(now, &state->nextprof, >=)) { - bintime_addx(&state->nextprof, profperiod.frac); + while (now >= state->nextprof) { + state->nextprof += profperiod; runs++; } if (runs && !fake) { - profclock_cnt(runs, usermode, pc); + profclock_cnt(runs, usermode, TRAPF_PC(frame)); done = 1; } } else state->nextprof = state->nextstat; + if (now >= state->nextcallopt) { + state->nextcall = state->nextcallopt = INT64_MAX; + callout_process(now); + } #ifdef KDTRACE_HOOKS - if (fake == 0 && cyclic_clock_func != NULL && - state->nextcyc.sec != -1 && - bintime_cmp(now, &state->nextcyc, >=)) { - state->nextcyc.sec = -1; + if (fake == 0 && now >= state->nextcyc && cyclic_clock_func != NULL) { + state->nextcyc = INT64_MAX; (*cyclic_clock_func)(frame); } #endif - getnextcpuevent(&t, 0); - if (fake == 2) { - state->nextevent = t; - return (done); - } + t = getnextcpuevent(0); ET_HW_LOCK(state); if (!busy) { state->idle = 0; @@ -268,84 +243,81 @@ handleevents(struct bintime *now, int fake) /* * Schedule binuptime of the next event on current CPU. */ -static void -getnextcpuevent(struct bintime *event, int idle) +static sbintime_t +getnextcpuevent(int idle) { - struct bintime tmp; + sbintime_t event; struct pcpu_state *state; - int skip; + u_int hardfreq; state = DPCPU_PTR(timerstate); - /* Handle hardclock() events. */ - *event = state->nexthard; - if (idle || (!activetick && !profiling && - (timer->et_flags & ET_FLAGS_PERCPU) == 0)) { - skip = idle ? 4 : (stathz / 2); - if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > skip) - skip = tc_min_ticktock_freq; - skip = callout_tickstofirst(hz / skip) - 1; - CTR2(KTR_SPARE2, "skip at %d: %d", curcpu, skip); - tmp = hardperiod; - bintime_mul(&tmp, skip); - bintime_add(event, &tmp); - } + /* Handle hardclock() events, skipping some if CPU is idle. 
*/ + event = state->nexthard; + if (idle) { + hardfreq = (u_int)hz / 2; + if (tc_min_ticktock_freq > 2 +#ifdef SMP + && curcpu == CPU_FIRST() +#endif + ) + hardfreq = hz / tc_min_ticktock_freq; + if (hardfreq > 1) + event += tick_sbt * (hardfreq - 1); + } + /* Handle callout events. */ + if (event > state->nextcall) + event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ - if (bintime_cmp(event, &state->nextstat, >)) - *event = state->nextstat; - if (profiling && bintime_cmp(event, &state->nextprof, >)) - *event = state->nextprof; + if (event > state->nextstat) + event = state->nextstat; + if (profiling && event > state->nextprof) + event = state->nextprof; } #ifdef KDTRACE_HOOKS - if (state->nextcyc.sec != -1 && bintime_cmp(event, &state->nextcyc, >)) - *event = state->nextcyc; + if (event > state->nextcyc) + event = state->nextcyc; #endif + return (event); } /* * Schedule binuptime of the next event on all CPUs. */ -static void -getnextevent(struct bintime *event) +static sbintime_t +getnextevent(void) { struct pcpu_state *state; + sbintime_t event; #ifdef SMP int cpu; #endif - int c, nonidle; + int c; state = DPCPU_PTR(timerstate); - *event = state->nextevent; - c = curcpu; - nonidle = !state->idle; - if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { + event = state->nextevent; + c = -1; #ifdef SMP - if (smp_started) { - CPU_FOREACH(cpu) { - if (curcpu == cpu) - continue; - state = DPCPU_ID_PTR(cpu, timerstate); - nonidle += !state->idle; - if (bintime_cmp(event, &state->nextevent, >)) { - *event = state->nextevent; - c = cpu; - } + if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { + CPU_FOREACH(cpu) { + state = DPCPU_ID_PTR(cpu, timerstate); + if (event > state->nextevent) { + event = state->nextevent; + c = cpu; } } -#endif - if (nonidle != 0 && bintime_cmp(event, &nexthard, >)) - *event = nexthard; } - CTR5(KTR_SPARE2, "next at %d: next %d.%08x%08x by %d", - curcpu, event->sec, (u_int)(event->frac >> 32), - (u_int)(event->frac & 0xffffffff), c); +#endif + CTR4(KTR_SPARE2, "next at %d: next %d.%08x by %d", + curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c); + return (event); } /* Hardware timer callback function. */ static void timercb(struct eventtimer *et, void *arg) { - struct bintime now; - struct bintime *next; + sbintime_t now; + sbintime_t *next; struct pcpu_state *state; #ifdef SMP int cpu, bcast; @@ -360,16 +332,14 @@ timercb(struct eventtimer *et, void *arg) next = &state->nexttick; } else next = &nexttick; - binuptime(&now); - if (periodic) { - *next = now; - bintime_addx(next, timerperiod.frac); /* Next tick in 1 period. */ - } else - next->sec = -1; /* Next tick is not scheduled yet. */ + now = sbinuptime(); + if (periodic) + *next = now + timerperiod; + else + *next = -1; /* Next tick is not scheduled yet. */ state->now = now; - CTR4(KTR_SPARE2, "intr at %d: now %d.%08x%08x", - curcpu, (int)(now.sec), (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); + CTR3(KTR_SPARE2, "intr at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); #ifdef SMP /* Prepare broadcasting to other CPUs for non-per-CPU timers. 
*/ @@ -379,8 +349,8 @@ timercb(struct eventtimer *et, void *arg) state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); state->now = now; - if (bintime_cmp(&now, &state->nextevent, >=)) { - state->nextevent.sec++; + if (now >= state->nextevent) { + state->nextevent += SBT_1S; if (curcpu != cpu) { state->ipi = 1; bcast = 1; @@ -392,7 +362,7 @@ timercb(struct eventtimer *et, void *arg) #endif /* Handle events for this time on this CPU. */ - handleevents(&now, 0); + handleevents(now, 0); #ifdef SMP /* Broadcast interrupt to other CPUs for non-per-CPU timers. */ @@ -414,11 +384,11 @@ timercb(struct eventtimer *et, void *arg) * Load new value into hardware timer. */ static void -loadtimer(struct bintime *now, int start) +loadtimer(sbintime_t now, int start) { struct pcpu_state *state; - struct bintime new; - struct bintime *next; + sbintime_t new; + sbintime_t *next; uint64_t tmp; int eq; @@ -433,30 +403,24 @@ loadtimer(struct bintime *now, int start) * Try to start all periodic timers aligned * to period to make events synchronous. */ - tmp = ((uint64_t)now->sec << 36) + (now->frac >> 28); - tmp = (tmp % (timerperiod.frac >> 28)) << 28; - new.sec = 0; - new.frac = timerperiod.frac - tmp; - if (new.frac < tmp) /* Left less then passed. */ - bintime_addx(&new, timerperiod.frac); + tmp = now % timerperiod; + new = timerperiod - tmp; + if (new < tmp) /* Left less then passed. */ + new += timerperiod; CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x", - curcpu, now->sec, (u_int)(now->frac >> 32), - new.sec, (u_int)(new.frac >> 32)); - *next = new; - bintime_add(next, now); - et_start(timer, bttosbt(new), bttosbt(timerperiod)); + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff), + (int)(new >> 32), (u_int)(new & 0xffffffff)); + *next = new + now; + et_start(timer, new, timerperiod); } } else { - getnextevent(&new); - eq = bintime_cmp(&new, next, ==); - CTR5(KTR_SPARE2, "load at %d: next %d.%08x%08x eq %d", - curcpu, new.sec, (u_int)(new.frac >> 32), - (u_int)(new.frac & 0xffffffff), - eq); + new = getnextevent(); + eq = (new == *next); + CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d", + curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq); if (!eq) { *next = new; - bintime_sub(&new, now); - et_start(timer, bttosbt(new), 0); + et_start(timer, new - now, 0); } } } @@ -478,7 +442,7 @@ setuptimer(void) while (freq < (profiling ? profhz : stathz)) freq += hz; freq = round_freq(timer, freq); - FREQ2BT(freq, &timerperiod); + timerperiod = SBT_1S / freq; } /* @@ -487,15 +451,15 @@ setuptimer(void) static int doconfigtimer(void) { - struct bintime now; + sbintime_t now; struct pcpu_state *state; state = DPCPU_PTR(timerstate); switch (atomic_load_acq_int(&state->action)) { case 1: - binuptime(&now); + now = sbinuptime(); ET_HW_LOCK(state); - loadtimer(&now, 1); + loadtimer(now, 1); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); @@ -509,8 +473,8 @@ doconfigtimer(void) return (1); } if (atomic_readandclear_int(&state->handle) && !busy) { - binuptime(&now); - handleevents(&now, 0); + now = sbinuptime(); + handleevents(now, 0); return (1); } return (0); @@ -523,40 +487,45 @@ doconfigtimer(void) static void configtimer(int start) { - struct bintime now, next; + sbintime_t now, next; struct pcpu_state *state; int cpu; if (start) { setuptimer(); - binuptime(&now); - } + now = sbinuptime(); + } else + now = 0; critical_enter(); ET_HW_LOCK(DPCPU_PTR(timerstate)); if (start) { /* Initialize time machine parameters. 
*/ - next = now; - bintime_addx(&next, timerperiod.frac); + next = now + timerperiod; if (periodic) nexttick = next; else - nexttick.sec = -1; + nexttick = -1; CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); state->now = now; - state->nextevent = next; + if (!smp_started && cpu != CPU_FIRST()) + state->nextevent = INT64_MAX; + else + state->nextevent = next; if (periodic) state->nexttick = next; else - state->nexttick.sec = -1; + state->nexttick = -1; state->nexthard = next; state->nextstat = next; state->nextprof = next; + state->nextcall = next; + state->nextcallopt = next; hardclock_sync(cpu); } busy = 0; /* Start global timer or per-CPU timer of this CPU. */ - loadtimer(&now, 1); + loadtimer(now, 1); } else { busy = 1; /* Stop global timer or per-CPU timer of this CPU. */ @@ -629,12 +598,11 @@ cpu_initclocks_bsp(void) state = DPCPU_ID_PTR(cpu, timerstate); mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); #ifdef KDTRACE_HOOKS - state->nextcyc.sec = -1; + state->nextcyc = INT64_MAX; #endif + state->nextcall = INT64_MAX; + state->nextcallopt = INT64_MAX; } -#ifdef SMP - callout_new_inserted = cpu_new_callout; -#endif periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) @@ -698,9 +666,10 @@ cpu_initclocks_bsp(void) profhz = round_freq(timer, stathz * 64); } tick = 1000000 / hz; - FREQ2BT(hz, &hardperiod); - FREQ2BT(stathz, &statperiod); - FREQ2BT(profhz, &profperiod); + tick_sbt = SBT_1S / hz; + tick_bt = sbttobt(tick_sbt); + statperiod = SBT_1S / stathz; + profperiod = SBT_1S / profhz; ET_LOCK(); configtimer(1); ET_UNLOCK(); @@ -712,18 +681,22 @@ cpu_initclocks_bsp(void) void cpu_initclocks_ap(void) { - struct bintime now; + sbintime_t now; struct pcpu_state *state; + struct thread *td; state = DPCPU_PTR(timerstate); - binuptime(&now); + now = sbinuptime(); ET_HW_LOCK(state); state->now = now; hardclock_sync(curcpu); - handleevents(&state->now, 2); - if (timer->et_flags & ET_FLAGS_PERCPU) - loadtimer(&now, 1); + spinlock_enter(); ET_HW_UNLOCK(state); + td = curthread; + td->td_intr_nesting_level++; + handleevents(state->now, 2); + td->td_intr_nesting_level--; + spinlock_exit(); } /* @@ -772,7 +745,7 @@ cpu_stopprofclock(void) sbintime_t cpu_idleclock(void) { - struct bintime now, t; + sbintime_t now, t; struct pcpu_state *state; if (idletick || busy || @@ -786,19 +759,17 @@ cpu_idleclock(void) if (periodic) now = state->now; else - binuptime(&now); - CTR4(KTR_SPARE2, "idle at %d: now %d.%08x%08x", - curcpu, now.sec, (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); - getnextcpuevent(&t, 1); + now = sbinuptime(); + CTR3(KTR_SPARE2, "idle at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); + t = getnextcpuevent(1); ET_HW_LOCK(state); state->idle = 1; state->nextevent = t; if (!periodic) - loadtimer(&now, 0); + loadtimer(now, 0); ET_HW_UNLOCK(state); - bintime_sub(&t, &now); - return (MAX(bttosbt(t), 0)); + return (MAX(t - now, 0)); } /* @@ -807,7 +778,7 @@ cpu_idleclock(void) void cpu_activeclock(void) { - struct bintime now; + sbintime_t now; struct pcpu_state *state; struct thread *td; @@ -817,101 +788,98 @@ cpu_activeclock(void) if (periodic) now = state->now; else - binuptime(&now); - CTR4(KTR_SPARE2, "active at %d: now %d.%08x%08x", - curcpu, now.sec, (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); + now = sbinuptime(); + CTR3(KTR_SPARE2, "active at %d: now %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); spinlock_enter(); td = curthread; 
td->td_intr_nesting_level++; - handleevents(&now, 1); + handleevents(now, 1); td->td_intr_nesting_level--; spinlock_exit(); } #ifdef KDTRACE_HOOKS void -clocksource_cyc_set(const struct bintime *t) +clocksource_cyc_set(const struct bintime *bt) { - struct bintime now; + sbintime_t now, t; struct pcpu_state *state; + /* Do not touch anything if somebody reconfiguring timers. */ + if (busy) + return; + t = bttosbt(*bt); state = DPCPU_PTR(timerstate); if (periodic) now = state->now; else - binuptime(&now); + now = sbinuptime(); - CTR4(KTR_SPARE2, "set_cyc at %d: now %d.%08x%08x", - curcpu, now.sec, (u_int)(now.frac >> 32), - (u_int)(now.frac & 0xffffffff)); - CTR4(KTR_SPARE2, "set_cyc at %d: t %d.%08x%08x", - curcpu, t->sec, (u_int)(t->frac >> 32), - (u_int)(t->frac & 0xffffffff)); + CTR5(KTR_SPARE2, "set_cyc at %d: now %d.%08x t %d.%08x", + curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff), + (int)(t >> 32), (u_int)(t & 0xffffffff)); ET_HW_LOCK(state); - if (bintime_cmp(t, &state->nextcyc, ==)) { - ET_HW_UNLOCK(state); - return; - } - state->nextcyc = *t; - if (bintime_cmp(&state->nextcyc, &state->nextevent, >=)) { - ET_HW_UNLOCK(state); - return; - } - state->nextevent = state->nextcyc; + if (t == state->nextcyc) + goto done; + state->nextcyc = t; + if (t >= state->nextevent) + goto done; + state->nextevent = t; if (!periodic) - loadtimer(&now, 0); + loadtimer(now, 0); +done: ET_HW_UNLOCK(state); } #endif -#ifdef SMP -static void -cpu_new_callout(int cpu, int ticks) +void +cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt) { - struct bintime tmp; struct pcpu_state *state; - CTR3(KTR_SPARE2, "new co at %d: on %d in %d", - curcpu, cpu, ticks); + /* Do not touch anything if somebody reconfiguring timers. */ + if (busy) + return; + CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x", + curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff), + (int)(bt >> 32), (u_int)(bt & 0xffffffff)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); - if (state->idle == 0 || busy) { - ET_HW_UNLOCK(state); - return; - } + /* - * If timer is periodic - just update next event time for target CPU. - * If timer is global - there is chance it is already programmed. + * If there is callout time already set earlier -- do nothing. + * This check may appear redundant because we check already in + * callout_process() but this double check guarantees we're safe + * with respect to race conditions between interrupts execution + * and scheduling. */ - if (periodic || (timer->et_flags & ET_FLAGS_PERCPU) == 0) { - tmp = hardperiod; - bintime_mul(&tmp, ticks - 1); - bintime_add(&tmp, &state->nexthard); - if (bintime_cmp(&tmp, &state->nextevent, <)) - state->nextevent = tmp; - if (periodic || - bintime_cmp(&state->nextevent, &nexttick, >=)) { - ET_HW_UNLOCK(state); - return; - } + state->nextcallopt = bt_opt; + if (bt >= state->nextcall) + goto done; + state->nextcall = bt; + /* If there is some other event set earlier -- do nothing. */ + if (bt >= state->nextevent) + goto done; + state->nextevent = bt; + /* If timer is periodic -- there is nothing to reprogram. */ + if (periodic) + goto done; + /* If timer is global or of the current CPU -- reprogram it. */ + if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { + loadtimer(sbinuptime(), 0); +done: + ET_HW_UNLOCK(state); + return; } - /* - * Otherwise we have to wake that CPU up, as we can't get present - * bintime to reprogram global timer from here. If timer is per-CPU, - * we by definition can't do it from here. 
- */ + /* Otherwise make other CPU to reprogram it. */ + state->handle = 1; ET_HW_UNLOCK(state); - if (timer->et_flags & ET_FLAGS_PERCPU) { - state->handle = 1; - ipi_cpu(cpu, IPI_HARDCLOCK); - } else { - if (!cpu_idle_wakeup(cpu)) - ipi_cpu(cpu, IPI_AST); - } -} +#ifdef SMP + ipi_cpu(cpu, IPI_HARDCLOCK); #endif +} /* * Report or change the active event timers hardware. diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index 6e1f486..9fe7ebe 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -22,6 +22,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kernel.h> +#include <sys/limits.h> #ifdef FFCLOCK #include <sys/lock.h> #include <sys/mutex.h> @@ -119,6 +120,21 @@ static int timestepwarnings; SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW, ×tepwarnings, 0, "Log time steps"); +struct bintime bt_timethreshold; +struct bintime bt_tickthreshold; +sbintime_t sbt_timethreshold; +sbintime_t sbt_tickthreshold; +struct bintime tc_tick_bt; +sbintime_t tc_tick_sbt; +int tc_precexp; +int tc_timepercentage = TC_DEFAULTPERC; +TUNABLE_INT("kern.timecounter.alloweddeviation", &tc_timepercentage); +static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, + sysctl_kern_timecounter_adjprecision, "I", + "Allowed time interval deviation in percents"); + static void tc_windup(void); static void cpu_tick_calibrate(int); @@ -1746,10 +1762,47 @@ tc_ticktock(int cnt) tc_windup(); } +static void __inline +tc_adjprecision(void) +{ + int t; + + if (tc_timepercentage > 0) { + t = (99 + tc_timepercentage) / tc_timepercentage; + tc_precexp = fls(t + (t >> 1)) - 1; + FREQ2BT(hz / tc_tick, &bt_timethreshold); + FREQ2BT(hz, &bt_tickthreshold); + bintime_shift(&bt_timethreshold, tc_precexp); + bintime_shift(&bt_tickthreshold, tc_precexp); + } else { + tc_precexp = 31; + bt_timethreshold.sec = INT_MAX; + bt_timethreshold.frac = ~(uint64_t)0; + bt_tickthreshold = bt_timethreshold; + } + sbt_timethreshold = bttosbt(bt_timethreshold); + sbt_tickthreshold = bttosbt(bt_tickthreshold); +} + +static int +sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = tc_timepercentage; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + tc_timepercentage = val; + tc_adjprecision(); + return (0); +} + static void inittimecounter(void *dummy) { u_int p; + int tick_rate; /* * Set the initial timeout to @@ -1763,6 +1816,12 @@ inittimecounter(void *dummy) tc_tick = (hz + 500) / 1000; else tc_tick = 1; + tc_adjprecision(); + FREQ2BT(hz, &tick_bt); + tick_sbt = bttosbt(tick_bt); + tick_rate = hz / tc_tick; + FREQ2BT(tick_rate, &tc_tick_bt); + tc_tick_sbt = bttosbt(tc_tick_bt); p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c index a217db1..0787c01 100644 --- a/sys/kern/kern_timeout.c +++ b/sys/kern/kern_timeout.c @@ -37,7 +37,11 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_callout_profiling.h" #include "opt_kdtrace.h" +#if defined(__arm__) +#include "opt_timer.h" +#endif #include <sys/param.h> #include <sys/systm.h> @@ -59,6 +63,10 @@ __FBSDID("$FreeBSD$"); #include <machine/cpu.h> #endif +#ifndef NO_EVENTTIMERS +DPCPU_DECLARE(sbintime_t, hardclocktime); +#endif + SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE(callout_execute, kernel, , callout_start, 
callout-start); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_start, 0, @@ -67,6 +75,7 @@ SDT_PROBE_DEFINE(callout_execute, kernel, , callout_end, callout-end); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0, "struct callout *"); +#ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); @@ -79,6 +88,19 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); +static int avg_depth_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, + "Average number of direct callouts examined per callout_process call. " + "Units = 1/1000"); +static int avg_lockcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, + &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " + "callout_process call. Units = 1/1000"); +static int avg_mpcalls_dir; +SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, + 0, "Average number of MP direct callouts made per callout_process call. " + "Units = 1/1000"); +#endif /* * TODO: * allocate more timeout table slots when table overflows. @@ -86,58 +108,63 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, u_int callwheelsize, callwheelmask; /* - * The callout cpu migration entity represents informations necessary for - * describing the migrating callout to the new callout cpu. + * The callout cpu exec entities represent informations necessary for + * describing the state of callouts currently running on the CPU and the ones + * necessary for migrating callouts to the new callout cpu. In particular, + * the first entry of the array cc_exec_entity holds informations for callout + * running in SWI thread context, while the second one holds informations + * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ -struct cc_mig_ent { +struct cc_exec { + struct callout *cc_next; + struct callout *cc_curr; #ifdef SMP - void (*ce_migration_func)(void *); - void *ce_migration_arg; - int ce_migration_cpu; - int ce_migration_ticks; + void (*ce_migration_func)(void *); + void *ce_migration_arg; + int ce_migration_cpu; + sbintime_t ce_migration_time; #endif + boolean_t cc_cancel; + boolean_t cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. - * In particular: - * cc_ticks is incremented once per tick in callout_cpu(). - * It tracks the global 'ticks' but in a way that the individual - * threads should not worry about races in the order in which - * hardclock() and hardclock_cpu() run on the various CPUs. - * cc_softclock is advanced in callout_cpu() to point to the - * first entry in cc_callwheel that may need handling. In turn, - * a softclock() is scheduled so it can serve the various entries i - * such that cc_softclock <= i <= cc_ticks . - * XXX maybe cc_softclock and cc_ticks should be volatile ? - * - * cc_ticks is also used in callout_reset_cpu() to determine - * when the callout should be served. 
*/ struct callout_cpu { struct mtx_padalign cc_lock; - struct cc_mig_ent cc_migrating_entity; + struct cc_exec cc_exec_entity[2]; struct callout *cc_callout; - struct callout_tailq *cc_callwheel; - struct callout_list cc_callfree; - struct callout *cc_next; - struct callout *cc_curr; + struct callout_list *cc_callwheel; + struct callout_tailq cc_expireq; + struct callout_slist cc_callfree; + sbintime_t cc_firstevent; + sbintime_t cc_lastscan; void *cc_cookie; - int cc_ticks; - int cc_softticks; - int cc_cancel; - int cc_waiting; - int cc_firsttick; + u_int cc_bucket; }; +#define cc_exec_curr cc_exec_entity[0].cc_curr +#define cc_exec_next cc_exec_entity[0].cc_next +#define cc_exec_cancel cc_exec_entity[0].cc_cancel +#define cc_exec_waiting cc_exec_entity[0].cc_waiting +#define cc_exec_curr_dir cc_exec_entity[1].cc_curr +#define cc_exec_next_dir cc_exec_entity[1].cc_next +#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel +#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting + #ifdef SMP -#define cc_migration_func cc_migrating_entity.ce_migration_func -#define cc_migration_arg cc_migrating_entity.ce_migration_arg -#define cc_migration_cpu cc_migrating_entity.ce_migration_cpu -#define cc_migration_ticks cc_migrating_entity.ce_migration_ticks +#define cc_migration_func cc_exec_entity[0].ce_migration_func +#define cc_migration_arg cc_exec_entity[0].ce_migration_arg +#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu +#define cc_migration_time cc_exec_entity[0].ce_migration_time +#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func +#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg +#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu +#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU @@ -153,39 +180,48 @@ struct callout_cpu cc_cpu; #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; -void (*callout_new_inserted)(int cpu, int ticks) = NULL; + +static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, +#ifdef CALLOUT_PROFILING + int *mpcalls, int *lockcalls, int *gcalls, +#endif + int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is curr_callout. - * If curr_callout is non-NULL, threads waiting in + * cc_curr - If a callout is in progress, it is cc_curr. + * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. - * cc_cancel - Changing to 1 with both callout_lock and c_lock held + * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after - * c_lock is successfully acquired. + * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when - * curr_callout is non-NULL. + * cc_curr is non-NULL. */ /* - * Resets the migration entity tied to a specific callout cpu. + * Resets the execution entity tied to a specific callout cpu. 
*/ static void -cc_cme_cleanup(struct callout_cpu *cc) +cc_cce_cleanup(struct callout_cpu *cc, int direct) { + cc->cc_exec_entity[direct].cc_curr = NULL; + cc->cc_exec_entity[direct].cc_next = NULL; + cc->cc_exec_entity[direct].cc_cancel = FALSE; + cc->cc_exec_entity[direct].cc_waiting = FALSE; #ifdef SMP - cc->cc_migration_cpu = CPUBLOCK; - cc->cc_migration_ticks = 0; - cc->cc_migration_func = NULL; - cc->cc_migration_arg = NULL; + cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; + cc->cc_exec_entity[direct].ce_migration_time = 0; + cc->cc_exec_entity[direct].ce_migration_func = NULL; + cc->cc_exec_entity[direct].ce_migration_arg = NULL; #endif } @@ -193,11 +229,11 @@ cc_cme_cleanup(struct callout_cpu *cc) * Checks if migration is requested by a specific callout cpu. */ static int -cc_cme_migrating(struct callout_cpu *cc) +cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP - return (cc->cc_migration_cpu != CPUBLOCK); + return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); #else return (0); #endif @@ -225,7 +261,7 @@ kern_timeout_callwheel_alloc(caddr_t v) cc->cc_callout = (struct callout *)v; v = (caddr_t)(cc->cc_callout + ncallout); - cc->cc_callwheel = (struct callout_tailq *)v; + cc->cc_callwheel = (struct callout_list *)v; v = (caddr_t)(cc->cc_callwheel + callwheelsize); return(v); } @@ -238,10 +274,12 @@ callout_cpu_init(struct callout_cpu *cc) mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); - for (i = 0; i < callwheelsize; i++) { - TAILQ_INIT(&cc->cc_callwheel[i]); - } - cc_cme_cleanup(cc); + for (i = 0; i < callwheelsize; i++) + LIST_INIT(&cc->cc_callwheel[i]); + TAILQ_INIT(&cc->cc_expireq); + cc->cc_firstevent = INT64_MAX; + for (i = 0; i < 2; i++) + cc_cce_cleanup(cc, i); if (cc->cc_callout == NULL) return; for (i = 0; i < ncallout; i++) { @@ -320,7 +358,7 @@ start_softclock(void *dummy) panic("died while creating standard software ithreads"); cc->cc_callout = NULL; /* Only cpu0 handles timeout(). */ cc->cc_callwheel = malloc( - sizeof(struct callout_tailq) * callwheelsize, M_CALLOUT, + sizeof(struct callout_list) * callwheelsize, M_CALLOUT, M_WAITOK); callout_cpu_init(cc); } @@ -329,64 +367,148 @@ start_softclock(void *dummy) SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); +#define CC_HASH_SHIFT 8 + +static inline u_int +callout_hash(sbintime_t sbt) +{ + + return (sbt >> (32 - CC_HASH_SHIFT)); +} + +static inline u_int +callout_get_bucket(sbintime_t sbt) +{ + + return (callout_hash(sbt) & callwheelmask); +} + void -callout_tick(void) +callout_process(sbintime_t now) { + struct callout *tmp, *tmpn; struct callout_cpu *cc; - int need_softclock; - int bucket; + struct callout_list *sc; + sbintime_t first, last, max, tmp_max; + uint32_t lookahead; + u_int firstb, lastb, nowb; +#ifdef CALLOUT_PROFILING + int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; +#endif - /* - * Process callouts at a very low cpu priority, so we don't keep the - * relatively high clock interrupt priority any longer than necessary. - */ - need_softclock = 0; cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - cc->cc_firsttick = cc->cc_ticks = ticks; - for (; (cc->cc_softticks - cc->cc_ticks) <= 0; cc->cc_softticks++) { - bucket = cc->cc_softticks & callwheelmask; - if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) { - need_softclock = 1; - break; - } + + /* Compute the buckets of the last scan and present times. 
*/ + firstb = callout_hash(cc->cc_lastscan); + cc->cc_lastscan = now; + nowb = callout_hash(now); + + /* Compute the last bucket and minimum time of the bucket after it. */ + if (nowb == firstb) + lookahead = (SBT_1S / 16); + else if (nowb - firstb == 1) + lookahead = (SBT_1S / 8); + else + lookahead = (SBT_1S / 2); + first = last = now; + first += (lookahead / 2); + last += lookahead; + last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); + lastb = callout_hash(last) - 1; + max = last; + + /* + * Check if we wrapped around the entire wheel from the last scan. + * In case, we need to scan entirely the wheel for pending callouts. + */ + if (lastb - firstb >= callwheelsize) { + lastb = firstb + callwheelsize - 1; + if (nowb - firstb >= callwheelsize) + nowb = lastb; } + + /* Iterate callwheel from firstb to nowb and then up to lastb. */ + do { + sc = &cc->cc_callwheel[firstb & callwheelmask]; + tmp = LIST_FIRST(sc); + while (tmp != NULL) { + /* Run the callout if present time within allowed. */ + if (tmp->c_time <= now) { + /* + * Consumer told us the callout may be run + * directly from hardware interrupt context. + */ + if (tmp->c_flags & CALLOUT_DIRECT) { +#ifdef CALLOUT_PROFILING + ++depth_dir; +#endif + cc->cc_exec_next_dir = + LIST_NEXT(tmp, c_links.le); + cc->cc_bucket = firstb & callwheelmask; + LIST_REMOVE(tmp, c_links.le); + softclock_call_cc(tmp, cc, +#ifdef CALLOUT_PROFILING + &mpcalls_dir, &lockcalls_dir, NULL, +#endif + 1); + tmp = cc->cc_exec_next_dir; + } else { + tmpn = LIST_NEXT(tmp, c_links.le); + LIST_REMOVE(tmp, c_links.le); + TAILQ_INSERT_TAIL(&cc->cc_expireq, + tmp, c_links.tqe); + tmp->c_flags |= CALLOUT_PROCESSED; + tmp = tmpn; + } + continue; + } + /* Skip events from distant future. */ + if (tmp->c_time >= max) + goto next; + /* + * Event minimal time is bigger than present maximal + * time, so it cannot be aggregated. + */ + if (tmp->c_time > last) { + lastb = nowb; + goto next; + } + /* Update first and last time, respecting this event. */ + if (tmp->c_time < first) + first = tmp->c_time; + tmp_max = tmp->c_time + tmp->c_precision; + if (tmp_max < last) + last = tmp_max; +next: + tmp = LIST_NEXT(tmp, c_links.le); + } + /* Proceed with the next bucket. */ + firstb++; + /* + * Stop if we looked after present time and found + * some event we can't execute at now. + * Stop if we looked far enough into the future. + */ + } while (((int)(firstb - lastb)) <= 0); + cc->cc_firstevent = last; +#ifndef NO_EVENTTIMERS + cpu_new_callout(curcpu, last, first); +#endif +#ifdef CALLOUT_PROFILING + avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; + avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; + avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; +#endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. 
*/ - if (need_softclock) + if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } -int -callout_tickstofirst(int limit) -{ - struct callout_cpu *cc; - struct callout *c; - struct callout_tailq *sc; - int curticks; - int skip = 1; - - cc = CC_SELF(); - mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); - curticks = cc->cc_ticks; - while( skip < ncallout && skip < limit ) { - sc = &cc->cc_callwheel[ (curticks+skip) & callwheelmask ]; - /* search scanning ticks */ - TAILQ_FOREACH( c, sc, c_links.tqe ){ - if (c->c_time - curticks <= ncallout) - goto out; - } - skip++; - } -out: - cc->cc_firsttick = curticks + skip; - mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); - return (skip); -} - static struct callout_cpu * callout_lock(struct callout *c) { @@ -412,26 +534,41 @@ callout_lock(struct callout *c) } static void -callout_cc_add(struct callout *c, struct callout_cpu *cc, int to_ticks, - void (*func)(void *), void *arg, int cpu) +callout_cc_add(struct callout *c, struct callout_cpu *cc, + sbintime_t sbt, sbintime_t precision, void (*func)(void *), + void *arg, int cpu, int flags) { + int bucket; CC_LOCK_ASSERT(cc); - - if (to_ticks <= 0) - to_ticks = 1; + if (sbt < cc->cc_lastscan) + sbt = cc->cc_lastscan; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + if (flags & C_DIRECT_EXEC) + c->c_flags |= CALLOUT_DIRECT; + c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; - c->c_time = ticks + to_ticks; - TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask], - c, c_links.tqe); - if ((c->c_time - cc->cc_firsttick) < 0 && - callout_new_inserted != NULL) { - cc->cc_firsttick = c->c_time; - (*callout_new_inserted)(cpu, - to_ticks + (ticks - cc->cc_ticks)); + c->c_time = sbt; + c->c_precision = precision; + bucket = callout_get_bucket(c->c_time); + CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", + c, (int)(c->c_precision >> 32), + (u_int)(c->c_precision & 0xffffffff)); + LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); + if (cc->cc_bucket == bucket) + cc->cc_exec_next_dir = c; +#ifndef NO_EVENTTIMERS + /* + * Inform the eventtimers(4) subsystem there's a new callout + * that has been inserted, but only if really required. 
+ */ + sbt = c->c_time + c->c_precision; + if (sbt < cc->cc_firstevent) { + cc->cc_firstevent = sbt; + cpu_new_callout(cpu, sbt, c->c_time); } +#endif } static void @@ -445,8 +582,11 @@ callout_cc_del(struct callout *c, struct callout_cpu *cc) } static void -softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, - int *lockcalls, int *gcalls) +softclock_call_cc(struct callout *c, struct callout_cpu *cc, +#ifdef CALLOUT_PROFILING + int *mpcalls, int *lockcalls, int *gcalls, +#endif + int direct) { void (*c_func)(void *); void *c_arg; @@ -457,12 +597,13 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; - int new_cpu, new_ticks; + int flags, new_cpu; + sbintime_t new_time; #endif -#ifdef DIAGNOSTIC - struct bintime bt1, bt2; +#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) + sbintime_t bt1, bt2; struct timespec ts2; - static uint64_t maxdt = 36893488147419102LL; /* 2 msec */ + static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif @@ -479,8 +620,8 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; - cc->cc_curr = c; - cc->cc_cancel = 0; + cc->cc_exec_entity[direct].cc_curr = c; + cc->cc_exec_entity[direct].cc_cancel = FALSE; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, sharedlock); @@ -488,29 +629,34 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, * The callout may have been cancelled * while we switched locks. */ - if (cc->cc_cancel) { + if (cc->cc_exec_entity[direct].cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ - cc->cc_cancel = 1; - + cc->cc_exec_entity[direct].cc_cancel = TRUE; if (c_lock == &Giant.lock_object) { +#ifdef CALLOUT_PROFILING (*gcalls)++; - CTR3(KTR_CALLOUT, "callout %p func %p arg %p", +#endif + CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { +#ifdef CALLOUT_PROFILING (*lockcalls)++; +#endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { +#ifdef CALLOUT_PROFILING (*mpcalls)++; - CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p", +#endif + CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } #ifdef DIAGNOSTIC - binuptime(&bt1); + sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0); @@ -518,16 +664,16 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0); THREAD_SLEEPING_OK(); #ifdef DIAGNOSTIC - binuptime(&bt2); - bintime_sub(&bt2, &bt1); - if (bt2.frac > maxdt) { - if (lastfunc != c_func || bt2.frac > maxdt * 2) { - bintime2timespec(&bt2, &ts2); + bt2 = sbinuptime(); + bt2 -= bt1; + if (bt2 > maxdt) { + if (lastfunc != c_func || bt2 > maxdt * 2) { + ts2 = sbttots(bt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } - maxdt = bt2.frac; + maxdt = bt2; lastfunc = c_func; } #endif @@ -536,17 +682,17 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, class->lc_unlock(c_lock); skip: CC_LOCK(cc); - KASSERT(cc->cc_curr == c, ("mishandled cc_curr")); - cc->cc_curr = NULL; - if (cc->cc_waiting) { + KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); + cc->cc_exec_entity[direct].cc_curr = NULL; 
+ if (cc->cc_exec_entity[direct].cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ - if (cc_cme_migrating(cc)) { - cc_cme_cleanup(cc); + if (cc_cce_migrating(cc, direct)) { + cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not @@ -554,11 +700,11 @@ skip: */ c->c_flags &= ~CALLOUT_DFRMIGRATION; } - cc->cc_waiting = 0; + cc->cc_exec_entity[direct].cc_waiting = FALSE; CC_UNLOCK(cc); - wakeup(&cc->cc_waiting); + wakeup(&cc->cc_exec_entity[direct].cc_waiting); CC_LOCK(cc); - } else if (cc_cme_migrating(cc)) { + } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP @@ -566,11 +712,11 @@ skip: * If the callout was scheduled for * migration just perform it now. */ - new_cpu = cc->cc_migration_cpu; - new_ticks = cc->cc_migration_ticks; - new_func = cc->cc_migration_func; - new_arg = cc->cc_migration_arg; - cc_cme_cleanup(cc); + new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; + new_time = cc->cc_exec_entity[direct].ce_migration_time; + new_func = cc->cc_exec_entity[direct].ce_migration_func; + new_arg = cc->cc_exec_entity[direct].ce_migration_arg; + cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed @@ -588,8 +734,9 @@ skip: c->c_flags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); - callout_cc_add(c, new_cc, new_ticks, new_func, new_arg, - new_cpu); + flags = (direct) ? C_DIRECT_EXEC : 0; + callout_cc_add(c, new_cc, new_time, c->c_precision, new_func, + new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else @@ -632,63 +779,29 @@ softclock(void *arg) { struct callout_cpu *cc; struct callout *c; - struct callout_tailq *bucket; - int curticks; - int steps; /* #steps since we last allowed interrupts */ - int depth; - int mpcalls; - int lockcalls; - int gcalls; - -#ifndef MAX_SOFTCLOCK_STEPS -#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ -#endif /* MAX_SOFTCLOCK_STEPS */ - - mpcalls = 0; - lockcalls = 0; - gcalls = 0; - depth = 0; - steps = 0; +#ifdef CALLOUT_PROFILING + int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; +#endif + cc = (struct callout_cpu *)arg; CC_LOCK(cc); - while (cc->cc_softticks - 1 != cc->cc_ticks) { - /* - * cc_softticks may be modified by hard clock, so cache - * it while we work on a given bucket. - */ - curticks = cc->cc_softticks; - cc->cc_softticks++; - bucket = &cc->cc_callwheel[curticks & callwheelmask]; - c = TAILQ_FIRST(bucket); - while (c != NULL) { - depth++; - if (c->c_time != curticks) { - c = TAILQ_NEXT(c, c_links.tqe); - ++steps; - if (steps >= MAX_SOFTCLOCK_STEPS) { - cc->cc_next = c; - /* Give interrupts a chance. 
*/ - CC_UNLOCK(cc); - ; /* nothing */ - CC_LOCK(cc); - c = cc->cc_next; - steps = 0; - } - } else { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(bucket, c, c_links.tqe); - softclock_call_cc(c, cc, &mpcalls, - &lockcalls, &gcalls); - steps = 0; - c = cc->cc_next; - } - } + while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); + softclock_call_cc(c, cc, +#ifdef CALLOUT_PROFILING + &mpcalls, &lockcalls, &gcalls, +#endif + 0); +#ifdef CALLOUT_PROFILING + ++depth; +#endif } +#ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; - cc->cc_next = NULL; +#endif CC_UNLOCK(cc); } @@ -778,28 +891,71 @@ callout_handle_init(struct callout_handle *handle) * callout_deactivate() - marks the callout as having been serviced */ int -callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), - void *arg, int cpu) +callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, + void (*ftn)(void *), void *arg, int cpu, int flags) { + sbintime_t to_sbt, pr; struct callout_cpu *cc; - int cancelled = 0; + int cancelled, direct; + cancelled = 0; + if (flags & C_ABSOLUTE) { + to_sbt = sbt; + } else { + if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) + sbt = tick_sbt; + if ((flags & C_HARDCLOCK) || +#ifdef NO_EVENTTIMERS + sbt >= sbt_timethreshold) { + to_sbt = getsbinuptime(); + + /* Add safety belt for the case of hz > 1000. */ + to_sbt += tc_tick_sbt - tick_sbt; +#else + sbt >= sbt_tickthreshold) { + /* + * Obtain the time of the last hardclock() call on + * this CPU directly from the kern_clocksource.c. + * This value is per-CPU, but it is equal for all + * active ones. + */ +#ifdef __LP64__ + to_sbt = DPCPU_GET(hardclocktime); +#else + spinlock_enter(); + to_sbt = DPCPU_GET(hardclocktime); + spinlock_exit(); +#endif +#endif + if ((flags & C_HARDCLOCK) == 0) + to_sbt += tick_sbt; + } else + to_sbt = sbinuptime(); + to_sbt += sbt; + pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : + sbt >> C_PRELGET(flags)); + if (pr > precision) + precision = pr; + } /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; + direct = (c->c_flags & CALLOUT_DIRECT) != 0; + KASSERT(!direct || c->c_lock == NULL, + ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); - if (cc->cc_curr == c) { + if (cc->cc_exec_entity[direct].cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ - if (c->c_lock != NULL && !cc->cc_cancel) - cancelled = cc->cc_cancel = 1; - if (cc->cc_waiting) { + if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel) + cancelled = cc->cc_exec_entity[direct].cc_cancel = TRUE; + if (cc->cc_exec_entity[direct].cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. 
@@ -812,12 +968,12 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), } } if (c->c_flags & CALLOUT_PENDING) { - if (cc->cc_next == c) { - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - } - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); - + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); + LIST_REMOVE(c, c_links.le); + } else + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } @@ -829,15 +985,17 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), * to a more appropriate moment. */ if (c->c_cpu != cpu) { - if (cc->cc_curr == c) { - cc->cc_migration_cpu = cpu; - cc->cc_migration_ticks = to_ticks; - cc->cc_migration_func = ftn; - cc->cc_migration_arg = arg; + if (cc->cc_exec_entity[direct].cc_curr == c) { + cc->cc_exec_entity[direct].ce_migration_cpu = cpu; + cc->cc_exec_entity[direct].ce_migration_time + = to_sbt; + cc->cc_exec_entity[direct].ce_migration_func = ftn; + cc->cc_exec_entity[direct].ce_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; - CTR5(KTR_CALLOUT, - "migration of %p func %p arg %p in %d to %u deferred", - c, c->c_func, c->c_arg, to_ticks, cpu); + CTR6(KTR_CALLOUT, + "migration of %p func %p arg %p in %d.%08x to %u deferred", + c, c->c_func, c->c_arg, (int)(to_sbt >> 32), + (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } @@ -845,9 +1003,10 @@ callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *), } #endif - callout_cc_add(c, cc, to_ticks, ftn, arg, cpu); - CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d", - cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks); + callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); + CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", + cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), + (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); @@ -875,7 +1034,7 @@ _callout_stop_safe(c, safe) { struct callout_cpu *cc, *old_cc; struct lock_class *class; - int use_lock, sq_locked; + int direct, sq_locked, use_lock; /* * Some old subsystems don't hold Giant while running a callout_stop(), @@ -891,7 +1050,7 @@ _callout_stop_safe(c, safe) } } else use_lock = 0; - + direct = (c->c_flags & CALLOUT_DIRECT) != 0; sq_locked = 0; old_cc = NULL; again: @@ -905,7 +1064,7 @@ again: if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); - sleepq_release(&old_cc->cc_waiting); + sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); sq_locked = 0; old_cc = NULL; goto again; @@ -926,12 +1085,13 @@ again: * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ - if (cc->cc_curr != c) { + if (cc->cc_exec_entity[direct].cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release( + &cc->cc_exec_entity[direct].cc_waiting); return (0); } @@ -942,8 +1102,7 @@ again: * just wait for the current invocation to * finish. 
*/ - while (cc->cc_curr == c) { - + while (cc->cc_exec_entity[direct].cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid @@ -963,7 +1122,8 @@ again: */ if (!sq_locked) { CC_UNLOCK(cc); - sleepq_lock(&cc->cc_waiting); + sleepq_lock( + &cc->cc_exec_entity[direct].cc_waiting); sq_locked = 1; old_cc = cc; goto again; @@ -975,13 +1135,16 @@ again: * will be packed up, just let softclock() * take care of it. */ - cc->cc_waiting = 1; + cc->cc_exec_entity[direct].cc_waiting = TRUE; DROP_GIANT(); CC_UNLOCK(cc); - sleepq_add(&cc->cc_waiting, + sleepq_add( + &cc->cc_exec_entity[direct].cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); - sleepq_wait(&cc->cc_waiting, 0); + sleepq_wait( + &cc->cc_exec_entity[direct].cc_waiting, + 0); sq_locked = 0; old_cc = NULL; @@ -989,7 +1152,8 @@ again: PICKUP_GIANT(); CC_LOCK(cc); } - } else if (use_lock && !cc->cc_cancel) { + } else if (use_lock && + !cc->cc_exec_entity[direct].cc_cancel) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout @@ -997,10 +1161,10 @@ again: * lock, the callout will be skipped in * softclock(). */ - cc->cc_cancel = 1; + cc->cc_exec_entity[direct].cc_cancel = TRUE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - KASSERT(!cc_cme_migrating(cc), + KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); @@ -1019,16 +1183,18 @@ again: return (0); } if (sq_locked) - sleepq_release(&cc->cc_waiting); + sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - if (cc->cc_next == c) - cc->cc_next = TAILQ_NEXT(c, c_links.tqe); - TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c, - c_links.tqe); + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); + LIST_REMOVE(c, c_links.le); + } else + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); callout_cc_del(c, cc); CC_UNLOCK(cc); @@ -1135,3 +1301,121 @@ adjust_timeout_calltodo(time_change) return; } #endif /* APM_FIXUP_CALLTODO */ + +static int +flssbt(sbintime_t sbt) +{ + + sbt += (uint64_t)sbt >> 1; + if (sizeof(long) >= sizeof(sbintime_t)) + return (flsl(sbt)); + if (sbt >= SBT_1S) + return (flsl(((uint64_t)sbt) >> 32) + 32); + return (flsl(sbt)); +} + +/* + * Dump immediate statistic snapshot of the scheduled callouts. 
+ */ +static int +sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) +{ + struct callout *tmp; + struct callout_cpu *cc; + struct callout_list *sc; + sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; + int ct[64], cpr[64], ccpbk[32]; + int error, val, i, count, tcum, pcum, maxc, c, medc; +#ifdef SMP + int cpu; +#endif + + val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + count = maxc = 0; + st = spr = maxt = maxpr = 0; + bzero(ccpbk, sizeof(ccpbk)); + bzero(ct, sizeof(ct)); + bzero(cpr, sizeof(cpr)); + now = sbinuptime(); +#ifdef SMP + CPU_FOREACH(cpu) { + cc = CC_CPU(cpu); +#else + cc = CC_CPU(timeout_cpu); +#endif + CC_LOCK(cc); + for (i = 0; i < callwheelsize; i++) { + sc = &cc->cc_callwheel[i]; + c = 0; + LIST_FOREACH(tmp, sc, c_links.le) { + c++; + t = tmp->c_time - now; + if (t < 0) + t = 0; + st += t / SBT_1US; + spr += tmp->c_precision / SBT_1US; + if (t > maxt) + maxt = t; + if (tmp->c_precision > maxpr) + maxpr = tmp->c_precision; + ct[flssbt(t)]++; + cpr[flssbt(tmp->c_precision)]++; + } + if (c > maxc) + maxc = c; + ccpbk[fls(c + c / 2)]++; + count += c; + } + CC_UNLOCK(cc); +#ifdef SMP + } +#endif + + for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) + tcum += ct[i]; + medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; + for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) + pcum += cpr[i]; + medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; + for (i = 0, c = 0; i < 32 && c < count / 2; i++) + c += ccpbk[i]; + medc = (i >= 2) ? (1 << (i - 2)) : 0; + + printf("Scheduled callouts statistic snapshot:\n"); + printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", + count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); + printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", + medc, + count / callwheelsize / mp_ncpus, + (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, + maxc); + printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", + medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, + (st / count) / 1000000, (st / count) % 1000000, + maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); + printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", + medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, + (spr / count) / 1000000, (spr / count) % 1000000, + maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); + printf(" Distribution: \tbuckets\t time\t tcum\t" + " prec\t pcum\n"); + for (i = 0, tcum = pcum = 0; i < 64; i++) { + if (ct[i] == 0 && cpr[i] == 0) + continue; + t = (i != 0) ? 
(((sbintime_t)1) << (i - 1)) : 0; + tcum += ct[i]; + pcum += cpr[i]; + printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", + t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, + i - 1 - (32 - CC_HASH_SHIFT), + ct[i], tcum, cpr[i], pcum); + } + return (error); +} +SYSCTL_PROC(_kern, OID_AUTO, callout_stat, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + 0, 0, sysctl_kern_callout_stat, "I", + "Dump immediate statistic snapshot of the scheduled callouts"); diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index f36c769..0a3580b 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -81,8 +81,10 @@ __FBSDID("$FreeBSD$"); static int sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS); -int hz; -int tick; +int hz; /* system clock's frequency */ +int tick; /* usec per tick (1000000 / hz) */ +struct bintime tick_bt; /* bintime per tick (1s / hz) */ +sbintime_t tick_sbt; int maxusers; /* base tunable */ int maxproc; /* maximum # of processes */ int maxprocperuid; /* max # of procs per user */ @@ -221,6 +223,8 @@ init_param1(void) if (hz == -1) hz = vm_guest > VM_GUEST_NO ? HZ_VM : HZ; tick = 1000000 / hz; + tick_sbt = SBT_1S / hz; + tick_bt = sbttobt(tick_sbt); #ifdef VM_SWZONE_SIZE_MAX maxswzone = VM_SWZONE_SIZE_MAX; diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 48444c1..bde7503 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -719,20 +719,24 @@ tcp_timer_active(struct tcpcb *tp, int timer_type) #define ticks_to_msecs(t) (1000*(t) / hz) void -tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) +tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, + struct xtcp_timer *xtimer) { - bzero(xtimer, sizeof(struct xtcp_timer)); + sbintime_t now; + + bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; + now = getsbinuptime(); if (callout_active(&timer->tt_delack)) - xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks); + xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) - xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks); + xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) - xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks); + xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) - xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks); + xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) - xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks); + xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } diff --git a/sys/sys/_callout.h b/sys/sys/_callout.h index b8c3ce9..e186aec 100644 --- a/sys/sys/_callout.h +++ b/sys/sys/_callout.h @@ -42,15 +42,18 @@ struct lock_object; -SLIST_HEAD(callout_list, callout); +LIST_HEAD(callout_list, callout); +SLIST_HEAD(callout_slist, callout); TAILQ_HEAD(callout_tailq, callout); struct callout { union { + LIST_ENTRY(callout) le; SLIST_ENTRY(callout) sle; TAILQ_ENTRY(callout) tqe; } c_links; - int c_time; /* ticks to the event */ + sbintime_t c_time; /* ticks to the event */ + sbintime_t c_precision; /* delta allowed wrt opt */ void *c_arg; /* function argument */ void (*c_func)(void *); /* function to call */ struct lock_object *c_lock; /* lock to handle */ diff --git a/sys/sys/callout.h b/sys/sys/callout.h index 95b9a32..7a4dec9 100644 --- 
a/sys/sys/callout.h +++ b/sys/sys/callout.h @@ -47,6 +47,16 @@ #define CALLOUT_RETURNUNLOCKED 0x0010 /* handler returns with mtx unlocked */ #define CALLOUT_SHAREDLOCK 0x0020 /* callout lock held in shared mode */ #define CALLOUT_DFRMIGRATION 0x0040 /* callout in deferred migration mode */ +#define CALLOUT_PROCESSED 0x0080 /* callout in wheel or processing list? */ +#define CALLOUT_DIRECT 0x0100 /* allow exec from hw int context */ + +#define C_DIRECT_EXEC 0x0001 /* direct execution of callout */ +#define C_PRELBITS 7 +#define C_PRELRANGE ((1 << C_PRELBITS) - 1) +#define C_PREL(x) (((x) + 1) << 1) +#define C_PRELGET(x) (int)((((x) >> 1) & C_PRELRANGE) - 1) +#define C_HARDCLOCK 0x0100 /* align to hardclock() calls */ +#define C_ABSOLUTE 0x0200 /* event time is absolute. */ struct callout_handle { struct callout *callout; @@ -67,7 +77,15 @@ void _callout_init_lock(struct callout *, struct lock_object *, int); _callout_init_lock((c), ((rw) != NULL) ? &(rw)->lock_object : \ NULL, (flags)) #define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) -int callout_reset_on(struct callout *, int, void (*)(void *), void *, int); +int callout_reset_sbt_on(struct callout *, sbintime_t, sbintime_t, + void (*)(void *), void *, int, int); +#define callout_reset_sbt(c, sbt, pr, fn, arg, flags) \ + callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), (c)->c_cpu, flags) +#define callout_reset_sbt_curcpu(c, sbt, pr, fn, arg, flags) \ + callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), PCPU_GET(cpuid), flags) +#define callout_reset_on(c, to_ticks, fn, arg, cpu) \ + callout_reset_sbt_on((c), (tick_sbt * (to_ticks)), 0, (fn), (arg), \ + (cpu), C_HARDCLOCK) #define callout_reset(c, on_tick, fn, arg) \ callout_reset_on((c), (on_tick), (fn), (arg), (c)->c_cpu) #define callout_reset_curcpu(c, on_tick, fn, arg) \ @@ -78,9 +96,7 @@ int callout_schedule_on(struct callout *, int, int); callout_schedule_on((c), (on_tick), PCPU_GET(cpuid)) #define callout_stop(c) _callout_stop_safe(c, 0) int _callout_stop_safe(struct callout *, int); -void callout_tick(void); -int callout_tickstofirst(int limit); -extern void (*callout_new_inserted)(int cpu, int ticks); +void callout_process(sbintime_t now); #endif diff --git a/sys/sys/systm.h b/sys/sys/systm.h index 11a9cf9..85fa1c8 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -269,6 +269,7 @@ void cpu_startprofclock(void); void cpu_stopprofclock(void); sbintime_t cpu_idleclock(void); void cpu_activeclock(void); +void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt); extern int cpu_can_deep_sleep; extern int cpu_disable_deep_sleep; diff --git a/sys/sys/time.h b/sys/sys/time.h index a735ad8..761a123 100644 --- a/sys/sys/time.h +++ b/sys/sys/time.h @@ -102,6 +102,21 @@ bintime_mul(struct bintime *bt, u_int x) bt->frac = (p2 << 32) | (p1 & 0xffffffffull); } +static __inline void +bintime_shift(struct bintime *bt, int exp) +{ + + if (exp > 0) { + bt->sec <<= exp; + bt->sec |= bt->frac >> (64 - exp); + bt->frac <<= exp; + } else if (exp < 0) { + bt->frac >>= -exp; + bt->frac |= (uint64_t)bt->sec << (64 + exp); + bt->sec >>= -exp; + } +} + #define bintime_clear(a) ((a)->sec = (a)->frac = 0) #define bintime_isset(a) ((a)->sec || (a)->frac) #define bintime_cmp(a, b, cmp) \ @@ -357,6 +372,16 @@ extern volatile time_t time_second; extern volatile time_t time_uptime; extern struct bintime boottimebin; extern struct timeval boottime; +extern struct bintime tc_tick_bt; +extern sbintime_t tc_tick_sbt; +extern struct bintime tick_bt; +extern sbintime_t tick_sbt; +extern int 
tc_precexp; +extern int tc_timepercentage; +extern struct bintime bt_timethreshold; +extern struct bintime bt_tickthreshold; +extern sbintime_t sbt_timethreshold; +extern sbintime_t sbt_tickthreshold; /* * Functions for looking at our clock: [get]{bin,nano,micro}[up]time() @@ -421,6 +446,25 @@ int ratecheck(struct timeval *, const struct timeval *); void timevaladd(struct timeval *t1, const struct timeval *t2); void timevalsub(struct timeval *t1, const struct timeval *t2); int tvtohz(struct timeval *tv); + +#define TC_DEFAULTPERC 5 + +#define BT2FREQ(bt) \ + (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ + ((bt)->frac >> 1)) + +#define SBT2FREQ(sbt) ((SBT_1S + ((sbt) >> 1)) / (sbt)) + +#define FREQ2BT(freq, bt) \ +{ \ + (bt)->sec = 0; \ + (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ +} + +#define TIMESEL(sbt, sbt2) \ + (((sbt2) >= sbt_timethreshold) ? \ + ((*(sbt) = getsbinuptime()), 1) : ((*(sbt) = sbinuptime()), 0)) + #else /* !_KERNEL */ #include <time.h> |
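
The new KPI added in sys/sys/callout.h above takes an sbintime_t interval, an explicit precision, and flags instead of a tick count. Below is a minimal consumer-side sketch, assuming a hypothetical driver softc and callback (foo_softc/foo_tick are not part of the patch); passing pr == 0 together with C_PREL(5) asks the callout code to derive the tolerance as interval >> 5, so nearby events can be aggregated into fewer timer interrupts.

/*
 * Illustrative sketch only, not part of the patch: a hypothetical driver
 * re-arming a periodic 100 ms callout through the new sbintime_t KPI.
 */
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>
#include <sys/time.h>

struct foo_softc {
	struct mtx	foo_mtx;
	struct callout	foo_callout;
};

static void
foo_tick(void *arg)
{
	struct foo_softc *sc = arg;

	mtx_assert(&sc->foo_mtx, MA_OWNED);
	/* ... periodic work ... */

	/*
	 * Fire again 100 ms from now.  pr == 0 plus C_PREL(5) lets the
	 * scheduler use a tolerance of interval >> 5 (~3 ms) so the event
	 * can be coalesced with neighbouring ones.
	 */
	callout_reset_sbt(&sc->foo_callout, 100 * SBT_1MS, 0, foo_tick, sc,
	    C_PREL(5));
}

static void
foo_start(struct foo_softc *sc)
{

	mtx_init(&sc->foo_mtx, "foo", NULL, MTX_DEF);
	callout_init_mtx(&sc->foo_callout, &sc->foo_mtx, 0);
	mtx_lock(&sc->foo_mtx);
	foo_tick(sc);
	mtx_unlock(&sc->foo_mtx);
}

The C_DIRECT_EXEC flag introduced here is deliberately left out of the sketch: it requests execution straight from hardware interrupt context, which is only appropriate for fast, MPSAFE handlers that take no lock or a spin lock.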
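
The patch also relies throughout on sbintime_t being a signed 32.32 fixed-point value: the KTR traces print it as "%d.%08x", and sysctl_kern_callout_stat() converts the fractional part to microseconds with (frac * 1000000) >> 32. The standalone userland sketch below demonstrates that decomposition; SBT_1S is redefined locally to mirror sys/time.h, and the program is an illustration only, not kernel code from the patch.

#include <stdio.h>
#include <stdint.h>

typedef int64_t sbintime_t;		/* 32.32 fixed point, as in sys/time.h */
#define	SBT_1S	((sbintime_t)1 << 32)

int
main(void)
{
	sbintime_t sbt = 5 * SBT_1S / 2;	/* 2.5 s */

	/* Same decomposition as the patch's KTR traces and stat sysctl. */
	printf("%jd.%06jds\n", (intmax_t)(sbt >> 32),
	    (intmax_t)(((sbt & 0xffffffff) * 1000000) >> 32));
	return (0);
}

Run as written, this prints "2.500000s": the upper 32 bits carry whole seconds, and multiplying the lower 32 bits by 1000000 before shifting right by 32 scales the binary fraction to microseconds.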