Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/kern_clocksource.c    8
-rw-r--r--  sys/kern/kern_event.c          2
-rw-r--r--  sys/kern/kern_fork.c           1
-rw-r--r--  sys/kern/kern_kthread.c        1
-rw-r--r--  sys/kern/kern_thr.c            1
-rw-r--r--  sys/kern/kern_thread.c         4
-rw-r--r--  sys/kern/kern_timeout.c      101
-rw-r--r--  sys/kern/subr_sleepqueue.c   111
-rw-r--r--  sys/kern/sys_generic.c         4
9 files changed, 118 insertions, 115 deletions
diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c
index 49ac7bf..fc4ca83a 100644
--- a/sys/kern/kern_clocksource.c
+++ b/sys/kern/kern_clocksource.c
@@ -211,7 +211,7 @@ handleevents(sbintime_t now, int fake)
} else
state->nextprof = state->nextstat;
if (now >= state->nextcallopt) {
- state->nextcall = state->nextcallopt = INT64_MAX;
+ state->nextcall = state->nextcallopt = SBT_MAX;
callout_process(now);
}
@@ -492,7 +492,7 @@ configtimer(int start)
state = DPCPU_ID_PTR(cpu, timerstate);
state->now = now;
if (!smp_started && cpu != CPU_FIRST())
- state->nextevent = INT64_MAX;
+ state->nextevent = SBT_MAX;
else
state->nextevent = next;
if (periodic)
@@ -580,8 +580,8 @@ cpu_initclocks_bsp(void)
CPU_FOREACH(cpu) {
state = DPCPU_ID_PTR(cpu, timerstate);
mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN);
- state->nextcall = INT64_MAX;
- state->nextcallopt = INT64_MAX;
+ state->nextcall = SBT_MAX;
+ state->nextcallopt = SBT_MAX;
}
periodic = want_periodic;
/* Grab requested timer or the best of present. */
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index e4bf40b..fba163f 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -1471,7 +1471,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
rsbt = tstosbt(*tsp);
if (TIMESEL(&asbt, rsbt))
asbt += tc_tick_sbt;
- if (asbt <= INT64_MAX - rsbt)
+ if (asbt <= SBT_MAX - rsbt)
asbt += rsbt;
else
asbt = 0;
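
Throughout this change, INT64_MAX becomes SBT_MAX, the named upper limit of the
sbintime_t fixed-point type; it doubles as the "no event scheduled" sentinel in
kern_clocksource.c and kern_timeout.c. The guard above is written this way
because computing asbt + rsbt first could overflow a signed 64-bit value, which
is undefined behavior in C, so callers test against SBT_MAX - rsbt and fall
back to a special value on overflow (0 here, -1 in the sys_generic.c hunks
below). A minimal userspace sketch of the same test, packaged as a saturating
add (type and macro names assumed to mirror sys/time.h):

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t sbintime_t;     /* 32.32 fixed-point in FreeBSD */
    #define SBT_MAX INT64_MAX       /* sys/time.h spells it 0x7fffffffffffffffLL */

    /* Add two non-negative sbintime_t values without risking overflow. */
    static sbintime_t
    sbt_sat_add(sbintime_t a, sbintime_t b)
    {
        if (a <= SBT_MAX - b)       /* same guard as the hunk above */
            return (a + b);
        return (SBT_MAX);           /* saturate instead of wrapping negative */
    }

    int
    main(void)
    {
        printf("%jd\n", (intmax_t)sbt_sat_add(SBT_MAX - 5, 10)); /* SBT_MAX */
        return (0);
    }
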
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 8fa6bcd..7877fab 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -470,6 +470,7 @@ do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
bzero(&td2->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
td2->td_su = NULL;
+ td2->td_sleeptimo = 0;
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c
index fb46025..4c77f4f 100644
--- a/sys/kern/kern_kthread.c
+++ b/sys/kern/kern_kthread.c
@@ -272,6 +272,7 @@ kthread_add(void (*func)(void *), void *arg, struct proc *p,
bzero(&newtd->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
newtd->td_su = NULL;
+ newtd->td_sleeptimo = 0;
bcopy(&oldtd->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
index b01aecb..74050ff 100644
--- a/sys/kern/kern_thr.c
+++ b/sys/kern/kern_thr.c
@@ -229,6 +229,7 @@ thread_create(struct thread *td, struct rtprio *rtp,
bzero(&newtd->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
newtd->td_su = NULL;
+ newtd->td_sleeptimo = 0;
bcopy(&td->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
newtd->td_proc = td->td_proc;
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 2f8382c..7b04d96 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -319,7 +319,7 @@ thread_reap(void)
/*
* Don't even bother to lock if none at this instant,
- * we really don't care about the next instant..
+ * we really don't care about the next instant.
*/
if (!TAILQ_EMPTY(&zombie_threads)) {
mtx_lock_spin(&zombie_lock);
@@ -383,6 +383,7 @@ thread_free(struct thread *td)
cpu_thread_free(td);
if (td->td_kstack != 0)
vm_thread_dispose(td);
+ callout_drain(&td->td_slpcallout);
uma_zfree(thread_zone, td);
}
@@ -524,6 +525,7 @@ thread_wait(struct proc *p)
td->td_cpuset = NULL;
cpu_thread_clean(td);
crfree(td->td_ucred);
+ callout_drain(&td->td_slpcallout);
thread_reap(); /* check for zombie threads etc. */
}
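
The td_sleeptimo = 0 initializations in kern_fork.c, kern_kthread.c, and
kern_thr.c above give every new or recycled thread a clean "no timeout pending"
state, and the two callout_drain() calls added here ensure a still-running
td_slpcallout handler has finished before the struct thread is freed or reused.
A rough userspace analogy of the drain-before-free rule (hypothetical names; a
plain pthread stands in for the callout, which is a deliberate simplification):

    #include <pthread.h>
    #include <stdlib.h>
    #include <unistd.h>

    struct fake_thread {
        pthread_t slpcallout;       /* stands in for td_slpcallout */
        long sleeptimo;             /* stands in for td_sleeptimo */
    };

    static void *
    timeout_handler(void *arg)
    {
        struct fake_thread *ft = arg;

        usleep(1000);               /* the handler runs "later"... */
        ft->sleeptimo = 0;          /* ...and dereferences the structure */
        return (NULL);
    }

    int
    main(void)
    {
        struct fake_thread *ft = calloc(1, sizeof(*ft));

        ft->sleeptimo = 42;
        pthread_create(&ft->slpcallout, NULL, timeout_handler, ft);
        /* "Drain": wait until the handler can no longer run. */
        pthread_join(ft->slpcallout, NULL);
        free(ft);                   /* safe; no use-after-free is possible */
        return (0);
    }
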
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
index 9aa11ba..7e5aab7 100644
--- a/sys/kern/kern_timeout.c
+++ b/sys/kern/kern_timeout.c
@@ -296,7 +296,7 @@ callout_cpu_init(struct callout_cpu *cc, int cpu)
for (i = 0; i < callwheelsize; i++)
LIST_INIT(&cc->cc_callwheel[i]);
TAILQ_INIT(&cc->cc_expireq);
- cc->cc_firstevent = INT64_MAX;
+ cc->cc_firstevent = SBT_MAX;
for (i = 0; i < 2; i++)
cc_cce_cleanup(cc, i);
snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
@@ -569,8 +569,8 @@ callout_cc_add(struct callout *c, struct callout_cpu *cc,
* Inform the eventtimers(4) subsystem there's a new callout
* that has been inserted, but only if really required.
*/
- if (INT64_MAX - c->c_time < c->c_precision)
- c->c_precision = INT64_MAX - c->c_time;
+ if (SBT_MAX - c->c_time < c->c_precision)
+ c->c_precision = SBT_MAX - c->c_time;
sbt = c->c_time + c->c_precision;
if (sbt < cc->cc_firstevent) {
cc->cc_firstevent = sbt;
@@ -896,6 +896,56 @@ callout_handle_init(struct callout_handle *handle)
handle->callout = NULL;
}
+void
+callout_when(sbintime_t sbt, sbintime_t precision, int flags,
+ sbintime_t *res, sbintime_t *prec_res)
+{
+ sbintime_t to_sbt, to_pr;
+
+ if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) {
+ *res = sbt;
+ *prec_res = precision;
+ return;
+ }
+ if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt)
+ sbt = tick_sbt;
+ if ((flags & C_HARDCLOCK) != 0 ||
+#ifdef NO_EVENTTIMERS
+ sbt >= sbt_timethreshold) {
+ to_sbt = getsbinuptime();
+
+ /* Add safety belt for the case of hz > 1000. */
+ to_sbt += tc_tick_sbt - tick_sbt;
+#else
+ sbt >= sbt_tickthreshold) {
+ /*
+ * Obtain the time of the last hardclock() call on
+ * this CPU directly from the kern_clocksource.c.
+ * This value is per-CPU, but it is equal for all
+ * active ones.
+ */
+#ifdef __LP64__
+ to_sbt = DPCPU_GET(hardclocktime);
+#else
+ spinlock_enter();
+ to_sbt = DPCPU_GET(hardclocktime);
+ spinlock_exit();
+#endif
+#endif
+ if ((flags & C_HARDCLOCK) == 0)
+ to_sbt += tick_sbt;
+ } else
+ to_sbt = sbinuptime();
+ if (SBT_MAX - to_sbt < sbt)
+ to_sbt = SBT_MAX;
+ else
+ to_sbt += sbt;
+ *res = to_sbt;
+ to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
+ sbt >> C_PRELGET(flags));
+ *prec_res = to_pr > precision ? to_pr : precision;
+}
+
/*
* New interface; clients allocate their own callout structures.
*
@@ -913,10 +963,10 @@ callout_handle_init(struct callout_handle *handle)
* callout_deactivate() - marks the callout as having been serviced
*/
int
-callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
+callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec,
void (*ftn)(void *), void *arg, int cpu, int flags)
{
- sbintime_t to_sbt, pr;
+ sbintime_t to_sbt, precision;
struct callout_cpu *cc;
int cancelled, direct;
int ignore_cpu=0;
@@ -929,47 +979,8 @@ callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
/* Invalid CPU spec */
panic("Invalid CPU in callout %d", cpu);
}
- if (flags & C_ABSOLUTE) {
- to_sbt = sbt;
- } else {
- if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
- sbt = tick_sbt;
- if ((flags & C_HARDCLOCK) ||
-#ifdef NO_EVENTTIMERS
- sbt >= sbt_timethreshold) {
- to_sbt = getsbinuptime();
+ callout_when(sbt, prec, flags, &to_sbt, &precision);
- /* Add safety belt for the case of hz > 1000. */
- to_sbt += tc_tick_sbt - tick_sbt;
-#else
- sbt >= sbt_tickthreshold) {
- /*
- * Obtain the time of the last hardclock() call on
- * this CPU directly from the kern_clocksource.c.
- * This value is per-CPU, but it is equal for all
- * active ones.
- */
-#ifdef __LP64__
- to_sbt = DPCPU_GET(hardclocktime);
-#else
- spinlock_enter();
- to_sbt = DPCPU_GET(hardclocktime);
- spinlock_exit();
-#endif
-#endif
- if ((flags & C_HARDCLOCK) == 0)
- to_sbt += tick_sbt;
- } else
- to_sbt = sbinuptime();
- if (INT64_MAX - to_sbt < sbt)
- to_sbt = INT64_MAX;
- else
- to_sbt += sbt;
- pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
- sbt >> C_PRELGET(flags));
- if (pr > precision)
- precision = pr;
- }
/*
* This flag used to be added by callout_cc_add, but the
* first time you call this we could end up with the
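
The new callout_when() factors the relative-to-absolute deadline conversion out
of callout_reset_sbt_on(), so a caller can precompute the deadline and the
effective precision, keep a copy for its own bookkeeping, and hand both back
via the new C_PRECALC flag (as subr_sleepqueue.c does below). A simplified
userspace model of its tail arithmetic, assuming the plain sbinuptime() path
(the C_HARDCLOCK and NO_EVENTTIMERS branches only change where "now" comes
from, and tc_precexp is the shift used when the caller encodes no precision):

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t sbintime_t;
    #define SBT_MAX INT64_MAX

    static sbintime_t
    now(void)                       /* stub for sbinuptime() */
    {
        return ((sbintime_t)1000 << 32);
    }

    static void
    when(sbintime_t sbt, sbintime_t precision, int prel_exp,
        sbintime_t *res, sbintime_t *prec_res)
    {
        sbintime_t to_sbt, to_pr;

        to_sbt = now();
        if (SBT_MAX - to_sbt < sbt)     /* saturate the absolute deadline */
            to_sbt = SBT_MAX;
        else
            to_sbt += sbt;
        *res = to_sbt;
        /* Effective precision: the larger of the caller's explicit value
         * and a 2^-prel_exp fraction of the timeout length. */
        to_pr = sbt >> prel_exp;
        *prec_res = to_pr > precision ? to_pr : precision;
    }

    int
    main(void)
    {
        sbintime_t res, prec;

        when((sbintime_t)2 << 32, 0, 5, &res, &prec);   /* 2 s, ~3% slop */
        printf("deadline=%jd precision=%jd\n", (intmax_t)res, (intmax_t)prec);
        return (0);
    }
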
diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c
index c490460..4941b47 100644
--- a/sys/kern/subr_sleepqueue.c
+++ b/sys/kern/subr_sleepqueue.c
@@ -361,6 +361,7 @@ sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
{
struct sleepqueue_chain *sc;
struct thread *td;
+ sbintime_t pr1;
td = curthread;
sc = SC_LOOKUP(wchan);
@@ -368,8 +369,14 @@ sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
MPASS(TD_ON_SLEEPQ(td));
MPASS(td->td_sleepqueue == NULL);
MPASS(wchan != NULL);
- callout_reset_sbt_on(&td->td_slpcallout, sbt, pr,
- sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC);
+ KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx",
+ td->td_tid, td, (uintmax_t)td->td_sleeptimo));
+ thread_lock(td);
+ callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1);
+ thread_unlock(td);
+ callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1,
+ sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC |
+ C_DIRECT_EXEC);
}
/*
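
sleepq_set_timeout_sbt() now precomputes the absolute deadline with
callout_when(), publishes it in td_sleeptimo under the thread lock, and only
then arms the callout with C_PRECALC, so the handler and the sleeping thread
agree on one deadline value; the KASSERT records the invariant that a thread
never has two sleep timeouts armed at once. A userspace sketch of the
publish-then-arm ordering (hypothetical names; a pthread mutex stands in for
the thread lock):

    #include <assert.h>
    #include <pthread.h>
    #include <stdint.h>

    typedef int64_t sbintime_t;

    struct sleeper {
        pthread_mutex_t lock;       /* stands in for the thread lock */
        sbintime_t sleeptimo;       /* 0 means "no timeout pending" */
    };

    static void
    arm_timer(struct sleeper *s, sbintime_t deadline)
    {
        /* Stub for callout_reset_sbt_on(..., C_PRECALC | C_DIRECT_EXEC). */
        (void)s;
        (void)deadline;
    }

    static void
    set_timeout(struct sleeper *s, sbintime_t abs_deadline)
    {
        pthread_mutex_lock(&s->lock);
        assert(s->sleeptimo == 0);      /* the KASSERT's invariant */
        s->sleeptimo = abs_deadline;    /* publish the deadline first... */
        pthread_mutex_unlock(&s->lock);
        arm_timer(s, abs_deadline);     /* ...then arm with the same value */
    }

    int
    main(void)
    {
        struct sleeper s = { PTHREAD_MUTEX_INITIALIZER, 0 };

        set_timeout(&s, (sbintime_t)5 << 32);
        return (0);
    }
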
@@ -548,37 +555,36 @@ static int
sleepq_check_timeout(void)
{
struct thread *td;
+ int res;
td = curthread;
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
- * If TDF_TIMEOUT is set, we timed out.
+ * If TDF_TIMEOUT is set, we timed out. But recheck
+ * td_sleeptimo anyway.
*/
- if (td->td_flags & TDF_TIMEOUT) {
- td->td_flags &= ~TDF_TIMEOUT;
- return (EWOULDBLOCK);
+ res = 0;
+ if (td->td_sleeptimo != 0) {
+ if (td->td_sleeptimo <= sbinuptime())
+ res = EWOULDBLOCK;
+ td->td_sleeptimo = 0;
}
-
- /*
- * If TDF_TIMOFAIL is set, the timeout ran after we had
- * already been woken up.
- */
- if (td->td_flags & TDF_TIMOFAIL)
- td->td_flags &= ~TDF_TIMOFAIL;
-
- /*
- * If callout_stop() fails, then the timeout is running on
- * another CPU, so synchronize with it to avoid having it
- * accidentally wake up a subsequent sleep.
- */
- else if (_callout_stop_safe(&td->td_slpcallout, CS_MIGRBLOCK)
- == 0) {
- td->td_flags |= TDF_TIMEOUT;
- TD_SET_SLEEPING(td);
- mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
- }
- return (0);
+ if (td->td_flags & TDF_TIMEOUT)
+ td->td_flags &= ~TDF_TIMEOUT;
+ else
+ /*
+ * We ignore the situation where timeout subsystem was
+ * unable to stop our callout. The struct thread is
+ * type-stable, the callout will use the correct
+ * memory when running. The checks of the
+ * td_sleeptimo value in this function and in
+ * sleepq_timeout() ensure that the thread does not
+ * get spurious wakeups, even if the callout was reset
+ * or thread reused.
+ */
+ callout_stop(&td->td_slpcallout);
+ return (res);
}
/*
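
The rewritten sleepq_check_timeout() needs neither TDF_TIMOFAIL nor the forced
context switch of the old code: under the thread lock it reports EWOULDBLOCK
only if the published deadline has actually passed, and it unconditionally
consumes td_sleeptimo, which is why a callout_stop() that fails to cancel a
concurrently running handler can simply be ignored. A self-contained sketch of
the check-and-consume step (same hypothetical userspace model as above):

    #include <errno.h>
    #include <pthread.h>
    #include <stdint.h>

    typedef int64_t sbintime_t;

    struct sleeper {
        pthread_mutex_t lock;
        sbintime_t sleeptimo;       /* 0 means "no timeout pending" */
    };

    static int
    check_timeout(struct sleeper *s, sbintime_t now)
    {
        int res = 0;

        pthread_mutex_lock(&s->lock);
        if (s->sleeptimo != 0) {
            if (s->sleeptimo <= now)
                res = EWOULDBLOCK;  /* the deadline really passed */
            s->sleeptimo = 0;       /* consume: late handlers become no-ops */
        }
        pthread_mutex_unlock(&s->lock);
        return (res);
    }

    int
    main(void)
    {
        struct sleeper s = { PTHREAD_MUTEX_INITIALIZER, (sbintime_t)1 << 32 };

        /* Woken at t = 2 s with a 1 s deadline: report the timeout. */
        return (check_timeout(&s, (sbintime_t)2 << 32) == EWOULDBLOCK ? 0 : 1);
    }
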
@@ -887,12 +893,17 @@ sleepq_timeout(void *arg)
CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
(void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
- /*
- * First, see if the thread is asleep and get the wait channel if
- * it is.
- */
thread_lock(td);
- if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
+
+ if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) {
+ /*
+ * The thread does not want a timeout (yet).
+ */
+ } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
+ /*
+ * See if the thread is asleep and get the wait
+ * channel if it is.
+ */
wchan = td->td_wchan;
sc = SC_LOOKUP(wchan);
THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
@@ -900,40 +911,16 @@ sleepq_timeout(void *arg)
MPASS(sq != NULL);
td->td_flags |= TDF_TIMEOUT;
wakeup_swapper = sleepq_resume_thread(sq, td, 0);
- thread_unlock(td);
- if (wakeup_swapper)
- kick_proc0();
- return;
- }
-
- /*
- * If the thread is on the SLEEPQ but isn't sleeping yet, it
- * can either be on another CPU in between sleepq_add() and
- * one of the sleepq_*wait*() routines or it can be in
- * sleepq_catch_signals().
- */
- if (TD_ON_SLEEPQ(td)) {
+ } else if (TD_ON_SLEEPQ(td)) {
+ /*
+ * If the thread is on the SLEEPQ but isn't sleeping
+ * yet, it can either be on another CPU in between
+ * sleepq_add() and one of the sleepq_*wait*()
+ * routines or it can be in sleepq_catch_signals().
+ */
td->td_flags |= TDF_TIMEOUT;
- thread_unlock(td);
- return;
}
- /*
- * Now check for the edge cases. First, if TDF_TIMEOUT is set,
- * then the other thread has already yielded to us, so clear
- * the flag and resume it. If TDF_TIMEOUT is not set, then the
- * we know that the other thread is not on a sleep queue, but it
- * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL
- * to let it know that the timeout has already run and doesn't
- * need to be canceled.
- */
- if (td->td_flags & TDF_TIMEOUT) {
- MPASS(TD_IS_SLEEPING(td));
- td->td_flags &= ~TDF_TIMEOUT;
- TD_CLR_SLEEPING(td);
- wakeup_swapper = setrunnable(td);
- } else
- td->td_flags |= TDF_TIMOFAIL;
thread_unlock(td);
if (wakeup_swapper)
kick_proc0();
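
On the handler side, sleepq_timeout() now opens with a single guard: if
td_sleeptimo is zero (already consumed, or never armed) or still in the future
(the callout was reset to a later deadline, so this firing is stale), the
handler does nothing. Together with the check-and-consume step above, this is
what lets the TDF_TIMOFAIL handshake disappear. A matching sketch of the guard
(same hypothetical model, repeated so the fragment compiles on its own):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t sbintime_t;

    struct sleeper {
        pthread_mutex_t lock;
        sbintime_t sleeptimo;       /* 0 means "no timeout pending" */
    };

    static void
    timeout_handler(struct sleeper *s, sbintime_t now)
    {
        pthread_mutex_lock(&s->lock);
        if (s->sleeptimo == 0 || s->sleeptimo > now) {
            /* Stale, consumed, or premature firing: the sleeper does not
             * want a timeout (yet), so fall through harmlessly. */
        } else {
            /* Genuine expiry: in the kernel this is where TDF_TIMEOUT is
             * set and the thread is resumed off its sleep queue. */
            printf("timeout delivered\n");
        }
        pthread_mutex_unlock(&s->lock);
    }

    int
    main(void)
    {
        struct sleeper s = { PTHREAD_MUTEX_INITIALIZER, (sbintime_t)1 << 32 };

        timeout_handler(&s, (sbintime_t)2 << 32);   /* expired: delivered */
        s.sleeptimo = 0;                            /* sleeper consumed it */
        timeout_handler(&s, (sbintime_t)2 << 32);   /* stale: ignored */
        return (0);
    }
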
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 033bbf0..96ec1d1 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -1089,7 +1089,7 @@ kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
precision >>= tc_precexp;
if (TIMESEL(&asbt, rsbt))
asbt += tc_tick_sbt;
- if (asbt <= INT64_MAX - rsbt)
+ if (asbt <= SBT_MAX - rsbt)
asbt += rsbt;
else
asbt = -1;
@@ -1626,7 +1626,7 @@ selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
precision >>= tc_precexp;
if (TIMESEL(&asbt, rsbt))
asbt += tc_tick_sbt;
- if (asbt <= INT64_MAX - rsbt)
+ if (asbt <= SBT_MAX - rsbt)
asbt += rsbt;
else
asbt = -1;