Replace TOD clock code with more systematic approach.

Highlights: * Simple model for underlying hardware. * Hardware basis for timekeeping can be changed on the fly. * Only one hardware clock responsible for TOD keeping. * Provides a real nanotime() function. * Time granularity: .232E-18 seconds. * Frequency granularity: .238E-12 s/s * Frequency adjustment is continuous in time. * Less overhead for frequency adjustment. * Improves xntpd performance. Reviewed by: bde, bde, bde
author: phk <phk@FreeBSD.org> 1998-02-20 16:36:17 +0000
committer: phk <phk@FreeBSD.org> 1998-02-20 16:36:17 +0000
commit: 044e1e629691b102791a17fc6db270846b71b01c (patch)
tree: 73eeeac34c8a9254ecd3686eaa03c76379171a5c /sys/kern/kern_clock.c
parent: fba9e5d6630c11aef2f66bef4dd7fbf45179be87 (diff)
download: FreeBSD-src-044e1e629691b102791a17fc6db270846b71b01c.zip
FreeBSD-src-044e1e629691b102791a17fc6db270846b71b01c.tar.gz
1 files changed, 241 insertions, 94 deletions
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index 20b700b..30bb775 100644
--- a/sys/kern/kern_clock.c
+++ b/sys/kern/kern_clock.c
@@ -1,4 +1,5 @@
 /*-
+ * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
@@ -36,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
- * $Id: kern_clock.c,v 1.55 1998/02/06 12:13:22 eivind Exp $
+ * $Id: kern_clock.c,v 1.56 1998/02/15 13:55:06 phk Exp $
  */
 
 #include <sys/param.h>
@@ -55,7 +56,6 @@
 #include <sys/sysctl.h>
 
 #include <machine/cpu.h>
-#define CLOCK_HAIR		/* XXX */
 #include <machine/clock.h>
 #include <machine/limits.h>
 
@@ -70,6 +70,9 @@
 static void initclocks __P((void *dummy));
 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
 
+static void tco_forward __P((void));	/* time-of-day update, called from hardclock() */
+static void tco_setscales __P((struct timecounter *tc));	/* recompute tc's scale_* fields */
+
 /* Some of these don't belong here, but it's easiest to concentrate them. */
 #if defined(SMP) && defined(BETTER_CLOCK)
 long cp_time[CPUSTATES];
@@ -91,55 +94,43 @@ long tk_nin;
 long tk_nout;
 long tk_rawcc;
 
+struct timecounter *timecounter;	/* the timecounter microtime() and nanotime() read from */
+
 /*
  * Clock handling routines.
  *
- * This code is written to operate with two timers that run independently of
- * each other.  The main clock, running hz times per second, is used to keep
- * track of real time.  The second timer handles kernel and user profiling,
- * and does resource use estimation.  If the second timer is programmable,
- * it is randomized to avoid aliasing between the two clocks.  For example,
- * the randomization prevents an adversary from always giving up the cpu
- * just before its quantum expires.  Otherwise, it would never accumulate
- * cpu ticks.  The mean frequency of the second timer is stathz.
+ * This code is written to operate with two timers that run independently
+ * of each other.
  *
- * If no second timer exists, stathz will be zero; in this case we drive
- * profiling and statistics off the main clock.  This WILL NOT be accurate;
- * do not do it unless absolutely necessary.
+ * The main clock, running hz times per second, is used to trigger
+ * interval timers, timeouts and rescheduling as needed.
  *
+ * The second timer handles kernel and user profiling, and does resource
+ * use estimation.  If the second timer is programmable, it is randomized
+ * to avoid aliasing between the two clocks.  For example, the
+ * randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires.  Otherwise, it would never accumulate
+ * cpu ticks.  The mean frequency of the second timer is stathz.
+ * If no second timer exists, stathz will be zero; in this case we
+ * drive profiling and statistics off the main clock.  This WILL NOT
+ * be accurate; do not do it unless absolutely necessary.
  * The statistics clock may (or may not) be run at a higher rate while
- * profiling.  This profile clock runs at profhz.  We require that profhz
- * be an integral multiple of stathz.
+ * profiling.  This profile clock runs at profhz.  We require that
+ * profhz be an integral multiple of stathz.  If the statistics clock
+ * is running fast, it must be divided by the ratio profhz/stathz for
+ * statistics.  (For profiling, every tick counts.)
  *
- * If the statistics clock is running fast, it must be divided by the ratio
- * profhz/stathz for statistics.  (For profiling, every tick counts.)
- */
-
-/*
- * TODO:
- *	allocate more timeout table slots when table overflows.
- */
-
-/*
- * Bump a timeval by a small number of usec's.
+ * Time-of-day is maintained using a "timecounter", which may or may
+ * not be related to the hardware generating the above mentioned
+ * interrupts.
  */
-#define BUMPTIME(t, usec) { \
-	register volatile struct timeval *tp = (t); \
-	register long us; \
- \
-	tp->tv_usec = us = tp->tv_usec + (usec); \
-	if (us >= 1000000) { \
-		tp->tv_usec = us - 1000000; \
-		tp->tv_sec++; \
-	} \
-}
 
 int	stathz;
 int	profhz;
 static int profprocs;
 int	ticks;
 static int psdiv, pscnt;		/* prof => stat divider */
-int psratio;				/* ratio: prof / stat */
+int	psratio;			/* ratio: prof / stat */
 
 volatile struct	timeval time;
 volatile struct	timeval mono_time;
@@ -178,9 +169,6 @@ hardclock(frame)
 	register struct clockframe *frame;
 {
 	register struct proc *p;
-	int time_update;
-	struct timeval newtime = time;
-	long ltemp;
 
 	p = curproc;
 	if (p) {
@@ -208,55 +196,9 @@ hardclock(frame)
 	if (stathz == 0)
 		statclock(frame);
 
-	/*
-	 * Increment the time-of-day.
-	 */
-	ticks++;
+	tco_forward();
 
-	if (timedelta == 0) {
-		time_update = CPU_THISTICKLEN(tick);
-	} else {
-		time_update = CPU_THISTICKLEN(tick) + tickdelta;
-		timedelta -= tickdelta;
-	}
-	BUMPTIME(&mono_time, time_update);
-
-	/*
-	 * Compute the phase adjustment. If the low-order bits
-	 * (time_phase) of the update overflow, bump the high-order bits
-	 * (time_update).
-	 */
-	time_phase += time_adj;
-	if (time_phase <= -FINEUSEC) {
-		ltemp = -time_phase >> SHIFT_SCALE;
-		time_phase += ltemp << SHIFT_SCALE;
-		time_update -= ltemp;
-	}
-	else if (time_phase >= FINEUSEC) {
-		ltemp = time_phase >> SHIFT_SCALE;
-		time_phase -= ltemp << SHIFT_SCALE;
-		time_update += ltemp;
-	}
-
-	newtime.tv_usec += time_update;
-	/*
-	 * On rollover of the second the phase adjustment to be used for
-	 * the next second is calculated. Also, the maximum error is
-	 * increased by the tolerance. If the PPS frequency discipline
-	 * code is present, the phase is increased to compensate for the
-	 * CPU clock oscillator frequency error.
-	 *
-	 * On a 32-bit machine and given parameters in the timex.h
-	 * header file, the maximum phase adjustment is +-512 ms and
-	 * maximum frequency offset is a tad less than) +-512 ppm. On a
-	 * 64-bit machine, you shouldn't need to ask.
-	 */
-	if (newtime.tv_usec >= 1000000) {
-		newtime.tv_usec -= 1000000;
-		newtime.tv_sec++;
-		ntp_update_second(&newtime.tv_sec);
-	}
-	CPU_CLOCKUPDATE(&time, &newtime);
+	ticks++;
 
 	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL)
 		setsoftclock();
@@ -315,6 +257,10 @@ hzto(tv)
 	}
 	if (sec < 0) {
 #ifdef DIAGNOSTIC
+		if (sec == -1 && usec > 0) {
+			sec++;
+			usec -= 1000000;
+		}
 		printf("hzto: negative time difference %ld sec %ld usec\n",
 		       sec, usec);
 #endif
@@ -529,11 +475,212 @@ SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
 	0, 0, sysctl_kern_clockrate, "S,clockinfo","");
 
 void
-nanotime(ts)
-	struct timespec *ts;
+microtime(struct timeval *tv)	/* store the current time-of-day in *tv, microsecond resolution */
+{
+	struct timecounter *tc;
+
+	tc = (struct timecounter *)timecounter;	/* read the global once; tco_forward() and set_timecounter() reassign it */
+	tv->tv_sec = tc->offset_sec;
+	tv->tv_usec = tc->offset_micro;
+	tv->tv_usec += 
+	    ((u_int64_t)tc->get_timedelta(tc) * tc->scale_micro) >> 32;	/* counter delta times a 2^32-scaled per-count factor */
+	if (tv->tv_usec >= 1000000) {	/* carry at most one second */
+		tv->tv_usec -= 1000000;
+		tv->tv_sec++;
+	}
+}
+
+void
+nanotime(struct timespec *tv)	/* store the current time-of-day in *tv, nanosecond resolution */
+{
+	u_int32_t count;
+	u_int64_t delta;
+	struct timecounter *tc;
+
+	tc = (struct timecounter *)timecounter;	/* read the global once; other routines in this file reassign it */
+	tv->tv_sec = tc->offset_sec;
+	count = tc->get_timedelta(tc);
+	delta = tc->offset_nano;	/* nanoseconds live in the upper 32 bits (see the >>= 32 below) */
+	delta += ((u_int64_t)count * tc->scale_nano_f);	/* low 32 bits of the per-count scale */
+	delta += ((u_int64_t)count * tc->scale_nano_i) << 32;	/* high 32 bits of the per-count scale */
+	delta >>= 32;	/* drop the fraction, leaving whole nanoseconds */
+	if (delta >= 1000000000) {	/* carry at most one second */
+		delta -= 1000000000;
+		tv->tv_sec++;
+	}
+	tv->tv_nsec = delta;
+}
+
+static void
+tco_setscales(struct timecounter *tc)	/* recompute tc's scale_* fields from tc->frequency and tc->adjustment */
+{
+	u_int64_t scale;
+
+	scale = 1000000000LL << 32;	/* 10^9 scaled by 2^32 */
+	if (tc->adjustment > 0)
+		scale += (tc->adjustment * 1000LL) << 10;
+	else
+		scale -= (-tc->adjustment * 1000LL) << 10;	/* same magnitude as above, applied in the other direction */
+	/* scale += tc->frequency >> 1; */ /* XXX do we want to round ? */
+	scale /= tc->frequency;	/* per-count value, still scaled by 2^32 */
+	tc->scale_micro = scale / 1000;	/* the same factor divided by 1000, used by microtime() */
+	tc->scale_nano_f = scale & 0xffffffff;	/* low 32 bits of the factor */
+	tc->scale_nano_i = scale >> 32;	/* high 32 bits of the factor */
+}
+
+static u_int
+delta_timecounter(struct timecounter *tc)	/* default get_timedelta method, installed by init_timecounter() */
+{
+	return((tc->get_timecount() - tc->offset_count) & tc->counter_mask);	/* counts since offset_count, masked with counter_mask */
+}
+
+void
+init_timecounter(struct timecounter *tc)	/* set up a timecounter and make it active; writes tc[0], tc[1] and tc[2] */
+{
+	struct timespec ts0, ts1;
+	int i;
+
+	if (!tc->get_timedelta) 
+		tc->get_timedelta = delta_timecounter;	/* fall back to the generic delta method */
+	tc->adjustment = 0;
+	tco_setscales(tc);
+	tc->offset_count = tc->get_timecount();
+	tc[0].tweak = &tc[0];	/* copied into tc[1] and tc[2] below, so all three use tc[0] as tweak */
+	tc[2] = tc[1] = tc[0];
+	tc[1].other = &tc[2];	/* tc[1] and tc[2] point at each other */
+	tc[2].other = &tc[1];
+	if (!timecounter)
+		timecounter = &tc[2];	/* nanotime() below dereferences timecounter */
+	tc = &tc[1];
+
+	/* 
+	 * Figure out the cost of calling this timecounter.
+	 * XXX: The 1:15 ratio is a guess at reality.
+	 */
+	nanotime(&ts0);
+	for (i = 0; i < 16; i ++) 
+		tc->get_timecount();
+	for (i = 0; i < 240; i ++)
+		tc->get_timedelta(tc);
+	nanotime(&ts1);
+	ts1.tv_sec -= ts0.tv_sec;
+	tc->cost = ts1.tv_sec * 1000000000 + ts1.tv_nsec - ts0.tv_nsec;
+	tc->cost >>= 8;	/* 16 + 240 = 256 calls were timed; divide to get the per-call figure */
+	printf("Timecounter \"%s\"  frequency %lu Hz  cost %u ns\n", 
+	    tc->name, tc->frequency, tc->cost);
+
+	/* XXX: For now always start using the counter. */
+	tc->offset_count = tc->get_timecount();
+	nanotime(&ts1);	/* current time according to the timecounter that is active so far */
+	tc->offset_nano = (u_int64_t)ts1.tv_nsec << 32;
+	tc->offset_micro = ts1.tv_nsec / 1000;
+	tc->offset_sec = ts1.tv_sec;
+	timecounter = tc;
+}
+
+void
+set_timecounter(struct timespec *ts)	/* set the time-of-day to *ts */
 {
-	struct timeval tv;
-	microtime(&tv);
-	ts->tv_sec = tv.tv_sec;
-	ts->tv_nsec = tv.tv_usec * 1000;
+	struct timecounter *tc, *tco;
+	int s;
+
+	s = splclock();
+	tc=timecounter->other;	/* build the new state in the copy that is not active */
+	tco = tc->other;	/* saved because the struct copy below overwrites tc->other */
+	*tc = *timecounter;
+	tc->other = tco;
+	tc->offset_sec = ts->tv_sec;
+	tc->offset_nano = (u_int64_t)ts->tv_nsec << 32;
+	tc->offset_micro =  ts->tv_nsec / 1000;
+	tc->offset_count = tc->get_timecount();
+	time.tv_sec = tc->offset_sec;	/* keep the global `time' in step */
+	time.tv_usec = tc->offset_micro;
+	timecounter = tc;	/* make the updated copy the active one */
+	splx(s);
 }
+
+static struct timecounter *
+sync_other_counter(int flag)	/* advance the inactive copy to the current count and return it; nonzero flag skips normalization */
+{
+	struct timecounter *tc, *tco;
+	u_int32_t delta;
+
+	tc = timecounter->other;
+	tco = tc->other;	/* saved because the struct copy below overwrites tc->other */
+	*tc = *timecounter;
+	tc->other = tco;
+	delta = tc->get_timedelta(tc);
+	tc->offset_count += delta;
+	tc->offset_count &= tc->counter_mask;
+	tc->offset_nano += (u_int64_t)delta * tc->scale_nano_f;
+	tc->offset_nano += (u_int64_t)delta * tc->scale_nano_i << 32;
+	if (flag)
+		return (tc);	/* caller normalizes offset_nano and derives offset_micro itself */
+	if (tc->offset_nano >= 1000000000ULL << 32) {	/* >= so exactly one second carries too, as in tco_forward() */
+		tc->offset_sec++;
+		tc->offset_nano -= 1000000000ULL << 32;
+	}
+	tc->offset_micro = (tc->offset_nano / 1000) >> 32;
+	return (tc);
+}
+
+static void
+tco_forward(void)	/* called from hardclock(): advance the timecounter state, mono_time and time */
+{
+	struct timecounter *tc;
+	u_int32_t time_update;
+
+	tc = sync_other_counter(1);	/* flag 1: the second rollover is handled below */
+	time_update = 0;
+
+	if (timedelta) {	/* a time adjustment is still pending: apply tickdelta of it now */
+		time_update += tickdelta;
+		timedelta -= tickdelta;
+	}
+	mono_time.tv_usec += time_update + tick;
+	if (mono_time.tv_usec >= 1000000) {
+		mono_time.tv_usec -= 1000000;
+		mono_time.tv_sec++;
+	}
+	time_update *= 1000;	/* usec -> nsec */
+	tc->offset_nano += (u_int64_t)time_update << 32;
+	if (tc->offset_nano >= 1000000000ULL << 32) {	/* a second has rolled over */
+		tc->offset_nano -= 1000000000ULL << 32;
+		tc->offset_sec++;
+		tc->frequency = tc->tweak->frequency;	/* pick up the value the sysctl handler exposes via tweak */
+		tc->adjustment = tc->tweak->adjustment;	/* XXX remove this ? */
+		ntp_update_second(tc);	/* XXX only needed if xntpd runs */
+		tco_setscales(tc);
+	}
+	/*
+	 * Find the usec from the nsec.  This is just as fast (one 
+	 * multiplication) and prevents skew between the two due
+	 * to rounding errors. (2^32/1000 = 4294967.296)
+	 */
+	tc->offset_micro = (tc->offset_nano / 1000) >> 32;
+	time.tv_usec = tc->offset_micro;
+	time.tv_sec = tc->offset_sec;
+	timecounter = tc;	/* make the updated copy the active one */
+}
+
+static int
+sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS	/* sysctl handler for timecounter->tweak->frequency */
+{
+	return (sysctl_handle_opaque(oidp, &timecounter->tweak->frequency,
+	    sizeof(timecounter->tweak->frequency), req));	/* field is passed as an opaque blob of its own size */
+}
+
+static int
+sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS	/* sysctl handler for timecounter->tweak->adjustment */
+{
+	return (sysctl_handle_opaque(oidp, &timecounter->tweak->adjustment,
+	    sizeof(timecounter->tweak->adjustment), req));	/* field is passed as an opaque blob of its own size */
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");	/* parent node for the two entries below */
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT|CTLFLAG_RW,
+	0, sizeof(u_int) , sysctl_kern_timecounter_frequency, "I", "");	/* read/write integer, served by the handler named here */
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT|CTLFLAG_RW,
+	0, sizeof(int) , sysctl_kern_timecounter_adjustment, "I", "");	/* read/write integer, served by the handler named here */
author	phk <phk@FreeBSD.org>	1998-02-20 16:36:17 +0000
committer	phk <phk@FreeBSD.org>	1998-02-20 16:36:17 +0000
commit	044e1e629691b102791a17fc6db270846b71b01c (patch)
tree	73eeeac34c8a9254ecd3686eaa03c76379171a5c /sys/kern/kern_clock.c
parent	fba9e5d6630c11aef2f66bef4dd7fbf45179be87 (diff)
download	FreeBSD-src-044e1e629691b102791a17fc6db270846b71b01c.zip FreeBSD-src-044e1e629691b102791a17fc6db270846b71b01c.tar.gz