From 044e1e629691b102791a17fc6db270846b71b01c Mon Sep 17 00:00:00 2001
From: phk
Date: Fri, 20 Feb 1998 16:36:17 +0000
Subject: Replace TOD clock code with more systematic approach.

Highlights:
 * Simple model for underlying hardware.
 * Hardware basis for timekeeping can be changed on the fly.
 * Only one hardware clock responsible for TOD keeping.
 * Provides a real nanotime() function.
 * Time granularity: .232E-18 seconds.
 * Frequency granularity: .238E-12 s/s
 * Frequency adjustment is continuous in time.
 * Less overhead for frequency adjustment.
 * Improves xntpd performance.

Reviewed by: bde, bde, bde
---
 sys/kern/kern_clock.c   | 335 ++++++++++++++++++++++++++++++++++--------------
 sys/kern/kern_ntptime.c |  51 ++++----
 sys/kern/kern_random.c  |  19 +--
 sys/kern/kern_tc.c      | 335 ++++++++++++++++++++++++++++++++++--------------
 sys/kern/kern_time.c    |  13 +-
 5 files changed, 519 insertions(+), 234 deletions(-)

diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index 20b700b..30bb775 100644
--- a/sys/kern/kern_clock.c
+++ b/sys/kern/kern_clock.c
@@ -1,4 +1,5 @@
 /*-
+ * Copyright (c) 1997, 1998 Poul-Henning Kamp
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
@@ -36,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
- * $Id: kern_clock.c,v 1.55 1998/02/06 12:13:22 eivind Exp $
+ * $Id: kern_clock.c,v 1.56 1998/02/15 13:55:06 phk Exp $
  */

 #include
@@ -55,7 +56,6 @@
 #include
 #include

-#define CLOCK_HAIR		/* XXX */
 #include
 #include

@@ -70,6 +70,9 @@
 static void initclocks __P((void *dummy));
 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)

+static void tco_forward __P((void));
+static void tco_setscales __P((struct timecounter *tc));
+
 /* Some of these don't belong here, but it's easiest to concentrate them. */
 #if defined(SMP) && defined(BETTER_CLOCK)
 long cp_time[CPUSTATES];
@@ -91,55 +94,43 @@ long tk_nin;
 long tk_nout;
 long tk_rawcc;

+struct timecounter *timecounter;
+
 /*
  * Clock handling routines.
  *
- * This code is written to operate with two timers that run independently of
- * each other.  The main clock, running hz times per second, is used to keep
- * track of real time.  The second timer handles kernel and user profiling,
- * and does resource use estimation.  If the second timer is programmable,
- * it is randomized to avoid aliasing between the two clocks.  For example,
- * the randomization prevents an adversary from always giving up the cpu
- * just before its quantum expires.  Otherwise, it would never accumulate
- * cpu ticks.  The mean frequency of the second timer is stathz.
+ * This code is written to operate with two timers that run independently
+ * of each other.
  *
- * If no second timer exists, stathz will be zero; in this case we drive
- * profiling and statistics off the main clock.  This WILL NOT be accurate;
- * do not do it unless absolutely necessary.
+ * The main clock, running hz times per second, is used to trigger
+ * interval timers, timeouts and rescheduling as needed.
  *
+ * The second timer handles kernel and user profiling, and does resource
+ * use estimation.  If the second timer is programmable, it is randomized
+ * to avoid aliasing between the two clocks.  For example, the
+ * randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires.  Otherwise, it would never accumulate
+ * cpu ticks.  The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we
+ * drive profiling and statistics off the main clock.  This WILL NOT
+ * be accurate; do not do it unless absolutely necessary.
  *
 * The statistics clock may (or may not) be run at a higher rate while
- * profiling.  This profile clock runs at profhz.  We require that profhz
- * be an integral multiple of stathz.
+ * profiling.  This profile clock runs at profhz.  We require that
+ * profhz be an integral multiple of stathz.  If the statistics clock
+ * is running fast, it must be divided by the ratio profhz/stathz for
+ * statistics.  (For profiling, every tick counts.)
  *
- * If the statistics clock is running fast, it must be divided by the ratio
- * profhz/stathz for statistics.  (For profiling, every tick counts.)
- */
-
-/*
- * TODO:
- *	allocate more timeout table slots when table overflows.
- */
-
-/*
- * Bump a timeval by a small number of usec's.
+ * Time-of-day is maintained using a "timecounter", which may or may
+ * not be related to the hardware generating the above mentioned
+ * interrupts.
  */
-#define BUMPTIME(t, usec) { \
-	register volatile struct timeval *tp = (t); \
-	register long us; \
- \
-	tp->tv_usec = us = tp->tv_usec + (usec); \
-	if (us >= 1000000) { \
-		tp->tv_usec = us - 1000000; \
-		tp->tv_sec++; \
-	} \
-}

 int	stathz;
 int	profhz;
 static int profprocs;
 int	ticks;
 static int psdiv, pscnt;	/* prof => stat divider */
-int psratio;			/* ratio: prof / stat */
+int	psratio;		/* ratio: prof / stat */

 volatile struct timeval time;
 volatile struct timeval mono_time;
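[The struct timecounter manipulated by the hunks below is declared in <sys/time.h>, which is outside this diff. The following sketch is reconstructed from the field accesses visible in this patch; member types and ordering are informed guesses, not the committed declaration:

	struct timecounter {
		/* Hardware half, filled in by the driver: */
		u_int (*get_timedelta) __P((struct timecounter *tc));
		u_int (*get_timecount) __P((void));
		u_int32_t counter_mask;		/* counter wrap mask */
		u_int32_t frequency;		/* counter rate, Hz */
		char *name;
		/* Software half, managed by the code in this patch: */
		int32_t adjustment;		/* NTP slew from ntp_update_second() */
		u_int32_t scale_micro;		/* usec per count, 0.32 fixed point */
		u_int32_t scale_nano_i;		/* nsec per count, integer part */
		u_int32_t scale_nano_f;		/* nsec per count, 0.32 fraction */
		u_int32_t offset_count;		/* counter reading at offset_* */
		u_int32_t offset_sec;		/* TOD second of last update */
		u_int32_t offset_micro;		/* usec into offset_sec */
		u_int64_t offset_nano;		/* nsec into offset_sec, 32.32 */
		u_int cost;			/* measured cost of a read, nsec */
		struct timecounter *other;	/* the currently inactive twin */
		struct timecounter *tweak;	/* element 0, sysctl/NTP target */
	};

tco_forward() below ping-pongs between two copies of this structure through the other pointers, so readers always see a consistent snapshot while the next one is being built.]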
@@ -178,9 +169,6 @@ hardclock(frame)
 	register struct clockframe *frame;
 {
 	register struct proc *p;
-	int time_update;
-	struct timeval newtime = time;
-	long ltemp;

 	p = curproc;
 	if (p) {
@@ -208,55 +196,9 @@ hardclock(frame)
 	if (stathz == 0)
 		statclock(frame);

-	/*
-	 * Increment the time-of-day.
-	 */
-	ticks++;
+	tco_forward();

-	if (timedelta == 0) {
-		time_update = CPU_THISTICKLEN(tick);
-	} else {
-		time_update = CPU_THISTICKLEN(tick) + tickdelta;
-		timedelta -= tickdelta;
-	}
-	BUMPTIME(&mono_time, time_update);
-
-	/*
-	 * Compute the phase adjustment. If the low-order bits
-	 * (time_phase) of the update overflow, bump the high-order bits
-	 * (time_update).
-	 */
-	time_phase += time_adj;
-	if (time_phase <= -FINEUSEC) {
-		ltemp = -time_phase >> SHIFT_SCALE;
-		time_phase += ltemp << SHIFT_SCALE;
-		time_update -= ltemp;
-	}
-	else if (time_phase >= FINEUSEC) {
-		ltemp = time_phase >> SHIFT_SCALE;
-		time_phase -= ltemp << SHIFT_SCALE;
-		time_update += ltemp;
-	}
-
-	newtime.tv_usec += time_update;
-	/*
-	 * On rollover of the second the phase adjustment to be used for
-	 * the next second is calculated. Also, the maximum error is
-	 * increased by the tolerance. If the PPS frequency discipline
-	 * code is present, the phase is increased to compensate for the
-	 * CPU clock oscillator frequency error.
-	 *
-	 * On a 32-bit machine and given parameters in the timex.h
-	 * header file, the maximum phase adjustment is +-512 ms and
-	 * maximum frequency offset is a tad less than) +-512 ppm. On a
-	 * 64-bit machine, you shouldn't need to ask.
-	 */
-	if (newtime.tv_usec >= 1000000) {
-		newtime.tv_usec -= 1000000;
-		newtime.tv_sec++;
-		ntp_update_second(&newtime.tv_sec);
-	}
-	CPU_CLOCKUPDATE(&time, &newtime);
+	ticks++;

 	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL)
 		setsoftclock();
@@ -315,6 +257,10 @@ hzto(tv)
 	}
 	if (sec < 0) {
 #ifdef DIAGNOSTIC
+		if (sec == -1 && usec > 0) {
+			sec++;
+			usec -= 1000000;
+		}
 		printf("hzto: negative time difference %ld sec %ld usec\n",
 		       sec, usec);
 #endif
@@ -529,11 +475,212 @@ SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
 	0, 0, sysctl_kern_clockrate, "S,clockinfo","");

 void
-nanotime(ts)
-	struct timespec *ts;
+microtime(struct timeval *tv)
+{
+	struct timecounter *tc;
+
+	tc = (struct timecounter *)timecounter;
+	tv->tv_sec = tc->offset_sec;
+	tv->tv_usec = tc->offset_micro;
+	tv->tv_usec +=
+	    ((u_int64_t)tc->get_timedelta(tc) * tc->scale_micro) >> 32;
+	if (tv->tv_usec >= 1000000) {
+		tv->tv_usec -= 1000000;
+		tv->tv_sec++;
+	}
+}
+
+void
+nanotime(struct timespec *tv)
+{
+	u_int32_t count;
+	u_int64_t delta;
+	struct timecounter *tc;
+
+	tc = (struct timecounter *)timecounter;
+	tv->tv_sec = tc->offset_sec;
+	count = tc->get_timedelta(tc);
+	delta = tc->offset_nano;
+	delta += ((u_int64_t)count * tc->scale_nano_f);
+	delta += ((u_int64_t)count * tc->scale_nano_i) << 32;
+	delta >>= 32;
+	if (delta >= 1000000000) {
+		delta -= 1000000000;
+		tv->tv_sec++;
+	}
+	tv->tv_nsec = delta;
+}
+
+static void
+tco_setscales(struct timecounter *tc)
+{
+	u_int64_t scale;
+
+	scale = 1000000000LL << 32;
+	if (tc->adjustment > 0)
+		scale += (tc->adjustment * 1000LL) << 10;
+	else
+		scale -= (-tc->adjustment * 1000LL) << 10;
+	/* scale += tc->frequency >> 1; */ /* XXX do we want to round ? */
+	scale /= tc->frequency;
+	tc->scale_micro = scale / 1000;
+	tc->scale_nano_f = scale & 0xffffffff;
+	tc->scale_nano_i = scale >> 32;
+}
+
+static u_int
+delta_timecounter(struct timecounter *tc)
+{
+	return((tc->get_timecount() - tc->offset_count) & tc->counter_mask);
+}
+
+void
+init_timecounter(struct timecounter *tc)
+{
+	struct timespec ts0, ts1;
+	int i;
+
+	if (!tc->get_timedelta)
+		tc->get_timedelta = delta_timecounter;
+	tc->adjustment = 0;
+	tco_setscales(tc);
+	tc->offset_count = tc->get_timecount();
+	tc[0].tweak = &tc[0];
+	tc[2] = tc[1] = tc[0];
+	tc[1].other = &tc[2];
+	tc[2].other = &tc[1];
+	if (!timecounter)
+		timecounter = &tc[2];
+	tc = &tc[1];
+
+	/*
+	 * Figure out the cost of calling this timecounter.
+	 * XXX: The 1:15 ratio is a guess at reality.
+	 */
+	nanotime(&ts0);
+	for (i = 0; i < 16; i ++)
+		tc->get_timecount();
+	for (i = 0; i < 240; i ++)
+		tc->get_timedelta(tc);
+	nanotime(&ts1);
+	ts1.tv_sec -= ts0.tv_sec;
+	tc->cost = ts1.tv_sec * 1000000000 + ts1.tv_nsec - ts0.tv_nsec;
+	tc->cost >>= 8;
+	printf("Timecounter \"%s\" frequency %lu Hz cost %u ns\n",
+	    tc->name, tc->frequency, tc->cost);

+	/* XXX: For now always start using the counter. */
+	tc->offset_count = tc->get_timecount();
+	nanotime(&ts1);
+	tc->offset_nano = (u_int64_t)ts1.tv_nsec << 32;
+	tc->offset_micro = ts1.tv_nsec / 1000;
+	tc->offset_sec = ts1.tv_sec;
+	timecounter = tc;
+}
+
+void
+set_timecounter(struct timespec *ts)
 {
-	struct timeval tv;
-	microtime(&tv);
-	ts->tv_sec = tv.tv_sec;
-	ts->tv_nsec = tv.tv_usec * 1000;
+	struct timecounter *tc, *tco;
+	int s;
+
+	s = splclock();
+	tc=timecounter->other;
+	tco = tc->other;
+	*tc = *timecounter;
+	tc->other = tco;
+	tc->offset_sec = ts->tv_sec;
+	tc->offset_nano = (u_int64_t)ts->tv_nsec << 32;
+	tc->offset_micro = ts->tv_nsec / 1000;
+	tc->offset_count = tc->get_timecount();
+	time.tv_sec = tc->offset_sec;
+	time.tv_usec = tc->offset_micro;
+	timecounter = tc;
+	splx(s);
 }
+
+static struct timecounter *
+sync_other_counter(int flag)
+{
+	struct timecounter *tc, *tco;
+	u_int32_t delta;
+
+	tc = timecounter->other;
+	tco = tc->other;
+	*tc = *timecounter;
+	tc->other = tco;
+	delta = tc->get_timedelta(tc);
+	tc->offset_count += delta;
+	tc->offset_count &= tc->counter_mask;
+	tc->offset_nano += (u_int64_t)delta * tc->scale_nano_f;
+	tc->offset_nano += (u_int64_t)delta * tc->scale_nano_i << 32;
+	if (flag)
+		return (tc);
+	if (tc->offset_nano > 1000000000ULL << 32) {
+		tc->offset_sec++;
+		tc->offset_nano -= 1000000000ULL << 32;
+	}
+	tc->offset_micro = (tc->offset_nano / 1000) >> 32;
+	return (tc);
+}
+
+static void
+tco_forward(void)
+{
+	struct timecounter *tc;
+	u_int32_t time_update;
+
+	tc = sync_other_counter(1);
+	time_update = 0;
+
+	if (timedelta) {
+		time_update += tickdelta;
+		timedelta -= tickdelta;
+	}
+	mono_time.tv_usec += time_update + tick;
+	if (mono_time.tv_usec >= 1000000) {
+		mono_time.tv_usec -= 1000000;
+		mono_time.tv_sec++;
+	}
+	time_update *= 1000;
+	tc->offset_nano += (u_int64_t)time_update << 32;
+	if (tc->offset_nano >= 1000000000ULL << 32) {
+		tc->offset_nano -= 1000000000ULL << 32;
+		tc->offset_sec++;
+		tc->frequency = tc->tweak->frequency;
+		tc->adjustment = tc->tweak->adjustment;	/* XXX remove this ? */
+		ntp_update_second(tc);	/* XXX only needed if xntpd runs */
+		tco_setscales(tc);
+	}
+	/*
+	 * Find the usec from the nsec. This is just as fast (one
+	 * multiplication) and prevents skew between the two due
+	 * to rounding errors. (2^32/1000 = 4294967.296)
+	 */
+	tc->offset_micro = (tc->offset_nano / 1000) >> 32;
+	time.tv_usec = tc->offset_micro;
+	time.tv_sec = tc->offset_sec;
+	timecounter = tc;
+}
+
+static int
+sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS
+{
+	return (sysctl_handle_opaque(oidp, &timecounter->tweak->frequency,
+	    sizeof(timecounter->tweak->frequency), req));
+}
+
+static int
+sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS
+{
+	return (sysctl_handle_opaque(oidp, &timecounter->tweak->adjustment,
+	    sizeof(timecounter->tweak->adjustment), req));
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT|CTLFLAG_RW,
+    0, sizeof(u_int) , sysctl_kern_timecounter_frequency, "I", "");
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT|CTLFLAG_RW,
+    0, sizeof(int) , sysctl_kern_timecounter_adjustment, "I", "");
diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c
index 102e650..636a5ce 100644
--- a/sys/kern/kern_ntptime.c
+++ b/sys/kern/kern_ntptime.c
@@ -99,6 +99,7 @@
 static long time_tolerance = MAXFREQ;	/* frequency tolerance (scaled ppm) */
 static long time_precision = 1;	/* clock precision (us) */
 static long time_maxerror = MAXPHASE;	/* maximum error (us) */
 static long time_esterror = MAXPHASE;	/* estimated error (us) */
+static int time_daemon = 0;		/* No timedaemon active */

 /*
  * The following variables establish the state of the PLL/FLL and the
@@ -285,11 +286,28 @@ hardupdate(offset)
 		time_freq = -time_tolerance;
 }

+/*
+ * On rollover of the second the phase adjustment to be used for
+ * the next second is calculated. Also, the maximum error is
+ * increased by the tolerance. If the PPS frequency discipline
+ * code is present, the phase is increased to compensate for the
+ * CPU clock oscillator frequency error.
+ *
+ * On a 32-bit machine and given parameters in the timex.h
+ * header file, the maximum phase adjustment is +-512 ms and
+ * maximum frequency offset is a tad less than) +-512 ppm. On a
+ * 64-bit machine, you shouldn't need to ask.
+ */
 void
-ntp_update_second(long *newsec)
+ntp_update_second(struct timecounter *tc)
 {
+	u_int32_t *newsec;
 	long ltemp;

+	if (!time_daemon)
+		return;
+
+	newsec = &tc->offset_sec;
 	time_maxerror += time_tolerance >> SHIFT_USEC;

 	/*
@@ -308,7 +326,7 @@
 		if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
 			ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
 		time_offset += ltemp;
-		time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+		time_adj = -ltemp << (SHIFT_SCALE - SHIFT_UPDATE);
 	} else {
 		ltemp = time_offset;
 		if (!(time_status & STA_FLL))
@@ -316,7 +334,7 @@
 		if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
 			ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
 		time_offset -= ltemp;
-		time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+		time_adj = ltemp << (SHIFT_SCALE - SHIFT_UPDATE);
 	}

 	/*
@@ -339,29 +357,12 @@
 	ltemp = time_freq;
 #endif /* PPS_SYNC */
 	if (ltemp < 0)
-		time_adj -= -ltemp >> (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+		time_adj -= -ltemp << (SHIFT_SCALE - SHIFT_USEC);
 	else
-		time_adj += ltemp >> (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
-
-#if SHIFT_HZ == 7
-	/*
-	 * When the CPU clock oscillator frequency is not a
-	 * power of two in Hz, the SHIFT_HZ is only an
-	 * approximate scale factor. In the SunOS kernel, this
-	 * results in a PLL gain factor of 1/1.28 = 0.78 what it
-	 * should be. In the following code the overall gain is
-	 * increased by a factor of 1.25, which results in a
-	 * residual error less than 3 percent.
-	 */
-	/* Same thing applies for FreeBSD --GAW */
-	if (hz == 100) {
-		if (time_adj < 0)
-			time_adj -= -time_adj >> 2;
-		else
-			time_adj += time_adj >> 2;
-	}
-#endif /* SHIFT_HZ */
+		time_adj += ltemp << (SHIFT_SCALE - SHIFT_USEC);
+	tc->adjustment = time_adj;
+
 	/* XXX - this is really bogus, but can't be fixed until
 	   xntpd's idea of the system clock is fixed to know how
 	   the user wants leap seconds handled; in the mean time,
@@ -490,6 +491,8 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap)
 	int s;
 	int error;

+	time_daemon = 1;
+
 	error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
 	if (error)
 		return error;
diff --git a/sys/kern/kern_random.c b/sys/kern/kern_random.c
index f066949..7fd8364 100644
--- a/sys/kern/kern_random.c
+++ b/sys/kern/kern_random.c
@@ -1,7 +1,7 @@
 /*
  * random_machdep.c -- A strong random number generator
  *
- * $Id: random_machdep.c,v 1.19 1997/10/28 15:58:13 bde Exp $
+ * $Id: random_machdep.c,v 1.20 1997/12/26 20:42:11 phk Exp $
  *
  * Version 0.95, last modified 18-Oct-95
  *
@@ -190,21 +190,8 @@ add_timer_randomness(struct random_bucket *r, struct timer_rand_state *state,
 	u_int	nbits;
 	u_int32_t time;

-#if defined(I586_CPU) || defined(I686_CPU)
-	if (tsc_freq != 0) {
-		num ^= (u_int32_t) rdtsc() << 16;
-		r->entropy_count += 2;
-	} else {
-#endif
-		disable_intr();
-		outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);
-		num ^= inb(TIMER_CNTR0) << 16;
-		num ^= inb(TIMER_CNTR0) << 24;
-		enable_intr();
-		r->entropy_count += 2;
-#if defined(I586_CPU) || defined(I686_CPU)
-	}
-#endif
+	num ^= timecounter->get_timecount() << 16;
+	r->entropy_count += 2;

 	time = ticks;
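[Before the kern_tc.c copy of the new code, it helps to pin down the fixed-point format it uses: tco_setscales() divides 10^9 << 32 (nanoseconds per second in 32.32 fixed point) by the counter frequency, and microtime()/nanotime() multiply a counter delta by the resulting scales and shift down by 32. A minimal userland sketch of the same arithmetic, using the 1193182 Hz i8254 rate purely as an example input:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t freq = 1193182;	/* example: i8254 rate, Hz */
		/* nsec per count, 32.32 fixed point (adjustment == 0) */
		uint64_t scale = (1000000000ULL << 32) / freq;
		uint32_t scale_micro = scale / 1000;	/* usec per count */
		uint32_t delta = 11932;		/* roughly 10 ms of counts */

		/*
		 * nanotime(): the kernel splits scale into scale_nano_i
		 * and scale_nano_f so even a full 32-bit delta cannot
		 * overflow the 64-bit products; for a small delta a
		 * single multiply shows the idea.
		 */
		printf("%llu nsec\n",
		    (unsigned long long)((delta * scale) >> 32));

		/* microtime(): one 32x32->64 bit multiply, then >> 32 */
		printf("%llu usec\n",
		    (unsigned long long)(((uint64_t)delta * scale_micro) >> 32));
		return (0);
	}

The granularities in the commit message fall out of this representation: the low bit of offset_nano is 2^-32 ns, roughly .232E-18 s, and one unit of tc->adjustment moves scale by 1000 << 10 parts in 10^9 << 32, i.e. about .238E-12 s/s.]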
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
index 20b700b..30bb775 100644
--- a/sys/kern/kern_tc.c
+++ b/sys/kern/kern_tc.c
@@ -1,4 +1,5 @@
 /*-
+ * Copyright (c) 1997, 1998 Poul-Henning Kamp
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
@@ -36,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
- * $Id: kern_clock.c,v 1.55 1998/02/06 12:13:22 eivind Exp $
+ * $Id: kern_clock.c,v 1.56 1998/02/15 13:55:06 phk Exp $
  */

 #include
@@ -55,7 +56,6 @@
 #include
 #include

-#define CLOCK_HAIR		/* XXX */
 #include
 #include

@@ -70,6 +70,9 @@
 static void initclocks __P((void *dummy));
 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)

+static void tco_forward __P((void));
+static void tco_setscales __P((struct timecounter *tc));
+
 /* Some of these don't belong here, but it's easiest to concentrate them. */
 #if defined(SMP) && defined(BETTER_CLOCK)
 long cp_time[CPUSTATES];
@@ -91,55 +94,43 @@ long tk_nin;
 long tk_nout;
 long tk_rawcc;

+struct timecounter *timecounter;
+
 /*
  * Clock handling routines.
  *
- * This code is written to operate with two timers that run independently of
- * each other.  The main clock, running hz times per second, is used to keep
- * track of real time.  The second timer handles kernel and user profiling,
- * and does resource use estimation.  If the second timer is programmable,
- * it is randomized to avoid aliasing between the two clocks.  For example,
- * the randomization prevents an adversary from always giving up the cpu
- * just before its quantum expires.  Otherwise, it would never accumulate
- * cpu ticks.  The mean frequency of the second timer is stathz.
+ * This code is written to operate with two timers that run independently
+ * of each other.
  *
- * If no second timer exists, stathz will be zero; in this case we drive
- * profiling and statistics off the main clock.  This WILL NOT be accurate;
- * do not do it unless absolutely necessary.
+ * The main clock, running hz times per second, is used to trigger
+ * interval timers, timeouts and rescheduling as needed.
  *
+ * The second timer handles kernel and user profiling, and does resource
+ * use estimation.  If the second timer is programmable, it is randomized
+ * to avoid aliasing between the two clocks.  For example, the
+ * randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires.  Otherwise, it would never accumulate
+ * cpu ticks.  The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we
+ * drive profiling and statistics off the main clock.  This WILL NOT
+ * be accurate; do not do it unless absolutely necessary.
  *
 * The statistics clock may (or may not) be run at a higher rate while
- * profiling.  This profile clock runs at profhz.  We require that profhz
- * be an integral multiple of stathz.
+ * profiling.  This profile clock runs at profhz.  We require that
+ * profhz be an integral multiple of stathz.  If the statistics clock
+ * is running fast, it must be divided by the ratio profhz/stathz for
+ * statistics.  (For profiling, every tick counts.)
  *
- * If the statistics clock is running fast, it must be divided by the ratio
- * profhz/stathz for statistics.  (For profiling, every tick counts.)
- */
-
-/*
- * TODO:
- *	allocate more timeout table slots when table overflows.
- */
-
-/*
- * Bump a timeval by a small number of usec's.
+ * Time-of-day is maintained using a "timecounter", which may or may
+ * not be related to the hardware generating the above mentioned
+ * interrupts.
  */
-#define BUMPTIME(t, usec) { \
-	register volatile struct timeval *tp = (t); \
-	register long us; \
- \
-	tp->tv_usec = us = tp->tv_usec + (usec); \
-	if (us >= 1000000) { \
-		tp->tv_usec = us - 1000000; \
-		tp->tv_sec++; \
-	} \
-}

 int	stathz;
 int	profhz;
 static int profprocs;
 int	ticks;
 static int psdiv, pscnt;	/* prof => stat divider */
-int psratio;			/* ratio: prof / stat */
+int	psratio;		/* ratio: prof / stat */

 volatile struct timeval time;
 volatile struct timeval mono_time;
@@ -178,9 +169,6 @@ hardclock(frame)
 	register struct clockframe *frame;
 {
 	register struct proc *p;
-	int time_update;
-	struct timeval newtime = time;
-	long ltemp;

 	p = curproc;
 	if (p) {
@@ -208,55 +196,9 @@ hardclock(frame)
 	if (stathz == 0)
 		statclock(frame);

-	/*
-	 * Increment the time-of-day.
-	 */
-	ticks++;
+	tco_forward();

-	if (timedelta == 0) {
-		time_update = CPU_THISTICKLEN(tick);
-	} else {
-		time_update = CPU_THISTICKLEN(tick) + tickdelta;
-		timedelta -= tickdelta;
-	}
-	BUMPTIME(&mono_time, time_update);
-
-	/*
-	 * Compute the phase adjustment. If the low-order bits
-	 * (time_phase) of the update overflow, bump the high-order bits
-	 * (time_update).
-	 */
-	time_phase += time_adj;
-	if (time_phase <= -FINEUSEC) {
-		ltemp = -time_phase >> SHIFT_SCALE;
-		time_phase += ltemp << SHIFT_SCALE;
-		time_update -= ltemp;
-	}
-	else if (time_phase >= FINEUSEC) {
-		ltemp = time_phase >> SHIFT_SCALE;
-		time_phase -= ltemp << SHIFT_SCALE;
-		time_update += ltemp;
-	}
-
-	newtime.tv_usec += time_update;
-	/*
-	 * On rollover of the second the phase adjustment to be used for
-	 * the next second is calculated. Also, the maximum error is
-	 * increased by the tolerance. If the PPS frequency discipline
-	 * code is present, the phase is increased to compensate for the
-	 * CPU clock oscillator frequency error.
-	 *
-	 * On a 32-bit machine and given parameters in the timex.h
-	 * header file, the maximum phase adjustment is +-512 ms and
-	 * maximum frequency offset is a tad less than) +-512 ppm. On a
-	 * 64-bit machine, you shouldn't need to ask.
-	 */
-	if (newtime.tv_usec >= 1000000) {
-		newtime.tv_usec -= 1000000;
-		newtime.tv_sec++;
-		ntp_update_second(&newtime.tv_sec);
-	}
-	CPU_CLOCKUPDATE(&time, &newtime);
+	ticks++;

 	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL)
 		setsoftclock();
@@ -315,6 +257,10 @@ hzto(tv)
 	}
 	if (sec < 0) {
 #ifdef DIAGNOSTIC
+		if (sec == -1 && usec > 0) {
+			sec++;
+			usec -= 1000000;
+		}
 		printf("hzto: negative time difference %ld sec %ld usec\n",
 		       sec, usec);
 #endif
@@ -529,11 +475,212 @@ SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
 	0, 0, sysctl_kern_clockrate, "S,clockinfo","");

 void
-nanotime(ts)
-	struct timespec *ts;
+microtime(struct timeval *tv)
+{
+	struct timecounter *tc;
+
+	tc = (struct timecounter *)timecounter;
+	tv->tv_sec = tc->offset_sec;
+	tv->tv_usec = tc->offset_micro;
+	tv->tv_usec +=
+	    ((u_int64_t)tc->get_timedelta(tc) * tc->scale_micro) >> 32;
+	if (tv->tv_usec >= 1000000) {
+		tv->tv_usec -= 1000000;
+		tv->tv_sec++;
+	}
+}
+
+void
+nanotime(struct timespec *tv)
+{
+	u_int32_t count;
+	u_int64_t delta;
+	struct timecounter *tc;
+
+	tc = (struct timecounter *)timecounter;
+	tv->tv_sec = tc->offset_sec;
+	count = tc->get_timedelta(tc);
+	delta = tc->offset_nano;
+	delta += ((u_int64_t)count * tc->scale_nano_f);
+	delta += ((u_int64_t)count * tc->scale_nano_i) << 32;
+	delta >>= 32;
+	if (delta >= 1000000000) {
+		delta -= 1000000000;
+		tv->tv_sec++;
+	}
+	tv->tv_nsec = delta;
+}
+
+static void
+tco_setscales(struct timecounter *tc)
+{
+	u_int64_t scale;
+
+	scale = 1000000000LL << 32;
+	if (tc->adjustment > 0)
+		scale += (tc->adjustment * 1000LL) << 10;
+	else
+		scale -= (-tc->adjustment * 1000LL) << 10;
+	/* scale += tc->frequency >> 1; */ /* XXX do we want to round ? */
+	scale /= tc->frequency;
+	tc->scale_micro = scale / 1000;
+	tc->scale_nano_f = scale & 0xffffffff;
+	tc->scale_nano_i = scale >> 32;
+}
+
+static u_int
+delta_timecounter(struct timecounter *tc)
+{
+	return((tc->get_timecount() - tc->offset_count) & tc->counter_mask);
+}
+
+void
+init_timecounter(struct timecounter *tc)
+{
+	struct timespec ts0, ts1;
+	int i;
+
+	if (!tc->get_timedelta)
+		tc->get_timedelta = delta_timecounter;
+	tc->adjustment = 0;
+	tco_setscales(tc);
+	tc->offset_count = tc->get_timecount();
+	tc[0].tweak = &tc[0];
+	tc[2] = tc[1] = tc[0];
+	tc[1].other = &tc[2];
+	tc[2].other = &tc[1];
+	if (!timecounter)
+		timecounter = &tc[2];
+	tc = &tc[1];
+
+	/*
+	 * Figure out the cost of calling this timecounter.
+	 * XXX: The 1:15 ratio is a guess at reality.
+	 */
+	nanotime(&ts0);
+	for (i = 0; i < 16; i ++)
+		tc->get_timecount();
+	for (i = 0; i < 240; i ++)
+		tc->get_timedelta(tc);
+	nanotime(&ts1);
+	ts1.tv_sec -= ts0.tv_sec;
+	tc->cost = ts1.tv_sec * 1000000000 + ts1.tv_nsec - ts0.tv_nsec;
+	tc->cost >>= 8;
+	printf("Timecounter \"%s\" frequency %lu Hz cost %u ns\n",
+	    tc->name, tc->frequency, tc->cost);

+	/* XXX: For now always start using the counter. */
+	tc->offset_count = tc->get_timecount();
+	nanotime(&ts1);
+	tc->offset_nano = (u_int64_t)ts1.tv_nsec << 32;
+	tc->offset_micro = ts1.tv_nsec / 1000;
+	tc->offset_sec = ts1.tv_sec;
+	timecounter = tc;
+}
+
+void
+set_timecounter(struct timespec *ts)
 {
-	struct timeval tv;
-	microtime(&tv);
-	ts->tv_sec = tv.tv_sec;
-	ts->tv_nsec = tv.tv_usec * 1000;
+	struct timecounter *tc, *tco;
+	int s;
+
+	s = splclock();
+	tc=timecounter->other;
+	tco = tc->other;
+	*tc = *timecounter;
+	tc->other = tco;
+	tc->offset_sec = ts->tv_sec;
+	tc->offset_nano = (u_int64_t)ts->tv_nsec << 32;
+	tc->offset_micro = ts->tv_nsec / 1000;
+	tc->offset_count = tc->get_timecount();
+	time.tv_sec = tc->offset_sec;
+	time.tv_usec = tc->offset_micro;
+	timecounter = tc;
+	splx(s);
 }
+
+static struct timecounter *
+sync_other_counter(int flag)
+{
+	struct timecounter *tc, *tco;
+	u_int32_t delta;
+
+	tc = timecounter->other;
+	tco = tc->other;
+	*tc = *timecounter;
+	tc->other = tco;
+	delta = tc->get_timedelta(tc);
+	tc->offset_count += delta;
+	tc->offset_count &= tc->counter_mask;
+	tc->offset_nano += (u_int64_t)delta * tc->scale_nano_f;
+	tc->offset_nano += (u_int64_t)delta * tc->scale_nano_i << 32;
+	if (flag)
+		return (tc);
+	if (tc->offset_nano > 1000000000ULL << 32) {
+		tc->offset_sec++;
+		tc->offset_nano -= 1000000000ULL << 32;
+	}
+	tc->offset_micro = (tc->offset_nano / 1000) >> 32;
+	return (tc);
+}
+
+static void
+tco_forward(void)
+{
+	struct timecounter *tc;
+	u_int32_t time_update;
+
+	tc = sync_other_counter(1);
+	time_update = 0;
+
+	if (timedelta) {
+		time_update += tickdelta;
+		timedelta -= tickdelta;
+	}
+	mono_time.tv_usec += time_update + tick;
+	if (mono_time.tv_usec >= 1000000) {
+		mono_time.tv_usec -= 1000000;
+		mono_time.tv_sec++;
+	}
+	time_update *= 1000;
+	tc->offset_nano += (u_int64_t)time_update << 32;
+	if (tc->offset_nano >= 1000000000ULL << 32) {
+		tc->offset_nano -= 1000000000ULL << 32;
+		tc->offset_sec++;
+		tc->frequency = tc->tweak->frequency;
+		tc->adjustment = tc->tweak->adjustment;	/* XXX remove this ? */
+		ntp_update_second(tc);	/* XXX only needed if xntpd runs */
+		tco_setscales(tc);
+	}
+	/*
+	 * Find the usec from the nsec. This is just as fast (one
+	 * multiplication) and prevents skew between the two due
+	 * to rounding errors. (2^32/1000 = 4294967.296)
+	 */
+	tc->offset_micro = (tc->offset_nano / 1000) >> 32;
+	time.tv_usec = tc->offset_micro;
+	time.tv_sec = tc->offset_sec;
+	timecounter = tc;
+}
+
+static int
+sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS
+{
+	return (sysctl_handle_opaque(oidp, &timecounter->tweak->frequency,
+	    sizeof(timecounter->tweak->frequency), req));
+}
+
+static int
+sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS
+{
+	return (sysctl_handle_opaque(oidp, &timecounter->tweak->adjustment,
+	    sizeof(timecounter->tweak->adjustment), req));
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT|CTLFLAG_RW,
+    0, sizeof(u_int) , sysctl_kern_timecounter_frequency, "I", "");
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT|CTLFLAG_RW,
+    0, sizeof(int) , sysctl_kern_timecounter_adjustment, "I", "");
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
index 8060c15..fb78ffc 100644
--- a/sys/kern/kern_time.c
+++ b/sys/kern/kern_time.c
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)kern_time.c	8.1 (Berkeley) 6/10/93
- * $Id: kern_time.c,v 1.39 1997/11/06 19:29:16 phk Exp $
+ * $Id: kern_time.c,v 1.40 1997/11/07 08:52:58 phk Exp $
  */

 #include
@@ -78,6 +78,7 @@ settime(tv)
 	struct timeval *tv;
 {
 	struct timeval delta;
+	struct timespec ts;
 	struct proc *p;
 	int s;

@@ -99,7 +100,9 @@ settime(tv)
 	 */
 	delta.tv_sec = tv->tv_sec - time.tv_sec;
 	delta.tv_usec = tv->tv_usec - time.tv_usec;
-	time = *tv;
+	ts.tv_sec = tv->tv_sec;
+	ts.tv_nsec = tv->tv_usec * 1000;
+	set_timecounter(&ts);
 	/*
 	 * XXX should arrange for microtime() to agree with *tv if
 	 * it is called now. As it is, it may add up to about
@@ -138,13 +141,11 @@ clock_gettime(p, uap)
 	struct proc *p;
 	struct clock_gettime_args *uap;
 {
-	struct timeval atv;
 	struct timespec ats;

 	if (SCARG(uap, clock_id) != CLOCK_REALTIME)
 		return (EINVAL);
-	microtime(&atv);
-	TIMEVAL_TO_TIMESPEC(&atv, &ats);
+	nanotime(&ats);
 	return (copyout(&ats, SCARG(uap, tp), sizeof(ats)));
 }

@@ -199,7 +200,7 @@ clock_getres(p, uap)
 	error = 0;
 	if (SCARG(uap, tp)) {
 		ts.tv_sec = 0;
-		ts.tv_nsec = 1000000000 / hz;
+		ts.tv_nsec = 1000000000 / timecounter->frequency;
 		error = copyout(&ts, SCARG(uap, tp), sizeof(ts));
 	}
 	return (error);
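[A driver plugs hardware into this machinery by filling in the hardware half of element zero and passing the array to init_timecounter(); the i8254 and TSC drivers that actually do so live in sys/i386/isa/clock.c, outside this diff. Because init_timecounter() copies element 0 into elements 1 and 2 and flips between the latter two on every update, the backing array must hold three entries. A hypothetical sketch, with the device, register and 25 MHz rate invented for illustration, and the initializer order following the struct sketch given earlier:

	static u_int
	example_get_timecount(void)
	{
		/* read a free-running 32-bit counter; EXAMPLE_REG is made up */
		return (inl(EXAMPLE_REG));
	}

	static struct timecounter example_timecounter[3] = {
		/*
		 * Only the hardware half of element 0 is filled in;
		 * get_timedelta is left NULL so init_timecounter()
		 * substitutes the default delta_timecounter().
		 */
		{ 0, example_get_timecount, 0xffffffff, 25000000, "example" }
	};

	/* called once from the device attach code */
	init_timecounter(example_timecounter);

Once registered, the counter becomes the system clock immediately ("for now always start using the counter"), and the kern.timecounter.frequency / kern.timecounter.adjustment sysctls tweak element 0, which tco_forward() picks up at the next second rollover via tc->tweak.]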