git: c2705ceaeb09 - main - x86: Speed up clock calibration

From: Colin Percival <cperciva_at_FreeBSD.org>
Date: Wed, 12 Jan 2022 20:37:05 UTC
The branch main has been updated by cperciva:

URL: https://cgit.FreeBSD.org/src/commit/?id=c2705ceaeb09d8579661097fd358ffb5defb5624

commit c2705ceaeb09d8579661097fd358ffb5defb5624
Author:     Colin Percival <cperciva@FreeBSD.org>
AuthorDate: 2022-01-10 01:22:20 +0000
Commit:     Colin Percival <cperciva@FreeBSD.org>
CommitDate: 2022-01-12 20:34:07 +0000

    x86: Speed up clock calibration
    
    Prior to this commit, the TSC and local APIC frequencies were calibrated
    at boot time by measuring the clocks before and after a one-second sleep.
    This was simple and effective, but had the disadvantage of *requiring a
    one-second sleep*.
    
    Rather than making two clock measurements (before and after sleeping) we
    now perform many measurements; and rather than simply subtracting the
    starting count from the ending count, we calculate a best-fit regression
    between the target clock and the reference clock (for which the current
    best available timecounter is used). While we do this, we keep track
    of an estimate of the uncertainty in the regression slope (a.k.a. the ratio
    of clock speeds), and stop measuring when we believe the uncertainty is
    less than 1 PPM.
    
    In order to avoid the risk of aliasing resulting from the data-gathering
    loop synchronizing with (a multiple of) the frequency of the reference
    clock, we add some additional spinning depending upon the iteration number.
    
    For numerical stability and simplicity of implementation, we make use of
    floating-point arithmetic for the statistical calculations.
    
    On the author's Dell laptop, this reduces the time spent in calibration
    from 2000 ms to 29 ms; on an EC2 c5.xlarge instance, it is reduced from
    2000 ms to 2.5 ms.
    
    Reviewed by:    bde (previous version), kib
    MFC after:      1 month
    Sponsored by:   https://www.patreon.com/cperciva
    Differential Revision:  https://reviews.freebsd.org/D33802
---
 sys/conf/files.amd64       |   6 ++
 sys/conf/files.i386        |   6 ++
 sys/kern/subr_clockcalib.c | 183 +++++++++++++++++++++++++++++++++++++++++++++
 sys/sys/timetc.h           |   7 ++
 sys/x86/x86/local_apic.c   |  31 +++++---
 sys/x86/x86/tsc.c          |  46 ++----------
 6 files changed, 229 insertions(+), 50 deletions(-)

diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 86b54315d897..88d73bfe1edd 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -407,3 +407,9 @@ contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c		optional zfs compile-with
 contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c		optional zfs compile-with "${ZFS_C}"
 contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c		optional zfs compile-with "${ZFS_C}"
 contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c		optional zfs compile-with "${ZFS_C}"
+# Clock calibration subroutine; uses floating-point arithmetic
+subr_clockcalib.o		standard				\
+	dependency	"$S/kern/subr_clockcalib.c"			\
+	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} -mmmx -msse -msse2 ${.IMPSRC}" \
+	no-implicit-rule						\
+	clean		"subr_clockcalib.o"
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index bca5f04aeb1d..2622a6e2bf91 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -171,3 +171,9 @@ x86/x86/local_apic.c		optional apic
 x86/x86/mptable.c		optional apic
 x86/x86/mptable_pci.c		optional apic pci
 x86/x86/msi.c			optional apic pci
+# Clock calibration subroutine; uses floating-point arithmetic
+subr_clockcalib.o		standard				\
+	dependency	"$S/kern/subr_clockcalib.c"			\
+	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} -m80387 ${.IMPSRC}" \
+	no-implicit-rule						\
+	clean		"subr_clockcalib.o"
diff --git a/sys/kern/subr_clockcalib.c b/sys/kern/subr_clockcalib.c
new file mode 100644
index 000000000000..2d6a8c31a9b9
--- /dev/null
+++ b/sys/kern/subr_clockcalib.c
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 2022 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/timetc.h>
+#include <sys/tslog.h>
+#include <machine/cpu.h>
+
+/**
+ * clockcalib(clk, clkname):
+ * Return the frequency of clk(), calibrated against the current best-available
+ * timecounter.  Uses floating point; clkname is only used in printed messages.
+ */
+uint64_t
+clockcalib(uint64_t (*clk)(void), const char *clkname)
+{
+	struct timecounter *tc = atomic_load_ptr(&timecounter);
+	uint64_t clk0, clk1, clk_delay, n, passes = 0;
+	uint64_t t0, t1, tadj, tlast;
+	double mu_clk = 0;
+	double mu_t = 0;
+	double va_clk = 0;
+	double va_t = 0;
+	double cva = 0;
+	double d1, d2;
+	double inv_n;
+	uint64_t freq;
+
+	TSENTER();
+	/*-
+	 * The idea here is to compute a best-fit linear regression between
+	 * the clock we're calibrating and the reference clock; the slope of
+	 * that line multiplied by the frequency of the reference clock gives
+	 * us the frequency we're looking for.
+	 *
+	 * To do this, we calculate the
+	 * (a) mean of the target clock measurements,
+	 * (b) variance of the target clock measurements,
+	 * (c) mean of the reference clock measurements,
+	 * (d) variance of the reference clock measurements, and
+	 * (e) covariance of the target clock and reference clock measurements
+	 * on an ongoing basis, updating all five values after each new data
+	 * point arrives, stopping when we're confident that we've accurately
+	 * measured the target clock frequency.
+	 *
+	 * Given those five values, the important formulas to remember from
+	 * introductory statistics are:
+	 * 1. slope of regression line = covariance(x, y) / variance(x)
+	 * 2. (relative uncertainty in slope)^2 =
+	 *    (variance(x) * variance(y) - covariance(x, y)^2)
+	 *    ------------------------------------------------
+	 *              covariance(x, y)^2 * (N - 2)
+	 *
+	 * We adjust the second formula slightly, adding a term to each of
+	 * the variance values to reflect the measurement quantization.
+	 *
+	 * Finally, we need to determine when to stop gathering data.  We
+	 * can't simply stop as soon as the computed uncertainty estimate
+	 * is below our threshold; this would make us overconfident since it
+	 * would introduce a multiple-comparisons problem (cf. sequential
+	 * analysis in clinical trials).  Instead, we stop with N data points
+	 * if the estimated uncertainty of the first k data points meets our
+	 * target for all N/2 < k <= N; this is not theoretically optimal,
+	 * but in practice works well enough.
+	 */
+
+	/*
+	 * Initial clock values; subtracting these from later measurements
+	 * reduces floating-point rounding errors.  The reference timecounter
+	 * can wrap, so we keep a 64-bit adjustment, computing the wrap period
+	 * in 64 bits so that adding 1 to the counter mask cannot overflow.
+	 */
+	clk0 = clk();
+	t0 = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
+	tadj = 0;
+	tlast = t0;
+
+	/* Loop until we give up or decide that we're calibrated. */
+	for (n = 1; ; n++) {
+		/* Get a new data point. */
+		clk1 = clk() - clk0;
+		t1 = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
+		while (t1 + tadj < tlast)
+			tadj += (uint64_t)tc->tc_counter_mask + 1;
+		tlast = t1 + tadj;
+		t1 += tadj - t0;
+
+		/* If we spent too long, bail. */
+		if (t1 > tc->tc_frequency) {
+			printf("Statistical %s calibration failed!  "
+			    "Clocks might be ticking at variable rates.\n",
+			     clkname);
+			printf("Falling back to slow %s calibration.\n",
+			    clkname);
+			freq = (double)(tc->tc_frequency) * clk1 / t1;
+			break;
+		}
+
+		/* Precompute to save on divisions later. */
+		inv_n = 1.0 / n;
+
+		/* Update mean and variance of recorded target clock values. */
+		d1 = clk1 - mu_clk;
+		mu_clk += d1 * inv_n;
+		d2 = d1 * (clk1 - mu_clk);
+		va_clk += (d2 - va_clk) * inv_n;
+
+		/* Update mean and variance of recorded time values. */
+		d1 = t1 - mu_t;
+		mu_t += d1 * inv_n;
+		d2 = d1 * (t1 - mu_t);
+		va_t += (d2 - va_t) * inv_n;
+
+		/* Update covariance; d1 is still the reference clock delta. */
+		d2 = d1 * (clk1 - mu_clk);
+		cva += (d2 - cva) * inv_n;
+
+		/*
+		 * Count low-uncertainty iterations.  This is a rearrangement
+		 * of "relative uncertainty < 1 PPM" avoiding division.
+		 */
+#define TSC_PPM_UNCERTAINTY	1
+#define TSC_UNCERTAINTY		(TSC_PPM_UNCERTAINTY * 0.000001)
+#define TSC_UNCERTAINTY_SQR	(TSC_UNCERTAINTY * TSC_UNCERTAINTY)
+		if (TSC_UNCERTAINTY_SQR * (n - 2) * cva * cva >
+		    (va_t + 4) * (va_clk + 4) - cva * cva)
+			passes++;
+		else
+			passes = 0;
+
+		/* Break if we're consistently certain. */
+		if (passes * 2 > n) {
+			freq = (double)(tc->tc_frequency) * cva / va_t;
+			if (bootverbose)
+				printf("Statistical %s calibration took"
+				    " %lu us and %lu data points\n",
+				    clkname, (unsigned long)(t1 *
+					1000000.0 / tc->tc_frequency),
+				    (unsigned long)n);
+			break;
+		}
+
+		/*
+		 * Add variable delay to avoid theoretical risk of aliasing
+		 * resulting from this loop synchronizing with the frequency
+		 * of the reference clock.  On the nth iteration, we spend
+		 * O(1 / n) time here -- long enough to avoid aliasing, but
+		 * short enough to be insignificant as n grows.
+		 */
+		clk_delay = clk() + (clk() - clk0) / (n * n);
+		while (clk() < clk_delay)
+			cpu_spinwait(); /* Do nothing. */
+	}
+	TSEXIT();
+	return (freq);
+}
diff --git a/sys/sys/timetc.h b/sys/sys/timetc.h
index 55f61af4c46c..811cc1af461a 100644
--- a/sys/sys/timetc.h
+++ b/sys/sys/timetc.h
@@ -96,4 +96,11 @@ void	cpu_tick_calibration(void);
 SYSCTL_DECL(_kern_timecounter);
 #endif
 
+/**
+ * clockcalib(clk, clkname):
+ * Return the frequency of clk(), calibrated against the current best-available
+ * timecounter.  Uses floating point; clkname is only used in printed messages.
+ */
+uint64_t clockcalib(uint64_t (*)(void), const char *);
+
 #endif /* !_SYS_TIMETC_H_ */
diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c
index 23b780b121e1..4b66d10cb5ae 100644
--- a/sys/x86/x86/local_apic.c
+++ b/sys/x86/x86/local_apic.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/timeet.h>
+#include <sys/timetc.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
@@ -64,6 +65,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/clock.h>
 #include <machine/cpufunc.h>
 #include <machine/cputypes.h>
+#include <machine/fpu.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
@@ -1000,30 +1002,39 @@ native_lapic_disable_pmc(void)
 #endif
 }
 
+static uint64_t
+cb_lapic_getcount(void)
+{
+	/* clockcalib() callback: max count minus the current-count register. */
+	return (APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER));
+}
+
 static void
 lapic_calibrate_initcount(struct lapic *la)
 {
-	u_long value;
+	uint64_t freq;
+
+	/* Measure the timer rate at divisor 2; clockcalib() needs the FPU. */
+	lapic_timer_set_divisor(2);
+	lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT);
+	fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);
+	freq = clockcalib(cb_lapic_getcount, "lapic");
+	fpu_kern_leave(curthread, NULL);
 
-	/* Start off with a divisor of 2 (power on reset default). */
+	/* Pick the first divisor whose tick rate is below the max count. */
 	lapic_timer_divisor = 2;
-	/* Try to calibrate the local APIC timer. */
 	do {
-		lapic_timer_set_divisor(lapic_timer_divisor);
-		lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT);
-		DELAY(1000000);
-		value = APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER);
-		if (value != APIC_TIMER_MAX_COUNT)
+		if (freq * 2 / lapic_timer_divisor < APIC_TIMER_MAX_COUNT)
 			break;
 		lapic_timer_divisor <<= 1;
 	} while (lapic_timer_divisor <= 128);
 	if (lapic_timer_divisor > 128)
 		panic("lapic: Divisor too big");
+	count_freq = freq * 2 / lapic_timer_divisor;
 	if (bootverbose) {
 		printf("lapic: Divisor %lu, Frequency %lu Hz\n",
-		    lapic_timer_divisor, value);
+		    lapic_timer_divisor, count_freq);
 	}
-	count_freq = value;
 }
 
 static void
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index 3b2e044573f1..fa21d9c51fcb 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/vdso.h>
 #include <machine/clock.h>
 #include <machine/cputypes.h>
+#include <machine/fpu.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <x86/vmware.h>
@@ -703,53 +704,18 @@ tsc_update_freq(uint64_t new_freq)
 void
 tsc_calibrate(void)
 {
-	struct timecounter *tc;
-	uint64_t freq, tsc_start, tsc_end;
-	u_int t_start, t_end;
-	register_t flags;
-	int cpu;
+	uint64_t freq;
 
 	if (tsc_disabled)
 		return;
 	if (tsc_early_calib_exact)
 		goto calibrated;
 
-	/*
-	 * Avoid using a low-quality timecounter to re-calibrate.  In
-	 * particular, old 32-bit platforms might only have the 8254 timer to
-	 * calibrate against.
-	 */
-	tc = atomic_load_ptr(&timecounter);
-	if (tc->tc_quality <= 0)
-		goto calibrated;
-
-	flags = intr_disable();
-	cpu = curcpu;
-	tsc_start = rdtsc_ordered();
-	t_start = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
-	intr_restore(flags);
-
-	DELAY(1000000);
-
-	thread_lock(curthread);
-	sched_bind(curthread, cpu);
-
-	flags = intr_disable();
-	tsc_end = rdtsc_ordered();
-	t_end = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
-	intr_restore(flags);
-
-	sched_unbind(curthread);
-	thread_unlock(curthread);
-
-	if (t_end <= t_start) {
-		/* Assume that the counter has wrapped around at most once. */
-		t_end += (uint64_t)tc->tc_counter_mask + 1;
-	}
-
-	freq = tc->tc_frequency * (tsc_end - tsc_start) / (t_end - t_start);
-
+	fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);
+	freq = clockcalib(rdtsc_ordered, "TSC");
+	fpu_kern_leave(curthread, NULL);
 	tsc_update_freq(freq);
+
 calibrated:
 	tc_init(&tsc_timecounter);
 	set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);