git: 22875f88799e - main - x86: Implement deferred TSC calibration
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Mon, 15 Nov 2021 21:13:30 UTC
The branch main has been updated by markj: URL: https://cgit.FreeBSD.org/src/commit/?id=22875f88799e1684febf79b5049541e0f825aaa1 commit 22875f88799e1684febf79b5049541e0f825aaa1 Author: Mark Johnston <markj@FreeBSD.org> AuthorDate: 2021-11-15 20:31:21 +0000 Commit: Mark Johnston <markj@FreeBSD.org> CommitDate: 2021-11-15 21:13:24 +0000 x86: Implement deferred TSC calibration There is no universal way to find the TSC frequency. Newer Intel CPUs may report it via CPUID leaves 0x15 and 0x16. Sometimes it can be obtained from the PLATFORM_INFO MSR as well, though we never use that. On older platforms we derive the frequency using a DELAY(1000000) call, which uses the 8254 PIT. On some newer platforms the 8254 is apparently non-functional, leading to bogus calibration results. On such platforms the TSC frequency must be available from CPUID. It is also possible to disable calibration with a tunable, in which case we try to parse the brand string if the TSC freq is not available from CPUID. CPUID 0x15 provides an authoritative TSC frequency value, but even that is not always available on new Intel platforms. CPUID 0x16 provides the specified processor base frequency, which is not the same as the TSC frequency. Empirically, it is close enough for early boot, but too far off for timekeeping: on a Comet Lake NUC, CPUID 0x16 yields 1600MHz but the TSC frequency is roughly 1608MHz, leading to frequent clock stepping when NTP is in use. Thus we have a situation where we cannot calibrate using the PIT and cannot obtain a precise frequency from CPUID (or MSRs). This change seeks to address that by using the CPUID 0x16 value during early boot and refining the calibration later once ACPI-based timecounters are available. TSC frequency detection is thus split into two phases: Early phase: - On Intel platforms, query CPUID 0x15 and 0x16 and use that value initially if available. - Otherwise, get an estimate using the PIT, reducing the delay loop to 100ms from 1s. 
- Continue to register the TSC as the CPU ticks provider early, even though the frequency may be off. Otherwise any code executed during boot that uses cpu_ticks() (e.g., context switching) gets tripped up when the ticks provider changes. Later phase: - In SI_SUB_CLOCKS, once the timehands are initialized, load the current TSC and timecounter (sbinuptime()) values at the beginning and end of a 1s interval and use the timecounter frequency (typically from kvmclock, HPET or the ACPI PM timer) to estimate the TSC frequency. - Update the TSC timecounter, global tsc_freq and CPU ticker with the new frequency and finally register the TSC as a timecounter. Reviewed by: kib, jhb (previous version) Discussed with: imp, cperciva MFC after: 6 weeks Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D32512 --- sys/x86/x86/local_apic.c | 7 +- sys/x86/x86/tsc.c | 202 +++++++++++++++++++++++++++++++---------------- 2 files changed, 141 insertions(+), 68 deletions(-) diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c index 2cd2e0c23e11..0def21687bbf 100644 --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -1013,7 +1013,12 @@ lapic_change_mode(struct eventtimer *et, struct lapic *la, enum lat_timer_mode newmode) { - if (la->la_timer_mode == newmode) + /* + * The TSC frequency may change during late calibration against other + * timecounters (HPET or ACPI PMTimer). 
+ */ + if (la->la_timer_mode == newmode && + (newmode != LAT_MODE_DEADLINE || et->et_frequency == tsc_freq)) return; switch (newmode) { case LAT_MODE_PERIODIC: diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c index a6c7ec7a8307..bba5e352b031 100644 --- a/sys/x86/x86/tsc.c +++ b/sys/x86/x86/tsc.c @@ -32,12 +32,14 @@ __FBSDID("$FreeBSD$"); #include "opt_clock.h" #include <sys/param.h> +#include <sys/systm.h> #include <sys/bus.h> #include <sys/cpu.h> #include <sys/eventhandler.h> #include <sys/limits.h> #include <sys/malloc.h> -#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/sched.h> #include <sys/sysctl.h> #include <sys/time.h> #include <sys/timetc.h> @@ -84,7 +86,7 @@ SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0, static int tsc_skip_calibration; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN, &tsc_skip_calibration, 0, - "Disable TSC frequency calibration"); + "Disable early TSC frequency calibration"); static void tsc_freq_changed(void *arg, const struct cf_level *level, int status); @@ -134,14 +136,11 @@ tsc_freq_vmware(void) } /* - * Calculate TSC frequency using information from the CPUID leaf 0x15 - * 'Time Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15 - * is not functional, as it is on Skylake/Kabylake, try 0x16 'Processor - * Frequency Information'. Leaf 0x16 is described in the SDM as - * informational only, but if 0x15 did not work, and TSC calibration - * is disabled, it is the best we can get at all. It should still be - * an improvement over the parsing of the CPU model name in - * tsc_freq_intel(), when available. + * Calculate TSC frequency using information from the CPUID leaf 0x15 'Time + * Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15 is not + * functional, as it is on Skylake/Kabylake, try 0x16 'Processor Frequency + * Information'. Leaf 0x16 is described in the SDM as informational only, but + * we can use this value until late calibration is complete. 
*/ static bool tsc_freq_cpuid(uint64_t *res) @@ -167,8 +166,8 @@ tsc_freq_cpuid(uint64_t *res) return (false); } -static void -tsc_freq_intel(void) +static bool +tsc_freq_intel_brand(uint64_t *res) { char brand[48]; u_int regs[4]; @@ -205,7 +204,7 @@ tsc_freq_intel(void) i = 1000000; break; default: - return; + return (false); } #define C2D(c) ((c) - '0') if (p[1] == '.') { @@ -221,17 +220,39 @@ tsc_freq_intel(void) freq *= i * 1000000; } #undef C2D - tsc_freq = freq; + *res = freq; + return (true); } } + return (false); } static void -probe_tsc_freq(void) +tsc_freq_8254(uint64_t *res) { - uint64_t tmp_freq, tsc1, tsc2; - int no_cpuid_override; + uint64_t tsc1, tsc2; + int64_t overhead; + int count, i; + + overhead = 0; + for (i = 0, count = 8; i < count; i++) { + tsc1 = rdtsc_ordered(); + DELAY(0); + tsc2 = rdtsc_ordered(); + if (i > 0) + overhead += tsc2 - tsc1; + } + overhead /= count; + + tsc1 = rdtsc_ordered(); + DELAY(100000); + tsc2 = rdtsc_ordered(); + tsc_freq = (tsc2 - tsc1 - overhead) * 10; +} +static void +probe_tsc_freq(void) +{ if (cpu_power_ecx & CPUID_PERF_STAT) { /* * XXX Some emulators expose host CPUID without actual support @@ -287,50 +308,44 @@ probe_tsc_freq(void) break; } - if (tsc_skip_calibration) { - if (tsc_freq_cpuid(&tmp_freq)) - tsc_freq = tmp_freq; - else if (cpu_vendor_id == CPU_VENDOR_INTEL) - tsc_freq_intel(); - if (tsc_freq == 0) - tsc_disabled = 1; - } else { + if (tsc_freq_cpuid(&tsc_freq)) { + /* + * If possible, use the value obtained from CPUID as the initial + * frequency. This will be refined later during boot but is + * good enough for now. The 8254 PIT is not functional on some + * newer platforms anyway, so don't delay our boot for what + * might be a garbage result. Late calibration is required if + * the initial frequency was obtained from CPUID.16H, as the + * derived value may be off by as much as 1%. + */ if (bootverbose) - printf("Calibrating TSC clock ... 
"); - tsc1 = rdtsc(); - DELAY(1000000); - tsc2 = rdtsc(); - tsc_freq = tsc2 - tsc1; - + printf("Early TSC frequency %juHz derived from CPUID\n", + (uintmax_t)tsc_freq); + } else if (tsc_skip_calibration) { /* - * If the difference between calibrated frequency and - * the frequency reported by CPUID 0x15/0x16 leafs - * differ significantly, this probably means that - * calibration is bogus. It happens on machines - * without 8254 timer. The BIOS rarely properly - * reports it in FADT boot flags, so just compare the - * frequencies directly. + * Try to parse the brand string to obtain the nominal TSC + * frequency. */ - if (tsc_freq_cpuid(&tmp_freq) && qabs(tsc_freq - tmp_freq) > - uqmin(tsc_freq, tmp_freq)) { - no_cpuid_override = 0; - TUNABLE_INT_FETCH("machdep.disable_tsc_cpuid_override", - &no_cpuid_override); - if (!no_cpuid_override) { - if (bootverbose) { - printf( - "TSC clock: calibration freq %ju Hz, CPUID freq %ju Hz%s\n", - (uintmax_t)tsc_freq, - (uintmax_t)tmp_freq, - no_cpuid_override ? "" : - ", doing CPUID override"); - } - tsc_freq = tmp_freq; - } + if (cpu_vendor_id == CPU_VENDOR_INTEL && + tsc_freq_intel_brand(&tsc_freq)) { + if (bootverbose) + printf( + "Early TSC frequency %juHz derived from brand string\n", + (uintmax_t)tsc_freq); + } else { + tsc_disabled = 1; } + } else { + /* + * Calibrate against the 8254 PIT. This estimate will be + * refined later in tsc_calib(). + */ + tsc_freq_8254(&tsc_freq); + if (bootverbose) + printf( + "Early TSC frequency %juHz calibrated from 8254 PIT\n", + (uintmax_t)tsc_freq); } - if (bootverbose) - printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq); } void @@ -372,13 +387,18 @@ init_TSC(void) break; } #endif - + probe_tsc_freq(); /* * Inform CPU accounting about our boot-time clock rate. This will * be updated if someone loads a cpufreq driver after boot that * discovers a new max frequency. 
+ * + * The frequency may also be updated after late calibration is complete; + * however, we register the TSC as the ticker now to avoid switching + * counters after much of the kernel has already booted and potentially + * sampled the CPU clock. */ if (tsc_freq != 0) set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); @@ -656,11 +676,65 @@ init_TSC_tc(void) if (tsc_freq != 0) { tsc_timecounter.tc_frequency = tsc_freq >> shift; tsc_timecounter.tc_priv = (void *)(intptr_t)shift; - tc_init(&tsc_timecounter); + + /* + * Timecounter registration is deferred until after late + * calibration is finished. + */ } } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); +static void +tsc_update_freq(uint64_t new_freq) +{ + atomic_store_rel_64(&tsc_freq, new_freq); + atomic_store_rel_64(&tsc_timecounter.tc_frequency, + new_freq >> (int)(intptr_t)tsc_timecounter.tc_priv); +} + +/* + * Perform late calibration of the TSC frequency once ACPI-based timecounters + * are available. + */ +static void +tsc_calib(void *arg __unused) +{ + sbintime_t t_start, t_end; + uint64_t freq_khz, tsc_start, tsc_end; + register_t flags; + int cpu; + + if (tsc_disabled) + return; + + flags = intr_disable(); + cpu = curcpu; + tsc_start = rdtsc_ordered(); + t_start = sbinuptime(); + intr_restore(flags); + + DELAY(1000000); + + thread_lock(curthread); + sched_bind(curthread, cpu); + + flags = intr_disable(); + tsc_end = rdtsc_ordered(); + t_end = sbinuptime(); + intr_restore(flags); + + sched_unbind(curthread); + thread_unlock(curthread); + + freq_khz = (SBT_1S / 1024) * (tsc_end - tsc_start) / (t_end - t_start); + + tsc_update_freq(freq_khz * 1024); + tc_init(&tsc_timecounter); + set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); +} +SYSINIT(tsc_calib, SI_SUB_CLOCKS + 1, SI_ORDER_ANY, tsc_calib, NULL); + void resume_TSC(void) { @@ -752,9 +826,7 @@ tsc_freq_changed(void *arg, const struct cf_level *level, int status) /* Total setting for this level gives the new frequency in MHz. 
*/ freq = (uint64_t)level->total_set.freq * 1000000; - atomic_store_rel_64(&tsc_freq, freq); - tsc_timecounter.tc_frequency = - freq >> (int)(intptr_t)tsc_timecounter.tc_priv; + tsc_update_freq(freq); } static int @@ -767,14 +839,10 @@ sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS) if (freq == 0) return (EOPNOTSUPP); error = sysctl_handle_64(oidp, &freq, 0, req); - if (error == 0 && req->newptr != NULL) { - atomic_store_rel_64(&tsc_freq, freq); - atomic_store_rel_64(&tsc_timecounter.tc_frequency, - freq >> (int)(intptr_t)tsc_timecounter.tc_priv); - } + if (error == 0 && req->newptr != NULL) + tsc_update_freq(freq); return (error); } - SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_machdep_tsc_freq, "QU",