git: 03f868b163ad - main - x86: Add a required store-load barrier in cpu_idle()
Date: Thu, 14 Jul 2022 14:47:37 UTC
The branch main has been updated by markj: URL: https://cgit.FreeBSD.org/src/commit/?id=03f868b163ad46d6f7cb03dc46fb83ca01fb8f69 commit 03f868b163ad46d6f7cb03dc46fb83ca01fb8f69 Author: Mark Johnston <markj@FreeBSD.org> AuthorDate: 2022-07-14 14:24:25 +0000 Commit: Mark Johnston <markj@FreeBSD.org> CommitDate: 2022-07-14 14:28:01 +0000 x86: Add a required store-load barrier in cpu_idle() ULE's tdq_notify() tries to avoid delivering IPIs to the idle thread. In particular, it tries to detect whether the idle thread is running. There are two mechanisms for this: - tdq_cpu_idle, an MI flag which is set prior to calling cpu_idle(). If tdq_cpu_idle == 0, then no IPI is needed; - idle_state, an x86-specific state flag which is updated after cpu_idleclock() is called. The implementation of the second mechanism is racy; the race can cause a CPU to go to sleep with pending work. Specifically, cpu_idle_*() set idle_state = STATE_SLEEPING, then check for pending work by loading the tdq_load field of the CPU's runqueue. These operations can be reordered so that the idle thread observes tdq_load == 0, and tdq_notify() observes idle_state == STATE_RUNNING. Some counters indicate that the idle_state check in tdq_notify() frequently elides an IPI. So, fix the problem by inserting a fence after the store to idle_state, immediately before idling the CPU. PR: 264867 Reviewed by: mav, kib, jhb MFC after: 1 month Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D35777 --- sys/x86/x86/cpu_machdep.c | 103 ++++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c index fa11f64e2779..040438043c73 100644 --- a/sys/x86/x86/cpu_machdep.c +++ b/sys/x86/x86/cpu_machdep.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_platform.h" +#include "opt_sched.h" #ifdef __i386__ #include "opt_apic.h" #endif @@ -532,32 +533,25 @@ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); -static void -cpu_idle_acpi(sbintime_t sbt) +static bool +cpu_idle_enter(int *statep, int newstate) { - int *state; + KASSERT(atomic_load_int(statep) == STATE_RUNNING, + ("%s: state %d", __func__, atomic_load_int(statep))); - state = &PCPU_PTR(monitorbuf)->idle_state; - atomic_store_int(state, STATE_SLEEPING); - - /* See comments in cpu_idle_hlt(). */ - disable_intr(); - if (sched_runnable()) - enable_intr(); - else if (cpu_idle_hook) - cpu_idle_hook(sbt); - else - acpi_cpu_c1(); - atomic_store_int(state, STATE_RUNNING); -} - -static void -cpu_idle_hlt(sbintime_t sbt) -{ - int *state; - - state = &PCPU_PTR(monitorbuf)->idle_state; - atomic_store_int(state, STATE_SLEEPING); + /* + * A fence is needed to prevent reordering of the load in + * sched_runnable() with this store to the idle state word. Without it, + * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having + * added load to the queue, and elide an IPI. Then, sched_runnable() + * can observe tdq_load == 0, so the CPU ends up idling with pending + * work. tdq_notify() similarly ensures that a prior update to tdq_load + * is visible before calling cpu_idle_wakeup(). + */ + atomic_store_int(statep, newstate); +#if defined(SCHED_ULE) && defined(SMP) + atomic_thread_fence_seq_cst(); +#endif /* * Since we may be in a critical section from cpu_idle(), if @@ -576,35 +570,62 @@ cpu_idle_hlt(sbintime_t sbt) * interrupt. */ disable_intr(); - if (sched_runnable()) + if (sched_runnable()) { enable_intr(); - else - acpi_cpu_c1(); - atomic_store_int(state, STATE_RUNNING); + atomic_store_int(statep, STATE_RUNNING); + return (false); + } else { + return (true); + } } static void -cpu_idle_mwait(sbintime_t sbt) +cpu_idle_exit(int *statep) +{ + atomic_store_int(statep, STATE_RUNNING); +} + +static void +cpu_idle_acpi(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; - atomic_store_int(state, STATE_MWAIT); + if (cpu_idle_enter(state, STATE_SLEEPING)) { + if (cpu_idle_hook) + cpu_idle_hook(sbt); + else + acpi_cpu_c1(); + cpu_idle_exit(state); + } +} - /* See comments in cpu_idle_hlt(). */ - disable_intr(); - if (sched_runnable()) { +static void +cpu_idle_hlt(sbintime_t sbt) +{ + int *state; + + state = &PCPU_PTR(monitorbuf)->idle_state; + if (cpu_idle_enter(state, STATE_SLEEPING)) { + acpi_cpu_c1(); atomic_store_int(state, STATE_RUNNING); - enable_intr(); - return; } +} - cpu_monitor(state, 0, 0); - if (atomic_load_int(state) == STATE_MWAIT) - __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); - else - enable_intr(); - atomic_store_int(state, STATE_RUNNING); +static void +cpu_idle_mwait(sbintime_t sbt) +{ + int *state; + + state = &PCPU_PTR(monitorbuf)->idle_state; + if (cpu_idle_enter(state, STATE_MWAIT)) { + cpu_monitor(state, 0, 0); + if (atomic_load_int(state) == STATE_MWAIT) + __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); + else + enable_intr(); + cpu_idle_exit(state); + } } static void