git: 03f868b163ad - main - x86: Add a required store-load barrier in cpu_idle()

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Thu, 14 Jul 2022 14:47:37 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=03f868b163ad46d6f7cb03dc46fb83ca01fb8f69

commit 03f868b163ad46d6f7cb03dc46fb83ca01fb8f69
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2022-07-14 14:24:25 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2022-07-14 14:28:01 +0000

    x86: Add a required store-load barrier in cpu_idle()
    
    ULE's tdq_notify() tries to avoid delivering IPIs to the idle thread.
    In particular, it tries to detect whether the idle thread is running.
    There are two mechanisms for this:
    - tdq_cpu_idle, an MI flag which is set prior to calling cpu_idle().  If
      tdq_cpu_idle == 0, then no IPI is needed;
    - idle_state, an x86-specific state flag which is updated after
      cpu_idleclock() is called.
    
    The implementation of the second mechanism is racy; the race can cause a
    CPU to go to sleep with pending work.  Specifically, cpu_idle_*() set
    idle_state = STATE_SLEEPING, then check for pending work by loading the
    tdq_load field of the CPU's runqueue.  These operations can be reordered
    so that the idle thread observes tdq_load == 0, and tdq_notify()
    observes idle_state == STATE_RUNNING.
    
    Some counters indicate that the idle_state check in tdq_notify()
    frequently elides an IPI.  So, fix the problem by inserting a fence
    after the store to idle_state, immediately before idling the CPU.
    
    PR:             264867
    Reviewed by:    mav, kib, jhb
    MFC after:      1 month
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D35777
---
 sys/x86/x86/cpu_machdep.c | 103 ++++++++++++++++++++++++++++------------------
 1 file changed, 62 insertions(+), 41 deletions(-)

diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
index fa11f64e2779..040438043c73 100644
--- a/sys/x86/x86/cpu_machdep.c
+++ b/sys/x86/x86/cpu_machdep.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_platform.h"
+#include "opt_sched.h"
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
@@ -532,32 +533,25 @@ static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
-static void
-cpu_idle_acpi(sbintime_t sbt)
+static bool
+cpu_idle_enter(int *statep, int newstate)
 {
-	int *state;
+	KASSERT(atomic_load_int(statep) == STATE_RUNNING,
+	    ("%s: state %d", __func__, atomic_load_int(statep)));
 
-	state = &PCPU_PTR(monitorbuf)->idle_state;
-	atomic_store_int(state, STATE_SLEEPING);
-
-	/* See comments in cpu_idle_hlt(). */
-	disable_intr();
-	if (sched_runnable())
-		enable_intr();
-	else if (cpu_idle_hook)
-		cpu_idle_hook(sbt);
-	else
-		acpi_cpu_c1();
-	atomic_store_int(state, STATE_RUNNING);
-}
-
-static void
-cpu_idle_hlt(sbintime_t sbt)
-{
-	int *state;
-
-	state = &PCPU_PTR(monitorbuf)->idle_state;
-	atomic_store_int(state, STATE_SLEEPING);
+	/*
+	 * A fence is needed to prevent reordering of the load in
+	 * sched_runnable() with this store to the idle state word.  Without it,
+	 * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having
+	 * added load to the queue, and elide an IPI.  Then, sched_runnable()
+	 * can observe tdq_load == 0, so the CPU ends up idling with pending
+	 * work.  tdq_notify() similarly ensures that a prior update to tdq_load
+	 * is visible before calling cpu_idle_wakeup().
+	 */
+	atomic_store_int(statep, newstate);
+#if defined(SCHED_ULE) && defined(SMP)
+	atomic_thread_fence_seq_cst();
+#endif
 
 	/*
 	 * Since we may be in a critical section from cpu_idle(), if
@@ -576,35 +570,62 @@ cpu_idle_hlt(sbintime_t sbt)
 	 * interrupt.
 	 */
 	disable_intr();
-	if (sched_runnable())
+	if (sched_runnable()) {
 		enable_intr();
-	else
-		acpi_cpu_c1();
-	atomic_store_int(state, STATE_RUNNING);
+		atomic_store_int(statep, STATE_RUNNING);
+		return (false);
+	} else {
+		return (true);
+	}
 }
 
 static void
-cpu_idle_mwait(sbintime_t sbt)
+cpu_idle_exit(int *statep)
+{
+	atomic_store_int(statep, STATE_RUNNING);
+}
+
+static void
+cpu_idle_acpi(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
-	atomic_store_int(state, STATE_MWAIT);
+	if (cpu_idle_enter(state, STATE_SLEEPING)) {
+		if (cpu_idle_hook)
+			cpu_idle_hook(sbt);
+		else
+			acpi_cpu_c1();
+		cpu_idle_exit(state);
+	}
+}
 
-	/* See comments in cpu_idle_hlt(). */
-	disable_intr();
-	if (sched_runnable()) {
+static void
+cpu_idle_hlt(sbintime_t sbt)
+{
+	int *state;
+
+	state = &PCPU_PTR(monitorbuf)->idle_state;
+	if (cpu_idle_enter(state, STATE_SLEEPING)) {
+		acpi_cpu_c1();
 		atomic_store_int(state, STATE_RUNNING);
-		enable_intr();
-		return;
 	}
+}
 
-	cpu_monitor(state, 0, 0);
-	if (atomic_load_int(state) == STATE_MWAIT)
-		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
-	else
-		enable_intr();
-	atomic_store_int(state, STATE_RUNNING);
+static void
+cpu_idle_mwait(sbintime_t sbt)
+{
+	int *state;
+
+	state = &PCPU_PTR(monitorbuf)->idle_state;
+	if (cpu_idle_enter(state, STATE_MWAIT)) {
+		cpu_monitor(state, 0, 0);
+		if (atomic_load_int(state) == STATE_MWAIT)
+			__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+		else
+			enable_intr();
+		cpu_idle_exit(state);
+	}
 }
 
 static void