git: 1820ca215461 - stable/12 - MFC r368649 / 3fd989da by kib: amd64 pmap: fix PCID mode invalidations

From: Andriy Gapon <avg_at_FreeBSD.org>
Date: Tue, 14 Dec 2021 14:45:14 UTC
The branch stable/12 has been updated by avg:

URL: https://cgit.FreeBSD.org/src/commit/?id=1820ca2154611d6f27ce5a5fdd561a16ac54fdd8

commit 1820ca2154611d6f27ce5a5fdd561a16ac54fdd8
Author:     Andriy Gapon <avg@FreeBSD.org>
AuthorDate: 2021-12-14 14:43:29 +0000
Commit:     Andriy Gapon <avg@FreeBSD.org>
CommitDate: 2021-12-14 14:43:29 +0000

    MFC r368649 / 3fd989da by kib: amd64 pmap: fix PCID mode invalidations
    
    r368649 fixed a regression in r362031 that was MFC-ed to stable/12 as
    a part of r362572.  That commit reordered IPI send and local TLB flush in
    TLB invalidations.
    
    Without this fix we've been seeing problems with stale memory content
    where changes done under a mutex were not immediately observed by
    another thread after taking the same mutex.  Those inconsistenices were
    correlated to copy-on-write faults for pages contaning the data.
    
    The change needed some adaptations as I elected to skip two significant
    intermediate changes:
    - r363195 / dc43978a, amd64: allow parallel shootdown IPIs
    - r363311 / 3ec7e169, amd64 pmap: microoptimize local shootdowns for
      PCID PTI configurations
    
    Reviewed by:    kib
    Differential Revision:  https://reviews.freebsd.org/D33413
---
 sys/amd64/amd64/pmap.c | 273 ++++++++++++++++++++++++-------------------------
 sys/x86/x86/mp_x86.c   |  32 ++++--
 2 files changed, 153 insertions(+), 152 deletions(-)

diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index c6f56c1cd73b..ee9777c788c1 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -2329,45 +2329,19 @@ pmap_invalidate_ept(pmap_t pmap)
 static cpuset_t
 pmap_invalidate_cpu_mask(pmap_t pmap)
 {
-
 	return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
 }
 
 static inline void
-pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
-    const bool invpcid_works1)
+pmap_invalidate_preipi_pcid(pmap_t pmap)
 {
-	struct invpcid_descr d;
-	uint64_t kcr3, ucr3;
-	uint32_t pcid;
 	u_int cpuid, i;
 
+	sched_pin();
+
 	cpuid = PCPU_GET(cpuid);
-	if (pmap == PCPU_GET(curpmap)) {
-		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-			/*
-			 * Because pm_pcid is recalculated on a
-			 * context switch, we must disable switching.
-			 * Otherwise, we might use a stale value
-			 * below.
-			 */
-			critical_enter();
-			pcid = pmap->pm_pcids[cpuid].pm_pcid;
-			if (invpcid_works1) {
-				d.pcid = pcid | PMAP_PCID_USER_PT;
-				d.pad = 0;
-				d.addr = va;
-				invpcid(&d, INVPCID_ADDR);
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
-				ucr3 = pmap->pm_ucr3 | pcid |
-				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
-				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
-			}
-			critical_exit();
-		}
-	} else
-		pmap->pm_pcids[cpuid].pm_gen = 0;
+	if (pmap != PCPU_GET(curpmap))
+		cpuid = 0xffffffff;	/* An impossible value */
 
 	CPU_FOREACH(i) {
 		if (cpuid != i)
@@ -2388,52 +2362,96 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
 }
 
 static void
-pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
 {
+	sched_pin();
+}
 
-	pmap_invalidate_page_pcid(pmap, va, true);
+DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t), static)
+{
+	return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid :
+	    pmap_invalidate_preipi_nopcid);
+}
+
+static inline void
+pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
+    const bool invpcid_works1)
+{
+	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
+	u_int cpuid;
+
+	/*
+	 * Because pm_pcid is recalculated on a context switch, we
+	 * must ensure there is no preemption, not just pinning.
+	 * Otherwise, we might use a stale value below.
+	 */
+	CRITICAL_ASSERT(curthread);
+
+	/*
+	 * No need to do anything with user page tables invalidation
+	 * if there is no user page table.
+	 */
+	if (pmap->pm_ucr3 == PMAP_NO_CR3)
+		return;
+
+	cpuid = PCPU_GET(cpuid);
+
+	pcid = pmap->pm_pcids[cpuid].pm_pcid;
+	if (invpcid_works1) {
+		d.pcid = pcid | PMAP_PCID_USER_PT;
+		d.pad = 0;
+		d.addr = va;
+		invpcid(&d, INVPCID_ADDR);
+	} else {
+		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+		pmap_pti_pcid_invlpg(ucr3, kcr3, va);
+	}
 }
 
 static void
-pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
 {
+	pmap_invalidate_page_pcid_cb(pmap, va, true);
+}
 
-	pmap_invalidate_page_pcid(pmap, va, false);
+static void
+pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
+{
+	pmap_invalidate_page_pcid_cb(pmap, va, false);
 }
 
 static void
-pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
 {
 }
 
-DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t),
+DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t),
     static)
 {
-
 	if (pmap_pcid_enabled)
-		return (invpcid_works ? pmap_invalidate_page_pcid_invpcid :
-		    pmap_invalidate_page_pcid_noinvpcid);
-	return (pmap_invalidate_page_nopcid);
+		return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
+		    pmap_invalidate_page_pcid_noinvpcid_cb);
+	return (pmap_invalidate_page_nopcid_cb);
 }
 
 static void
 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
     vm_offset_t addr2 __unused)
 {
-
 	if (pmap == kernel_pmap) {
 		invlpg(va);
-	} else {
-		if (pmap == PCPU_GET(curpmap))
-			invlpg(va);
-		pmap_invalidate_page_mode(pmap, va);
+	} else if (pmap == PCPU_GET(curpmap)) {
+		invlpg(va);
+		pmap_invalidate_page_cb(pmap, va);
 	}
 }
 
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
-
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
@@ -2442,6 +2460,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
 
+	pmap_invalidate_preipi(pmap);
 	smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
 	    pmap_invalidate_page_curcpu_cb);
 }
@@ -2450,73 +2469,62 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
 
 static void
-pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     const bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
-	u_int cpuid, i;
+	u_int cpuid;
+
+	CRITICAL_ASSERT(curthread);
+
+	if (pmap != PCPU_GET(curpmap) ||
+	    pmap->pm_ucr3 == PMAP_NO_CR3)
+		return;
 
 	cpuid = PCPU_GET(cpuid);
-	if (pmap == PCPU_GET(curpmap)) {
-		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-			critical_enter();
-			pcid = pmap->pm_pcids[cpuid].pm_pcid;
-			if (invpcid_works1) {
-				d.pcid = pcid | PMAP_PCID_USER_PT;
-				d.pad = 0;
-				d.addr = sva;
-				for (; d.addr < eva; d.addr += PAGE_SIZE)
-					invpcid(&d, INVPCID_ADDR);
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
-				ucr3 = pmap->pm_ucr3 | pcid |
-				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
-				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
-			}
-			critical_exit();
-		}
-	} else
-		pmap->pm_pcids[cpuid].pm_gen = 0;
 
-	CPU_FOREACH(i) {
-		if (cpuid != i)
-			pmap->pm_pcids[i].pm_gen = 0;
+	pcid = pmap->pm_pcids[cpuid].pm_pcid;
+	if (invpcid_works1) {
+		d.pcid = pcid | PMAP_PCID_USER_PT;
+		d.pad = 0;
+		for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
+			invpcid(&d, INVPCID_ADDR);
+	} else {
+		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+		pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 	}
-	/* See the comment in pmap_invalidate_page_pcid(). */
-	atomic_thread_fence_seq_cst();
 }
 
 static void
-pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva,
+pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
-
-	pmap_invalidate_range_pcid(pmap, sva, eva, true);
+	pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
 }
 
 static void
-pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva,
+pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
-
-	pmap_invalidate_range_pcid(pmap, sva, eva, false);
+	pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
 }
 
 static void
-pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
+    vm_offset_t eva __unused)
 {
 }
 
-DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t,
+DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
     vm_offset_t), static)
 {
-
 	if (pmap_pcid_enabled)
-		return (invpcid_works ? pmap_invalidate_range_pcid_invpcid :
-		    pmap_invalidate_range_pcid_noinvpcid);
-	return (pmap_invalidate_range_nopcid);
+		return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
+		    pmap_invalidate_range_pcid_noinvpcid_cb);
+	return (pmap_invalidate_range_nopcid_cb);
 }
 
 static void
@@ -2527,19 +2535,16 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 	if (pmap == kernel_pmap) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
-	} else {
-		if (pmap == PCPU_GET(curpmap)) {
-			for (addr = sva; addr < eva; addr += PAGE_SIZE)
-				invlpg(addr);
-		}
-		pmap_invalidate_range_mode(pmap, sva, eva);
+	} else if (pmap == PCPU_GET(curpmap)) {
+		for (addr = sva; addr < eva; addr += PAGE_SIZE)
+			invlpg(addr);
+		pmap_invalidate_range_cb(pmap, sva, eva);
 	}
 }
 
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
-
 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 		pmap_invalidate_all(pmap);
 		return;
@@ -2553,17 +2558,18 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
 
+	pmap_invalidate_preipi(pmap);
 	smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap,
 	    pmap_invalidate_range_curcpu_cb);
 }
 
 static inline void
-pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
+pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
-	u_int cpuid, i;
+	u_int cpuid;
 
 	if (pmap == kernel_pmap) {
 		if (invpcid_works1) {
@@ -2572,87 +2578,72 @@ pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
 		} else {
 			invltlb_glob();
 		}
-	} else {
+	} else if (pmap == PCPU_GET(curpmap)) {
+		CRITICAL_ASSERT(curthread);
 		cpuid = PCPU_GET(cpuid);
-		if (pmap == PCPU_GET(curpmap)) {
-			critical_enter();
-			pcid = pmap->pm_pcids[cpuid].pm_pcid;
-			if (invpcid_works1) {
-				d.pcid = pcid;
-				d.pad = 0;
-				d.addr = 0;
+
+		pcid = pmap->pm_pcids[cpuid].pm_pcid;
+		if (invpcid_works1) {
+			d.pcid = pcid;
+			d.pad = 0;
+			d.addr = 0;
+			invpcid(&d, INVPCID_CTX);
+			if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+				d.pcid |= PMAP_PCID_USER_PT;
 				invpcid(&d, INVPCID_CTX);
-				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-					d.pcid |= PMAP_PCID_USER_PT;
-					invpcid(&d, INVPCID_CTX);
-				}
+			}
+		} else {
+			kcr3 = pmap->pm_cr3 | pcid;
+			ucr3 = pmap->pm_ucr3;
+			if (ucr3 != PMAP_NO_CR3) {
+				ucr3 |= pcid | PMAP_PCID_USER_PT;
+				pmap_pti_pcid_invalidate(ucr3, kcr3);
 			} else {
-				kcr3 = pmap->pm_cr3 | pcid;
-				ucr3 = pmap->pm_ucr3;
-				if (ucr3 != PMAP_NO_CR3) {
-					ucr3 |= pcid | PMAP_PCID_USER_PT;
-					pmap_pti_pcid_invalidate(ucr3, kcr3);
-				} else {
-					load_cr3(kcr3);
-				}
+				load_cr3(kcr3);
 			}
-			critical_exit();
-		} else
-			pmap->pm_pcids[cpuid].pm_gen = 0;
-		CPU_FOREACH(i) {
-			if (cpuid != i)
-				pmap->pm_pcids[i].pm_gen = 0;
 		}
 	}
-	/* See the comment in pmap_invalidate_page_pcid(). */
-	atomic_thread_fence_seq_cst();
 }
 
 static void
-pmap_invalidate_all_pcid_invpcid(pmap_t pmap)
+pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
 {
-
-	pmap_invalidate_all_pcid(pmap, true);
+	pmap_invalidate_all_pcid_cb(pmap, true);
 }
 
 static void
-pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap)
+pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
 {
-
-	pmap_invalidate_all_pcid(pmap, false);
+	pmap_invalidate_all_pcid_cb(pmap, false);
 }
 
 static void
-pmap_invalidate_all_nopcid(pmap_t pmap)
+pmap_invalidate_all_nopcid_cb(pmap_t pmap)
 {
-
 	if (pmap == kernel_pmap)
 		invltlb_glob();
 	else if (pmap == PCPU_GET(curpmap))
 		invltlb();
 }
 
-DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static)
+DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t), static)
 {
-
 	if (pmap_pcid_enabled)
-		return (invpcid_works ? pmap_invalidate_all_pcid_invpcid :
-		    pmap_invalidate_all_pcid_noinvpcid);
-	return (pmap_invalidate_all_nopcid);
+		return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb :
+		    pmap_invalidate_all_pcid_noinvpcid_cb);
+	return (pmap_invalidate_all_nopcid_cb);
 }
 
 static void
 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
     vm_offset_t addr2 __unused)
 {
-
-	pmap_invalidate_all_mode(pmap);
+	pmap_invalidate_all_cb(pmap);
 }
 
 void
 pmap_invalidate_all(pmap_t pmap)
 {
-
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
@@ -2661,6 +2652,7 @@ pmap_invalidate_all(pmap_t pmap)
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
 
+	pmap_invalidate_preipi(pmap);
 	smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap,
 	    pmap_invalidate_all_curcpu_cb);
 }
@@ -2669,14 +2661,13 @@ static void
 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
     vm_offset_t addr2 __unused)
 {
-
 	wbinvd();
 }
 
 void
 pmap_invalidate_cache(void)
 {
-
+	sched_pin();
 	smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
 }
 
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 41dc36672135..2108d448bb9a 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1658,13 +1658,16 @@ volatile uint32_t smp_tlb_generation;
  * Used by pmap to request invalidation of TLB or cache on local and
  * remote processors.  Mask provides the set of remote CPUs which are
  * to be signalled with the IPI specified by vector.  The curcpu_cb
- * callback is invoked on the calling CPU while waiting for remote
- * CPUs to complete the operation.
+ * callback is invoked on the calling CPU in a critical section while
+ * waiting for remote CPUs to complete the operation.
  *
  * The callback function is called unconditionally on the caller's
  * underlying processor, even when this processor is not set in the
  * mask.  So, the callback function must be prepared to handle such
  * spurious invocations.
+ *
+ * This function must be called with the thread pinned, and it unpins on
+ * completion.
  */
 static void
 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
@@ -1679,23 +1682,21 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
 	 * It is not necessary to signal other CPUs while booting or
 	 * when in the debugger.
 	 */
-	if (kdb_active || panicstr != NULL || !smp_started) {
-		curcpu_cb(pmap, addr1, addr2);
-		return;
-	}
+	if (kdb_active || panicstr != NULL || !smp_started)
+		goto local_cb;
 
-	sched_pin();
+	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 
 	/*
 	 * Check for other cpus.  Return if none.
 	 */
 	if (CPU_ISFULLSET(&mask)) {
 		if (mp_ncpus <= 1)
-			goto nospinexit;
+			goto local_cb;
 	} else {
 		CPU_CLR(PCPU_GET(cpuid), &mask);
 		if (CPU_EMPTY(&mask))
-			goto nospinexit;
+			goto local_cb;
 	}
 
 	if (!(read_eflags() & PSL_I))
@@ -1727,13 +1728,22 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
 		while (*p_cpudone != generation)
 			ia32_pause();
 	}
-	mtx_unlock_spin(&smp_ipi_mtx);
+
+	/*
+	 * Unpin before unlocking smp_ipi_mtx.  If the thread owes
+	 * preemption, this allows scheduler to select thread on any
+	 * CPU from its cpuset.
+	 */
 	sched_unpin();
+	mtx_unlock_spin(&smp_ipi_mtx);
+
 	return;
 
-nospinexit:
+local_cb:
+	critical_enter();
 	curcpu_cb(pmap, addr1, addr2);
 	sched_unpin();
+	critical_exit();
 }
 
 void