git: 1820ca215461 - stable/12 - MFC r368649 / 3fd989da by kib: amd64 pmap: fix PCID mode invalidations
Date: Tue, 14 Dec 2021 14:45:14 UTC
The branch stable/12 has been updated by avg:

URL: https://cgit.FreeBSD.org/src/commit/?id=1820ca2154611d6f27ce5a5fdd561a16ac54fdd8

commit 1820ca2154611d6f27ce5a5fdd561a16ac54fdd8
Author:     Andriy Gapon <avg@FreeBSD.org>
AuthorDate: 2021-12-14 14:43:29 +0000
Commit:     Andriy Gapon <avg@FreeBSD.org>
CommitDate: 2021-12-14 14:43:29 +0000

    MFC r368649 / 3fd989da by kib: amd64 pmap: fix PCID mode invalidations

    r368649 fixed a regression introduced in r362031, which was MFC-ed to
    stable/12 as part of r362572.  That commit reordered the IPI send and
    the local TLB flush in TLB invalidations.

    Without this fix we have been seeing problems with stale memory
    content, where changes done under a mutex were not immediately
    observed by another thread after taking the same mutex.  Those
    inconsistencies were correlated with copy-on-write faults for the
    pages containing the data.

    The change needed some adaptation because I elected to skip two
    significant intermediate changes:
    - r363195 / dc43978a, amd64: allow parallel shootdown IPIs
    - r363311 / 3ec7e169, amd64 pmap: microoptimize local shootdowns for
      PCID PTI configurations

    Reviewed by:	kib
    Differential Revision:	https://reviews.freebsd.org/D33413
---
 sys/amd64/amd64/pmap.c | 273 ++++++++++++++++++++++++-------------------------
 sys/x86/x86/mp_x86.c   |  32 ++++--
 2 files changed, 153 insertions(+), 152 deletions(-)
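In outline, the fix restores this ordering: each pmap_invalidate_page/range/all() entry point first performs the PCID bookkeeping (pin the thread, zero the other CPUs' cached pm_gen, issue a sequentially consistent fence) and only then lets the shootdown IPI go out, with the local flush running as the curcpu callback. The sketch below is illustrative userspace C with print stubs standing in for the kernel primitives; the names echo the patch, but none of the bodies are the kernel's:

#include <stdio.h>

/* Print stubs standing in for kernel primitives; illustration only. */
static void sched_pin(void)          { puts("pin thread to its CPU"); }
static void sched_unpin(void)        { puts("unpin thread"); }
static void fence_seq_cst(void)      { puts("atomic_thread_fence_seq_cst()"); }
static void zero_remote_pm_gen(void) { puts("pm_gen = 0 on remote CPUs"); }
static void send_shootdown_ipi(void) { puts("IPI to remote CPUs"); }
static void local_invlpg(void)       { puts("invlpg on the local CPU"); }
static void wait_remote_acks(void)   { puts("wait for remote completions"); }

/*
 * The corrected ordering: all PCID generation bookkeeping becomes
 * globally visible before any CPU, remote via IPI or local via the
 * curcpu callback, performs its flush, so no CPU can observe a stale
 * pm_gen and skip the TLB invalidation.
 */
static void
pmap_invalidate_page_sketch(void)
{
	/* pmap_invalidate_preipi(): everything before the IPI. */
	sched_pin();
	zero_remote_pm_gen();
	fence_seq_cst();

	/* smp_masked_invlpg() -> smp_targeted_tlb_shootdown(). */
	send_shootdown_ipi();
	local_invlpg();		/* curcpu callback, in a critical section */
	wait_remote_acks();
	sched_unpin();		/* the shootdown code unpins on completion */
}

int
main(void)
{
	pmap_invalidate_page_sketch();
	return (0);
}

The regression being fixed was exactly the inverted order: flushing locally before the pm_gen stores were published left a window in which a CPU could keep translating through a stale TLB entry.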
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index c6f56c1cd73b..ee9777c788c1 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -2329,45 +2329,19 @@ pmap_invalidate_ept(pmap_t pmap)
 static cpuset_t
 pmap_invalidate_cpu_mask(pmap_t pmap)
 {
-
 	return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
 }
 
 static inline void
-pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
-    const bool invpcid_works1)
+pmap_invalidate_preipi_pcid(pmap_t pmap)
 {
-	struct invpcid_descr d;
-	uint64_t kcr3, ucr3;
-	uint32_t pcid;
 	u_int cpuid, i;
 
+	sched_pin();
+
 	cpuid = PCPU_GET(cpuid);
-	if (pmap == PCPU_GET(curpmap)) {
-		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-			/*
-			 * Because pm_pcid is recalculated on a
-			 * context switch, we must disable switching.
-			 * Otherwise, we might use a stale value
-			 * below.
-			 */
-			critical_enter();
-			pcid = pmap->pm_pcids[cpuid].pm_pcid;
-			if (invpcid_works1) {
-				d.pcid = pcid | PMAP_PCID_USER_PT;
-				d.pad = 0;
-				d.addr = va;
-				invpcid(&d, INVPCID_ADDR);
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
-				ucr3 = pmap->pm_ucr3 | pcid |
-				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
-				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
-			}
-			critical_exit();
-		}
-	} else
-		pmap->pm_pcids[cpuid].pm_gen = 0;
+	if (pmap != PCPU_GET(curpmap))
+		cpuid = 0xffffffff;	/* An impossible value */
 
 	CPU_FOREACH(i) {
 		if (cpuid != i)
@@ -2388,52 +2362,96 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
 }
 
 static void
-pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
 {
+	sched_pin();
+}
 
-	pmap_invalidate_page_pcid(pmap, va, true);
+DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t), static)
+{
+	return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid :
+	    pmap_invalidate_preipi_nopcid);
+}
+
+static inline void
+pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
+    const bool invpcid_works1)
+{
+	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
+	u_int cpuid;
+
+	/*
+	 * Because pm_pcid is recalculated on a context switch, we
+	 * must ensure there is no preemption, not just pinning.
+	 * Otherwise, we might use a stale value below.
+	 */
+	CRITICAL_ASSERT(curthread);
+
+	/*
+	 * No need to do anything with user page tables invalidation
+	 * if there is no user page table.
+	 */
+	if (pmap->pm_ucr3 == PMAP_NO_CR3)
+		return;
+
+	cpuid = PCPU_GET(cpuid);
+
+	pcid = pmap->pm_pcids[cpuid].pm_pcid;
+	if (invpcid_works1) {
+		d.pcid = pcid | PMAP_PCID_USER_PT;
+		d.pad = 0;
+		d.addr = va;
+		invpcid(&d, INVPCID_ADDR);
+	} else {
+		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+		pmap_pti_pcid_invlpg(ucr3, kcr3, va);
+	}
 }
 
 static void
-pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
 {
+	pmap_invalidate_page_pcid_cb(pmap, va, true);
+}
 
-	pmap_invalidate_page_pcid(pmap, va, false);
+static void
+pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
+{
+	pmap_invalidate_page_pcid_cb(pmap, va, false);
 }
 
 static void
-pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
 {
 }
 
-DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t),
+DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t),
     static)
 {
-
 	if (pmap_pcid_enabled)
-		return (invpcid_works ? pmap_invalidate_page_pcid_invpcid :
-		    pmap_invalidate_page_pcid_noinvpcid);
-	return (pmap_invalidate_page_nopcid);
+		return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
+		    pmap_invalidate_page_pcid_noinvpcid_cb);
+	return (pmap_invalidate_page_nopcid_cb);
 }
 
 static void
 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
     vm_offset_t addr2 __unused)
 {
-
 	if (pmap == kernel_pmap) {
 		invlpg(va);
-	} else {
-		if (pmap == PCPU_GET(curpmap))
-			invlpg(va);
-		pmap_invalidate_page_mode(pmap, va);
+	} else if (pmap == PCPU_GET(curpmap)) {
+		invlpg(va);
+		pmap_invalidate_page_cb(pmap, va);
 	}
 }
 
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
-
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
@@ -2442,6 +2460,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
 
+	pmap_invalidate_preipi(pmap);
 	smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
 	    pmap_invalidate_page_curcpu_cb);
 }
@@ -2450,73 +2469,62 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
 
 static void
-pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     const bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
-	u_int cpuid, i;
+	u_int cpuid;
+
+	CRITICAL_ASSERT(curthread);
+
+	if (pmap != PCPU_GET(curpmap) ||
+	    pmap->pm_ucr3 == PMAP_NO_CR3)
+		return;
 
 	cpuid = PCPU_GET(cpuid);
-	if (pmap == PCPU_GET(curpmap)) {
-		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-			critical_enter();
-			pcid = pmap->pm_pcids[cpuid].pm_pcid;
-			if (invpcid_works1) {
-				d.pcid = pcid | PMAP_PCID_USER_PT;
-				d.pad = 0;
-				d.addr = sva;
-				for (; d.addr < eva; d.addr += PAGE_SIZE)
-					invpcid(&d, INVPCID_ADDR);
-			} else {
-				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
-				ucr3 = pmap->pm_ucr3 | pcid |
-				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
-				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
-			}
-			critical_exit();
-		}
-	} else
-		pmap->pm_pcids[cpuid].pm_gen = 0;
-	CPU_FOREACH(i) {
-		if (cpuid != i)
-			pmap->pm_pcids[i].pm_gen = 0;
+	pcid = pmap->pm_pcids[cpuid].pm_pcid;
+	if (invpcid_works1) {
+		d.pcid = pcid | PMAP_PCID_USER_PT;
+		d.pad = 0;
+		for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
+			invpcid(&d, INVPCID_ADDR);
+	} else {
+		kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+		ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+		pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 	}
-	/* See the comment in pmap_invalidate_page_pcid(). */
-	atomic_thread_fence_seq_cst();
 }
 
 static void
-pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva,
+pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
-
-	pmap_invalidate_range_pcid(pmap, sva, eva, true);
+	pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
 }
 
 static void
-pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva,
+pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
-
-	pmap_invalidate_range_pcid(pmap, sva, eva, false);
+	pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
}
 
 static void
-pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
+    vm_offset_t eva __unused)
 {
 }
 
-DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t,
+DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
     vm_offset_t), static)
 {
-
 	if (pmap_pcid_enabled)
-		return (invpcid_works ? pmap_invalidate_range_pcid_invpcid :
-		    pmap_invalidate_range_pcid_noinvpcid);
-	return (pmap_invalidate_range_nopcid);
+		return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
+		    pmap_invalidate_range_pcid_noinvpcid_cb);
+	return (pmap_invalidate_range_nopcid_cb);
 }
 
 static void
@@ -2527,19 +2535,16 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 	if (pmap == kernel_pmap) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
-	} else {
-		if (pmap == PCPU_GET(curpmap)) {
-			for (addr = sva; addr < eva; addr += PAGE_SIZE)
-				invlpg(addr);
-		}
-		pmap_invalidate_range_mode(pmap, sva, eva);
+	} else if (pmap == PCPU_GET(curpmap)) {
+		for (addr = sva; addr < eva; addr += PAGE_SIZE)
+			invlpg(addr);
+		pmap_invalidate_range_cb(pmap, sva, eva);
 	}
 }
 
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
-
 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 		pmap_invalidate_all(pmap);
 		return;
@@ -2553,17 +2558,18 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
 
+	pmap_invalidate_preipi(pmap);
 	smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap,
 	    pmap_invalidate_range_curcpu_cb);
 }
 
 static inline void
-pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
+pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
-	u_int cpuid, i;
+	u_int cpuid;
 
 	if (pmap == kernel_pmap) {
 		if (invpcid_works1) {
@@ -2572,87 +2578,72 @@ pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
 		} else {
 			invltlb_glob();
 		}
-	} else {
+	} else if (pmap == PCPU_GET(curpmap)) {
+		CRITICAL_ASSERT(curthread);
 		cpuid = PCPU_GET(cpuid);
-		if (pmap == PCPU_GET(curpmap)) {
-			critical_enter();
-			pcid = pmap->pm_pcids[cpuid].pm_pcid;
-			if (invpcid_works1) {
-				d.pcid = pcid;
-				d.pad = 0;
-				d.addr = 0;
+
+		pcid = pmap->pm_pcids[cpuid].pm_pcid;
+		if (invpcid_works1) {
+			d.pcid = pcid;
+			d.pad = 0;
+			d.addr = 0;
+			invpcid(&d, INVPCID_CTX);
+			if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+				d.pcid |= PMAP_PCID_USER_PT;
 				invpcid(&d, INVPCID_CTX);
-				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-					d.pcid |= PMAP_PCID_USER_PT;
-					invpcid(&d, INVPCID_CTX);
-				}
+			}
+		} else {
+			kcr3 = pmap->pm_cr3 | pcid;
+			ucr3 = pmap->pm_ucr3;
+			if (ucr3 != PMAP_NO_CR3) {
+				ucr3 |= pcid | PMAP_PCID_USER_PT;
+				pmap_pti_pcid_invalidate(ucr3, kcr3);
 			} else {
-				kcr3 = pmap->pm_cr3 | pcid;
-				ucr3 = pmap->pm_ucr3;
-				if (ucr3 != PMAP_NO_CR3) {
-					ucr3 |= pcid | PMAP_PCID_USER_PT;
-					pmap_pti_pcid_invalidate(ucr3, kcr3);
-				} else {
-					load_cr3(kcr3);
-				}
+				load_cr3(kcr3);
 			}
-			critical_exit();
-		} else
-			pmap->pm_pcids[cpuid].pm_gen = 0;
-		CPU_FOREACH(i) {
-			if (cpuid != i)
-				pmap->pm_pcids[i].pm_gen = 0;
 		}
 	}
-	/* See the comment in pmap_invalidate_page_pcid(). */
-	atomic_thread_fence_seq_cst();
 }
 
 static void
-pmap_invalidate_all_pcid_invpcid(pmap_t pmap)
+pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
 {
-
-	pmap_invalidate_all_pcid(pmap, true);
+	pmap_invalidate_all_pcid_cb(pmap, true);
 }
 
 static void
-pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap)
+pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
 {
-
-	pmap_invalidate_all_pcid(pmap, false);
+	pmap_invalidate_all_pcid_cb(pmap, false);
 }
 
 static void
-pmap_invalidate_all_nopcid(pmap_t pmap)
+pmap_invalidate_all_nopcid_cb(pmap_t pmap)
 {
-
 	if (pmap == kernel_pmap)
 		invltlb_glob();
 	else if (pmap == PCPU_GET(curpmap))
 		invltlb();
 }
 
-DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static)
+DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t), static)
 {
-
 	if (pmap_pcid_enabled)
-		return (invpcid_works ? pmap_invalidate_all_pcid_invpcid :
-		    pmap_invalidate_all_pcid_noinvpcid);
-	return (pmap_invalidate_all_nopcid);
+		return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb :
+		    pmap_invalidate_all_pcid_noinvpcid_cb);
+	return (pmap_invalidate_all_nopcid_cb);
 }
 
 static void
 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
     vm_offset_t addr2 __unused)
 {
-
-	pmap_invalidate_all_mode(pmap);
+	pmap_invalidate_all_cb(pmap);
 }
 
 void
 pmap_invalidate_all(pmap_t pmap)
 {
-
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
@@ -2661,6 +2652,7 @@ pmap_invalidate_all(pmap_t pmap)
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
 
+	pmap_invalidate_preipi(pmap);
 	smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap,
 	    pmap_invalidate_all_curcpu_cb);
 }
@@ -2669,14 +2661,13 @@ static void
 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
     vm_offset_t addr2 __unused)
 {
-
 	wbinvd();
 }
 
 void
 pmap_invalidate_cache(void)
 {
-
+	sched_pin();
 	smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
 }
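One structural point from the pmap.c half before the mp_x86.c part: the choice among the PCID+INVPCID, PCID-without-INVPCID, and no-PCID implementations is resolved once with DEFINE_IFUNC rather than re-tested on every call. A rough userspace model of that resolver, using a plain function pointer set at startup (the flag and function names here are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the CPU feature flags the kernel probes at boot. */
static bool pcid_enabled = true;
static bool invpcid_usable = false;

static void page_pcid_invpcid_cb(void)   { puts("INVPCID single-address"); }
static void page_pcid_noinvpcid_cb(void) { puts("CR3-reload fallback"); }
static void page_nopcid_cb(void)         { puts("no PCID: nothing extra"); }

/* The resolved callback; DEFINE_IFUNC performs this selection in the kernel. */
static void (*page_cb)(void);

static void
resolve_page_cb(void)
{
	if (pcid_enabled)
		page_cb = invpcid_usable ?
		    page_pcid_invpcid_cb : page_pcid_noinvpcid_cb;
	else
		page_cb = page_nopcid_cb;
}

int
main(void)
{
	resolve_page_cb();	/* once, at "boot" */
	page_cb();		/* per invalidation: one indirect call */
	return (0);
}

The effect is the same in both forms: the hot path pays a single call, with no feature branches per invalidation.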
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 41dc36672135..2108d448bb9a 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1658,13 +1658,16 @@ volatile uint32_t smp_tlb_generation;
  * Used by pmap to request invalidation of TLB or cache on local and
  * remote processors.  Mask provides the set of remote CPUs which are
  * to be signalled with the IPI specified by vector.  The curcpu_cb
- * callback is invoked on the calling CPU while waiting for remote
- * CPUs to complete the operation.
+ * callback is invoked on the calling CPU in a critical section while
+ * waiting for remote CPUs to complete the operation.
  *
  * The callback function is called unconditionally on the caller's
  * underlying processor, even when this processor is not set in the
  * mask.  So, the callback function must be prepared to handle such
  * spurious invocations.
+ *
+ * This function must be called with the thread pinned, and it unpins on
+ * completion.
  */
 static void
 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
@@ -1679,23 +1682,21 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
 	 * It is not necessary to signal other CPUs while booting or
 	 * when in the debugger.
 	 */
-	if (kdb_active || panicstr != NULL || !smp_started) {
-		curcpu_cb(pmap, addr1, addr2);
-		return;
-	}
+	if (kdb_active || panicstr != NULL || !smp_started)
+		goto local_cb;
 
-	sched_pin();
+	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 
 	/*
 	 * Check for other cpus.  Return if none.
 	 */
 	if (CPU_ISFULLSET(&mask)) {
 		if (mp_ncpus <= 1)
-			goto nospinexit;
+			goto local_cb;
 	} else {
 		CPU_CLR(PCPU_GET(cpuid), &mask);
 		if (CPU_EMPTY(&mask))
-			goto nospinexit;
+			goto local_cb;
 	}
 
 	if (!(read_eflags() & PSL_I))
@@ -1727,13 +1728,22 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
 		while (*p_cpudone != generation)
 			ia32_pause();
 	}
-	mtx_unlock_spin(&smp_ipi_mtx);
+
+	/*
+	 * Unpin before unlocking smp_ipi_mtx.  If the thread owes
+	 * preemption, this allows scheduler to select thread on any
+	 * CPU from its cpuset.
+	 */
 	sched_unpin();
+	mtx_unlock_spin(&smp_ipi_mtx);
+
 	return;
 
-nospinexit:
+local_cb:
+	critical_enter();
 	curcpu_cb(pmap, addr1, addr2);
 	sched_unpin();
+	critical_exit();
 }
 
 void
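The mp_x86.c half changes the pinning contract of smp_targeted_tlb_shootdown(): the function no longer pins by itself but asserts that the caller (via pmap_invalidate_preipi()) already did, and it unpins on every exit path, with the local-only path wrapped in a critical section so the CRITICAL_ASSERT() in the PCID callbacks holds there as well. A simplified stubbed model of that contract (illustrative userspace C, not the kernel code; in the real IPI path the callback runs while smp_ipi_mtx, a spin mutex, is held):

#include <assert.h>
#include <stdio.h>

static int td_pinned;		/* models curthread->td_pinned */

static void sched_pin(void)      { td_pinned++; }
static void sched_unpin(void)    { td_pinned--; }
static void critical_enter(void) { puts("critical_enter()"); }
static void critical_exit(void)  { puts("critical_exit()"); }
static void curcpu_cb(void)      { puts("local flush callback"); }

/* The caller pins; this function unpins on every path. */
static void
targeted_shootdown_sketch(int have_remote_cpus)
{
	assert(td_pinned > 0);	/* KASSERT(curthread->td_pinned > 0, ...) */

	if (!have_remote_cpus)
		goto local_cb;

	puts("send IPIs; run callback under the spin mutex; wait for acks");
	curcpu_cb();
	sched_unpin();		/* unpin before dropping smp_ipi_mtx */
	return;

local_cb:
	critical_enter();
	curcpu_cb();
	sched_unpin();
	critical_exit();
}

int
main(void)
{
	sched_pin();		/* the caller's half of the contract */
	targeted_shootdown_sketch(0);
	printf("td_pinned after return: %d\n", td_pinned);
	return (0);
}

Unpinning before the spin mutex is released, rather than after as the old code did, lets the scheduler move a thread that owes preemption to any CPU in its cpuset as soon as the lock drops.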