git: 567cc4e6bfd9 - stable/13 - amd64: for small cores, use (big hammer) INVPCID_CTXGLOB instead of INVLPG
Date: Fri, 20 Jan 2023 03:23:55 UTC
The branch stable/13 has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=567cc4e6bfd92d7351e385569f2bb4b7c89b6db0

commit 567cc4e6bfd92d7351e385569f2bb4b7c89b6db0
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2022-10-10 23:08:55 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-01-20 03:21:57 +0000

    amd64: for small cores, use (big hammer) INVPCID_CTXGLOB instead of INVLPG

    PR:         261169, 266145
    Tested by:  pho

    (cherry picked from commit cde70e312c3fde5b37a29be1dacb7fde9a45b94a)
---
 sys/amd64/amd64/initcpu.c    |  5 +++++
 sys/amd64/amd64/mp_machdep.c | 16 +++++++++++-----
 sys/amd64/amd64/pmap.c       | 36 +++++++++++++++++++++++++++++-------
 sys/amd64/include/pcpu.h     |  3 ++-
 sys/amd64/include/pmap.h     | 20 ++++++++++++++++++++
 5 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c
index 1b731821889e..08385d3095d0 100644
--- a/sys/amd64/amd64/initcpu.c
+++ b/sys/amd64/amd64/initcpu.c
@@ -324,6 +324,11 @@ initializecpu(void)
 		if ((r[0] & CPUID_HYBRID_CORE_MASK) ==
 		    CPUID_HYBRID_SMALL_CORE) {
 			PCPU_SET(small_core, 1);
+			if (pmap_pcid_enabled &&
+			    pmap_pcid_invlpg_workaround_uena) {
+				PCPU_SET(pcid_invlpg_workaround, 1);
+				pmap_pcid_invlpg_workaround = 1;
+			}
 		}
 	}
 }
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 5e94ba822871..650f83b1aad4 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -863,7 +863,7 @@ invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	invlpg(smp_tlb_addr1);
+	pmap_invlpg(smp_tlb_pmap, smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
@@ -933,10 +933,16 @@ invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
-	do {
-		invlpg(addr);
-		addr += PAGE_SIZE;
-	} while (addr < smp_tlb_addr2);
+	if (smp_tlb_pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
+		struct invpcid_descr d = { 0 };
+
+		invpcid(&d, INVPCID_CTXGLOB);
+	} else {
+		do {
+			invlpg(addr);
+			addr += PAGE_SIZE;
+		} while (addr < smp_tlb_addr2);
+	}
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 233c58b83f2d..62fa64881c7b 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -517,6 +517,12 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 int invpcid_works = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
     "Is the invpcid instruction available ?");
+int pmap_pcid_invlpg_workaround = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround,
+    CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &pmap_pcid_invlpg_workaround, 0,
+    "Enable small core PCID/INVLPG workaround");
+int pmap_pcid_invlpg_workaround_uena = 1;
 
 int __read_frequently pti = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
@@ -2518,6 +2524,9 @@ pmap_init(void)
 			    VM_PAGE_TO_PHYS(m);
 		}
 	}
+
+	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
+	    &pmap_pcid_invlpg_workaround_uena);
 }
 
 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
@@ -2749,7 +2758,7 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 
 	if ((newpde & PG_PS) == 0)
 		/* Demotion: flush a specific 2MB page mapping. */
-		invlpg(va);
+		pmap_invlpg(pmap, va);
 	else if ((newpde & PG_G) == 0)
 		/*
 		 * Promotion: flush every 4KB page mapping from the TLB
@@ -3088,7 +3097,7 @@ pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
     vm_offset_t addr2 __unused)
 {
 	if (pmap == kernel_pmap) {
-		invlpg(va);
+		pmap_invlpg(kernel_pmap, va);
 	} else if (pmap == PCPU_GET(curpmap)) {
 		invlpg(va);
 		pmap_invalidate_page_cb(pmap, va);
@@ -3179,8 +3188,14 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 	vm_offset_t addr;
 
 	if (pmap == kernel_pmap) {
-		for (addr = sva; addr < eva; addr += PAGE_SIZE)
-			invlpg(addr);
+		if (PCPU_GET(pcid_invlpg_workaround)) {
+			struct invpcid_descr d = { 0 };
+
+			invpcid(&d, INVPCID_CTXGLOB);
+		} else {
+			for (addr = sva; addr < eva; addr += PAGE_SIZE)
+				invlpg(addr);
+		}
 	} else if (pmap == PCPU_GET(curpmap)) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
@@ -3717,7 +3732,7 @@ pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
 	for (; spa < epa; spa += PAGE_SIZE) {
 		sched_pin();
 		pte_store(pte, spa | pte_bits);
-		invlpg(vaddr);
+		pmap_invlpg(kernel_pmap, vaddr);
 		/* XXXKIB atomic inside flush_cache_range are excessive */
 		pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
 		sched_unpin();
@@ -7527,7 +7542,7 @@ pmap_kenter_temporary(vm_paddr_t pa, int i)
 
 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 	pmap_kenter(va, pa);
-	invlpg(va);
+	pmap_invlpg(kernel_pmap, va);
 	return ((void *)crashdumpmap);
 }
 
@@ -10223,7 +10238,7 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
 			    page[i]->md.pat_mode, 0);
 			pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
 			    cache_bits);
-			invlpg(vaddr[i]);
+			pmap_invlpg(kernel_pmap, vaddr[i]);
 		}
 	}
 }
@@ -10272,7 +10287,14 @@ pmap_quick_remove_page(vm_offset_t addr)
 
 	if (addr != qframe)
 		return;
 	pte_store(vtopte(qframe), 0);
+
+	/*
+	 * Since qframe is exclusively mapped by
+	 * pmap_quick_enter_page() and that function doesn't set PG_G,
+	 * we can use INVLPG here.
+	 */
 	invlpg(qframe);
+
 	mtx_unlock_spin(&qframe_mtx);
 }
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index c0b8ee456f25..13de60f650de 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -100,7 +100,8 @@ _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
 	u_int	pc_smp_tlb_op;		\
 	uint64_t pc_ucr3_load_mask;	\
 	u_int	pc_small_core;		\
-	char	__pad[2912]		/* pad to UMA_PCPU_ALLOC_SIZE */
+	u_int	pc_pcid_invlpg_workaround; \
+	char	__pad[2908]		/* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define	PC_DBREG_CMD_NONE	0
 #define	PC_DBREG_CMD_LOAD	1
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 8f1e77806a25..7b86f9e139e1 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -461,6 +461,8 @@ extern vm_offset_t virtual_end;
 extern vm_paddr_t dmaplimit;
 extern int pmap_pcid_enabled;
 extern int invpcid_works;
+extern int pmap_pcid_invlpg_workaround;
+extern int pmap_pcid_invlpg_workaround_uena;
 
 #define	pmap_page_get_memattr(m)	((vm_memattr_t)(m)->md.pat_mode)
 #define	pmap_page_is_write_mapped(m)	(((m)->a.flags & PGA_WRITEABLE) != 0)
@@ -546,6 +548,24 @@ pmap_invalidate_cpu_mask(pmap_t pmap)
 	return (&pmap->pm_active);
 }
 
+/*
+ * It seems that AlderLake+ small cores have some microarchitectural
+ * bug, which results in the INVLPG instruction failing to flush all
+ * global TLB entries when PCID is enabled.  Work around it for now,
+ * by doing global invalidation on small cores instead of INVLPG.
+ */
+static __inline void
+pmap_invlpg(pmap_t pmap, vm_offset_t va)
+{
+	if (pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
+		struct invpcid_descr d = { 0 };
+
+		invpcid(&d, INVPCID_CTXGLOB);
+	} else {
+		invlpg(va);
+	}
+}
+
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */
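
A note on controlling the new knob: judging by the CTLFLAG_RDTUN | CTLFLAG_NOFETCH flags and the explicit TUNABLE_INT_FETCH() in pmap_init() above, vm.pmap.pcid_invlpg_workaround is a boot-time loader tunable (default 1), while the read-only sysctl of the same name reports whether the workaround actually took effect, i.e. only after a small core was detected with PCID enabled. A minimal illustrative sketch of opting out; the loader.conf usage below is my reading of the diff, not part of the commit:

    # /boot/loader.conf: keep the small-core INVLPG workaround from being
    # enabled even if hybrid small cores are detected (default is 1 = allow)
    vm.pmap.pcid_invlpg_workaround=0

    # at runtime, read back whether the workaround is in effect on this boot
    sysctl vm.pmap.pcid_invlpg_workaround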