git: e3917bb256de - main - Disable promotion on pcpu memory on arm64

From: Andrew Turner <andrew_at_FreeBSD.org>
Date: Tue, 16 Aug 2022 14:48:27 UTC
The branch main has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=e3917bb256dea63945b5bef7fe2b792a280fcafe

commit e3917bb256dea63945b5bef7fe2b792a280fcafe
Author:     Andrew Turner <andrew@FreeBSD.org>
AuthorDate: 2022-04-29 09:30:38 +0000
Commit:     Andrew Turner <andrew@FreeBSD.org>
CommitDate: 2022-08-16 14:41:24 +0000

    Disable promotion on pcpu memory on arm64
    
    We need to be careful to not promote or demote the memory containing
    the per-CPU structures as the exception handlers will dereference it
    so any time it's invalid may cause recursive exceptions.
    
    Add a new pmap function to set a flag in the pte marking memory that
    cannot be promoted or demoted and use it to mark pcpu memory.
    
    Sponsored by:   The FreeBSD Foundation
    Differential Revision: https://reviews.freebsd.org/D35434
---
 sys/arm64/arm64/mp_machdep.c | 15 ++++---
 sys/arm64/arm64/pmap.c       | 97 ++++++++++++++++++++++++++++++--------------
 sys/arm64/include/pmap.h     |  1 +
 sys/arm64/include/pte.h      |  4 +-
 4 files changed, 80 insertions(+), 37 deletions(-)

diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c
index adf5592832c5..a6e2be300bae 100644
--- a/sys/arm64/arm64/mp_machdep.c
+++ b/sys/arm64/arm64/mp_machdep.c
@@ -495,6 +495,8 @@ static bool
 start_cpu(u_int cpuid, uint64_t target_cpu, int domain)
 {
 	struct pcpu *pcpup;
+	vm_offset_t pcpu_mem;
+	vm_size_t size;
 	vm_paddr_t pa;
 	int err, naps;
 
@@ -508,13 +510,16 @@ start_cpu(u_int cpuid, uint64_t target_cpu, int domain)
 
 	KASSERT(cpuid < MAXCPU, ("Too many CPUs"));
 
-	pcpup = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain),
-	    sizeof(*pcpup), M_WAITOK | M_ZERO);
+	size = round_page(sizeof(*pcpup) + DPCPU_SIZE);
+	pcpu_mem = kmem_malloc_domainset(DOMAINSET_PREF(domain), size,
+	    M_WAITOK | M_ZERO);
+	pmap_disable_promotion(pcpu_mem, size);
+
+	pcpup = (struct pcpu *)pcpu_mem;
 	pcpu_init(pcpup, cpuid, sizeof(struct pcpu));
 	pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK;
 
-	dpcpu[cpuid - 1] = (void *)kmem_malloc_domainset(
-	    DOMAINSET_PREF(domain), DPCPU_SIZE, M_WAITOK | M_ZERO);
+	dpcpu[cpuid - 1] = (void *)(pcpup + 1);
 	dpcpu_init(dpcpu[cpuid - 1], cpuid);
 
 	bootstacks[cpuid] = (void *)kmem_malloc_domainset(
@@ -538,9 +543,9 @@ start_cpu(u_int cpuid, uint64_t target_cpu, int domain)
 		    cpuid, target_cpu, err));
 
 		pcpu_destroy(pcpup);
-		kmem_free((vm_offset_t)dpcpu[cpuid - 1], DPCPU_SIZE);
 		dpcpu[cpuid - 1] = NULL;
 		kmem_free((vm_offset_t)bootstacks[cpuid], PAGE_SIZE);
+		kmem_free(pcpu_mem, size);
 		bootstacks[cpuid] = NULL;
 		mp_ncpus--;
 		return (false);
diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c
index d95eccb445a5..62276c024212 100644
--- a/sys/arm64/arm64/pmap.c
+++ b/sys/arm64/arm64/pmap.c
@@ -3467,7 +3467,7 @@ retry:
 }
 
 /*
- * pmap_protect_l2: do the things to protect a 2MB page in a pmap
+ * Masks and sets bits in a level 2 page table entries in the specified pmap
  */
 static void
 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
@@ -3515,34 +3515,16 @@ pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
 }
 
 /*
- *	Set the physical protection on the
- *	specified range of this map as requested.
+ * Masks and sets bits in last level page table entries in the specified
+ * pmap and range
  */
-void
-pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
+static void
+pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
+    pt_entry_t nbits, bool invalidate)
 {
 	vm_offset_t va, va_next;
 	pd_entry_t *l0, *l1, *l2;
-	pt_entry_t *l3p, l3, mask, nbits;
-
-	PMAP_ASSERT_STAGE1(pmap);
-	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
-	if (prot == VM_PROT_NONE) {
-		pmap_remove(pmap, sva, eva);
-		return;
-	}
-
-	mask = nbits = 0;
-	if ((prot & VM_PROT_WRITE) == 0) {
-		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
-		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
-	}
-	if ((prot & VM_PROT_EXECUTE) == 0) {
-		mask |= ATTR_S1_XN;
-		nbits |= ATTR_S1_XN;
-	}
-	if (mask == 0)
-		return;
+	pt_entry_t *l3p, l3;
 
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
@@ -3569,7 +3551,8 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
 			if ((pmap_load(l1) & mask) != nbits) {
 				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
-				pmap_invalidate_page(pmap, sva, true);
+				if (invalidate)
+					pmap_invalidate_page(pmap, sva, true);
 			}
 			continue;
 		}
@@ -3610,8 +3593,9 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 			 */
 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
 				if (va != va_next) {
-					pmap_invalidate_range(pmap, va, sva,
-					    true);
+					if (invalidate)
+						pmap_invalidate_range(pmap,
+						    va, sva, true);
 					va = va_next;
 				}
 				continue;
@@ -3633,12 +3617,54 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 			if (va == va_next)
 				va = sva;
 		}
-		if (va != va_next)
+		if (va != va_next && invalidate)
 			pmap_invalidate_range(pmap, va, sva, true);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
+/*
+ *	Set the physical protection on the
+ *	specified range of this map as requested.
+ */
+void
+pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
+{
+	pt_entry_t mask, nbits;
+
+	PMAP_ASSERT_STAGE1(pmap);
+	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
+	if (prot == VM_PROT_NONE) {
+		pmap_remove(pmap, sva, eva);
+		return;
+	}
+
+	mask = nbits = 0;
+	if ((prot & VM_PROT_WRITE) == 0) {
+		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
+		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
+	}
+	if ((prot & VM_PROT_EXECUTE) == 0) {
+		mask |= ATTR_S1_XN;
+		nbits |= ATTR_S1_XN;
+	}
+	if (mask == 0)
+		return;
+
+	pmap_mask_set(pmap, sva, eva, mask, nbits, true);
+}
+
+void
+pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
+{
+
+	MPASS((sva & L3_OFFSET) == 0);
+	MPASS(((sva + size) & L3_OFFSET) == 0);
+
+	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
+	    ATTR_SW_NO_PROMOTE, false);
+}
+
 /*
  * Inserts the specified page table page into the specified pmap's collection
  * of idle page table pages.  Each of a pmap's page table pages is responsible
@@ -3683,6 +3709,9 @@ pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
+	if ((newpte & ATTR_SW_NO_PROMOTE) != 0)
+		panic("%s: Updating non-promote pte", __func__);
+
 	/*
 	 * Ensure we don't get switched out with the page table in an
 	 * inconsistent state. We also need to ensure no interrupts fire
@@ -3775,7 +3804,8 @@ pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
 	firstl3 = pmap_l2_to_l3(l2, sva);
 	newl2 = pmap_load(firstl3);
 
-	if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) {
+	if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF ||
+	    (newl2 & ATTR_SW_NO_PROMOTE) != 0) {
 		atomic_add_long(&pmap_l2_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
 		    " in pmap %p", va, pmap);
@@ -6284,6 +6314,9 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 				break;
 			}
 		} else {
+			/* We can't demote/promote this entry */
+			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
+
 			/*
 			 * Split the entry to an level 3 table, then
 			 * set the new attribute.
@@ -6375,6 +6408,8 @@ pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
+	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
+	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));
 
 	tmpl1 = 0;
 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
@@ -6470,6 +6505,8 @@ pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
 	oldl2 = pmap_load(l2);
 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
 	    ("pmap_demote_l2: Demoting a non-block entry"));
+	KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
+	    ("pmap_demote_l2: Demoting entry with no-demote flag set"));
 	va &= ~L2_OFFSET;
 
 	tmpl2 = 0;
diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h
index 6da54a841a28..f36b081ea869 100644
--- a/sys/arm64/include/pmap.h
+++ b/sys/arm64/include/pmap.h
@@ -189,6 +189,7 @@ bool	pmap_page_is_mapped(vm_page_t m);
 int	pmap_pinit_stage(pmap_t, enum pmap_stage, int);
 bool	pmap_ps_enabled(pmap_t pmap);
 uint64_t pmap_to_ttbr0(pmap_t pmap);
+void	pmap_disable_promotion(vm_offset_t sva, vm_size_t size);
 
 void	*pmap_mapdev(vm_paddr_t, vm_size_t);
 void	*pmap_mapbios(vm_paddr_t, vm_size_t);
diff --git a/sys/arm64/include/pte.h b/sys/arm64/include/pte.h
index eaf6745f9679..24130f26cee1 100644
--- a/sys/arm64/include/pte.h
+++ b/sys/arm64/include/pte.h
@@ -52,8 +52,8 @@ typedef	uint64_t	pt_entry_t;		/* page table entry */
 #define	ATTR_MASK_L		UINT64_C(0x0000000000000fff)
 #define	ATTR_MASK		(ATTR_MASK_H | ATTR_MASK_L)
 /* Bits 58:55 are reserved for software */
-#define	ATTR_SW_UNUSED2		(1UL << 58)
-#define	ATTR_SW_UNUSED1		(1UL << 57)
+#define	ATTR_SW_UNUSED1		(1UL << 58)
+#define	ATTR_SW_NO_PROMOTE	(1UL << 57)
 #define	ATTR_SW_MANAGED		(1UL << 56)
 #define	ATTR_SW_WIRED		(1UL << 55)