git: 36f1526a598c - main - Add experimental 16k page support on arm64

From: Andrew Turner <andrew_at_FreeBSD.org>
Date: Tue, 19 Jul 2022 09:57:50 UTC
The branch main has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=36f1526a598c373ca660910c9772d28a61383c3b

commit 36f1526a598c373ca660910c9772d28a61383c3b
Author:     Andrew Turner <andrew@FreeBSD.org>
AuthorDate: 2022-03-23 17:39:58 +0000
Commit:     Andrew Turner <andrew@FreeBSD.org>
CommitDate: 2022-07-19 09:57:03 +0000

    Add experimental 16k page support on arm64
    
    Add initial 16k page support on arm64. It is considered experimental,
    with no guarantee of compatibility with userspace or kernel modules
    built with the current 4k page size, as code will likely try to pass
    in a too small size when working with APIs that take a multiple of a
    page, e.g. mmap.
    
    As this is experimental, and because userspace and the kernel need to
    have the PAGE_SIZE macro kept in sync, there is no kernel option to
    enable this. To test, a new image should be built with the
    PAGE_{SIZE,SHIFT,MASK} macros changed to the 16k versions.
    
    There are currently known issues with loading modules from an old
    loader as it can misalign them to load on a non-16k boundary.
    
    Testing has shown good results in kernel workloads that allocate and
    free large amounts of memory as only a quarter of the number of calls
    into the VM subsystem are needed in the best case.
    
    Reviewed by:    markj
    Tested by:      gallatin
    Sponsored by:   The FreeBSD Foundation
    Differential Revision: https://reviews.freebsd.org/D34793
---
 sys/arm64/arm64/locore.S           | 136 ++++++++++++++++++++++++++++++++--
 sys/arm64/arm64/minidump_machdep.c |   6 ++
 sys/arm64/arm64/pmap.c             | 146 +++++++++++++++++++++++++------------
 sys/arm64/include/pmap.h           |  11 +++
 sys/arm64/include/pte.h            |  42 ++++++++---
 5 files changed, 277 insertions(+), 64 deletions(-)

diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S
index ba85bb4e46b2..518c6c812aa9 100644
--- a/sys/arm64/arm64/locore.S
+++ b/sys/arm64/arm64/locore.S
@@ -39,6 +39,14 @@
 
 #define	VIRT_BITS	48
 
+#if PAGE_SIZE == PAGE_SIZE_16K
+/*
+ * The number of level 3 tables to create. 32 will allow for 1G of address
+ * space, the same as a single level 2 table with 4k pages.
+ */
+#define	L3_PAGE_COUNT	32
+#endif
+
 	.globl	kernbase
 	.set	kernbase, KERNBASE
 
@@ -431,8 +439,13 @@ LENTRY(create_pagetables)
 	/* Booted with modules pointer */
 	/* Find modulep - begin */
 	sub	x8, x0, x6
-	/* Add two 2MiB pages for the module data and round up */
-	ldr	x7, =(3 * L2_SIZE - 1)
+	/*
+	 * Add space for the module data. When PAGE_SIZE is 4k this will
+	 * add at least 2 level 2 blocks (2 * 2MiB). When PAGE_SIZE is
+	 * larger the space added is at least as large, as we map it
+	 * with smaller level 3 pages.
+	 */
+	ldr	x7, =((6 * 1024 * 1024) - 1)
 	add	x8, x8, x7
 	b	common
 
@@ -457,6 +470,34 @@ booti_no_fdt:
 #endif
 
 common:
+#if PAGE_SIZE != PAGE_SIZE_4K
+	/*
+	 * Create L3 pages. The kernel will be loaded at a 2M aligned
+	 * address. When the page size is larger than 4k, L2 blocks are
+	 * larger than 2M, so they are too large to map the kernel with
+	 * such an alignment. Map the kernel with L3 pages through an
+	 * L2 -> L3 table link instead.
+	 */
+
+	/* Get the number of l3 pages to allocate, rounded down */
+	lsr	x10, x8, #(L3_SHIFT)
+
+	/* Create the kernel space L3 tables */
+	mov	x6, x26
+	mov	x7, #(ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK))
+	mov	x8, #(KERNBASE)
+	mov	x9, x28
+	bl	build_l3_page_pagetable
+
+	/* Move to the l2 table */
+	ldr	x9, =(PAGE_SIZE * L3_PAGE_COUNT)
+	add	x26, x26, x9
+
+	/* Link the l2 -> l3 table */
+	mov	x9, x6
+	mov	x6, x26
+	bl	link_l2_pagetable
+#else
 	/* Get the number of l2 pages to allocate, rounded down */
 	lsr	x10, x8, #(L2_SHIFT)
 
@@ -466,6 +507,7 @@ common:
 	mov	x8, #(KERNBASE)
 	mov	x9, x28
 	bl	build_l2_block_pagetable
+#endif
 
 	/* Move to the l1 table */
 	add	x26, x26, #PAGE_SIZE
@@ -504,7 +546,8 @@ common:
 #if defined(SOCDEV_PA)
 	/* Create a table for the UART */
 	mov	x7, #(ATTR_S1_nG | ATTR_S1_IDX(VM_MEMATTR_DEVICE))
-	add	x16, x16, #(L2_SIZE)	/* VA start */
+	ldr	x9, =(L2_SIZE)
+	add	x16, x16, x9	/* VA start */
 	mov	x8, x16
 
 	/* Store the socdev virtual address */
@@ -523,7 +566,8 @@ common:
 
 	/* Create the mapping for FDT data (2 MiB max) */
 	mov	x7, #(ATTR_S1_nG | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK))
-	add	x16, x16, #(L2_SIZE)	/* VA start */
+	ldr	x9, =(L2_SIZE)
+	add	x16, x16, x9	/* VA start */
 	mov	x8, x16
 	mov	x9, x0			/* PA start */
 	/* Update the module pointer to point at the allocated memory */
@@ -662,6 +706,76 @@ LENTRY(build_l2_block_pagetable)
 	ret
 LEND(build_l2_block_pagetable)
 
+#if PAGE_SIZE != PAGE_SIZE_4K
+/*
+ * Builds a single L2 -> L3 table descriptor in the given L2 table
+ *
+ *  x6  = L2 table
+ *  x8  = Virtual Address, selects which L2 entry is written
+ *  x9  = L3 PA (trashed)
+ *  x11, x12 and x13 are trashed
+ */
+LENTRY(link_l2_pagetable)
+	/*
+	 * Link an L2 -> L3 table entry.
+	 */
+	/* Find the L2 table index for the virtual address */
+	lsr	x11, x8, #L2_SHIFT
+	and	x11, x11, #Ln_ADDR_MASK
+
+	/* Build the L2 table entry */
+	mov	x12, #L2_TABLE
+
+	/* Drop the low PAGE_SHIFT bits, keeping the output address */
+	lsr	x9, x9, #PAGE_SHIFT
+	orr	x13, x12, x9, lsl #PAGE_SHIFT
+
+	/* Store the 8 byte entry at index x11 of the L2 table */
+	str	x13, [x6, x11, lsl #3]
+
+	ret
+LEND(link_l2_pagetable)
+
+/*
+ * Builds x10 consecutive level 3 page table entries
+ *  x6  = L3 table
+ *  x7  = Page attributes, ORed into every entry
+ *  x8  = VA start
+ *  x9  = PA start (trashed)
+ *  x10 = Entry count, must be non-zero (trashed)
+ *  x11, x12 and x13 are trashed
+ */
+LENTRY(build_l3_page_pagetable)
+	/*
+	 * Build the L3 table entries.
+	 */
+	/* Find the table index of the first entry */
+	lsr	x11, x8, #L3_SHIFT
+	and	x11, x11, #Ln_ADDR_MASK
+
+	/* Build the attribute bits shared by every L3 page entry */
+	orr	x12, x7, #L3_PAGE
+	orr	x12, x12, #(ATTR_DEFAULT)
+	orr	x12, x12, #(ATTR_S1_UXN)
+
+	/* Turn the PA into a page number, dropping the low bits */
+	lsr	x9, x9, #L3_SHIFT
+
+	/* Combine the attributes with the current physical page */
+1:	orr	x13, x12, x9, lsl #L3_SHIFT
+
+	/* Store the entry; x11 is not re-masked while looping */
+	str	x13, [x6, x11, lsl #3]
+
+	sub	x10, x10, #1
+	add	x11, x11, #1
+	add	x9, x9, #1
+	cbnz	x10, 1b
+
+	ret
+LEND(build_l3_page_pagetable)
+#endif
+
 LENTRY(start_mmu)
 	dsb	sy
 
@@ -743,7 +857,15 @@ mair:
 		MAIR_ATTR(MAIR_NORMAL_WT, VM_MEMATTR_WRITE_THROUGH) |	\
 		MAIR_ATTR(MAIR_DEVICE_nGnRE, VM_MEMATTR_DEVICE_nGnRE)
 tcr:
-	.quad (TCR_TxSZ(64 - VIRT_BITS) | TCR_TG1_4K | TCR_TG0_4K | \
+#if PAGE_SIZE == PAGE_SIZE_4K
+#define	TCR_TG	(TCR_TG1_4K | TCR_TG0_4K)
+#elif PAGE_SIZE == PAGE_SIZE_16K
+#define	TCR_TG	(TCR_TG1_16K | TCR_TG0_16K)
+#else
+#error Unsupported page size
+#endif
+
+	.quad (TCR_TxSZ(64 - VIRT_BITS) | TCR_TG | \
 	    TCR_CACHE_ATTRS | TCR_SMP_ATTRS)
 sctlr_set:
 	/* Bits to set */
@@ -774,6 +896,10 @@ END(abort)
 	 */
 	.globl pagetable_l0_ttbr1
 pagetable:
+#if PAGE_SIZE != PAGE_SIZE_4K
+	.space	(PAGE_SIZE * L3_PAGE_COUNT)
+pagetable_l2_ttbr1:
+#endif
 	.space	PAGE_SIZE
 pagetable_l1_ttbr1:
 	.space	PAGE_SIZE
diff --git a/sys/arm64/arm64/minidump_machdep.c b/sys/arm64/arm64/minidump_machdep.c
index 3dfeb3dfef1e..ee2b1be9b0b1 100644
--- a/sys/arm64/arm64/minidump_machdep.c
+++ b/sys/arm64/arm64/minidump_machdep.c
@@ -239,7 +239,13 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	mdhdr.dmapbase = DMAP_MIN_ADDRESS;
 	mdhdr.dmapend = DMAP_MAX_ADDRESS;
 	mdhdr.dumpavailsize = round_page(sizeof(dump_avail));
+#if PAGE_SIZE == PAGE_SIZE_4K
 	mdhdr.flags = MINIDUMP_FLAG_PS_4K;
+#elif PAGE_SIZE == PAGE_SIZE_16K
+	mdhdr.flags = MINIDUMP_FLAG_PS_16K;
+#else
+#error Unsupported page size
+#endif
 
 	dump_init_header(di, &kdh, KERNELDUMPMAGIC, KERNELDUMP_AARCH64_VERSION,
 	    dumpsize);
diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c
index 3b37ce214664..d95eccb445a5 100644
--- a/sys/arm64/arm64/pmap.c
+++ b/sys/arm64/arm64/pmap.c
@@ -286,10 +286,6 @@ vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
 
-/* This code assumes all L1 DMAP entries will be used */
-CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
-CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
-
 extern pt_entry_t pagetable_l0_ttbr1[];
 
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
@@ -299,6 +295,15 @@ static u_int physmap_idx;
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "VM/pmap parameters");
 
+#if PAGE_SIZE == PAGE_SIZE_4K
+#define	L1_BLOCKS_SUPPORTED	1
+#else
+/* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
+#define	L1_BLOCKS_SUPPORTED	0
+#endif
+
+#define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)
+
 /*
  * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
  * that it has currently allocated to a pmap, a cursor ("asid_next") to
@@ -571,6 +576,7 @@ pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
 	}
 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
 	if (desc == L1_BLOCK) {
+		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 		*level = 1;
 		return (l1);
 	}
@@ -621,9 +627,11 @@ pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
 	if (desc == L0_TABLE && level > 0) {
 		l1p = pmap_l0_to_l1(l0p, va);
 		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
-		if (desc == L1_BLOCK && level == 1)
+		if (desc == L1_BLOCK && level == 1) {
+			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 			return (l1p);
-		else if (desc == L1_TABLE && level > 1) {
+		}
+		if (desc == L1_TABLE && level > 1) {
 			l2p = pmap_l1_to_l2(l1p, va);
 			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
 			if (desc == L2_BLOCK && level == 2)
@@ -673,6 +681,7 @@ pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
 	*l1 = l1p;
 
 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
+		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 		*l2 = NULL;
 		*l3 = NULL;
 		return (true);
@@ -1013,29 +1022,36 @@ pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
 			pmap_bootstrap_dmap_l3_page(&state, i);
 		MPASS(state.pa <= physmap[i + 1]);
 
-		/* Create L2 mappings at the start of the region */
-		if ((state.pa & L1_OFFSET) != 0)
-			pmap_bootstrap_dmap_l2_block(&state, i);
-		MPASS(state.pa <= physmap[i + 1]);
+		if (L1_BLOCKS_SUPPORTED) {
+			/* Create L2 mappings at the start of the region */
+			if ((state.pa & L1_OFFSET) != 0)
+				pmap_bootstrap_dmap_l2_block(&state, i);
+			MPASS(state.pa <= physmap[i + 1]);
+
+			/* Create the main L1 block mappings */
+			for (; state.va < DMAP_MAX_ADDRESS &&
+			    (physmap[i + 1] - state.pa) >= L1_SIZE;
+			    state.va += L1_SIZE, state.pa += L1_SIZE) {
+				/* Make sure there is a valid L1 table */
+				pmap_bootstrap_dmap_l0_table(&state);
+				MPASS((state.pa & L1_OFFSET) == 0);
+				pmap_store(&state.l1[pmap_l1_index(state.va)],
+				    state.pa | ATTR_DEFAULT | ATTR_S1_XN |
+				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
+				    L1_BLOCK);
+			}
+			MPASS(state.pa <= physmap[i + 1]);
 
-		/* Create the main L1 block mappings */
-		for (; state.va < DMAP_MAX_ADDRESS &&
-		    (physmap[i + 1] - state.pa) >= L1_SIZE;
-		    state.va += L1_SIZE, state.pa += L1_SIZE) {
-			/* Make sure there is a valid L1 table */
-			pmap_bootstrap_dmap_l0_table(&state);
-			MPASS((state.pa & L1_OFFSET) == 0);
-			pmap_store(&state.l1[pmap_l1_index(state.va)],
-			    state.pa | ATTR_DEFAULT | ATTR_S1_XN |
-			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
-			    L1_BLOCK);
+			/* Create L2 mappings at the end of the region */
+			pmap_bootstrap_dmap_l2_block(&state, i);
+		} else {
+			while (state.va < DMAP_MAX_ADDRESS &&
+			    (physmap[i + 1] - state.pa) >= L2_SIZE) {
+				pmap_bootstrap_dmap_l2_block(&state, i);
+			}
 		}
 		MPASS(state.pa <= physmap[i + 1]);
 
-		/* Create L2 mappings at the end of the region */
-		pmap_bootstrap_dmap_l2_block(&state, i);
-		MPASS(state.pa <= physmap[i + 1]);
-
 		/* Create L3 mappings at the end of the region */
 		pmap_bootstrap_dmap_l3_page(&state, i);
 		MPASS(state.pa == physmap[i + 1]);
@@ -1261,9 +1277,11 @@ pmap_init(void)
 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 		    ("pmap_init: can't assign to pagesizes[1]"));
 		pagesizes[1] = L2_SIZE;
-		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
-		    ("pmap_init: can't assign to pagesizes[2]"));
-		pagesizes[2] = L1_SIZE;
+		if (L1_BLOCKS_SUPPORTED) {
+			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
+			    ("pmap_init: can't assign to pagesizes[2]"));
+			pagesizes[2] = L1_SIZE;
+		}
 	}
 
 	/*
@@ -1483,6 +1501,7 @@ pmap_extract(pmap_t pmap, vm_offset_t va)
 		pa = tpte & ~ATTR_MASK;
 		switch(lvl) {
 		case 1:
+			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
 			    ("pmap_extract: Invalid L1 pte found: %lx",
 			    tpte & ATTR_DESCR_MASK));
@@ -1530,6 +1549,10 @@ pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 
 		KASSERT(lvl > 0 && lvl <= 3,
 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
+		/*
+		 * Check that the pte is either a L3 page, or a L1 or L2 block
+		 * entry. We can assume L1_BLOCK == L2_BLOCK.
+		 */
 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
@@ -2426,8 +2449,13 @@ pmap_growkernel(vm_offset_t addr)
  ***************************************************/
 
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+#if PAGE_SIZE == PAGE_SIZE_4K
 CTASSERT(_NPCM == 3);
 CTASSERT(_NPCPV == 168);
+#else
+CTASSERT(_NPCM == 11);
+CTASSERT(_NPCPV == 677);
+#endif
 
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
@@ -2438,11 +2466,30 @@ pv_to_chunk(pv_entry_t pv)
 
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
-#define	PC_FREE0	0xfffffffffffffffful
-#define	PC_FREE1	0xfffffffffffffffful
-#define	PC_FREE2	0x000000fffffffffful
+#define	PC_FREEN	0xfffffffffffffffful
+#if _NPCM == 3
+#define	PC_FREEL	0x000000fffffffffful
+#elif _NPCM == 11
+#define	PC_FREEL	0x0000001ffffffffful
+#endif
+
+#if _NPCM == 3
+#define	PC_IS_FREE(pc)	((pc)->pc_map[0] == PC_FREEN &&			\
+    (pc)->pc_map[1] == PC_FREEN && (pc)->pc_map[2] == PC_FREEL)
+#else
+#define	PC_IS_FREE(pc)							\
+    (memcmp((pc)->pc_map, pc_freemask, sizeof(pc_freemask)) == 0)
+#endif
 
-static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
+static const uint64_t pc_freemask[] = { PC_FREEN, PC_FREEN,
+#if _NPCM > 3
+    PC_FREEN, PC_FREEN, PC_FREEN, PC_FREEN, PC_FREEN, PC_FREEN, PC_FREEN,
+    PC_FREEN,
+#endif
+    PC_FREEL
+};
+
+CTASSERT(nitems(pc_freemask) == _NPCM);
 
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
@@ -2608,8 +2655,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
-		    pc->pc_map[2] == PC_FREE2) {
+		if (PC_IS_FREE(pc)) {
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
@@ -2678,8 +2724,7 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv)
 	field = idx / 64;
 	bit = idx % 64;
 	pc->pc_map[field] |= 1ul << bit;
-	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
-	    pc->pc_map[2] != PC_FREE2) {
+	if (!PC_IS_FREE(pc)) {
 		/* 98% of the time, pc is already at the head of the list. */
 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
@@ -2767,9 +2812,8 @@ retry:
 	dump_add_page(m->phys_addr);
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
-	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
-	pc->pc_map[1] = PC_FREE1;
-	pc->pc_map[2] = PC_FREE2;
+	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
+	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
@@ -2829,9 +2873,7 @@ retry:
 		dump_add_page(m->phys_addr);
 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 		pc->pc_pmap = pmap;
-		pc->pc_map[0] = PC_FREE0;
-		pc->pc_map[1] = PC_FREE1;
-		pc->pc_map[2] = PC_FREE2;
+		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
@@ -3265,6 +3307,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 		if (pmap_load(l1) == 0)
 			continue;
 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
+			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 			KASSERT(va_next <= eva,
 			    ("partial update of non-transparent 1G page "
 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
@@ -3518,6 +3561,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 		if (pmap_load(l1) == 0)
 			continue;
 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
+			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 			KASSERT(va_next <= eva,
 			    ("partial update of non-transparent 1G page "
 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
@@ -3848,9 +3892,10 @@ restart:
 				mp->ref_count++;
 			}
 		}
-		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
-		    ((origpte & ATTR_DESCR_MASK) == L1_BLOCK &&
-		     (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
+		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
+		    (L1_BLOCKS_SUPPORTED &&
+		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK &&
+		    (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
 		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
 		    va, origpte, newpte));
 		pmap_store(l1p, newpte);
@@ -3980,9 +4025,10 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
 		    ("managed largepage va %#lx flags %#x", va, flags));
 		new_l3 &= ~L3_PAGE;
-		if (psind == 2)
+		if (psind == 2) {
+			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 			new_l3 |= L1_BLOCK;
-		else /* (psind == 1) */
+		} else /* (psind == 1) */
 			new_l3 |= L2_BLOCK;
 		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
 		goto out;
@@ -4660,6 +4706,7 @@ pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 			continue;
 
 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
+			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 			KASSERT(va_next <= eva,
 			    ("partial update of non-transparent 1G page "
 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
@@ -4772,6 +4819,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 		if (pmap_load(l1) == 0)
 			continue;
 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
+			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 			KASSERT(va_next <= end_addr,
 			    ("partial update of non-transparent 1G page "
 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
@@ -5730,6 +5778,7 @@ pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 		if (pmap_load(l1) == 0)
 			continue;
 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
+			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 			KASSERT(va_next <= eva,
 			    ("partial update of non-transparent 1G page "
 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
@@ -6243,6 +6292,7 @@ pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
 			default:
 				panic("Invalid DMAP table level: %d\n", lvl);
 			case 1:
+				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 				if ((tmpva & L1_OFFSET) == 0 &&
 				    (base + size - tmpva) >= L1_SIZE) {
 					pte_size = L1_SIZE;
@@ -6318,6 +6368,7 @@ pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldl1 = pmap_load(l1);
+	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
 	    ("pmap_demote_l1: Demoting a non-block entry"));
 	KASSERT((va & L1_OFFSET) == 0,
@@ -7400,6 +7451,7 @@ sysctl_kmaps(SYSCTL_HANDLER_ARGS)
 				continue;
 			}
 			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
+				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
 				    0, 0);
 				range.l1blocks++;
diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h
index 8c7c26fce8a0..87527c390f57 100644
--- a/sys/arm64/include/pmap.h
+++ b/sys/arm64/include/pmap.h
@@ -106,8 +106,18 @@ typedef struct pv_entry {
  * pv_entries are allocated in chunks per-process.  This avoids the
  * need to track per-pmap assignments.
  */
+#if PAGE_SIZE == PAGE_SIZE_4K
 #define	_NPCM	3
 #define	_NPCPV	168
+#define	_NPAD	0
+#elif PAGE_SIZE == PAGE_SIZE_16K
+#define	_NPCM	11
+#define	_NPCPV	677
+#define	_NPAD	1
+#else
+#error Unsupported page size
+#endif
+
 #define	PV_CHUNK_HEADER							\
 	pmap_t			pc_pmap;				\
 	TAILQ_ENTRY(pv_chunk)	pc_list;				\
@@ -121,6 +131,7 @@ struct pv_chunk_header {
 struct pv_chunk {
 	PV_CHUNK_HEADER
 	struct pv_entry		pc_pventry[_NPCPV];
+	uint64_t		pc_pad[_NPAD];
 };
 
 struct thread;
diff --git a/sys/arm64/include/pte.h b/sys/arm64/include/pte.h
index 3ce11133e2ef..eaf6745f9679 100644
--- a/sys/arm64/include/pte.h
+++ b/sys/arm64/include/pte.h
@@ -109,33 +109,43 @@ typedef	uint64_t	pt_entry_t;		/* page table entry */
 #define	ATTR_DESCR_TYPE_PAGE	2
 #define	ATTR_DESCR_TYPE_BLOCK	0
 
-/* Level 0 table, 512GiB per entry */
+#if PAGE_SIZE == PAGE_SIZE_4K
 #define	L0_SHIFT	39
-#define	L0_SIZE		(1ul << L0_SHIFT)
+#define	L1_SHIFT	30
+#define	L2_SHIFT	21
+#define	L3_SHIFT	12
+#elif PAGE_SIZE == PAGE_SIZE_16K
+#define	L0_SHIFT	47
+#define	L1_SHIFT	36
+#define	L2_SHIFT	25
+#define	L3_SHIFT	14
+#else
+#error Unsupported page size
+#endif
+
+/* Level 0 table, 512GiB/128TiB per entry */
+#define	L0_SIZE		(UINT64_C(1) << L0_SHIFT)
 #define	L0_OFFSET	(L0_SIZE - 1ul)
 #define	L0_INVAL	0x0 /* An invalid address */
 	/* 0x1 Level 0 doesn't support block translation */
 	/* 0x2 also marks an invalid address */
 #define	L0_TABLE	0x3 /* A next-level table */
 
-/* Level 1 table, 1GiB per entry */
-#define	L1_SHIFT	30
-#define	L1_SIZE 	(1 << L1_SHIFT)
+/* Level 1 table, 1GiB/64GiB per entry */
+#define	L1_SIZE 	(UINT64_C(1) << L1_SHIFT)
 #define	L1_OFFSET 	(L1_SIZE - 1)
 #define	L1_INVAL	L0_INVAL
 #define	L1_BLOCK	0x1
 #define	L1_TABLE	L0_TABLE
 
-/* Level 2 table, 2MiB per entry */
-#define	L2_SHIFT	21
-#define	L2_SIZE 	(1 << L2_SHIFT)
+/* Level 2 table, 2MiB/32MiB per entry */
+#define	L2_SIZE 	(UINT64_C(1) << L2_SHIFT)
 #define	L2_OFFSET 	(L2_SIZE - 1)
 #define	L2_INVAL	L1_INVAL
-#define	L2_BLOCK	L1_BLOCK
+#define	L2_BLOCK	0x1
 #define	L2_TABLE	L1_TABLE
 
-/* Level 3 table, 4KiB per entry */
-#define	L3_SHIFT	12
+/* Level 3 table, 4KiB/16KiB per entry */
 #define	L3_SIZE 	(1 << L3_SHIFT)
 #define	L3_OFFSET 	(L3_SIZE - 1)
 #define	L3_INVAL	0x0
@@ -145,11 +155,19 @@ typedef	uint64_t	pt_entry_t;		/* page table entry */
 
 #define	PMAP_MAPDEV_EARLY_SIZE	(L2_SIZE * 8)
 
+#if PAGE_SIZE == PAGE_SIZE_4K
 #define	L0_ENTRIES_SHIFT 9
+#define	Ln_ENTRIES_SHIFT 9
+#elif PAGE_SIZE == PAGE_SIZE_16K
+#define	L0_ENTRIES_SHIFT 1
+#define	Ln_ENTRIES_SHIFT 11
+#else
+#error Unsupported page size
+#endif
+
 #define	L0_ENTRIES	(1 << L0_ENTRIES_SHIFT)
 #define	L0_ADDR_MASK	(L0_ENTRIES - 1)
 
-#define	Ln_ENTRIES_SHIFT 9
 #define	Ln_ENTRIES	(1 << Ln_ENTRIES_SHIFT)
 #define	Ln_ADDR_MASK	(Ln_ENTRIES - 1)
 #define	Ln_TABLE_MASK	((1 << 12) - 1)