git: 756bc3adc578 - main - kasan: Create a shadow for the bootstack prior to hammer_time()

From: Mark Johnston <markj@FreeBSD.org>
Date: Wed, 15 Jun 2022 15:39:26 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=756bc3adc578077d530e7f64683d4fc8383030ce

commit 756bc3adc578077d530e7f64683d4fc8383030ce
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2022-06-15 14:48:16 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2022-06-15 15:39:10 +0000

    kasan: Create a shadow for the bootstack prior to hammer_time()
    
    When the kernel is compiled with -asan-stack=true, the address sanitizer
    will emit inline accesses to the shadow map.  In other words, some
    shadow map accesses are not intercepted by the KASAN runtime, so they
    cannot be disabled even if the runtime is not yet initialized by
    kasan_init() at the end of hammer_time().
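    
    As a rough sketch (not the compiler's literal output), an instrumented
    8-byte stack store conceptually expands to an inline check of the
    corresponding shadow byte before the real access; the exact
    address-to-shadow mapping is machine-dependent (see
    sys/amd64/include/asan.h).  Here, shadow_byte_for() and
    report_poisoned() are hypothetical stand-ins for the generated code:
    
        /* Schematic only; the helper names are hypothetical, not real APIs. */
        static inline void
        instrumented_store(uint64_t *p, uint64_t val)
        {
                /* This shadow load is what needs a valid shadow mapping. */
                int8_t *shadow = shadow_byte_for((vm_offset_t)p);
    
                if (*shadow != 0)
                        report_poisoned((vm_offset_t)p, 8);
                *p = val;               /* the original store */
        }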
    
    This went unnoticed because the loader will initialize all PML4 entries
    of the bootstrap page table to point to the same PDP page, so early
    shadow map accesses do not raise a page fault, though they silently
    corrupt memory.  In fact, when the loader does not copy the staging
    area, we do get a page fault since in that case only the first and last
    PML4Es are populated by the loader.  But due to another bug, the loader
    always treated KASAN kernels as non-relocatable and thus always copied
    the staging area.
    
    It is not really practical to annotate hammer_time() and all of its
    callees with __nosanitizeaddress, so instead add some early
    initialization that creates a shadow for the boot stack used by
    hammer_time().  This is only needed by KASAN, not by KMSAN, but the
    shared pmap code handles both.
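    
    Pieced together from the diff below, the bootstrap path is: locore.S
    calls kasan_init_early() on the BOOTSTACK_SIZE bytes below the
    bootstack label, which forwards through kasan_md_init_early() to
    kasan_shadow_map(), which lands in pmap_san_enter(); since kernphys is
    still zero at that point, pmap_san_enter() diverts to the new
    pmap_san_enter_early() path and carves its page-table and shadow pages
    out of a small static buffer.  A schematic C rendering of the assembly
    call (the real call is made from locore.S before any C environment
    exists):
    
        /* Kernel-only sketch; assumes options KASAN. */
        #include <sys/types.h>
        #include <sys/asan.h>
    
        #define BOOTSTACK_SIZE  4096            /* as defined in locore.S */
    
        extern char bootstack[];        /* label at the high end of the stack */
    
        static void
        shadow_bootstack(void)
        {
                /* Shadow the BOOTSTACK_SIZE bytes below the bootstack label. */
                kasan_init_early((vm_offset_t)bootstack - BOOTSTACK_SIZE,
                    BOOTSTACK_SIZE);
        }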
    
    Reported by:    mhorne
    Reviewed by:    kib
    MFC after:      1 month
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D35449
---
 sys/amd64/amd64/locore.S   |  22 ++++++---
 sys/amd64/amd64/machdep.c  |  64 +++++++++++++-------------
 sys/amd64/amd64/pmap.c     | 111 ++++++++++++++++++++++++++++++++++++++++++++-
 sys/amd64/include/asan.h   |   6 +++
 sys/amd64/include/md_var.h |   1 +
 sys/amd64/include/pmap.h   |   1 +
 sys/kern/subr_asan.c       |   6 +++
 sys/sys/asan.h             |   3 +-
 8 files changed, 174 insertions(+), 40 deletions(-)

diff --git a/sys/amd64/amd64/locore.S b/sys/amd64/amd64/locore.S
index f2eedb402ef3..f034a25c9b1b 100644
--- a/sys/amd64/amd64/locore.S
+++ b/sys/amd64/amd64/locore.S
@@ -48,6 +48,8 @@
 	.set	dmapbase,DMAP_MIN_ADDRESS
 	.set	dmapend,DMAP_MAX_ADDRESS
 
+#define	BOOTSTACK_SIZE	4096
+
 	.text
 /**********************************************************************
  *
@@ -66,14 +68,22 @@ ENTRY(btext)
 	pushq	$PSL_KERNEL
 	popfq
 
-	/* Find the metadata pointers before we lose them */
+	/* Get onto a stack that we can trust - there is no going back now. */
 	movq	%rsp, %rbp
+	movq	$bootstack,%rsp
+
+#ifdef KASAN
+	/* Bootstrap a shadow map for the boot stack. */
+	movq	$bootstack, %rdi
+	subq	$BOOTSTACK_SIZE, %rdi
+	movq	$BOOTSTACK_SIZE, %rsi
+	call	kasan_init_early
+#endif
+
+	/* Grab metadata pointers from the loader. */
 	movl	4(%rbp),%edi		/* modulep (arg 1) */
 	movl	8(%rbp),%esi		/* kernend (arg 2) */
-
-	/* Get onto a stack that we can trust - there is no going back now. */
-	movq	$bootstack,%rsp
-	xorl	%ebp, %ebp
+	xorq	%rbp, %rbp
 
 	call	hammer_time		/* set up cpu for unix operation */
 	movq	%rax,%rsp		/* set up kstack for mi_startup() */
@@ -140,5 +150,5 @@ ENTRY(la57_trampoline_end)
 	.bss
 	ALIGN_DATA			/* just to be sure */
 	.globl	bootstack
-	.space	0x1000			/* space for bootstack - temporary stack */
+	.space	BOOTSTACK_SIZE		/* space for bootstack - temporary stack */
 bootstack:
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 55a278de6020..9979592acc19 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1260,16 +1260,43 @@ amd64_bsp_ist_init(struct pcpu *pc)
 	tssp->tss_ist4 = (long)np;
 }
 
+/*
+ * Calculate the kernel load address by inspecting page table created by loader.
+ * The assumptions:
+ * - kernel is mapped at KERNBASE, backed by contiguous phys memory
+ *   aligned at 2M, below 4G (the latter is important for AP startup)
+ * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
+ * - kernel is mapped with 2M superpages
+ * - all participating memory, i.e. kernel, modules, metadata,
+ *   page table is accessible by pre-created 1:1 mapping
+ *   (right now loader creates 1:1 mapping for lower 4G, and all
+ *   memory is from there)
+ * - there is a usable memory block right after the end of the
+ *   mapped kernel and all modules/metadata, pointed to by
+ *   physfree, for early allocations
+ */
+vm_paddr_t __nosanitizeaddress __nosanitizememory
+amd64_loadaddr(void)
+{
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t *pde;
+	uint64_t cr3;
+
+	cr3 = rcr3();
+	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
+	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
+	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
+	return (*pde & PG_FRAME);
+}
+
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
-	uint64_t cr3, rsp0;
-	pml4_entry_t *pml4e;
-	pdp_entry_t *pdpe;
-	pd_entry_t *pde;
+	uint64_t rsp0;
 	char *env;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor r_gdt;
@@ -1278,34 +1305,9 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 
 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
-	/*
-	 * Calculate kernphys by inspecting page table created by loader.
-	 * The assumptions:
-	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
-	 *   aligned at 2M, below 4G (the latter is important for AP startup)
-	 * - there is a 2M hole at KERNBASE
-	 * - kernel is mapped with 2M superpages
-	 * - all participating memory, i.e. kernel, modules, metadata,
-	 *   page table is accessible by pre-created 1:1 mapping
-	 *   (right now loader creates 1:1 mapping for lower 4G, and all
-	 *   memory is from there)
-	 * - there is a usable memory block right after the end of the
-	 *   mapped kernel and all modules/metadata, pointed to by
-	 *   physfree, for early allocations
-	 */
-	cr3 = rcr3();
-	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
-	    (vm_offset_t)hammer_time);
-	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
-	    (vm_offset_t)hammer_time);
-	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
-	    (vm_offset_t)hammer_time);
-	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
-	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
-
-	/* Fix-up for 2M hole */
+	kernphys = amd64_loadaddr();
+
 	physfree += kernphys;
-	kernphys += NBPDR;
 
 	kmdp = init_ops.parse_preload_data(modulep);
 
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index a4e796becc73..f35a8c4c789c 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -11429,6 +11429,107 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 }
 
 #if defined(KASAN) || defined(KMSAN)
+
+/*
+ * Reserve enough memory to:
+ * 1) allocate PDP pages for the shadow map(s),
+ * 2) shadow one page of memory, so one PD page, one PT page, and one shadow
+ *    page per shadow map.
+ */
+#ifdef KASAN
+#define	SAN_EARLY_PAGES	(NKASANPML4E + 3)
+#else
+#define	SAN_EARLY_PAGES	(NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * 3)
+#endif
+
+static uint64_t __nosanitizeaddress __nosanitizememory
+pmap_san_enter_early_alloc_4k(uint64_t pabase)
+{
+	static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE);
+	static size_t offset = 0;
+	uint64_t pa;
+
+	if (offset == sizeof(data)) {
+		panic("%s: ran out of memory for the bootstrap shadow map",
+		    __func__);
+	}
+
+	pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART);
+	offset += PAGE_SIZE;
+	return (pa);
+}
+
+/*
+ * Map a shadow page, before the kernel has bootstrapped its page tables.  This
+ * is currently only used to shadow the temporary boot stack set up by locore.
+ */
+static void __nosanitizeaddress __nosanitizememory
+pmap_san_enter_early(vm_offset_t va)
+{
+	static bool first = true;
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t *pde;
+	pt_entry_t *pte;
+	uint64_t cr3, pa, base;
+	int i;
+
+	base = amd64_loadaddr();
+	cr3 = rcr3();
+
+	if (first) {
+		/*
+		 * If this is the first call, we need to allocate new PML4Es for
+		 * the bootstrap shadow map(s).  We don't know how the PML4 page
+		 * was initialized by the boot loader, so we can't simply test
+		 * whether the shadow map's PML4Es are zero.
+		 */
+		first = false;
+#ifdef KASAN
+		for (i = 0; i < NKASANPML4E; i++) {
+			pa = pmap_san_enter_early_alloc_4k(base);
+
+			pml4e = (pml4_entry_t *)cr3 +
+			    pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4);
+			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
+		}
+#else
+		for (i = 0; i < NKMSANORIGPML4E; i++) {
+			pa = pmap_san_enter_early_alloc_4k(base);
+
+			pml4e = (pml4_entry_t *)cr3 +
+			    pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS +
+			    i * NBPML4);
+			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
+		}
+		for (i = 0; i < NKMSANSHADPML4E; i++) {
+			pa = pmap_san_enter_early_alloc_4k(base);
+
+			pml4e = (pml4_entry_t *)cr3 +
+			    pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS +
+			    i * NBPML4);
+			*pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V);
+		}
+#endif
+	}
+	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va);
+	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va);
+	if (*pdpe == 0) {
+		pa = pmap_san_enter_early_alloc_4k(base);
+		*pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V);
+	}
+	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va);
+	if (*pde == 0) {
+		pa = pmap_san_enter_early_alloc_4k(base);
+		*pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V);
+	}
+	pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va);
+	if (*pte != 0)
+		panic("%s: PTE for %#lx is already initialized", __func__, va);
+	pa = pmap_san_enter_early_alloc_4k(base);
+	*pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V);
+}
+
 static vm_page_t
 pmap_san_enter_alloc_4k(void)
 {
@@ -11452,7 +11553,7 @@ pmap_san_enter_alloc_2m(void)
  * Grow a shadow map by at least one 4KB page at the specified address.  Use 2MB
  * pages when possible.
  */
-void
+void __nosanitizeaddress __nosanitizememory
 pmap_san_enter(vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
@@ -11460,6 +11561,14 @@ pmap_san_enter(vm_offset_t va)
 	pt_entry_t *pte;
 	vm_page_t m;
 
+	if (kernphys == 0) {
+		/*
+		 * We're creating a temporary shadow map for the boot stack.
+		 */
+		pmap_san_enter_early(va);
+		return;
+	}
+
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	pdpe = pmap_pdpe(kernel_pmap, va);
diff --git a/sys/amd64/include/asan.h b/sys/amd64/include/asan.h
index 03d57673d05e..a27fbbcb30c7 100644
--- a/sys/amd64/include/asan.h
+++ b/sys/amd64/include/asan.h
@@ -66,6 +66,12 @@ kasan_md_init(void)
 {
 }
 
+static inline void
+kasan_md_init_early(vm_offset_t bootstack, size_t size)
+{
+	kasan_shadow_map(bootstack, size);
+}
+
 #endif /* KASAN */
 
 #endif /* !_MACHINE_ASAN_H_ */
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index 196836b5baea..f014c66c0d06 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -64,6 +64,7 @@ struct	sysentvec;
 
 void	amd64_conf_fast_syscall(void);
 void	amd64_db_resume_dbreg(void);
+vm_paddr_t amd64_loadaddr(void);
 void	amd64_lower_shared_page(struct sysentvec *);
 void	amd64_bsp_pcpu_init1(struct pcpu *pc);
 void	amd64_bsp_pcpu_init2(uint64_t rsp0);
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index 9aacae73ebd4..3d51803d82b7 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -533,6 +533,7 @@ void	pmap_page_array_startup(long count);
 vm_page_t pmap_page_alloc_below_4g(bool zeroed);
 
 #if defined(KASAN) || defined(KMSAN)
+void	pmap_san_bootstrap(void);
 void	pmap_san_enter(vm_offset_t);
 #endif
 
diff --git a/sys/kern/subr_asan.c b/sys/kern/subr_asan.c
index 003b89f888e9..11f7996cfe73 100644
--- a/sys/kern/subr_asan.c
+++ b/sys/kern/subr_asan.c
@@ -139,6 +139,12 @@ kasan_init(void)
 	kasan_enabled = true;
 }
 
+void
+kasan_init_early(vm_offset_t stack, size_t size)
+{
+	kasan_md_init_early(stack, size);
+}
+
 static inline const char *
 kasan_code_name(uint8_t code)
 {
diff --git a/sys/sys/asan.h b/sys/sys/asan.h
index caa6643bda68..0a9d94007bec 100644
--- a/sys/sys/asan.h
+++ b/sys/sys/asan.h
@@ -56,11 +56,10 @@
 #define	KASAN_EXEC_ARGS_FREED	0xFF
 
 void kasan_init(void);
+void kasan_init_early(vm_offset_t, size_t);
 void kasan_shadow_map(vm_offset_t, size_t);
-
 void kasan_mark(const void *, size_t, size_t, uint8_t);
 #else /* KASAN */
-#define kasan_early_init(u)
 #define kasan_init()
 #define kasan_shadow_map(a, s)
 #define kasan_mark(p, s, l, c)