git: c946f699856f - stable/13 - amd64: rework AP startup
Konstantin Belousov
kib at FreeBSD.org
Mon Aug 23 23:22:20 UTC 2021
The branch stable/13 has been updated by kib:
URL: https://cgit.FreeBSD.org/src/commit/?id=c946f699856f6737a5256d7c9f746ac8035339ee
commit c946f699856f6737a5256d7c9f746ac8035339ee
Author: Konstantin Belousov <kib at FreeBSD.org>
AuthorDate: 2021-07-10 19:38:42 +0000
Commit: Konstantin Belousov <kib at FreeBSD.org>
CommitDate: 2021-08-23 23:21:12 +0000
amd64: rework AP startup
(cherry picked from commit d6717f877872e62d9df1e0ce2d8856620c993924)
---
sys/amd64/amd64/machdep.c | 4 +-
sys/amd64/amd64/mp_machdep.c | 187 ++++++++++++++++---------------------------
sys/amd64/amd64/mpboot.S | 64 +++++++--------
sys/amd64/include/smp.h | 3 +-
sys/x86/x86/mp_x86.c | 5 --
sys/x86/xen/pv.c | 1 -
6 files changed, 96 insertions(+), 168 deletions(-)
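The heart of the rework is in native_start_all_aps() below: the hand-built
trampoline page tables parked below 4G are gone, and the function instead
installs a transient 1:1 mapping of the low 4G directly into the kernel
pmap, built from four page-directory pages of 2MB mappings (4 pages x 512
PDEs x 2MB = 4G). A minimal userspace model of that arithmetic, with the
kernel's constant values restated locally (a sketch for illustration, not
kernel code):

/*
 * Userspace model of the identity map built in native_start_all_aps():
 * four page-directory pages, each holding 512 PDEs that map 2MB
 * superpages, covering the low 4G 1:1.  The constants mirror the
 * kernel's definitions but are restated here; this is a sketch only.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PDRSHIFT  21                    /* 2MB superpage shift */
#define NPDEPG    512                   /* PDEs per page-directory page */
#define NBPDP     (1ULL << 30)          /* bytes mapped by one PDP slot: 1G */

#define X86_PG_V  0x001                 /* valid */
#define X86_PG_RW 0x002                 /* writable */
#define X86_PG_A  0x020                 /* accessed */
#define X86_PG_M  0x040                 /* modified */
#define PG_PS     0x080                 /* superpage */

int
main(void)
{
        static uint64_t pd[4][NPDEPG];
        uint64_t pa;
        int i, j;

        /* Fill each PD page exactly as the four loops in the patch do. */
        for (j = 0; j < 4; j++)
                for (i = 0; i < NPDEPG; i++)
                        pd[j][i] = (j * NBPDP + ((uint64_t)i << PDRSHIFT)) |
                            X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;

        /* The result must be a 1:1 mapping of [0, 4G). */
        for (pa = 0; pa < 4 * NBPDP; pa += 1ULL << PDRSHIFT) {
                j = pa / NBPDP;
                i = (pa % NBPDP) >> PDRSHIFT;
                assert((pd[j][i] & ~0xfffULL) == pa);
        }
        printf("4 PD pages x %d PDEs x 2MB = %lluG identity mapped\n",
            NPDEPG, 4 * NBPDP >> 30);
        return (0);
}

With the identity window in place, an AP can run on the real kernel %cr3
from the moment paging is enabled, which is what lets the old replicated
trampoline page tables be deleted.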
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 93030cbe7126..840570be534a 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -187,7 +187,6 @@ struct init_ops init_ops = {
.early_delay = i8254_delay,
.parse_memmap = native_parse_memmap,
#ifdef SMP
- .mp_bootaddress = mp_bootaddress,
.start_all_aps = native_start_all_aps,
#endif
#ifdef DEV_PCI
@@ -1288,8 +1287,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
* is configured to support APs and APs for the system start
* in real mode (e.g. SMP bare metal).
*/
- if (init_ops.mp_bootaddress)
- init_ops.mp_bootaddress(physmap, &physmap_idx);
+ alloc_ap_trampoline(physmap, &physmap_idx);
/* call pmap initialization to make new kernel address space */
pmap_bootstrap(&first);
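Note the shape of the machdep.c change: the optional init_ops.mp_bootaddress
hook is retired and alloc_ap_trampoline() is called unconditionally, since
trampoline page-table placement no longer differs per backend. A minimal
model of the retired optional-hook pattern (struct and names here are
illustrative, not the kernel's full init_ops):

/*
 * Minimal model of the optional-hook pattern being removed; the struct
 * and function names are illustrative, not the kernel's definitions.
 */
#include <stdio.h>

struct init_ops_model {
        void (*mp_bootaddress)(void);   /* old: may be NULL */
};

static void
alloc_ap_trampoline_model(void)
{
        puts("trampoline allocated");
}

int
main(void)
{
        struct init_ops_model ops = { .mp_bootaddress = NULL };

        /* Old way: call through the hook only if the backend set one. */
        if (ops.mp_bootaddress != NULL)
                ops.mp_bootaddress();

        /* New way: every backend needs the trampoline, call it directly. */
        alloc_ap_trampoline_model();
        return (0);
}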
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index d1064262891f..082a58ada48f 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -106,6 +106,7 @@ char *dbg_stack;
void *bootpcpu;
extern u_int mptramp_la57;
+extern u_int mptramp_nx;
/*
* Local data and functions.
@@ -113,86 +114,6 @@ extern u_int mptramp_la57;
static int start_ap(int apic_id);
-static bool
-is_kernel_paddr(vm_paddr_t pa)
-{
-
- return (pa >= trunc_2mpage(btext - KERNBASE) &&
- pa < round_page(_end - KERNBASE));
-}
-
-static bool
-is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
-{
-
- return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
-}
-
-/*
- * Calculate usable address in base memory for AP trampoline code.
- */
-void
-mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
-{
- vm_paddr_t start, end;
- unsigned int i;
- bool allocated;
-
- alloc_ap_trampoline(physmap, physmap_idx);
-
- /*
- * Find a memory region big enough below the 4GB boundary to
- * store the initial page tables. Region must be mapped by
- * the direct map.
- *
- * Note that it needs to be aligned to a page boundary.
- */
- allocated = false;
- for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
- /*
- * First, try to chomp at the start of the physmap region.
- * Kernel binary might claim it already.
- */
- start = round_page(physmap[i]);
- end = start + AP_BOOTPT_SZ;
- if (start < end && end <= physmap[i + 1] &&
- is_mpboot_good(start, end) &&
- !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
- allocated = true;
- physmap[i] = end;
- break;
- }
-
- /*
- * Second, try to chomp at the end. Again, check
- * against kernel.
- */
- end = trunc_page(physmap[i + 1]);
- start = end - AP_BOOTPT_SZ;
- if (start < end && start >= physmap[i] &&
- is_mpboot_good(start, end) &&
- !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
- allocated = true;
- physmap[i + 1] = start;
- break;
- }
- }
- if (allocated) {
- mptramp_pagetables = start;
- if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
- memmove(&physmap[i], &physmap[i + 2],
- sizeof(*physmap) * (*physmap_idx - i + 2));
- *physmap_idx -= 2;
- }
- } else {
- mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
- if (bootverbose)
- printf(
-"Cannot find enough space for the initial AP page tables, placing them at %#x",
- mptramp_pagetables);
- }
-}
-
/*
* Initialize the IPI handlers and start up the AP's.
*/
@@ -244,6 +165,9 @@ cpu_mp_start(void)
assign_cpu_ids();
mptramp_la57 = la57;
+ mptramp_nx = pg_nx != 0;
+ MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
+ mptramp_pagetables = kernel_pmap->pm_cr3;
/* Start each Application Processor */
init_ops.start_all_aps();
@@ -398,55 +322,67 @@ mp_realloc_pcpu(int cpuid, int domain)
int
native_start_all_aps(void)
{
- u_int64_t *pt5, *pt4, *pt3, *pt2;
+ vm_page_t m_pml4, m_pdp, m_pd[4];
+ pml5_entry_t old_pml45;
+ pml4_entry_t *v_pml4;
+ pdp_entry_t *v_pdp;
+ pd_entry_t *v_pd;
u_int32_t mpbioswarmvec;
- int apic_id, cpu, domain, i, xo;
+ int apic_id, cpu, domain, i;
u_char mpbiosreason;
mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
- /* copy the AP 1st level boot code */
- bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
-
- /* Locate the page tables, they'll be below the trampoline */
+ /* Create a transient 1:1 mapping of low 4G */
if (la57) {
- pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
- xo = 1;
+ m_pml4 = pmap_page_alloc_below_4g(true);
+ v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
} else {
- xo = 0;
+ v_pml4 = &kernel_pmap->pm_pmltop[0];
}
- pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
- pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
- pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
-
- /* Create the initial 1GB replicated page tables */
- for (i = 0; i < 512; i++) {
- if (la57) {
- pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
- PAGE_SIZE);
- pt5[i] |= PG_V | PG_RW | PG_U;
- }
-
- /*
- * Each slot of the level 4 pages points to the same
- * level 3 page.
- */
- pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
- (xo + 1) * PAGE_SIZE);
- pt4[i] |= PG_V | PG_RW | PG_U;
-
- /*
- * Each slot of the level 3 pages points to the same
- * level 2 page.
- */
- pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
- ((xo + 2) * PAGE_SIZE));
- pt3[i] |= PG_V | PG_RW | PG_U;
-
- /* The level 2 page slots are mapped with 2MB pages for 1GB. */
- pt2[i] = i * (2 * 1024 * 1024);
- pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+ m_pdp = pmap_page_alloc_below_4g(true);
+ v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+ m_pd[0] = pmap_page_alloc_below_4g(false);
+ v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0]));
+ for (i = 0; i < NPDEPG; i++)
+ v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M | PG_PS;
+ m_pd[1] = pmap_page_alloc_below_4g(false);
+ v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1]));
+ for (i = 0; i < NPDEPG; i++)
+ v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW |
+ X86_PG_A | X86_PG_M | PG_PS;
+ m_pd[2] = pmap_page_alloc_below_4g(false);
+ v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2]));
+ for (i = 0; i < NPDEPG; i++)
+ v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+ m_pd[3] = pmap_page_alloc_below_4g(false);
+ v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3]));
+ for (i = 0; i < NPDEPG; i++)
+ v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+ v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ old_pml45 = kernel_pmap->pm_pmltop[0];
+ if (la57) {
+ kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) |
+ X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
}
+ v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ pmap_invalidate_all(kernel_pmap);
+
+ /* copy the AP 1st level boot code */
+ bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
+ if (bootverbose)
+ printf("AP boot address %#x\n", boot_address);
/* save the current value of the warm-start vector */
if (!efi_boot)
@@ -517,6 +453,17 @@ native_start_all_aps(void)
outb(CMOS_REG, BIOS_RESET);
outb(CMOS_DATA, mpbiosreason);
+ /* Destroy transient 1:1 mapping */
+ kernel_pmap->pm_pmltop[0] = old_pml45;
+ invlpg(0);
+ if (la57)
+ vm_page_free(m_pml4);
+ vm_page_free(m_pd[3]);
+ vm_page_free(m_pd[2]);
+ vm_page_free(m_pd[1]);
+ vm_page_free(m_pd[0]);
+ vm_page_free(m_pdp);
+
/* number of APs actually started */
return (mp_naps);
}
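Two details of the hunk above are worth calling out. The
MPASS(kernel_pmap->pm_cr3 < (1UL << 32)) assertion exists because the
trampoline loads %cr3 while still in 32-bit protected mode, so only the low
32 bits of the page-table root survive; the transient pages themselves come
from pmap_page_alloc_below_4g() for the same reason. And the teardown
restores pm_pmltop[0] and flushes before freeing the transient pages. A
small sketch of the truncation the assertion guards against (the values are
made up for illustration):

/*
 * Sketch of why kernel_pmap->pm_cr3 must fit in 32 bits; the values
 * below are illustrative only.
 */
#include <assert.h>
#include <stdint.h>

/* Model the 32-bit "movl %eax, %cr3" the trampoline performs. */
static uint64_t
protmode_load_cr3(uint64_t pm_cr3)
{
        return ((uint32_t)pm_cr3);      /* the upper 32 bits are lost */
}

int
main(void)
{
        uint64_t ok = 0x0dead000ULL;    /* page-table root below 4G */
        uint64_t bad = 0x100000000ULL;  /* root at exactly 4G */

        assert(protmode_load_cr3(ok) == ok);    /* usable as-is */
        assert(protmode_load_cr3(bad) != bad);  /* silently truncated */
        return (0);
}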
diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S
index afdcffa573a4..1b5657d3bef8 100644
--- a/sys/amd64/amd64/mpboot.S
+++ b/sys/amd64/amd64/mpboot.S
@@ -95,12 +95,25 @@ protmode:
* is later enabled.
*/
mov %cr4, %eax
- orl $CR4_PAE, %eax
+ orl $(CR4_PAE | CR4_PGE), %eax
cmpb $0, mptramp_la57-mptramp_start(%ebx)
je 1f
orl $CR4_LA57, %eax
1: mov %eax, %cr4
+ /*
+ * If the BSP reported NXE support, enable EFER.NXE for all APs
+ * prior to loading %cr3. This avoids page faults if the AP
+ * encounters memory marked with the NX bit prior to detecting and
+ * enabling NXE support.
+ */
+ cmpb $0,mptramp_nx-mptramp_start(%ebx)
+ je 2f
+ movl $MSR_EFER, %ecx
+ rdmsr
+ orl $EFER_NXE, %eax
+ wrmsr
+2:
/*
* Enable EFER.LME so that we get long mode when all the prereqs are
* in place. In this case, it turns on when CR0_PG is finally enabled.
@@ -112,12 +125,13 @@ protmode:
wrmsr
/*
- * Point to the embedded page tables for startup. Note that this
- * only gets accessed after we're actually in 64 bit mode, however
- * we can only set the bottom 32 bits of %cr3 in this state. This
- * means we are required to use a temporary page table that is below
- * the 4GB limit. %ebx is still our relocation base. We could just
- * subtract 3 * PAGE_SIZE, but that would be too easy.
+ * Load kernel page table pointer into %cr3.
+ * %ebx is still our relocation base.
+ *
+ * Note that this only gets accessed after we're actually in 64 bit
+ * mode, however we can only set the bottom 32 bits of %cr3 in this
+ * state. This means we depend on the kernel page table being
+ * allocated from the low 4G.
*/
leal mptramp_pagetables-mptramp_start(%ebx),%eax
movl (%eax), %eax
@@ -155,10 +169,8 @@ jmp_64:
/*
* Yeehar! We're running in 64 bit mode! We can mostly ignore our
* segment registers, and get on with it.
- * Note that we are running at the correct virtual address, but with
- * a 1:1 1GB mirrored mapping over entire address space. We had better
- * switch to a real %cr3 promptly so that we can get to the direct map
- * space. Remember that jmp is relative and that we've been relocated,
+ * We are running in the correct virtual address space.
+ * Note that the jmp is relative and that we've been relocated,
* so use an indirect jump.
*/
.code64
@@ -220,6 +232,10 @@ mptramp_pagetables:
mptramp_la57:
.long 0
+ .globl mptramp_nx
+mptramp_nx:
+ .long 0
+
/*
* The pseudo descriptor for lgdt to use.
*/
@@ -243,32 +259,6 @@ bootMP_size:
.code64
.p2align 4,0
entry_64:
- /*
- * If the BSP reported NXE support, enable EFER.NXE for all APs
- * prior to loading %cr3. This avoids page faults if the AP
- * encounters memory marked with the NX bit prior to detecting and
- * enabling NXE support.
- */
- movq pg_nx, %rbx
- testq %rbx, %rbx
- je 1f
- movl $MSR_EFER, %ecx
- rdmsr
- orl $EFER_NXE, %eax
- wrmsr
-
-1:
- /*
- * Load a real %cr3 that has all the direct map stuff and switches
- * off the 1GB replicated mirror. Load a stack pointer and jump
- * into AP startup code in C.
- */
- cmpl $0, la57
- jne 2f
- movq KPML4phys, %rax
- jmp 3f
-2: movq KPML5phys, %rax
-3: movq %rax, %cr3
movq bootSTK, %rsp
/*
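The mpboot.S hunks move the EFER.NXE enable from entry_64 (after paging is
on) to protmode (before %cr3 is loaded). With NXE clear, bit 63 in a PTE is
reserved, so an AP walking kernel page tables that already carry NX bits
would take a reserved-bit page fault. A C sketch of the read-modify-write,
with a plain variable standing in for the privileged MSR (rdmsr/wrmsr cannot
run in userspace; the MSR number and bit positions are the architectural
values):

/*
 * Sketch of the EFER read-modify-write now done in protmode.  A plain
 * variable stands in for the real, privileged MSR; this is a model of
 * the logic, not executable ring-0 code.
 */
#include <assert.h>
#include <stdint.h>

#define MSR_EFER  0xc0000080u
#define EFER_LME  (1u << 8)             /* long mode enable */
#define EFER_NXE  (1u << 11)            /* no-execute enable */

static uint64_t fake_efer;              /* stand-in for the real MSR */

static uint64_t
rdmsr_model(uint32_t msr)
{
        (void)msr;
        return (fake_efer);
}

static void
wrmsr_model(uint32_t msr, uint64_t v)
{
        (void)msr;
        fake_efer = v;
}

int
main(void)
{
        int mptramp_nx = 1;             /* BSP reported NX support */

        /* protmode: enable NXE before %cr3 is loaded, as the patch does. */
        if (mptramp_nx)
                wrmsr_model(MSR_EFER, rdmsr_model(MSR_EFER) | EFER_NXE);

        /* ...then EFER.LME, then %cr3, then CR0.PG... */
        assert(fake_efer & EFER_NXE);
        return (0);
}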
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 8fbd89da0e57..84ee73cef723 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -38,8 +38,7 @@ inthand_t
IDTVEC(rendezvous_pti);
void invlop_handler(void);
-int native_start_all_aps(void);
-void mp_bootaddress(vm_paddr_t *, unsigned int *);
+int native_start_all_aps(void);
#endif /* !LOCORE */
#endif /* SMP */
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index f1c1e45e79b8..441a766f87fb 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1070,11 +1070,6 @@ init_secondary_tail(void)
}
#ifdef __amd64__
- /*
- * Enable global pages TLB extension
- * This also implicitly flushes the TLB
- */
- load_cr4(rcr4() | CR4_PGE);
if (pmap_pcid_enabled)
load_cr4(rcr4() | CR4_PCIDE);
load_ds(_udatasel);
diff --git a/sys/x86/xen/pv.c b/sys/x86/xen/pv.c
index 2fd698772f9d..59c5b464aace 100644
--- a/sys/x86/xen/pv.c
+++ b/sys/x86/xen/pv.c
@@ -134,7 +134,6 @@ struct init_ops xen_pvh_init_ops = {
.early_delay = xen_delay,
.parse_memmap = xen_pvh_parse_memmap,
#ifdef SMP
- .mp_bootaddress = mp_bootaddress,
.start_all_aps = native_start_all_aps,
#endif
.msi_init = msi_init,