expanding amd64 past the 1TB limit
Chris Torek
torek at torek.net
Wed Jun 26 16:11:51 UTC 2013
(Note: Last week I asked about this on the freebsd-current list.
It turned out to be slightly harder than I thought, as the 512GB kernel
virtual area is based on what fits into a single L4 page table
entry.)
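(For the arithmetic: one L4 entry covers 512 PDP entries x 512 PD
entries x 512 PT entries x 4KB pages = 2^39 bytes = 512GB. The old
layout used one such slot for the kernel map and two for the direct
map, which is where the 512GB and 1TB limits come from.)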
I was asked to expand the kernel limits for amd64 systems. While
I do not have a system with enough RAM to test this for real, the
changes below seem to boot and run OK.
I went just a little bit wild in create_pagetables(). :-) The
lines with the casts got long (and hard to read) so I shortened
them (but I still needed the map I drew of the page tables...).
If using ptoa() like this is OK, there should probably be a few
more of those, e.g., in the changes to pmap_pinit().
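(For reference, ptoa() is just the page-frame-to-byte conversion;
roughly the following, though not quoted verbatim from the headers:

    #define ptoa(x)  ((unsigned long)(x) << PAGE_SHIFT)

so ptoa(i) is a more readable spelling of i << PAGE_SHIFT.)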
Anyway, I wonder if some form of this patch (perhaps even without
the #ifdefs) might be accepted back. I'm not sure about the KPML4BASE
name, but it clearly needs to be different from KPML4I. (At first
I was considering moving KERNBASE too but the branch offsets seem
to be the real limiting factor here.)
Possibly dumb question: around the comment "this replaces some of
the KPTphys entries above", would it be possible to reclaim a few
pages by calculating in advance where the 2MB page mappings obviate
the need for the underlying KPTphys pages, and just offset things?
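(For concreteness, a hypothetical way to count them: each 2MB PDE
written by that loop shadows exactly one of the 4KB page table pages
allocated at KPTphys, so the number of reclaimable pages would be
roughly

    /* hypothetical count; howmany() and NBPDR come from the
       existing headers */
    int reclaimable = howmany(*firstaddr, NBPDR);

modulo whatever the superpage demotion code expects to find
underneath.)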
Another note: one could get rid of the "power of 2" requirement
for NDMPML4E. It arises from the translation between direct-mapped
virtual and physical addresses, which is done with a bitwise OR and
AND-NOT pair; the same result can be achieved by adding and
subtracting an offset, which would allow the base and limit to be
arbitrary rather than a power of two. (Still, it did not seem
worth doing here.)
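To make that concrete, the two shapes look like this (the mask-based
pair is sketched from the existing PHYS_TO_DMAP()/DMAP_TO_PHYS()
macros in vmparam.h; the offset-based pair is hypothetical):

    /* mask-based: requires the direct map size to be a power of two
       (with the base aligned to it) so that OR and AND-NOT are
       exact inverses */
    #define PHYS_TO_DMAP(x)  ((x) | DMAP_MIN_ADDRESS)
    #define DMAP_TO_PHYS(x)  ((x) & ~DMAP_MIN_ADDRESS)

    /* offset-based (hypothetical): correct for an arbitrary base
       and size */
    #define PHYS_TO_DMAP(x)  ((x) + DMAP_MIN_ADDRESS)
    #define DMAP_TO_PHYS(x)  ((x) - DMAP_MIN_ADDRESS)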
Chris
diff --git a/amd64/amd64/pmap.c b/amd64/amd64/pmap.c
index 272158d..acf5af2 100644
--- a/amd64/amd64/pmap.c
+++ b/amd64/amd64/pmap.c
@@ -534,6 +534,10 @@ static void
create_pagetables(vm_paddr_t *firstaddr)
{
int i, j, ndm1g, nkpdpe;
+ pt_entry_t *pt_p;
+ pd_entry_t *pd_p;
+ pdp_entry_t *pdp_p;
+ pml4_entry_t *p4_p;
/* Allocate page table pages for the direct map */
ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
@@ -556,6 +560,10 @@ create_pagetables(vm_paddr_t *firstaddr)
* bootstrap. We defer this until after all memory-size dependent
* allocations are done (e.g. direct map), so that we don't have to
* build in too much slop in our estimate.
+ *
+ * Note that when NKPML4E > 1, we have an empty page underneath
+ * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
+ * pages. (pmap_enter requires a PD page to exist for each KPML4E.)
*/
nkpt_init(*firstaddr);
nkpdpe = NKPDPE(nkpt);
@@ -564,32 +572,26 @@ create_pagetables(vm_paddr_t *firstaddr)
KPDphys = allocpages(firstaddr, nkpdpe);
/* Fill in the underlying page table pages */
- /* Read-only from zero to physfree */
+ /* Nominally read-only (but really R/W) from zero to physfree */
/* XXX not fully used, underneath 2M pages */
- for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
- ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
- ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
- }
+ pt_p = (pt_entry_t *)KPTphys;
+ for (i = 0; ptoa(i) < *firstaddr; i++)
+ pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G;
/* Now map the page tables at their location within PTmap */
- for (i = 0; i < nkpt; i++) {
- ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
- ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
- }
+ pd_p = (pd_entry_t *)KPDphys;
+ for (i = 0; i < nkpt; i++)
+ pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
/* Map from zero to end of allocations under 2M pages */
/* This replaces some of the KPTphys entries above */
- for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
- ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
- ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
- }
+ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
+ pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G;
- /* And connect up the PD to the PDP */
- for (i = 0; i < nkpdpe; i++) {
- ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
- (i << PAGE_SHIFT);
- ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
- }
+ /* And connect up the PD to the PDP (leaving room for L4 pages) */
+ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
+ for (i = 0; i < nkpdpe; i++)
+ pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U;
/*
* Now, set up the direct map region using 2MB and/or 1GB pages. If
@@ -599,37 +601,41 @@ create_pagetables(vm_paddr_t *firstaddr)
* memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
* that are partially used.
*/
+ pd_p = (pd_entry_t *)DMPDphys;
for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
- ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
+ pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
+ pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G |
PG_M | PG_A;
}
+ pdp_p = (pdp_entry_t *)DMPDPphys;
for (i = 0; i < ndm1g; i++) {
- ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
+ pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
+ pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G |
PG_M | PG_A;
}
for (j = 0; i < ndmpdp; i++, j++) {
- ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
- ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
+ pdp_p[i] = DMPDphys + ptoa(j);
+ pdp_p[i] |= PG_RW | PG_V | PG_U;
}
/* And recursively map PML4 to itself in order to get PTmap */
- ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
- ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
+ p4_p = (pml4_entry_t *)KPML4phys;
+ p4_p[PML4PML4I] = KPML4phys;
+ p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U;
/* Connect the Direct Map slot(s) up to the PML4. */
for (i = 0; i < NDMPML4E; i++) {
- ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
- (i << PAGE_SHIFT);
- ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+ p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
+ p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U;
}
- /* Connect the KVA slot up to the PML4 */
- ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
- ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
+ /* Connect the KVA slots up to the PML4 */
+ for (i = 0; i < NKPML4E; i++) {
+ p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
+ p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U;
+ }
}
/*
@@ -1688,7 +1694,10 @@ pmap_pinit(pmap_t pmap)
pagezero(pmap->pm_pml4);
/* Wire in kernel global address entries. */
- pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
+ for (i = 0; i < NKPML4E; i++) {
+ pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) |
+ PG_RW | PG_V | PG_U;
+ }
for (i = 0; i < NDMPML4E; i++) {
pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
PG_RW | PG_V | PG_U;
@@ -1944,7 +1953,8 @@ pmap_release(pmap_t pmap)
m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
- pmap->pm_pml4[KPML4I] = 0; /* KVA */
+ for (i = 0; i < NKPML4E; i++) /* KVA */
+ pmap->pm_pml4[KPML4BASE + i] = 0;
for (i = 0; i < NDMPML4E; i++) /* Direct Map */
pmap->pm_pml4[DMPML4I + i] = 0;
pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
diff --git a/amd64/include/pmap.h b/amd64/include/pmap.h
index 6d76ec3..58d1c9d 100644
--- a/amd64/include/pmap.h
+++ b/amd64/include/pmap.h
@@ -113,7 +113,17 @@
((unsigned long)(l2) << PDRSHIFT) | \
((unsigned long)(l1) << PAGE_SHIFT))
-#define NKPML4E 1 /* number of kernel PML4 slots */
+/*
+ * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so,
+ * but setting it larger than NDMPML4E makes no sense.
+ *
+ * Each slot provides .5 TB of kernel virtual space.
+ */
+#ifdef AMD64_HUGE
+#define NKPML4E 16
+#else
+#define NKPML4E 1
+#endif
#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */
#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */
@@ -121,20 +131,39 @@
/*
* NDMPML4E is the number of PML4 entries that are used to implement the
- * direct map. It must be a power of two.
+ * direct map. It must be a power of two, and should generally exceed
+ * NKPML4E. The maximum possible value is 64; using 128 will make the
+ * direct map intrude into the recursive page table map.
*/
+#ifdef AMD64_HUGE
+#define NDMPML4E 32
+#else
#define NDMPML4E 2
+#endif
/*
- * The *PDI values control the layout of virtual memory. The starting address
+ * These values control the layout of virtual memory. The starting address
* of the direct map, which is controlled by DMPML4I, must be a multiple of
* its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
+ *
+ * Note: KPML4I is the index of the (single) level 4 page that maps
+ * the KVA that holds KERNBASE, while KPML4BASE is the index of the
+ * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E
+ * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra
+ * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to
+ * KERNBASE. Similarly, if KPML4I < KPML4BASE + NKPML4E - 1, extra L4
+ * PDEs are needed to map above KERNBASE, up to VM_MAX_KERNEL_ADDRESS.
+ *
+ * (KPML4I combines with KPDPI to choose where KERNBASE starts.
+ * (KPML4I combines with KPDPI to choose where KERNBASE starts.
+ * Or, in other words, KPML4I provides bits 39..47 of KERNBASE,
+ * and KPDPI provides bits 30..38.)
*/
#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */
-#define KPML4I (NPML4EPG-1) /* Top 512GB for KVM */
-#define DMPML4I rounddown(KPML4I - NDMPML4E, NDMPML4E) /* Below KVM */
+#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */
+#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
+#define KPML4I (NPML4EPG-1)
#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */
/*
diff --git a/amd64/include/vmparam.h b/amd64/include/vmparam.h
index 33f62bd..47a8ef8 100644
--- a/amd64/include/vmparam.h
+++ b/amd64/include/vmparam.h
@@ -145,18 +145,26 @@
* 0x0000000000000000 - 0x00007fffffffffff user map
* 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole)
* 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot)
+#ifdef AMD64_HUGE
+ * 0xffff804020101000 - 0xffffdfffffffffff unused
+ * 0xffffe00000000000 - 0xffffefffffffffff 16TB direct map
+ * 0xfffff00000000000 - 0xfffff7ffffffffff unused
+ * 0xfffff80000000000 - 0xffffffffffffffff 8TB kernel map
+#else
* 0xffff804020101000 - 0xfffffdffffffffff unused
* 0xfffffe0000000000 - 0xfffffeffffffffff 1TB direct map
* 0xffffff0000000000 - 0xffffff7fffffffff unused
* 0xffffff8000000000 - 0xffffffffffffffff 512GB kernel map
+#endif
*
* Within the kernel map:
*
* 0xffffffff80000000 KERNBASE
*/
-#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4I, NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4I, NPDPEPG-512, 0, 0)
+#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4BASE, 0, 0, 0)
+#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4BASE + NKPML4E - 1, \
+ NPDPEPG-1, NPDEPG-1, NPTEPG-1)
#define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0)
#define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
diff --git a/conf/options.amd64 b/conf/options.amd64
index 90348b7..f3ce505 100644
--- a/conf/options.amd64
+++ b/conf/options.amd64
@@ -1,6 +1,7 @@
# $FreeBSD$
# Options specific to AMD64 platform kernels
+AMD64_HUGE opt_global.h
AUTO_EOI_1 opt_auto_eoi.h
AUTO_EOI_2 opt_auto_eoi.h
COUNT_XINVLTLB_HITS opt_smp.h
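For anyone checking the arithmetic behind the AMD64_HUGE address map
in the vmparam.h comment above (each PML4 slot covers 2^39 bytes,
i.e. 512GB):

    KPML4BASE = 512 - 16 = 496
        -> kernel map base = KVADDR(496,0,0,0) = 0xfffff80000000000
    kernel map size = 16 slots * 512GB = 8TB
    DMPML4I = rounddown(496 - 32, 32) = 448
        -> direct map base = KVADDR(448,0,0,0) = 0xffffe00000000000
    direct map size = 32 slots * 512GB = 16TB

which matches the ranges shown in the patch.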