git: c70276686572 - stable/12 - arm64: Print per-CPU cache summary

From: Justin Hibbits <jhibbits_at_FreeBSD.org>
Date: Wed, 22 Jun 2022 15:57:40 UTC
The branch stable/12 has been updated by jhibbits:

URL: https://cgit.FreeBSD.org/src/commit/?id=c70276686572fa4976babed8aa936ed3d3f2a52d

commit c70276686572fa4976babed8aa936ed3d3f2a52d
Author:     Justin Hibbits <jhibbits@FreeBSD.org>
AuthorDate: 2022-05-31 15:40:20 +0000
Commit:     Justin Hibbits <jhibbits@FreeBSD.org>
CommitDate: 2022-06-22 15:46:40 +0000

    arm64: Print per-CPU cache summary
    
    Summary:
    It can be useful to see a summary of CPU caches on bootup.  This is done
    for most platforms already, so add this to arm64, in the following form
    (taken from a test on an Apple M1 Pro):
    
      L1 cache: 192KB (instruction), 128KB (data)
      L2 cache: 12288KB (unified)
    
    This is printed out per-CPU, only under bootverbose.
    
    Future refinements could instead determine whether a cache level is
    shared with other cores (L2 is shared among cores on some SoCs, for
    instance), and compute the true total cache sizes.  For instance, it's
    known that the M1 Pro, on which this test was done, has two 12MB L2
    clusters, for a total of 24MB.  Seeing each CPU with 12288KB of L2 would
    make one think that there is 12MB * NCPUs, for possibly 120MB of cache,
    which is incorrect.
    
    Sponsored by:   Juniper Networks, Inc.
    Reviewed by:    #arm64, andrew
    Differential Revision: https://reviews.freebsd.org/D35366
    
    (cherry picked from commit 139ba152c9c91fad9b63ccd2382a80f753f217b9)
---
 sys/arm64/arm64/identcpu.c | 83 +++++++++++++++++++++++++++++++++++++++++++++-
 sys/arm64/include/armreg.h | 30 +++++++++++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/sys/arm64/arm64/identcpu.c b/sys/arm64/arm64/identcpu.c
index 0b376cd02286..ed92cf412b33 100644
--- a/sys/arm64/arm64/identcpu.c
+++ b/sys/arm64/arm64/identcpu.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
 static int ident_lock;
 static void print_cpu_features(u_int cpu);
 static u_long parse_cpu_features_hwcap(u_int cpu);
+static void print_cpu_caches(struct sbuf *sb, u_int cpu);
 
 char machine[] = "arm64";
 
@@ -59,6 +60,8 @@ static char cpu_model[64];
 SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD,
 	cpu_model, sizeof(cpu_model), "Machine model");
 
+#define	MAX_CACHES	8	/* Maximum number of cache levels supported
+				   architecturally. */
 /*
  * Per-CPU affinity as provided in MPIDR_EL1
  * Indexed by CPU number in logical order selected by the system.
@@ -92,6 +95,8 @@ struct cpu_desc {
 	uint64_t	id_aa64mmfr2;
 	uint64_t	id_aa64pfr0;
 	uint64_t	id_aa64pfr1;
+	uint64_t	clidr;
+	uint64_t	ccsidr[MAX_CACHES][2]; /* 2 types; 64-bit for CCIDX. */
 };
 
 struct cpu_desc cpu_desc[MAXCPU];
@@ -520,6 +525,62 @@ parse_cpu_features_hwcap(u_int cpu)
 	return (hwcap);
 }
 
+static void
+print_cpu_cache(u_int cpu, struct sbuf *sb, uint64_t ccs, bool icache,
+    bool unified)
+{
+	size_t cache_size;	/* sets * ways, then scaled to bytes */
+	size_t line_size;	/* cache line size decoded from ccs */
+
+	/* LineSize holds log2(line size) - 4, hence the "+ 4" below. */
+	line_size = 1 << ((ccs & CCSIDR_LineSize_MASK) + 4);
+	/*
+	 * Cache size is sets * ways * line size.  The set and way fields use
+	 * the wider *_64 layout when the CCIDX bit is set in this CPU's saved
+	 * ID_AA64MMFR2; one is added to each decoded field before multiplying.
+	 */
+	if ((cpu_desc[cpu].id_aa64mmfr2 & ID_AA64MMFR2_CCIDX_64))
+		cache_size = (CCSIDR_NSETS_64(ccs) + 1) *
+		    (CCSIDR_ASSOC_64(ccs) + 1);
+	else
+		cache_size = (CCSIDR_NSETS(ccs) + 1) * (CCSIDR_ASSOC(ccs) + 1);
+	/* Scale to bytes, then append "<n>KB (<kind>)" to sb. */
+	cache_size *= line_size;
+	sbuf_printf(sb, "%zuKB (%s)", cache_size / 1024,
+	    icache ? "instruction" : unified ? "unified" : "data");
+}
+
+static void
+print_cpu_caches(struct sbuf *sb, u_int cpu)
+{
+	/* One " L<n> cache: ..." line per level; bounded by ccsidr[] size. */
+	uint64_t clidr;
+	int i;
+	clidr = cpu_desc[cpu].clidr;
+
+	for (i = 0; i < MAX_CACHES && (clidr & CLIDR_CTYPE_MASK) != 0;
+	    i++, clidr >>= 3) {
+		int j = 0;
+		int ctype_m = (clidr & CLIDR_CTYPE_MASK);
+
+		sbuf_printf(sb, " L%d cache: ", i + 1);
+		if ((clidr & CLIDR_CTYPE_IO)) {
+			print_cpu_cache(cpu, sb, cpu_desc[cpu].ccsidr[i][j++],
+			    true, false);
+			/* A data/unified cache follows on the same line. */
+			if ((ctype_m & ~CLIDR_CTYPE_IO) != 0)
+				sbuf_printf(sb, ", ");
+		}
+		if ((ctype_m & ~CLIDR_CTYPE_IO) != 0) {
+			print_cpu_cache(cpu, sb, cpu_desc[cpu].ccsidr[i][j],
+			    false, (clidr & CLIDR_CTYPE_UNIFIED) != 0);
+		}
+		sbuf_printf(sb, "\n");
+	}
+	sbuf_finish(sb);
+	printf("%s", sbuf_data(sb));
+}
+
 static void
 print_cpu_features(u_int cpu)
 {
@@ -1351,6 +1412,8 @@ print_cpu_features(u_int cpu)
 		printf("         Auxiliary Features 1 = <%#lx>\n",
 		    cpu_desc[cpu].id_aa64afr1);
 	}
+	if (bootverbose)
+		print_cpu_caches(sb, cpu);
 
 	sbuf_delete(sb);
 	sb = NULL;
@@ -1360,6 +1423,7 @@ print_cpu_features(u_int cpu)
 void
 identify_cpu(void)
 {
+	uint64_t clidr;
 	u_int midr;
 	u_int impl_id;
 	u_int part_id;
@@ -1407,7 +1471,6 @@ identify_cpu(void)
 	/* Save affinity for current CPU */
 	cpu_desc[cpu].mpidr = get_mpidr();
 	CPU_AFFINITY(cpu) = cpu_desc[cpu].mpidr & CPU_AFF_MASK;
-
 	cpu_desc[cpu].id_aa64dfr0 = READ_SPECIALREG(ID_AA64DFR0_EL1);
 	cpu_desc[cpu].id_aa64dfr1 = READ_SPECIALREG(ID_AA64DFR1_EL1);
 	cpu_desc[cpu].id_aa64isar0 = READ_SPECIALREG(ID_AA64ISAR0_EL1);
@@ -1418,6 +1481,24 @@ identify_cpu(void)
 	cpu_desc[cpu].id_aa64pfr0 = READ_SPECIALREG(ID_AA64PFR0_EL1);
 	cpu_desc[cpu].id_aa64pfr1 = READ_SPECIALREG(ID_AA64PFR1_EL1);
 
+	cpu_desc[cpu].clidr = READ_SPECIALREG(clidr_el1);
+
+	clidr = cpu_desc[cpu].clidr;
+	for (int i = 0; i < MAX_CACHES && (clidr & CLIDR_CTYPE_MASK) != 0;
+	    i++, clidr >>= 3) {
+		int j = 0;	/* Slot 0 is the I-cache when one exists. */
+		if ((clidr & CLIDR_CTYPE_IO)) {
+			WRITE_SPECIALREG(CSSELR_EL1,
+			    CSSELR_Level(i) | CSSELR_InD);
+			cpu_desc[cpu].ccsidr[i][j++] =
+			    READ_SPECIALREG(CCSIDR_EL1);
+		}
+		if ((clidr & CLIDR_CTYPE_MASK & ~CLIDR_CTYPE_IO) == 0)
+			continue;	/* No data/unified cache here. */
+		WRITE_SPECIALREG(CSSELR_EL1, CSSELR_Level(i));
+		cpu_desc[cpu].ccsidr[i][j] = READ_SPECIALREG(CCSIDR_EL1);
+	}
+
 	if (cpu != 0) {
 		/*
 		 * This code must run on one cpu at a time, but we are
diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h
index d528f1af2377..c3fc7da46c6e 100644
--- a/sys/arm64/include/armreg.h
+++ b/sys/arm64/include/armreg.h
@@ -45,6 +45,32 @@
 
 #define	UL(x)	UINT64_C(x)
 
+/* CCSIDR_EL1 - Cache Size ID Register ("64" variants: FEAT_CCIDX layout) */
+#define	CCSIDR_NumSets_MASK	0x0FFFE000	/* bits 27:13 */
+#define	CCSIDR_NumSets64_MASK	0x00FFFFFF00000000	/* bits 55:32 */
+#define	CCSIDR_NumSets_SHIFT	13
+#define	CCSIDR_NumSets64_SHIFT	32
+#define	CCSIDR_Assoc_MASK	0x00001FF8	/* bits 12:3 */
+#define	CCSIDR_Assoc64_MASK	0x0000000000FFFFF8	/* bits 23:3 */
+#define	CCSIDR_Assoc_SHIFT	3
+#define	CCSIDR_Assoc64_SHIFT	3
+#define	CCSIDR_LineSize_MASK	0x7	/* bits 2:0 */
+#define	CCSIDR_NSETS(idr)						\
+	(((idr) & CCSIDR_NumSets_MASK) >> CCSIDR_NumSets_SHIFT)
+#define	CCSIDR_ASSOC(idr)						\
+	(((idr) & CCSIDR_Assoc_MASK) >> CCSIDR_Assoc_SHIFT)
+#define	CCSIDR_NSETS_64(idr)						\
+	(((idr) & CCSIDR_NumSets64_MASK) >> CCSIDR_NumSets64_SHIFT)
+#define	CCSIDR_ASSOC_64(idr)						\
+	(((idr) & CCSIDR_Assoc64_MASK) >> CCSIDR_Assoc64_SHIFT)
+
+/* CLIDR_EL1 - Cache Level ID Register; one 3-bit Ctype field per level */
+#define	CLIDR_CTYPE_MASK	0x7	/* Cache type mask bits */
+#define	CLIDR_CTYPE_IO		0x1	/* Instruction only */
+#define	CLIDR_CTYPE_DO		0x2	/* Data only */
+#define	CLIDR_CTYPE_ID		0x3	/* Split instruction and data */
+#define	CLIDR_CTYPE_UNIFIED	0x4	/* Unified */
+
 /* CNTHCTL_EL2 - Counter-timer Hypervisor Control register */
 #define	CNTHCTL_EVNTI_MASK	(0xf << 4) /* Bit to trigger event stream */
 #define	CNTHCTL_EVNTDIR		(1 << 3) /* Control transition trigger bit */
@@ -60,6 +86,10 @@
 #define	 CPACR_FPEN_TRAP_NONE	(0x3 << 20) /* No traps */
 #define	CPACR_TTA		(0x1 << 28)
 
+/* CSSELR_EL1 - Cache size selection register */
+#define	CSSELR_Level(i)		((i) << 1)	/* i is zero-based */
+#define	CSSELR_InD		0x00000001	/* Select the I-cache */
+
 /* CTR_EL0 - Cache Type Register */
 #define	CTR_DLINE_SHIFT		16
 #define	CTR_DLINE_MASK		(0xf << CTR_DLINE_SHIFT)