git: 2c5961fff4e5 - stable/14 - arm64: Initial SVE support

From: Andrew Turner <andrew_at_FreeBSD.org>
Date: Mon, 21 Oct 2024 15:05:10 UTC
The branch stable/14 has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=2c5961fff4e5f31079d69e57105132ef9685ca53

commit 2c5961fff4e5f31079d69e57105132ef9685ca53
Author:     Andrew Turner <andrew@FreeBSD.org>
AuthorDate: 2024-09-27 13:36:35 +0000
Commit:     Andrew Turner <andrew@FreeBSD.org>
CommitDate: 2024-10-21 15:03:26 +0000

    arm64: Initial SVE support
    
    Add initial kernel support for SVE. This detects whether SVE is present
    on all CPUs and, if so, allows SVE to be used in the future.
    
    As the SVE registers are a superset of the VFP registers, we don't need
    to restore the VFP registers when SVE is enabled.
    
    The interface to enable SVE is provided, but not used until SVE is
    supported in signals and with ptrace.
    
    Sponsored by:   Arm Ltd
    Differential Revision:  https://reviews.freebsd.org/D43306
    
    (cherry picked from commit 332c426328dbb30a6b2e69d9b1e8298d77d85bd1)
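
    The superset relationship noted above is visible in the new
    vfp_to_sve_sync()/sve_to_vfp_sync() helpers in the diff below: the low
    128 bits of each SVE Z register alias the corresponding VFP V register,
    so syncing the two is a plain copy. A standalone sketch of that copy
    against the saved-buffer layout this patch uses (illustrative only, not
    kernel code):

        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>

        /*
         * In the saved SVE buffer the Z registers are stored back to
         * back, each sve_len bytes long; the low 16 bytes of Z[n] are
         * the VFP register V[n].
         */
        static void
        sve_z_to_vfp_v(const uint8_t *sve_buf, size_t sve_len, int n,
            uint8_t v[16])
        {
            memcpy(v, sve_buf + (size_t)n * sve_len, 16);
        }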
---
 sys/arm64/arm64/exec_machdep.c |   2 +
 sys/arm64/arm64/trap.c         |   3 +-
 sys/arm64/arm64/vfp.c          | 606 +++++++++++++++++++++++++++++++++++++++--
 sys/arm64/include/armreg.h     |   7 +
 sys/arm64/include/pcb.h        |   8 +-
 sys/arm64/include/vfp.h        |   6 +
 6 files changed, 610 insertions(+), 22 deletions(-)
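
As a worked example of the save-area arithmetic in the new _sve_buf_size()
below: assuming a hypothetical 64-byte (512-bit) vector length, the buffer
holds 64 * 32 = 2048 bytes of Z registers, (64 * 17) / 8 = 136 bytes of
predicate registers plus the FFR, and two uint64_t slots for FPSR/FPCR,
2200 bytes in all. A standalone sketch of the same calculation:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors _sve_buf_size() from this patch. */
    static size_t
    sve_buf_size_demo(unsigned sve_len)
    {
        return ((size_t)sve_len * 32 +        /* 32 Z registers */
            ((size_t)sve_len * 17) / 8 +      /* 16 P registers + FFR */
            sizeof(uint64_t) * 2);            /* FPSR and FPCR */
    }

    int
    main(void)
    {
        printf("%zu\n", sve_buf_size_demo(64));    /* prints 2200 */
        return (0);
    }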

diff --git a/sys/arm64/arm64/exec_machdep.c b/sys/arm64/arm64/exec_machdep.c
index bc4ee178db23..31191ec21602 100644
--- a/sys/arm64/arm64/exec_machdep.c
+++ b/sys/arm64/arm64/exec_machdep.c
@@ -607,6 +607,8 @@ sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
 	if (copyin(uap->sigcntxp, &uc, sizeof(uc)))
 		return (EFAULT);
 
+	/* Stop an interrupt from causing the SVE state to be dropped */
+	td->td_sa.code = -1;
 	error = set_mcontext(td, &uc.uc_mcontext);
 	if (error != 0)
 		return (error);
diff --git a/sys/arm64/arm64/trap.c b/sys/arm64/arm64/trap.c
index 69c5cd73ade8..78582f5ac563 100644
--- a/sys/arm64/arm64/trap.c
+++ b/sys/arm64/arm64/trap.c
@@ -730,7 +730,8 @@ do_el0_sync(struct thread *td, struct trapframe *frame)
 		break;
 	}
 
-	KASSERT((td->td_pcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0,
+	KASSERT(
+	    (td->td_pcb->pcb_fpflags & ~(PCB_FP_USERMASK|PCB_FP_SVEVALID)) == 0,
 	    ("Kernel VFP flags set while entering userspace"));
 	KASSERT(
 	    td->td_pcb->pcb_fpusaved == &td->td_pcb->pcb_fpustate,
diff --git a/sys/arm64/arm64/vfp.c b/sys/arm64/arm64/vfp.c
index c65108a83399..d57927991c03 100644
--- a/sys/arm64/arm64/vfp.c
+++ b/sys/arm64/arm64/vfp.c
@@ -30,11 +30,13 @@
 #ifdef VFP
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/eventhandler.h>
 #include <sys/limits.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
+#include <sys/smp.h>
 
 #include <vm/uma.h>
 
@@ -60,6 +62,63 @@ struct fpu_kern_ctx {
 static uma_zone_t fpu_save_area_zone;
 static struct vfpstate *fpu_initialstate;
 
+static u_int sve_max_vector_len;
+
+static size_t
+_sve_buf_size(u_int sve_len)
+{
+	size_t len;
+
+	/* 32 vector registers */
+	len = (size_t)sve_len * 32;
+	/*
+	 * 16 predicate registers and the First Fault Register (FFR), each
+	 * 1/8th the size of a vector register.
+	 */
+	len += ((size_t)sve_len * 17) / 8;
+	/*
+	 * FPSR and FPCR
+	 */
+	len += sizeof(uint64_t) * 2;
+
+	return (len);
+}
+
+size_t
+sve_max_buf_size(void)
+{
+	MPASS(sve_max_vector_len > 0);
+	return (_sve_buf_size(sve_max_vector_len));
+}
+
+size_t
+sve_buf_size(struct thread *td)
+{
+	struct pcb *pcb;
+
+	pcb = td->td_pcb;
+	MPASS(pcb->pcb_svesaved != NULL);
+	MPASS(pcb->pcb_sve_len > 0);
+
+	return (_sve_buf_size(pcb->pcb_sve_len));
+}
+
+static void *
+sve_alloc(void)
+{
+	void *buf;
+
+	buf = malloc(sve_max_buf_size(), M_FPUKERN_CTX, M_WAITOK | M_ZERO);
+
+	return (buf);
+}
+
+static void
+sve_free(void *buf)
+{
+	free(buf, M_FPUKERN_CTX);
+}
+
 void
 vfp_enable(void)
 {
@@ -71,13 +130,30 @@ vfp_enable(void)
 	isb();
 }
 
+static void
+sve_enable(void)
+{
+	uint32_t cpacr;
+
+	cpacr = READ_SPECIALREG(cpacr_el1);
+	/* Enable FP */
+	cpacr = (cpacr & ~CPACR_FPEN_MASK) | CPACR_FPEN_TRAP_NONE;
+	/* Enable SVE */
+	cpacr = (cpacr & ~CPACR_ZEN_MASK) | CPACR_ZEN_TRAP_NONE;
+	WRITE_SPECIALREG(cpacr_el1, cpacr);
+	isb();
+}
+
 void
 vfp_disable(void)
 {
 	uint32_t cpacr;
 
 	cpacr = READ_SPECIALREG(cpacr_el1);
+	/* Disable FP */
 	cpacr = (cpacr & ~CPACR_FPEN_MASK) | CPACR_FPEN_TRAP_ALL1;
+	/* Disable SVE */
+	cpacr = (cpacr & ~CPACR_ZEN_MASK) | CPACR_ZEN_TRAP_ALL1;
 	WRITE_SPECIALREG(cpacr_el1, cpacr);
 	isb();
 }
@@ -171,9 +247,266 @@ vfp_restore(struct vfpstate *state)
 }
 
 static void
-vfp_save_state_common(struct thread *td, struct pcb *pcb)
+sve_store(void *state, u_int sve_len)
+{
+	vm_offset_t f_start, p_start, z_start;
+	uint64_t fpcr, fpsr;
+
+	/*
+	 * Calculate the start of each register group. There are three
+	 * groups depending on size, with the First Fault Register (FFR)
+	 * stored with the predicate registers as we use one of them to
+	 * temporarily hold it.
+	 *
+	 *                 +-------------------------+-------------------+
+	 *                 | Contents                | Register size     |
+	 *      z_start -> +-------------------------+-------------------+
+	 *                 |                         |                   |
+	 *                 | 32 Z regs               | sve_len           |
+	 *                 |                         |                   |
+	 *      p_start -> +-------------------------+-------------------+
+	 *                 |                         |                   |
+	 *                 | 16 Predicate registers  | 1/8 size of Z reg |
+	 *                 |  1 First Fault register |                   |
+	 *                 |                         |                   |
+	 *      f_start -> +-------------------------+-------------------+
+	 *                 |                         |                   |
+	 *                 | FPSR/FPCR               | 32 bit            |
+	 *                 |                         |                   |
+	 *                 +-------------------------+-------------------+
+	 */
+	z_start = (vm_offset_t)state;
+	p_start = z_start + sve_len * 32;
+	f_start = p_start + (sve_len / 8) * 17;
+
+	__asm __volatile(
+	    ".arch_extension sve				\n"
+	    "str	z0, [%0, #0, MUL VL]			\n"
+	    "str	z1, [%0, #1, MUL VL]			\n"
+	    "str	z2, [%0, #2, MUL VL]			\n"
+	    "str	z3, [%0, #3, MUL VL]			\n"
+	    "str	z4, [%0, #4, MUL VL]			\n"
+	    "str	z5, [%0, #5, MUL VL]			\n"
+	    "str	z6, [%0, #6, MUL VL]			\n"
+	    "str	z7, [%0, #7, MUL VL]			\n"
+	    "str	z8, [%0, #8, MUL VL]			\n"
+	    "str	z9, [%0, #9, MUL VL]			\n"
+	    "str	z10, [%0, #10, MUL VL]			\n"
+	    "str	z11, [%0, #11, MUL VL]			\n"
+	    "str	z12, [%0, #12, MUL VL]			\n"
+	    "str	z13, [%0, #13, MUL VL]			\n"
+	    "str	z14, [%0, #14, MUL VL]			\n"
+	    "str	z15, [%0, #15, MUL VL]			\n"
+	    "str	z16, [%0, #16, MUL VL]			\n"
+	    "str	z17, [%0, #17, MUL VL]			\n"
+	    "str	z18, [%0, #18, MUL VL]			\n"
+	    "str	z19, [%0, #19, MUL VL]			\n"
+	    "str	z20, [%0, #20, MUL VL]			\n"
+	    "str	z21, [%0, #21, MUL VL]			\n"
+	    "str	z22, [%0, #22, MUL VL]			\n"
+	    "str	z23, [%0, #23, MUL VL]			\n"
+	    "str	z24, [%0, #24, MUL VL]			\n"
+	    "str	z25, [%0, #25, MUL VL]			\n"
+	    "str	z26, [%0, #26, MUL VL]			\n"
+	    "str	z27, [%0, #27, MUL VL]			\n"
+	    "str	z28, [%0, #28, MUL VL]			\n"
+	    "str	z29, [%0, #29, MUL VL]			\n"
+	    "str	z30, [%0, #30, MUL VL]			\n"
+	    "str	z31, [%0, #31, MUL VL]			\n"
+	    /* Store the predicate registers */
+	    "str	p0, [%1, #0, MUL VL]			\n"
+	    "str	p1, [%1, #1, MUL VL]			\n"
+	    "str	p2, [%1, #2, MUL VL]			\n"
+	    "str	p3, [%1, #3, MUL VL]			\n"
+	    "str	p4, [%1, #4, MUL VL]			\n"
+	    "str	p5, [%1, #5, MUL VL]			\n"
+	    "str	p6, [%1, #6, MUL VL]			\n"
+	    "str	p7, [%1, #7, MUL VL]			\n"
+	    "str	p8, [%1, #8, MUL VL]			\n"
+	    "str	p9, [%1, #9, MUL VL]			\n"
+	    "str	p10, [%1, #10, MUL VL]			\n"
+	    "str	p11, [%1, #11, MUL VL]			\n"
+	    "str	p12, [%1, #12, MUL VL]			\n"
+	    "str	p13, [%1, #13, MUL VL]			\n"
+	    "str	p14, [%1, #14, MUL VL]			\n"
+	    "str	p15, [%1, #15, MUL VL]			\n"
+	    ".arch_extension nosve				\n"
+	    : : "r"(z_start), "r"(p_start));
+
+	/* Save the FFR if needed */
+	/* TODO: Skip if in SME streaming mode (when supported) */
+	__asm __volatile(
+	    ".arch_extension sve				\n"
+	    "rdffr	p0.b					\n"
+	    "str	p0, [%0, #16, MUL VL]			\n"
+	/*
+	 * Load the old p0 value to ensure it is consistent if we enable
+	 * without calling sve_restore, e.g. switch to a kernel thread and
+	 * back.
+	 */
+	    "ldr	p0, [%0, #0, MUL VL]			\n"
+	    ".arch_extension nosve				\n"
+	    : : "r"(p_start));
+
+	__asm __volatile(
+	    ".arch_extension fp					\n"
+	    "mrs	%0, fpsr				\n"
+	    "mrs	%1, fpcr				\n"
+	    "stp	%w0, %w1, [%2]				\n"
+	    ".arch_extension nofp				\n"
+	    : "=&r"(fpsr), "=&r"(fpcr) : "r"(f_start));
+}
+
+static void
+sve_restore(void *state, u_int sve_len)
+{
+	vm_offset_t f_start, p_start, z_start;
+	uint64_t fpcr, fpsr;
+
+	/* See sve_store for the layout of the state buffer */
+	z_start = (vm_offset_t)state;
+	p_start = z_start + sve_len * 32;
+	f_start = p_start + (sve_len / 8) * 17;
+
+	__asm __volatile(
+	    ".arch_extension sve				\n"
+	    "ldr	p0, [%0, #16, MUL VL]			\n"
+	    "wrffr	p0.b					\n"
+	    ".arch_extension nosve				\n"
+	    : : "r"(p_start));
+
+	__asm __volatile(
+	    ".arch_extension sve				\n"
+	    "ldr	z0, [%0, #0, MUL VL]			\n"
+	    "ldr	z1, [%0, #1, MUL VL]			\n"
+	    "ldr	z2, [%0, #2, MUL VL]			\n"
+	    "ldr	z3, [%0, #3, MUL VL]			\n"
+	    "ldr	z4, [%0, #4, MUL VL]			\n"
+	    "ldr	z5, [%0, #5, MUL VL]			\n"
+	    "ldr	z6, [%0, #6, MUL VL]			\n"
+	    "ldr	z7, [%0, #7, MUL VL]			\n"
+	    "ldr	z8, [%0, #8, MUL VL]			\n"
+	    "ldr	z9, [%0, #9, MUL VL]			\n"
+	    "ldr	z10, [%0, #10, MUL VL]			\n"
+	    "ldr	z11, [%0, #11, MUL VL]			\n"
+	    "ldr	z12, [%0, #12, MUL VL]			\n"
+	    "ldr	z13, [%0, #13, MUL VL]			\n"
+	    "ldr	z14, [%0, #14, MUL VL]			\n"
+	    "ldr	z15, [%0, #15, MUL VL]			\n"
+	    "ldr	z16, [%0, #16, MUL VL]			\n"
+	    "ldr	z17, [%0, #17, MUL VL]			\n"
+	    "ldr	z18, [%0, #18, MUL VL]			\n"
+	    "ldr	z19, [%0, #19, MUL VL]			\n"
+	    "ldr	z20, [%0, #20, MUL VL]			\n"
+	    "ldr	z21, [%0, #21, MUL VL]			\n"
+	    "ldr	z22, [%0, #22, MUL VL]			\n"
+	    "ldr	z23, [%0, #23, MUL VL]			\n"
+	    "ldr	z24, [%0, #24, MUL VL]			\n"
+	    "ldr	z25, [%0, #25, MUL VL]			\n"
+	    "ldr	z26, [%0, #26, MUL VL]			\n"
+	    "ldr	z27, [%0, #27, MUL VL]			\n"
+	    "ldr	z28, [%0, #28, MUL VL]			\n"
+	    "ldr	z29, [%0, #29, MUL VL]			\n"
+	    "ldr	z30, [%0, #30, MUL VL]			\n"
+	    "ldr	z31, [%0, #31, MUL VL]			\n"
+	    /* Load the predicate registers */
+	    "ldr	p0, [%1, #0, MUL VL]			\n"
+	    "ldr	p1, [%1, #1, MUL VL]			\n"
+	    "ldr	p2, [%1, #2, MUL VL]			\n"
+	    "ldr	p3, [%1, #3, MUL VL]			\n"
+	    "ldr	p4, [%1, #4, MUL VL]			\n"
+	    "ldr	p5, [%1, #5, MUL VL]			\n"
+	    "ldr	p6, [%1, #6, MUL VL]			\n"
+	    "ldr	p7, [%1, #7, MUL VL]			\n"
+	    "ldr	p8, [%1, #8, MUL VL]			\n"
+	    "ldr	p9, [%1, #9, MUL VL]			\n"
+	    "ldr	p10, [%1, #10, MUL VL]			\n"
+	    "ldr	p11, [%1, #11, MUL VL]			\n"
+	    "ldr	p12, [%1, #12, MUL VL]			\n"
+	    "ldr	p13, [%1, #13, MUL VL]			\n"
+	    "ldr	p14, [%1, #14, MUL VL]			\n"
+	    "ldr	p15, [%1, #15, MUL VL]			\n"
+	    ".arch_extension nosve				\n"
+	    : : "r"(z_start), "r"(p_start));
+
+	__asm __volatile(
+	    ".arch_extension fp					\n"
+	    "ldp	%w0, %w1, [%2]				\n"
+	    "msr	fpsr, %0				\n"
+	    "msr	fpcr, %1				\n"
+	    ".arch_extension nofp				\n"
+	    : "=&r"(fpsr), "=&r"(fpcr) : "r"(f_start));
+}
+
+/*
+ * Sync the VFP registers to the SVE register state, e.g. in signal return
+ * when userspace may have changed the VFP register values and expects them
+ * to be used when the signal handler returns.
+ */
+void
+vfp_to_sve_sync(struct thread *td)
+{
+	struct pcb *pcb;
+	uint32_t *fpxr;
+
+	pcb = td->td_pcb;
+	if (pcb->pcb_svesaved == NULL)
+		return;
+
+	MPASS(pcb->pcb_fpusaved != NULL);
+
+	/* Copy the VFP registers to the SVE region */
+	for (int i = 0; i < nitems(pcb->pcb_fpusaved->vfp_regs); i++) {
+		__uint128_t *sve_reg;
+
+		sve_reg = (__uint128_t *)((uintptr_t)pcb->pcb_svesaved +
+		    i * pcb->pcb_sve_len);
+		*sve_reg = pcb->pcb_fpusaved->vfp_regs[i];
+	}
+
+	fpxr = (uint32_t *)((uintptr_t)pcb->pcb_svesaved +
+	    (32 * pcb->pcb_sve_len) + (17 * pcb->pcb_sve_len / 8));
+	fpxr[0] = pcb->pcb_fpusaved->vfp_fpsr;
+	fpxr[1] = pcb->pcb_fpusaved->vfp_fpcr;
+}
+
+/*
+ * Sync the SVE registers to the VFP register state.
+ */
+void
+sve_to_vfp_sync(struct thread *td)
+{
+	struct pcb *pcb;
+	uint32_t *fpxr;
+
+	pcb = td->td_pcb;
+	if (pcb->pcb_svesaved == NULL)
+		return;
+
+	MPASS(pcb->pcb_fpusaved == &pcb->pcb_fpustate);
+
+	/* Copy the SVE registers to the VFP saved state */
+	for (int i = 0; i < nitems(pcb->pcb_fpusaved->vfp_regs); i++) {
+		__uint128_t *sve_reg;
+
+		sve_reg = (__uint128_t *)((uintptr_t)pcb->pcb_svesaved +
+		    i * pcb->pcb_sve_len);
+		pcb->pcb_fpusaved->vfp_regs[i] = *sve_reg;
+	}
+
+	fpxr = (uint32_t *)((uintptr_t)pcb->pcb_svesaved +
+	    (32 * pcb->pcb_sve_len) + (17 * pcb->pcb_sve_len / 8));
+	pcb->pcb_fpusaved->vfp_fpsr = fpxr[0];
+	pcb->pcb_fpusaved->vfp_fpcr = fpxr[1];
+}
+
+static void
+vfp_save_state_common(struct thread *td, struct pcb *pcb, bool full_save)
 {
 	uint32_t cpacr;
+	bool save_sve;
+
+	save_sve = false;
 
 	critical_enter();
 	/*
@@ -181,14 +514,49 @@ vfp_save_state_common(struct thread *td, struct pcb *pcb)
 	 * i.e. return if we are trapping on FP access.
 	 */
 	cpacr = READ_SPECIALREG(cpacr_el1);
-	if ((cpacr & CPACR_FPEN_MASK) == CPACR_FPEN_TRAP_NONE) {
-		KASSERT(PCPU_GET(fpcurthread) == td,
-		    ("Storing an invalid VFP state"));
+	if ((cpacr & CPACR_FPEN_MASK) != CPACR_FPEN_TRAP_NONE)
+		goto done;
 
+	KASSERT(PCPU_GET(fpcurthread) == td,
+	    ("Storing an invalid VFP state"));
+
+	/*
+	 * Also save the SVE state. As SVE depends on the VFP being
+	 * enabled, we only need to check for SVE when the VFP unit
+	 * has been enabled.
+	 */
+	if ((cpacr & CPACR_ZEN_MASK) == CPACR_ZEN_TRAP_NONE) {
+		/* If SVE is enabled it should be valid */
+		MPASS((pcb->pcb_fpflags & PCB_FP_SVEVALID) != 0);
+
+		/*
+		 * If we are switching while in a system call, skip saving
+		 * the SVE registers. The ABI allows us to drop them over
+		 * any system call, but always doing so would mean disabling
+		 * SVE for all system calls and trapping the next use, which
+		 * is expensive in SVE-heavy userspace code. As an
+		 * optimisation, only drop the SVE state on context switch.
+		 */
+		if (td->td_frame == NULL ||
+		    (ESR_ELx_EXCEPTION(td->td_frame->tf_esr) != EXCP_SVC64 &&
+		    td->td_sa.code != (u_int)-1))
+			save_sve = true;
+	}
+
+	if (save_sve) {
+		KASSERT(pcb->pcb_svesaved != NULL,
+		    ("Storing to a NULL SVE state"));
+		sve_store(pcb->pcb_svesaved, pcb->pcb_sve_len);
+		if (full_save)
+			sve_to_vfp_sync(td);
+	} else {
+		pcb->pcb_fpflags &= ~PCB_FP_SVEVALID;
 		vfp_store(pcb->pcb_fpusaved);
-		dsb(ish);
-		vfp_disable();
 	}
+	dsb(ish);
+	vfp_disable();
+
+done:
 	critical_exit();
 }
 
@@ -199,7 +567,7 @@ vfp_save_state(struct thread *td, struct pcb *pcb)
 	KASSERT(pcb != NULL, ("NULL vfp pcb"));
 	KASSERT(td->td_pcb == pcb, ("Invalid vfp pcb"));
 
-	vfp_save_state_common(td, pcb);
+	vfp_save_state_common(td, pcb, true);
 }
 
 void
@@ -213,7 +581,7 @@ vfp_save_state_savectx(struct pcb *pcb)
 	MPASS(pcb->pcb_fpusaved == NULL);
 	pcb->pcb_fpusaved = &pcb->pcb_fpustate;
 
-	vfp_save_state_common(curthread, pcb);
+	vfp_save_state_common(curthread, pcb, true);
 }
 
 void
@@ -221,7 +589,7 @@ vfp_save_state_switch(struct thread *td)
 {
 	KASSERT(td != NULL, ("NULL vfp thread"));
 
-	vfp_save_state_common(td, td->td_pcb);
+	vfp_save_state_common(td, td->td_pcb, false);
 }
 
 /*
@@ -231,21 +599,40 @@ vfp_save_state_switch(struct thread *td)
 void
 vfp_new_thread(struct thread *newtd, struct thread *oldtd, bool fork)
 {
-	struct pcb *newpcb;
+	struct pcb *newpcb, *oldpcb;
 
 	newpcb = newtd->td_pcb;
+	oldpcb = oldtd->td_pcb;
 
 	/* Kernel threads start with clean VFP */
 	if ((oldtd->td_pflags & TDP_KTHREAD) != 0) {
 		newpcb->pcb_fpflags &=
-		    ~(PCB_FP_STARTED | PCB_FP_KERN | PCB_FP_NOSAVE);
+		    ~(PCB_FP_STARTED | PCB_FP_SVEVALID | PCB_FP_KERN |
+		      PCB_FP_NOSAVE);
 	} else {
 		MPASS((newpcb->pcb_fpflags & (PCB_FP_KERN|PCB_FP_NOSAVE)) == 0);
+
+		/*
+		 * The only SVE register state guaranteed to be preserved
+		 * across a system call is the lower bits of the Z registers,
+		 * as these alias the existing FP registers. Because we can
+		 * only create a new thread or fork through a system call,
+		 * it is safe to drop the SVE state in the new thread.
+		 */
+		newpcb->pcb_fpflags &= ~PCB_FP_SVEVALID;
 		if (!fork) {
 			newpcb->pcb_fpflags &= ~PCB_FP_STARTED;
 		}
 	}
 
+	newpcb->pcb_svesaved = NULL;
+	if (oldpcb->pcb_svesaved == NULL)
+		newpcb->pcb_sve_len = sve_max_vector_len;
+	else
+		KASSERT(newpcb->pcb_sve_len == oldpcb->pcb_sve_len,
+		    ("%s: pcb sve vector length differs: %x != %x", __func__,
+		    newpcb->pcb_sve_len, oldpcb->pcb_sve_len));
+
 	newpcb->pcb_fpusaved = &newpcb->pcb_fpustate;
 	newpcb->pcb_vfpcpu = UINT_MAX;
 }
@@ -272,23 +659,48 @@ vfp_reset_state(struct thread *td, struct pcb *pcb)
 	    ("pcb_fpusaved should point to pcb_fpustate."));
 	pcb->pcb_fpustate.vfp_fpcr = VFPCR_INIT;
 	pcb->pcb_fpustate.vfp_fpsr = 0;
+	/* XXX: Memory leak when using SVE between fork & exec? */
+	pcb->pcb_svesaved = NULL;
 	pcb->pcb_vfpcpu = UINT_MAX;
 	pcb->pcb_fpflags = 0;
 }
 
-void
-vfp_restore_state(void)
+static void
+vfp_restore_state_common(struct thread *td, int flags)
 {
 	struct pcb *curpcb;
 	u_int cpu;
+	bool restore_sve;
+
+	KASSERT(td == curthread, ("%s: Called with non-current thread",
+	    __func__));
 
 	critical_enter();
 
 	cpu = PCPU_GET(cpuid);
-	curpcb = curthread->td_pcb;
-	curpcb->pcb_fpflags |= PCB_FP_STARTED;
+	curpcb = td->td_pcb;
 
-	vfp_enable();
+	/*
+	 * If SVE has been used and the base VFP state is in use, then
+	 * restore the SVE registers. A non-base VFP state should only
+	 * be used by the kernel and SVE should only be used by userspace.
+	 */
+	restore_sve = false;
+	if ((curpcb->pcb_fpflags & PCB_FP_SVEVALID) != 0 &&
+	    curpcb->pcb_fpusaved == &curpcb->pcb_fpustate) {
+		MPASS(curpcb->pcb_svesaved != NULL);
+		/* SVE shouldn't be enabled in the kernel */
+		MPASS((flags & PCB_FP_KERN) == 0);
+		restore_sve = true;
+	}
+
+	if (restore_sve) {
+		MPASS((curpcb->pcb_fpflags & PCB_FP_SVEVALID) != 0);
+		sve_enable();
+	} else {
+		curpcb->pcb_fpflags |= PCB_FP_STARTED;
+		vfp_enable();
+	}
 
 	/*
 	 * If the previous thread on this cpu to use the VFP was not the
@@ -296,14 +708,104 @@ vfp_restore_state(void)
 	 * cpu we need to restore the old state.
 	 */
 	if (PCPU_GET(fpcurthread) != curthread || cpu != curpcb->pcb_vfpcpu) {
-		vfp_restore(curthread->td_pcb->pcb_fpusaved);
-		PCPU_SET(fpcurthread, curthread);
+		/*
+		 * The VFP registers are the lower 128 bits of the SVE
+		 * registers. Restore from the SVE saved state if SVE was
+		 * previously enabled.
+		 */
+		if (restore_sve) {
+			MPASS(td->td_pcb->pcb_svesaved != NULL);
+			sve_restore(td->td_pcb->pcb_svesaved,
+			    td->td_pcb->pcb_sve_len);
+		} else {
+			vfp_restore(td->td_pcb->pcb_fpusaved);
+		}
+		PCPU_SET(fpcurthread, td);
 		curpcb->pcb_vfpcpu = cpu;
 	}
 
 	critical_exit();
 }
 
+void
+vfp_restore_state(void)
+{
+	struct thread *td;
+
+	td = curthread;
+	vfp_restore_state_common(td, td->td_pcb->pcb_fpflags);
+}
+
+bool
+sve_restore_state(struct thread *td)
+{
+	struct pcb *curpcb;
+	void *svesaved;
+	uint64_t cpacr;
+
+	KASSERT(td == curthread, ("%s: Called with non-current thread",
+	    __func__));
+
+	curpcb = td->td_pcb;
+
+	/* The SVE state should alias the base VFP state */
+	MPASS(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate);
+
+	/* If SVE is not supported, tell the caller to raise a fault */
+	if (curpcb->pcb_sve_len == 0) {
+		/*
+		 * The init pcb is created before we read the vector length.
+		 * Set it to the default length.
+		 */
+		if (sve_max_vector_len == 0)
+			return (false);
+
+		MPASS(curpcb->pcb_svesaved == NULL);
+		curpcb->pcb_sve_len = sve_max_vector_len;
+	}
+
+	if (curpcb->pcb_svesaved == NULL) {
+		/* SVE should be disabled, so the saved state is invalid */
+		MPASS((curpcb->pcb_fpflags & PCB_FP_SVEVALID) == 0);
+
+		/*
+		 * Allocate the SVE buffer for this thread before entering
+		 * the critical section, as the allocation can sleep.
+		 */
+		svesaved = sve_alloc();
+
+		critical_enter();
+
+		/* Restore the VFP state if needed */
+		cpacr = READ_SPECIALREG(cpacr_el1);
+		if ((cpacr & CPACR_FPEN_MASK) != CPACR_FPEN_TRAP_NONE) {
+			vfp_restore_state_common(td, curpcb->pcb_fpflags);
+		}
+
+		/*
+		 * Set the flags only after enabling the VFP, as until then
+		 * the SVE saved state is invalid.
+		 */
+		curpcb->pcb_svesaved = svesaved;
+		curpcb->pcb_fpflags |= PCB_FP_SVEVALID;
+		sve_enable();
+
+		critical_exit();
+	} else {
+		vfp_restore_state_common(td, curpcb->pcb_fpflags);
+
+		/* Enable SVE if it wasn't previously enabled */
+		if ((curpcb->pcb_fpflags & PCB_FP_SVEVALID) == 0) {
+			critical_enter();
+			sve_enable();
+			curpcb->pcb_fpflags |= PCB_FP_SVEVALID;
+			critical_exit();
+		}
+	}
+
+	return (true);
+}
+
 void
 vfp_init_secondary(void)
 {
@@ -348,6 +850,74 @@ vfp_init(const void *dummy __unused)
 
 SYSINIT(vfp, SI_SUB_CPU, SI_ORDER_ANY, vfp_init, NULL);
 
+static void
+sve_thread_dtor(void *arg __unused, struct thread *td)
+{
+	sve_free(td->td_pcb->pcb_svesaved);
+}
+
+static void
+sve_pcpu_read(void *arg)
+{
+	u_int *len;
+	uint64_t vl;
+
+	len = arg;
+
+	/* Enable SVE to read zcr_el1 and VFP for rdvl */
+	sve_enable();
+
+	/* Set the longest vector length */
+	WRITE_SPECIALREG(ZCR_EL1_REG, ZCR_LEN_MASK);
+	isb();
+
+	/* Read the real vector length */
+	__asm __volatile(
+	    ".arch_extension sve	\n"
+	    "rdvl	%0, #1		\n"
+	    ".arch_extension nosve	\n"
+	    : "=&r"(vl));
+
+	vfp_disable();
+
+	len[PCPU_GET(cpuid)] = vl;
+}
+
+static void
+sve_init(const void *dummy __unused)
+{
+	u_int *len_list;
+	uint64_t reg;
+	int i;
+
+	if (!get_kernel_reg(ID_AA64PFR0_EL1, &reg))
+		return;
+
+	if (ID_AA64PFR0_SVE_VAL(reg) == ID_AA64PFR0_SVE_NONE)
+		return;
+
+	len_list = malloc(sizeof(*len_list) * (mp_maxid + 1), M_TEMP,
+	    M_WAITOK | M_ZERO);
+	smp_rendezvous(NULL, sve_pcpu_read, NULL, len_list);
+
+	sve_max_vector_len = ZCR_LEN_BYTES(ZCR_LEN_MASK);
+	CPU_FOREACH(i) {
+		if (bootverbose)
+			printf("CPU%d SVE vector length: %u\n", i, len_list[i]);
+		sve_max_vector_len = MIN(sve_max_vector_len, len_list[i]);
+	}
+	free(len_list, M_TEMP);
+
+	if (bootverbose)
+		printf("SVE with %u byte vectors\n", sve_max_vector_len);
+
+	if (sve_max_vector_len > 0) {
+		EVENTHANDLER_REGISTER(thread_dtor, sve_thread_dtor, NULL,
+		    EVENTHANDLER_PRI_ANY);
+	}
+}
+SYSINIT(sve, SI_SUB_SMP, SI_ORDER_ANY, sve_init, NULL);
+
 struct fpu_kern_ctx *
 fpu_kern_alloc_ctx(u_int flags)
 {
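
The commit message notes that the interface to enable SVE is provided but
not yet consumed. A minimal sketch of how an EL0 SVE-access trap handler
might later call sve_restore_state(); the EXCP_SVE exception class name
and the signal fallback are assumptions, not part of this commit:

    /*
     * Hypothetical, not in this patch: enable SVE on first use from
     * userspace, raising SIGILL when the hardware has no SVE.
     */
    static void
    el0_sve_access(struct thread *td, struct trapframe *frame,
        uint64_t esr)
    {
        if (!sve_restore_state(td)) {
            /* No SVE support: treat the instruction as undefined. */
            call_trapsignal(td, SIGILL, ILL_ILLTRP,
                (void *)frame->tf_elr, ESR_ELx_EXCEPTION(esr));
        }
    }
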
diff --git a/sys/arm64/include/armreg.h b/sys/arm64/include/armreg.h
index 8bf4445e13b3..2b92c036ef1c 100644
--- a/sys/arm64/include/armreg.h
+++ b/sys/arm64/include/armreg.h
@@ -2567,6 +2567,13 @@
 #define	VBAR_EL12_op2			0
 
 /* ZCR_EL1 - SVE Control Register */
+#define	ZCR_EL1			MRS_REG(ZCR_EL1)
+#define	ZCR_EL1_REG		MRS_REG_ALT_NAME(ZCR_EL1_REG)
+#define	ZCR_EL1_REG_op0		3
+#define	ZCR_EL1_REG_op1		0
+#define	ZCR_EL1_REG_CRn		1
+#define	ZCR_EL1_REG_CRm		2
+#define	ZCR_EL1_REG_op2		0
 #define	ZCR_LEN_SHIFT		0
 #define	ZCR_LEN_MASK		(0xf << ZCR_LEN_SHIFT)
 #define	ZCR_LEN_BYTES(x)	((((x) & ZCR_LEN_MASK) + 1) * 16)
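
The new ZCR_EL1 encoding is what sve_pcpu_read() in vfp.c drives: it
writes ZCR_LEN_MASK to request the largest vector length, then reads back
what the hardware actually granted with rdvl. The LEN field maps to a
vector length of (LEN + 1) * 16 bytes; a standalone sketch of the macro
with sample values:

    #include <stdio.h>

    /* Mirrors the ZCR_LEN_* definitions from this patch. */
    #define ZCR_LEN_MASK        0xf
    #define ZCR_LEN_BYTES(x)    ((((x) & ZCR_LEN_MASK) + 1) * 16)

    int
    main(void)
    {
        printf("%d\n", ZCR_LEN_BYTES(0));            /* 16 bytes (128-bit) */
        printf("%d\n", ZCR_LEN_BYTES(3));            /* 64 bytes (512-bit) */
        printf("%d\n", ZCR_LEN_BYTES(ZCR_LEN_MASK)); /* 256 bytes (2048-bit) */
        return (0);
    }
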
diff --git a/sys/arm64/include/pcb.h b/sys/arm64/include/pcb.h
index d7392d5f2032..273b53cdc6cf 100644
--- a/sys/arm64/include/pcb.h
+++ b/sys/arm64/include/pcb.h
@@ -59,17 +59,19 @@ struct pcb {
 	u_int		pcb_flags;
 #define	PCB_SINGLE_STEP_SHIFT	0
 #define	PCB_SINGLE_STEP		(1 << PCB_SINGLE_STEP_SHIFT)
-	uint32_t	pcb_pad1;
+	u_int		pcb_sve_len;	/* The SVE vector length */
 
 	struct vfpstate	*pcb_fpusaved;
 	int		pcb_fpflags;
 #define	PCB_FP_STARTED	0x00000001
+#define	PCB_FP_SVEVALID	0x00000002
 #define	PCB_FP_KERN	0x40000000
 #define	PCB_FP_NOSAVE	0x80000000
 /* The bits passed to userspace in get_fpcontext */
-#define	PCB_FP_USERMASK	(PCB_FP_STARTED)
+#define	PCB_FP_USERMASK	(PCB_FP_STARTED | PCB_FP_SVEVALID)
 	u_int		pcb_vfpcpu;	/* Last cpu this thread ran VFP code */
-	uint64_t	pcb_reserved[5];
+	void		*pcb_svesaved;
+	uint64_t	pcb_reserved[4];
 
 	/*
 	 * The userspace VFP state. The pcb_fpusaved pointer will point to
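
The pcb changes are arranged so the structure layout is preserved for the
stable branch: pcb_sve_len takes the place of the old pcb_pad1 field, and
the new pcb_svesaved pointer consumes one of the five reserved uint64_t
slots. A hypothetical compile-time check of that accounting on LP64 (not
part of the patch):

    #include <stdint.h>

    /*
     * On LP64 a pointer plus four uint64_t slots occupies exactly the
     * space of the five reserved uint64_t slots it replaces.
     */
    _Static_assert(sizeof(void *) + 4 * sizeof(uint64_t) ==
        5 * sizeof(uint64_t), "pcb reserved space changed size");
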
diff --git a/sys/arm64/include/vfp.h b/sys/arm64/include/vfp.h
index 47d068d6050c..fc93908add0b 100644
--- a/sys/arm64/include/vfp.h
+++ b/sys/arm64/include/vfp.h
@@ -80,6 +80,12 @@ void	vfp_restore_state(void);
 void	vfp_save_state(struct thread *, struct pcb *);
 void	vfp_save_state_savectx(struct pcb *);
 void	vfp_save_state_switch(struct thread *);
+void	vfp_to_sve_sync(struct thread *);
+void	sve_to_vfp_sync(struct thread *);
+
+size_t	sve_max_buf_size(void);
+size_t	sve_buf_size(struct thread *);
+bool	sve_restore_state(struct thread *);
 
 struct fpu_kern_ctx;
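
The new vfp.h declarations give later consumers (signal delivery and
ptrace, per the commit message) the per-thread buffer geometry. A
hypothetical sketch of how such a consumer might copy out a thread's
saved SVE state; none of this usage exists yet:

    /*
     * Hypothetical, not in this patch: size and copy a thread's saved
     * SVE state using the new helpers. Assumes the caller has stopped
     * the thread and that its state has been saved to pcb_svesaved.
     */
    static int
    sve_copy_out(struct thread *td, void *dst)
    {
        size_t len;

        if (td->td_pcb->pcb_svesaved == NULL)
            return (ENOENT);
        len = sve_buf_size(td);
        memcpy(dst, td->td_pcb->pcb_svesaved, len);
        return (0);
    }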