git: 3863fec1ce2d - main - lib/libc/aarch64/string: add strlen SIMD implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 10 Jan 2025 15:04:00 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=3863fec1ce2dc6033f094a085118605ea89db9e2

commit 3863fec1ce2dc6033f094a085118605ea89db9e2
Author:     Getz Mikalsen <getz@FreeBSD.org>
AuthorDate: 2024-08-26 19:54:32 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-01-10 15:02:40 +0000

    lib/libc/aarch64/string: add strlen SIMD implementation
    
    Adds a SIMD enhanced strlen for Aarch64. It takes inspiration from
    the amd64 implementation but I struggled getting the performance I
    had hoped for on cores like the Graviton3 when compared to the
    existing implementation from Arm Optimized Routines.
    
    See the DR for benchmark results.
    
    Tested by:      fuz (exprun)
    Reviewed by:    fuz, emaste
    Sponsored by:   Google LLC (GSoC 2024)
    PR:             281175
    Differential Revision: https://reviews.freebsd.org/D45623
---
 lib/libc/aarch64/string/Makefile.inc |  4 ++--
 lib/libc/aarch64/string/strlen.S     | 46 ++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index f8c67319fe12..7325b54d9716 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -14,7 +14,6 @@ AARCH64_STRING_FUNCS= \
 	strchr \
 	strchrnul \
 	strcpy \
-	strlen \
 	strnlen \
 	strrchr
 
@@ -30,7 +29,8 @@ MDSRCS+= \
 	strncmp.S \
 	memccpy.S \
 	strncat.c \
-	strlcat.c
+	strlcat.c \
+	strlen.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
diff --git a/lib/libc/aarch64/string/strlen.S b/lib/libc/aarch64/string/strlen.S
new file mode 100644
index 000000000000..7bfac7f4b1e1
--- /dev/null
+++ b/lib/libc/aarch64/string/strlen.S
@@ -0,0 +1,46 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+
+	.weak	strlen			// declare strlen as a weak symbol
+	.set	strlen, __strlen	// strlen has the same value as __strlen
+	.text				// what follows is assembled into .text
+
+ENTRY(__strlen)			// in: x0 = s; out: x0 = offset of first NUL
+	bic	x10, x0, #0xf		// aligned src
+	and	x9, x0, #0xf		// x9 = misalignment of s (0..15)
+	ldr	q0, [x10]		// aligned chunk; may hold bytes before s
+	cmeq	v0.16b, v0.16b, #0	// 0xff in every byte lane holding NUL
+	shrn	v0.8b, v0.8h, #4	// 4 mask bits per byte, 64 bits total
+	fmov	x1, d0
+	cbz	x9, .Laligned
+	lsl	x2, x0, #2		// get the byte offset (lsr uses low 6 bits)
+	lsr	x1, x1, x2		// shift by offset index
+	cbz	x1, .Lloop		// no NUL at or after s in first chunk
+	rbit	x1, x1
+	clz	x0, x1			// bit index of lowest set mask bit
+	lsr	x0, x0, #2		// 4 mask bits per byte
+	ret
+
+.Laligned:
+	cbnz	x1, .Ldone		// x10 == x0 here
+
+.Lloop:
+	ldr	q0, [x10, #16]!		// advance x10 by 16, then load
+	cmeq	v0.16b, v0.16b, #0
+	shrn	v0.8b, v0.8h, #4	// reduce to fit mask in GPR
+	// integer test: with FPCR.FZ set, fcmp reads denormal-pattern masks as 0.0
+	fmov	x1, d0
+	cbz	x1, .Lloop		// no NUL in this chunk
+.Ldone:
+	sub	x0, x10, x0		// offset of current chunk from s
+	rbit	x1, x1			// reverse bits as NEON has no ctz
+	clz	x3, x1			// bit index of lowest set mask bit
+	lsr	x3, x3, #2		// 4 mask bits per byte
+	add	x0, x0, x3		// chunk offset + index of NUL in chunk
+	ret
+END(__strlen)