git: f2bd390a54f1 - main - lib/libc/aarch64/string: add strcspn optimized implementation

From: Robert Clausecker <fuz@FreeBSD.org>
Date: Fri, 10 Jan 2025 15:03:50 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=f2bd390a54f183f85dd7faab815740fb3bea9591

commit f2bd390a54f183f85dd7faab815740fb3bea9591
Author:     Getz Mikalsen <getz@FreeBSD.org>
AuthorDate: 2024-08-26 18:14:01 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-01-10 15:02:39 +0000

    lib/libc/aarch64/string: add strcspn optimized implementation
    
    This is a port of the scalar optimized variant of strcspn for amd64
    to aarch64. It uses a lookup table (LUT) to speed up the function;
    a SIMD variant is still under development. A rough C sketch of the
    approach follows the diffstat below.
    
    Performance benchmarks were, as usual, generated with strperf.
    
    See the Differential Revision for benchmark results.
    
    Tested by:      fuz (exprun)
    Reviewed by:    fuz, emaste
    Sponsored by:   Google LLC (GSoC 2024)
    PR:             281175
    Differential Revision: https://reviews.freebsd.org/D46398
---
 lib/libc/aarch64/string/Makefile.inc |   3 +-
 lib/libc/aarch64/string/strcspn.S    | 109 +++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)
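
In C terms, the lookup-table approach implemented by the assembly below
corresponds roughly to the following sketch (illustrative only; the
function name is hypothetical and this is not the committed code):

#include <stddef.h>

/* Illustrative sketch of the LUT technique; not the committed code. */
static size_t
strcspn_lut_sketch(const char *s, const char *set)
{
	unsigned char table[256] = { 0 };
	const unsigned char *p;

	/*
	 * Mark NUL and every member of the set; the scan below then
	 * stops at a set character and at the end of the string alike.
	 */
	table[0] = 1;
	for (p = (const unsigned char *)set; *p != '\0'; p++)
		table[*p] = 1;

	for (p = (const unsigned char *)s; table[*p] == 0; p++)
		;

	return ((size_t)((const char *)p - s));
}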

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index 09bfaef963eb..34483532a3dd 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -22,7 +22,8 @@ AARCH64_STRING_FUNCS= \
 # SIMD-enhanced routines not derived from Arm's code
 MDSRCS+= \
 	strcmp.S \
-	strspn.S
+	strspn.S \
+	strcspn.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
diff --git a/lib/libc/aarch64/string/strcspn.S b/lib/libc/aarch64/string/strcspn.S
new file mode 100644
index 000000000000..8f2d6d20f0f6
--- /dev/null
+++ b/lib/libc/aarch64/string/strcspn.S
@@ -0,0 +1,109 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+
+	.weak	strcspn
+	.set	strcspn, __strcspn
+	.text
+
+ENTRY(__strcspn)
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	mov	x15, #1			// preload register with 1 for stores
+
+	/* check for special cases */
+	ldrb	w4, [x1]		// first character in the set
+	cbz	w4, .Lstrlen
+
+	movi	v0.16b, #0		// zero vector for clearing the table
+
+	ldrb	w5, [x1, #1]		// second character in the set
+	cbz	w5, .Lstrchr
+
+	sub	sp, sp, #256		// allocate the lookup table on the stack
+
+	/* no special case matches -- prepare lookup table */
+	mov	w3, #20			// 6 iterations zero table[0..191]
+	.p2align 4
+0:	add	x9, sp, x3, lsl #3	// x9 = &table[8 * w3]
+	stp	xzr, xzr, [x9]
+	stp	xzr, xzr, [x9, #16]	// zero 32 bytes per iteration
+	subs	w3, w3, #4
+	b.cs	0b
+
+	/* zero the remaining table[192..255] with SIMD stores */
+	stp	q0, q0, [sp, #6*32]
+	stp	q0, q0, [sp, #7*32]
+
+	add	x1, x1, #2
+	strb	w15, [sp, x4]		// mark the first two set characters
+	strb	w15, [sp, x5]		// in the lookup table
+
+	mov	x4, x0			// stash a copy of src
+
+	/* process remaining chars in set */
+	.p2align 4
+0:	ldrb	w5, [x1]
+	strb	w15, [sp, x5]		// also marks table[0] once NUL is seen
+	cbz	w5, 1f			// end of set?
+
+	ldrb	w5, [x1, #1]
+	strb	w15, [sp, x5]
+	cbz	w5, 1f
+
+	add	x1, x1, #2
+	b	0b
+
+	/* find match */
+	.p2align 4
+1:	ldrb	w8, [x0]
+	ldrb	w9, [sp, x8]
+	cbnz	w9, 2f
+
+	ldrb	w8, [x0, #1]
+	ldrb	w9, [sp, x8]
+	cbnz	w9, 3f
+
+	ldrb	w8, [x0, #2]
+	ldrb	w9, [sp, x8]
+	cbnz	w9, 4f
+
+	ldrb	w8, [x0, #3]
+	ldrb	w9, [sp, x8]
+	add	x0, x0, #4
+	cbz	w9, 1b
+
+	sub	x0, x0, #3		// fix up return value for offset 3
+4:	sub	x4, x4, #1		// offsets 2--3: one more character
+3:	add	x0, x0, #1		// offsets 1--3: one more character
+2:	sub	x0, x0, x4		// prefix length = match ptr - src
+	mov	sp, x29
+	ldp	x29, x30, [sp], #16	// restore sp and lr
+	ret
+
+	/* set is empty, degrades to strlen */
+	.p2align 4
+.Lstrlen:
+	mov	sp, x29
+	ldp	x29, x30, [sp], #16	// restore sp and lr
+	b	strlen
+
+	/* just one character in set, degrades to strchrnul */
+	.p2align 4
+.Lstrchr:
+	stp	x0, x1, [sp, #-16]!	// stash src for the final subtraction
+	mov	x1, x4			// the only set character is the needle
+
+	bl	strchrnul
+
+	ldp	x18, x17, [sp], #16	// reload stashed src (x17 discarded)
+	sub	x0, x0, x18		// prefix length = match ptr - src
+
+	ldp	x29, x30, [sp], #16	// restore sp and lr
+	ret
+
+END(__strcspn)
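
The unrolled scan in the assembly corresponds roughly to the following
C analogue (hypothetical, not the committed code): four table probes
per iteration, with the final index reconstructed according to which
probe hit, mirroring the fixups at labels 2, 3, and 4 above.

#include <stddef.h>

/* Illustrative analogue of the unrolled scan; not the committed code. */
static size_t
scan_sketch(const unsigned char *s, const unsigned char table[256])
{
	const unsigned char *p = s;

	/*
	 * table[0] is set during construction, so the loop is
	 * guaranteed to terminate at the string's NUL terminator.
	 */
	for (;;) {
		if (table[p[0]] != 0)
			return ((size_t)(p - s));
		if (table[p[1]] != 0)
			return ((size_t)(p - s + 1));
		if (table[p[2]] != 0)
			return ((size_t)(p - s + 2));
		if (table[p[3]] != 0)
			return ((size_t)(p - s + 3));
		p += 4;
	}
}

The assembly avoids the four separate returns by adjusting x0 and the
stashed base pointer x4 at the numbered labels before a single final
subtraction.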