git: 3863fec1ce2d - main - lib/libc/aarch64/string: add strlen SIMD implementation
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Fri, 10 Jan 2025 15:04:00 UTC
The branch main has been updated by fuz: URL: https://cgit.FreeBSD.org/src/commit/?id=3863fec1ce2dc6033f094a085118605ea89db9e2 commit 3863fec1ce2dc6033f094a085118605ea89db9e2 Author: Getz Mikalsen <getz@FreeBSD.org> AuthorDate: 2024-08-26 19:54:32 +0000 Commit: Robert Clausecker <fuz@FreeBSD.org> CommitDate: 2025-01-10 15:02:40 +0000 lib/libc/aarch64/string: add strlen SIMD implementation Adds a SIMD enhanced strlen for Aarch64. It takes inspiration from the amd64 implementation but I struggled getting the performance I had hoped for on cores like the Graviton3 when compared to the existing implementation from Arm Optimized Routines. See the DR for benchmark results. Tested by: fuz (exprun) Reviewed by: fuz, emaste Sponsored by: Google LLC (GSoC 2024) PR: 281175 Differential Revision: https://reviews.freebsd.org/D45623 --- lib/libc/aarch64/string/Makefile.inc | 4 ++-- lib/libc/aarch64/string/strlen.S | 46 ++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc index f8c67319fe12..7325b54d9716 100644 --- a/lib/libc/aarch64/string/Makefile.inc +++ b/lib/libc/aarch64/string/Makefile.inc @@ -14,7 +14,6 @@ AARCH64_STRING_FUNCS= \ strchr \ strchrnul \ strcpy \ - strlen \ strnlen \ strrchr @@ -30,7 +29,8 @@ MDSRCS+= \ strncmp.S \ memccpy.S \ strncat.c \ - strlcat.c + strlcat.c \ + strlen.S # # Add the above functions. 
Generate an asm file that includes the needed diff --git a/lib/libc/aarch64/string/strlen.S b/lib/libc/aarch64/string/strlen.S new file mode 100644 index 000000000000..7bfac7f4b1e1 --- /dev/null +++ b/lib/libc/aarch64/string/strlen.S @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org> +*/ + +#include <machine/asm.h> + + .weak strlen + .set strlen, __strlen + .text + +ENTRY(__strlen) + bic x10, x0, #0xf // aligned src + and x9, x0, #0xf + ldr q0, [x10] + cmeq v0.16b, v0.16b, #0 + shrn v0.8b, v0.8h, #4 + fmov x1, d0 + cbz x9, .Laligned + lsl x2, x0, #2 // get the byte offset + lsr x1, x1, x2 // shift by offset index + cbz x1, .Lloop + rbit x1, x1 + clz x0, x1 + lsr x0, x0, #2 + ret + +.Laligned: + cbnz x1, .Ldone + +.Lloop: + ldr q0, [x10, #16]! + cmeq v0.16b, v0.16b, #0 + shrn v0.8b, v0.8h, #4 // reduce to fit mask in GPR + fcmp d0, #0.0 + b.eq .Lloop + fmov x1, d0 +.Ldone: + sub x0, x10, x0 + rbit x1, x1 // reverse bits as NEON has no ctz + clz x3, x1 + lsr x3, x3, #2 + add x0, x0, x3 + ret +END(__strlen)