git: 756b7fc80837 - main - lib/libc/aarch64/string: add strlcpy SIMD implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 10 Jan 2025 15:03:53 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=756b7fc80837567d114a3c93e9bb987e219a1b23

commit 756b7fc80837567d114a3c93e9bb987e219a1b23
Author:     Getz Mikalsen <getz@FreeBSD.org>
AuthorDate: 2024-08-26 18:14:31 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-01-10 15:02:40 +0000

    lib/libc/aarch64/string: add strlcpy SIMD implementation
    
    This changeset ports the SIMD implementation of strlcpy for amd64
    to aarch64.
    
    It is based on memccpy (D46170) with some minor differences.
    
    Performance is significantly better than that of the scalar
    implementation.
    
    Benchmark results were, as usual, generated with the strperf utility
    written by fuz; see the differential revision for the numbers.
    
    Tested by:      fuz (exprun)
    Reviewed by:    fuz, emaste
    Sponsored by:   Google LLC (GSoC 2024)
    PR:             281175
    Differential Revision: https://reviews.freebsd.org/D46243
---
 lib/libc/aarch64/string/Makefile.inc |   3 +-
 lib/libc/aarch64/string/strlcpy.S    | 316 +++++++++++++++++++++++++++++++++++
 2 files changed, 318 insertions(+), 1 deletion(-)
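
For reference, the contract the new routine must honour is that of
strlcpy(3): copy at most dstsize - 1 bytes, always NUL-terminate the
destination when dstsize is non-zero, and return the length of the
source string.  A minimal scalar sketch of those semantics follows; it
is an illustration only, not code from this commit, and the helper name
is made up.

#include <string.h>

/* ref_strlcpy: hypothetical scalar reference, not part of the commit */
size_t
ref_strlcpy(char *dst, const char *src, size_t dstsize)
{
	size_t srclen = strlen(src);

	if (dstsize > 0) {
		size_t n = srclen < dstsize - 1 ? srclen : dstsize - 1;

		memcpy(dst, src, n);
		dst[n] = '\0';
	}

	return (srclen);	/* length of src, regardless of truncation */
}

The assembly handles the dstsize == 0 case up front (the initial
subs/b.lo to .L0), tail-calling strlen so that nothing is written and
the source length is still returned, matching the sketch above.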

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index 0b2974947389..34a84bcfe133 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -26,7 +26,8 @@ MDSRCS+= \
 	strcspn.S \
 	strpbrk.c \
 	strsep.c \
-	strcat.c
+	strcat.c \
+	strlcpy.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
diff --git a/lib/libc/aarch64/string/strlcpy.S b/lib/libc/aarch64/string/strlcpy.S
new file mode 100644
index 000000000000..3859aaca447b
--- /dev/null
+++ b/lib/libc/aarch64/string/strlcpy.S
@@ -0,0 +1,316 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+
+	.weak strlcpy
+	.set strlcpy, __strlcpy
+	.text
+
+ENTRY(__strlcpy)
+	subs	x2, x2, #1
+	b.lo	.L0
+
+	mov	x9, x0			// stash copy of dst pointer
+	bic	x10, x1, #0xf		// src aligned
+	and	x11, x1, #0xf		// src offset
+
+	ldr	q1, [x10]
+	cmeq	v1.16b, v1.16b, #0	// NUL found in head?
+
+	mov	x8, #-1			// fill register with 0xfff..fff
+	lsl	x12, x11, #2
+	lsl	x8, x8, x12		// mask of bytes in the string
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	ands	x5, x5, x8
+	b.ne	.Lhead_nul
+
+	ldr	q3, [x10, #16]		// load second string chunk
+	ldr	q2, [x1]		// load true head
+	mov	x8, #32
+	sub	x8, x8, x11
+
+	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?
+
+	subs	x2, x2, x8
+	b.ls	.Lhead_buf_end
+
+	/* process second chunk */
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+	cbnz	x5, .Lsecond_nul
+
+	/* string didn't end in second chunk and neither did buffer */
+	ldr	q1,	[x10, #32]	// load next string chunk
+	str	q2,	[x0]		// deposit head into buffer
+	sub	x0, x0, x11		// adjust x0
+	str	q3,	[x0, #16]	// deposit second chunk
+	add	x10, x10, #32		// advance src
+	add	x0, x0, #32		// advance dst
+	subs	x2, x2, #16		// enough left for another round?
+	b.ls	1f
+
+	/* main loop unrolled twice */
+	.p2align 4
+0:
+	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+
+	cbnz	x5, 3f
+
+	str	q1, [x0]
+	ldr	q1, [x10, #16]		// load next chunk
+
+	cmp	x2, #16			// more than a full chunk left?
+	b.ls	2f
+
+	add	x10, x10, #32		// advance pointers
+	add	x0, x0, #32
+
+	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+	cbnz	x5, 4f			// process chunk if match
+
+	str	q1, [x0, #-16]
+	ldr	q1, [x10]		// load next chunk
+
+	subs	x2, x2, #32
+	b.hi	0b
+
+1:
+	sub	x10, x10, #16		// undo second advancement
+	add	x2, x2, #16
+	sub	x0, x0, #16
+
+	/* 1--16 bytes left in the buffer but string has not ended yet */
+2:
+	cmeq	v2.16b, v1.16b, #0	// NUL found in this chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x4, d2
+
+	mov	x6, #0xf
+	mov	x7, x4
+
+	lsl	x5, x2, #2		// shift 0xf to the limits position
+	lsl	x5, x6, x5
+	cmp	x2, #16			// don't induce match if limit >= 16
+	csel	x5, x5, xzr, lo
+	orr	x8, x4, x5		// treat limit as if terminator present
+
+	rbit	x8, x8			// simulate x86 tzcnt
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	add	x0, x0, x8
+
+	ldr	q1, [x10, x8]		// load tail
+	str	q1, [x0]		// store tail
+	strb	wzr, [x0, #16]
+
+	/* continue to find the end of the string */
+	cbnz	x7, 1f
+
+	/* we opt for a simpler strlen than the one in libc as the
+	 * cmeq, shrn approach is faster for shorter strings.
+	 */
+	.p2align 4
+0:
+	ldr	q1, [x10, #32]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	cbnz	x7, 2f
+
+	ldr	q1, [x10, #48]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	add	x10, x10, #32
+	cbz	x7, 0b
+
+1:	sub	x10, x10, #16
+2:	rbit	x8, x7
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	sub	x10, x10, x1
+	add	x0, x10, #32
+	add	x0, x0, x8
+
+	ret
+
+4:
+	sub	x10, x10, #16		// undo second advancement
+	sub	x0, x0, #16		// undo second advancement
+
+	/* string has ended but buffer has not */
+3:
+	rbit	x8, x5
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	add	x0, x0, x8		// restore dst pointer
+	add	x10, x10, x8
+
+	ldr	q1, [x10, #-15]
+	str	q1, [x0, #-15]
+	add	x0, x0, #1
+	sub	x0, x10, x1
+
+	ret
+
+.Lhead_buf_end:
+	shrn	v1.8b, v1.8h, #4
+	fmov	x8, d1
+
+	add	x2, x2, #32		// restore limit
+
+	mov	x7, x8
+	mov	x6, #0xf
+
+	cmp	x2, #16			// should we induce a match or not
+	b.lo	0f
+
+	rbit	x8, x8
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+	add	x8, x8, #16
+
+	cmp	x8, x2
+	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
+	b	1f
+0:
+
+	rbit	x8, x8
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	mov	x8, x2
+1:
+
+	sub	x8, x8, x11
+	strb	wzr, [x9, x8]
+
+	/* continue to find the end of the string */
+	cbnz	x7, 1f
+
+	/* we opt for a simpler strlen than the one in libc as the
+	 * cmeq, shrn approach is faster for shorter strings.
+	 */
+	.p2align 4
+0:
+	ldr	q1, [x10, #32]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	cbnz	x7, 2f
+
+	ldr	q1, [x10, #48]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	add	x10, x10, #32
+	cbz	x7, 0b
+
+1:	sub	x10, x10, #16
+2:	rbit	x6, x7
+	clz	x6, x6			// index of mismatch
+	lsr	x6, x6, #2
+
+	sub	x10, x10, x1
+	add	x0, x10, #32
+	add	x0, x0, x6
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+
+	b	.L1732
+
+.Lsecond_nul:
+	add	x2, x2, x8
+
+	rbit	x8, x5
+	clz	x8, x8			// index of mismatch
+	lsr	x5, x8, #2
+
+	sub	x8, x11, #16
+	sub	x0, x5, x8		// string length
+
+	cmp	x0, x2			// did we match or hit limit first?
+	csel	x8, x2, x0, hi
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+
+	strb	wzr, [x4]
+
+	/* copy 17-32 bytes */
+.L1732:
+	cmp	x8, #16
+	b.lo	.L0816
+	ldp	x16, x17, [x1]
+	ldp	x12, x1, [x5, #-16]
+	stp	x16, x17, [x9]
+	stp	x12, x1, [x4, #-16]
+	ret
+
+.Lhead_nul:
+	rbit	x8, x5
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	sub	x0, x8, x11
+	cmp	x0, x2
+	csel	x8, x2, x0, hi
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+	strb	wzr, [x4]
+
+	/* Copy 8-16 bytes */
+.L0816:
+	tbz	x8, #3, .L0407
+	ldr	x16, [x1]
+	ldr	x17, [x5, #-8]
+	str	x16, [x9]
+	str	x17, [x4, #-8]
+	ret
+
+	/* Copy 4-7 bytes */
+	.p2align 4
+.L0407:
+	cmp	x8, #3
+	b.ls	.L0203
+	ldr	w16, [x1]
+	ldr	w18, [x5, #-4]
+	str	w16, [x9]
+	str	w18, [x4, #-4]
+	ret
+
+.L0203:
+	tbz	x8, 1, .L0001
+	ldrh	w16, [x1]
+	ldrh	w17, [x5, #-2]
+	strh	w16, [x9]
+	strh	w17, [x4, #-2]
+	ret
+
+.L0001:
+	ldrb	w16, [x1]
+	strb	w16, [x9]
+	strb	wzr, [x4]
+	ret
+
+.L0:
+	mov	x0, x1
+	b	strlen
+	ret
+END(__strlcpy)
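
The NUL-detection idiom used throughout the routine (cmeq against zero,
shrn #4 to compress the 16-byte compare result into a 64-bit mask with
one nibble of match bits per input byte, then rbit/clz as a
trailing-zero count and a shift right by two for the byte index) can be
sketched with NEON intrinsics roughly as follows.  This is an editorial
illustration of the technique, not code from the commit, and the helper
name is invented.

#include <arm_neon.h>
#include <stdint.h>

/*
 * first_nul_index: hypothetical helper showing the cmeq/shrn trick.
 * Returns the index of the first NUL byte in a 16-byte chunk, or 16 if
 * the chunk contains none.
 */
static inline unsigned
first_nul_index(const uint8_t chunk[16])
{
	uint8x16_t bytes = vld1q_u8(chunk);
	uint8x16_t eq = vceqq_u8(bytes, vdupq_n_u8(0));	/* cmeq ..., #0 */
	/* shrn #4: one nibble of match bits per input byte */
	uint8x8_t nib = vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);
	uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(nib), 0);

	if (mask == 0)
		return (16);
	/* rbit + clz in the assembly amounts to a trailing-zero count */
	return ((unsigned)(__builtin_ctzll(mask) >> 2));
}

The same mask layout is what lets the routine treat the buffer limit as
if a terminator were present: shifting a 0xf nibble left by four times
the remaining count and ORing it into the mask (the lsl/orr sequence
before the tail copy) makes a real NUL and the end of the buffer fall
out of the same rbit/clz computation.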