git: bad17991c06d - main - lib/libc/aarch64/string: add memccpy SIMD implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 10 Jan 2025 15:03:56 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=bad17991c06d684e9053938d00a07b962e2fd31c

commit bad17991c06d684e9053938d00a07b962e2fd31c
Author:     Getz Mikalsen <getz@FreeBSD.org>
AuthorDate: 2024-08-26 18:15:13 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-01-10 15:02:40 +0000

    lib/libc/aarch64/string: add memccpy SIMD implementation
    
    This changeset ports the amd64 SIMD implementation of memccpy
    to aarch64.
    
    Performance is significantly better than the scalar implementation
    except for short strings.
    
    Benchmark results were, as usual, generated with the strperf
    utility written by fuz.
    
    See the Differential Revision for benchmark results.
    
    Tested by:      fuz (exprun)
    Reviewed by:    fuz, emaste
    Sponsored by:   Google LLC (GSoC 2024)
    PR:             281175
    Differential Revision: https://reviews.freebsd.org/D46170
---
 lib/libc/aarch64/string/Makefile.inc |   3 +-
 lib/libc/aarch64/string/memccpy.S    | 271 +++++++++++++++++++++++++++++++++++
 2 files changed, 273 insertions(+), 1 deletion(-)
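
For readers unfamiliar with the interface: memccpy(dst, src, c, len)
copies at most len bytes from src to dst, stopping after the first
occurrence of the byte c; it returns a pointer to the byte following
the copy of c in dst, or NULL if c was not found within len bytes.
A minimal scalar sketch of these semantics (for orientation only, not
the committed code):

	#include <stddef.h>

	void *
	memccpy_ref(void *restrict dst, const void *restrict src, int c,
	    size_t len)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;

		while (len-- > 0) {
			/* copy one byte; stop once the terminator was copied */
			if ((*d++ = *s++) == (unsigned char)c)
				return (d);	/* points past the copied c */
		}
		return (NULL);			/* c not found within len bytes */
	}

The SIMD routine below implements the same contract, processing the
source in 16-byte chunks.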

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index 351f3424b6d0..78145a17ab85 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -27,7 +27,8 @@ MDSRCS+= \
 	strsep.c \
 	strcat.c \
 	strlcpy.S \
-	strncmp.S
+	strncmp.S \
+	memccpy.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
diff --git a/lib/libc/aarch64/string/memccpy.S b/lib/libc/aarch64/string/memccpy.S
new file mode 100644
index 000000000000..7d9fdb14b84b
--- /dev/null
+++ b/lib/libc/aarch64/string/memccpy.S
@@ -0,0 +1,271 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+
+	.weak	memccpy
+	.set	memccpy, __memccpy
+	.text
+
+ENTRY(__memccpy)
+	subs	x3, x3, #1
+	b.lo	.L0
+
+	dup	v0.16b,	w2
+
+	mov	x9, x0			// stash copy of src pointer
+	bic	x10, x1, #0xf		// src aligned
+	and	x11, x1, #0xf		// src offset
+
+	ldr	q1, [x10]
+	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against src char
+
+	mov	x8, #-1			// prepare a 0xfff..fff register
+	mov	x6, #0xf
+
+	lsl	x12, x11, #2
+	lsl	x8, x8, x12		// mask of bytes in the string
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	sub	x12, x11, #32
+	adds	x12, x12, x3		// distance from alignment boundary - 32
+	b.cc	.Lrunt			// branch if buffer length is 32 or less
+
+	ands	x8, x8, x5
+	b.eq	0f
+
+	/* match in first chunk */
+	rbit	x8, x8
+	clz	x8, x8			// index of match
+	lsr	x8, x8, #2
+
+	sub	x8, x8, x11		// ... from beginning of the string
+
+	add	x0, x0, x8
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+	add	x0, x0, #1
+
+	b	.L0816
+
+0:
+	ldr	q3,	[x10, #16]	// load second string chunk
+	ldr	q2,	[x1]		// load true head
+	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?
+
+	/* process second chunk */
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	cbz	x5, 0f
+
+	/* match in second chunk */
+	rbit	x8, x5
+	clz	x8, x8			// index of match
+	lsr	x8, x8, #2
+
+	sub	x11, x11, #16
+	sub	x8, x8, x11		// adjust for alignment offset
+	add	x0, x0, x8		// return value
+	add	x0, x0, #1
+
+	add	x4, x9, x8
+	add	x5, x1, x8
+	b	.L1732
+
+0:
+	/* no terminator in second chunk and buffer not exhausted yet */
+	ldr	q1,	[x10, #32]	// load next string chunk
+	str	q2,	[x0]		// deposit head into buffer
+	sub	x0, x0, x11		// adjust x0
+	mov	x3, x12
+	str	q3,	[x0, #16]	// deposit second chunk
+
+	add	x10, x10, #32		// advance src
+	add	x0, x0, #32		// advance dst
+	subs	x3, x3, #16		// enough left for another round?
+	b.lo	1f
+
+	/* main loop unrolled twice */
+	.p2align 4
+0:
+	cmeq	v2.16b, v1.16b, v0.16b	// char found in this chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+
+	cbnz	x5, 3f
+
+	str	q1, [x0]
+	ldr	q1, [x10, #16]		// load next chunk
+
+	cmp	x3, #16			// more than a full chunk left?
+	b.lo	2f
+
+	add	x10, x10, #32		// advance pointers
+	add	x0, x0, #32
+
+	cmeq	v2.16b, v1.16b, v0.16b	// char found in next chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+	cbnz	x5, 4f			// process chunk if match
+
+	str	q1, [x0, #-16]
+	ldr	q1, [x10]		// load next chunk
+
+	subs	x3, x3, #32
+	b.hs	0b
+
+1:
+	sub	x10, x10, #16		// undo second advancement
+	add	x3, x3, #16
+	sub	x0, x0, #16
+
+	/* 1--16 bytes left in the buffer but terminator not found yet */
+2:
+	cmeq	v2.16b, v1.16b, v0.16b	// char found in final chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x4, d2
+
+	lsl	x5, x3, #2		// shift 0xf to the limits position
+	lsl	x5, x6, x5
+	orr	x8, x4, x5		// insert match in mask at limit
+
+	rbit	x8, x8			// simulate x86 tzcnt
+	clz	x7, x8			// index of match or limit
+	lsr	x8, x7, #2
+
+	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf
+
+	add	x8, x8, #1
+	add	x0, x0, x8
+
+	ldr	q1, [x10, x8]		// load tail
+	str	q1, [x0]		// store tail
+
+	add	x0, x0, #16
+
+	tst	x4, x5			// terminator encountered inside buffer?
+	csel	x0, x0, xzr, ne		// if yes, return pointer, else NULL
+	ret
+
+4:
+	sub	x10, x10, #16		// undo second advancement
+	sub	x0, x0, #16		// undo second advancement
+
+3:
+	rbit	x8, x5
+	clz	x8, x8			// index of match
+	lsr	x3, x8, #2
+
+	add	x0, x0, x3		// restore dst pointer
+	add	x10, x10, x3
+	ldr	q1, [x10, #-15]
+	str	q1, [x0, #-15]
+	add	x0, x0, #1
+	ret
+
+.Lrunt:
+	add	x13, x11, x3
+
+	mov	x7, x5			// keep a copy of original match mask
+
+	lsl	x4, x12, #2		// shift 0xf to the limits position
+	lsl	x4, x6, x4
+
+	cmp	x13, #16		// don't induce match if limit >= 16
+	csel	x4, x4, xzr, lo
+	orr	x5, x5, x4		// insert match in mask at limit
+
+	ands	x8, x8, x5		// if match always fall through
+	b.ne	0f
+
+	ldr	q4,	[x10, #16]	// load second string chunk
+	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?
+
+	/* process second chunk */
+	shrn	v1.8b, v1.8h, #4
+	fmov	x8, d1
+	mov	x7, x8
+
+	lsl	x4, x12, #2
+	lsl	x4, x6, x4
+	orr	x8, x8, x4		// induce match in upper bytes of mask
+
+	rbit	x8, x8
+	clz	x4, x8			// index of match or limit
+	lsr	x8, x4, #2
+	add	x8, x8, #16		// no match in first chunk
+	b	1f
+
+0:
+	rbit	x8, x8
+	clz	x4, x8			// index of match or limit
+	lsr	x8, x4, #2
+1:
+	add	x0, x0, x8		// return value if terminator was found
+	sub	x0, x0, x11
+	add	x0, x0, #1
+
+	/* check if we encountered a match or the limit first */
+	lsl	x5, x6, x4
+	ands	x7, x7, x5		// was the terminator present?
+	csel	x0, xzr, x0, eq		// return value based on what we matched
+
+	sub	x8, x8, x11
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+
+	/* copy 17-32 bytes */
+.L1732:
+	cmp	x8, #16
+	b.lo	.L0816
+	add	x5, x5, #1		// ldp offsets are powers of 2
+	add	x4, x4, #1
+	ldp	x16, x17, [x1]
+	ldp	x12, x13, [x5, #-16]
+	stp	x16, x17, [x9]
+	stp	x12, x13, [x4, #-16]
+	ret
+
+	/* Copy 8-16 bytes */
+.L0816:
+	tbz	x8, #3, .L0407
+	ldr	x16, [x1]
+	ldr	x17, [x5, #-7]
+	str	x16, [x9]
+	str	x17, [x4, #-7]
+	ret
+
+	/* Copy 4-7 bytes */
+	.p2align 4
+.L0407:
+	cmp	x8, #3
+	b.lo	.L0103
+	ldr	w16, [x1]
+	ldr	w18, [x5, #-3]
+	str	w16, [x9]
+	str	w18, [x4, #-3]
+	ret
+
+	/* Copy 1-3 bytes */
+	.p2align 4
+.L0103:
+	lsr	x14, x8, #1
+	ldrb	w16, [x1]
+	ldrb	w15, [x5]
+	ldrb	w18, [x1, x14]
+	strb	w16, [x9]
+	strb	w18, [x9, x14]
+	strb	w15, [x4]
+	ret
+
+.L0:
+	eor	x0, x0, x0
+	ret
+
+END(__memccpy)
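
The central trick used throughout the routine is the usual aarch64
substitute for x86's pmovmskb: the byte-wise cmeq result is narrowed
with shrn #4 into a 64-bit mask holding one nibble per source byte, so
rbit+clz (i.e. a trailing-zero count) divided by four yields the index
of the first byte equal to c.  A rough C intrinsics equivalent of that
sequence (illustrative sketch only; the helper name is made up):

	#include <arm_neon.h>
	#include <stdint.h>

	/*
	 * Index of the first byte equal to c in a 16-byte chunk, or 16
	 * if none matches; mirrors the cmeq/shrn/fmov/rbit/clz sequence.
	 */
	static inline int
	first_match_index(uint8x16_t chunk, uint8_t c)
	{
		uint8x16_t eq = vceqq_u8(chunk, vdupq_n_u8(c));	/* cmeq */
		uint8x8_t nib =
		    vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);	/* shrn #4 */
		uint64_t mask =
		    vget_lane_u64(vreinterpret_u64_u8(nib), 0);	/* fmov */

		if (mask == 0)
			return (16);
		/* rbit+clz in the assembly == count trailing zeroes here */
		return (__builtin_ctzll(mask) / 4);
	}

The buffer limit is handled by or-ing an artificial nibble into the
mask at the limit position (the lsl of the 0xf register in the code),
so a single rbit+clz finds whichever comes first, the terminator or
the end of the buffer.  The tails (.L1732, .L0816, .L0407) then use
the standard pair of possibly overlapping loads and stores, one
anchored at the start and one at the end of the remaining range,
e.g. for 8 to 16 bytes (again only a sketch of the idea, not the
exact register usage):

	#include <stdint.h>
	#include <string.h>

	/* copy n bytes, 8 <= n <= 16, with two overlapping 8-byte moves */
	static inline void
	copy_8_16(unsigned char *dst, const unsigned char *src, size_t n)
	{
		uint64_t head, tail;

		memcpy(&head, src, 8);
		memcpy(&tail, src + n - 8, 8);
		memcpy(dst, &head, 8);
		memcpy(dst + n - 8, &tail, 8);
	}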