git: fb197a4f7751 - main - lib/libc/amd64/string: add memrchr() scalar, baseline implementation

From: Robert Clausecker <fuz@FreeBSD.org>
Date: Mon, 25 Dec 2023 14:26:06 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=fb197a4f7751bb4e116989e57ba7fb12a981895f

commit fb197a4f7751bb4e116989e57ba7fb12a981895f
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-12-06 10:05:47 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-12-25 14:00:05 +0000

    lib/libc/amd64/string: add memrchr() scalar, baseline implementation
    
    The scalar implementation is fairly simplistic and only performs
    slightly better than the generic C implementation. It could be
    improved by using the same algorithm as for memchr(), but that
    would have made the code a lot more complicated.
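    
    For reference, a minimal C sketch of the logic both versions have to
    implement (walk the buffer forward and remember the most recent match,
    which the scalar version does four bytes at a time with conditional
    moves) could look like the following; the code and the function name
    are illustrative only and not part of this commit:
    
	#include <stddef.h>
	#include <stdint.h>
	
	/* illustrative sketch only, not the committed routine */
	static void *
	memrchr_ref(const void *b, int c, size_t len)
	{
		const unsigned char *p = b, *last = NULL;
		size_t i;
	
		/* a later match always supersedes an earlier one */
		for (i = 0; i < len; i++)
			if (p[i] == (unsigned char)c)
				last = p + i;
	
		return ((void *)(uintptr_t)last);
	}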
    
    The baseline implementation is similar to timingsafe_memcmp.  It's
    slightly slower than memchr() due to the more complicated main
    loop, but I don't think that can be significantly improved.
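    
    Sketched with SSE2 intrinsics (the amd64 baseline level), the idea is
    roughly the following; the sketch uses unaligned loads and a byte-wise
    tail for brevity, while the committed code uses aligned loads with
    head/tail masking and branch-free cmov selection, and the helper name
    is made up. Since the last occurrence is wanted, the scan cannot stop
    at the first match the way memchr() can, so the whole buffer is always
    traversed:
    
	#include <emmintrin.h>	/* SSE2: pcmpeqb/pmovmskb as intrinsics */
	#include <stddef.h>
	#include <stdint.h>
	
	static void *
	memrchr_sse2_sketch(const void *b, int c, size_t len)
	{
		const unsigned char *p = b, *res = NULL;
		const __m128i needle = _mm_set1_epi8((char)c);
		size_t i;
	
		/* 16 bytes per step; a later match supersedes an earlier one */
		for (i = 0; i + 16 <= len; i += 16) {
			__m128i eq = _mm_cmpeq_epi8(
			    _mm_loadu_si128((const __m128i *)(p + i)), needle);
			unsigned mask = _mm_movemask_epi8(eq);
	
			if (mask != 0)	/* top set bit: last match in chunk */
				res = p + i + (31 - __builtin_clz(mask));
		}
	
		for (; i < len; i++)	/* leftover tail, byte by byte */
			if (p[i] == (unsigned char)c)
				res = p + i;
	
		return ((void *)(uintptr_t)res);
	}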
    
    Tested by:      developers@, exp-run
    Approved by:    mjg
    MFC after:      1 month
    MFC to:         stable/14
    PR:             275785
    Differential Revision:  https://reviews.freebsd.org/D42925
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/memrchr.S    | 166 +++++++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index a14e8a768f01..b1369841bc74 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -6,6 +6,7 @@ MDSRCS+= \
 	memccpy.S \
 	memcpy.S \
 	memmove.S \
+	memrchr.S \
 	memset.S \
 	stpcpy.S \
 	stpncpy.S \
diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S
new file mode 100644
index 000000000000..4f6c5a238daa
--- /dev/null
+++ b/lib/libc/amd64/string/memrchr.S
@@ -0,0 +1,166 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Robert Clausecker
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define	ALIGN_TEXT	.p2align 4, 0x90
+
+ARCHFUNCS(memrchr)
+	ARCHFUNC(memrchr, scalar)
+	ARCHFUNC(memrchr, baseline)
+ENDARCHFUNCS(memrchr)
+
+ARCHENTRY(memrchr, scalar)
+	xor	%eax, %eax		# prospective return value
+	sub	$4, %rdx		# 4 bytes left to process?
+	jb	1f
+
+	ALIGN_TEXT
+0:	xor	%r8, %r8
+	lea	2(%rdi), %r10
+	cmp	%sil, 2(%rdi)
+	cmovne	%r8, %r10		# point to null if no match
+
+	cmp	%sil, (%rdi)
+	cmove	%rdi, %r8		# point to first char if match
+
+	lea	1(%rdi), %r9
+	cmp	%sil, 1(%rdi)
+	cmovne	%r8, %r9		# point to first result if no match in second
+
+	lea	3(%rdi), %r11
+	cmp	%sil, 3(%rdi)
+	cmovne	%r10, %r11
+
+	test	%r11, %r11
+	cmovz	%r9, %r11		# take first pair match if none in second
+
+	test	%r11, %r11
+	cmovnz	%r11, %rax		# take match in current set if any
+
+	add	$4, %rdi
+	sub	$4, %rdx
+	jae	0b
+
+1:	cmp	$-3, %edx		# at least one character left to process?
+	jb	2f
+
+	cmp	%sil, (%rdi)
+	cmove	%rdi, %rax
+
+	lea	1(%rdi), %rcx
+	cmp	$-2, %edx		# at least two characters left to process?
+	jb	2f
+
+	cmp	%sil, 1(%rdi)
+	cmove	%rcx, %rax
+
+	lea	2(%rdi), %rcx
+	cmp	$-1, %edx		# at least three characters left to process?
+	jb	2f
+
+	cmp	%sil, 2(%rdi)
+	cmove	%rcx, %rax
+
+2:	ret
+ARCHEND(memrchr, scalar)
+
+ARCHENTRY(memrchr, baseline)
+	movd		%esi, %xmm4
+	test		%rdx, %rdx		# empty buffer?
+	jz		.L0			# if yes, return immediately
+
+	punpcklbw	%xmm4, %xmm4		# c -> cc
+	mov		%edi, %ecx
+	punpcklwd	%xmm4, %xmm4		# cc -> cccc
+	and		$~0xf, %rdi		# align source pointer
+	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
+	and		$0xf, %ecx
+	movdqa		%xmm4, %xmm0
+	mov		$-1, %r8d
+	pcmpeqb		(%rdi), %xmm0		# compare aligned head
+	shl		%cl, %r8d		# mask of bytes in the head of the buffer
+	pmovmskb	%xmm0, %eax
+
+	sub		$16, %rcx
+	and		%r8d, %eax		# match mask
+	add		%rcx, %rdx		# advance past head
+	cmc
+	jbe		.Lrunt			# did the string end in the buffer?
+
+	mov		%rdi, %rsi		# pointer to matching chunk
+	add		$16, %rdi
+	sub		$16, %rdx		# enough left for another round?
+	jbe		1f
+
+	/* main loop unrolled twice */
+	ALIGN_TEXT
+0:	movdqa		%xmm4, %xmm0
+	pcmpeqb		(%rdi), %xmm0
+	pmovmskb	%xmm0, %r8d
+
+	cmp		$16, %rdx		# enough left for second chunk?
+	jbe		2f
+
+	movdqa		%xmm4, %xmm0
+	pcmpeqb		16(%rdi), %xmm0
+	pmovmskb	%xmm0, %ecx
+
+	lea		16(%rdi), %r9
+	test		%ecx, %ecx		# match found in second chunk?
+	cmovz		%r8d, %ecx		# if not, use match data from first chunk
+	cmovz		%rdi, %r9
+
+	test		%ecx, %ecx		# any match found?
+	cmovnz		%ecx, %eax		# if yes, overwrite previously found match
+	cmovnz		%r9, %rsi
+
+	add		$32, %rdi		# advance to next iteration
+	sub		$32, %rdx		# advance to next chunks
+	ja		0b
+
+	/* process remaining 1--16 bytes */
+1:	pcmpeqb		(%rdi), %xmm4
+	mov		$0xffff, %r8d
+	xor		%ecx, %ecx
+	sub		%edx, %ecx		# number of bytes to be masked out
+	pmovmskb	%xmm4, %r9d
+	shr		%cl, %r8d		# mask of bytes to be kept in the buffer
+	and		%r9d, %r8d
+	cmovnz		%r8d, %eax
+	cmovnz		%rdi, %rsi
+	bsr		%eax, %eax
+	lea		(%rsi, %rax, 1), %rsi	# pointer to match (or junk)
+	cmovnz		%rsi, %rax		# if any match was found, return it
+	ret
+
+	/* end of chunk reached within first half iteration */
+2:	test		%r8d, %r8d		# match in previous chunk?
+	cmovnz		%r8d, %eax		# if yes, overwrite previously found match
+	cmovnz		%rdi, %rsi
+	add		$16, %rdi		# point to tail
+	sub		$16, %edx
+	jmp		1b			# handle tail the same otherwise
+
+	/* runt: string ends within head, edx has negated amount of invalid head bytes */
+.Lrunt:	mov		$0xffff, %r8d
+	xor		%ecx, %ecx
+	sub		%edx, %ecx
+	shr		%cl, %r8d
+	and		%r8d, %eax
+	bsr		%eax, %eax
+	lea		(%rdi, %rax, 1), %rdi
+	cmovnz		%rdi, %rax
+	ret
+
+	/* empty buffer: return a null pointer */
+.L0:	xor		%eax, %eax
+	ret
+ARCHEND(memrchr, baseline)
+
+	.section	.note.GNU-stack, "", %progbits