git: 7084133cde6a - main - lib/libc/amd64/string: add strspn(3) scalar, x86-64-v2 implementation

From: Robert Clausecker <fuz@FreeBSD.org>
Date: Fri, 08 Sep 2023 21:23:31 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=7084133cde6a58412d86bae9f8a55b86141fb304

commit 7084133cde6a58412d86bae9f8a55b86141fb304
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-08-21 16:06:42 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-09-08 21:21:59 +0000

    lib/libc/amd64/string: add strspn(3) scalar, x86-64-v2 implementation
    
    This is conceptually very similar to the strcspn(3) implementations
    from D41557, but we can't do the fast paths the same way.
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    mjg
    MFC after:      1 week
    MFC to:         stable/14
    Differential Revision:  https://reviews.freebsd.org/D41567
---
 lib/libc/amd64/string/Makefile.inc |   3 +-
 lib/libc/amd64/string/strspn.S     | 358 +++++++++++++++++++++++++++++++++++++
 2 files changed, 360 insertions(+), 1 deletion(-)
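
For reference, strspn(3) returns the length of the longest prefix of s
that consists entirely of bytes from set.  A naive portable C version,
useful as a semantic baseline for the assembly below (hypothetical
function name, not code from this commit):

	#include <stddef.h>

	size_t
	strspn_naive(const char *s, const char *set)
	{
		size_t i, j;

		for (i = 0; s[i] != '\0'; i++)
			for (j = 0; set[j] != s[i]; j++)
				if (set[j] == '\0')
					return (i);	/* s[i] not in set */

		return (i);
	}

For example, strspn_naive("alphabet", "aehlp") is 5, the length of the
prefix "alpha".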

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index f01a52d9ebea..b03811f40062 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -12,4 +12,5 @@ MDSRCS+= \
 	strcmp.S \
 	strcspn.S \
 	strlen.S \
-	strcpy.c
+	strcpy.c \
+	strspn.S
diff --git a/lib/libc/amd64/string/strspn.S b/lib/libc/amd64/string/strspn.S
new file mode 100644
index 000000000000..565330f0c385
--- /dev/null
+++ b/lib/libc/amd64/string/strspn.S
@@ -0,0 +1,358 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ARCHFUNCS(strspn)
+	ARCHFUNC(strspn, scalar)
+	NOARCHFUNC
+	ARCHFUNC(strspn, x86_64_v2)
+ENDARCHFUNCS(strspn)
+
+ARCHENTRY(strspn, scalar)
+	push	%rbp			# align stack to enable function call
+	mov	%rsp, %rbp
+	sub	$256, %rsp		# allocate space for lookup table
+
+	/* check for special cases */
+	movzbl	(%rsi), %edx		# first character in the set
+	test	%edx, %edx
+	jz	.Lzero			# empty set always returns 0
+
+	movzbl	1(%rsi), %eax		# second character in the set
+	test	%eax, %eax
+	jz	.Lsingle
+
+	/* no special case matches -- prepare lookup table */
+	xor	%r8d, %r8d
+	mov	$28, %ecx
+0:	mov	%r8, (%rsp, %rcx, 8)
+	mov	%r8, 8(%rsp, %rcx, 8)
+	mov	%r8, 16(%rsp, %rcx, 8)
+	mov	%r8, 24(%rsp, %rcx, 8)
+	sub	$4, %ecx
+	jnc	0b
+
+	movb	$1, (%rsp, %rdx, 1)	# register first char in set
+	add	$2, %rsi
+
+	/* process remaining chars in set */
+	ALIGN_TEXT
+0:	movb	$1, (%rsp, %rax, 1)	# register previous char
+	movzbl	(%rsi), %eax		# next char in set
+	test	%eax, %eax		# end of string?
+	jz	1f
+
+	movb	$1, (%rsp, %rax, 1)
+	add	$2, %rsi
+	movzbl	-1(%rsi), %eax
+	test	%eax, %eax
+	jnz	0b
+
+1:	mov	%rdi, %rax		# a copy of the source to iterate over
+
+	/* find mismatch */
+	ALIGN_TEXT
+0:	movzbl	(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	je	2f
+
+	movzbl	1(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	je	3f
+
+	movzbl	2(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	je	4f
+
+	movzbl	3(%rax), %ecx
+	add	$4, %rax
+	cmpb	$0, (%rsp, %rcx, 1)
+	jne	0b
+
+	sub	$3, %rax
+4:	dec	%rdi
+3:	inc	%rax
+2:	sub	%rdi, %rax		# number of characters preceding match
+	leave
+	ret
+
+	/* empty set never matches */
+.Lzero:	xor	%eax, %eax
+	leave
+	ret
+
+	/* find repeated single character */
+	ALIGN_TEXT
+.Lsingle:
+	cmpb	%dl, (%rdi, %rax, 1)
+	jne	1f
+
+	cmpb	%dl, 1(%rdi, %rax, 1)
+	jne	2f
+
+	cmpb	%dl, 2(%rdi, %rax, 1)
+	jne	3f
+
+	cmpb	%dl, 3(%rdi, %rax, 1)
+	lea	4(%rax), %rax
+	je	.Lsingle
+
+	sub	$3, %rax
+3:	inc	%rax
+2:	inc	%rax
+1:	leave
+	ret
+ARCHEND(strspn, scalar)
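
In C terms, the scalar kernel above amounts to the following sketch
(hypothetical function name; the assembly keeps the table on the stack,
unrolls both scan loops four ways, and peels off the empty and
singleton set cases before touching the table):

	#include <stddef.h>

	size_t
	strspn_scalar_sketch(const char *s, const char *set)
	{
		unsigned char table[256] = { 0 };
		size_t i;

		if (set[0] == '\0')		/* empty set matches nothing */
			return (0);

		if (set[1] == '\0') {		/* singleton: skip repeated char */
			for (i = 0; s[i] == set[0]; i++)
				;
			return (i);
		}

		while (*set != '\0')		/* mark each member of the set */
			table[(unsigned char)*set++] = 1;

		/* table['\0'] stays 0, so the scan stops at the terminator */
		for (i = 0; table[(unsigned char)s[i]] != 0; i++)
			;
		return (i);
	}

The x86-64-v2 kernel below falls back to the same table scheme once the
set exceeds 32 bytes.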
+
+	/*
+	 * This kernel uses pcmpistri to do the heavy lifting.
+	 * We provide three code paths, depending on set size:
+	 *
+	 *  0--16: one pcmpistri per 16 bytes of input
+	 * 17--32: two pcmpistrm per 16 bytes of input
+	 *   >=33: fall back to lookup table
+	 */
+ARCHENTRY(strspn, x86_64_v2)
+	push		%rbp
+	mov		%rsp, %rbp
+	sub		$256, %rsp
+
+	/* find set size and copy up to 32 bytes to (%rsp) */
+	mov		%esi, %ecx
+	and		$~0xf, %rsi		# align set pointer
+	movdqa		(%rsi), %xmm0
+	pxor		%xmm1, %xmm1
+	and		$0xf, %ecx		# number of bytes rsi is past alignment
+	xor		%edx, %edx
+	pcmpeqb		%xmm0, %xmm1		# end of string reached?
+	movdqa		%xmm0, 32(%rsp)		# transfer head of set to stack
+	pmovmskb	%xmm1, %eax
+	shr		%cl, %eax		# clear out junk before the set
+	test		%eax, %eax		# end of set reached?
+	jnz		0f
+
+	movdqa		16(%rsi), %xmm0		# second chunk of the set
+	mov		$16, %edx
+	sub		%ecx, %edx		# length of set preceding xmm0
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm0, %xmm1
+	movdqa		%xmm0, 48(%rsp)
+	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax
+	jnz		1f
+
+	movdqa		32(%rsi), %xmm0		# third chunk
+	add		$16, %edx
+	pxor		%xmm1, %xmm1
+	pcmpeqb		%xmm0, %xmm1
+	movdqa		%xmm0, 64(%rsp)
+	pmovmskb	%xmm1, %eax
+	test		%eax, %eax		# still not done?
+	jz		.Lgt32v2
+
+0:	movdqu		32(%rsp, %rcx, 1), %xmm2 # head of set
+1:	tzcnt		%eax, %eax
+	add		%eax, %edx		# length of set (excluding NUL byte)
+	cmp		$32, %edx		# above 32 bytes?
+	ja		.Lgt32v2
+
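
The set-sizing code above exploits the fact that an aligned 16-byte
load never crosses a page boundary and hence cannot fault, even when
the set ends mid-chunk: the pointer is rounded down to 16 bytes and the
junk bits preceding the set are shifted out of the NUL mask.  One such
probe, sketched with SSE2 intrinsics (hypothetical helper):

	#include <emmintrin.h>		/* SSE2 */
	#include <stdint.h>

	/* offset of the NUL within the first chunk of set, or -1 */
	static int
	chunk_nul_offset(const char *set)
	{
		const char *base = (const char *)((uintptr_t)set & ~(uintptr_t)15);
		unsigned junk = (uintptr_t)set & 15;
		/* aligned load: stays within one page, cannot fault */
		__m128i chunk = _mm_load_si128((const __m128i *)base);
		unsigned nul = (unsigned)_mm_movemask_epi8(
		    _mm_cmpeq_epi8(chunk, _mm_setzero_si128())) >> junk;

		return (nul != 0 ? __builtin_ctz(nul) : -1);
	}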
+	/*
+	 * At this point we know that we want to use pcmpistri.
+	 * One last problem obtains: the head of the string is not
+	 * aligned and may cross a page.  If this is the case,
+	 * we take the part before the page boundary and repeat the
+	 * last byte to fill up the xmm register.
+	 */
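
In C, the page-crossing test performed by the next four instructions is
roughly the following (a sketch; the assembly takes PAGE_SIZE from
machine/param.h).  It works for any power-of-two page size because
advancing into the next page always flips the lowest page-number bit:

	#include <stdint.h>

	#define PAGE_SIZE 4096		/* assumption: amd64 base pages */

	/* nonzero iff a 16-byte load at s straddles a page boundary */
	static int
	head_crosses_page(const char *s)
	{
		return ((((uintptr_t)s ^ ((uintptr_t)s + 15)) & PAGE_SIZE) != 0);
	}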
+	mov		%rdi, %rax		# save original string pointer
+	lea		15(%rdi), %esi		# last byte of the head
+	xor		%edi, %esi
+	test		$PAGE_SIZE, %esi	# does the head cross a page?
+	jz		0f
+
+	/* head crosses page: copy to stack to fix up */
+	and		$~0xf, %rax		# align head pointer temporarily
+	movzbl		15(%rax), %esi		# last head byte on the page
+	movdqa		(%rax), %xmm0
+	movabs		$0x0101010101010101, %r8
+	imul		%r8, %rsi		# repeated 8 times
+	movdqa		%xmm0, (%rsp)		# head word on stack
+	mov		%rsi, 16(%rsp)		# followed by filler (last byte x8)
+	mov		%rsi, 24(%rsp)
+	mov		%edi, %eax
+	and		$0xf, %eax		# offset of head from alignment
+	add		%rsp, %rax		# pointer to fake head
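
The imul above is a branch-free byte broadcast: multiplying a byte by
0x0101010101010101 replicates it into all eight lanes of a 64-bit word,
e.g.

	#include <stdint.h>

	/* eight copies of b, as produced by the imul above */
	static uint64_t
	broadcast8(uint8_t b)
	{
		return ((uint64_t)b * 0x0101010101010101ULL);
	}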
+
+0:	movdqu		(%rax), %xmm1		# load head (fake or real)
+	lea		16(%rdi), %rax
+	and		$~0xf, %rax		# second 16 bytes of string (aligned)
+1:	cmp		$16, %edx		# 17--32 bytes?
+	ja		.Lgt16v2
+
+
+	/* set is 2--16 bytes in size */
+
+	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT|_SIDD_NEGATIVE_POLARITY */
+	pcmpistri	$0x10, %xmm1, %xmm2	# match in head?
+	jc		.Lheadmismatchv2
+
+	ALIGN_TEXT
+0:	pcmpistri	$0x10, (%rax), %xmm2
+	jc		1f			# match or end of string?
+	pcmpistri	$0x10, 16(%rax), %xmm2
+	lea		32(%rax), %rax
+	jnc		0b			# match or end of string?
+
+	sub		$16, %rax		# go back to second half
+1:	sub		%rdi, %rax		# offset of (%rax) from beginning of string
+	add		%rcx, %rax		# prefix length before match/NUL
+	leave
+	ret
+
+.Lheadmismatchv2:
+	mov		%ecx, %eax		# prefix length before mismatch/NUL
+	leave
+	ret
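
The 2--16 byte loop above corresponds to roughly this intrinsics sketch
(hypothetical function name; it assumes every 16-byte load from the
string is safe, which the head fix-up establishes for the first chunk
and alignment establishes for the rest):

	#include <nmmintrin.h>		/* SSE4.2 */
	#include <stddef.h>

	#define SPN_MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | \
	    _SIDD_LEAST_SIGNIFICANT | _SIDD_NEGATIVE_POLARITY)

	/* xset holds the NUL-terminated set, strlen(set) <= 16 */
	static size_t
	strspn_sse42_small(const char *s, __m128i xset)
	{
		size_t i = 0;

		for (;;) {
			__m128i chunk =
			    _mm_loadu_si128((const __m128i *)(s + i));

			/* CF set: a byte outside the set or the end of
			   the string was seen; ECX holds its index */
			if (_mm_cmpistrc(xset, chunk, SPN_MODE))
				return (i + _mm_cmpistri(xset, chunk, SPN_MODE));

			i += 16;
		}
	}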
+
+	/* set is 17--32 bytes in size */
+.Lgt16v2:
+	movdqu		48(%rsp, %rcx, 1), %xmm3 # second part of set
+
+	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY */
+	pcmpistrm	$0x10, %xmm1, %xmm2	# any mismatch in first half?
+	movdqa		%xmm0, %xmm4
+	pcmpistrm	$0x10, %xmm1, %xmm3	# any mismatch in the second half?
+	ptest		%xmm0, %xmm4		# any entry that doesn't match either?
+	jnz		2f
+
+	ALIGN_TEXT
+0:	movdqa		(%rax), %xmm1
+	pcmpistrm	$0x10, %xmm1, %xmm2
+	movdqa		%xmm0, %xmm4
+	pcmpistrm	$0x10, %xmm1, %xmm3
+	ptest		%xmm0, %xmm4
+	jnz		1f
+	movdqa		16(%rax), %xmm1
+	add		$32, %rax
+	pcmpistrm	$0x10, %xmm1, %xmm2
+	movdqa		%xmm0, %xmm4
+	pcmpistrm	$0x10, %xmm1, %xmm3
+	ptest		%xmm0, %xmm4
+	jz		0b
+
+	sub		$16, %rax
+1:	pand		%xmm4, %xmm0
+	movd		%xmm0, %ecx
+	sub		%rdi, %rax		# offset of %xmm1 from beginning of string
+	tzcnt		%ecx, %ecx
+	add		%rcx, %rax		# prefix length before match/NUL
+	leave
+	ret
+
+	/* mismatch or string end in head */
+2:	pand		%xmm4, %xmm0		# bit mask of mismatches (end of string counts)
+	movd		%xmm0, %eax
+	tzcnt		%eax, %eax		# prefix length before mismatch/NUL
+	leave
+	ret
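
For sets of 17--32 bytes, two pcmpistrm per chunk are needed: a byte
ends the scan only if it appears in neither half of the set, and bytes
past the string's NUL come out as absent from both masks, so ANDing the
two mismatch masks catches the terminator as well.  Roughly, under the
same no-fault assumptions as before:

	#include <nmmintrin.h>		/* SSE4.2 */
	#include <stddef.h>

	#define SPN_MASK_MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | \
	    _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)

	/* lo and hi are the two 16-byte halves of the set */
	static size_t
	strspn_sse42_wide(const char *s, __m128i lo, __m128i hi)
	{
		size_t i = 0;

		for (;;) {
			__m128i chunk =
			    _mm_loadu_si128((const __m128i *)(s + i));
			__m128i mlo = _mm_cmpistrm(lo, chunk, SPN_MASK_MODE);
			__m128i mhi = _mm_cmpistrm(hi, chunk, SPN_MASK_MODE);
			__m128i both = _mm_and_si128(mlo, mhi);

			/* bit j of both: chunk[j] is in neither half */
			if (!_mm_testz_si128(both, both))
				return (i + __builtin_ctz(
				    (unsigned)_mm_cvtsi128_si32(both)));

			i += 16;
		}
	}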
+
+	/* set is >=33 bytes in size */
+.Lgt32v2:
+	xorps	%xmm0, %xmm0
+	mov	$256-64, %edx
+
+	/* clear out lookup table */
+0:	movaps	%xmm0, (%rsp, %rdx, 1)
+	movaps	%xmm0, 16(%rsp, %rdx, 1)
+	movaps	%xmm0, 32(%rsp, %rdx, 1)
+	movaps	%xmm0, 48(%rsp, %rdx, 1)
+	sub	$64, %edx
+	jnc	0b
+
+	add	%rcx, %rsi		# restore set pointer
+	mov	%rdi, %rax		# keep a copy of the string
+
+	/* initialise lookup table */
+	movzbl	(%rsi), %ecx		# set is known not to be empty
+
+	ALIGN_TEXT
+0:	movb	$1, (%rsp, %rcx, 1)
+	movzbl	1(%rsi), %ecx
+	test	%ecx, %ecx
+	jz	1f
+
+	movb	$1, (%rsp, %rcx, 1)
+	movzbl	2(%rsi), %ecx
+	test	%ecx, %ecx
+	jz	1f
+
+	movb	$1, (%rsp, %rcx, 1)
+	movzbl	3(%rsi), %ecx
+	add	$4, %rsi
+	test	%ecx, %ecx
+	jz	1f
+
+	movb	$1, (%rsp, %rcx, 1)
+	movzbl	(%rsi), %ecx
+	test	%ecx, %ecx
+	jnz	0b
+
+	/* find mismatch */
+	ALIGN_TEXT
+1:	movzbl	(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	je	2f
+
+	movzbl	1(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	je	3f
+
+	movzbl	2(%rax), %ecx
+	cmpb	$0, (%rsp, %rcx, 1)
+	je	4f
+
+	movzbl	3(%rax), %ecx
+	add	$4, %rax
+	cmpb	$0, (%rsp, %rcx, 1)
+	jne	1b
+
+	sub	$3, %rax
+4:	dec	%rdi
+3:	inc	%rax
+2:	sub	%rdi, %rax		# number of characters preceding match
+	leave
+	ret
+ARCHEND(strspn, x86_64_v2)
+
+	.section .note.GNU-stack,"",%progbits