git: 9fbea870286d - main - lib/libc/amd64/string/stpcpy.S: add baseline implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Mon, 21 Aug 2023 19:28:43 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=9fbea870286d53d906ffaf6b15ace8e40019a880

commit 9fbea870286d53d906ffaf6b15ace8e40019a880
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-07-05 21:23:33 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-08-21 18:59:38 +0000

    lib/libc/amd64/string/stpcpy.S: add baseline implementation
    
    This commit adds a baseline implementation of stpcpy(3) for amd64.
    It performs quite well in comparison to the previous scalar implementation
    as well as against bionic and glibc (though glibc is faster for very long
    strings).  Fiddle with the Makefile to also have strcpy(3) call into the
    optimised stpcpy(3) code, fixing an oversight from D9841.
    
    Sponsored by:   The FreeBSD Foundation
    Reviewed by:    imp ngie emaste
    Approved by:    mjg kib
    Fixes:          D9841
    Differential Revision:  https://reviews.freebsd.org/D41349
---
 lib/libc/amd64/string/Makefile.inc |   3 +-
 lib/libc/amd64/string/stpcpy.S     | 143 ++++++++++++++++++++++++++++++++++---
 2 files changed, 135 insertions(+), 11 deletions(-)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index 3bc36078768b..4df4ff8f1417 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -6,8 +6,9 @@ MDSRCS+= \
 	memcpy.S \
 	memmove.S \
 	memset.S \
+	stpcpy.S \
 	strcat.S \
 	strchrnul.S \
 	strcmp.S \
 	strlen.S \
-	stpcpy.S
+	strcpy.c
diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S
index 73c765556dc1..59358e3245a8 100644
--- a/lib/libc/amd64/string/stpcpy.S
+++ b/lib/libc/amd64/string/stpcpy.S
@@ -1,10 +1,30 @@
-/*
- * Adapted by Guillaume Morin <guillaume@morinfr.org> from strcpy.S
- * written by J.T. Conklin <jtc@acorntoolworks.com>
- * Public domain.
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
+ * written by J.T. Conklin <jtc@acorntoolworks.com> and
+ * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
+ * that was originally dedicated to the public domain
  */
 
 #include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90
+
+	.weak stpcpy
+	.set stpcpy, __stpcpy
+ARCHFUNCS(__stpcpy)
+	ARCHFUNC(__stpcpy, scalar)
+	ARCHFUNC(__stpcpy, baseline)
+ENDARCHFUNCS(__stpcpy)
+
 /*
  * This stpcpy implementation copies a byte at a time until the
  * source pointer is aligned to a word boundary, it then copies by
@@ -18,9 +38,7 @@
  * requirements.
  */
 
-	.globl	stpcpy,__stpcpy
-ENTRY(stpcpy)
-__stpcpy:
+ARCHENTRY(__stpcpy, scalar)
 	movabsq $0x0101010101010101,%r8
 	movabsq $0x8080808080808080,%r9
 
@@ -41,7 +59,7 @@ __stpcpy:
 	dec	%rax
 	ret
 
-	.p2align 4
+	ALIGN_TEXT
 .Lloop:
 	movq	%rdx,(%rdi)
 	addq	$8,%rdi
@@ -109,6 +127,111 @@ __stpcpy:
 .Ldone:
 	movq	%rdi,%rax
 	ret
-END(stpcpy)
-	
+ARCHEND(__stpcpy, scalar)
+
+ARCHENTRY(__stpcpy, baseline)
+	mov	%esi, %ecx
+	mov	%rdi, %rdx
+	sub	%rsi, %rdi		# express destination as distance to source
+	and	$~0xf, %rsi		# align source to 16 byte
+	movdqa	(%rsi), %xmm0		# head of string with junk before
+	pxor	%xmm1, %xmm1
+	and	$0xf, %ecx		# misalignment in bytes
+	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
+	pmovmskb %xmm0, %eax
+	shr	%cl, %eax		# clear out matches in junk bytes
+	bsf	%eax, %eax		# find match if any
+	jnz	.Lrunt
+
+	/* first normal iteration: write head back if it succeeds */
+	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
+	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax		# find match if any
+	jnz	.Lshorty
+
+	movdqu	%xmm2, (%rdx)		# store beginning of string
+
+	/* main loop, unrolled twice */
+	ALIGN_TEXT
+0:	movdqa	32(%rsi), %xmm2		# load current iteration
+	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
+	pxor	%xmm1, %xmm1
+	add	$32, %rsi
+	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	1f
+
+	movdqa	16(%rsi), %xmm0		# load current iteration
+	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
+	pxor	%xmm1, %xmm1
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jz	0b
+
+	/* end of string after main loop has iterated */
+	add	$16, %rsi		# advance rsi to second unrolled half
+1:	tzcnt	%eax, %eax		# find location of match
+					# (behaves as bsf on pre-x86-64-v3 CPUs)
+	add	%rsi, %rax		# point to NUL byte
+	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
+	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
+	add	%rdi, %rax		# point to destination's NUL byte
+	ret
+
+	/* NUL encountered in second iteration */
+.Lshorty:
+	tzcnt	%eax, %eax
+	add	$16, %eax		# account for length of first iteration
+	sub	%ecx, %eax		# but not the parts before the string
+
+	/* NUL encountered in first iteration */
+.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
+	add	%rcx, %rsi		# point to beginning of string
+	add	%rdx, %rax		# point to NUL byte
+
+	/* transfer 16--32 bytes */
+.L1632:	cmp	$16, %edi
+	jb	.L0815
+
+	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
+	movdqu	%xmm2, (%rdx)		# store first 16 bytes
+	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
+	ret
+
+	/* transfer 8--15 bytes */
+.L0815:	cmp	$8, %edi
+	jb	.L0407
+
+	mov	(%rsi), %rcx		# load first 8 bytes
+	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
+	mov	%rcx, (%rdx)		# store to dst
+	mov	%rdi, -7(%rax)		# ditto
+	ret
+
+	/* transfer 4--7 bytes */
+.L0407:	cmp	$4, %edi
+	jb	.L0203
+
+	mov	(%rsi), %ecx
+	mov	-4(%rsi, %rdi, 1), %edi
+	mov	%ecx, (%rdx)
+	mov	%edi, -3(%rax)
+	ret
+
+	/* transfer 2--3 bytes */
+.L0203:	cmp	$2, %edi
+	jb	.L0101
+
+	movzwl	(%rsi), %ecx
+	mov	%cx, (%rdx)		# store first two bytes
+
+	/* transfer 0 bytes (last byte is always NUL) */
+.L0101:	movb	$0, (%rax)		# store terminating NUL byte
+	ret
+ARCHEND(__stpcpy, baseline)
+
 	.section .note.GNU-stack,"",%progbits