git: 9fbea870286d - main - lib/libc/amd64/string/stpcpy.S: add baseline implementation
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Mon, 21 Aug 2023 19:28:43 UTC
The branch main has been updated by fuz: URL: https://cgit.FreeBSD.org/src/commit/?id=9fbea870286d53d906ffaf6b15ace8e40019a880 commit 9fbea870286d53d906ffaf6b15ace8e40019a880 Author: Robert Clausecker <fuz@FreeBSD.org> AuthorDate: 2023-07-05 21:23:33 +0000 Commit: Robert Clausecker <fuz@FreeBSD.org> CommitDate: 2023-08-21 18:59:38 +0000 lib/libc/amd64/string/stpcpy.S: add baseline implementation This commit adds a baseline implementation of stpcpy(3) for amd64. It performs quite well in comparison to the previous scalar implementation as well as against bionic and glibc (though glibc is faster for very long strings). Fiddle with the Makefile to also have strcpy(3) call into the optimised stpcpy(3) code, fixing an oversight from D9841. Sponsored by: The FreeBSD Foundation Reviewed by: imp ngie emaste Approved by: mjg kib Fixes: D9841 Differential Revision: https://reviews.freebsd.org/D41349 --- lib/libc/amd64/string/Makefile.inc | 3 +- lib/libc/amd64/string/stpcpy.S | 143 ++++++++++++++++++++++++++++++++++--- 2 files changed, 135 insertions(+), 11 deletions(-) diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index 3bc36078768b..4df4ff8f1417 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -6,8 +6,9 @@ MDSRCS+= \ memcpy.S \ memmove.S \ memset.S \ + stpcpy.S \ strcat.S \ strchrnul.S \ strcmp.S \ strlen.S \ - stpcpy.S + strcpy.c diff --git a/lib/libc/amd64/string/stpcpy.S b/lib/libc/amd64/string/stpcpy.S index 73c765556dc1..59358e3245a8 100644 --- a/lib/libc/amd64/string/stpcpy.S +++ b/lib/libc/amd64/string/stpcpy.S @@ -1,10 +1,30 @@ -/* - * Adapted by Guillaume Morin <guillaume@morinfr.org> from strcpy.S - * written by J.T. Conklin <jtc@acorntoolworks.com> - * Public domain. 
+/*- + * Copyright (c) 2023, The FreeBSD Foundation + * + * SPDX-License-Identifier: BSD-2-Clause + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * + * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S + * written by J.T. Conklin <jtc@acorntoolworks.com> and + * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy + * that was originally dedicated to the public domain */ #include <machine/asm.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpcpy + .set stpcpy, __stpcpy +ARCHFUNCS(__stpcpy) + ARCHFUNC(__stpcpy, scalar) + ARCHFUNC(__stpcpy, baseline) +ENDARCHFUNCS(__stpcpy) + /* * This stpcpy implementation copies a byte at a time until the * source pointer is aligned to a word boundary, it then copies by @@ -18,9 +38,7 @@ * requirements. */ - .globl stpcpy,__stpcpy -ENTRY(stpcpy) -__stpcpy: +ARCHENTRY(__stpcpy, scalar) movabsq $0x0101010101010101,%r8 movabsq $0x8080808080808080,%r9 @@ -41,7 +59,7 @@ __stpcpy: dec %rax ret - .p2align 4 + ALIGN_TEXT .Lloop: movq %rdx,(%rdi) addq $8,%rdi @@ -109,6 +127,111 @@ __stpcpy: .Ldone: movq %rdi,%rax ret -END(stpcpy) - +ARCHEND(__stpcpy, scalar) + +ARCHENTRY(__stpcpy, baseline) + mov %esi, %ecx + mov %rdi, %rdx + sub %rsi, %rdi # express destination as distance to source + and $~0xf, %rsi # align source to 16 byte + movdqa (%rsi), %xmm0 # head of string with junk before + pxor %xmm1, %xmm1 + and $0xf, %ecx # misalignment in bytes + pcmpeqb %xmm1, %xmm0 # NUL byte present? + pmovmskb %xmm0, %eax + shr %cl, %eax # clear out matches in junk bytes + bsf %eax, %eax # find match if any + jnz .Lrunt + + /* first normal iteration: write head back if it succeeds */ + movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration + movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string + pcmpeqb %xmm0, %xmm1 # NUL byte present? 
+ pmovmskb %xmm1, %eax + test %eax, %eax # find match if any + jnz .Lshorty + + movdqu %xmm2, (%rdx) # store beginning of string + + /* main loop, unrolled twice */ + ALIGN_TEXT +0: movdqa 32(%rsi), %xmm2 # load current iteration + movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteration + pxor %xmm1, %xmm1 + add $32, %rsi + pcmpeqb %xmm2, %xmm1 # NUL byte present? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 1f + + movdqa 16(%rsi), %xmm0 # load current iteration + movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteration + pxor %xmm1, %xmm1 + pcmpeqb %xmm0, %xmm1 # NUL byte present? + pmovmskb %xmm1, %eax + test %eax, %eax + jz 0b + + /* end of string after main loop has iterated */ + add $16, %rsi # advance rsi to second unrolled half +1: tzcnt %eax, %eax # find location of match + # (behaves as bsf on pre-x86-64-v3 CPUs) + add %rsi, %rax # point to NUL byte + movdqu -15(%rax), %xmm0 # last 16 bytes of string + movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination + add %rdi, %rax # point to destination's NUL byte + ret + + /* NUL encountered in second iteration */ +.Lshorty: + tzcnt %eax, %eax + add $16, %eax # account for length of first iteration + sub %ecx, %eax # but not the parts before the string + + /* NUL encountered in first iteration */ +.Lrunt: lea 1(%rax), %edi # string length including NUL byte + add %rcx, %rsi # point to beginning of string + add %rdx, %rax # point to NUL byte + + /* transfer 16--32 bytes */ +.L1632: cmp $16, %edi + jb .L0815 + + movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes + movdqu %xmm2, (%rdx) # store first 16 bytes + movdqu %xmm0, -15(%rax) # store last 16 bytes + ret + + /* transfer 8--15 bytes */ +.L0815: cmp $8, %edi + jb .L0407 + + mov (%rsi), %rcx # load first 8 bytes + mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes + mov %rcx, (%rdx) # store to dst + mov %rdi, -7(%rax) # ditto + ret + + /* transfer 4--7 bytes */ +.L0407: cmp $4, %edi + jb .L0203 + + mov (%rsi), %ecx + mov -4(%rsi, %rdi, 1), %edi + mov 
%ecx, (%rdx) + mov %edi, -3(%rax) + ret + + /* transfer 2--3 bytes */ +.L0203: cmp $2, %edi + jb .L0101 + + movzwl (%rsi), %ecx + mov %cx, (%rdx) # store first two bytes + + /* transfer 0 bytes (last byte is always NUL) */ +.L0101: movb $0, (%rax) # store terminating NUL byte + ret +ARCHEND(__stpcpy, baseline) + .section .note.GNU-stack,"",%progbits