git: d8385768fb12 - main - lib/libc/amd64/string/strlen.S: add amd64 baseline kernel
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 03 Aug 2023 22:55:50 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=d8385768fb12e6205d73a20ad05fba9f3281b6e1

commit d8385768fb12e6205d73a20ad05fba9f3281b6e1
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-08-03 22:48:32 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-08-03 22:54:23 +0000

    lib/libc/amd64/string/strlen.S: add amd64 baseline kernel

    This performs very well. x86-64-v3 and x86-64-v4 kernels were written,
    too, but performed worse than the baseline kernel on short strings.
    These may be added at a future point in time if the performance issues
    can be fixed.

    os: FreeBSD
    arch: amd64
    cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
            │ strlen_scalar.out │          strlen_baseline.out          │
            │        B/s        │     B/s       vs base                 │
    Short          1.667Gi ± 1%    2.676Gi ± 1%   +60.55% (p=0.000 n=20)
    Mid            5.459Gi ± 1%    8.756Gi ± 1%   +60.39% (p=0.000 n=20)
    Long           15.34Gi ± 0%    52.27Gi ± 0%  +240.64% (p=0.000 n=20)
    geomean        5.188Gi         10.70Gi       +106.24%

    Sponsored by:   The FreeBSD Foundation
    Approved by:    kib
    Reviewed by:    mjg jrtc27
    Differential Revision:  https://reviews.freebsd.org/D40693
---
 lib/libc/amd64/string/strlen.S | 58 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/lib/libc/amd64/string/strlen.S b/lib/libc/amd64/string/strlen.S index 1d2428e3420e..7e2514de44b0 100644 --- a/lib/libc/amd64/string/strlen.S +++ b/lib/libc/amd64/string/strlen.S @@ -1,11 +1,18 @@ -/* +/*- * Written by Mateusz Guzik <mjg@freebsd.org> + * Copyright (c) 2023 The FreeBSD Foundation + * + * Portions of this software were developed by Robert Clausecker + * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation. + * * Public domain. 
*/ #include <machine/asm.h> __FBSDID("$FreeBSD$"); +#include "amd64_archlevel.h" + /* * Note: this routine was written with kernel use in mind (read: no simd), * it is only present in userspace as a temporary measure until something @@ -14,6 +21,11 @@ __FBSDID("$FreeBSD$"); #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ +ARCHFUNCS(strlen) + ARCHFUNC(strlen, scalar) + ARCHFUNC(strlen, baseline) +ENDARCHFUNCS(strlen) + /* * strlen(string) * %rdi @@ -30,7 +42,7 @@ __FBSDID("$FreeBSD$"); * * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386. */ -ENTRY(strlen) +ARCHENTRY(strlen, scalar) movabsq $0xfefefefefefefeff,%r8 movabsq $0x8080808080808080,%r9 @@ -76,6 +88,46 @@ ENTRY(strlen) leaq (%rcx,%rdi),%rax subq %r10,%rax ret -END(strlen) +ARCHEND(strlen, scalar) + +ARCHENTRY(strlen, baseline) + mov %rdi, %rcx + pxor %xmm1, %xmm1 + and $~0xf, %rdi # align string + pcmpeqb (%rdi), %xmm1 # compare head (with junk before string) + mov %rcx, %rsi # string pointer copy for later + and $0xf, %ecx # amount of bytes rdi is past 16 byte alignment + pmovmskb %xmm1, %eax + add $32, %rdi # advance to next iteration + shr %cl, %eax # clear out matches in junk bytes + test %eax, %eax # any match? (can't use ZF from SHR as CL=0 is possible) + jnz 2f + + ALIGN_TEXT +1: pxor %xmm1, %xmm1 + pcmpeqb -16(%rdi), %xmm1 # find NUL bytes + pmovmskb %xmm1, %eax + test %eax, %eax # were any NUL bytes present? + jnz 3f + + /* the same unrolled once more */ + pxor %xmm1, %xmm1 + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rdi # advance to next iteration + test %eax, %eax + jz 1b + + /* match found in loop body */ + sub $16, %rdi # undo half the advancement +3: tzcnt %eax, %eax # find the first NUL byte + sub %rsi, %rdi # string length until beginning of (%rdi) + lea -16(%rdi, %rax, 1), %rax # that plus loc. 
of NUL byte: full string length + ret + + /* match found in head */ +2: tzcnt %eax, %eax # compute string length + ret +ARCHEND(strlen, baseline) .section .note.GNU-stack,"",%progbits