git: d8385768fb12 - main - lib/libc/amd64/string/strlen.S: add amd64 baseline kernel

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Thu, 03 Aug 2023 22:55:50 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=d8385768fb12e6205d73a20ad05fba9f3281b6e1

commit d8385768fb12e6205d73a20ad05fba9f3281b6e1
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-08-03 22:48:32 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-08-03 22:54:23 +0000

    lib/libc/amd64/string/strlen.S: add amd64 baseline kernel
    
    This performs very well.  x86-64-v3 and x86-64-v4 kernels were written,
    too, but performed worse than the baseline kernel on short strings.
    These may be added at a future point in time if the performance issues
    can be fixed.
    
    os: FreeBSD
    arch: amd64
    cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
            │ strlen_scalar.out │          strlen_baseline.out          │
            │        B/s        │     B/s       vs base                 │
    Short          1.667Gi ± 1%   2.676Gi ± 1%   +60.55% (p=0.000 n=20)
    Mid            5.459Gi ± 1%   8.756Gi ± 1%   +60.39% (p=0.000 n=20)
    Long           15.34Gi ± 0%   52.27Gi ± 0%  +240.64% (p=0.000 n=20)
    geomean        5.188Gi        10.70Gi       +106.24%
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    kib
    Reviewed by:    mjg jrtc27
    Differential Revision:  https://reviews.freebsd.org/D40693
---
 lib/libc/amd64/string/strlen.S | 58 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/lib/libc/amd64/string/strlen.S b/lib/libc/amd64/string/strlen.S
index 1d2428e3420e..7e2514de44b0 100644
--- a/lib/libc/amd64/string/strlen.S
+++ b/lib/libc/amd64/string/strlen.S
@@ -1,11 +1,18 @@
-/*
+/*-
  * Written by Mateusz Guzik <mjg@freebsd.org>
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
  * Public domain.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
+#include "amd64_archlevel.h"
+
 /*
  * Note: this routine was written with kernel use in mind (read: no simd),
  * it is only present in userspace as a temporary measure until something
@@ -14,6 +21,11 @@ __FBSDID("$FreeBSD$");
 
 #define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
 
+ARCHFUNCS(strlen)			/* list of strlen variants; NOTE(review): presumably amd64_archlevel.h picks one at run time -- confirm */
+	ARCHFUNC(strlen, scalar)	/* variant defined by ARCHENTRY(strlen, scalar) below */
+	ARCHFUNC(strlen, baseline)	/* variant defined by ARCHENTRY(strlen, baseline) below */
+ENDARCHFUNCS(strlen)
+
 /*
  * strlen(string)
  *	  %rdi
@@ -30,7 +42,7 @@ __FBSDID("$FreeBSD$");
  *
  * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
  */
-ENTRY(strlen)
+ARCHENTRY(strlen, scalar)
 	movabsq	$0xfefefefefefefeff,%r8
 	movabsq	$0x8080808080808080,%r9
 
@@ -76,6 +88,46 @@ ENTRY(strlen)
 	leaq	(%rcx,%rdi),%rax
 	subq	%r10,%rax
 	ret
-END(strlen)
+ARCHEND(strlen, scalar)
+
+ARCHENTRY(strlen, baseline)		/* in: %rdi = string; out: %rax = length; scans 16 bytes at a time using xmm1 */
+	mov	%rdi, %rcx			# rcx = original string pointer
+	pxor	%xmm1, %xmm1			# xmm1 = 0, to compare against NUL
+	and	$~0xf, %rdi			# round rdi down to a 16 byte boundary
+	pcmpeqb	(%rdi), %xmm1			# find NUL bytes in head chunk (may include junk bytes before the string)
+	mov	%rcx, %rsi			# rsi = string pointer, kept for the final length computation
+	and	$0xf, %ecx			# ecx = number of bytes the string start is past 16 byte alignment
+	pmovmskb %xmm1, %eax			# eax = bit mask, one bit per byte that compared equal to NUL
+	add	$32, %rdi			# rdi = aligned base + 32; the loop reads -16(%rdi) and (%rdi)
+	shr	%cl, %eax			# shift out matches in junk bytes; bit 0 now corresponds to the string start
+	test	%eax, %eax			# any match? (can't use ZF from SHR as CL=0 is possible)
+	jnz	2f				# NUL found in head chunk
+
+	ALIGN_TEXT
+1:	pxor	%xmm1, %xmm1			# main loop: two 16 byte chunks per iteration
+	pcmpeqb	-16(%rdi), %xmm1		# find NUL bytes in the first chunk of this iteration
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax			# were any NUL bytes present?
+	jnz	3f				# yes: the matching chunk starts at rdi - 16
+
+	/* the same unrolled once more, for the chunk at (%rdi) */
+	pxor	%xmm1, %xmm1
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb %xmm1, %eax
+	add	$32, %rdi			# advance to next iteration; precedes TEST as ADD overwrites the flags
+	test	%eax, %eax
+	jz	1b
+
+	/* match found in second chunk of the loop body; rdi was already advanced by 32 */
+	sub	$16, %rdi			# undo half the advancement: the matching chunk now starts at rdi - 16
+3:	tzcnt	%eax, %eax			# eax = index of the first NUL byte within the chunk (eax != 0 here)
+	sub	%rsi, %rdi			# rdi = offset of (%rdi) from the string start
+	lea	-16(%rdi, %rax, 1), %rax	# minus 16 for the chunk start, plus index of NUL byte: full string length
+	ret
+
+	/* match found in head: eax was shifted so that bit 0 is the string start */
+2:	tzcnt	%eax, %eax			# index of the first NUL byte is the string length
+	ret
+ARCHEND(strlen, baseline)
 
 	.section .note.GNU-stack,"",%progbits