git: 8803f01e9322 - main - lib/libc/amd64/string/memcmp.S: add baseline implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Mon, 21 Aug 2023 19:28:45 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=8803f01e932275cd405690526bb8dba031a02ffe

commit 8803f01e932275cd405690526bb8dba031a02ffe
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2023-07-12 13:35:13 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2023-08-21 19:19:46 +0000

    lib/libc/amd64/string/memcmp.S: add baseline implementation
    
    This changeset adds a baseline implementation of memcmp and bcmp
    for amd64. The same code is used for both functions, with conditional
    code where the behaviour differs (we need more precise output for the
    memcmp case).
    
    FreeBSD documents that memcmp returns the difference between the
    first pair of mismatching characters. Slightly faster code would be
    possible if we could relax this requirement to the ISO/IEC 9899:1999
    requirement of merely returning a negative/positive integer or zero.
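    
    As an illustration (a minimal example, not from the commit): with the
    documented FreeBSD behaviour the return value is the actual byte
    difference, while ISO C only constrains its sign:
    
        #include <string.h>
        
        int
        main(void)
        {
            unsigned char a[] = { 0x10 };
            unsigned char b[] = { 0x40 };
            int r = memcmp(a, b, 1);
        
            /* FreeBSD memcmp(3) documents r == 0x10 - 0x40 == -48;
             * ISO/IEC 9899:1999 only requires r to be negative. */
            return (r < 0 ? 0 : 1);
        }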
    
    Performance is better than bionic and glibc, except for long strings,
    where the two are 13% faster. This could be because they use the SSE4
    ptest instruction, which we cannot use in a baseline kernel.
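    
    For context, a rough sketch of how ptest can help (an assumption about
    their approach, not their actual code): with SSE4.1 a 16-byte block can
    be checked for equality with a single ptest of the XOR of the two loads,
    instead of the pcmpeqb/pmovmskb/cmp sequence the baseline kernel has to
    use:
    
        #include <emmintrin.h>  /* SSE2 loads and xor (amd64 baseline) */
        #include <smmintrin.h>  /* SSE4.1 _mm_testz_si128 (ptest) */
        
        /* nonzero iff the 16 bytes at p and q are identical;
         * compile with -msse4.1 */
        static int
        blocks_equal_sse41(const void *p, const void *q)
        {
            __m128i x = _mm_xor_si128(_mm_loadu_si128(p),
                _mm_loadu_si128(q));
        
            return (_mm_testz_si128(x, x)); /* ptest: 1 iff x is all zero */
        }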
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    mjg
    Differential Revision:  https://reviews.freebsd.org/D41442
---
 lib/libc/amd64/string/memcmp.S | 181 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 175 insertions(+), 6 deletions(-)

diff --git a/lib/libc/amd64/string/memcmp.S b/lib/libc/amd64/string/memcmp.S
index fea5cebc65f2..d192229677b3 100644
--- a/lib/libc/amd64/string/memcmp.S
+++ b/lib/libc/amd64/string/memcmp.S
@@ -1,9 +1,12 @@
 /*-
- * Copyright (c) 2018 The FreeBSD Foundation
+ * Copyright (c) 2018, 2023 The FreeBSD Foundation
  *
  * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
  * under sponsorship from the FreeBSD Foundation.
  *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -27,6 +30,10 @@
  */
 
 #include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
 /*
  * Note: this routine was written with kernel use in mind (read: no simd),
  * it is only present in userspace as a temporary measure until something
@@ -36,10 +43,15 @@
 #define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
 
 #ifdef BCMP
-ENTRY(bcmp)
-#else
-ENTRY(memcmp)
+#define memcmp bcmp
 #endif
+
+ARCHFUNCS(memcmp)
+	ARCHFUNC(memcmp, scalar)
+	ARCHFUNC(memcmp, baseline)
+ENDARCHFUNCS(memcmp)
+
+ARCHENTRY(memcmp, scalar)
 	xorl	%eax,%eax
 10:
 	cmpq	$16,%rdx
@@ -157,7 +169,6 @@ ENTRY(memcmp)
 1:
 	leal	1(%eax),%eax
 	ret
-END(bcmp)
 #else
 /*
  * We need to compute the difference between strings.
@@ -230,7 +241,165 @@ END(bcmp)
 2:
 	subl	%r8d,%eax
 	ret
-END(memcmp)
 #endif
+ARCHEND(memcmp, scalar)
+
+ARCHENTRY(memcmp, baseline)
+	cmp		$32, %rdx		# enough to permit use of the long kernel?
+	ja		.Llong
+
+	test		%rdx, %rdx		# zero bytes buffer?
+	je		.L0
+
+	/*
+	 * Compare strings of 1--32 bytes.  We want to do this by
+	 * loading into two xmm registers and then comparing.  To avoid
+	 * crossing into unmapped pages, we either load 32 bytes from
+	 * the start of the buffer or 32 bytes before its end, depending
+	 * on whether there is a page boundary between the overread area
+	 * or not.
+	 */
+
+	/* check for page boundaries overreads */
+	lea		31(%rdi), %eax		# end of overread
+	lea		31(%rsi), %r8d
+	lea		-1(%rdi, %rdx, 1), %ecx	# last character in buffer
+	lea		-1(%rsi, %rdx, 1), %r9d
+	xor		%ecx, %eax
+	xor		%r9d, %r8d
+	test		$PAGE_SIZE, %eax	# are they on different pages?
+	jz		0f
+
+	/* fix up rdi */
+	movdqu		-32(%rdi, %rdx, 1), %xmm0
+	movdqu		-16(%rdi, %rdx, 1), %xmm1
+	lea		-8(%rsp), %rdi		# end of replacement buffer
+	sub		%rdx, %rdi		# start of replacement buffer
+	movdqa		%xmm0, -40(%rsp)	# copy to replacement buffer
+	movdqa		%xmm1, -24(%rsp)
+
+0:	test		$PAGE_SIZE, %r8d
+	jz		0f
+
+	/* fix up rsi */
+	movdqu		-32(%rsi, %rdx, 1), %xmm0
+	movdqu		-16(%rsi, %rdx, 1), %xmm1
+	lea		-40(%rsp), %rsi		# end of replacement buffer
+	sub		%rdx, %rsi		# start of replacement buffer
+	movdqa		%xmm0, -72(%rsp)	# copy to replacement buffer
+	movdqa		%xmm1, -56(%rsp)
+
+	/* load data and compare properly */
+0:	movdqu		16(%rdi), %xmm1
+	movdqu		16(%rsi), %xmm3
+	movdqu		(%rdi), %xmm0
+	movdqu		(%rsi), %xmm2
+	mov		%edx, %ecx
+	mov		$-1, %edx
+	shl		%cl, %rdx		# ones where the buffer is not
+	pcmpeqb		%xmm3, %xmm1
+	pcmpeqb		%xmm2, %xmm0
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm0, %eax
+	shl		$16, %ecx
+	or		%ecx, %eax		# ones where the buffers match
+	or		%edx, %eax		# including where the buffer is not
+	not		%eax			# ones where there is a mismatch
+#ifndef BCMP
+	bsf		%eax, %edx		# location of the first mismatch
+	cmovz		%eax, %edx		# including if there is no mismatch
+	movzbl		(%rdi, %rdx, 1), %eax	# mismatching bytes
+	movzbl		(%rsi, %rdx, 1), %edx
+	sub		%edx, %eax
+#endif
+	ret
+
+	/* empty input */
+.L0:	xor		%eax, %eax
+	ret
+
+	/* compare 33+ bytes */
+	ALIGN_TEXT
+.Llong:	movdqu		(%rdi), %xmm0		# load head
+	movdqu		(%rsi), %xmm2
+	mov		%rdi, %rcx
+	sub		%rdi, %rsi		# express rsi as distance from rdi
+	and		$~0xf, %rdi		# align rdi to 16 bytes
+	movdqu		16(%rsi, %rdi, 1), %xmm1
+	pcmpeqb		16(%rdi), %xmm1		# compare second half of this iteration
+	add		%rcx, %rdx		# pointer to last byte in buffer
+	pcmpeqb		%xmm2, %xmm0
+	pmovmskb	%xmm0, %eax
+	xor		$0xffff, %eax		# any mismatch?
+	jne		.Lmismatch_head
+	add		$64, %rdi		# advance to next iteration
+	jmp		1f			# and get going with the loop
+
+	/* process buffer 32 bytes at a time */
+	ALIGN_TEXT
+0:	movdqu		-32(%rsi, %rdi, 1), %xmm0
+	movdqu		-16(%rsi, %rdi, 1), %xmm1
+	pcmpeqb		-32(%rdi), %xmm0
+	pcmpeqb		-16(%rdi), %xmm1
+	add		$32, %rdi		# advance to next iteration
+1:	pand		%xmm0, %xmm1		# 0xff where both halves matched
+	pmovmskb	%xmm1, %eax
+	cmp		$0xffff, %eax		# all bytes matched?
+	jne		.Lmismatch
+	cmp		%rdx, %rdi		# end of buffer reached?
+	jb		0b
+
+	/* less than 32 bytes left to compare */
+	movdqu		-16(%rdx), %xmm1	# load 32 byte tail through end pointer
+	movdqu		-16(%rdx, %rsi, 1), %xmm3
+	movdqu		-32(%rdx), %xmm0
+	movdqu		-32(%rdx, %rsi, 1), %xmm2
+	pcmpeqb		%xmm3, %xmm1
+	pcmpeqb		%xmm2, %xmm0
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm0, %eax
+	shl		$16, %ecx
+	or		%ecx, %eax		# ones where the buffers match
+	not		%eax			# ones where there is a mismatch
+#ifndef BCMP
+	bsf		%eax, %ecx		# location of the first mismatch
+	cmovz		%eax, %ecx		# including if there is no mismatch
+	add		%rcx, %rdx		# pointer to potential mismatch
+	movzbl		-32(%rdx), %eax		# mismatching bytes
+	movzbl		-32(%rdx, %rsi, 1), %edx
+	sub		%edx, %eax
+#endif
+	ret
+
+#ifdef BCMP
+.Lmismatch:
+	mov		$1, %eax
+.Lmismatch_head:
+	ret
+#else /* memcmp */
+.Lmismatch_head:
+	tzcnt		%eax, %eax		# location of mismatch
+	add		%rax, %rcx		# pointer to mismatch
+	movzbl		(%rcx), %eax		# mismatching bytes
+	movzbl		(%rcx, %rsi, 1), %ecx
+	sub		%ecx, %eax
+	ret
+
+.Lmismatch:
+	movdqu		-48(%rsi, %rdi, 1), %xmm1
+	pcmpeqb		-48(%rdi), %xmm1	# reconstruct xmm1 before PAND
+	pmovmskb	%xmm0, %eax		# mismatches in first 16 bytes
+	pmovmskb	%xmm1, %edx		# mismatches in second 16 bytes
+	shl		$16, %edx
+	or		%edx, %eax		# mismatches in both
+	not		%eax			# matches in both
+	tzcnt		%eax, %eax		# location of mismatch
+	add		%rax, %rdi		# pointer to mismatch
+	movzbl		-64(%rdi), %eax		# mismatching bytes
+	movzbl		-64(%rdi, %rsi, 1), %ecx
+	sub		%ecx, %eax
+	ret
+#endif
+ARCHEND(memcmp, baseline)
 
 	.section .note.GNU-stack,"",%progbits
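
A note on the short (1--32 byte) path in the diff above: the page-crossing
test XORs the address of the last byte the two 16-byte loads would overread
with the address of the last byte that actually belongs to the buffer, then
tests the PAGE_SIZE bit.  Because the two addresses differ by less than a
page, that bit of the XOR is set exactly when they fall on different pages.
A rough C sketch of the idea (illustration only, assuming 4 KiB base pages):

    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096  /* assumption: 4 KiB base pages */

    /*
     * Would a 32-byte load starting at p read past the page that holds
     * the last valid byte of a len-byte buffer (1 <= len <= 32)?
     */
    static int
    overread_crosses_page(const void *p, size_t len)
    {
        uintptr_t last_loaded = (uintptr_t)p + 31;      /* last byte loaded */
        uintptr_t last_valid = (uintptr_t)p + len - 1;  /* last byte of buffer */

        return (((last_loaded ^ last_valid) & PAGE_SIZE) != 0);
    }

When that is the case, the routine instead loads the 32 bytes ending at the
buffer's last byte and copies them to a scratch area on the stack, which is
what the "fix up rdi"/"fix up rsi" blocks above do.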