git: f2c98669fc1b - main - lib/libc/aarch64/string: add ASIMD-enhanced timingsafe_bcmp implementation

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Fri, 10 Jan 2025 15:04:04 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=f2c98669fc1b3fd2dbc7a7e3eedd098970a10dec

commit f2c98669fc1b3fd2dbc7a7e3eedd098970a10dec
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2024-12-09 09:49:49 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-01-10 15:02:41 +0000

    lib/libc/aarch64/string: add ASIMD-enhanced timingsafe_bcmp implementation
    
    A straightforward port of the amd64 implementation.
    
    Approved by:    security (cperciva)
    Reviewed by:    getz, cperciva
    Event:          EuroBSDcon 2024
    Differential Revision:  https://reviews.freebsd.org/D46757
---
 lib/libc/aarch64/string/Makefile.inc      |   1 +
 lib/libc/aarch64/string/timingsafe_bcmp.S | 113 ++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index 752cc6d9900b..8019ab4adafc 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -31,6 +31,7 @@ MDSRCS+= \
 	strncat.c \
 	strlcat.c \
 	strlen.S \
+	timingsafe_bcmp.S \
 	bcopy.c \
 	bzero.c
 
diff --git a/lib/libc/aarch64/string/timingsafe_bcmp.S b/lib/libc/aarch64/string/timingsafe_bcmp.S
new file mode 100644
index 000000000000..baa5c6f0940c
--- /dev/null
+++ b/lib/libc/aarch64/string/timingsafe_bcmp.S
@@ -0,0 +1,113 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Robert Clausecker
+ */
+
+#include <machine/asm.h>
+
+ENTRY(timingsafe_bcmp)			// x0, x1 = buffers, x2 = length; w0 = 0 iff equal
+	cmp	x2, #32			// at least 33 bytes to process?
+	bhi	.Lgt32			// note: all branches depend on the length only
+
+	cmp	x2, #16			// at least 17 bytes to process?
+	bhi	.L1732
+
+	cmp	x2, #8			// at least 9 bytes to process?
+	bhi	.L0916
+
+	cmp	x2, #4			// at least 5 bytes to process?
+	bhi	.L0508
+
+	cmp	x2, #2			// at least 3 bytes to process?
+	bhi	.L0304
+
+	cbnz	x2, .L0102		// buffer not empty? then 1-2 bytes to process
+
+	mov	w0, #0			// empty buffer always matches
+	ret
+
+.L0102:	ldrb	w3, [x0]		// 1-2 bytes: load first bytes
+	ldrb	w4, [x1]
+	sub	x2, x2, #1		// x2 = offset of the last byte
+	ldrb	w5, [x0, x2]		// load last bytes (same as first if length is 1)
+	ldrb	w6, [x1, x2]
+	eor	w3, w3, w4		// nonzero iff the first bytes differ
+	eor	w5, w5, w6		// nonzero iff the last bytes differ
+	orr	w0, w3, w5		// result is zero iff both pairs match
+	ret
+
+.L0304:	ldrh	w3, [x0]		// 3-4 bytes: load first halfwords
+	ldrh	w4, [x1]
+	sub	x2, x2, #2		// x2 = offset of the last halfword
+	ldrh	w5, [x0, x2]		// load last halfwords (overlap first if length is 3)
+	ldrh	w6, [x1, x2]
+	eor	w3, w3, w4		// nonzero iff the first halfwords differ
+	eor	w5, w5, w6		// nonzero iff the last halfwords differ
+	orr	w0, w3, w5		// result is zero iff both pairs match
+	ret
+
+.L0508:	ldr	w3, [x0]		// 5-8 bytes: load first words
+	ldr	w4, [x1]
+	sub	x2, x2, #4		// x2 = offset of the last word
+	ldr	w5, [x0, x2]		// load last words (overlap first if length < 8)
+	ldr	w6, [x1, x2]
+	eor	w3, w3, w4		// nonzero iff the first words differ
+	eor	w5, w5, w6		// nonzero iff the last words differ
+	orr	w0, w3, w5		// result is zero iff both pairs match
+	ret
+
+.L0916:	ldr	x3, [x0]		// 9-16 bytes: load first doublewords
+	ldr	x4, [x1]
+	sub	x2, x2, #8		// x2 = offset of the last doubleword
+	ldr	x5, [x0, x2]		// load last doublewords (overlap first if length < 16)
+	ldr	x6, [x1, x2]
+	eor	x3, x3, x4		// nonzero iff the first doublewords differ
+	eor	x5, x5, x6		// nonzero iff the last doublewords differ
+	orr	x0, x3, x5		// 64 bit difference mask
+	orr	x0, x0, x0, lsr #32	// ensure low 32 bits are nonzero iff mismatch
+	ret
+
+.L1732:	ldr	q0, [x0]		// 17-32 bytes: load first 16 bytes
+	ldr	q1, [x1]
+	sub	x2, x2, #16		// x2 = offset of the last 16 bytes
+	ldr	q2, [x0, x2]		// load last 16 bytes (overlap first if length < 32)
+	ldr	q3, [x1, x2]
+	eor	v0.16b, v0.16b, v1.16b	// nonzero iff the first 16 bytes differ
+	eor	v2.16b, v2.16b, v3.16b	// nonzero iff the last 16 bytes differ
+	orr	v0.16b, v0.16b, v2.16b	// 128 bit difference mask
+	umaxv	s0, v0.4s		// get a nonzero word if any
+	mov	w0, v0.s[0]		// return it: zero iff the buffers match
+	ret
+
+	/* more than 32 bytes: process buffer in a loop */
+.Lgt32:	ldp	q0, q1, [x0], #32	// load first 32 bytes, advance both pointers
+	ldp	q2, q3, [x1], #32
+	eor	v0.16b, v0.16b, v2.16b
+	eor	v1.16b, v1.16b, v3.16b
+	orr	v4.16b, v0.16b, v1.16b	// v4 accumulates the difference bits
+	subs	x2, x2, #64		// enough left for another iteration?
+	bls	.Ltail			// no: at most 32 bytes left for the tail
+
+0:	ldp	q0, q1, [x0], #32	// load next 32 bytes, advance both pointers
+	ldp	q2, q3, [x1], #32
+	eor	v0.16b, v0.16b, v2.16b
+	eor	v1.16b, v1.16b, v3.16b
+	orr	v0.16b, v0.16b, v1.16b
+	orr	v4.16b, v4.16b, v0.16b	// fold this chunk into the accumulator
+	subs	x2, x2, #32		// x2 = bytes left after this chunk, minus 32
+	bhi	0b			// loop while more than 32 bytes are left
+
+	/* process last 32 bytes */
+.Ltail:	add	x0, x0, x2		// point to the last 32 bytes in the buffer
+	add	x1, x1, x2		// (x2 is zero or negative here)
+	ldp	q0, q1, [x0]		// may overlap bytes that were already compared
+	ldp	q2, q3, [x1]
+	eor	v0.16b, v0.16b, v2.16b
+	eor	v1.16b, v1.16b, v3.16b
+	orr	v0.16b, v0.16b, v1.16b
+	orr	v4.16b, v4.16b, v0.16b	// final difference mask for the whole buffer
+	umaxv	s0, v4.4s		// get a nonzero word if any
+	mov	w0, v0.s[0]		// return it: zero iff the buffers match
+	ret
+END(timingsafe_bcmp)