git: 9082398090bc - main - lib/libc/amd64/string: fix overread condition in memccpy

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Mon, 29 Jul 2024 19:37:46 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=9082398090bcf0ac333397d47e594b105ea3aefd

commit 9082398090bcf0ac333397d47e594b105ea3aefd
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2024-07-20 19:53:04 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2024-07-29 19:36:10 +0000

    lib/libc/amd64/string: fix overread condition in memccpy
    
    An overread condition in memccpy(dst, src, c, len) would occur if
    src does not cross a 16 byte boundary and there is no instance of
    c between *src and the next 16 byte boundary.  This could cause a
    read fault if src is just before the end of a page and the next page
    is unmapped or unreadable.
    
    The bug is a consequence of basing memccpy() on the strlcpy() code:
    whereas strlcpy() assumes that src is a nul-terminated string and
    hence a terminator is always present, c may not be present at all in
    the source string.  It was not caught earlier due to insufficient
    unit test design.
    
    As a part of the fix, the function is refactored such that the runt
    case (buffer length from last alignment boundary between 1 and 32 B)
    is handled separately.  This reduces the number of conditional
    branches on all code paths and simplifies the handling of early
    matches in the non-runt case.  Performance is improved slightly.
    
    os: FreeBSD
    arch: amd64
    cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
            │ memccpy.unfixed.out │        memccpy.fixed.out           │
            │       sec/op        │   sec/op     vs base               │
    Short             66.76µ ± 0%   62.45µ ± 1%  -6.44% (p=0.000 n=20)
    Mid               7.938µ ± 0%   7.967µ ± 0%  +0.36% (p=0.001 n=20)
    Long              3.577µ ± 0%   3.577µ ± 0%       ~ (p=0.429 n=20)
    geomean           12.38µ        12.12µ       -2.08%
    
            │ memccpy.unfixed.out │         memccpy.fixed.out           │
            │         B/s         │     B/s       vs base               │
    Short            1.744Gi ± 0%   1.864Gi ± 1%  +6.89% (p=0.000 n=20)
    Mid              14.67Gi ± 0%   14.61Gi ± 0%  -0.36% (p=0.001 n=20)
    Long             32.55Gi ± 0%   32.55Gi ± 0%       ~ (p=0.429 n=20)
    geomean          9.407Gi        9.606Gi       +2.12%
    
    Reported by:    getz
    Reviewed by:    getz
    Approved by:    mjg (blanket, via IRC)
    See also:       D46051
    MFC:            stable/14
    Event:          GSoC 2024
    Differential Revision:  https://reviews.freebsd.org/D46052
---
 lib/libc/amd64/string/memccpy.S | 113 ++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/lib/libc/amd64/string/memccpy.S b/lib/libc/amd64/string/memccpy.S
index a2d9e33b3d36..69b650fffc33 100644
--- a/lib/libc/amd64/string/memccpy.S
+++ b/lib/libc/amd64/string/memccpy.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 The FreeBSD Foundation
+ * Copyright (c) 2023, 2024 The FreeBSD Foundation
  *
  * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
  * under sponsorship from the FreeBSD Foundation.
@@ -83,34 +83,47 @@ ARCHENTRY(__memccpy, baseline)
 	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
 	and		$~0xf, %rsi
 	movdqa		%xmm4, %xmm1
-	pcmpeqb		(%rsi), %xmm1		# NUL found in head?
-	mov		$-1, %r8d
+	pcmpeqb		(%rsi), %xmm1		# c found in head?
 	and		$0xf, %ecx
-	shl		%cl, %r8d		# mask of bytes in the string
-	pmovmskb	%xmm1, %eax
+	mov		$-1, %eax
+	pmovmskb	%xmm1, %r8d
+	lea		-32(%rcx), %r11
+	shl		%cl, %eax		# mask of bytes in the string
+	add		%rdx, %r11		# distance from alignment boundary - 32
+	jnc		.Lrunt			# jump if buffer length is 32 or less
+
 	and		%r8d, %eax
-	jnz		.Lhead_nul
+	jz		0f			# match (or induced match) found?
+
+	/* match in first chunk */
+	tzcnt		%eax, %edx		# where is c?
+	sub		%ecx, %edx		# ... from the beginning of the string?
+	lea		1(%rdi, %rdx, 1), %rax	# return value
+	jmp		.L0116
 
-	movdqa		16(%rsi), %xmm3		# load second string chunk
+0:	movdqa		16(%rsi), %xmm3		# load second string chunk
 	movdqu		(%r9), %xmm2		# load unaligned string head
-	mov		$32, %r8d
-	sub		%ecx, %r8d		# head length + length of second chunk
 	movdqa		%xmm4, %xmm1
-	pcmpeqb		%xmm3, %xmm1		# NUL found in second chunk?
-
-	sub		%r8, %rdx		# enough space left for the second chunk?
-	jb		.Lhead_buf_end
+	pcmpeqb		%xmm3, %xmm1		# c found in second chunk?
 
 	/* process second chunk */
 	pmovmskb	%xmm1, %eax
 	test		%eax, %eax
-	jnz		.Lsecond_nul
+	jz		0f
+
+	/* match in second chunk */
+	tzcnt		%eax, %edx		# where is c?
+	sub		$16, %ecx
+	sub		%ecx, %edx		# adjust for alignment offset
+	lea		1(%rdi, %rdx, 1), %rax	# return value
+	jmp		.L0132
 
-	/* string didn't end in second chunk and neither did buffer -- not a runt! */
-	movdqa		32(%rsi), %xmm0		# load next string chunk
+	/* c not found in second chunk: prepare for main loop */
+0:	movdqa		32(%rsi), %xmm0		# load next string chunk
 	movdqa		%xmm4, %xmm1
 	movdqu		%xmm2, (%rdi)		# deposit head into buffer
 	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
+	mov		%r11, %rdx
 	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
 	sub		%rsi, %rdi		# express RDI as distance from RSI
 	add		$32, %rsi		# advance RSI past first two chunks
@@ -119,7 +132,7 @@ ARCHENTRY(__memccpy, baseline)
 
 	/* main loop unrolled twice */
 	ALIGN_TEXT
-0:	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
+0:	pcmpeqb		%xmm0, %xmm1		# c encountered?
 	pmovmskb	%xmm1, %eax
 	test		%eax, %eax
 	jnz		3f
@@ -131,7 +144,7 @@ ARCHENTRY(__memccpy, baseline)
 	jb		2f
 
 	add		$32, %rsi		# advance pointers to next chunk
-	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
+	pcmpeqb		%xmm0, %xmm1		# c encountered?
 	pmovmskb	%xmm1, %eax
 	test		%eax, %eax
 	jnz		4f
@@ -146,11 +159,10 @@ ARCHENTRY(__memccpy, baseline)
 	add		$16, %edx
 
 	/* 1--16 bytes left in the buffer but string has not ended yet */
-2:	pcmpeqb		%xmm1, %xmm0		# NUL byte encountered?
+2:	pcmpeqb		%xmm1, %xmm0		# c encountered?
 	pmovmskb	%xmm0, %r8d
 	mov		%r8d, %ecx
 	bts		%edx, %r8d		# treat end of buffer as end of string
-	or		$0x10000, %eax		# ensure TZCNT finds a set bit
 	tzcnt		%r8d, %r8d		# find tail length
 	add		%rsi, %rdi		# restore RDI
 	movdqu		1(%rsi, %r8, 1), %xmm0	# load string tail
@@ -162,42 +174,39 @@ ARCHENTRY(__memccpy, baseline)
 	ret
 
 4:	sub		$16, %rsi		# undo second advancement
-	add		$16, %rdx		# restore number of remaining bytes
 
-	/* string has ended but buffer has not */
+	/* terminator found and buffer has not ended yet */
 3:	tzcnt		%eax, %eax		# find length of string tail
-	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
+	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c)
 	add		%rsi, %rdi		# restore destination pointer
-	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
+	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c)
 	lea		1(%rdi, %rax, 1), %rax	# compute return value
 	ret
 
-.Lhead_buf_end:
-	pmovmskb	%xmm1, %r8d
-	add		$32, %edx		# restore edx to (len-1) + ecx
-	shl		$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
-	mov		%r8d, %r10d
-	bts		%rdx, %r8		# treat end of buffer as if terminator present
-	xor		%eax, %eax		# return value if terminator not found
-	tzcnt		%r8, %rdx		# find string/buffer len from alignment boundary
+	/* buffer is 1--32 bytes in size */
+	ALIGN_TEXT
+.Lrunt:	add		$32, %r11d		# undo earlier decrement
+	mov		%r8d, %r10d		# keep a copy of the original match mask
+	bts		%r11d, %r8d		# induce match at buffer end
+	and		%ax, %r8w		# is there a match in the first 16 bytes?
+	jnz		0f			# if yes, skip looking at second chunk
+
+	pcmpeqb		16(%rsi), %xmm4		# check for match in second chunk
+	pmovmskb	%xmm4, %r8d
+	shl		$16, %r8d		# place second chunk matches in bits 16--31
+	mov		%r8d, %r10d		# keep a copy of the original match mask
+	bts		%r11d, %r8d		# induce a match at buffer end
+
+0:	xor		%eax, %eax		# return value if terminator not found
+	tzcnt		%r8d, %edx		# find string/buffer length from alignment boundary
 	lea		1(%rdi, %rdx, 1), %r8	# return value if terminator found + rcx
-	sub		%rcx, %r8		# subtract rcx
-	bt		%rdx, %r10		# was the terminator present?
+	sub		%rcx, %r8
+	bt		%edx, %r10d		# was the terminator present?
 	cmovc		%r8, %rax		# if yes, return pointer, else NULL
-	sub		%ecx, %edx		# find actual string/buffer len
-	jmp		.L0132
+	sub		%ecx, %edx		# find actual string/buffer length
 
-.Lsecond_nul:
-	add		%r8, %rdx		# restore buffer length
-	tzcnt		%eax, %r8d		# where is the NUL byte?
-	lea		-16(%rcx), %eax
-	sub		%eax, %r8d		# string length
-	lea		1(%rdi, %r8, 1), %rax	# return value if NUL before end of buffer
-	xor		%ecx, %ecx		# return value if not
-	cmp		%r8, %rdx		# is the string shorter than the buffer?
-	cmova		%r8, %rdx		# copy only min(buflen, srclen) bytes
-	cmovb		%rcx, %rax		# return NUL if buffer ended before string
-.L0132:	cmp		$16, %rdx		# at least 17 bytes to copy (not incl NUL)?
+	ALIGN_TEXT
+.L0132:	cmp		$16, %rdx		# at least 17 bytes to copy?
 	jb		.L0116
 
 	/* copy 17--32 bytes */
@@ -207,16 +216,8 @@ ARCHENTRY(__memccpy, baseline)
 	movdqu		%xmm1, -15(%rdi, %rdx, 1)
 	ret
 
-.Lhead_nul:
-	tzcnt		%eax, %r8d		# where is the NUL byte?
-	sub		%ecx, %r8d		# ... from the beginning of the string?
-	lea		1(%rdi, %r8, 1), %rax	# return value if NUL before end of buffer
-	xor		%ecx, %ecx		# return value if not
-	cmp		%r8, %rdx		# is the string shorter than the buffer?
-	cmova		%r8, %rdx		# copy only min(buflen, srclen) bytes
-	cmovb		%rcx, %rax		# return NUL if buffer ended before string
-
 	/* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */
+	ALIGN_TEXT
 .L0116:	cmp		$8, %rdx		# at least 9 bytes to copy?
 	jae		.L0916