git: 9082398090bc - main - lib/libc/amd64/string: fix overread condition in memccpy
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Mon, 29 Jul 2024 19:37:46 UTC
The branch main has been updated by fuz: URL: https://cgit.FreeBSD.org/src/commit/?id=9082398090bcf0ac333397d47e594b105ea3aefd commit 9082398090bcf0ac333397d47e594b105ea3aefd Author: Robert Clausecker <fuz@FreeBSD.org> AuthorDate: 2024-07-20 19:53:04 +0000 Commit: Robert Clausecker <fuz@FreeBSD.org> CommitDate: 2024-07-29 19:36:10 +0000 lib/libc/amd64/string: fix overread condition in memccpy An overread condition in memccpy(dst, src, c, len) would occur if src does not cross a 16 byte boundary and there is no instance of c between *src and the next 16 byte boundary. This could cause a read fault if src is just before the end of a page and the next page is unmapped or unreadable. The bug is a consequence of basing memccpy() on the strlcpy() code: whereas strlcpy() assumes that src is a nul-terminated string and hence a terminator is always present, c may not be present at all in the source string. It was not caught earlier due to insufficient unit test design. As a part of the fix, the function is refactored such that the runt case (buffer length from last alignment boundary between 1 and 32 B) is handled separately. This reduces the number of conditional branches on all code paths and simplifies the handling of early matches in the non-runt case. Performance is improved slightly. os: FreeBSD arch: amd64 cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz │ memccpy.unfixed.out │ memccpy.fixed.out │ │ sec/op │ sec/op vs base │ Short 66.76µ ± 0% 62.45µ ± 1% -6.44% (p=0.000 n=20) Mid 7.938µ ± 0% 7.967µ ± 0% +0.36% (p=0.001 n=20) Long 3.577µ ± 0% 3.577µ ± 0% ~ (p=0.429 n=20) geomean 12.38µ 12.12µ -2.08% │ memccpy.unfixed.out │ memccpy.fixed.out │ │ B/s │ B/s vs base │ Short 1.744Gi ± 0% 1.864Gi ± 1% +6.89% (p=0.000 n=20) Mid 14.67Gi ± 0% 14.61Gi ± 0% -0.36% (p=0.001 n=20) Long 32.55Gi ± 0% 32.55Gi ± 0% ~ (p=0.429 n=20) geomean 9.407Gi 9.606Gi +2.12% Reported by: getz Reviewed by: getz Approved by: mjg (blanket, via IRC) See also: D46051 MFC: stable/14 Event: GSoC 2024 Differential Revision: https://reviews.freebsd.org/D46052 --- lib/libc/amd64/string/memccpy.S | 113 ++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/lib/libc/amd64/string/memccpy.S b/lib/libc/amd64/string/memccpy.S index a2d9e33b3d36..69b650fffc33 100644 --- a/lib/libc/amd64/string/memccpy.S +++ b/lib/libc/amd64/string/memccpy.S @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 The FreeBSD Foundation + * Copyright (c) 2023, 2024 The FreeBSD Foundation * * This software was developed by Robert Clausecker <fuz@FreeBSD.org> * under sponsorship from the FreeBSD Foundation. @@ -83,34 +83,47 @@ ARCHENTRY(__memccpy, baseline) pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc and $~0xf, %rsi movdqa %xmm4, %xmm1 - pcmpeqb (%rsi), %xmm1 # NUL found in head? - mov $-1, %r8d + pcmpeqb (%rsi), %xmm1 # c found in head? and $0xf, %ecx - shl %cl, %r8d # mask of bytes in the string - pmovmskb %xmm1, %eax + mov $-1, %eax + pmovmskb %xmm1, %r8d + lea -32(%rcx), %r11 + shl %cl, %eax # mask of bytes in the string + add %rdx, %r11 # distance from alignment boundary - 32 + jnc .Lrunt # jump if buffer length is 32 or less + and %r8d, %eax - jnz .Lhead_nul + jz 0f # match (or induced match) found? + + /* match in first chunk */ + tzcnt %eax, %edx # where is c? + sub %ecx, %edx # ... from the beginning of the string? + lea 1(%rdi, %rdx, 1), %rax # return value + jmp .L0116 - movdqa 16(%rsi), %xmm3 # load second string chunk +0: movdqa 16(%rsi), %xmm3 # load second string chunk movdqu (%r9), %xmm2 # load unaligned string head - mov $32, %r8d - sub %ecx, %r8d # head length + length of second chunk movdqa %xmm4, %xmm1 - pcmpeqb %xmm3, %xmm1 # NUL found in second chunk? - - sub %r8, %rdx # enough space left for the second chunk? - jb .Lhead_buf_end + pcmpeqb %xmm3, %xmm1 # c found in second chunk? /* process second chunk */ pmovmskb %xmm1, %eax test %eax, %eax - jnz .Lsecond_nul + jz 0f + + /* match in second chunk */ + tzcnt %eax, %edx # where is c? + sub $16, %ecx + sub %ecx, %edx # adjust for alignment offset + lea 1(%rdi, %rdx, 1), %rax # return value + jmp .L0132 - /* string didn't end in second chunk and neither did buffer -- not a runt! */ - movdqa 32(%rsi), %xmm0 # load next string chunk + /* c not found in second chunk: prepare for main loop */ +0: movdqa 32(%rsi), %xmm0 # load next string chunk movdqa %xmm4, %xmm1 movdqu %xmm2, (%rdi) # deposit head into buffer sub %rcx, %rdi # adjust RDI to correspond to RSI + mov %r11, %rdx movdqu %xmm3, 16(%rdi) # deposit second chunk sub %rsi, %rdi # express RDI as distance from RSI add $32, %rsi # advance RSI past first two chunks @@ -119,7 +132,7 @@ ARCHENTRY(__memccpy, baseline) /* main loop unrolled twice */ ALIGN_TEXT -0: pcmpeqb %xmm0, %xmm1 # NUL byte encountered? +0: pcmpeqb %xmm0, %xmm1 # c encountered? pmovmskb %xmm1, %eax test %eax, %eax jnz 3f @@ -131,7 +144,7 @@ ARCHENTRY(__memccpy, baseline) jb 2f add $32, %rsi # advance pointers to next chunk - pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pcmpeqb %xmm0, %xmm1 # c encountered? pmovmskb %xmm1, %eax test %eax, %eax jnz 4f @@ -146,11 +159,10 @@ ARCHENTRY(__memccpy, baseline) add $16, %edx /* 1--16 bytes left in the buffer but string has not ended yet */ -2: pcmpeqb %xmm1, %xmm0 # NUL byte encountered? +2: pcmpeqb %xmm1, %xmm0 # c encountered? pmovmskb %xmm0, %r8d mov %r8d, %ecx bts %edx, %r8d # treat end of buffer as end of string - or $0x10000, %eax # ensure TZCNT finds a set bit tzcnt %r8d, %r8d # find tail length add %rsi, %rdi # restore RDI movdqu 1(%rsi, %r8, 1), %xmm0 # load string tail @@ -162,42 +174,39 @@ ARCHENTRY(__memccpy, baseline) ret 4: sub $16, %rsi # undo second advancement - add $16, %rdx # restore number of remaining bytes - /* string has ended but buffer has not */ + /* terminator found and buffer has not ended yet */ 3: tzcnt %eax, %eax # find length of string tail - movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL) + movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c) add %rsi, %rdi # restore destination pointer - movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL) + movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c) lea 1(%rdi, %rax, 1), %rax # compute return value ret -.Lhead_buf_end: - pmovmskb %xmm1, %r8d - add $32, %edx # restore edx to (len-1) + ecx - shl $16, %r8d # place 2nd chunk NUL mask into bits 16--31 - mov %r8d, %r10d - bts %rdx, %r8 # treat end of buffer as if terminator present - xor %eax, %eax # return value if terminator not found - tzcnt %r8, %rdx # find string/buffer len from alignment boundary + /* buffer is 1--32 bytes in size */ + ALIGN_TEXT +.Lrunt: add $32, %r11d # undo earlier decrement + mov %r8d, %r10d # keep a copy of the original match mask + bts %r11d, %r8d # induce match at buffer end + and %ax, %r8w # is there a match in the first 16 bytes? + jnz 0f # if yes, skip looking at second chunk + + pcmpeqb 16(%rsi), %xmm4 # check for match in second chunk + pmovmskb %xmm4, %r8d + shl $16, %r8d # place second chunk matches in bits 16--31 + mov %r8d, %r10d # keep a copy of the original match mask + bts %r11d, %r8d # induce a match at buffer end + +0: xor %eax, %eax # return value if terminator not found + tzcnt %r8d, %edx # find string/buffer length from alignment boundary lea 1(%rdi, %rdx, 1), %r8 # return value if terminator found + rcx - sub %rcx, %r8 # subtract rcx - bt %rdx, %r10 # was the terminator present? + sub %rcx, %r8 + bt %edx, %r10d # was the terminator present? cmovc %r8, %rax # if yes, return pointer, else NULL - sub %ecx, %edx # find actual string/buffer len - jmp .L0132 + sub %ecx, %edx # find actual string/buffer length -.Lsecond_nul: - add %r8, %rdx # restore buffer length - tzcnt %eax, %r8d # where is the NUL byte? - lea -16(%rcx), %eax - sub %eax, %r8d # string length - lea 1(%rdi, %r8, 1), %rax # return value if NUL before end of buffer - xor %ecx, %ecx # return value if not - cmp %r8, %rdx # is the string shorter than the buffer? - cmova %r8, %rdx # copy only min(buflen, srclen) bytes - cmovb %rcx, %rax # return NUL if buffer ended before string -.L0132: cmp $16, %rdx # at least 17 bytes to copy (not incl NUL)? + ALIGN_TEXT +.L0132: cmp $16, %rdx # at least 17 bytes to copy? jb .L0116 /* copy 17--32 bytes */ @@ -207,16 +216,8 @@ ARCHENTRY(__memccpy, baseline) movdqu %xmm1, -15(%rdi, %rdx, 1) ret -.Lhead_nul: - tzcnt %eax, %r8d # where is the NUL byte? - sub %ecx, %r8d # ... from the beginning of the string? - lea 1(%rdi, %r8, 1), %rax # return value if NUL before end of buffer - xor %ecx, %ecx # return value if not - cmp %r8, %rdx # is the string shorter than the buffer? - cmova %r8, %rdx # copy only min(buflen, srclen) bytes - cmovb %rcx, %rax # return NUL if buffer ended before string - /* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */ + ALIGN_TEXT .L0116: cmp $8, %rdx # at least 9 bytes to copy? jae .L0916