git: bad17991c06d - main - lib/libc/aarch64/string: add memccpy SIMD implementation
Date: Fri, 10 Jan 2025 15:03:56 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=bad17991c06d684e9053938d00a07b962e2fd31c

commit bad17991c06d684e9053938d00a07b962e2fd31c
Author:     Getz Mikalsen <getz@FreeBSD.org>
AuthorDate: 2024-08-26 18:15:13 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-01-10 15:02:40 +0000

    lib/libc/aarch64/string: add memccpy SIMD implementation

    This changeset ports the amd64 SIMD implementation of memccpy to
    aarch64. Performance is significantly better than the scalar
    implementation except for short strings.

    Benchmark results are, as usual, generated by the strperf utility
    written by fuz; see the differential revision for them.

    Tested by:      fuz (exprun)
    Reviewed by:    fuz, emaste
    Sponsored by:   Google LLC (GSoC 2024)
    PR:             281175
    Differential Revision:  https://reviews.freebsd.org/D46170
---
 lib/libc/aarch64/string/Makefile.inc |   3 +-
 lib/libc/aarch64/string/memccpy.S    | 271 +++++++++++++++++++++++++++++++++++
 2 files changed, 273 insertions(+), 1 deletion(-)

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index 351f3424b6d0..78145a17ab85 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -27,7 +27,8 @@ MDSRCS+= \
 	strsep.c \
 	strcat.c \
 	strlcpy.S \
-	strncmp.S
+	strncmp.S \
+	memccpy.S
 
 #
 # Add the above functions. Generate an asm file that includes the needed
diff --git a/lib/libc/aarch64/string/memccpy.S b/lib/libc/aarch64/string/memccpy.S
new file mode 100644
index 000000000000..7d9fdb14b84b
--- /dev/null
+++ b/lib/libc/aarch64/string/memccpy.S
@@ -0,0 +1,271 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+
+	.weak memccpy
+	.set memccpy, __memccpy
+	.text
+
+ENTRY(__memccpy)
+	subs	x3, x3, #1
+	b.lo	.L0
+
+	dup	v0.16b, w2
+
+	mov	x9, x0			// stash copy of src pointer
+	bic	x10, x1, #0xf		// src aligned
+	and	x11, x1, #0xf		// src offset
+
+	ldr	q1, [x10]
+	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against src char
+
+	mov	x8, #-1			// prepare a 0xfff..fff register
+	mov	x6, #0xf
+
+	lsl	x12, x11, #2
+	lsl	x8, x8, x12		// mask of bytes in the string
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	sub	x12, x11, #32
+	adds	x12, x12, x3		// distance from alignment boundary - 32
+	b.cc	.Lrunt			// branch if buffer length is 32 or less
+
+	ands	x8, x8, x5
+	b.eq	0f
+
+	/* match in first chunk */
+	rbit	x8, x8
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	sub	x8, x8, x11		// ... from beginning of the string
+
+	add	x0, x0, x8
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+	add	x0, x0, #1
+
+	b	.L0816
+
+0:
+	ldr	q3, [x10, #16]		// load second string chunk
+	ldr	q2, [x1]		// load true head
+	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?
+
+	/* process second chunk */
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	cbz	x5, 0f
+
+	/* match in second chunk */
+	rbit	x8, x5
+	clz	x8, x8			// index of mismatch
+	lsr	x8, x8, #2
+
+	sub	x11, x11, #16
+	sub	x8, x8, x11		// adjust for alignment offset
+	add	x0, x0, x8		// return value
+	add	x0, x0, #1
+
+	add	x4, x9, x8
+	add	x5, x1, x8
+	b	.L1732
+
+0:
+	/* string didn't end in second chunk and neither did buffer */
+	ldr	q1, [x10, #32]		// load next string chunk
+	str	q2, [x0]		// deposit head into buffer
+	sub	x0, x0, x11		// adjust x0
+	mov	x3, x12
+	str	q3, [x0, #16]		// deposit second chunk
+
+	add	x10, x10, #32		// advance src
+	add	x0, x0, #32		// advance dst
+	subs	x3, x3, #16		// enough left for another round?
+	b.lo	1f
+
+	/* main loop unrolled twice */
+	.p2align 4
+0:
+	cmeq	v2.16b, v1.16b, v0.16b	// char found in second chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+
+	cbnz	x5, 3f
+
+	str	q1, [x0]
+	ldr	q1, [x10, #16]		// load next chunk
+
+	cmp	x3, #16			// more than a full chunk left?
+	b.lo	2f
+
+	add	x10, x10, #32		// advance pointers
+	add	x0, x0, #32
+
+	cmeq	v2.16b, v1.16b, v0.16b	// char found in second chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+	cbnz	x5, 4f			// process chunk if match
+
+	str	q1, [x0, #-16]
+	ldr	q1, [x10]		// load next chunk
+
+	subs	x3, x3, #32
+	b.hs	0b
+
+1:
+	sub	x10, x10, #16		// undo second advancement
+	add	x3, x3, #16
+	sub	x0, x0, #16
+
+	/* 1--16 bytes left in the buffer but string has not ended yet */
+2:
+	cmeq	v2.16b, v1.16b, v0.16b	// char found in second chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x4, d2
+
+	lsl	x5, x3, #2		// shift 0xf to the limits position
+	lsl	x5, x6, x5
+	orr	x8, x4, x5		// insert match in mask at limit
+
+	rbit	x8, x8			// simulate x86 tzcnt
+	clz	x7, x8			// index of mismatch
+	lsr	x8, x7, #2
+
+	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf
+
+	add	x8, x8, #1
+	add	x0, x0, x8
+
+	ldr	q1, [x10, x8]		// load tail
+	str	q1, [x0]		// store tail
+
+	add	x0, x0, #16
+
+	tst	x4, x5			// terminator encountered inside buffer?
+	csel	x0, x0, xzr, ne		// if yes, return pointer, else NUL
+	ret
+
+4:
+	sub	x10, x10, #16		// undo second advancement
+	sub	x0, x0, #16		// undo second advancement
+
+3:
+	rbit	x8, x5
+	clz	x8, x8			// index of mismatch
+	lsr	x3, x8, #2
+
+	add	x0, x0, x3		// restore dst pointer
+	add	x10, x10, x3
+	ldr	q1, [x10, #-15]
+	str	q1, [x0, #-15]
+	add	x0, x0, #1
+	ret
+
+.Lrunt:
+	add	x13, x11, x3
+
+	mov	x7, x5			// keep a copy of original match mask
+
+	lsl	x4, x12, #2		// shift 0xf to the limits position
+	lsl	x4, x6, x4
+
+	cmp	x13, #16		// dont induce match if limit >=16
+	csel	x4, x4, xzr, lo
+	orr	x5, x5, x4		// insert match in mask at limit
+
+	ands	x8, x8, x5		// if match always fall through
+	b.ne	0f
+
+	ldr	q4, [x10, #16]		// load second string chunk
+	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?
+
+	/* process second chunk */
+	shrn	v1.8b, v1.8h, #4
+	fmov	x8, d1
+	mov	x7, x8
+
+	lsl	x4, x12, #2
+	lsl	x4, x6, x4
+	orr	x8, x8, x4		// induce match in upper bytes of mask
+
+	rbit	x8, x8
+	clz	x4, x8			// index of mismatch
+	lsr	x8, x4, #2
+	add	x8, x8, #16		// no match in first chunk
+	b	1f
+
+0:
+	rbit	x8, x8
+	clz	x4, x8			// index of mismatch
+	lsr	x8, x4, #2
+1:
+	add	x0, x0, x8		// return value if terminator not found
+	sub	x0, x0, x11
+	add	x0, x0, #1
+
+	/* check if we encountered a match or the limit first */
+	lsl	x5, x6, x4
+	ands	x7, x7, x5		// was the terminator present?
+	csel	x0, xzr, x0, eq		// return value based on what we matched
+
+	sub	x8, x8, x11
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+
+	/* copy 17-32 bytes */
+.L1732:
+	cmp	x8, #16
+	b.lo	.L0816
+	add	x5, x5, #1		// ldp offsets are powers of 2
+	add	x4, x4, #1
+	ldp	x16, x17, [x1]
+	ldp	x12, x13, [x5, #-16]
+	stp	x16, x17, [x9]
+	stp	x12, x13, [x4, #-16]
+	ret
+
+	/* Copy 8-16 bytes */
+.L0816:
+	tbz	x8, #3, .L0407
+	ldr	x16, [x1]
+	ldr	x17, [x5, #-7]
+	str	x16, [x9]
+	str	x17, [x4, #-7]
+	ret
+
+	/* Copy 4-7 bytes */
+	.p2align 4
+.L0407:
+	cmp	x8, #3
+	b.lo	.L0103
+	ldr	w16, [x1]
+	ldr	w18, [x5, #-3]
+	str	w16, [x9]
+	str	w18, [x4, #-3]
+	ret
+
+	/* Copy 1-3 bytes */
+	.p2align 4
+.L0103:
+	lsr	x14, x8, #1
+	ldrb	w16, [x1]
+	ldrb	w15, [x5]
+	ldrb	w18, [x1, x14]
+	strb	w16, [x9]
+	strb	w18, [x9, x14]
+	strb	w15, [x4]
+	ret
+
+.L0:
+	eor	x0, x0, x0
+	ret
+
+END(__memccpy)
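
The recurring cmeq/shrn/fmov sequence above is the heart of the routine: a 16-byte
comparison result is narrowed into a 64-bit scalar holding one nibble per byte lane,
which a rbit/clz pair (the aarch64 stand-in for x86 tzcnt) can then scan for the
first match. A minimal C sketch of that trick with NEON intrinsics follows; the
helper names match_mask() and first_match() are illustrative and do not appear in
the committed code.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Pack the per-byte compare result into a 64-bit mask: each byte lane of
 * chunk that equals c contributes one 0xf nibble. */
static inline uint64_t
match_mask(uint8x16_t chunk, uint8_t c)
{
	uint8x16_t eq = vceqq_u8(chunk, vdupq_n_u8(c));	/* 0xff per matching byte */
	/* shrn #4 keeps bits 4..11 of every 16-bit lane, collapsing each pair
	 * of 0x00/0xff compare bytes into two nibbles of the narrowed result. */
	uint8x8_t nib = vshrn_n_u16(vreinterpretq_u16_u8(eq), 4);
	return (vget_lane_u64(vreinterpret_u64_u8(nib), 0));
}

/* Index of the first matching byte lane, or 16 if none matched.
 * __builtin_ctzll compiles down to the same rbit/clz pair the assembly uses. */
static inline size_t
first_match(uint64_t mask)
{
	return (mask == 0 ? 16 : (size_t)__builtin_ctzll(mask) / 4);
}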
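
For reference, the contract the new routine has to honour is the standard memccpy()
one: copy bytes from src to dst, stop after the first copy of c (taken as unsigned
char) or after len bytes, and return a pointer just past the copied c, or NULL when
c was not found. A portable sketch of those semantics (not the committed scalar
fallback, just an illustration) is:

#include <stddef.h>

void *
memccpy_ref(void *restrict dst, const void *restrict src, int c, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	while (len-- > 0) {
		if ((*d++ = *s++) == (unsigned char)c)
			return (d);	/* one past the copied terminator */
	}
	return (NULL);			/* terminator not found within len bytes */
}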