git: 8c6e5d8cf1e6 - main - Import an optimized str{n}cmp on arm64
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 08 Sep 2022 13:32:24 UTC
The branch main has been updated by andrew: URL: https://cgit.FreeBSD.org/src/commit/?id=8c6e5d8cf1e67c6744be2f5bfe42eeda0479744c commit 8c6e5d8cf1e67c6744be2f5bfe42eeda0479744c Author: Andrew Turner <andrew@FreeBSD.org> AuthorDate: 2022-09-07 10:40:26 +0000 Commit: Andrew Turner <andrew@FreeBSD.org> CommitDate: 2022-09-08 13:23:20 +0000 Import an optimized str{n}cmp on arm64 These are from the Arm Optimized Routines and don't use the VFP so are safe to use in the kernel. Sponsored by: The FreeBSD Foundation --- sys/arm64/arm64/strcmp.S | 189 ++++++++++++++++++++++++++++ sys/arm64/arm64/strncmp.S | 307 ++++++++++++++++++++++++++++++++++++++++++++++ sys/conf/files | 2 - sys/conf/files.arm | 2 + sys/conf/files.arm64 | 2 + sys/conf/files.powerpc | 2 + sys/conf/files.riscv | 2 + sys/conf/files.x86 | 2 + 8 files changed, 506 insertions(+), 2 deletions(-) diff --git a/sys/arm64/arm64/strcmp.S b/sys/arm64/arm64/strcmp.S new file mode 100644 index 000000000000..0d66aae07d9e --- /dev/null +++ b/sys/arm64/arm64/strcmp.S @@ -0,0 +1,189 @@ +/* + * strcmp - compare two strings + * + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT + */ + + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ + +#include <machine/asm.h> + +#define L(l) .L ## l + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define src1 x0 +#define src2 x1 +#define result x0 + +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define off1 x5 +#define syndrome x6 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +ENTRY (strcmp) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 + b.ne L(misaligned8) + cbnz tmp, L(mutual_align) + + .p2align 4 + +L(loop_aligned): + ldr data2, [src1, off2] + ldr data1, [src1], 8 +L(start_realigned): +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + rev data2, data2 +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, shift + lsl data2, data2, shift + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 + ret + + .p2align 4 + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) +L(do_misaligned): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.ne L(done) + tst src1, 7 + b.ne L(do_misaligned) + +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul + b L(end) + +L(done): + sub result, data1, data2 + ret + +END (strcmp) + diff --git a/sys/arm64/arm64/strncmp.S b/sys/arm64/arm64/strncmp.S new file mode 100644 index 000000000000..595de0312678 --- /dev/null +++ b/sys/arm64/arm64/strncmp.S @@ -0,0 +1,307 @@ +/* + * strncmp - compare two strings + * + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ + +#include <machine/asm.h> + +#define L(l) .L ## l + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define mask x13 +#define endloop x14 +#define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif + +ENTRY (strncmp) + cbz limit, L(ret0) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + .p2align 4 +L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned): + subs limit, limit, #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq L(loop_aligned) + /* End of main loop */ + +L(full_check): +#ifndef __AARCH64EB__ + orr syndrome, diff, has_nul + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ + rev syndrome, syndrome + rev data1, data1 + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + cmp limit, pos, lsr #3 + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + csel result, result, xzr, hi + ret +#else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ +L(end_quick): + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): + /* Perhaps we can do better than this. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): + sub result, data1, data2 + ret + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): + cbz count, L(src1_aligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset + ldr data1, [src1], #8 + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) + +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones + eor diff, data1, data2 /* Non-zero if differences found. */ + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif + +L(ret0): + mov result, #0 + ret +END(strncmp) + diff --git a/sys/conf/files b/sys/conf/files index a0e969b59e5f..afe2575c8548 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4060,7 +4060,6 @@ libkern/strcasestr.c standard libkern/strcat.c standard libkern/strchr.c standard libkern/strchrnul.c optional gdb -libkern/strcmp.c standard libkern/strcpy.c standard libkern/strcspn.c standard libkern/strdup.c standard @@ -4068,7 +4067,6 @@ libkern/strndup.c standard libkern/strlcat.c standard libkern/strlcpy.c standard libkern/strncat.c standard -libkern/strncmp.c standard libkern/strncpy.c standard libkern/strnlen.c standard libkern/strnstr.c standard diff --git a/sys/conf/files.arm b/sys/conf/files.arm index eed9933129ec..85afa4893d3c 100644 --- a/sys/conf/files.arm +++ b/sys/conf/files.arm @@ -127,7 +127,9 @@ libkern/flsll.c optional !armv7 !armv6 libkern/lshrdi3.c standard libkern/moddi3.c standard libkern/qdivrem.c standard +libkern/strcmp.c standard libkern/strlen.c standard +libkern/strncmp.c standard libkern/ucmpdi2.c standard libkern/udivdi3.c standard libkern/umoddi3.c standard diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64 index 7d237859b8d6..a647d4e32230 100644 --- a/sys/conf/files.arm64 +++ b/sys/conf/files.arm64 @@ -71,6 +71,8 @@ arm64/arm64/pmap.c standard arm64/arm64/ptrace_machdep.c standard arm64/arm64/sigtramp.S standard arm64/arm64/stack_machdep.c optional ddb | stack +arm64/arm64/strcmp.S standard +arm64/arm64/strncmp.S standard arm64/arm64/support_ifunc.c standard arm64/arm64/support.S standard arm64/arm64/swtch.S standard diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc index 17c6bf278b40..2277468c3c9e 100644 --- a/sys/conf/files.powerpc +++ b/sys/conf/files.powerpc @@ -188,7 +188,9 @@ libkern/memcmp.c standard libkern/memset.c standard libkern/moddi3.c optional powerpc | powerpcspe libkern/qdivrem.c optional powerpc | powerpcspe +libkern/strcmp.c standard libkern/strlen.c standard +libkern/strncmp.c standard libkern/ucmpdi2.c optional powerpc | powerpcspe libkern/udivdi3.c optional powerpc | powerpcspe libkern/umoddi3.c optional powerpc | powerpcspe diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv index f7272a6f5785..774dabe0cd61 100644 --- a/sys/conf/files.riscv +++ b/sys/conf/files.riscv @@ -30,7 +30,9 @@ libkern/flsl.c standard libkern/flsll.c standard libkern/memcmp.c standard libkern/memset.c standard +libkern/strcmp.c standard libkern/strlen.c standard +libkern/strncmp.c standard riscv/riscv/autoconf.c standard riscv/riscv/bus_machdep.c standard riscv/riscv/bus_space_asm.S standard diff --git a/sys/conf/files.x86 b/sys/conf/files.x86 index 8478afab972f..e8f65628c5c1 100644 --- a/sys/conf/files.x86 +++ b/sys/conf/files.x86 @@ -292,6 +292,8 @@ dev/qat_c2xxx/qat.c optional qat_c2xxx dev/qat_c2xxx/qat_ae.c optional qat_c2xxx dev/qat_c2xxx/qat_c2xxx.c optional qat_c2xxx dev/qat_c2xxx/qat_hw15.c optional qat_c2xxx +libkern/strcmp.c standard +libkern/strncmp.c standard libkern/x86/crc32_sse42.c standard # # x86 shared code between IA32 and AMD64 architectures