git: 42f0ac5f1bd2 - main - Fix BLAKE3 aarch64 assembly for FreeBSD and macOS
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 27 Apr 2023 00:47:20 UTC
The branch main has been updated by kevans: URL: https://cgit.FreeBSD.org/src/commit/?id=42f0ac5f1bd2bb5c779ce51c369a0e47c62cbf9b commit 42f0ac5f1bd2bb5c779ce51c369a0e47c62cbf9b Author: Tino Reichardt <milky-zfs@mcmilk.de> AuthorDate: 2023-04-26 19:40:26 +0000 Commit: Kyle Evans <kevans@FreeBSD.org> CommitDate: 2023-04-27 00:46:47 +0000 Fix BLAKE3 aarch64 assembly for FreeBSD and macOS The x18 register isn't useable within FreeBSD kernel space, so we have to fix the BLAKE3 aarch64 assembly for not using it. The source files are here: https://github.com/mcmilk/BLAKE3-tests Reviewed-by: Kyle Evans <kevans@FreeBSD.org> Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de> Closes #14728 --- .../icp/asm-aarch64/blake3/b3_aarch64_sse2.S | 4163 +++++++++--------- .../icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 4447 ++++++++++---------- 2 files changed, 4078 insertions(+), 4532 deletions(-) diff --git a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index 8237f0eb5a4e..dc2719d142db 100644 --- a/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/sys/contrib/openzfs/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -22,480 +22,61 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale - * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de> + * Copyright (c) 2022-2023 Tino Reichardt <milky-zfs@mcmilk.de> * * This is converted assembly: SSE2 -> ARMv8-A * Used tools: SIMDe https://github.com/simd-everywhere/simde + * + * Should work on FreeBSD, Linux and macOS + * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh */ #if defined(__aarch64__) .text - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 -.LCPI0_0: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI0_1: - .xword 0 - .xword -4294967296 -.LCPI0_2: - .xword -1 - .xword 4294967295 + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 +.Lsec_end0: .text .globl zfs_blake3_compress_in_place_sse2 .p2align 2 .type zfs_blake3_compress_in_place_sse2,@function zfs_blake3_compress_in_place_sse2: .cfi_startproc - ldp q3, q2, [x0] - ldp q5, q6, [x1] - add x10, x1, #32 - lsr x11, x3, #32 - fmov s4, w3 - ld2 { v17.4s, v18.4s }, [x10] - adrp x10, .LCPI0_2 - and w8, w2, #0xff - mov v4.s[1], w11 - ldr q1, [x10, :lo12:.LCPI0_2] - and w9, w4, #0xff - adrp x12, .LCPI0_0 - mov v4.s[2], w8 - uzp1 v19.4s, v5.4s, v6.4s - add v3.4s, v2.4s, v3.4s - ldr q7, [x12, :lo12:.LCPI0_0] - mov v4.s[3], w9 - add v3.4s, v3.4s, v19.4s - uzp2 v5.4s, v5.4s, v6.4s - ext v21.16b, v18.16b, v18.16b, #12 - uzp1 v6.4s, v19.4s, v19.4s - ext v22.16b, v19.16b, v19.16b, #12 - eor v4.16b, v3.16b, v4.16b - ext v20.16b, v17.16b, v17.16b, #12 - ext v6.16b, v6.16b, v19.16b, #8 - ext v19.16b, v19.16b, v22.16b, #12 - zip1 v22.2d, v21.2d, v5.2d - rev32 v24.8h, v4.8h - mov v4.16b, v1.16b - zip2 v23.4s, v5.4s, v21.4s - uzp2 v6.4s, v6.4s, v5.4s - bsl v4.16b, v22.16b, v20.16b - add v3.4s, v3.4s, v5.4s - zip1 v5.4s, v23.4s, v20.4s - zip1 v22.4s, v20.4s, v23.4s - add v23.4s, v24.4s, v7.4s - ext v7.16b, v6.16b, v6.16b, #4 - ext v25.16b, v4.16b, v4.16b, #12 - ext v5.16b, v22.16b, v5.16b, #8 - eor v2.16b, v23.16b, v2.16b - uzp1 v4.4s, v4.4s, v25.4s - uzp1 v22.4s, v7.4s, v7.4s - ext v25.16b, v7.16b, v7.16b, #12 - ext v22.16b, v22.16b, v7.16b, #8 - ext v7.16b, v7.16b, v25.16b, #12 - ushr v25.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - orr v2.16b, v2.16b, v25.16b - add v3.4s, v3.4s, v2.4s - eor v24.16b, v3.16b, v24.16b - add v3.4s, v3.4s, v17.4s - ushr v17.4s, v24.4s, #8 - shl v18.4s, v24.4s, #24 - orr v17.16b, v18.16b, v17.16b - add v18.4s, v17.4s, v23.4s - eor v2.16b, v18.16b, v2.16b - ushr v23.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ext v3.16b, v3.16b, v3.16b, #12 - orr v2.16b, v2.16b, v23.16b - ext v17.16b, v17.16b, v17.16b, #8 - add v3.4s, v2.4s, v3.4s - adrp x11, .LCPI0_1 - eor v17.16b, v3.16b, v17.16b - ldr q16, [x11, :lo12:.LCPI0_1] - ext v18.16b, v18.16b, v18.16b, #4 - rev32 v24.8h, v17.8h - movi v0.2d, #0xffffffff00000000 - add v23.4s, v3.4s, v21.4s - mov v21.s[1], v20.s[2] - add v20.4s, v18.4s, v24.4s - bit v19.16b, v21.16b, v0.16b - eor v3.16b, v20.16b, v2.16b - uzp2 v2.4s, v22.4s, v19.4s - zip1 v17.2d, v5.2d, v19.2d - zip2 v18.4s, v19.4s, v5.4s - ushr v21.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - ext v22.16b, v2.16b, v2.16b, #4 - bsl v16.16b, v4.16b, v17.16b - zip1 v17.4s, v18.4s, v4.4s - zip1 v18.4s, v4.4s, v18.4s - orr v21.16b, v3.16b, v21.16b - ext v25.16b, v16.16b, v16.16b, #12 - ext v3.16b, v18.16b, v17.16b, #8 - uzp1 v18.4s, v22.4s, v22.4s - ext v26.16b, v22.16b, v22.16b, #12 - add v23.4s, v23.4s, v21.4s - uzp1 v17.4s, v16.4s, v25.4s - ext v16.16b, v18.16b, v22.16b, #8 - ext v18.16b, v22.16b, v26.16b, #12 - eor v22.16b, v23.16b, v24.16b - add v6.4s, v23.4s, v6.4s - ushr v23.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v23.16b - add v20.4s, v22.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ushr v23.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v21.16b, v21.16b, v23.16b - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v21.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v20.16b, v20.16b, v20.16b, #12 - add v6.4s, v6.4s, v19.4s - rev32 v19.8h, v22.8h - add v20.4s, v20.4s, v19.4s - eor v21.16b, v20.16b, v21.16b - ushr v22.4s, v21.4s, #12 - shl v21.4s, v21.4s, #20 - orr v21.16b, v21.16b, v22.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ushr v22.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v22.16b - add v20.4s, v19.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #12 - ushr v22.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - add v6.4s, v6.4s, v4.4s - orr v21.16b, v21.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ext v20.16b, v20.16b, v20.16b, #4 - rev32 v19.8h, v19.8h - add v20.4s, v20.4s, v19.4s - add v6.4s, v6.4s, v5.4s - mov v5.s[1], v4.s[2] - eor v4.16b, v20.16b, v21.16b - ushr v21.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v21.16b, v4.16b, v21.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - add v2.4s, v6.4s, v2.4s - ushr v6.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v6.16b, v19.16b, v6.16b - add v19.4s, v6.4s, v20.4s - eor v20.16b, v19.16b, v21.16b - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v20.4s, v2.4s - eor v6.16b, v2.16b, v6.16b - ext v19.16b, v19.16b, v19.16b, #12 - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v6.4s - mov v22.16b, v0.16b - eor v20.16b, v19.16b, v20.16b - bsl v22.16b, v5.16b, v7.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - add v2.4s, v2.4s, v22.4s - orr v20.16b, v20.16b, v21.16b - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - ushr v21.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v6.16b, v6.16b, v21.16b - add v19.4s, v6.4s, v19.4s - eor v20.16b, v19.16b, v20.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v2.4s, v2.4s, v17.4s - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - uzp2 v5.4s, v16.4s, v22.4s - zip1 v7.2d, v3.2d, v22.2d - zip2 v16.4s, v22.4s, v3.4s - ext v19.16b, v19.16b, v19.16b, #4 - rev32 v22.8h, v6.8h - ext v23.16b, v5.16b, v5.16b, #4 - bif v7.16b, v17.16b, v1.16b - zip1 v24.4s, v16.4s, v17.4s - zip1 v16.4s, v17.4s, v16.4s - add v21.4s, v2.4s, v3.4s - mov v3.s[1], v17.s[2] - add v17.4s, v19.4s, v22.4s - mov v19.16b, v0.16b - ext v25.16b, v7.16b, v7.16b, #12 - ext v4.16b, v16.16b, v24.16b, #8 - uzp1 v16.4s, v23.4s, v23.4s - bsl v19.16b, v3.16b, v18.16b - eor v2.16b, v17.16b, v20.16b - uzp1 v7.4s, v7.4s, v25.4s - ext v25.16b, v16.16b, v23.16b, #8 - zip1 v3.2d, v4.2d, v19.2d - ushr v20.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ext v24.16b, v23.16b, v23.16b, #12 - uzp2 v6.4s, v25.4s, v19.4s - zip2 v18.4s, v19.4s, v4.4s - bif v3.16b, v7.16b, v1.16b - orr v20.16b, v2.16b, v20.16b - ext v16.16b, v23.16b, v24.16b, #12 - ext v23.16b, v6.16b, v6.16b, #4 - zip1 v24.4s, v18.4s, v7.4s - zip1 v18.4s, v7.4s, v18.4s - ext v25.16b, v3.16b, v3.16b, #12 - add v21.4s, v21.4s, v20.4s - ext v2.16b, v18.16b, v24.16b, #8 - uzp1 v18.4s, v23.4s, v23.4s - ext v24.16b, v23.16b, v23.16b, #12 - uzp1 v3.4s, v3.4s, v25.4s - eor v22.16b, v21.16b, v22.16b - ext v25.16b, v18.16b, v23.16b, #8 - dup v18.4s, v2.s[3] - ext v23.16b, v23.16b, v24.16b, #12 - add v5.4s, v21.4s, v5.4s - trn1 v21.4s, v3.4s, v3.4s - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - ext v18.16b, v21.16b, v18.16b, #8 - orr v21.16b, v22.16b, v24.16b - add v17.4s, v21.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ushr v22.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v20.16b, v20.16b, v22.16b - ext v21.16b, v21.16b, v21.16b, #8 - add v5.4s, v20.4s, v5.4s - eor v21.16b, v5.16b, v21.16b - ext v17.16b, v17.16b, v17.16b, #12 - add v5.4s, v5.4s, v19.4s - rev32 v19.8h, v21.8h - add v17.4s, v17.4s, v19.4s - eor v20.16b, v17.16b, v20.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v21.16b - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ushr v21.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v21.16b - add v17.4s, v19.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ext v5.16b, v5.16b, v5.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v5.4s, v5.4s, v7.4s - orr v20.16b, v20.16b, v21.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ext v17.16b, v17.16b, v17.16b, #4 - rev32 v22.8h, v19.8h - add v21.4s, v5.4s, v4.4s - mov v4.s[1], v7.s[2] - add v19.4s, v17.4s, v22.4s - bit v16.16b, v4.16b, v0.16b - eor v5.16b, v19.16b, v20.16b - uzp2 v4.4s, v25.4s, v16.4s - zip1 v7.2d, v2.2d, v16.2d - zip2 v17.4s, v16.4s, v2.4s - ushr v20.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - ext v24.16b, v4.16b, v4.16b, #4 - bif v7.16b, v3.16b, v1.16b - zip1 v25.4s, v17.4s, v3.4s - zip1 v17.4s, v3.4s, v17.4s - orr v20.16b, v5.16b, v20.16b - ext v26.16b, v7.16b, v7.16b, #12 - ext v5.16b, v17.16b, v25.16b, #8 - uzp1 v17.4s, v24.4s, v24.4s - ext v25.16b, v24.16b, v24.16b, #12 - bit v23.16b, v18.16b, v0.16b - add v21.4s, v21.4s, v20.4s - uzp1 v7.4s, v7.4s, v26.4s - ext v26.16b, v17.16b, v24.16b, #8 - ext v17.16b, v24.16b, v25.16b, #12 - eor v22.16b, v21.16b, v22.16b - add v6.4s, v21.4s, v6.4s - zip1 v21.2d, v5.2d, v23.2d - zip2 v24.4s, v23.4s, v5.4s - bif v21.16b, v7.16b, v1.16b - zip1 v1.4s, v24.4s, v7.4s - zip1 v24.4s, v7.4s, v24.4s - ext v1.16b, v24.16b, v1.16b, #8 - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v24.16b - add v19.4s, v22.4s, v19.4s - ext v24.16b, v21.16b, v21.16b, #12 - eor v20.16b, v19.16b, v20.16b - uzp1 v21.4s, v21.4s, v24.4s - ushr v24.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - orr v20.16b, v20.16b, v24.16b - ext v6.16b, v6.16b, v6.16b, #4 - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v20.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #12 - add v6.4s, v6.4s, v16.4s - rev32 v16.8h, v22.8h - add v19.4s, v19.4s, v16.4s - eor v20.16b, v19.16b, v20.16b - ushr v22.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v22.16b - add v6.4s, v6.4s, v20.4s - eor v16.16b, v6.16b, v16.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v6.4s, v3.4s - ushr v6.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - orr v6.16b, v16.16b, v6.16b - add v16.4s, v6.4s, v19.4s - eor v19.16b, v16.16b, v20.16b - ushr v20.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v20.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v3.4s, v3.4s, v19.4s - eor v6.16b, v3.16b, v6.16b - ext v16.16b, v16.16b, v16.16b, #4 - add v2.4s, v3.4s, v2.4s - rev32 v3.8h, v6.8h - add v6.4s, v16.4s, v3.4s - eor v16.16b, v6.16b, v19.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - orr v16.16b, v16.16b, v19.16b - add v2.4s, v2.4s, v16.4s - eor v3.16b, v2.16b, v3.16b - add v2.4s, v2.4s, v4.4s - ushr v4.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v4.16b - add v4.4s, v3.4s, v6.4s - eor v6.16b, v4.16b, v16.16b - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v6.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 - rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v6.16b, v4.16b, v6.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - add v2.4s, v2.4s, v23.4s - orr v6.16b, v6.16b, v16.16b - add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b - ushr v16.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v16.16b - add v4.4s, v3.4s, v4.4s - eor v6.16b, v4.16b, v6.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - add v2.4s, v2.4s, v7.4s - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #4 - rev32 v3.8h, v3.8h - add v2.4s, v2.4s, v5.4s - mov v5.s[1], v7.s[2] - add v4.4s, v4.4s, v3.4s - bsl v0.16b, v5.16b, v17.16b - eor v5.16b, v4.16b, v6.16b - ushr v6.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v6.16b - add v2.4s, v2.4s, v5.4s - eor v3.16b, v2.16b, v3.16b - ushr v6.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v6.16b - add v4.4s, v3.4s, v4.4s - uzp2 v18.4s, v26.4s, v18.4s - eor v5.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v18.4s - ushr v6.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v5.16b, v5.16b, v6.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v5.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 - add v0.4s, v2.4s, v0.4s - rev32 v2.8h, v3.8h - add v3.4s, v4.4s, v2.4s - eor v4.16b, v3.16b, v5.16b - ushr v5.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v4.16b, v4.16b, v5.16b - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ushr v5.4s, v2.4s, #8 - shl v2.4s, v2.4s, #24 - orr v2.16b, v2.16b, v5.16b - add v3.4s, v2.4s, v3.4s - eor v4.16b, v3.16b, v4.16b - ext v0.16b, v0.16b, v0.16b, #12 - ushr v5.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - add v0.4s, v0.4s, v21.4s - orr v4.16b, v4.16b, v5.16b - ext v2.16b, v2.16b, v2.16b, #8 - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ext v3.16b, v3.16b, v3.16b, #4 - add v0.4s, v0.4s, v1.4s - rev32 v1.8h, v2.8h - add v2.4s, v3.4s, v1.4s - eor v3.16b, v2.16b, v4.16b - ushr v4.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - orr v3.16b, v3.16b, v4.16b - add v0.4s, v0.4s, v3.4s - eor v1.16b, v0.16b, v1.16b - ushr v4.4s, v1.4s, #8 - shl v1.4s, v1.4s, #24 - orr v1.16b, v1.16b, v4.16b - add v2.4s, v1.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v0.16b, v0.16b, v0.16b, #4 - ext v2.16b, v2.16b, v2.16b, #12 - ushr v4.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - ext v1.16b, v1.16b, v1.16b, #8 + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + str x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x19, x0 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x19 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b - orr v2.16b, v3.16b, v4.16b - eor v1.16b, v2.16b, v1.16b - stp q0, q1, [x0] + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr x19, [sp, #80] + add sp, sp, #96 + hint #29 ret .Lfunc_end0: .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2 @@ -504,483 +85,518 @@ zfs_blake3_compress_in_place_sse2: .section .rodata.cst16,"aM",@progbits,16 .p2align 4 .LCPI1_0: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI1_1: - .xword 0 - .xword -4294967296 -.LCPI1_2: - .xword -1 - .xword 4294967295 + .xword -4942790177982912921 + .xword -6534734903820487822 .text - .globl zfs_blake3_compress_xof_sse2 .p2align 2 - .type zfs_blake3_compress_xof_sse2,@function -zfs_blake3_compress_xof_sse2: + .type compress_pre,@function +compress_pre: .cfi_startproc - ldp q3, q2, [x0] - ldp q5, q6, [x1] - add x10, x1, #32 - lsr x11, x3, #32 - fmov s4, w3 - ld2 { v17.4s, v18.4s }, [x10] - adrp x10, .LCPI1_2 - and w8, w2, #0xff - mov v4.s[1], w11 - ldr q1, [x10, :lo12:.LCPI1_2] - and w9, w4, #0xff - adrp x12, .LCPI1_0 - mov v4.s[2], w8 - uzp1 v19.4s, v5.4s, v6.4s - add v3.4s, v2.4s, v3.4s - ldr q7, [x12, :lo12:.LCPI1_0] - mov v4.s[3], w9 - add v3.4s, v3.4s, v19.4s - uzp2 v5.4s, v5.4s, v6.4s - ext v21.16b, v18.16b, v18.16b, #12 - uzp1 v6.4s, v19.4s, v19.4s - ext v22.16b, v19.16b, v19.16b, #12 - eor v4.16b, v3.16b, v4.16b - ext v20.16b, v17.16b, v17.16b, #12 - ext v6.16b, v6.16b, v19.16b, #8 - ext v19.16b, v19.16b, v22.16b, #12 - zip1 v22.2d, v21.2d, v5.2d - rev32 v24.8h, v4.8h - mov v4.16b, v1.16b - zip2 v23.4s, v5.4s, v21.4s - uzp2 v6.4s, v6.4s, v5.4s - bsl v4.16b, v22.16b, v20.16b - add v3.4s, v3.4s, v5.4s - zip1 v5.4s, v23.4s, v20.4s - zip1 v22.4s, v20.4s, v23.4s - add v23.4s, v24.4s, v7.4s - ext v7.16b, v6.16b, v6.16b, #4 - ext v25.16b, v4.16b, v4.16b, #12 - ext v5.16b, v22.16b, v5.16b, #8 - eor v2.16b, v23.16b, v2.16b - uzp1 v4.4s, v4.4s, v25.4s - uzp1 v22.4s, v7.4s, v7.4s - ext v25.16b, v7.16b, v7.16b, #12 - ext v22.16b, v22.16b, v7.16b, #8 - ext v7.16b, v7.16b, v25.16b, #12 - ushr v25.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - orr v2.16b, v2.16b, v25.16b - add v3.4s, v3.4s, v2.4s - eor v24.16b, v3.16b, v24.16b - add v3.4s, v3.4s, v17.4s - ushr v17.4s, v24.4s, #8 - shl v18.4s, v24.4s, #24 - orr v17.16b, v18.16b, v17.16b - add v18.4s, v17.4s, v23.4s - eor v2.16b, v18.16b, v2.16b - ushr v23.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ext v3.16b, v3.16b, v3.16b, #12 - orr v2.16b, v2.16b, v23.16b - ext v17.16b, v17.16b, v17.16b, #8 - add v3.4s, v2.4s, v3.4s - adrp x11, .LCPI1_1 - eor v17.16b, v3.16b, v17.16b - ldr q16, [x11, :lo12:.LCPI1_1] - ext v18.16b, v18.16b, v18.16b, #4 - rev32 v24.8h, v17.8h - movi v0.2d, #0xffffffff00000000 - add v23.4s, v3.4s, v21.4s - mov v21.s[1], v20.s[2] - add v20.4s, v18.4s, v24.4s - bit v19.16b, v21.16b, v0.16b - eor v3.16b, v20.16b, v2.16b - uzp2 v2.4s, v22.4s, v19.4s - zip1 v17.2d, v5.2d, v19.2d - zip2 v18.4s, v19.4s, v5.4s - ushr v21.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - ext v22.16b, v2.16b, v2.16b, #4 - bsl v16.16b, v4.16b, v17.16b - zip1 v17.4s, v18.4s, v4.4s - zip1 v18.4s, v4.4s, v18.4s - orr v21.16b, v3.16b, v21.16b - ext v25.16b, v16.16b, v16.16b, #12 - ext v3.16b, v18.16b, v17.16b, #8 - uzp1 v18.4s, v22.4s, v22.4s - ext v26.16b, v22.16b, v22.16b, #12 - add v23.4s, v23.4s, v21.4s - uzp1 v17.4s, v16.4s, v25.4s - ext v16.16b, v18.16b, v22.16b, #8 - ext v18.16b, v22.16b, v26.16b, #12 - eor v22.16b, v23.16b, v24.16b - add v6.4s, v23.4s, v6.4s - ushr v23.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v23.16b - add v20.4s, v22.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ushr v23.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v21.16b, v21.16b, v23.16b - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v21.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v20.16b, v20.16b, v20.16b, #12 - add v6.4s, v6.4s, v19.4s - rev32 v19.8h, v22.8h - add v20.4s, v20.4s, v19.4s - eor v21.16b, v20.16b, v21.16b - ushr v22.4s, v21.4s, #12 - shl v21.4s, v21.4s, #20 - orr v21.16b, v21.16b, v22.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ushr v22.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v22.16b - add v20.4s, v19.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #12 - ushr v22.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - add v6.4s, v6.4s, v4.4s - orr v21.16b, v21.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ext v20.16b, v20.16b, v20.16b, #4 - rev32 v19.8h, v19.8h - add v20.4s, v20.4s, v19.4s - add v6.4s, v6.4s, v5.4s - mov v5.s[1], v4.s[2] - eor v4.16b, v20.16b, v21.16b - ushr v21.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v21.16b, v4.16b, v21.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - add v2.4s, v6.4s, v2.4s - ushr v6.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v6.16b, v19.16b, v6.16b - add v19.4s, v6.4s, v20.4s - eor v20.16b, v19.16b, v21.16b - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v20.4s, v2.4s - eor v6.16b, v2.16b, v6.16b - ext v19.16b, v19.16b, v19.16b, #12 - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v6.4s - mov v22.16b, v0.16b - eor v20.16b, v19.16b, v20.16b - bsl v22.16b, v5.16b, v7.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - add v2.4s, v2.4s, v22.4s - orr v20.16b, v20.16b, v21.16b - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - ushr v21.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v6.16b, v6.16b, v21.16b - add v19.4s, v6.4s, v19.4s - eor v20.16b, v19.16b, v20.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v2.4s, v2.4s, v17.4s - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - uzp2 v5.4s, v16.4s, v22.4s - zip1 v7.2d, v3.2d, v22.2d - zip2 v16.4s, v22.4s, v3.4s - ext v19.16b, v19.16b, v19.16b, #4 - rev32 v22.8h, v6.8h - ext v23.16b, v5.16b, v5.16b, #4 - bif v7.16b, v17.16b, v1.16b - zip1 v24.4s, v16.4s, v17.4s - zip1 v16.4s, v17.4s, v16.4s - add v21.4s, v2.4s, v3.4s - mov v3.s[1], v17.s[2] - add v17.4s, v19.4s, v22.4s - mov v19.16b, v0.16b - ext v25.16b, v7.16b, v7.16b, #12 - ext v4.16b, v16.16b, v24.16b, #8 - uzp1 v16.4s, v23.4s, v23.4s - bsl v19.16b, v3.16b, v18.16b - eor v2.16b, v17.16b, v20.16b - uzp1 v7.4s, v7.4s, v25.4s - ext v25.16b, v16.16b, v23.16b, #8 - zip1 v3.2d, v4.2d, v19.2d - ushr v20.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ext v24.16b, v23.16b, v23.16b, #12 - uzp2 v6.4s, v25.4s, v19.4s - zip2 v18.4s, v19.4s, v4.4s - bif v3.16b, v7.16b, v1.16b - orr v20.16b, v2.16b, v20.16b - ext v16.16b, v23.16b, v24.16b, #12 - ext v23.16b, v6.16b, v6.16b, #4 - zip1 v24.4s, v18.4s, v7.4s - zip1 v18.4s, v7.4s, v18.4s - ext v25.16b, v3.16b, v3.16b, #12 - add v21.4s, v21.4s, v20.4s - ext v2.16b, v18.16b, v24.16b, #8 - uzp1 v18.4s, v23.4s, v23.4s - ext v24.16b, v23.16b, v23.16b, #12 - uzp1 v3.4s, v3.4s, v25.4s - eor v22.16b, v21.16b, v22.16b - ext v25.16b, v18.16b, v23.16b, #8 - dup v18.4s, v2.s[3] - ext v23.16b, v23.16b, v24.16b, #12 - add v5.4s, v21.4s, v5.4s - trn1 v21.4s, v3.4s, v3.4s - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - ext v18.16b, v21.16b, v18.16b, #8 - orr v21.16b, v22.16b, v24.16b - add v17.4s, v21.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ushr v22.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v20.16b, v20.16b, v22.16b - ext v21.16b, v21.16b, v21.16b, #8 - add v5.4s, v20.4s, v5.4s - eor v21.16b, v5.16b, v21.16b - ext v17.16b, v17.16b, v17.16b, #12 - add v5.4s, v5.4s, v19.4s - rev32 v19.8h, v21.8h - add v17.4s, v17.4s, v19.4s - eor v20.16b, v17.16b, v20.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v21.16b - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ushr v21.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v21.16b - add v17.4s, v19.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ext v5.16b, v5.16b, v5.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v5.4s, v5.4s, v7.4s - orr v20.16b, v20.16b, v21.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ext v17.16b, v17.16b, v17.16b, #4 - rev32 v22.8h, v19.8h - add v21.4s, v5.4s, v4.4s - mov v4.s[1], v7.s[2] - add v19.4s, v17.4s, v22.4s - bit v16.16b, v4.16b, v0.16b - eor v5.16b, v19.16b, v20.16b - uzp2 v4.4s, v25.4s, v16.4s - zip1 v7.2d, v2.2d, v16.2d - zip2 v17.4s, v16.4s, v2.4s - ushr v20.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - ext v24.16b, v4.16b, v4.16b, #4 - bif v7.16b, v3.16b, v1.16b - zip1 v25.4s, v17.4s, v3.4s - zip1 v17.4s, v3.4s, v17.4s - orr v20.16b, v5.16b, v20.16b - ext v26.16b, v7.16b, v7.16b, #12 - ext v5.16b, v17.16b, v25.16b, #8 - uzp1 v17.4s, v24.4s, v24.4s - ext v25.16b, v24.16b, v24.16b, #12 - bit v23.16b, v18.16b, v0.16b - add v21.4s, v21.4s, v20.4s - uzp1 v7.4s, v7.4s, v26.4s - ext v26.16b, v17.16b, v24.16b, #8 - ext v17.16b, v24.16b, v25.16b, #12 - eor v22.16b, v21.16b, v22.16b - add v6.4s, v21.4s, v6.4s - zip1 v21.2d, v5.2d, v23.2d - zip2 v24.4s, v23.4s, v5.4s - bif v21.16b, v7.16b, v1.16b - zip1 v1.4s, v24.4s, v7.4s - zip1 v24.4s, v7.4s, v24.4s - ext v1.16b, v24.16b, v1.16b, #8 - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v24.16b - add v19.4s, v22.4s, v19.4s - ext v24.16b, v21.16b, v21.16b, #12 - eor v20.16b, v19.16b, v20.16b - uzp1 v21.4s, v21.4s, v24.4s - ushr v24.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - orr v20.16b, v20.16b, v24.16b - ext v6.16b, v6.16b, v6.16b, #4 - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v20.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #12 - add v6.4s, v6.4s, v16.4s - rev32 v16.8h, v22.8h - add v19.4s, v19.4s, v16.4s - eor v20.16b, v19.16b, v20.16b - ushr v22.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v22.16b - add v6.4s, v6.4s, v20.4s - eor v16.16b, v6.16b, v16.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v6.4s, v3.4s - ushr v6.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - orr v6.16b, v16.16b, v6.16b - add v16.4s, v6.4s, v19.4s - eor v19.16b, v16.16b, v20.16b - ushr v20.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v20.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v3.4s, v3.4s, v19.4s - eor v6.16b, v3.16b, v6.16b - ext v16.16b, v16.16b, v16.16b, #4 - add v2.4s, v3.4s, v2.4s - rev32 v3.8h, v6.8h - add v6.4s, v16.4s, v3.4s - eor v16.16b, v6.16b, v19.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - orr v16.16b, v16.16b, v19.16b - add v2.4s, v2.4s, v16.4s - eor v3.16b, v2.16b, v3.16b - add v2.4s, v2.4s, v4.4s - ushr v4.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v4.16b - add v4.4s, v3.4s, v6.4s - eor v6.16b, v4.16b, v16.16b - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v6.4s, v2.4s + hint #34 + fmov s1, w3 + movi d0, #0x0000ff000000ff + ldr q2, [x1] + fmov d3, x4 + adrp x8, .LCPI1_0 + mov v1.s[1], w5 + str q2, [x0] + ldr q4, [x8, :lo12:.LCPI1_0] + add x8, x2, #32 + ldr q5, [x1, #16] + and v0.8b, v1.8b, v0.8b + stp q5, q4, [x0, #16] + mov v3.d[1], v0.d[0] + str q3, [x0, #48] + ldp q0, q6, [x2] + uzp1 v1.4s, v0.4s, v6.4s + uzp2 v0.4s, v0.4s, v6.4s + add v2.4s, v2.4s, v1.4s + uzp1 v18.4s, v1.4s, v1.4s + add v2.4s, v2.4s, v5.4s eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 + add v2.4s, v2.4s, v0.4s rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v6.16b, v4.16b, v6.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - add v2.4s, v2.4s, v23.4s - orr v6.16b, v6.16b, v16.16b - add v2.4s, v2.4s, v6.4s + add v4.4s, v3.4s, v4.4s + eor v5.16b, v4.16b, v5.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v5.4s eor v3.16b, v2.16b, v3.16b - ushr v16.4s, v3.4s, #8 + ushr v6.4s, v3.4s, #8 shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v16.16b + orr v3.16b, v3.16b, v6.16b + ld2 { v6.4s, v7.4s }, [x8] add v4.4s, v3.4s, v4.4s - eor v6.16b, v4.16b, v6.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - add v2.4s, v2.4s, v7.4s - orr v6.16b, v6.16b, v16.16b ext v3.16b, v3.16b, v3.16b, #8 add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b + eor v5.16b, v4.16b, v5.16b ext v4.16b, v4.16b, v4.16b, #4 - rev32 v3.8h, v3.8h + ext v6.16b, v6.16b, v6.16b, #12 + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 *** 7941 LINES SKIPPED ***