git: 9a3444d91c70 - main - ossl: Add a VAES-based AES-GCM implementation for amd64
Date: Fri, 02 Jun 2023 16:19:21 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=9a3444d91c706dda65040138acbdb8c932213960

commit 9a3444d91c706dda65040138acbdb8c932213960
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2023-06-02 15:58:29 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2023-06-02 16:15:01 +0000

    ossl: Add a VAES-based AES-GCM implementation for amd64

    aes-gcm-avx512.S is generated from OpenSSL 3.1 and implements AES-GCM.
    ossl_x86.c detects whether the CPU implements the required AVX512
    instructions; if not, the ossl(4) module does not provide an AES-GCM
    implementation.

    The VAES implementation increases throughput for all buffer sizes in
    both directions, up to 2x for sufficiently large buffers.

    The "process" implementation is in two parts: a generic OCF layer in
    ossl_aes.c, and a set of MD functions that do the heavy lifting.  The
    intent is to make it possible to add implementations for other
    platforms, e.g., to reduce the diff required for D37421.

    A follow-up commit will add a fallback path to legacy AES-NI, so that
    ossl(4) can be used in preference to aesni(4) on all amd64 platforms.
    In the long term we would like to replace aesni(4) and armv8crypto(4)
    with ossl(4).

    Note that this implementation currently will not be selected by
    default: aesni(4) and ossl(4) return the same probe priority for
    crypto sessions, and the opencrypto framework breaks the tie by
    selecting the first registered implementation.  Since aesni(4) is
    compiled into the kernel, aesni(4) wins.  A separate change may give
    ossl(4) priority.

    Sponsored by:	Stormshield
    Sponsored by:	Klara, Inc.
    Reviewed by:	jhb
    MFC after:	3 months
    Differential Revision:	https://reviews.freebsd.org/D39783
---
 sys/crypto/openssl/amd64/aes-gcm-avx512.S | 136132 +++++++++++++++++++++++++++
 sys/crypto/openssl/amd64/ossl_aes_gcm.c   |    233 +
 sys/crypto/openssl/ossl.c                 |     54 +-
 sys/crypto/openssl/ossl.h                 |      6 +-
 sys/crypto/openssl/ossl_aes.c             |    103 +
 sys/crypto/openssl/ossl_aes_gcm.h         |     71 +
 sys/crypto/openssl/ossl_x86.c             |     25 +-
 sys/modules/ossl/Makefile                 |      2 +
 8 files changed, 136616 insertions(+), 10 deletions(-)

diff --git a/sys/crypto/openssl/amd64/aes-gcm-avx512.S b/sys/crypto/openssl/amd64/aes-gcm-avx512.S
new file mode 100644
index 000000000000..6ddd1f994704
--- /dev/null
+++ b/sys/crypto/openssl/amd64/aes-gcm-avx512.S
@@ -0,0 +1,136132 @@
+/* $FreeBSD$ */
+/* Do not modify. This file is auto-generated from aes-gcm-avx512.pl.
*/ +.globl ossl_vaes_vpclmulqdq_capable +.type ossl_vaes_vpclmulqdq_capable,@function +.align 32 +ossl_vaes_vpclmulqdq_capable: + movq OPENSSL_ia32cap_P+8(%rip),%rcx + + movq $6600291188736,%rdx + xorl %eax,%eax + andq %rdx,%rcx + cmpq %rdx,%rcx + cmoveq %rcx,%rax + .byte 0xf3,0xc3 +.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable +.text +.globl ossl_aes_gcm_init_avx512 +.type ossl_aes_gcm_init_avx512,@function +.align 32 +ossl_aes_gcm_init_avx512: +.cfi_startproc +.byte 243,15,30,250 + vpxorq %xmm16,%xmm16,%xmm16 + + + movl 240(%rdi),%eax + cmpl $9,%eax + je .Laes_128_duiuljAybFADyhe + cmpl $11,%eax + je .Laes_192_duiuljAybFADyhe + cmpl $13,%eax + je .Laes_256_duiuljAybFADyhe + jmp .Lexit_aes_duiuljAybFADyhe +.align 32 +.Laes_128_duiuljAybFADyhe: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenclast 160(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_duiuljAybFADyhe +.align 32 +.Laes_192_duiuljAybFADyhe: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenc 160(%rdi),%xmm16,%xmm16 + + vaesenc 176(%rdi),%xmm16,%xmm16 + + vaesenclast 192(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_duiuljAybFADyhe +.align 32 +.Laes_256_duiuljAybFADyhe: + vpxorq 0(%rdi),%xmm16,%xmm16 + + vaesenc 16(%rdi),%xmm16,%xmm16 + + vaesenc 32(%rdi),%xmm16,%xmm16 + + vaesenc 48(%rdi),%xmm16,%xmm16 + + vaesenc 64(%rdi),%xmm16,%xmm16 + + vaesenc 80(%rdi),%xmm16,%xmm16 + + vaesenc 96(%rdi),%xmm16,%xmm16 + + vaesenc 112(%rdi),%xmm16,%xmm16 + + vaesenc 128(%rdi),%xmm16,%xmm16 + + vaesenc 144(%rdi),%xmm16,%xmm16 + + vaesenc 160(%rdi),%xmm16,%xmm16 + + vaesenc 176(%rdi),%xmm16,%xmm16 + + vaesenc 192(%rdi),%xmm16,%xmm16 + + vaesenc 208(%rdi),%xmm16,%xmm16 + + vaesenclast 224(%rdi),%xmm16,%xmm16 + jmp .Lexit_aes_duiuljAybFADyhe +.Lexit_aes_duiuljAybFADyhe: + + vpshufb SHUF_MASK(%rip),%xmm16,%xmm16 + + vmovdqa64 %xmm16,%xmm2 + vpsllq $1,%xmm16,%xmm16 + vpsrlq $63,%xmm2,%xmm2 + vmovdqa %xmm2,%xmm1 + vpslldq $8,%xmm2,%xmm2 + vpsrldq $8,%xmm1,%xmm1 + vporq %xmm2,%xmm16,%xmm16 + + vpshufd $36,%xmm1,%xmm2 + vpcmpeqd TWOONE(%rip),%xmm2,%xmm2 + vpand POLY(%rip),%xmm2,%xmm2 + vpxorq %xmm2,%xmm16,%xmm16 + + vmovdqu64 %xmm16,336(%rsi) + vshufi32x4 $0x00,%ymm16,%ymm16,%ymm4 + vmovdqa %ymm4,%ymm3 + + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm0 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm1 + vpclmulqdq $0x01,%ymm4,%ymm3,%ymm2 + vpclmulqdq $0x10,%ymm4,%ymm3,%ymm3 + vpxorq %ymm2,%ymm3,%ymm3 + + vpsrldq $8,%ymm3,%ymm2 + vpslldq $8,%ymm3,%ymm3 + vpxorq %ymm2,%ymm0,%ymm0 + vpxorq %ymm1,%ymm3,%ymm3 + + + + vmovdqu64 POLY2(%rip),%ymm2 + + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm1 + vpslldq $8,%ymm1,%ymm1 + vpxorq %ymm1,%ymm3,%ymm3 + + + + vpclmulqdq $0x00,%ymm3,%ymm2,%ymm1 + vpsrldq $4,%ymm1,%ymm1 + vpclmulqdq $0x10,%ymm3,%ymm2,%ymm3 + vpslldq $4,%ymm3,%ymm3 + + vpternlogq $0x96,%ymm1,%ymm0,%ymm3 + + vmovdqu64 %xmm3,320(%rsi) + vinserti64x2 $1,%xmm16,%ymm3,%ymm4 + vmovdqa64 %ymm4,%ymm5 + + vpclmulqdq $0x11,%ymm3,%ymm4,%ymm0 + vpclmulqdq $0x00,%ymm3,%ymm4,%ymm1 + 
vpclmulqdq $0x01,%ymm3,%ymm4,%ymm2 + vpclmulqdq $0x10,%ymm3,%ymm4,%ymm4 + vpxorq %ymm2,%ymm4,%ymm4 + + vpsrldq $8,%ymm4,%ymm2 + vpslldq $8,%ymm4,%ymm4 + vpxorq %ymm2,%ymm0,%ymm0 + vpxorq %ymm1,%ymm4,%ymm4 + + + + vmovdqu64 POLY2(%rip),%ymm2 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm1 + vpslldq $8,%ymm1,%ymm1 + vpxorq %ymm1,%ymm4,%ymm4 + + + + vpclmulqdq $0x00,%ymm4,%ymm2,%ymm1 + vpsrldq $4,%ymm1,%ymm1 + vpclmulqdq $0x10,%ymm4,%ymm2,%ymm4 + vpslldq $4,%ymm4,%ymm4 + + vpternlogq $0x96,%ymm1,%ymm0,%ymm4 + + vmovdqu64 %ymm4,288(%rsi) + + vinserti64x4 $1,%ymm5,%zmm4,%zmm4 + + + vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3 + vmovdqa64 %zmm4,%zmm5 + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm2,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm2 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm4 + + vmovdqu64 %zmm4,224(%rsi) + vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3 + + vpclmulqdq $0x11,%zmm3,%zmm5,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm5,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm5,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5 + vpxorq %zmm2,%zmm5,%zmm5 + + vpsrldq $8,%zmm5,%zmm2 + vpslldq $8,%zmm5,%zmm5 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm5,%zmm5 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm5,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm5,%zmm5 + + + + vpclmulqdq $0x00,%zmm5,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm5,%zmm2,%zmm5 + vpslldq $4,%zmm5,%zmm5 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm5 + + vmovdqu64 %zmm5,160(%rsi) + + vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0 + vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1 + vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2 + vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4 + vpxorq %zmm2,%zmm4,%zmm4 + + vpsrldq $8,%zmm4,%zmm2 + vpslldq $8,%zmm4,%zmm4 + vpxorq %zmm2,%zmm0,%zmm0 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vmovdqu64 POLY2(%rip),%zmm2 + + vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1 + vpslldq $8,%zmm1,%zmm1 + vpxorq %zmm1,%zmm4,%zmm4 + + + + vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1 + vpsrldq $4,%zmm1,%zmm1 + vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4 + vpslldq $4,%zmm4,%zmm4 + + vpternlogq $0x96,%zmm1,%zmm0,%zmm4 + + vmovdqu64 %zmm4,96(%rsi) + vzeroupper +.Labort_init: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512 +.globl ossl_aes_gcm_setiv_avx512 +.type ossl_aes_gcm_setiv_avx512,@function +.align 32 +ossl_aes_gcm_setiv_avx512: +.cfi_startproc +.Lsetiv_seh_begin: +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 +.Lsetiv_seh_push_rbx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 +.Lsetiv_seh_push_rbp: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 +.Lsetiv_seh_push_r12: + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 +.Lsetiv_seh_push_r13: + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 +.Lsetiv_seh_push_r14: + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsetiv_seh_push_r15: + + + + + + + + + + + leaq 0(%rsp),%rbp +.cfi_def_cfa_register %rbp +.Lsetiv_seh_setfp: + +.Lsetiv_seh_prolog_end: + subq $820,%rsp + andq $(-64),%rsp + cmpq $12,%rcx + je iv_len_12_init_IV + vpxor %xmm2,%xmm2,%xmm2 + movq %rdx,%r10 + movq %rcx,%r11 + orq 
%r11,%r11 + jz .L_CALC_AAD_done_mBgdvxqgFGebeug + + xorq %rbx,%rbx + vmovdqa64 SHUF_MASK(%rip),%zmm16 + +.L_get_AAD_loop48x16_mBgdvxqgFGebeug: + cmpq $768,%r11 + jl .L_exit_AAD_loop48x16_mBgdvxqgFGebeug + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_EzsAegbBbaerfwt + + vmovdqu64 288(%rsi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rsi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rsi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rsi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq 
%zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,192(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,128(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,64(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,0(%rsp) +.L_skip_hkeys_precomputation_EzsAegbBbaerfwt: + movq $1,%rbx + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 0(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 64(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpxorq %zmm17,%zmm10,%zmm7 + vpxorq %zmm13,%zmm1,%zmm6 + vpxorq %zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 128(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 192(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + 
vmovdqu64 256(%r10),%zmm11 + vmovdqu64 320(%r10),%zmm3 + vmovdqu64 384(%r10),%zmm4 + vmovdqu64 448(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 384(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 448(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 512(%r10),%zmm11 + vmovdqu64 576(%r10),%zmm3 + vmovdqu64 640(%r10),%zmm4 + vmovdqu64 704(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + vmovdqu64 512(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 576(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18 + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + vmovdqu64 640(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12 + vmovdqu64 704(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17 + vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18 + + vpternlogq $0x96,%zmm17,%zmm10,%zmm7 + vpternlogq $0x96,%zmm13,%zmm1,%zmm6 + vpternlogq $0x96,%zmm15,%zmm9,%zmm8 + vpternlogq $0x96,%zmm18,%zmm12,%zmm7 + + vpsrldq $8,%zmm7,%zmm1 + vpslldq $8,%zmm7,%zmm9 + vpxorq %zmm1,%zmm6,%zmm6 + vpxorq %zmm9,%zmm8,%zmm8 + vextracti64x4 $1,%zmm6,%ymm1 + vpxorq %ymm1,%ymm6,%ymm6 + vextracti32x4 $1,%ymm6,%xmm1 + vpxorq %xmm1,%xmm6,%xmm6 + vextracti64x4 $1,%zmm8,%ymm9 + vpxorq %ymm9,%ymm8,%ymm8 + vextracti32x4 $1,%ymm8,%xmm9 + vpxorq %xmm9,%xmm8,%xmm8 + vmovdqa64 POLY2(%rip),%xmm10 + + + vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1 + vpslldq $8,%xmm1,%xmm1 + vpxorq %xmm1,%xmm8,%xmm1 + + + vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9 + vpsrldq $4,%xmm9,%xmm9 + vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2 + vpslldq $4,%xmm2,%xmm2 + vpternlogq $0x96,%xmm6,%xmm9,%xmm2 + + subq $768,%r11 + je .L_CALC_AAD_done_mBgdvxqgFGebeug + + addq $768,%r10 + jmp .L_get_AAD_loop48x16_mBgdvxqgFGebeug + +.L_exit_AAD_loop48x16_mBgdvxqgFGebeug: + + cmpq $512,%r11 + jl .L_less_than_32x16_mBgdvxqgFGebeug + + vmovdqu64 0(%r10),%zmm11 + vmovdqu64 64(%r10),%zmm3 + vmovdqu64 128(%r10),%zmm4 + vmovdqu64 192(%r10),%zmm5 + vpshufb %zmm16,%zmm11,%zmm11 + 
vpshufb %zmm16,%zmm3,%zmm3 + vpshufb %zmm16,%zmm4,%zmm4 + vpshufb %zmm16,%zmm5,%zmm5 + testq %rbx,%rbx + jnz .L_skip_hkeys_precomputation_xCxmdbgxoCdwefc + + vmovdqu64 288(%rsi),%zmm1 + vmovdqu64 %zmm1,704(%rsp) + + vmovdqu64 224(%rsi),%zmm9 + vmovdqu64 %zmm9,640(%rsp) + + + vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9 + + vmovdqu64 160(%rsi),%zmm10 + vmovdqu64 %zmm10,576(%rsp) + + vmovdqu64 96(%rsi),%zmm12 + vmovdqu64 %zmm12,512(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,448(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,384(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10 + vpxorq %zmm17,%zmm10,%zmm10 + + vpsrldq $8,%zmm10,%zmm17 + vpslldq $8,%zmm10,%zmm10 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm10,%zmm10 + + + + vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10 + vpslldq $4,%zmm10,%zmm10 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm10 + + vmovdqu64 %zmm10,320(%rsp) + + vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13 + vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15 + vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17 + vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12 + vpxorq %zmm17,%zmm12,%zmm12 + + vpsrldq $8,%zmm12,%zmm17 + vpslldq $8,%zmm12,%zmm12 + vpxorq %zmm17,%zmm13,%zmm13 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vmovdqu64 POLY2(%rip),%zmm17 + + vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15 + vpslldq $8,%zmm15,%zmm15 + vpxorq %zmm15,%zmm12,%zmm12 + + + + vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15 + vpsrldq $4,%zmm15,%zmm15 + vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12 + vpslldq $4,%zmm12,%zmm12 + + vpternlogq $0x96,%zmm15,%zmm13,%zmm12 + + vmovdqu64 %zmm12,256(%rsp) +.L_skip_hkeys_precomputation_xCxmdbgxoCdwefc: + movq $1,%rbx + vpxorq %zmm2,%zmm11,%zmm11 + vmovdqu64 256(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1 + vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9 + vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10 + vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12 + vmovdqu64 320(%rsp),%zmm19 + vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13 + vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15 + vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17 *** 135867 LINES SKIPPED ***
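The capability probe at the top of the generated file, ossl_vaes_vpclmulqdq_capable, masks the third and fourth words of OPENSSL_ia32cap_P (CPUID leaf 7 EBX and ECX) with the constant 6600291188736 (0x600c0030000), which appears to correspond to AVX512F, AVX512DQ, AVX512BW, and AVX512VL in EBX plus VAES and VPCLMULQDQ in ECX. Below is a minimal userspace sketch of a roughly equivalent check, for illustration only: it queries CPUID directly through the compiler's <cpuid.h> helper, whereas the in-kernel detection in ossl_x86.c works from the kernel's own CPU feature data, and the macro names used here are local to the example rather than taken from any FreeBSD or OpenSSL header.

/*
 * Illustration only: a userspace approximation of the feature test
 * performed by ossl_vaes_vpclmulqdq_capable() above.  The bit
 * positions are the standard Intel CPUID.(EAX=7,ECX=0) definitions;
 * the macro names are local to this sketch.
 */
#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

#define	LEAF7_EBX_AVX512F	(1u << 16)
#define	LEAF7_EBX_AVX512DQ	(1u << 17)
#define	LEAF7_EBX_AVX512BW	(1u << 30)
#define	LEAF7_EBX_AVX512VL	(1u << 31)
#define	LEAF7_ECX_VAES		(1u << 9)
#define	LEAF7_ECX_VPCLMULQDQ	(1u << 10)

static bool
vaes_vpclmulqdq_capable(void)
{
	unsigned int eax, ebx, ecx, edx;
	const unsigned int ebx_mask = LEAF7_EBX_AVX512F | LEAF7_EBX_AVX512DQ |
	    LEAF7_EBX_AVX512BW | LEAF7_EBX_AVX512VL;
	const unsigned int ecx_mask = LEAF7_ECX_VAES | LEAF7_ECX_VPCLMULQDQ;

	/* CPUID leaf 7, subleaf 0: structured extended feature flags. */
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0)
		return (false);
	return ((ebx & ebx_mask) == ebx_mask && (ecx & ecx_mask) == ecx_mask);
}

int
main(void)
{
	printf("VAES/VPCLMULQDQ AES-GCM path usable: %s\n",
	    vaes_vpclmulqdq_capable() ? "yes" : "no");
	return (0);
}

A complete runtime check would also confirm that the OS enables saving of the AVX-512 register state (OSXSAVE/XCR0); the feature words consumed by the assembly are expected to already have the relevant bits cleared when that state is not enabled, which is why the routine can test a single mask.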