git: 9a3444d91c70 - main - ossl: Add a VAES-based AES-GCM implementation for amd64

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Fri, 02 Jun 2023 16:19:21 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=9a3444d91c706dda65040138acbdb8c932213960

commit 9a3444d91c706dda65040138acbdb8c932213960
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2023-06-02 15:58:29 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2023-06-02 16:15:01 +0000

    ossl: Add a VAES-based AES-GCM implementation for amd64
    
    aes-gcm-avx512.S is generated from OpenSSL 3.1 and implements AES-GCM.
    ossl_x86.c detects whether the CPU implements the required AVX512
    instructions; if not, the ossl(4) module does not provide an AES-GCM
    implementation.  The VAES implementation increases throughput for all
    buffer sizes, for both encryption and decryption, up to 2x for
    sufficiently large buffers.
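
    (The committed check is the generated ossl_vaes_vpclmulqdq_capable()
    routine, which masks bits of OPENSSL_ia32cap_P.  The fragment below is
    only a rough, non-authoritative C sketch of the same test using the
    kernel's cached CPUID words; the cpu_stdext_feature globals and the
    CPUID_STDEXT* flag names are assumptions taken from
    <machine/md_var.h> and <machine/specialreg.h>.)

        /*
         * Sketch: require AVX512F/DQ/BW/VL (CPUID leaf 7 EBX) plus
         * VAES and VPCLMULQDQ (CPUID leaf 7 ECX).
         */
        #include <sys/types.h>

        #include <machine/md_var.h>
        #include <machine/specialreg.h>

        static bool
        vaes_gcm_capable(void)  /* illustrative name */
        {
            const u_int avx512 = CPUID_STDEXT_AVX512F |
                CPUID_STDEXT_AVX512DQ | CPUID_STDEXT_AVX512BW |
                CPUID_STDEXT_AVX512VL;
            const u_int vaes = CPUID_STDEXT2_VAES |
                CPUID_STDEXT2_VPCLMULQDQ;

            return ((cpu_stdext_feature & avx512) == avx512 &&
                (cpu_stdext_feature2 & vaes) == vaes);
        }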
    
    The "process" implementation is split into two parts: a generic OCF
    layer in ossl_aes.c, and a set of machine-dependent (MD) functions
    that do the heavy lifting.  The intent is to make it possible to add
    implementations for other platforms, e.g., to reduce the diff
    required for D37421.
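
    (The names below are hypothetical and only sketch the shape of that
    split: the generic ossl_aes.c layer handles OCF session and request
    handling, while a per-platform table of MD functions supplies the
    AES-GCM primitives.  The committed structure and function names may
    differ.)

        #include <sys/types.h>

        /* Hypothetical MD ops table; one instance per platform backend. */
        struct ossl_gcm_md_ops {
            void (*init)(void *ctx, const void *key, size_t keylen);
            void (*setiv)(void *ctx, const uint8_t *iv, size_t ivlen);
            int (*aad)(void *ctx, const uint8_t *aad, size_t aadlen);
            int (*update)(void *ctx, const uint8_t *in, uint8_t *out,
                size_t len, bool encrypt);
            int (*finish)(void *ctx, uint8_t *tag, size_t taglen);
        };

    With a split like this, a future arm64 backend only needs to supply
    another ops table rather than duplicating the OCF glue.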
    
    A follow-up commit will add a fallback path to legacy AES-NI, so that
    ossl(4) can be used in preference to aesni(4) on all amd64 platforms.
    In the long term we would like to replace aesni(4) and armv8crypto(4)
    with ossl(4).
    
    Note that this implementation will not currently be selected by
    default, since aesni(4) and ossl(4) return the same probe priority
    for crypto sessions, and the opencrypto framework breaks a tie by
    selecting the first registered implementation.  Since aesni(4) is
    compiled into the kernel, aesni(4) wins.  A separate change may give
    ossl(4) a higher priority.
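
    (For reference, a probe method that ties with aesni(4) looks roughly
    like the sketch below; the function name is illustrative, and the
    real method also validates the session parameters before answering.)

        #include <sys/param.h>
        #include <sys/bus.h>

        #include <opencrypto/cryptodev.h>

        static int
        ossl_probesession(device_t dev,
            const struct crypto_session_params *csp)
        {
            /*
             * aesni(4) returns the same value, so OCF falls back to
             * registration order to pick a driver.
             */
            return (CRYPTODEV_PROBE_ACCEL_SOFTWARE);
        }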
    
    Sponsored by:   Stormshield
    Sponsored by:   Klara, Inc.
    Reviewed by:    jhb
    MFC after:      3 months
    Differential Revision:  https://reviews.freebsd.org/D39783
---
 sys/crypto/openssl/amd64/aes-gcm-avx512.S | 136132 +++++++++++++++++++++++++++
 sys/crypto/openssl/amd64/ossl_aes_gcm.c   |    233 +
 sys/crypto/openssl/ossl.c                 |     54 +-
 sys/crypto/openssl/ossl.h                 |      6 +-
 sys/crypto/openssl/ossl_aes.c             |    103 +
 sys/crypto/openssl/ossl_aes_gcm.h         |     71 +
 sys/crypto/openssl/ossl_x86.c             |     25 +-
 sys/modules/ossl/Makefile                 |      2 +
 8 files changed, 136616 insertions(+), 10 deletions(-)

diff --git a/sys/crypto/openssl/amd64/aes-gcm-avx512.S b/sys/crypto/openssl/amd64/aes-gcm-avx512.S
new file mode 100644
index 000000000000..6ddd1f994704
--- /dev/null
+++ b/sys/crypto/openssl/amd64/aes-gcm-avx512.S
@@ -0,0 +1,136132 @@
+/* $FreeBSD$ */
+/* Do not modify. This file is auto-generated from aes-gcm-avx512.pl. */
+.globl	ossl_vaes_vpclmulqdq_capable
+.type	ossl_vaes_vpclmulqdq_capable,@function
+.align	32
+ossl_vaes_vpclmulqdq_capable:
+	movq	OPENSSL_ia32cap_P+8(%rip),%rcx
+
+	movq	$6600291188736,%rdx
+	xorl	%eax,%eax
+	andq	%rdx,%rcx
+	cmpq	%rdx,%rcx
+	cmoveq	%rcx,%rax
+	.byte	0xf3,0xc3
+.size	ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
+.text	
+.globl	ossl_aes_gcm_init_avx512
+.type	ossl_aes_gcm_init_avx512,@function
+.align	32
+ossl_aes_gcm_init_avx512:
+.cfi_startproc	
+.byte	243,15,30,250
+	vpxorq	%xmm16,%xmm16,%xmm16
+
+
+	movl	240(%rdi),%eax
+	cmpl	$9,%eax
+	je	.Laes_128_duiuljAybFADyhe
+	cmpl	$11,%eax
+	je	.Laes_192_duiuljAybFADyhe
+	cmpl	$13,%eax
+	je	.Laes_256_duiuljAybFADyhe
+	jmp	.Lexit_aes_duiuljAybFADyhe
+.align	32
+.Laes_128_duiuljAybFADyhe:
+	vpxorq	0(%rdi),%xmm16,%xmm16
+
+	vaesenc	16(%rdi),%xmm16,%xmm16
+
+	vaesenc	32(%rdi),%xmm16,%xmm16
+
+	vaesenc	48(%rdi),%xmm16,%xmm16
+
+	vaesenc	64(%rdi),%xmm16,%xmm16
+
+	vaesenc	80(%rdi),%xmm16,%xmm16
+
+	vaesenc	96(%rdi),%xmm16,%xmm16
+
+	vaesenc	112(%rdi),%xmm16,%xmm16
+
+	vaesenc	128(%rdi),%xmm16,%xmm16
+
+	vaesenc	144(%rdi),%xmm16,%xmm16
+
+	vaesenclast	160(%rdi),%xmm16,%xmm16
+	jmp	.Lexit_aes_duiuljAybFADyhe
+.align	32
+.Laes_192_duiuljAybFADyhe:
+	vpxorq	0(%rdi),%xmm16,%xmm16
+
+	vaesenc	16(%rdi),%xmm16,%xmm16
+
+	vaesenc	32(%rdi),%xmm16,%xmm16
+
+	vaesenc	48(%rdi),%xmm16,%xmm16
+
+	vaesenc	64(%rdi),%xmm16,%xmm16
+
+	vaesenc	80(%rdi),%xmm16,%xmm16
+
+	vaesenc	96(%rdi),%xmm16,%xmm16
+
+	vaesenc	112(%rdi),%xmm16,%xmm16
+
+	vaesenc	128(%rdi),%xmm16,%xmm16
+
+	vaesenc	144(%rdi),%xmm16,%xmm16
+
+	vaesenc	160(%rdi),%xmm16,%xmm16
+
+	vaesenc	176(%rdi),%xmm16,%xmm16
+
+	vaesenclast	192(%rdi),%xmm16,%xmm16
+	jmp	.Lexit_aes_duiuljAybFADyhe
+.align	32
+.Laes_256_duiuljAybFADyhe:
+	vpxorq	0(%rdi),%xmm16,%xmm16
+
+	vaesenc	16(%rdi),%xmm16,%xmm16
+
+	vaesenc	32(%rdi),%xmm16,%xmm16
+
+	vaesenc	48(%rdi),%xmm16,%xmm16
+
+	vaesenc	64(%rdi),%xmm16,%xmm16
+
+	vaesenc	80(%rdi),%xmm16,%xmm16
+
+	vaesenc	96(%rdi),%xmm16,%xmm16
+
+	vaesenc	112(%rdi),%xmm16,%xmm16
+
+	vaesenc	128(%rdi),%xmm16,%xmm16
+
+	vaesenc	144(%rdi),%xmm16,%xmm16
+
+	vaesenc	160(%rdi),%xmm16,%xmm16
+
+	vaesenc	176(%rdi),%xmm16,%xmm16
+
+	vaesenc	192(%rdi),%xmm16,%xmm16
+
+	vaesenc	208(%rdi),%xmm16,%xmm16
+
+	vaesenclast	224(%rdi),%xmm16,%xmm16
+	jmp	.Lexit_aes_duiuljAybFADyhe
+.Lexit_aes_duiuljAybFADyhe:
+
+	vpshufb	SHUF_MASK(%rip),%xmm16,%xmm16
+
+	vmovdqa64	%xmm16,%xmm2
+	vpsllq	$1,%xmm16,%xmm16
+	vpsrlq	$63,%xmm2,%xmm2
+	vmovdqa	%xmm2,%xmm1
+	vpslldq	$8,%xmm2,%xmm2
+	vpsrldq	$8,%xmm1,%xmm1
+	vporq	%xmm2,%xmm16,%xmm16
+
+	vpshufd	$36,%xmm1,%xmm2
+	vpcmpeqd	TWOONE(%rip),%xmm2,%xmm2
+	vpand	POLY(%rip),%xmm2,%xmm2
+	vpxorq	%xmm2,%xmm16,%xmm16
+
+	vmovdqu64	%xmm16,336(%rsi)
+	vshufi32x4	$0x00,%ymm16,%ymm16,%ymm4
+	vmovdqa	%ymm4,%ymm3
+
+	vpclmulqdq	$0x11,%ymm4,%ymm3,%ymm0
+	vpclmulqdq	$0x00,%ymm4,%ymm3,%ymm1
+	vpclmulqdq	$0x01,%ymm4,%ymm3,%ymm2
+	vpclmulqdq	$0x10,%ymm4,%ymm3,%ymm3
+	vpxorq	%ymm2,%ymm3,%ymm3
+
+	vpsrldq	$8,%ymm3,%ymm2
+	vpslldq	$8,%ymm3,%ymm3
+	vpxorq	%ymm2,%ymm0,%ymm0
+	vpxorq	%ymm1,%ymm3,%ymm3
+
+
+
+	vmovdqu64	POLY2(%rip),%ymm2
+
+	vpclmulqdq	$0x01,%ymm3,%ymm2,%ymm1
+	vpslldq	$8,%ymm1,%ymm1
+	vpxorq	%ymm1,%ymm3,%ymm3
+
+
+
+	vpclmulqdq	$0x00,%ymm3,%ymm2,%ymm1
+	vpsrldq	$4,%ymm1,%ymm1
+	vpclmulqdq	$0x10,%ymm3,%ymm2,%ymm3
+	vpslldq	$4,%ymm3,%ymm3
+
+	vpternlogq	$0x96,%ymm1,%ymm0,%ymm3
+
+	vmovdqu64	%xmm3,320(%rsi)
+	vinserti64x2	$1,%xmm16,%ymm3,%ymm4
+	vmovdqa64	%ymm4,%ymm5
+
+	vpclmulqdq	$0x11,%ymm3,%ymm4,%ymm0
+	vpclmulqdq	$0x00,%ymm3,%ymm4,%ymm1
+	vpclmulqdq	$0x01,%ymm3,%ymm4,%ymm2
+	vpclmulqdq	$0x10,%ymm3,%ymm4,%ymm4
+	vpxorq	%ymm2,%ymm4,%ymm4
+
+	vpsrldq	$8,%ymm4,%ymm2
+	vpslldq	$8,%ymm4,%ymm4
+	vpxorq	%ymm2,%ymm0,%ymm0
+	vpxorq	%ymm1,%ymm4,%ymm4
+
+
+
+	vmovdqu64	POLY2(%rip),%ymm2
+
+	vpclmulqdq	$0x01,%ymm4,%ymm2,%ymm1
+	vpslldq	$8,%ymm1,%ymm1
+	vpxorq	%ymm1,%ymm4,%ymm4
+
+
+
+	vpclmulqdq	$0x00,%ymm4,%ymm2,%ymm1
+	vpsrldq	$4,%ymm1,%ymm1
+	vpclmulqdq	$0x10,%ymm4,%ymm2,%ymm4
+	vpslldq	$4,%ymm4,%ymm4
+
+	vpternlogq	$0x96,%ymm1,%ymm0,%ymm4
+
+	vmovdqu64	%ymm4,288(%rsi)
+
+	vinserti64x4	$1,%ymm5,%zmm4,%zmm4
+
+
+	vshufi64x2	$0x00,%zmm4,%zmm4,%zmm3
+	vmovdqa64	%zmm4,%zmm5
+
+	vpclmulqdq	$0x11,%zmm3,%zmm4,%zmm0
+	vpclmulqdq	$0x00,%zmm3,%zmm4,%zmm1
+	vpclmulqdq	$0x01,%zmm3,%zmm4,%zmm2
+	vpclmulqdq	$0x10,%zmm3,%zmm4,%zmm4
+	vpxorq	%zmm2,%zmm4,%zmm4
+
+	vpsrldq	$8,%zmm4,%zmm2
+	vpslldq	$8,%zmm4,%zmm4
+	vpxorq	%zmm2,%zmm0,%zmm0
+	vpxorq	%zmm1,%zmm4,%zmm4
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm2
+
+	vpclmulqdq	$0x01,%zmm4,%zmm2,%zmm1
+	vpslldq	$8,%zmm1,%zmm1
+	vpxorq	%zmm1,%zmm4,%zmm4
+
+
+
+	vpclmulqdq	$0x00,%zmm4,%zmm2,%zmm1
+	vpsrldq	$4,%zmm1,%zmm1
+	vpclmulqdq	$0x10,%zmm4,%zmm2,%zmm4
+	vpslldq	$4,%zmm4,%zmm4
+
+	vpternlogq	$0x96,%zmm1,%zmm0,%zmm4
+
+	vmovdqu64	%zmm4,224(%rsi)
+	vshufi64x2	$0x00,%zmm4,%zmm4,%zmm3
+
+	vpclmulqdq	$0x11,%zmm3,%zmm5,%zmm0
+	vpclmulqdq	$0x00,%zmm3,%zmm5,%zmm1
+	vpclmulqdq	$0x01,%zmm3,%zmm5,%zmm2
+	vpclmulqdq	$0x10,%zmm3,%zmm5,%zmm5
+	vpxorq	%zmm2,%zmm5,%zmm5
+
+	vpsrldq	$8,%zmm5,%zmm2
+	vpslldq	$8,%zmm5,%zmm5
+	vpxorq	%zmm2,%zmm0,%zmm0
+	vpxorq	%zmm1,%zmm5,%zmm5
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm2
+
+	vpclmulqdq	$0x01,%zmm5,%zmm2,%zmm1
+	vpslldq	$8,%zmm1,%zmm1
+	vpxorq	%zmm1,%zmm5,%zmm5
+
+
+
+	vpclmulqdq	$0x00,%zmm5,%zmm2,%zmm1
+	vpsrldq	$4,%zmm1,%zmm1
+	vpclmulqdq	$0x10,%zmm5,%zmm2,%zmm5
+	vpslldq	$4,%zmm5,%zmm5
+
+	vpternlogq	$0x96,%zmm1,%zmm0,%zmm5
+
+	vmovdqu64	%zmm5,160(%rsi)
+
+	vpclmulqdq	$0x11,%zmm3,%zmm4,%zmm0
+	vpclmulqdq	$0x00,%zmm3,%zmm4,%zmm1
+	vpclmulqdq	$0x01,%zmm3,%zmm4,%zmm2
+	vpclmulqdq	$0x10,%zmm3,%zmm4,%zmm4
+	vpxorq	%zmm2,%zmm4,%zmm4
+
+	vpsrldq	$8,%zmm4,%zmm2
+	vpslldq	$8,%zmm4,%zmm4
+	vpxorq	%zmm2,%zmm0,%zmm0
+	vpxorq	%zmm1,%zmm4,%zmm4
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm2
+
+	vpclmulqdq	$0x01,%zmm4,%zmm2,%zmm1
+	vpslldq	$8,%zmm1,%zmm1
+	vpxorq	%zmm1,%zmm4,%zmm4
+
+
+
+	vpclmulqdq	$0x00,%zmm4,%zmm2,%zmm1
+	vpsrldq	$4,%zmm1,%zmm1
+	vpclmulqdq	$0x10,%zmm4,%zmm2,%zmm4
+	vpslldq	$4,%zmm4,%zmm4
+
+	vpternlogq	$0x96,%zmm1,%zmm0,%zmm4
+
+	vmovdqu64	%zmm4,96(%rsi)
+	vzeroupper
+.Labort_init:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
+.globl	ossl_aes_gcm_setiv_avx512
+.type	ossl_aes_gcm_setiv_avx512,@function
+.align	32
+ossl_aes_gcm_setiv_avx512:
+.cfi_startproc	
+.Lsetiv_seh_begin:
+.byte	243,15,30,250
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+.Lsetiv_seh_push_rbx:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+.Lsetiv_seh_push_rbp:
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+.Lsetiv_seh_push_r12:
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+.Lsetiv_seh_push_r13:
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+.Lsetiv_seh_push_r14:
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lsetiv_seh_push_r15:
+
+
+
+
+
+
+
+
+
+
+	leaq	0(%rsp),%rbp
+.cfi_def_cfa_register	%rbp
+.Lsetiv_seh_setfp:
+
+.Lsetiv_seh_prolog_end:
+	subq	$820,%rsp
+	andq	$(-64),%rsp
+	cmpq	$12,%rcx
+	je	iv_len_12_init_IV
+	vpxor	%xmm2,%xmm2,%xmm2
+	movq	%rdx,%r10
+	movq	%rcx,%r11
+	orq	%r11,%r11
+	jz	.L_CALC_AAD_done_mBgdvxqgFGebeug
+
+	xorq	%rbx,%rbx
+	vmovdqa64	SHUF_MASK(%rip),%zmm16
+
+.L_get_AAD_loop48x16_mBgdvxqgFGebeug:
+	cmpq	$768,%r11
+	jl	.L_exit_AAD_loop48x16_mBgdvxqgFGebeug
+	vmovdqu64	0(%r10),%zmm11
+	vmovdqu64	64(%r10),%zmm3
+	vmovdqu64	128(%r10),%zmm4
+	vmovdqu64	192(%r10),%zmm5
+	vpshufb	%zmm16,%zmm11,%zmm11
+	vpshufb	%zmm16,%zmm3,%zmm3
+	vpshufb	%zmm16,%zmm4,%zmm4
+	vpshufb	%zmm16,%zmm5,%zmm5
+	testq	%rbx,%rbx
+	jnz	.L_skip_hkeys_precomputation_EzsAegbBbaerfwt
+
+	vmovdqu64	288(%rsi),%zmm1
+	vmovdqu64	%zmm1,704(%rsp)
+
+	vmovdqu64	224(%rsi),%zmm9
+	vmovdqu64	%zmm9,640(%rsp)
+
+
+	vshufi64x2	$0x00,%zmm9,%zmm9,%zmm9
+
+	vmovdqu64	160(%rsi),%zmm10
+	vmovdqu64	%zmm10,576(%rsp)
+
+	vmovdqu64	96(%rsi),%zmm12
+	vmovdqu64	%zmm12,512(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm10,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm10,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm10,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm10,%zmm10
+
+	vpsrldq	$8,%zmm10,%zmm17
+	vpslldq	$8,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm10,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vpclmulqdq	$0x00,%zmm10,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm10,%zmm17,%zmm10
+	vpslldq	$4,%zmm10,%zmm10
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm10
+
+	vmovdqu64	%zmm10,448(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm12,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm12,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm12,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm12,%zmm12
+
+	vpsrldq	$8,%zmm12,%zmm17
+	vpslldq	$8,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm12,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vpclmulqdq	$0x00,%zmm12,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm12,%zmm17,%zmm12
+	vpslldq	$4,%zmm12,%zmm12
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm12
+
+	vmovdqu64	%zmm12,384(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm10,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm10,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm10,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm10,%zmm10
+
+	vpsrldq	$8,%zmm10,%zmm17
+	vpslldq	$8,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm10,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vpclmulqdq	$0x00,%zmm10,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm10,%zmm17,%zmm10
+	vpslldq	$4,%zmm10,%zmm10
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm10
+
+	vmovdqu64	%zmm10,320(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm12,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm12,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm12,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm12,%zmm12
+
+	vpsrldq	$8,%zmm12,%zmm17
+	vpslldq	$8,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm12,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vpclmulqdq	$0x00,%zmm12,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm12,%zmm17,%zmm12
+	vpslldq	$4,%zmm12,%zmm12
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm12
+
+	vmovdqu64	%zmm12,256(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm10,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm10,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm10,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm10,%zmm10
+
+	vpsrldq	$8,%zmm10,%zmm17
+	vpslldq	$8,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm10,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vpclmulqdq	$0x00,%zmm10,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm10,%zmm17,%zmm10
+	vpslldq	$4,%zmm10,%zmm10
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm10
+
+	vmovdqu64	%zmm10,192(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm12,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm12,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm12,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm12,%zmm12
+
+	vpsrldq	$8,%zmm12,%zmm17
+	vpslldq	$8,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm12,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vpclmulqdq	$0x00,%zmm12,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm12,%zmm17,%zmm12
+	vpslldq	$4,%zmm12,%zmm12
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm12
+
+	vmovdqu64	%zmm12,128(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm10,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm10,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm10,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm10,%zmm10
+
+	vpsrldq	$8,%zmm10,%zmm17
+	vpslldq	$8,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm10,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vpclmulqdq	$0x00,%zmm10,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm10,%zmm17,%zmm10
+	vpslldq	$4,%zmm10,%zmm10
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm10
+
+	vmovdqu64	%zmm10,64(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm12,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm12,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm12,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm12,%zmm12
+
+	vpsrldq	$8,%zmm12,%zmm17
+	vpslldq	$8,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm12,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vpclmulqdq	$0x00,%zmm12,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm12,%zmm17,%zmm12
+	vpslldq	$4,%zmm12,%zmm12
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm12
+
+	vmovdqu64	%zmm12,0(%rsp)
+.L_skip_hkeys_precomputation_EzsAegbBbaerfwt:
+	movq	$1,%rbx
+	vpxorq	%zmm2,%zmm11,%zmm11
+	vmovdqu64	0(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm11,%zmm1
+	vpclmulqdq	$0x00,%zmm19,%zmm11,%zmm9
+	vpclmulqdq	$0x01,%zmm19,%zmm11,%zmm10
+	vpclmulqdq	$0x10,%zmm19,%zmm11,%zmm12
+	vmovdqu64	64(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm3,%zmm13
+	vpclmulqdq	$0x00,%zmm19,%zmm3,%zmm15
+	vpclmulqdq	$0x01,%zmm19,%zmm3,%zmm17
+	vpclmulqdq	$0x10,%zmm19,%zmm3,%zmm18
+	vpxorq	%zmm17,%zmm10,%zmm7
+	vpxorq	%zmm13,%zmm1,%zmm6
+	vpxorq	%zmm15,%zmm9,%zmm8
+	vpternlogq	$0x96,%zmm18,%zmm12,%zmm7
+	vmovdqu64	128(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm4,%zmm1
+	vpclmulqdq	$0x00,%zmm19,%zmm4,%zmm9
+	vpclmulqdq	$0x01,%zmm19,%zmm4,%zmm10
+	vpclmulqdq	$0x10,%zmm19,%zmm4,%zmm12
+	vmovdqu64	192(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm5,%zmm13
+	vpclmulqdq	$0x00,%zmm19,%zmm5,%zmm15
+	vpclmulqdq	$0x01,%zmm19,%zmm5,%zmm17
+	vpclmulqdq	$0x10,%zmm19,%zmm5,%zmm18
+
+	vpternlogq	$0x96,%zmm17,%zmm10,%zmm7
+	vpternlogq	$0x96,%zmm13,%zmm1,%zmm6
+	vpternlogq	$0x96,%zmm15,%zmm9,%zmm8
+	vpternlogq	$0x96,%zmm18,%zmm12,%zmm7
+	vmovdqu64	256(%r10),%zmm11
+	vmovdqu64	320(%r10),%zmm3
+	vmovdqu64	384(%r10),%zmm4
+	vmovdqu64	448(%r10),%zmm5
+	vpshufb	%zmm16,%zmm11,%zmm11
+	vpshufb	%zmm16,%zmm3,%zmm3
+	vpshufb	%zmm16,%zmm4,%zmm4
+	vpshufb	%zmm16,%zmm5,%zmm5
+	vmovdqu64	256(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm11,%zmm1
+	vpclmulqdq	$0x00,%zmm19,%zmm11,%zmm9
+	vpclmulqdq	$0x01,%zmm19,%zmm11,%zmm10
+	vpclmulqdq	$0x10,%zmm19,%zmm11,%zmm12
+	vmovdqu64	320(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm3,%zmm13
+	vpclmulqdq	$0x00,%zmm19,%zmm3,%zmm15
+	vpclmulqdq	$0x01,%zmm19,%zmm3,%zmm17
+	vpclmulqdq	$0x10,%zmm19,%zmm3,%zmm18
+	vpternlogq	$0x96,%zmm17,%zmm10,%zmm7
+	vpternlogq	$0x96,%zmm13,%zmm1,%zmm6
+	vpternlogq	$0x96,%zmm15,%zmm9,%zmm8
+	vpternlogq	$0x96,%zmm18,%zmm12,%zmm7
+	vmovdqu64	384(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm4,%zmm1
+	vpclmulqdq	$0x00,%zmm19,%zmm4,%zmm9
+	vpclmulqdq	$0x01,%zmm19,%zmm4,%zmm10
+	vpclmulqdq	$0x10,%zmm19,%zmm4,%zmm12
+	vmovdqu64	448(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm5,%zmm13
+	vpclmulqdq	$0x00,%zmm19,%zmm5,%zmm15
+	vpclmulqdq	$0x01,%zmm19,%zmm5,%zmm17
+	vpclmulqdq	$0x10,%zmm19,%zmm5,%zmm18
+
+	vpternlogq	$0x96,%zmm17,%zmm10,%zmm7
+	vpternlogq	$0x96,%zmm13,%zmm1,%zmm6
+	vpternlogq	$0x96,%zmm15,%zmm9,%zmm8
+	vpternlogq	$0x96,%zmm18,%zmm12,%zmm7
+	vmovdqu64	512(%r10),%zmm11
+	vmovdqu64	576(%r10),%zmm3
+	vmovdqu64	640(%r10),%zmm4
+	vmovdqu64	704(%r10),%zmm5
+	vpshufb	%zmm16,%zmm11,%zmm11
+	vpshufb	%zmm16,%zmm3,%zmm3
+	vpshufb	%zmm16,%zmm4,%zmm4
+	vpshufb	%zmm16,%zmm5,%zmm5
+	vmovdqu64	512(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm11,%zmm1
+	vpclmulqdq	$0x00,%zmm19,%zmm11,%zmm9
+	vpclmulqdq	$0x01,%zmm19,%zmm11,%zmm10
+	vpclmulqdq	$0x10,%zmm19,%zmm11,%zmm12
+	vmovdqu64	576(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm3,%zmm13
+	vpclmulqdq	$0x00,%zmm19,%zmm3,%zmm15
+	vpclmulqdq	$0x01,%zmm19,%zmm3,%zmm17
+	vpclmulqdq	$0x10,%zmm19,%zmm3,%zmm18
+	vpternlogq	$0x96,%zmm17,%zmm10,%zmm7
+	vpternlogq	$0x96,%zmm13,%zmm1,%zmm6
+	vpternlogq	$0x96,%zmm15,%zmm9,%zmm8
+	vpternlogq	$0x96,%zmm18,%zmm12,%zmm7
+	vmovdqu64	640(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm4,%zmm1
+	vpclmulqdq	$0x00,%zmm19,%zmm4,%zmm9
+	vpclmulqdq	$0x01,%zmm19,%zmm4,%zmm10
+	vpclmulqdq	$0x10,%zmm19,%zmm4,%zmm12
+	vmovdqu64	704(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm5,%zmm13
+	vpclmulqdq	$0x00,%zmm19,%zmm5,%zmm15
+	vpclmulqdq	$0x01,%zmm19,%zmm5,%zmm17
+	vpclmulqdq	$0x10,%zmm19,%zmm5,%zmm18
+
+	vpternlogq	$0x96,%zmm17,%zmm10,%zmm7
+	vpternlogq	$0x96,%zmm13,%zmm1,%zmm6
+	vpternlogq	$0x96,%zmm15,%zmm9,%zmm8
+	vpternlogq	$0x96,%zmm18,%zmm12,%zmm7
+
+	vpsrldq	$8,%zmm7,%zmm1
+	vpslldq	$8,%zmm7,%zmm9
+	vpxorq	%zmm1,%zmm6,%zmm6
+	vpxorq	%zmm9,%zmm8,%zmm8
+	vextracti64x4	$1,%zmm6,%ymm1
+	vpxorq	%ymm1,%ymm6,%ymm6
+	vextracti32x4	$1,%ymm6,%xmm1
+	vpxorq	%xmm1,%xmm6,%xmm6
+	vextracti64x4	$1,%zmm8,%ymm9
+	vpxorq	%ymm9,%ymm8,%ymm8
+	vextracti32x4	$1,%ymm8,%xmm9
+	vpxorq	%xmm9,%xmm8,%xmm8
+	vmovdqa64	POLY2(%rip),%xmm10
+
+
+	vpclmulqdq	$0x01,%xmm8,%xmm10,%xmm1
+	vpslldq	$8,%xmm1,%xmm1
+	vpxorq	%xmm1,%xmm8,%xmm1
+
+
+	vpclmulqdq	$0x00,%xmm1,%xmm10,%xmm9
+	vpsrldq	$4,%xmm9,%xmm9
+	vpclmulqdq	$0x10,%xmm1,%xmm10,%xmm2
+	vpslldq	$4,%xmm2,%xmm2
+	vpternlogq	$0x96,%xmm6,%xmm9,%xmm2
+
+	subq	$768,%r11
+	je	.L_CALC_AAD_done_mBgdvxqgFGebeug
+
+	addq	$768,%r10
+	jmp	.L_get_AAD_loop48x16_mBgdvxqgFGebeug
+
+.L_exit_AAD_loop48x16_mBgdvxqgFGebeug:
+
+	cmpq	$512,%r11
+	jl	.L_less_than_32x16_mBgdvxqgFGebeug
+
+	vmovdqu64	0(%r10),%zmm11
+	vmovdqu64	64(%r10),%zmm3
+	vmovdqu64	128(%r10),%zmm4
+	vmovdqu64	192(%r10),%zmm5
+	vpshufb	%zmm16,%zmm11,%zmm11
+	vpshufb	%zmm16,%zmm3,%zmm3
+	vpshufb	%zmm16,%zmm4,%zmm4
+	vpshufb	%zmm16,%zmm5,%zmm5
+	testq	%rbx,%rbx
+	jnz	.L_skip_hkeys_precomputation_xCxmdbgxoCdwefc
+
+	vmovdqu64	288(%rsi),%zmm1
+	vmovdqu64	%zmm1,704(%rsp)
+
+	vmovdqu64	224(%rsi),%zmm9
+	vmovdqu64	%zmm9,640(%rsp)
+
+
+	vshufi64x2	$0x00,%zmm9,%zmm9,%zmm9
+
+	vmovdqu64	160(%rsi),%zmm10
+	vmovdqu64	%zmm10,576(%rsp)
+
+	vmovdqu64	96(%rsi),%zmm12
+	vmovdqu64	%zmm12,512(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm10,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm10,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm10,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm10,%zmm10
+
+	vpsrldq	$8,%zmm10,%zmm17
+	vpslldq	$8,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm10,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vpclmulqdq	$0x00,%zmm10,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm10,%zmm17,%zmm10
+	vpslldq	$4,%zmm10,%zmm10
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm10
+
+	vmovdqu64	%zmm10,448(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm12,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm12,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm12,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm12,%zmm12
+
+	vpsrldq	$8,%zmm12,%zmm17
+	vpslldq	$8,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm12,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vpclmulqdq	$0x00,%zmm12,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm12,%zmm17,%zmm12
+	vpslldq	$4,%zmm12,%zmm12
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm12
+
+	vmovdqu64	%zmm12,384(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm10,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm10,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm10,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm10,%zmm10
+
+	vpsrldq	$8,%zmm10,%zmm17
+	vpslldq	$8,%zmm10,%zmm10
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm10,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm10,%zmm10
+
+
+
+	vpclmulqdq	$0x00,%zmm10,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm10,%zmm17,%zmm10
+	vpslldq	$4,%zmm10,%zmm10
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm10
+
+	vmovdqu64	%zmm10,320(%rsp)
+
+	vpclmulqdq	$0x11,%zmm9,%zmm12,%zmm13
+	vpclmulqdq	$0x00,%zmm9,%zmm12,%zmm15
+	vpclmulqdq	$0x01,%zmm9,%zmm12,%zmm17
+	vpclmulqdq	$0x10,%zmm9,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm12,%zmm12
+
+	vpsrldq	$8,%zmm12,%zmm17
+	vpslldq	$8,%zmm12,%zmm12
+	vpxorq	%zmm17,%zmm13,%zmm13
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vmovdqu64	POLY2(%rip),%zmm17
+
+	vpclmulqdq	$0x01,%zmm12,%zmm17,%zmm15
+	vpslldq	$8,%zmm15,%zmm15
+	vpxorq	%zmm15,%zmm12,%zmm12
+
+
+
+	vpclmulqdq	$0x00,%zmm12,%zmm17,%zmm15
+	vpsrldq	$4,%zmm15,%zmm15
+	vpclmulqdq	$0x10,%zmm12,%zmm17,%zmm12
+	vpslldq	$4,%zmm12,%zmm12
+
+	vpternlogq	$0x96,%zmm15,%zmm13,%zmm12
+
+	vmovdqu64	%zmm12,256(%rsp)
+.L_skip_hkeys_precomputation_xCxmdbgxoCdwefc:
+	movq	$1,%rbx
+	vpxorq	%zmm2,%zmm11,%zmm11
+	vmovdqu64	256(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm11,%zmm1
+	vpclmulqdq	$0x00,%zmm19,%zmm11,%zmm9
+	vpclmulqdq	$0x01,%zmm19,%zmm11,%zmm10
+	vpclmulqdq	$0x10,%zmm19,%zmm11,%zmm12
+	vmovdqu64	320(%rsp),%zmm19
+	vpclmulqdq	$0x11,%zmm19,%zmm3,%zmm13
+	vpclmulqdq	$0x00,%zmm19,%zmm3,%zmm15
+	vpclmulqdq	$0x01,%zmm19,%zmm3,%zmm17
*** 135867 LINES SKIPPED ***