svn commit: r300819 - in vendor-sys/skein: . dist dist/Additional_Implementations dist/Optimized_32bit dist/Optimized_64bit dist/README dist/Reference_Implementation dist/Supporting_Documentation d...
Allan Jude
allanjude at FreeBSD.org
Fri May 27 02:42:48 UTC 2016
Author: allanjude
Date: Fri May 27 02:42:46 2016
New Revision: 300819
URL: https://svnweb.freebsd.org/changeset/base/300819
Log:
Import Skein 1.3
Bruce Schneier's hashing algorithm
Used by newer versions of ZFS
Added:
vendor-sys/skein/
vendor-sys/skein/dist/
vendor-sys/skein/dist/Additional_Implementations/
vendor-sys/skein/dist/Additional_Implementations/Atmel_AVR.c (contents, props changed)
vendor-sys/skein/dist/Additional_Implementations/skein_8bit_estimates.xls (contents, props changed)
vendor-sys/skein/dist/Additional_Implementations/skein_MSC_v9_perf.txt (contents, props changed)
vendor-sys/skein/dist/Additional_Implementations/skein_block_x64.asm
vendor-sys/skein/dist/Additional_Implementations/skein_block_x64.s (contents, props changed)
vendor-sys/skein/dist/Additional_Implementations/skein_block_x86.asm
vendor-sys/skein/dist/Additional_Implementations/skein_block_xmm32.asm
vendor-sys/skein/dist/Additional_Implementations/skein_block_xmm32.s (contents, props changed)
vendor-sys/skein/dist/Additional_Implementations/skein_perf_core2.txt (contents, props changed)
vendor-sys/skein/dist/Additional_Implementations/skein_rot_search2.c (contents, props changed)
vendor-sys/skein/dist/Additional_Implementations/skein_test.c (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/
vendor-sys/skein/dist/Optimized_32bit/SHA3api_ref.c (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/SHA3api_ref.h (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/brg_endian.h (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/brg_types.h (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/skein.c (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/skein.h (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/skein_block.c (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/skein_debug.c (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/skein_debug.h (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/skein_iv.h (contents, props changed)
vendor-sys/skein/dist/Optimized_32bit/skein_port.h (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/
vendor-sys/skein/dist/Optimized_64bit/SHA3api_ref.c (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/SHA3api_ref.h (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/brg_endian.h (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/brg_types.h (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/skein.c (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/skein.h (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/skein_block.c (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/skein_debug.c (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/skein_debug.h (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/skein_iv.h (contents, props changed)
vendor-sys/skein/dist/Optimized_64bit/skein_port.h (contents, props changed)
vendor-sys/skein/dist/README/
vendor-sys/skein/dist/README/readme.txt (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/
vendor-sys/skein/dist/Reference_Implementation/SHA3api_ref.c (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/SHA3api_ref.h (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/brg_endian.h (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/brg_types.h (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/skein.c (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/skein.h (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/skein_block.c (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/skein_debug.c (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/skein_debug.h (contents, props changed)
vendor-sys/skein/dist/Reference_Implementation/skein_port.h (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/
vendor-sys/skein/dist/Supporting_Documentation/Skein Cover Sheet.pdf (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/Skein_Implementation_Statement.pdf (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/Skein_Submitter_Statement.pdf (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/skein1.3.pdf (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/skeinround3Mods.pdf (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/
vendor-sys/skein/dist/Supporting_Documentation/tex/key_recover.pdf (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/reverserounds256.pdf (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-21.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-22.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-23.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-24.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-25.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-31.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-32.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-33.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-41.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-42.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-51.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-52.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-53.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-61.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-71.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein-81.mps (contents, props changed)
vendor-sys/skein/dist/Supporting_Documentation/tex/skein1.3.tex
vendor-sys/skein/dist/Supporting_Documentation/tex/skeinround3Mods.tex
Added: vendor-sys/skein/dist/Additional_Implementations/Atmel_AVR.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ vendor-sys/skein/dist/Additional_Implementations/Atmel_AVR.c Fri May 27 02:42:46 2016 (r300819)
@@ -0,0 +1,77 @@
+#include <stdio.h>
+#include "skein.h"
+
+#define SKEIN_CODE_SIZE (1) /* instantiate code size routines */
+#define SKEIN_LOOP (111) /* unroll only 8 rounds */
+#define SKEIN_USE_ASM (512+1024) /* what to exclude here */
+#include "skein.c"
+#include "skein_block.c"
+
+/*
+ * Empty stand-ins for the block functions selected by the SKEIN_USE_ASM
+ * mask (bits 256, 512, 1024), so that the corresponding symbols still
+ * exist in this code-size build.
+ */
+#if (SKEIN_USE_ASM & 256) != 0
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,
+ const u08b_t *blkPtr,
+ size_t blkCnt,
+ size_t byteCntAdd)
+ {
+ }
+#endif
+
+#if (SKEIN_USE_ASM & 512) != 0
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,
+ const u08b_t *blkPtr,
+ size_t blkCnt,
+ size_t byteCntAdd)
+ {
+ }
+#endif
+
+#if (SKEIN_USE_ASM & 1024) != 0
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,
+ const u08b_t *blkPtr,
+ size_t blkCnt,
+ size_t byteCntAdd)
+ {
+ }
+#endif
+
+/* One-byte (zero) test message that main() feeds to the hash functions. */
+const u08b_t msg[1] =
+ {
+ 0
+ };
+
+/*
+ * Code-size / smoke-test driver.
+ *
+ * For every Skein variant that is NOT excluded by the SKEIN_USE_ASM mask,
+ * record the API and block-function code sizes and the unroll count, hash
+ * the one-byte message msg[], and make one direct Process_Block call.
+ * Afterwards print the recorded figures and the first five hash bytes,
+ * each followed by the running XOR of the bytes printed so far.
+ *
+ * argc/argv are unused.  Returns 0.
+ */
+int main(int argc,char *argv[])
+ {
+ u08b_t hash[1024/8] = { 0 }; /* defined contents even if every variant is excluded */
+ u08b_t i,x;
+ static size_t aBytes,bBytes,uCount;
+
+ (void) argc;
+ (void) argv;
+
+ /*
+ * Each variant lives in its own scope so that enabling more than one
+ * of them does not redeclare ctx.  When several are enabled, the
+ * figures printed below are those of the last enabled variant.
+ * The factor 2 on the code sizes presumably converts AVR words to
+ * bytes -- TODO confirm against the *_CodeSize() routines.
+ * NOTE(review): msg[] holds a single byte; confirm that the direct
+ * one-block Process_Block calls do not read past it.
+ */
+#if !(SKEIN_USE_ASM & 256)
+ {
+ Skein_256_Ctxt_t ctx;
+
+ aBytes = 2*Skein_256_API_CodeSize();
+ bBytes = 2*Skein_256_Process_Block_CodeSize();
+ uCount = Skein_256_Unroll_Cnt();
+
+ Skein_256_Init (&ctx,256);
+ Skein_256_Update(&ctx,msg,sizeof(msg));
+ Skein_256_Final (&ctx,hash);
+
+ Skein_256_Process_Block(&ctx,msg,1,256);
+ }
+#endif
+
+#if !(SKEIN_USE_ASM & 512)
+ {
+ Skein_512_Ctxt_t ctx;
+
+ aBytes = 2*Skein_512_API_CodeSize();
+ bBytes = 2*Skein_512_Process_Block_CodeSize();
+ uCount = Skein_512_Unroll_Cnt();
+
+ Skein_512_Init (&ctx,512);
+ Skein_512_Update(&ctx,msg,sizeof(msg));
+ Skein_512_Final (&ctx,hash);
+
+ Skein_512_Process_Block(&ctx,msg,1,512);
+ }
+#endif
+
+#if !(SKEIN_USE_ASM & 1024)
+ {
+ Skein1024_Ctxt_t ctx;
+
+ aBytes = 2*Skein1024_API_CodeSize();
+ bBytes = 2*Skein1024_Process_Block_CodeSize();
+ uCount = Skein1024_Unroll_Cnt();
+
+ Skein1024_Init (&ctx,1024);
+ Skein1024_Update(&ctx,msg,sizeof(msg));
+ Skein1024_Final (&ctx,hash);
+
+ Skein1024_Process_Block(&ctx,msg,1,1024);
+ }
+#endif
+ /* size_t is not int: convert explicitly and use a matching conversion */
+ printf("API size = %4lu bytes. Block size = %4lu bytes. Unroll=%lu\n",
+ (unsigned long) aBytes,(unsigned long) bBytes,(unsigned long) uCount);
+ for (i=x=0;i<5;i++)
+ printf("hash[%d] = %02X [%02X]\n",i,hash[i],x ^= hash[i]);
+
+ return 0;
+ }
Added: vendor-sys/skein/dist/Additional_Implementations/skein_8bit_estimates.xls
==============================================================================
Binary file. No diff available.
Added: vendor-sys/skein/dist/Additional_Implementations/skein_MSC_v9_perf.txt
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ vendor-sys/skein/dist/Additional_Implementations/skein_MSC_v9_perf.txt Fri May 27 02:42:46 2016 (r300819)
@@ -0,0 +1,129 @@
+File STDIN:
+ 1_ || 2802.00 2814.00 | 5952.00 5952.00 | 30606.00 30606.00 | //: 32-bit, MSC_v9.00 [ C =...]
+ 10_ || 278.40 278.40 | 593.40 593.40 | 3063.00 3063.00 | //: 32-bit, MSC_v9.00 [ C =...]
+ 100_ || 65.52 65.58 | 88.02 88.08 | 306.30 306.30 | //: 32-bit, MSC_v9.00 [ C =...]
+ 1000_ || 41.26 41.41 | 47.96 47.96 | 135.28 135.29 | //: 32-bit, MSC_v9.00 [ C =...]
+ 10000_ || 38.86 39.08 | 44.13 44.21 | 119.88 120.11 | //: 32-bit, MSC_v9.00 [ C =...]
+ 100000_ || 38.85 39.09 | 43.56 43.77 | 105.79 114.18 | //: 32-bit, MSC_v9.00 [ C =...]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+ Block || 10192 bytes | 22960 bytes | 53072 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+ 1_ || 780.00 786.00 | 1110.00 1110.00 | 3288.00 3318.00 | //: 64-bit, MSC_v9.00 [ C =...]
+ 10_ || 78.60 79.80 | 109.80 109.80 | 331.20 331.80 | //: 64-bit, MSC_v9.00 [ C =...]
+ 100_ || 16.74 16.80 | 15.54 15.54 | 33.30 33.30 | //: 64-bit, MSC_v9.00 [ C =...]
+ 1000_ || 9.88 10.67 | 7.38 7.38 | 14.16 14.17 | //: 64-bit, MSC_v9.00 [ C =...]
+ 10000_ || 9.21 9.22 | 6.60 6.60 | 12.27 12.39 | //: 64-bit, MSC_v9.00 [ C =...]
+ 100000_ || 9.98 10.01 | 7.04 7.08 | 12.36 13.14 | //: 64-bit, MSC_v9.00 [ C =...]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+ Block || 2272 bytes | 4944 bytes | 15264 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+ 1_ || 2484.00 2490.00 | 4830.00 4836.00 | 22182.00 22188.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 10_ || 250.20 252.00 | 485.40 488.40 | 1936.80 1959.00 | //: 32-bit, MSC_v9.00 [asm=...]
+ 100_ || 58.62 58.68 | 70.74 70.80 | 221.76 221.76 | //: 32-bit, MSC_v9.00 [asm=...]
+ 1000_ || 34.12 34.16 | 35.44 35.44 | 85.27 85.31 | //: 32-bit, MSC_v9.00 [asm=...]
+ 10000_ || 34.78 34.98 | 35.36 35.36 | 86.31 86.35 | //: 32-bit, MSC_v9.00 [asm=...]
+ 100000_ || 32.96 33.40 | 33.29 33.60 | 75.79 76.81 | //: 32-bit, MSC_v9.00 [asm=...]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+ Block || 7588 bytes | 16636 bytes | 38262 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+ 1_ || 672.00 672.00 | 1068.00 1068.00 | 1920.00 1926.00 | //: 64-bit, MSC_v9.00 [asm=...]
+ 10_ || 64.80 65.40 | 107.40 108.00 | 192.00 192.60 | //: 64-bit, MSC_v9.00 [asm=...]
+ 100_ || 15.54 15.60 | 16.20 16.26 | 21.06 21.06 | //: 64-bit, MSC_v9.00 [asm=...]
+ 1000_ || 8.18 8.18 | 6.97 6.97 | 7.77 7.78 | //: 64-bit, MSC_v9.00 [asm=...]
+ 10000_ || 7.59 7.59 | 6.23 6.23 | 6.69 6.69 | //: 64-bit, MSC_v9.00 [asm=...]
+ 100000_ || 7.55 7.71 | 6.14 6.38 | 6.56 6.86 | //: 64-bit, MSC_v9.00 [asm=...]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+ Block || 2323 bytes | 4733 bytes | 11817 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+ 1_ || 2952.00 2958.00 | 6030.00 6036.00 | 13668.00 13674.00 | //: 32-bit, MSC_v9.00 [ C =111]
+ 10_ || 295.80 295.80 | 603.00 603.60 | 1366.80 1366.80 | //: 32-bit, MSC_v9.00 [ C =111]
+ 100_ || 69.96 70.02 | 88.98 89.04 | 136.92 137.52 | //: 32-bit, MSC_v9.00 [ C =111]
+ 1000_ || 43.90 43.96 | 48.78 48.85 | 60.08 60.11 | //: 32-bit, MSC_v9.00 [ C =111]
+ 10000_ || 41.53 41.59 | 44.76 44.80 | 53.01 53.01 | //: 32-bit, MSC_v9.00 [ C =111]
+ 100000_ || 41.32 41.60 | 44.52 44.62 | 51.75 51.92 | //: 32-bit, MSC_v9.00 [ C =111]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+ Block || 1712 bytes | 3664 bytes | 7200 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+ 1_ || 780.00 786.00 | 1422.00 1434.00 | 3810.00 3816.00 | //: 64-bit, MSC_v9.00 [ C =111]
+ 10_ || 75.60 76.20 | 140.40 140.40 | 380.40 381.00 | //: 64-bit, MSC_v9.00 [ C =111]
+ 100_ || 17.16 17.22 | 20.52 21.00 | 38.22 38.28 | //: 64-bit, MSC_v9.00 [ C =111]
+ 1000_ || 9.69 9.69 | 10.42 10.42 | 16.51 16.51 | //: 64-bit, MSC_v9.00 [ C =111]
+ 10000_ || 8.97 8.97 | 9.38 9.38 | 14.38 14.40 | //: 64-bit, MSC_v9.00 [ C =111]
+ 100000_ || 9.18 9.71 | 9.35 9.49 | 14.79 14.99 | //: 64-bit, MSC_v9.00 [ C =111]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+ Block || 704 bytes | 1456 bytes | 2976 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+ 1_ || 2580.00 2598.00 | 4842.00 4848.00 | 10578.00 10602.00 | //: 32-bit, MSC_v9.00 [asm=111]
+ 10_ || 259.80 259.80 | 484.20 484.20 | 1059.60 1060.20 | //: 32-bit, MSC_v9.00 [asm=111]
+ 100_ || 57.18 57.24 | 66.42 66.48 | 98.40 98.46 | //: 32-bit, MSC_v9.00 [asm=111]
+ 1000_ || 35.56 35.59 | 35.96 35.96 | 42.79 42.80 | //: 32-bit, MSC_v9.00 [asm=111]
+ 10000_ || 33.69 36.50 | 33.29 33.42 | 37.98 41.34 | //: 32-bit, MSC_v9.00 [asm=111]
+ 100000_ || 33.96 34.57 | 33.93 35.69 | 38.04 38.20 | //: 32-bit, MSC_v9.00 [asm=111]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+ Block || 1276 bytes | 2532 bytes | 4983 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+ 1_ || 678.00 678.00 | 1098.00 1098.00 | 2034.00 2040.00 | //: 64-bit, MSC_v9.00 [asm=111]
+ 10_ || 66.60 66.60 | 109.80 109.80 | 204.00 204.00 | //: 64-bit, MSC_v9.00 [asm=111]
+ 100_ || 15.48 16.68 | 16.98 16.98 | 22.38 22.38 | //: 64-bit, MSC_v9.00 [asm=111]
+ 1000_ || 8.45 8.45 | 7.93 7.93 | 8.39 8.39 | //: 64-bit, MSC_v9.00 [asm=111]
+ 10000_ || 7.81 7.81 | 6.50 6.50 | 7.18 7.18 | //: 64-bit, MSC_v9.00 [asm=111]
+ 100000_ || 8.08 8.09 | 6.40 6.71 | 6.98 7.21 | //: 64-bit, MSC_v9.00 [asm=111]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+ Block || 664 bytes | 1074 bytes | 2221 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+ 1_ || 2988.00 2994.00 | 6240.00 6246.00 | 13794.00 13800.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 10_ || 297.60 299.40 | 623.40 624.00 | 1379.40 1380.00 | //: 32-bit, MSC_v9.00 [ C =332]
+ 100_ || 70.26 70.32 | 91.92 91.92 | 138.00 138.06 | //: 32-bit, MSC_v9.00 [ C =332]
+ 1000_ || 44.88 44.89 | 50.20 50.20 | 60.44 60.45 | //: 32-bit, MSC_v9.00 [ C =332]
+ 10000_ || 42.42 42.42 | 46.30 46.31 | 53.29 53.31 | //: 32-bit, MSC_v9.00 [ C =332]
+ 100000_ || 42.21 42.50 | 43.60 45.77 | 49.55 50.03 | //: 32-bit, MSC_v9.00 [ C =332]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+ Block || 4560 bytes | 9232 bytes | 12560 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+ 1_ || 780.00 798.00 | 1920.00 1920.00 | 3732.00 3732.00 | //: 64-bit, MSC_v9.00 [ C =332]
+ 10_ || 76.80 76.80 | 189.00 191.40 | 402.60 402.60 | //: 64-bit, MSC_v9.00 [ C =332]
+ 100_ || 17.10 17.16 | 27.66 27.90 | 37.62 37.62 | //: 64-bit, MSC_v9.00 [ C =332]
+ 1000_ || 9.98 10.12 | 14.23 14.25 | 16.13 16.13 | //: 64-bit, MSC_v9.00 [ C =332]
+ 10000_ || 9.27 9.28 | 12.89 12.99 | 13.98 13.98 | //: 64-bit, MSC_v9.00 [ C =332]
+ 100000_ || 9.32 9.56 | 13.12 13.19 | 14.15 14.23 | //: 64-bit, MSC_v9.00 [ C =332]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+ Block || 1200 bytes | 2928 bytes | 5008 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+ 1_ || 2598.00 2604.00 | 4866.00 4878.00 | 10614.00 10632.00 | //: 32-bit, MSC_v9.00 [asm=332]
+ 10_ || 260.40 261.00 | 490.20 490.20 | 1067.40 1067.40 | //: 32-bit, MSC_v9.00 [asm=332]
+ 100_ || 60.78 60.78 | 72.00 72.00 | 106.86 106.92 | //: 32-bit, MSC_v9.00 [asm=332]
+ 1000_ || 38.38 38.42 | 39.17 39.19 | 46.49 46.61 | //: 32-bit, MSC_v9.00 [asm=332]
+ 10000_ || 40.98 47.69 | 35.81 35.86 | 40.96 43.93 | //: 32-bit, MSC_v9.00 [asm=332]
+ 100000_ || 34.46 36.34 | 34.07 37.16 | 39.60 43.18 | //: 32-bit, MSC_v9.00 [asm=332]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+ Block || 3060 bytes | 6300 bytes | 8835 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+ 1_ || 684.00 690.00 | 1104.00 1104.00 | 2028.00 2034.00 | //: 64-bit, MSC_v9.00 [asm=332]
+ 10_ || 70.80 70.80 | 120.00 120.00 | 219.00 219.00 | //: 64-bit, MSC_v9.00 [asm=332]
+ 100_ || 15.72 15.72 | 16.74 16.74 | 22.20 22.20 | //: 64-bit, MSC_v9.00 [asm=332]
+ 1000_ || 8.42 8.42 | 7.22 7.22 | 8.30 8.30 | //: 64-bit, MSC_v9.00 [asm=332]
+ 10000_ || 7.85 8.51 | 6.58 6.58 | 7.11 7.12 | //: 64-bit, MSC_v9.00 [asm=332]
+ 100000_ || 7.80 9.43 | 6.90 7.71 | 7.18 8.48 | //: 64-bit, MSC_v9.00 [asm=332]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+ Block || 1288 bytes | 2182 bytes | 3449 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+ 1_ || 2994.00 2994.00 | 6240.00 6240.00 | 14598.00 14604.00 | //: 32-bit, MSC_v9.00 [ C =335]
+ 10_ || 300.60 301.20 | 624.00 624.60 | 1459.20 1461.00 | //: 32-bit, MSC_v9.00 [ C =335]
+ 100_ || 70.62 70.68 | 91.86 91.92 | 146.10 146.16 | //: 32-bit, MSC_v9.00 [ C =335]
+ 1000_ || 44.65 44.65 | 50.20 50.20 | 62.74 62.76 | //: 32-bit, MSC_v9.00 [ C =335]
+ 10000_ || 42.16 42.42 | 46.31 46.73 | 55.11 55.13 | //: 32-bit, MSC_v9.00 [ C =335]
+ 100000_ || 40.09 40.55 | 45.76 45.97 | 51.00 53.08 | //: 32-bit, MSC_v9.00 [ C =335]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+ Block || 4560 bytes | 9232 bytes | 29280 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+ 1_ || 780.00 798.00 | 1890.00 1920.00 | 3498.00 3498.00 | //: 64-bit, MSC_v9.00 [ C =335]
+ 10_ || 77.40 78.00 | 190.80 195.00 | 350.40 379.20 | //: 64-bit, MSC_v9.00 [ C =335]
+ 100_ || 17.10 17.10 | 27.72 28.08 | 35.28 35.28 | //: 64-bit, MSC_v9.00 [ C =335]
+ 1000_ || 9.95 10.00 | 14.23 14.24 | 15.09 15.10 | //: 64-bit, MSC_v9.00 [ C =335]
+ 10000_ || 9.30 10.06 | 12.94 14.10 | 13.07 14.36 | //: 64-bit, MSC_v9.00 [ C =335]
+ 100000_ || 9.33 9.58 | 13.94 13.95 | 13.24 13.92 | //: 64-bit, MSC_v9.00 [ C =335]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+ Block || 1200 bytes | 2928 bytes | 10880 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+ 1_ || 2586.00 2592.00 | 4896.00 4902.00 | 10668.00 10668.00 | //: 32-bit, MSC_v9.00 [asm=335]
+ 10_ || 263.40 263.40 | 489.60 489.60 | 1069.20 1069.80 | //: 32-bit, MSC_v9.00 [asm=335]
+ 100_ || 61.08 61.14 | 72.30 72.36 | 107.04 107.10 | //: 32-bit, MSC_v9.00 [asm=335]
+ 1000_ || 35.57 35.57 | 36.11 36.12 | 43.07 43.12 | //: 32-bit, MSC_v9.00 [asm=335]
+ 10000_ || 33.68 34.51 | 33.29 36.32 | 37.91 39.80 | //: 32-bit, MSC_v9.00 [asm=335]
+ 100000_ || 36.32 36.43 | 35.91 35.98 | 38.02 38.19 | //: 32-bit, MSC_v9.00 [asm=335]
+ API || 864 bytes | 704 bytes | 720 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+ Block || 3060 bytes | 6300 bytes | 20391 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+ 1_ || 684.00 690.00 | 1104.00 1104.00 | 2022.00 2022.00 | //: 64-bit, MSC_v9.00 [asm=335]
+ 10_ || 65.40 65.40 | 109.80 109.80 | 201.60 202.20 | //: 64-bit, MSC_v9.00 [asm=335]
+ 100_ || 15.78 15.78 | 16.80 16.80 | 22.02 22.08 | //: 64-bit, MSC_v9.00 [asm=335]
+ 1000_ || 8.41 8.42 | 7.21 7.22 | 8.24 8.26 | //: 64-bit, MSC_v9.00 [asm=335]
+ 10000_ || 7.84 7.84 | 6.45 6.50 | 7.12 7.12 | //: 64-bit, MSC_v9.00 [asm=335]
+ 100000_ || 8.11 8.11 | 6.49 6.74 | 6.95 7.26 | //: 64-bit, MSC_v9.00 [asm=335]
+ API || 992 bytes | 1312 bytes | 864 bytes | //: 64-bit, MSC_v9.00 [asm=335]
+ Block || 1288 bytes | 2182 bytes | 7133 bytes | //: 64-bit, MSC_v9.00 [asm=335]
Added: vendor-sys/skein/dist/Additional_Implementations/skein_block_x64.asm
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ vendor-sys/skein/dist/Additional_Implementations/skein_block_x64.asm Fri May 27 02:42:46 2016 (r300819)
@@ -0,0 +1,1335 @@
+;
+;----------------------------------------------------------------
+; 64-bit x86 assembler code (Microsoft ML64) for Skein block functions
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+ .code
+;
+; Assembly-time limits and variant selection.
+; _USE_ASM_ ends up as a bit mask built from the values 256, 512 and 1024;
+; it is tested later (e.g. "if _USE_ASM_ and 256") to decide which block
+; functions are assembled. When SKEIN_USE_ASM is undefined, or has none of
+; the three bits set, all three variants are selected.
+_MASK_ALL_ equ (256+512+1024) ;all three algorithm bits
+_MAX_FRAME_ equ 240 ;upper bound applied to FRAME_OFFS in Setup_Stack
+;
+;;;;;;;;;;;;;;;;;
+ifndef SKEIN_USE_ASM
+_USE_ASM_ = _MASK_ALL_
+elseif SKEIN_USE_ASM and _MASK_ALL_
+_USE_ASM_ = SKEIN_USE_ASM
+else
+_USE_ASM_ = _MASK_ALL_
+endif
+;;;;;;;;;;;;;;;;;
+; Loop-unrolling configuration.
+; SKEIN_LOOP is read as a three-digit decimal number: the hundreds digit is
+; the unroll count for Skein-256, the tens digit for Skein-512 and the units
+; digit for Skein-1024. A digit of 0 means "fully unrolled".
+ifndef SKEIN_LOOP ;configure loop unrolling
+_SKEIN_LOOP = 0 ;default is all fully unrolled
+else
+_SKEIN_LOOP = SKEIN_LOOP
+endif
+; the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) mod 10
+SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) mod 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) mod 10
+;
+; SKEIN_ASM_UNROLL becomes the sum (bit mask) of the block sizes whose
+; unroll count is 0, i.e. the variants that are fully unrolled.
+SKEIN_ASM_UNROLL = 0
+ irp _NN_,<256,512,1024>
+ if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + _NN_
+ endif
+ endm
+;;;;;;;;;;;;;;;;;
+;
+; Round counts per variant.
+; Defaults are 72/72/80 rounds for 256/512/1024. When SKEIN_ROUNDS is
+; defined, each of its decimal digits d (hundreds -> 256, tens -> 512,
+; units -> 1024) is mapped to 8*(((d+5) mod 10)+5) rounds; for example
+; d=0 gives 80, d=9 gives 72 and d=5 gives 40.
+ifndef SKEIN_ROUNDS
+ROUNDS_256 = 72
+ROUNDS_512 = 72
+ROUNDS_1024 = 80
+else
+ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)
+ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) mod 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) mod 10) + 5)
+endif
+;
+; Report the chosen round count of every enabled variant at assembly time.
+irp _NN_,<256,512,1024>
+ if _USE_ASM_ and _NN_
+ irp _RR_,<%(ROUNDS_&_NN_)>
+ if _NN_ eq 1024
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ else
+%out +++ SKEIN_ROUNDS_&_NN_ = _RR_
+ endif
+ endm
+ endif
+endm
+;;;;;;;;;;;;;;;;;
+;
+; Defining SKEIN_PERF implies SKEIN_CODE_SIZE unless the latter was already
+; defined by the caller.
+ifndef SKEIN_CODE_SIZE
+ifdef SKEIN_PERF
+SKEIN_CODE_SIZE equ (1)
+endif
+endif
+;
+;;;;;;;;;;;;;;;;;
+;
+; _SKEIN_DEBUG is 1 when SKEIN_DEBUG is defined and 0 otherwise, so later
+; code can test it with a plain "if".
+ifndef SKEIN_DEBUG
+_SKEIN_DEBUG = 0
+else
+_SKEIN_DEBUG = 1
+endif
+;;;;;;;;;;;;;;;;;
+;
+; define offsets of fields in hash context structure
+;
+; Byte offsets, each derived from the previous one: the values work out to
+; HASH_BITS=0, BCNT=8, TWEAK=16 and X_VARS=32.
+HASH_BITS = 0 ;# bits of hash output
+BCNT = 8 + HASH_BITS ;number of bytes in BUFFER[]
+TWEAK = 8 + BCNT ;tweak values[0..1]
+X_VARS = 16 + TWEAK ;chaining vars
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+;
+; two-character spellings of r8/r9, matching the width of r10..r15
+r08 equ <r8>
+r09 equ <r9>
+;
+KW_PARITY = 01BD11BDAA9FC1A22h ;overall parity of key schedule words
+; all bits set except bit 62 (presumably used to clear a flag bit in the
+; tweak -- its use is not visible in this part of the file)
+FIRST_MASK = NOT (1 SHL 62)
+;
+; rotation constants for Skein
+;
+; Naming: RC_<block bits>_<round number 0..7>_<mix number>. The RotL64
+; macro pastes these three fields together to look up the rotate count.
+; There are 2 mix entries per round for 256, 4 for 512 and 8 for 1024.
+;
+RC_256_0_0 = 14
+RC_256_0_1 = 16

+RC_256_1_0 = 52
+RC_256_1_1 = 57

+RC_256_2_0 = 23
+RC_256_2_1 = 40

+RC_256_3_0 = 5
+RC_256_3_1 = 37

+RC_256_4_0 = 25
+RC_256_4_1 = 33

+RC_256_5_0 = 46
+RC_256_5_1 = 12

+RC_256_6_0 = 58
+RC_256_6_1 = 22

+RC_256_7_0 = 32
+RC_256_7_1 = 32

+RC_512_0_0 = 46
+RC_512_0_1 = 36
+RC_512_0_2 = 19
+RC_512_0_3 = 37

+RC_512_1_0 = 33
+RC_512_1_1 = 27
+RC_512_1_2 = 14
+RC_512_1_3 = 42

+RC_512_2_0 = 17
+RC_512_2_1 = 49
+RC_512_2_2 = 36
+RC_512_2_3 = 39

+RC_512_3_0 = 44
+RC_512_3_1 = 9
+RC_512_3_2 = 54
+RC_512_3_3 = 56

+RC_512_4_0 = 39
+RC_512_4_1 = 30
+RC_512_4_2 = 34
+RC_512_4_3 = 24

+RC_512_5_0 = 13
+RC_512_5_1 = 50
+RC_512_5_2 = 10
+RC_512_5_3 = 17

+RC_512_6_0 = 25
+RC_512_6_1 = 29
+RC_512_6_2 = 39
+RC_512_6_3 = 43

+RC_512_7_0 = 8
+RC_512_7_1 = 35
+RC_512_7_2 = 56
+RC_512_7_3 = 22

+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 = 8
+RC_1024_0_3 = 47
+RC_1024_0_4 = 8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37

+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52

+RC_1024_2_0 = 33
+RC_1024_2_1 = 4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17

+RC_1024_3_0 = 5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25

+RC_1024_4_0 = 41
+RC_1024_4_1 = 9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30

+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 = 4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41

+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25

+RC_1024_7_0 = 9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+;
+; Input: reg
+; Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
+;
+; RotL64: rotate <reg> left by the constant RC_<BLK_SIZE>_<ROUND_NUM>_<MIX_NUM>.
+; reg - 64-bit register to rotate in place
+; BLK_SIZE - 256, 512 or 1024 (selects the constant table)
+; ROUND_NUM - round index used in the constant name
+; MIX_NUM - mix index used in the constant name
+; The count is reduced modulo 64; no instruction is emitted when it is 0.
+RotL64 macro reg,BLK_SIZE,ROUND_NUM,MIX_NUM
+_RCNT_ = ( RC_&BLK_SIZE&_&ROUND_NUM&_&MIX_NUM AND 63 )
+ if _RCNT_ ;is there anything to do?
+ rol reg,_RCNT_
+ endif
+endm
+;
+;----------------------------------------------------------------
+;
+; MACROS: define local vars and configure stack
+;
+;----------------------------------------------------------------
+; declare allocated space on the stack
+; StackVar: assign the next free stack offset to <localName>.
+; localName - symbol that receives the current value of _STK_OFFS_
+; localSize - number of bytes to reserve; _STK_OFFS_ advances by this much
+; The caller must have initialised _STK_OFFS_ beforehand.
+StackVar macro localName,localSize
+localName = _STK_OFFS_
+_STK_OFFS_ = _STK_OFFS_+(localSize)
+endm ;StackVar
+;
+;----------------------------------------------------------------
+;
+; MACRO: Configure stack frame, allocate local vars
+;
+; Setup_Stack: function prologue shared by the block functions.
+; BLK_BITS - block size in bits (256/512/1024); WCNT = BLK_BITS/64 words
+; KS_CNT - entry count used to size ksRot; ksRot is only allocated
+; when this variant is NOT fully unrolled
+; NO_FRAME - when non-blank, the .setframe pseudo-op is omitted
+; debugCnt - optional; when _SKEIN_DEBUG is set and this is non-blank,
+; reserves 8*debugCnt bytes as xDebug_<BLK_BITS>
+; Emitted code: pushes the eight registers listed below, subtracts
+; LOCAL_SIZE from rsp, points rbp at rsp+FRAME_OFFS, stores the incoming
+; rcx/rdx/r8/r9 into the ctxPtr/blkPtr/blkCnt/bitAdd slots that lie just
+; above the return address, and leaves rdi = context pointer.
+; Assembly-time symbols defined: WCNT, _PushCnt_, _STK_OFFS_, LOCAL_SIZE,
+; FRAME_OFFS, the stack-variable offsets and the __STK_* listing aids.
+Setup_Stack macro BLK_BITS,KS_CNT,NO_FRAME,debugCnt
+ WCNT = (BLK_BITS)/64
+;
+_PushCnt_ = 0 ;save nonvolatile regs on stack
+ irp _reg_,<rbp,rsi,rdi,rbx,r12,r13,r14,r15>
+ push _reg_
+ .pushreg _reg_ ;pseudo-op push for exception handling
+_PushCnt_ = _PushCnt_ + 1 ;track count to keep alignment
+ endm
+;
+_STK_OFFS_ = 0 ;starting offset from rsp
+ ;---- local variables ;<-- rsp
+ StackVar X_stk ,8*(WCNT) ;local context vars
+ StackVar ksTwk ,8*3 ;key schedule: tweak words
+ StackVar ksKey ,8*(WCNT)+8 ;key schedule: key words
+ if (SKEIN_ASM_UNROLL and (BLK_BITS)) eq 0
+ StackVar ksRot ,16*(KS_CNT+0);leave space for "rotation" to happen
+ endif
+ StackVar Wcopy ,8*(WCNT) ;copy of input block
+ if _SKEIN_DEBUG
+ ifnb <debugCnt> ;temp location for debug X[] info
+ StackVar xDebug_&BLK_BITS ,8*(debugCnt)
+ endif
+ endif
+ ; pad by 8 when pushes+locals would otherwise be a multiple of 16 bytes;
+ ; the padding slot doubles as tmpStk_<BLK_BITS>
+ if ((8*_PushCnt_ + _STK_OFFS_) and 8) eq 0
+ StackVar align16,8 ;keep 16-byte aligned (adjust for retAddr?)
+tmpStk_&BLK_BITS = align16 ;use this
+ endif
+LOCAL_SIZE = _STK_OFFS_ ;size of local vars
+ ;----
+ StackVar savRegs,8*_PushCnt_ ;saved registers
+ StackVar retAddr,8 ;return address
+ ;---- caller parameters
+ StackVar ctxPtr ,8 ;context ptr
+ StackVar blkPtr ,8 ;pointer to block data
+ StackVar blkCnt ,8 ;number of full blocks to process
+ StackVar bitAdd ,8 ;bit count to add to tweak
+ ;---- caller's stack frame
+;
+; set up the stack frame pointer (rbp)
+;
+FRAME_OFFS = ksTwk + 128 ;allow short (negative) offset to ksTwk, kwKey
+ if FRAME_OFFS gt _STK_OFFS_ ;keep rbp in the "locals" range
+FRAME_OFFS = _STK_OFFS_
+ endif
+ if FRAME_OFFS gt _MAX_FRAME_ ;keep Microsoft .setframe happy
+FRAME_OFFS = _MAX_FRAME_
+ endif
+;
+; optional assembly-time diagnostics about what rbp+127 can still reach
+ifdef SKEIN_ASM_INFO
+ if FRAME_OFFS+128 lt savRegs
+%out +++ SKEIN_&BLK_BITS: Unable to reach all of Wcopy with short offset from rbp.
+ elseif FRAME_OFFS+128 lt Wcopy
+%out +++ SKEIN_&BLK_BITS: Unable to reach end of Wcopy with short offset from rbp.
+ elseif FRAME_OFFS+128 lt _STK_OFFS_
+%out +++ SKEIN_&BLK_BITS: Unable to reach caller parms with short offset from rbp
+ endif
+endif
+ ;put some useful defines in the .lst file (for grep)
+__STK_LCL_SIZE_&BLK_BITS = LOCAL_SIZE
+__STK_TOT_SIZE_&BLK_BITS = _STK_OFFS_
+__STK_FRM_OFFS_&BLK_BITS = FRAME_OFFS
+;
+; Notes on stack frame setup:
+; * the most frequently used variable is X_stk[], based at [rsp+0]
+; * the next most used is the key schedule arrays, ksKey and ksTwk
+; so rbp is "centered" there, allowing short offsets to the key
+; schedule even in 1024-bit Skein case
+; * the Wcopy variables are infrequently accessed, but they have long
+; offsets from both rsp and rbp only in the 1024-bit case.
+; * all other local vars and calling parameters can be accessed
+; with short offsets, except in the 1024-bit case
+;
+ sub rsp,LOCAL_SIZE ;make room for the locals
+ .allocstack LOCAL_SIZE ;pseudo op for exception handling
+ lea rbp,[rsp+FRAME_OFFS] ;maximize use of short offsets
+ ifb <NO_FRAME>
+ .setframe rbp, FRAME_OFFS ;pseudo op for exception handling
+ endif
+ mov [FP_+ctxPtr],rcx ;save caller's parameters on the stack
+ mov [FP_+blkPtr],rdx
+ mov [FP_+blkCnt],r08
+ mov [FP_+bitAdd],r09
+ .endprolog ;pseudo op to support exception handling
+
+ mov rdi,[FP_+ctxPtr ] ;rdi --> context
+;
+endm ;Setup_Stack
+;
+; FP_ is a text macro: [FP_+var] addresses stack variable <var> relative to
+; rbp, and equals [rsp+var] as long as rsp is where Setup_Stack left it.
+FP_ equ <rbp-FRAME_OFFS> ;keep as many short offsets as possible
+;
+;----------------------------------------------------------------
+;
+; Reset_Stack: function epilogue matching Setup_Stack.
+; procStart - label at the start of the procedure, used for the size report
+; Releases LOCAL_SIZE bytes of locals and pops the saved registers in the
+; reverse of the push order. It raises an assembly error if the pop count
+; does not bring _PushCnt_ back to 0. The macro does not emit the return
+; instruction itself; the size report adds 1 byte to account for it.
+; Also sets _ProcBytes_ to the reported size.
+Reset_Stack macro procStart
+ add rsp,LOCAL_SIZE ;get rid of locals (wipe??)
+ irp _reg_,<r15,r14,r13,r12,rbx,rdi,rsi,rbp>
+ pop _reg_
+_PushCnt_ = _PushCnt_ - 1
+ endm
+ if _PushCnt_
+ .err "Mismatched push/pops?"
+ endif
+
+ ;display code size in bytes to stdout
+ irp _BCNT_,<%($+1-procStart)> ;account for return opcode
+_ProcBytes_ = _BCNT_
+if _BCNT_ ge 10000
+%out procStart code size = _BCNT_ bytes
+elseif _BCNT_ ge 1000
+%out procStart code size = _BCNT_ bytes
+else
+%out procStart code size = _BCNT_ bytes
+endif
+ endm ;irp _BCNT_
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+;
+if _SKEIN_DEBUG
+ extrn Skein_Show_Block:proc ;calls to C routines
+ extrn Skein_Show_Round:proc
+;
+; Pseudo round numbers for the debug hooks. Skein_Debug_Round passes any
+; round value at or above SKEIN_RND_SPECIAL through unchanged instead of
+; computing it from the loop counter.
+SKEIN_RND_SPECIAL = 1000
+SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
+;
+; Skein_Debug_Block: call the external C routine Skein_Show_Block.
+; BLK_BITS - block size in bits, passed as the first argument
+; All seven volatile integer registers saved below are restored before
+; the macro ends. Seven 8-byte slots are pushed for the call: three
+; stack-passed arguments (wPtr, ksPtr, tsPtr) and four slots mirroring
+; the register arguments rcx, rdx, r8, r9; all seven are discarded after
+; the call. Addresses use FP_ (rbp-relative), so the pushes do not
+; disturb them.
+Skein_Debug_Block macro BLK_BITS
+;
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+; const u08b_t *blkPtr, const u64b_t *wPtr,
+; const u64b_t *ksPtr,const u64b_t *tsPtr);
+;
+ irp _reg_,<rax,rcx,rdx,r08,r09,r10,r11>
+ push _reg_ ;save all volatile regs on stack before the call
+ endm
+ ; get and push call parameters
+ lea rax,[FP_+ksTwk] ;tweak pointer
+ push rax
+ lea rax,[FP_+ksKey] ;key pointer
+ push rax
+ lea rax,[FP_+Wcopy] ;wPtr
+ push rax
+ mov r09,[FP_+blkPtr] ;blkPtr
+ push r09 ;(push register parameters anyway to make room on stack)
+ mov rdx,[FP_+ctxPtr]
+ lea r08,[rdx+X_VARS] ;X (pointer)
+ push r08
+ push rdx ;h (pointer)
+ mov rcx, BLK_BITS ;bits
+ push rcx ;slot for the first register parameter (was rdx)
+ call Skein_Show_Block ;call external debug handler
+ add rsp,7*8 ;discard parameters on stack
+ irp _reg_,<r11,r10,r09,r08,rdx,rcx,rax>
+ pop _reg_ ;restore regs
+ endm
+endm ; Skein_Debug_Block
+;
+;
+; the macro to "call" to debug a round
+;
+; Skein_Debug_Round: call the local routine Skein_Debug_Round_<BLK_BITS>
+; with the round number in r08, then emit <afterOp>.
+; BLK_BITS - block size in bits (selects the routine to call)
+; R - round number, or one of the SKEIN_RND_* special values
+; RDI_OFFS - constant added to the computed round number (blank = 0)
+; afterOp - statement emitted verbatim after the call (may be blank)
+; r08 is saved and restored around the call. When the variant is fully
+; unrolled or R is a special value, R is passed as-is; otherwise the round
+; number is computed as 4*index + 1 + ((R-1) and 3) + RDI_OFFS, where the
+; index comes from rdi, or for 1024 from the stack slot rIdx_offs (defined
+; outside this macro).
+Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp
+ ; call the appropriate (local) debug function
+ push r08
+ if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL)
+ mov r08, R
+ else ;compute round number using edi
+_rOffs_ = RDI_OFFS + 0
+ if BLK_BITS eq 1024
+ mov r08,[rsp+8+rIdx_offs] ;get rIdx off the stack (adjust for push r08)
+ lea r08,[4*r08+1+(((R)-1) and 3)+_rOffs_]
+ else
+ lea r08,[4*rdi+1+(((R)-1) and 3)+_rOffs_]
+ endif
+ endif
+ call Skein_Debug_Round_&BLK_BITS
+ pop r08
+;
+ afterOp
+endm ; Skein_Debug_Round
+else ;------- _SKEIN_DEBUG (dummy macros if debug not enabled)
+; Non-debug stand-ins: both macros expand to nothing, so invocations cost
+; no code when _SKEIN_DEBUG is 0.
+; NOTE(review): unlike the debug version of Skein_Debug_Round, this one
+; does not emit <afterOp> -- confirm that callers only pass debug-only
+; statements there.
+Skein_Debug_Block macro BLK_BITS,afterOp
+endm
+;
+Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp
+endm
+;
+endif ; _SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+;
+; addReg: dstReg += <srcReg_A><srcReg_B> (+ immOffs).
+; dstReg - destination register
+; srcReg_A - first part of the source register name
+; srcReg_B - second part; pasted onto srcReg_A to form the full name
+; (presumably a register prefix plus a number -- confirm
+; against the call sites)
+; useAddOp - blank or 0: prefer lea; any other value: use add
+; immOffs - optional constant added as well; when given, lea is always
+; used and useAddOp is ignored
+; Defining ASM_NO_LEA forces add in the two-operand case. Note that the
+; lea forms leave the flags untouched while the add forms modify them.
+addReg macro dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
+ ifnb <immOffs>
+ lea dstReg,[srcReg_A&&srcReg_B + dstReg + immOffs]
+ elseif ((useAddOp + 0) eq 0)
+ ifndef ASM_NO_LEA
+ ;lea seems to be faster on Core 2 Duo CPUs!
+ lea dstReg,[srcReg_A&&srcReg_B + dstReg]
+ else
+ add dstReg, srcReg_A&&srcReg_B
+ endif
+ else
+ add dstReg, srcReg_A&&srcReg_B
+ endif
+endm
+;
+;=================================== Skein_256 =============================================
+;
+if _USE_ASM_ and 256
+ public Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+;
+; code
+;
+Skein_256_Process_Block proc frame
+ ; Hashes blkCnt consecutive input blocks starting at blkPtr. For each block the
+ ; tweak word T0 is advanced by bitAdd, the rounds are run with the key schedule
+ ; taken from ctx X_VARS, and the result xor the input block is stored back to X_VARS.
+ ; NOTE(review): Setup_Stack and Reset_Stack are defined outside this view;
+ ; presumably Setup_Stack saves registers, builds the FP_ frame
+ ; (ctxPtr/blkPtr/blkCnt/bitAdd) and leaves rdi --> context -- confirm.
+ Setup_Stack 256,((ROUNDS_256/8)+1)
+ mov r14,[rdi+TWEAK+8] ;r14 = tweak word T1, kept in r14 across blocks
+ jmp short Skein_256_block_loop
+ align 16
+ ; main hash loop for Skein_256
+Skein_256_block_loop:
+ ;
+ ; general register usage:
+ ; RAX..RDX = X0..X3
+ ; R08..R12 = ks[0..4]
+ ; R13..R15 = ts[0..2]
+ ; RSP, RBP = stack/frame pointers
+ ; RDI = round counter or context pointer
+ ; RSI = temp
+ ;
+ mov r13,[rdi+TWEAK+0]
+ add r13,[FP_+bitAdd] ;computed updated tweak value T0
+ mov r15,r14
+ xor r15,r13 ;now r13..r15 is set as the tweak (r15 = T0 xor T1)
+
+ mov r12,KW_PARITY
+ mov r08,[rdi+X_VARS+ 0]
+ mov r09,[rdi+X_VARS+ 8]
+ mov r10,[rdi+X_VARS+16]
+ mov r11,[rdi+X_VARS+24]
+ mov [rdi+TWEAK+0],r13 ;save updated tweak value ctx->h.T[0]
+ xor r12,r08 ;start accumulating overall parity
+
+ mov rsi,[FP_+blkPtr ] ;rsi --> input block
+ xor r12,r09
+ mov rax,[rsi+ 0] ;get X[0..3]
+ xor r12,r10
+ mov rbx,[rsi+ 8]
+ xor r12,r11 ;r12 = KW_PARITY xor ks[0] xor ks[1] xor ks[2] xor ks[3]
+ mov rcx,[rsi+16]
+ mov rdx,[rsi+24]
+
+ mov [FP_+Wcopy+ 0],rax ;save copy of input block
+ mov [FP_+Wcopy+ 8],rbx
+ mov [FP_+Wcopy+16],rcx
+ mov [FP_+Wcopy+24],rdx
+
+ add rax, r08 ;initial key injection
+ add rbx, r09
+ add rcx, r10
+ add rdx, r11
+ add rbx, r13 ;X1 also gets ts[0]
+ add rcx, r14 ;X2 also gets ts[1]
+
+if _SKEIN_DEBUG
+ mov [rdi+TWEAK+ 8],r14 ;save updated tweak T[1] (start bit cleared?)
+ mov [FP_+ksKey+ 0],r08 ;save key schedule on stack for Skein_Debug_Block
+ mov [FP_+ksKey+ 8],r09
+ mov [FP_+ksKey+16],r10
+ mov [FP_+ksKey+24],r11
+ mov [FP_+ksKey+32],r12
+
+ mov [FP_+ksTwk+ 0],r13
+ mov [FP_+ksTwk+ 8],r14
+ mov [FP_+ksTwk+16],r15
+
+ mov [rsp+X_stk + 0],rax ;save X[] on stack for Skein_Debug_Block
+ mov [rsp+X_stk + 8],rbx
+ mov [rsp+X_stk +16],rcx
+ mov [rsp+X_stk +24],rdx
+
+ Skein_Debug_Block 256 ;debug dump
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
+endif
+;
+if ((SKEIN_ASM_UNROLL and 256) eq 0)
+ ; The round loop below only reads ksKey slots rdi+1..rdi+4 and ksTwk slots
+ ; rdi+1..rdi+2, so slot 0 of each table is never read. ks[0] is therefore
+ ; stored in ksKey slot 5 and ts[0] in ksTwk slot 3, where the loop first needs
+ ; them; later slots are filled in by the "rotate" stores inside the loop.
+ mov [FP_+ksKey+40],r08 ;save key schedule on stack for looping code
+ mov [FP_+ksKey+ 8],r09
+ mov [FP_+ksKey+16],r10
+ mov [FP_+ksKey+24],r11
+ mov [FP_+ksKey+32],r12
+
+ mov [FP_+ksTwk+24],r13
+ mov [FP_+ksTwk+ 8],r14
+ mov [FP_+ksTwk+16],r15
+endif
+ add rsi, WCNT*8 ;skip the block
+ mov [FP_+blkPtr ],rsi ;update block pointer
+;
+; opLoop: emit op1 in the looping (not fully unrolled) build, op2 in the unrolled build
+;
+opLoop macro op1,op2
+ if (SKEIN_ASM_UNROLL and 256) eq 0
+ op1
+ else
+ op2
+ endif
+endm
+;
+ ;
+ ; now the key schedule is computed. Start the rounds
+ ;
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT = ROUNDS_256/8
+else
+_UNROLL_CNT = SKEIN_UNROLL_256
+ if ((ROUNDS_256/8) mod _UNROLL_CNT)
+ .err "Invalid SKEIN_UNROLL_256"
+ endif
+ xor rdi,rdi ;rdi = iteration count
+Skein_256_round_loop:
+endif
+_Rbase_ = 0
+rept _UNROLL_CNT*2
+ ; each repetition emits four rounds followed by one key injection
+ ; all X and ks vars in regs ; (ops to "rotate" ks vars, via mem, if not unrolled)
+ ; round 4*_RBase_ + 0
+ addReg rax, rbx
+ RotL64 rbx, 256,%((4*_RBase_+0) and 7),0
+ addReg rcx, rdx
+ opLoop <mov r08,[FP_+ksKey+8*rdi+8*1]>
+ xor rbx, rax
+ RotL64 rdx, 256,%((4*_RBase_+0) and 7),1
+ xor rdx, rcx
+ if SKEIN_ASM_UNROLL and 256
+ ; unrolled build: rdi is free (no loop counter), so use it as a temp
+ irp _r0_,<%(08+(_Rbase_+3) mod 5)>
+ irp _r1_,<%(13+(_Rbase_+2) mod 3)>
+ lea rdi,[r&_r0_+r&_r1_] ;precompute key injection value for rcx
+ endm
+ endm
+ endif
+ opLoop <mov r13,[FP_+ksTwk+8*rdi+8*1]>
+ Skein_Debug_Round 256,%(4*_RBase_+1)
+
+ ; round 4*_RBase_ + 1
+ addReg rax, rdx
+ RotL64 rdx, 256,%((4*_RBase_+1) and 7),0
+ xor rdx, rax
+ opLoop <mov r09,[FP_+ksKey+8*rdi+8*2]>
+ addReg rcx, rbx
+ RotL64 rbx, 256,%((4*_RBase_+1) and 7),1
+ xor rbx, rcx
+ opLoop <mov r11,[FP_+ksKey+8*rdi+8*4]>
+ Skein_Debug_Round 256,%(4*_RBase_+2)
+ if SKEIN_ASM_UNROLL and 256
+ irp _r0_,<%(08+(_Rbase_+2) mod 5)>
+ irp _r1_,<%(13+(_Rbase_+1) mod 3)>
+ lea rsi,[r&_r0_+r&_r1_] ;precompute key injection value for rbx
+ endm
+ endm
+ endif
+ ; round 4*_RBase_ + 2
+ addReg rax, rbx
+ RotL64 rbx, 256,%((4*_RBase_+2) and 7),0
+ addReg rcx, rdx
+ opLoop <mov r10,[FP_+ksKey+8*rdi+8*3]>
+ xor rbx, rax
+ RotL64 rdx, 256,%((4*_RBase_+2) and 7),1
+ xor rdx, rcx
+ opLoop <mov [FP_+ksKey+8*rdi+8*6],r08> ;"rotate" the key
+ opLoop <lea r11,[r11+rdi+1]> ;precompute key + tweak
+ Skein_Debug_Round 256,%(4*_RBase_+3)
+ ; round 4*_RBase_ + 3
+ addReg rax, rdx
+ RotL64 rdx, 256,%((4*_RBase_+3) and 7),0
+ addReg rcx, rbx
+ opLoop <add r10,[FP_+ksTwk+8*rdi+8*2]> ;precompute key + tweak
+ opLoop <mov [FP_+ksTwk+8*rdi+8*4],r13> ;"rotate" the tweak
+ xor rdx, rax
+ RotL64 rbx, 256,%((4*_RBase_+3) and 7),1
+ xor rbx, rcx
+ Skein_Debug_Round 256,%(4*_RBase_+4)
+ opLoop <addReg r09,r13> ;precompute key+tweak
+ ;inject key schedule words
+_Rbase_ = _Rbase_+1
+ if SKEIN_ASM_UNROLL and 256
+ addReg rax,r,%(08+((_Rbase_+0) mod 5))
+ addReg rbx,rsi
+ addReg rcx,rdi
+ addReg rdx,r,%(08+((_Rbase_+3) mod 5)),,_Rbase_
+ else
+ inc rdi
+ addReg rax,r08
+ addReg rcx,r10
+ addReg rbx,r09
+ addReg rdx,r11
+ endif
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
+endm ;rept _UNROLL_CNT
+
+;
+if (SKEIN_ASM_UNROLL and 256) eq 0
+ ; rdi was incremented once per key injection; loop until all injections are done
+ cmp rdi,2*(ROUNDS_256/8)
+ jb Skein_256_round_loop
+endif ; (SKEIN_ASM_UNROLL and 256) eq 0
+ mov rdi,[FP_+ctxPtr ] ;restore rdi --> context
+
+ ;----------------------------
+ ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
+ xor rax,[FP_+Wcopy + 0]
+ mov r14,FIRST_MASK
+ xor rbx,[FP_+Wcopy + 8]
+ xor rcx,[FP_+Wcopy +16]
+ xor rdx,[FP_+Wcopy +24]
+ mov [rdi+X_VARS+ 0],rax ;store final result
+ and r14,[rdi+TWEAK + 8] ;r14 = T1 masked with FIRST_MASK, used for the next block
+ dec qword ptr [FP_+blkCnt] ;set zero flag
+ mov [rdi+X_VARS+ 8],rbx ;(the mov stores below leave the flags from dec intact)
+ mov [rdi+X_VARS+16],rcx
+ mov [rdi+X_VARS+24],rdx
+
+ ; in a debug build the call changes the flags, so blkCnt is re-tested afterwards;
+ ; in a non-debug build this line expands to nothing and the flags from dec are used
+ Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,,<cmp qword ptr [FP_+blkCnt],0>
+
+ ; go back for more blocks, if needed
+ jnz Skein_256_block_loop
+ mov [rdi+TWEAK + 8],r14 ;write the masked T1 back to the context
+ Reset_Stack Skein_256_Process_Block
+ ret
+
+ if _SKEIN_DEBUG
+ ; local entry point reached by the call in Skein_Debug_Round (r08 = round number);
+ ; continues in Skein_Debug_Round_Common, which is defined outside this procedure
+Skein_Debug_Round_256:
+ mov [FP_+X_stk+ 0],rax ;first, save X[] state on stack so debug routines can access it
+ mov [FP_+X_stk+ 8],rbx ;(use FP_ since rsp has changed!)
+ mov [FP_+X_stk+16],rcx
+ mov [FP_+X_stk+24],rdx
+ push rdx ;save two regs for BLK_BITS-specific parms
+ push rcx
+ mov rdx,[FP_+ctxPtr] ;ctx_hdr_ptr
+ mov rcx, 256
+ jmp Skein_Debug_Round_Common
+ endif
+
+Skein_256_Process_Block endp
+;
+ifdef SKEIN_CODE_SIZE
+ public Skein_256_Process_Block_CodeSize
+Skein_256_Process_Block_CodeSize proc
+ ; returns the assembly-time constant _ProcBytes_ in rax
+ ; NOTE(review): _ProcBytes_ is set outside this view, presumably by Reset_Stack
+ ; as the byte size of Skein_256_Process_Block -- confirm
+ mov rax,_ProcBytes_
+ ret
+Skein_256_Process_Block_CodeSize endp
+;
+ public Skein_256_Unroll_Cnt
+Skein_256_Unroll_Cnt proc
+ ; returns in rax the unroll count the code was assembled with,
+ ; or 0 when the rounds are fully unrolled (_UNROLL_CNT equals ROUNDS_256/8)
+ if _UNROLL_CNT ne ROUNDS_256/8
+ mov rax,_UNROLL_CNT
+ else
+ xor rax,rax
+ endif
+ ret
+Skein_256_Unroll_Cnt endp
+endif
+;
+endif ;_USE_ASM_ and 256
+;
+;=================================== Skein_512 =============================================
+;
+if _USE_ASM_ and 512
+ public Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+; X[n] lives in register rX_512_n; R_512_OneRound builds these names by pasting
+; its digit arguments onto the rX_512_ prefix
+rX_512_0 equ r08 ;register assignments for X[] values during rounds
+rX_512_1 equ r09
+rX_512_2 equ r10
+rX_512_3 equ r11
+rX_512_4 equ r12
+rX_512_5 equ r13
+rX_512_6 equ r14
+rX_512_7 equ r15
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one round for 512-bit blocks
+;
+R_512_OneRound macro r0,r1,r2,r3,r4,r5,r6,r7,_Rn_,op1,op2,op3,op4
+;
+; r0..r7 = digits 0..7 selecting the rX_512_n registers; they are used as the four
+; pairs (r0,r1) (r2,r3) (r4,r5) (r6,r7), each getting: add, rotate, xor
+; _Rn_ = round number; (_Rn_ and 7) plus the pair index 0..3 is passed to RotL64
+; op1..op4 = optional instructions interleaved after each of the four pairs
+; ends with the debug hook for round _Rn_+1 (expands to nothing unless debugging)
+;
+ addReg rX_512_&r0, rX_512_&r1
+ RotL64 rX_512_&r1, 512,%((_Rn_) and 7),0
+ xor rX_512_&r1, rX_512_&r0
+ op1
+ addReg rX_512_&r2, rX_512_&r3
+ RotL64 rX_512_&r3, 512,%((_Rn_) and 7),1
+ xor rX_512_&r3, rX_512_&r2
+ op2
+ addReg rX_512_&r4, rX_512_&r5
+ RotL64 rX_512_&r5, 512,%((_Rn_) and 7),2
+ xor rX_512_&r5, rX_512_&r4
+ op3
+ addReg rX_512_&r6, rX_512_&r7
+ RotL64 rX_512_&r7, 512,%((_Rn_) and 7),3
+ xor rX_512_&r7, rX_512_&r6
+ op4
+ Skein_Debug_Round 512,%(_Rn_+1),-4
+;
+endm ;R_512_OneRound
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds (plus one key injection) for 512-bit blocks
+;
+R_512_FourRounds macro _RR_ ;RR = base round number (0 mod 8)
+ if SKEIN_ASM_UNROLL and 512
+ ; here for fully unrolled case.
+ _II_ = ((_RR_)/4) + 1 ;key injection counter
+ R_512_OneRound 0,1,2,3,4,5,6,7,%((_RR_)+0),<mov rax,[FP_+ksKey+8*(((_II_)+3) mod 9)]>,,<mov rbx,[FP_+ksKey+8*(((_II_)+4) mod 9)]>
+ R_512_OneRound 2,1,4,7,6,5,0,3,%((_RR_)+1),<mov rcx,[FP_+ksKey+8*(((_II_)+5) mod 9)]>,,<mov rdx,[FP_+ksKey+8*(((_II_)+6) mod 9)]>
+ R_512_OneRound 4,1,6,3,0,5,2,7,%((_RR_)+2),<mov rsi,[FP_+ksKey+8*(((_II_)+7) mod 9)]>,,<add rcx,[FP_+ksTwk+8*(((_II_)+0) mod 3)]>
+ R_512_OneRound 6,1,0,7,2,5,4,3,%((_RR_)+3),<add rdx,[FP_+ksTwk+8*(((_II_)+1) mod 3)]>,
+ ; inject the key schedule
+ add r08,[FP_+ksKey+8*(((_II_)+0) mod 9)]
+ addReg r11,rax
+ add r09,[FP_+ksKey+8*(((_II_)+1) mod 9)]
+ addReg r12,rbx
+ add r10,[FP_+ksKey+8*(((_II_)+2) mod 9)]
+ addReg r13,rcx
+ addReg r14,rdx
+ addReg r15,rsi,,,(_II_)
+ else
+ ; here for looping case ;"rotate" key/tweak schedule (move up on stack)
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-vendor
mailing list