From 03c04b1ae03d6ab09d4e9577ef098d8fb10c7594 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Sun, 27 Nov 2022 18:00:20 +0900 Subject: [PATCH 1/3] All the renames and deletes --- .../os/windows/spl/sys/ia32}/asm_linkage.h | 16 +- .../icp/include => include}/sys/asm_linkage.h | 5 - .../include/os/linux}/sys/ia32/asm_linkage.h | 8 + .../libspl/include/os/linux}/sys/ia32/stack.h | 0 .../libspl/include/os/linux}/sys/ia32/trap.h | 0 .../os/windows/sys/ia32}/asm_linkage.h | 13 +- .../icp/asm-x86_64/os/windows/aes/aes_aesni.S | 762 ------ .../icp/asm-x86_64/os/windows/aes/aes_amd64.S | 908 ------- .../os/windows/modes/gcm_pclmulqdq.S | 267 --- .../asm-x86_64/os/windows/sha2/sha256_impl.S | 2078 ---------------- .../asm-x86_64/os/windows/sha2/sha512_impl.S | 2104 ----------------- module/lua/setjmp/win_setjmp_x86_64.S | 95 - 12 files changed, 30 insertions(+), 6226 deletions(-) rename {lib/libspl/include/os/windows/win => include/os/windows/spl/sys/ia32}/asm_linkage.h (95%) rename {module/icp/include => include}/sys/asm_linkage.h (95%) rename {module/icp/include => lib/libspl/include/os/linux}/sys/ia32/asm_linkage.h (95%) rename {module/icp/include => lib/libspl/include/os/linux}/sys/ia32/stack.h (100%) rename {module/icp/include => lib/libspl/include/os/linux}/sys/ia32/trap.h (100%) rename {include/os/windows/spl/win => lib/libspl/include/os/windows/sys/ia32}/asm_linkage.h (95%) delete mode 100644 module/icp/asm-x86_64/os/windows/aes/aes_aesni.S delete mode 100644 module/icp/asm-x86_64/os/windows/aes/aes_amd64.S delete mode 100644 module/icp/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S delete mode 100644 module/icp/asm-x86_64/os/windows/sha2/sha256_impl.S delete mode 100644 module/icp/asm-x86_64/os/windows/sha2/sha512_impl.S delete mode 100644 module/lua/setjmp/win_setjmp_x86_64.S diff --git a/lib/libspl/include/os/windows/win/asm_linkage.h b/include/os/windows/spl/sys/ia32/asm_linkage.h similarity index 95% rename from lib/libspl/include/os/windows/win/asm_linkage.h 
rename to include/os/windows/spl/sys/ia32/asm_linkage.h index e68a335274c4..2fe6d8ae8496 100644 --- a/lib/libspl/include/os/windows/win/asm_linkage.h +++ b/include/os/windows/spl/sys/ia32/asm_linkage.h @@ -27,11 +27,15 @@ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H -#if defined(__linux__) && defined(CONFIG_SLS) -#define RET ret; int3 -#else #define RET ret -#endif + +/* Tell compiler to call assembler like Unix */ +#define ASMABI __attribute__((sysv_abi)) + +#define ENDBR + +#define SECTION_TEXT .text +#define SECTION_STATIC .data #ifdef __cplusplus extern "C" { @@ -39,6 +43,7 @@ extern "C" { #ifdef _ASM /* The remainder of this file is only for assembly files */ + /* * make annoying differences in assembler syntax go away */ @@ -154,6 +159,9 @@ x:; \ */ #define SET_SIZE(x) +#define SET_OBJ(x) + + #endif /* _ASM */ #ifdef __cplusplus diff --git a/module/icp/include/sys/asm_linkage.h b/include/sys/asm_linkage.h similarity index 95% rename from module/icp/include/sys/asm_linkage.h rename to include/sys/asm_linkage.h index a6975ae9428c..072b45959405 100644 --- a/module/icp/include/sys/asm_linkage.h +++ b/include/sys/asm_linkage.h @@ -28,13 +28,8 @@ #define _SYS_ASM_LINKAGE_H #if defined(__i386) || defined(__amd64) - -#ifdef _WIN32 -#include -#else #include /* XX64 x86/sys/asm_linkage.h */ #endif -#endif #if defined(_KERNEL) && defined(HAVE_KERNEL_OBJTOOL) diff --git a/module/icp/include/sys/ia32/asm_linkage.h b/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h similarity index 95% rename from module/icp/include/sys/ia32/asm_linkage.h rename to lib/libspl/include/os/linux/sys/ia32/asm_linkage.h index e3e769ffd858..6180c783ba21 100644 --- a/module/icp/include/sys/ia32/asm_linkage.h +++ b/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h @@ -56,6 +56,12 @@ #define RET ret #endif +/* You can set to nothing on Unix platforms */ +#define ASMABI __attribute__((sysv_abi)) + +#define SECTION_TEXT .text +#define SECTION_STATIC .section .rodata + #ifdef 
__cplusplus extern "C" { #endif @@ -185,6 +191,8 @@ x:; \ #define SET_SIZE(x) \ .size x, [.-x] +#define SET_OBJ(x) .type x, @object + #endif /* _ASM */ #ifdef __cplusplus diff --git a/module/icp/include/sys/ia32/stack.h b/lib/libspl/include/os/linux/sys/ia32/stack.h similarity index 100% rename from module/icp/include/sys/ia32/stack.h rename to lib/libspl/include/os/linux/sys/ia32/stack.h diff --git a/module/icp/include/sys/ia32/trap.h b/lib/libspl/include/os/linux/sys/ia32/trap.h similarity index 100% rename from module/icp/include/sys/ia32/trap.h rename to lib/libspl/include/os/linux/sys/ia32/trap.h diff --git a/include/os/windows/spl/win/asm_linkage.h b/lib/libspl/include/os/windows/sys/ia32/asm_linkage.h similarity index 95% rename from include/os/windows/spl/win/asm_linkage.h rename to lib/libspl/include/os/windows/sys/ia32/asm_linkage.h index 1982ac96e36f..ca669343aab8 100644 --- a/include/os/windows/spl/win/asm_linkage.h +++ b/lib/libspl/include/os/windows/sys/ia32/asm_linkage.h @@ -27,15 +27,20 @@ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H -#include -#include - #if defined(__linux__) && defined(CONFIG_SLS) #define RET ret; int3 #else #define RET ret #endif +/* Tell compiler to call assembler like Unix */ +#define ASMABI __attribute__((sysv_abi)) + +#define ENDBR + +#define SECTION_TEXT .text +#define SECTION_STATIC .data + #ifdef __cplusplus extern "C" { #endif @@ -157,6 +162,8 @@ x:; \ */ #define SET_SIZE(x) +#define SET_OBJ(x) + #endif /* _ASM */ #ifdef __cplusplus diff --git a/module/icp/asm-x86_64/os/windows/aes/aes_aesni.S b/module/icp/asm-x86_64/os/windows/aes/aes_aesni.S deleted file mode 100644 index 6a18aa5529f5..000000000000 --- a/module/icp/asm-x86_64/os/windows/aes/aes_aesni.S +++ /dev/null @@ -1,762 +0,0 @@ -/* - * ==================================================================== - * Written by Intel Corporation for the OpenSSL project to add support - * for Intel AES-NI instructions. 
Rights for redistribution and usage - * in source and binary forms are granted according to the OpenSSL - * license. - * - * Author: Huang Ying - * Vinodh Gopal - * Kahraman Akdemir - * - * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) - * instructions that are going to be introduced in the next generation - * of Intel processor, as of 2009. These instructions enable fast and - * secure data encryption and decryption, using the Advanced Encryption - * Standard (AES), defined by FIPS Publication number 197. The - * architecture introduces six instructions that offer full hardware - * support for AES. Four of them support high performance data - * encryption and decryption, and the other two instructions support - * the AES key expansion procedure. - * ==================================================================== - */ - -/* - * ==================================================================== - * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. 
For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - */ - -/* - * ==================================================================== - * OpenSolaris OS modifications - * - * This source originates as files aes-intel.S and eng_aesni_asm.pl, in - * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by - * Huang Ying of Intel to the openssl-dev mailing list under the subject - * of "Add support to Intel AES-NI instruction set for x86_64 platform". - * - * This OpenSolaris version has these major changes from the original source: - * - * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from - * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function - * definitions for lint. 
- * - * 2. Formatted code, added comments, and added #includes and #defines. - * - * 3. If bit CR0.TS is set, clear and set the TS bit, after and before - * calling kpreempt_disable() and kpreempt_enable(). - * If the TS bit is not set, Save and restore %xmm registers at the beginning - * and end of function calls (%xmm* registers are not saved and restored by - * during kernel thread preemption). - * - * 4. Renamed functions, reordered parameters, and changed return value - * to match OpenSolaris: - * - * OpenSSL interface: - * int intel_AES_set_encrypt_key(const unsigned char *userKey, - * const int bits, AES_KEY *key); - * int intel_AES_set_decrypt_key(const unsigned char *userKey, - * const int bits, AES_KEY *key); - * Return values for above are non-zero on error, 0 on success. - * - * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, - * const AES_KEY *key); - * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, - * const AES_KEY *key); - * typedef struct aes_key_st { - * unsigned int rd_key[4 *(AES_MAXNR + 1)]; - * int rounds; - * unsigned int pad[3]; - * } AES_KEY; - * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules - * (ks32) instead of 64-bit (ks64). - * Number of rounds (aka round count) is at offset 240 of AES_KEY. - * - * OpenSolaris OS interface (#ifdefs removed for readability): - * int rijndael_key_setup_dec_intel(uint32_t rk[], - * const uint32_t cipherKey[], uint64_t keyBits); - * int rijndael_key_setup_enc_intel(uint32_t rk[], - * const uint32_t cipherKey[], uint64_t keyBits); - * Return values for above are 0 on error, number of rounds on success. 
- * - * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, - * const uint32_t pt[4], uint32_t ct[4]); - * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, - * const uint32_t pt[4], uint32_t ct[4]); - * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]; - * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t; - * - * typedef union { - * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; - * } aes_ks_t; - * typedef struct aes_key { - * aes_ks_t encr_ks, decr_ks; - * long double align128; - * int flags, nr, type; - * } aes_key_t; - * - * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, - * ct is crypto text, and MAX_AES_NR is 14. - * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. - * - * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary. - * - * ==================================================================== - */ - -#if defined(lint) || defined(__lint) - -#include <sys/types.h> - -void -aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], - uint32_t ct[4]) { - (void) rk, (void) Nr, (void) pt, (void) ct; -} -void -aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], - uint32_t pt[4]) { - (void) rk, (void) Nr, (void) ct, (void) pt; -} -int -rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], - uint64_t keyBits) { - (void) rk, (void) cipherKey, (void) keyBits; - return (0); -} -int -rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], - uint64_t keyBits) { - (void) rk, (void) cipherKey, (void) keyBits; - return (0); -} - - -#elif defined(HAVE_AES) /* guard by instruction set */ - -#define _ASM -#include <sys/asm_linkage.h> - -/* - * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(), - * _key_expansion_256a(), _key_expansion_256b() - * - * Helper functions called by rijndael_key_setup_inc_intel(). - * Also used indirectly by rijndael_key_setup_dec_intel(). 
- * - * Input: - * %xmm0 User-provided cipher key - * %xmm1 Round constant - * Output: - * (%rcx) AES key - */ - -ENTRY_NP2(_key_expansion_128, _key_expansion_256a) -_key_expansion_128_local: -_key_expansion_256a_local: - pshufd $0b11111111, %xmm1, %xmm1 - shufps $0b00010000, %xmm0, %xmm4 - pxor %xmm4, %xmm0 - shufps $0b10001100, %xmm0, %xmm4 - pxor %xmm4, %xmm0 - pxor %xmm1, %xmm0 - movups %xmm0, (%rcx) - add $0x10, %rcx - RET - nop -SET_SIZE(_key_expansion_128) -SET_SIZE(_key_expansion_256a) - - -ENTRY_NP(_key_expansion_192a) -_key_expansion_192a_local: - pshufd $0b01010101, %xmm1, %xmm1 - shufps $0b00010000, %xmm0, %xmm4 - pxor %xmm4, %xmm0 - shufps $0b10001100, %xmm0, %xmm4 - pxor %xmm4, %xmm0 - pxor %xmm1, %xmm0 - - movups %xmm2, %xmm5 - movups %xmm2, %xmm7 - pslldq $4, %xmm5 - pshufd $0b11111111, %xmm0, %xmm3 - pxor %xmm3, %xmm2 - pxor %xmm5, %xmm2 - - movups %xmm0, %xmm1 - shufps $0b01000100, %xmm0, %xmm7 - movups %xmm7, (%rcx) - shufps $0b01001110, %xmm2, %xmm1 - movups %xmm1, 0x10(%rcx) - add $0x20, %rcx - RET -SET_SIZE(_key_expansion_192a) - - -ENTRY_NP(_key_expansion_192b) -_key_expansion_192b_local: - pshufd $0b01010101, %xmm1, %xmm1 - shufps $0b00010000, %xmm0, %xmm4 - pxor %xmm4, %xmm0 - shufps $0b10001100, %xmm0, %xmm4 - pxor %xmm4, %xmm0 - pxor %xmm1, %xmm0 - - movups %xmm2, %xmm5 - pslldq $4, %xmm5 - pshufd $0b11111111, %xmm0, %xmm3 - pxor %xmm3, %xmm2 - pxor %xmm5, %xmm2 - - movups %xmm0, (%rcx) - add $0x10, %rcx - RET -SET_SIZE(_key_expansion_192b) - - -ENTRY_NP(_key_expansion_256b) -_key_expansion_256b_local: - pshufd $0b10101010, %xmm1, %xmm1 - shufps $0b00010000, %xmm2, %xmm4 - pxor %xmm4, %xmm2 - shufps $0b10001100, %xmm2, %xmm4 - pxor %xmm4, %xmm2 - pxor %xmm1, %xmm2 - movups %xmm2, (%rcx) - add $0x10, %rcx - RET -SET_SIZE(_key_expansion_256b) - - -/* - * rijndael_key_setup_enc_intel() - * Expand the cipher key into the encryption key schedule. 
- * - * For kernel code, caller is responsible for ensuring kpreempt_disable() - * has been called. This is because %xmm registers are not saved/restored. - * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set - * on entry. Otherwise, if TS is not set, save and restore %xmm registers - * on the stack. - * - * OpenSolaris interface: - * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], - * uint64_t keyBits); - * Return value is 0 on error, number of rounds on success. - * - * Original Intel OpenSSL interface: - * int intel_AES_set_encrypt_key(const unsigned char *userKey, - * const int bits, AES_KEY *key); - * Return value is non-zero on error, 0 on success. - */ - -// Windows x64: -// Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3) -// Return: rax (float: xmm0) -// Volatile: rax, rcx, rdx, r8-r11 -// Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 (xmm6, xmm15) - -// Unix x64: -// Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7) -// Return: rax (float: xmm0) -// Volatile: -// Nonvolatile: rbx, rbp, rsp, r12-r15 - -// outcome: -// Since xmm6 is nonvolatile, but xmm7 isnt, we can just move to that. 
- -#ifdef OPENSSL_INTERFACE -#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key -#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key - -#define USERCIPHERKEY rdi /* P1, 64 bits */ -#define KEYSIZE32 esi /* P2, 32 bits */ -#define KEYSIZE64 rsi /* P2, 64 bits */ -#define AESKEY rdx /* P3, 64 bits */ - -#else /* OpenSolaris^W Windows Interface */ -#define AESKEY rcx /* P1, 64 bits (was rdi) */ -#define USERCIPHERKEY rdx /* P2, 64 bits (rsi) */ -#define KEYSIZE32 r8d /* P3, 32 bits (edx) */ -#define KEYSIZE64 r8 /* P3, 64 bits (rdx) */ -#endif /* OPENSSL_INTERFACE */ - -#define ROUNDS32 KEYSIZE32 /* temp */ -#define ROUNDS64 KEYSIZE64 /* temp */ -#define ENDAESKEY USERCIPHERKEY /* temp */ - -ENTRY_NP(rijndael_key_setup_enc_intel) -rijndael_key_setup_enc_intel_local: - FRAME_BEGIN - // NULL pointer sanity check - test %USERCIPHERKEY, %USERCIPHERKEY - jz .Lenc_key_invalid_param - test %AESKEY, %AESKEY - jz .Lenc_key_invalid_param - - movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes) - movups %xmm0, (%AESKEY) - lea 0x10(%AESKEY), %rcx // key addr - pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x - - cmp $256, %KEYSIZE32 - jnz .Lenc_key192 - - // AES 256: 14 rounds in encryption key schedule -#ifdef OPENSSL_INTERFACE - mov $14, %ROUNDS32 - movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14 -#endif /* OPENSSL_INTERFACE */ - - movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes) - movups %xmm2, (%rcx) - add $0x10, %rcx - - aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key - call _key_expansion_256a_local - aeskeygenassist $0x1, %xmm0, %xmm1 - call _key_expansion_256b_local - aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key - call _key_expansion_256a_local - aeskeygenassist $0x2, %xmm0, %xmm1 - call _key_expansion_256b_local - aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key - call _key_expansion_256a_local - aeskeygenassist $0x4, %xmm0, %xmm1 - call _key_expansion_256b_local - aeskeygenassist $0x8, %xmm2, 
%xmm1 // expand the key - call _key_expansion_256a_local - aeskeygenassist $0x8, %xmm0, %xmm1 - call _key_expansion_256b_local - aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key - call _key_expansion_256a_local - aeskeygenassist $0x10, %xmm0, %xmm1 - call _key_expansion_256b_local - aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key - call _key_expansion_256a_local - aeskeygenassist $0x20, %xmm0, %xmm1 - call _key_expansion_256b_local - aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key - call _key_expansion_256a_local - -#ifdef OPENSSL_INTERFACE - xor %rax, %rax // return 0 (OK) -#else /* Open Solaris Interface */ - mov $14, %rax // return # rounds = 14 -#endif - FRAME_END - RET - -.align 4 -.Lenc_key192: - cmp $192, %KEYSIZE32 - jnz .Lenc_key128 - - // AES 192: 12 rounds in encryption key schedule -#ifdef OPENSSL_INTERFACE - mov $12, %ROUNDS32 - movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12 -#endif /* OPENSSL_INTERFACE */ - - movq 0x10(%USERCIPHERKEY), %xmm2 // other user key - aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key - call _key_expansion_192a_local - aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key - call _key_expansion_192b_local - aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key - call _key_expansion_192a_local - aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key - call _key_expansion_192b_local - aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key - call _key_expansion_192a_local - aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key - call _key_expansion_192b_local - aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key - call _key_expansion_192a_local - aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key - call _key_expansion_192b_local - -#ifdef OPENSSL_INTERFACE - xor %rax, %rax // return 0 (OK) -#else /* OpenSolaris Interface */ - mov $12, %rax // return # rounds = 12 -#endif - FRAME_END - RET - -.align 4 -.Lenc_key128: - cmp $128, %KEYSIZE32 - jnz .Lenc_key_invalid_key_bits - - // AES 128: 10 rounds in encryption 
key schedule -#ifdef OPENSSL_INTERFACE - mov $10, %ROUNDS32 - movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10 -#endif /* OPENSSL_INTERFACE */ - - aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key - call _key_expansion_128_local - -#ifdef OPENSSL_INTERFACE - xor %rax, %rax // return 0 (OK) -#else /* OpenSolaris Interface */ - mov $10, %rax // return # rounds = 10 -#endif - FRAME_END - RET - -.Lenc_key_invalid_param: -#ifdef OPENSSL_INTERFACE - mov $-1, %rax // user key or AES key pointer is NULL - FRAME_END - RET -#else - /* FALLTHROUGH */ -#endif /* OPENSSL_INTERFACE */ - -.Lenc_key_invalid_key_bits: -#ifdef OPENSSL_INTERFACE - mov $-2, %rax // keysize is invalid -#else /* Open Solaris Interface */ - xor %rax, %rax // a key pointer is NULL or invalid keysize -#endif /* OPENSSL_INTERFACE */ - FRAME_END - RET - SET_SIZE(rijndael_key_setup_enc_intel) - - -/* - * rijndael_key_setup_dec_intel() - * Expand the cipher key into the decryption key schedule. - * - * For kernel code, caller is responsible for ensuring kpreempt_disable() - * has been called. This is because %xmm registers are not saved/restored. 
- * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set - * on entry. Otherwise, if TS is not set, save and restore %xmm registers - * on the stack. - * - * OpenSolaris interface: - * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], - * uint64_t keyBits); - * Return value is 0 on error, number of rounds on success. - * P1->P2, P2->P3, P3->P1 - * - * Original Intel OpenSSL interface: - * int intel_AES_set_decrypt_key(const unsigned char *userKey, - * const int bits, AES_KEY *key); - * Return value is non-zero on error, 0 on success. - */ - -ENTRY_NP(rijndael_key_setup_dec_intel) -FRAME_BEGIN - // Generate round keys used for encryption - call rijndael_key_setup_enc_intel_local - test %rax, %rax -#ifdef OPENSSL_INTERFACE - jnz .Ldec_key_exit // Failed if returned non-0 -#else /* OpenSolaris Interface */ - jz .Ldec_key_exit // Failed if returned 0 -#endif /* OPENSSL_INTERFACE */ - - /* - * Convert round keys used for encryption - * to a form usable for decryption - */ -#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */ - mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14) - // (already set for OpenSSL) -#endif - - lea 0x10(%AESKEY), %rcx // key addr - shl $4, %ROUNDS32 - add %AESKEY, %ROUNDS64 - mov %ROUNDS64, %ENDAESKEY - -.align 4 -.Ldec_key_reorder_loop: - movups (%AESKEY), %xmm0 - movups (%ROUNDS64), %xmm1 - movups %xmm0, (%ROUNDS64) - movups %xmm1, (%AESKEY) - lea 0x10(%AESKEY), %AESKEY - lea -0x10(%ROUNDS64), %ROUNDS64 - cmp %AESKEY, %ROUNDS64 - ja .Ldec_key_reorder_loop - -.align 4 -.Ldec_key_inv_loop: - movups (%rcx), %xmm0 - // Convert an encryption round key to a form usable for decryption - // with the "AES Inverse Mix Columns" instruction - aesimc %xmm0, %xmm1 - movups %xmm1, (%rcx) - lea 0x10(%rcx), %rcx - cmp %ENDAESKEY, %rcx - jnz .Ldec_key_inv_loop - -.Ldec_key_exit: - // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error - // OpenSSL: rax = 0 for OK, or non-zero for error - FRAME_END - RET - 
SET_SIZE(rijndael_key_setup_dec_intel) - - -/* - * aes_encrypt_intel() - * Encrypt a single block (in and out can overlap). - * - * For kernel code, caller is responsible for ensuring kpreempt_disable() - * has been called. This is because %xmm registers are not saved/restored. - * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set - * on entry. Otherwise, if TS is not set, save and restore %xmm registers - * on the stack. - * - * Temporary register usage: - * %xmm0 State - * %xmm1 Key - * - * Original OpenSolaris Interface: - * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, - * const uint32_t pt[4], uint32_t ct[4]) - * - * Original Intel OpenSSL Interface: - * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, - * const AES_KEY *key) - */ - -#ifdef OPENSSL_INTERFACE -#define aes_encrypt_intel intel_AES_encrypt -#define aes_decrypt_intel intel_AES_decrypt - -#define INP rdi /* P1, 64 bits */ -#define OUTP rsi /* P2, 64 bits */ -#define KEYP rdx /* P3, 64 bits */ - -/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */ -#define NROUNDS32 ecx /* temporary, 32 bits */ -#define NROUNDS cl /* temporary, 8 bits */ - -#else /* OpenSolaris Interface */ -#define KEYP rcx /* P1, 64 bits (rdi) */ -#define NROUNDS edx /* P2, 32 bits (esi) */ -#define INP r8 /* P3, 64 bits (rdx) */ -#define OUTP r9 /* P4, 64 bits (rcx) */ -#endif /* OPENSSL_INTERFACE */ - -#define STATE xmm0 /* temporary, 128 bits */ -#define KEY xmm1 /* temporary, 128 bits */ - - -ENTRY_NP(aes_encrypt_intel) - - movups (%INP), %STATE // input - movups (%KEYP), %KEY // key -#ifdef OPENSSL_INTERFACE - mov 240(%KEYP), %NROUNDS32 // round count -#else /* OpenSolaris Interface */ - /* Round count is already present as P2 in %rsi/%esi */ -#endif /* OPENSSL_INTERFACE */ - - pxor %KEY, %STATE // round 0 - lea 0x30(%KEYP), %KEYP - cmp $12, %NROUNDS - jb .Lenc128 - lea 0x20(%KEYP), %KEYP - je .Lenc192 - - // AES 256 - lea 0x20(%KEYP), %KEYP - movups -0x60(%KEYP), 
%KEY - aesenc %KEY, %STATE - movups -0x50(%KEYP), %KEY - aesenc %KEY, %STATE - -.align 4 -.Lenc192: - // AES 192 and 256 - movups -0x40(%KEYP), %KEY - aesenc %KEY, %STATE - movups -0x30(%KEYP), %KEY - aesenc %KEY, %STATE - -.align 4 -.Lenc128: - // AES 128, 192, and 256 - movups -0x20(%KEYP), %KEY - aesenc %KEY, %STATE - movups -0x10(%KEYP), %KEY - aesenc %KEY, %STATE - movups (%KEYP), %KEY - aesenc %KEY, %STATE - movups 0x10(%KEYP), %KEY - aesenc %KEY, %STATE - movups 0x20(%KEYP), %KEY - aesenc %KEY, %STATE - movups 0x30(%KEYP), %KEY - aesenc %KEY, %STATE - movups 0x40(%KEYP), %KEY - aesenc %KEY, %STATE - movups 0x50(%KEYP), %KEY - aesenc %KEY, %STATE - movups 0x60(%KEYP), %KEY - aesenc %KEY, %STATE - movups 0x70(%KEYP), %KEY - aesenclast %KEY, %STATE // last round - movups %STATE, (%OUTP) // output - - RET - SET_SIZE(aes_encrypt_intel) - - -/* - * aes_decrypt_intel() - * Decrypt a single block (in and out can overlap). - * - * For kernel code, caller is responsible for ensuring kpreempt_disable() - * has been called. This is because %xmm registers are not saved/restored. - * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set - * on entry. Otherwise, if TS is not set, save and restore %xmm registers - * on the stack. 
- * - * Temporary register usage: - * %xmm0 State - * %xmm1 Key - * - * Original OpenSolaris Interface: - * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, - * const uint32_t pt[4], uint32_t ct[4])/ - * - * Original Intel OpenSSL Interface: - * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, - * const AES_KEY *key); - */ -ENTRY_NP(aes_decrypt_intel) - - movups (%INP), %STATE // input - movups (%KEYP), %KEY // key -#ifdef OPENSSL_INTERFACE - mov 240(%KEYP), %NROUNDS32 // round count -#else /* OpenSolaris Interface */ - /* Round count is already present as P2 in %rsi/%esi */ -#endif /* OPENSSL_INTERFACE */ - - pxor %KEY, %STATE // round 0 - lea 0x30(%KEYP), %KEYP - cmp $12, %NROUNDS - jb .Ldec128 - lea 0x20(%KEYP), %KEYP - je .Ldec192 - - // AES 256 - lea 0x20(%KEYP), %KEYP - movups -0x60(%KEYP), %KEY - aesdec %KEY, %STATE - movups -0x50(%KEYP), %KEY - aesdec %KEY, %STATE - -.align 4 -.Ldec192: - // AES 192 and 256 - movups -0x40(%KEYP), %KEY - aesdec %KEY, %STATE - movups -0x30(%KEYP), %KEY - aesdec %KEY, %STATE - -.align 4 -.Ldec128: - // AES 128, 192, and 256 - movups -0x20(%KEYP), %KEY - aesdec %KEY, %STATE - movups -0x10(%KEYP), %KEY - aesdec %KEY, %STATE - movups (%KEYP), %KEY - aesdec %KEY, %STATE - movups 0x10(%KEYP), %KEY - aesdec %KEY, %STATE - movups 0x20(%KEYP), %KEY - aesdec %KEY, %STATE - movups 0x30(%KEYP), %KEY - aesdec %KEY, %STATE - movups 0x40(%KEYP), %KEY - aesdec %KEY, %STATE - movups 0x50(%KEYP), %KEY - aesdec %KEY, %STATE - movups 0x60(%KEYP), %KEY - aesdec %KEY, %STATE - movups 0x70(%KEYP), %KEY - aesdeclast %KEY, %STATE // last round - movups %STATE, (%OUTP) // output - - RET - SET_SIZE(aes_decrypt_intel) - -#endif /* lint || __lint */ - -#ifdef __ELF__ -.section .note.GNU-stack,"",%progbits -#endif diff --git a/module/icp/asm-x86_64/os/windows/aes/aes_amd64.S b/module/icp/asm-x86_64/os/windows/aes/aes_amd64.S deleted file mode 100644 index 251ab5648b42..000000000000 --- 
a/module/icp/asm-x86_64/os/windows/aes/aes_amd64.S +++ /dev/null @@ -1,908 +0,0 @@ -/* - * --------------------------------------------------------------------------- - * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. - * - * LICENSE TERMS - * - * The free distribution and use of this software is allowed (with or without - * changes) provided that: - * - * 1. source code distributions include the above copyright notice, this - * list of conditions and the following disclaimer; - * - * 2. binary distributions include the above copyright notice, this list - * of conditions and the following disclaimer in their documentation; - * - * 3. the name of the copyright holder is not used to endorse products - * built using this software without specific written permission. - * - * DISCLAIMER - * - * This software is provided 'as is' with no explicit or implied warranties - * in respect of its properties, including, but not limited to, correctness - * and/or fitness for purpose. - * --------------------------------------------------------------------------- - * Issue 20/12/2007 - * - * I am grateful to Dag Arne Osvik for many discussions of the techniques that - * can be used to optimise AES assembler code on AMD64/EM64T architectures. - * Some of the techniques used in this implementation are the result of - * suggestions made by him for which I am most grateful. - * - * An AES implementation for AMD64 processors using the YASM assembler. This - * implementation provides only encryption, decryption and hence requires key - * scheduling support in C. It uses 8k bytes of tables but its encryption and - * decryption performance is very close to that obtained using large tables. 
- * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, - * which are as follows: - * ms windows gnu/linux/opensolaris os - * - * in_blk rcx rdi - * out_blk rdx rsi - * context (cx) r8 rdx - * - * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 - * registers rdi - on both - * - * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 - * registers - rdi on both - * - * The convention used here is that for gnu/linux/opensolaris os. - * - * This code provides the standard AES block size (128 bits, 16 bytes) and the - * three standard AES key sizes (128, 192 and 256 bits). It has the same call - * interface as my C implementation. It uses the Microsoft C AMD64 calling - * conventions in which the three parameters are placed in rcx, rdx and r8 - * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. - * - * OpenSolaris Note: - * Modified to use GNU/Linux/Solaris calling conventions. - * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. - * - * AES_RETURN aes_encrypt(const unsigned char in_blk[], - * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ - * - * AES_RETURN aes_decrypt(const unsigned char in_blk[], - * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ - * - * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[], - * const aes_encrypt_ctx cx[1])/ - * - * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[], - * const aes_decrypt_ctx cx[1])/ - * - * AES_RETURN aes_encrypt_key(const unsigned char key[], - * unsigned int len, const aes_decrypt_ctx cx[1])/ - * - * AES_RETURN aes_decrypt_key(const unsigned char key[], - * unsigned int len, const aes_decrypt_ctx cx[1])/ - * - * where <NNN> is 128, 102 or 256. In the last two calls the length can be in - * either bits or bytes. - * - * Comment in/out the following lines to obtain the desired subroutines. 
These - * selections MUST match those in the C header file aesopt.h - */ -#define AES_REV_DKS /* define if key decryption schedule is reversed */ - -#define LAST_ROUND_TABLES /* define for the faster version using extra tables */ - -/* - * The encryption key schedule has the following in memory layout where N is the - * number of rounds (10, 12 or 14): - * - * lo: | input key (round 0) | / each round is four 32-bit words - * | encryption round 1 | - * | encryption round 2 | - * .... - * | encryption round N-1 | - * hi: | encryption round N | - * - * The decryption key schedule is normally set up so that it has the same - * layout as above by actually reversing the order of the encryption key - * schedule in memory (this happens when AES_REV_DKS is set): - * - * lo: | decryption round 0 | = | encryption round N | - * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] - * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] - * .... .... - * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] - * hi: | decryption round N | = | input key (round 0) | - * - * with rounds except the first and last modified using inv_mix_column() - * But if AES_REV_DKS is NOT set the order of keys is left as it is for - * encryption so that it has to be accessed in reverse when used for - * decryption (although the inverse mix column modifications are done) - * - * lo: | decryption round 0 | = | input key (round 0) | - * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] - * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] - * .... .... - * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] - * hi: | decryption round N | = | encryption round N | - * - * This layout is faster when the assembler key scheduling provided here - * is used. 
- * - * End of user defines - */ - -/* - * --------------------------------------------------------------------------- - * OpenSolaris OS modifications - * - * This source originates from Brian Gladman file aes_amd64.asm - * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip - * with these changes: - * - * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and - * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, - * AES_128, AES_192, AES_256, AES_VAR ifdefs. - * - * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define - * - * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef - * - * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax - * (operands reversed, literals prefixed with "$", registers prefixed with "%", - * and "[register+offset]", addressing changed to "offset(register)", - * parenthesis in constant expressions "()" changed to square brackets "[]", - * "." removed from local (numeric) labels, and other changes. - * Examples: - * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax - * mov rax,(4*20h) mov $[4*0x20],%rax - * mov rax,[ebx+20h] mov 0x20(%ebx),%rax - * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax - * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax - * - * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from - * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function - * definitions for lint. - * - * 6. Renamed functions and reordered parameters to match OpenSolaris: - * Original Gladman interface: - * int aes_encrypt(const unsigned char *in, - * unsigned char *out, const aes_encrypt_ctx cx[1])/ - * int aes_decrypt(const unsigned char *in, - * unsigned char *out, const aes_encrypt_ctx cx[1])/ - * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, - * and a union type, inf., containing inf.l, a uint32_t and - * inf.b, a 4-element array of uint32_t. 
Only b[0] in the array (aka "l") is - * used and contains the key schedule length * 16 where key schedule length is - * 10, 12, or 14 bytes. - * - * OpenSolaris OS interface: - * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, - * const uint32_t pt[4], uint32_t ct[4])/ - * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, - * const uint32_t pt[4], uint32_t ct[4])/ - * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ - * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ - * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, - * ct is crypto text, and MAX_AES_NR is 14. - * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. - */ - -#define _ASM -#include - -#define KS_LENGTH 60 - -#define raxd eax -#define rdxd edx -#define rcxd ecx -#define rbxd ebx -#define rsid esi -#define rdid edi - -#define raxb al -#define rdxb dl -#define rcxb cl -#define rbxb bl -#define rsib sil -#define rdib dil - -// finite field multiplies by {02}, {04} and {08} - -#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) -#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) -#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) - -// finite field multiplies required in table generation - -#define f3(x) ((f2(x)) ^ (x)) -#define f9(x) ((f8(x)) ^ (x)) -#define fb(x) ((f8(x)) ^ (f2(x)) ^ (x)) -#define fd(x) ((f8(x)) ^ (f4(x)) ^ (x)) -#define fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x))) - -// macros for expanding S-box data - -#define u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x)) -#define v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x) -#define w8(x) (x), 0, 0, 0, (x), 0, 0, 0 - -#define enc_vals(x) \ - .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ - .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ - .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ - .byte 
x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ - .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ - .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ - .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ - .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ - .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ - .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ - .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ - .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ - .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ - .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ - .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ - .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ - .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ - .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ - .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ - .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ - .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ - .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ - .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ - .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ - .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ - .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ - .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ - .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ - .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ - .byte 
x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ - .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ - .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) - -#define dec_vals(x) \ - .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ - .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ - .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ - .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ - .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ - .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ - .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ - .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ - .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ - .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ - .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ - .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ - .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ - .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ - .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ - .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ - .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ - .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ - .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ - .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ - .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ - .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ - .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ - .byte 
x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ - .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ - .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ - .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ - .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ - .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ - .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ - .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ - .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) - -#define tptr %rbp /* table pointer */ -#define kptr %r8 /* key schedule pointer */ -#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ -#define fk_ref(x, y) -16*x+fofs+4*y(kptr) - -#ifdef AES_REV_DKS -#define rofs 128 -#define ik_ref(x, y) -16*x+rofs+4*y(kptr) - -#else -#define rofs -128 -#define ik_ref(x, y) 16*x+rofs+4*y(kptr) -#endif /* AES_REV_DKS */ - -#define tab_0(x) (tptr,x,8) -#define tab_1(x) 3(tptr,x,8) -#define tab_2(x) 2(tptr,x,8) -#define tab_3(x) 1(tptr,x,8) -#define tab_f(x) 1(tptr,x,8) -#define tab_i(x) 7(tptr,x,8) - -#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ - mov fk_ref(round,0), p1; \ - mov fk_ref(round,1), p2; \ - mov fk_ref(round,2), p3; \ - mov fk_ref(round,3), p4; \ - \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - shr $16, %eax; \ - xor tab_0(%rsi), p1; \ - xor tab_1(%rdi), p4; \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - xor tab_2(%rsi), p3; \ - xor tab_3(%rdi), p2; \ - \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - shr $16, %ebx; \ - xor tab_0(%rsi), p2; \ - xor tab_1(%rdi), p1; \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - xor tab_2(%rsi), p4; \ - xor tab_3(%rdi), p3; \ - \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - shr $16, %ecx; \ - xor tab_0(%rsi), p3; \ - xor tab_1(%rdi), p2; \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - xor 
tab_2(%rsi), p1; \ - xor tab_3(%rdi), p4; \ - \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - shr $16, %edx; \ - xor tab_0(%rsi), p4; \ - xor tab_1(%rdi), p3; \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - xor tab_2(%rsi), p2; \ - xor tab_3(%rdi), p1; \ - \ - mov p1, %eax; \ - mov p2, %ebx; \ - mov p3, %ecx; \ - mov p4, %edx - -#ifdef LAST_ROUND_TABLES - -#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ - add $2048, tptr; \ - mov fk_ref(round,0), p1; \ - mov fk_ref(round,1), p2; \ - mov fk_ref(round,2), p3; \ - mov fk_ref(round,3), p4; \ - \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - shr $16, %eax; \ - xor tab_0(%rsi), p1; \ - xor tab_1(%rdi), p4; \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - xor tab_2(%rsi), p3; \ - xor tab_3(%rdi), p2; \ - \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - shr $16, %ebx; \ - xor tab_0(%rsi), p2; \ - xor tab_1(%rdi), p1; \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - xor tab_2(%rsi), p4; \ - xor tab_3(%rdi), p3; \ - \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - shr $16, %ecx; \ - xor tab_0(%rsi), p3; \ - xor tab_1(%rdi), p2; \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - xor tab_2(%rsi), p1; \ - xor tab_3(%rdi), p4; \ - \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - shr $16, %edx; \ - xor tab_0(%rsi), p4; \ - xor tab_1(%rdi), p3; \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - xor tab_2(%rsi), p2; \ - xor tab_3(%rdi), p1 - -#else - -#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ - mov fk_ref(round,0), p1; \ - mov fk_ref(round,1), p2; \ - mov fk_ref(round,2), p3; \ - mov fk_ref(round,3), p4; \ - \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - shr $16, %eax; \ - movzx tab_f(%rsi), %esi; \ - movzx tab_f(%rdi), %edi; \ - xor %esi, p1; \ - rol $8, %edi; \ - xor %edi, p4; \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - movzx tab_f(%rsi), %esi; \ - movzx tab_f(%rdi), %edi; \ - rol $16, %esi; \ - rol $24, %edi; \ - xor %esi, p3; \ - xor %edi, p2; \ - \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - shr $16, %ebx; \ - movzx 
tab_f(%rsi), %esi; \ - movzx tab_f(%rdi), %edi; \ - xor %esi, p2; \ - rol $8, %edi; \ - xor %edi, p1; \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - movzx tab_f(%rsi), %esi; \ - movzx tab_f(%rdi), %edi; \ - rol $16, %esi; \ - rol $24, %edi; \ - xor %esi, p4; \ - xor %edi, p3; \ - \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - movzx tab_f(%rsi), %esi; \ - movzx tab_f(%rdi), %edi; \ - shr $16, %ecx; \ - xor %esi, p3; \ - rol $8, %edi; \ - xor %edi, p2; \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - movzx tab_f(%rsi), %esi; \ - movzx tab_f(%rdi), %edi; \ - rol $16, %esi; \ - rol $24, %edi; \ - xor %esi, p1; \ - xor %edi, p4; \ - \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - movzx tab_f(%rsi), %esi; \ - movzx tab_f(%rdi), %edi; \ - shr $16, %edx; \ - xor %esi, p4; \ - rol $8, %edi; \ - xor %edi, p3; \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - movzx tab_f(%rsi), %esi; \ - movzx tab_f(%rdi), %edi; \ - rol $16, %esi; \ - rol $24, %edi; \ - xor %esi, p2; \ - xor %edi, p1 - -#endif /* LAST_ROUND_TABLES */ - -#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ - mov ik_ref(round,0), p1; \ - mov ik_ref(round,1), p2; \ - mov ik_ref(round,2), p3; \ - mov ik_ref(round,3), p4; \ - \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - shr $16, %eax; \ - xor tab_0(%rsi), p1; \ - xor tab_1(%rdi), p2; \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - xor tab_2(%rsi), p3; \ - xor tab_3(%rdi), p4; \ - \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - shr $16, %ebx; \ - xor tab_0(%rsi), p2; \ - xor tab_1(%rdi), p3; \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - xor tab_2(%rsi), p4; \ - xor tab_3(%rdi), p1; \ - \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - shr $16, %ecx; \ - xor tab_0(%rsi), p3; \ - xor tab_1(%rdi), p4; \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - xor tab_2(%rsi), p1; \ - xor tab_3(%rdi), p2; \ - \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - shr $16, %edx; \ - xor tab_0(%rsi), p4; \ - xor tab_1(%rdi), p1; \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - xor tab_2(%rsi), p2; \ 
- xor tab_3(%rdi), p3; \ - \ - mov p1, %eax; \ - mov p2, %ebx; \ - mov p3, %ecx; \ - mov p4, %edx - -#ifdef LAST_ROUND_TABLES - -#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ - add $2048, tptr; \ - mov ik_ref(round,0), p1; \ - mov ik_ref(round,1), p2; \ - mov ik_ref(round,2), p3; \ - mov ik_ref(round,3), p4; \ - \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - shr $16, %eax; \ - xor tab_0(%rsi), p1; \ - xor tab_1(%rdi), p2; \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - xor tab_2(%rsi), p3; \ - xor tab_3(%rdi), p4; \ - \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - shr $16, %ebx; \ - xor tab_0(%rsi), p2; \ - xor tab_1(%rdi), p3; \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - xor tab_2(%rsi), p4; \ - xor tab_3(%rdi), p1; \ - \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - shr $16, %ecx; \ - xor tab_0(%rsi), p3; \ - xor tab_1(%rdi), p4; \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - xor tab_2(%rsi), p1; \ - xor tab_3(%rdi), p2; \ - \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - shr $16, %edx; \ - xor tab_0(%rsi), p4; \ - xor tab_1(%rdi), p1; \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - xor tab_2(%rsi), p2; \ - xor tab_3(%rdi), p3 - -#else - -#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ - mov ik_ref(round,0), p1; \ - mov ik_ref(round,1), p2; \ - mov ik_ref(round,2), p3; \ - mov ik_ref(round,3), p4; \ - \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - movzx tab_i(%rsi), %esi; \ - movzx tab_i(%rdi), %edi; \ - shr $16, %eax; \ - xor %esi, p1; \ - rol $8, %edi; \ - xor %edi, p2; \ - movzx %al, %esi; \ - movzx %ah, %edi; \ - movzx tab_i(%rsi), %esi; \ - movzx tab_i(%rdi), %edi; \ - rol $16, %esi; \ - rol $24, %edi; \ - xor %esi, p3; \ - xor %edi, p4; \ - \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - movzx tab_i(%rsi), %esi; \ - movzx tab_i(%rdi), %edi; \ - shr $16, %ebx; \ - xor %esi, p2; \ - rol $8, %edi; \ - xor %edi, p3; \ - movzx %bl, %esi; \ - movzx %bh, %edi; \ - movzx tab_i(%rsi), %esi; \ - movzx tab_i(%rdi), %edi; \ - rol $16, %esi; \ - 
rol $24, %edi; \ - xor %esi, p4; \ - xor %edi, p1; \ - \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - movzx tab_i(%rsi), %esi; \ - movzx tab_i(%rdi), %edi; \ - shr $16, %ecx; \ - xor %esi, p3; \ - rol $8, %edi; \ - xor %edi, p4; \ - movzx %cl, %esi; \ - movzx %ch, %edi; \ - movzx tab_i(%rsi), %esi; \ - movzx tab_i(%rdi), %edi; \ - rol $16, %esi; \ - rol $24, %edi; \ - xor %esi, p1; \ - xor %edi, p2; \ - \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - movzx tab_i(%rsi), %esi; \ - movzx tab_i(%rdi), %edi; \ - shr $16, %edx; \ - xor %esi, p4; \ - rol $8, %edi; \ - xor %edi, p1; \ - movzx %dl, %esi; \ - movzx %dh, %edi; \ - movzx tab_i(%rsi), %esi; \ - movzx tab_i(%rdi), %edi; \ - rol $16, %esi; \ - rol $24, %edi; \ - xor %esi, p2; \ - xor %edi, p3 - -#endif /* LAST_ROUND_TABLES */ - -// Windows x64: -// Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3) -// Return: rax (float: xmm0) -// Volatile: rax, rcx, rdx, r8-r11 -// Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 (xmm6, xmm15) - -// Unix x64: -// Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7) -// Return: rax (float: xmm0) -// Volatile: -// Nonvolatile: rbx, rbp, rsp, r12-r15 - -// outcome: - -/* - * OpenSolaris OS: - * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, - * const uint32_t pt[4], uint32_t ct[4])/ - * - * Original interface: - * int aes_encrypt(const unsigned char *in, - * unsigned char *out, const aes_encrypt_ctx cx[1])/ - */ - .align 64 -enc_tab: - enc_vals(u8) -#ifdef LAST_ROUND_TABLES - // Last Round Tables: - enc_vals(w8) -#endif - - - ENTRY_NP(aes_encrypt_amd64) -#ifdef GLADMAN_INTERFACE - // Original interface - sub $[4*8], %rsp // gnu/linux/opensolaris binary interface - mov %rsi, (%rsp) // output pointer (P2) - mov %rdx, %r8 // context (P3) - - mov %rbx, 1*8(%rsp) // P1: input pointer in rdi - mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) - mov %r12, 3*8(%rsp) // P3: context in r8 - movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 - -#else - - // Windows interface - push 
%rdi - push %rsi - sub $(4*8), %rsp // Make room on stack to save registers - mov %r9, (%rsp) // Save output pointer (P4) on stack (was rcx) - mov %r8, %rdi // P3: save input pointer (was rdx) - mov %rcx, %r8 // context (P1) (was rdi) - mov %edx, %esi // P2: to esi. - shl $4, %esi // P2: esi byte key length * 16 (was esi) - - mov %rbx, 1*8(%rsp) // Save registers - mov %rbp, 2*8(%rsp) - mov %r12, 3*8(%rsp) - // P1: context in r8 - // P2: byte key length * 16 in esi - // P3: input pointer in rdi - // P4: output pointer in (rsp) -#endif /* GLADMAN_INTERFACE */ - - lea enc_tab(%rip), tptr - sub $fofs, kptr - - // Load input block into registers - mov (%rdi), %eax - mov 1*4(%rdi), %ebx - mov 2*4(%rdi), %ecx - mov 3*4(%rdi), %edx - - xor fofs(kptr), %eax - xor fofs+4(kptr), %ebx - xor fofs+8(kptr), %ecx - xor fofs+12(kptr), %edx - - lea (kptr,%rsi), kptr - // Jump based on byte key length * 16: - cmp $(10*16), %esi - je 3f - cmp $(12*16), %esi - je 2f - cmp $(14*16), %esi - je 1f - mov $-1, %rax // error - jmp 4f - - // Perform normal forward rounds -1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) -2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) -3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) - ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) - fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) - - // Copy results - mov (%rsp), %rbx - mov %r9d, (%rbx) - mov %r10d, 4(%rbx) - mov %r11d, 8(%rbx) - mov %r12d, 12(%rbx) - xor %rax, %rax -4: // Restore registers - mov 1*8(%rsp), %rbx - mov 2*8(%rsp), %rbp - mov 3*8(%rsp), %r12 - add $(4*8), %rsp - pop %rsi - pop %rdi - ret - - SET_SIZE(aes_encrypt_amd64) - -/* - * OpenSolaris OS: - * void aes_decrypt_amd64(const aes_ks_t *ks, 
int Nr, - * const uint32_t pt[4], uint32_t ct[4])/ - * - * Original interface: - * int aes_decrypt(const unsigned char *in, - * unsigned char *out, const aes_encrypt_ctx cx[1])/ - */ - .align 64 -dec_tab: - dec_vals(v8) -#ifdef LAST_ROUND_TABLES - // Last Round Tables: - dec_vals(w8) -#endif - - - ENTRY_NP(aes_decrypt_amd64) -#ifdef GLADMAN_INTERFACE - // Original interface - sub $[4*8], %rsp // gnu/linux/opensolaris binary interface - mov %rsi, (%rsp) // output pointer (P2) - mov %rdx, %r8 // context (P3) - - mov %rbx, 1*8(%rsp) // P1: input pointer in rdi - mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) - mov %r12, 3*8(%rsp) // P3: context in r8 - movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 - -#else - // Windows interface - push %rdi - push %rsi - - sub $(4*8), %rsp // Make room on stack to save registers - mov %r9, (%rsp) // Save output pointer (P4) on stack was rcx - mov %r8, %rdi // P3: save input pointer was rdx - mov %rcx, %r8 // context (P1) was rdi - mov %edx, %esi // P2 - shl $4, %esi // P2: esi byte key length * 16 was esi - - mov %rbx, 1*8(%rsp) // Save registers - mov %rbp, 2*8(%rsp) - mov %r12, 3*8(%rsp) - // P1: context in r8 - // P2: byte key length * 16 in esi - // P3: input pointer in rdi - // P4: output pointer in (rsp) -#endif /* GLADMAN_INTERFACE */ - - lea dec_tab(%rip), tptr - sub $rofs, kptr - - // Load input block into registers - mov (%rdi), %eax - mov 1*4(%rdi), %ebx - mov 2*4(%rdi), %ecx - mov 3*4(%rdi), %edx - -#ifdef AES_REV_DKS - mov kptr, %rdi - lea (kptr,%rsi), kptr -#else - lea (kptr,%rsi), %rdi -#endif - - xor rofs(%rdi), %eax - xor rofs+4(%rdi), %ebx - xor rofs+8(%rdi), %ecx - xor rofs+12(%rdi), %edx - - // Jump based on byte key length * 16: - cmp $(10*16), %esi - je 3f - cmp $(12*16), %esi - je 2f - cmp $(14*16), %esi - je 1f - mov $-1, %rax // error - jmp 4f - - // Perform normal inverse rounds -1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) -2: ii_rnd(%r9d, %r10d, %r11d, 
%r12d, 11) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) -3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) - ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) - il_rnd(%r9d, %r10d, %r11d, %r12d, 0) - - // Copy results - mov (%rsp), %rbx - mov %r9d, (%rbx) - mov %r10d, 4(%rbx) - mov %r11d, 8(%rbx) - mov %r12d, 12(%rbx) - xor %rax, %rax -4: // Restore registers - mov 1*8(%rsp), %rbx - mov 2*8(%rsp), %rbp - mov 3*8(%rsp), %r12 - add $(4*8), %rsp - pop %rsi - pop %rdi - ret - - SET_SIZE(aes_decrypt_amd64) diff --git a/module/icp/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S deleted file mode 100644 index 1e82258a8621..000000000000 --- a/module/icp/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S +++ /dev/null @@ -1,267 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2009 Intel Corporation - * All Rights Reserved. - */ -/* - * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. - * Use is subject to license terms. - */ - -/* - * Accelerated GHASH implementation with Intel PCLMULQDQ-NI - * instructions. This file contains an accelerated - * Galois Field Multiplication implementation. - * - * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, - * carry-less multiplication. More information about PCLMULQDQ can be - * found at: - * http://software.intel.com/en-us/articles/ - * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ - * - */ - -/* - * ==================================================================== - * OpenSolaris OS modifications - * - * This source originates as file galois_hash_asm.c from - * Intel Corporation dated September 21, 2009. - * - * This OpenSolaris version has these major changes from the original source: - * - * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from - * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function - * definition for lint. - * - * 2. Formatted code, added comments, and added #includes and #defines. - * - * 3. If bit CR0.TS is set, clear and set the TS bit, after and before - * calling kpreempt_disable() and kpreempt_enable(). - * If the TS bit is not set, Save and restore %xmm registers at the beginning - * and end of function calls (%xmm* registers are not saved and restored by - * during kernel thread preemption). - * - * 4. Removed code to perform hashing. This is already done with C macro - * GHASH in gcm.c. For better performance, this removed code should be - * reintegrated in the future to replace the C GHASH macro. - * - * 5. Added code to byte swap 16-byte input and output. - * - * 6. Folded in comments from the original C source with embedded assembly - * (SB_w_shift_xor.c) - * - * 7. 
Renamed function and reordered parameters to match OpenSolaris: - * Intel interface: - * void galois_hash_asm(unsigned char *hk, unsigned char *s, - * unsigned char *d, int length) - * OpenSolaris OS interface: - * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); - * ==================================================================== - */ - - -#if defined(lint) || defined(__lint) /* lint */ - -#include - -void -gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { - (void) x_in, (void) y, (void) res; -} - -#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */ - -#define _ASM -#include - -/* - * Use this mask to byte-swap a 16-byte integer with the pshufb instruction - */ - -// static uint8_t byte_swap16_mask[] = { -// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; -.section .rodata -.align XMM_ALIGN -.Lbyte_swap16_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - - -/* - * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); - * - * Perform a carry-less multiplication (that is, use XOR instead of the - * multiply operator) on P1 and P2 and place the result in P3. - * - * Byte swap the input and the output. - * - * Note: x_in, y, and res all point to a block of 16-byte numbers - * (an array of two 64-bit integers). - * - * Note2: For kernel code, caller is responsible for ensuring - * kpreempt_disable() has been called. This is because %xmm registers are - * not saved/restored. Clear and set the CR0.TS bit on entry and exit, - * respectively, if TS is set on entry. Otherwise, if TS is not set, - * save and restore %xmm registers on the stack. 
- * - * Note3: Original Intel definition: - * void galois_hash_asm(unsigned char *hk, unsigned char *s, - * unsigned char *d, int length) - * - * Note4: Register/parameter mapping: - * Intel: - * Parameter 1: %rcx (copied to %xmm0) hk or x_in - * Parameter 2: %rdx (copied to %xmm1) s or y - * Parameter 3: %rdi (result) d or res - * OpenSolaris: - * Parameter 1: %rdi (copied to %xmm0) x_in - * Parameter 2: %rsi (copied to %xmm1) y - * Parameter 3: %rdx (result) res - */ -// Windows x64: -// Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3) -// Return: rax (float: xmm0) -// Volatile: rax, rcx, rdx, r8-r11 -// Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 (xmm6-xmm15) - -// Unix x64: -// Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7) -// Return: rax (float: xmm0) -// Volatile: -// Nonvolatile: rbx, rbp, rsp, r12-r15 - -// outcome: P1 (x_in) is read via %rcx, P2 (y) via %rdx, P3 (res) is written via %r8. NOTE(review): %xmm6-%xmm10 are written below with no save/restore visible here - confirm the caller preserves them. - -ENTRY_NP(gcm_mul_pclmulqdq) - // - // Copy Parameters - // - movdqu (%rcx), %xmm0 // P1 - movdqu (%rdx), %xmm1 // P2 - - // - // Byte swap 16-byte input - // - lea .Lbyte_swap16_mask(%rip), %rax - movups (%rax), %xmm10 - pshufb %xmm10, %xmm0 - pshufb %xmm10, %xmm1 - - - // - // Multiply with the hash key - // - movdqu %xmm0, %xmm3 - pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 - - movdqu %xmm0, %xmm4 - pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 - - movdqu %xmm0, %xmm5 - pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 - movdqu %xmm0, %xmm6 - pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 - - pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 - - movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 - psrldq $8, %xmm4 // shift xmm4 by 64 bits to the right - pslldq $8, %xmm5 // shift xmm5 by 64 bits to the left - pxor %xmm5, %xmm3 - pxor %xmm4, %xmm6 // Register pair xmm6:xmm3 holds the result - // of the carry-less multiplication of - // xmm0 by xmm1. 
- - // We shift the result of the multiplication by one bit position - // to the left to compensate for the fact that the bits are reversed. 
- movdqu %xmm3, %xmm7 - movdqu %xmm6, %xmm8 - pslld $1, %xmm3 - pslld $1, %xmm6 - psrld $31, %xmm7 - psrld $31, %xmm8 - movdqu %xmm7, %xmm9 - pslldq $4, %xmm8 - pslldq $4, %xmm7 - psrldq $12, %xmm9 - por %xmm7, %xmm3 - por %xmm8, %xmm6 - por %xmm9, %xmm6 - - // - // First phase of the reduction - // - // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts - // independently. - movdqu %xmm3, %xmm7 - movdqu %xmm3, %xmm8 - movdqu %xmm3, %xmm9 - pslld $31, %xmm7 // packed left shift << 31 - pslld $30, %xmm8 // packed left shift << 30 - pslld $25, %xmm9 // packed left shift << 25 - pxor %xmm8, %xmm7 // xor the shifted versions - pxor %xmm9, %xmm7 - movdqu %xmm7, %xmm8 - pslldq $12, %xmm7 - psrldq $4, %xmm8 - pxor %xmm7, %xmm3 // first phase of the reduction complete - - // - // Second phase of the reduction - // - // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these - // shift operations. - movdqu %xmm3, %xmm2 - movdqu %xmm3, %xmm4 - movdqu %xmm3, %xmm5 - psrld $1, %xmm2 // packed right shift >> 1 - psrld $2, %xmm4 // packed right shift >> 2 - psrld $7, %xmm5 // packed right shift >> 7 - pxor %xmm4, %xmm2 // xor the shifted versions - pxor %xmm5, %xmm2 - pxor %xmm8, %xmm2 - pxor %xmm2, %xmm3 - pxor %xmm3, %xmm6 // the result is in xmm6 - - // - // Byte swap 16-byte result - // - pshufb %xmm10, %xmm6 // %xmm10 has the swap mask - - // - // Store the result - // - movdqu %xmm6, (%r8) // P3 - - - // - // Return - // - RET - SET_SIZE(gcm_mul_pclmulqdq) - -#endif /* lint || __lint */ - -#ifdef __ELF__ -.section .note.GNU-stack,"",%progbits -#endif diff --git a/module/icp/asm-x86_64/os/windows/sha2/sha256_impl.S b/module/icp/asm-x86_64/os/windows/sha2/sha256_impl.S deleted file mode 100644 index d406e9cc11b8..000000000000 --- a/module/icp/asm-x86_64/os/windows/sha2/sha256_impl.S +++ /dev/null @@ -1,2078 +0,0 @@ - -/* - * ==================================================================== - * Written by Andy Polyakov for 
the OpenSSL - * project. Rights for redistribution and usage in source and binary - * forms are granted according to the OpenSSL license. - * ==================================================================== - * - * sha256/512_block procedure for x86_64. - * - * 40% improvement over compiler-generated code on Opteron. On EM64T - * sha256 was observed to run >80% faster and sha512 - >40%. No magical - * tricks, just straight implementation... I really wonder why gcc - * [being armed with inline assembler] fails to generate as fast code. - * The only thing which is cool about this module is that it's very - * same instruction sequence used for both SHA-256 and SHA-512. In - * former case the instructions operate on 32-bit operands, while in - * latter - on 64-bit ones. All I had to do is to get one flavor right, - * the other one passed the test right away:-) - * - * sha256_block runs in ~1005 cycles on Opteron, which gives you - * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock - * frequency in GHz. sha512_block runs in ~1275 cycles, which results - * in 128*1000/1275=100MBps per GHz. Is there room for improvement? - * Well, if you compare it to IA-64 implementation, which maintains - * X[16] in register bank[!], tends to 4 instructions per CPU clock - * cycle and runs in 1003 cycles, 1275 is very good result for 3-way - * issue Opteron pipeline and X[16] maintained in memory. So that *if* - * there is a way to improve it, *then* the only way would be to try to - * offload X[16] updates to SSE unit, but that would require "deeper" - * loop unroll, which in turn would naturally cause size blow-up, not - * to mention increased complexity! And once again, only *if* it's - * actually possible to noticeably improve overall ILP, instruction - * level parallelism, on a given CPU implementation in this case. - * - * Special note on Intel EM64T. 
While Opteron CPU exhibits perfect - * perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], - * [currently available] EM64T CPUs apparently are far from it. On the - * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit - * sha256_block:-( This is presumably because 64-bit shifts/rotates - * apparently are not atomic instructions, but implemented in microcode. - */ - -/* - * OpenSolaris OS modifications - * - * Sun elects to use this software under the BSD license. - * - * This source originates from OpenSSL file sha512-x86_64.pl at - * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz - * (presumably for future OpenSSL release 0.9.8h), with these changes: - * - * 1. Added perl "use strict" and declared variables. - * - * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from - * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. - * - * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) - * assemblers). Replaced the .picmeup macro with assembler code. - * - * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", - * at the beginning of SHA2_CTX (the next field is 8-byte aligned). - */ - -/* - * This file was generated by a perl script (sha512-x86_64.pl) that were - * used to generate sha256 and sha512 variants from the same code base. - * The comments from the original file have been pasted above. - */ - -#if 0 -/* ARGSUSED */ -void -SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) -{ -} -#endif - -#define _ASM -#include - -// Windows x64: -// Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3) -// Return: rax (float: xmm0) -// Volatile: rax, rcx, rdx, r8-r11 -// Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 - -// Unix x64: -// Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7) -// Return: rax (float: xmm0) -// Volatile: -// Nonvolatile: rbx, rbp, rsp, r12-r15 - -// outcome: -// rdi -> rcx -// save rdi, rsi. 
- -ENTRY_NP(SHA256TransformBlocks) - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - - push %rdi # win - push %rsi # win - mov %rcx, %rdi # win fix arg1 - mov %rdx, %rsi # win fix arg2 - mov %r8, %rdx # win fix arg3 - - mov %rsp,%rbp # copy %rsp - shl $4,%rdx # num*16 - sub $16*4+4*8,%rsp - lea (%rsi,%rdx,4),%rdx # inp+num*16*4 - and $-64,%rsp # align stack frame - add $8,%rdi # Skip OpenSolaris field, "algotype" - mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg - mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg - mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg - mov %rbp,16*4+3*8(%rsp) # save copy of %rsp - - //.picmeup %rbp - // The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts - // the address of the "next" instruction into the target register - // (%rbp). This generates these 2 instructions: - lea .Llea(%rip),%rbp - //nop // .picmeup generates a nop for mod 8 alignment--not needed here - -.Llea: - lea K256-.(%rbp),%rbp - - mov 4*0(%rdi),%eax - mov 4*1(%rdi),%ebx - mov 4*2(%rdi),%ecx - mov 4*3(%rdi),%edx - mov 4*4(%rdi),%r8d - mov 4*5(%rdi),%r9d - mov 4*6(%rdi),%r10d - mov 4*7(%rdi),%r11d - jmp .Lloop - -.align 4, 0x90 -.Lloop: - xor %rdi,%rdi - mov 4*0(%rsi),%r12d - bswap %r12d - mov %r8d,%r13d - mov %r8d,%r14d - mov %r9d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r10d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r8d,%r15d # (f^g)&e - mov %r12d,0(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r11d,%r12d # T1+=h - - mov %eax,%r11d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %eax,%r13d - mov %eax,%r14d - - ror $2,%r11d - ror $13,%r13d - mov %eax,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r11d - ror $9,%r13d - or %ecx,%r14d # a|c - - xor %r13d,%r11d # h=Sigma0(a) - and %ecx,%r15d # a&c - add %r12d,%edx # d+=T1 - - and %ebx,%r14d # (a|c)&b - add %r12d,%r11d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 
1(%rdi),%rdi # round++ - - add %r14d,%r11d # h+=Maj(a,b,c) - mov 4*1(%rsi),%r12d - bswap %r12d - mov %edx,%r13d - mov %edx,%r14d - mov %r8d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r9d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %edx,%r15d # (f^g)&e - mov %r12d,4(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r10d,%r12d # T1+=h - - mov %r11d,%r10d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r11d,%r13d - mov %r11d,%r14d - - ror $2,%r10d - ror $13,%r13d - mov %r11d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r10d - ror $9,%r13d - or %ebx,%r14d # a|c - - xor %r13d,%r10d # h=Sigma0(a) - and %ebx,%r15d # a&c - add %r12d,%ecx # d+=T1 - - and %eax,%r14d # (a|c)&b - add %r12d,%r10d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r10d # h+=Maj(a,b,c) - mov 4*2(%rsi),%r12d - bswap %r12d - mov %ecx,%r13d - mov %ecx,%r14d - mov %edx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r8d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ecx,%r15d # (f^g)&e - mov %r12d,8(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r9d,%r12d # T1+=h - - mov %r10d,%r9d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r10d,%r13d - mov %r10d,%r14d - - ror $2,%r9d - ror $13,%r13d - mov %r10d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r9d - ror $9,%r13d - or %eax,%r14d # a|c - - xor %r13d,%r9d # h=Sigma0(a) - and %eax,%r15d # a&c - add %r12d,%ebx # d+=T1 - - and %r11d,%r14d # (a|c)&b - add %r12d,%r9d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r9d # h+=Maj(a,b,c) - mov 4*3(%rsi),%r12d - bswap %r12d - mov %ebx,%r13d - mov %ebx,%r14d - mov %ecx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %edx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ebx,%r15d # (f^g)&e - mov %r12d,12(%rsp) - - xor %r14d,%r13d 
# Sigma1(e) - xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r8d,%r12d # T1+=h - - mov %r9d,%r8d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r9d,%r13d - mov %r9d,%r14d - - ror $2,%r8d - ror $13,%r13d - mov %r9d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r8d - ror $9,%r13d - or %r11d,%r14d # a|c - - xor %r13d,%r8d # h=Sigma0(a) - and %r11d,%r15d # a&c - add %r12d,%eax # d+=T1 - - and %r10d,%r14d # (a|c)&b - add %r12d,%r8d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r8d # h+=Maj(a,b,c) - mov 4*4(%rsi),%r12d - bswap %r12d - mov %eax,%r13d - mov %eax,%r14d - mov %ebx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ecx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %eax,%r15d # (f^g)&e - mov %r12d,16(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %edx,%r12d # T1+=h - - mov %r8d,%edx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r8d,%r13d - mov %r8d,%r14d - - ror $2,%edx - ror $13,%r13d - mov %r8d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%edx - ror $9,%r13d - or %r10d,%r14d # a|c - - xor %r13d,%edx # h=Sigma0(a) - and %r10d,%r15d # a&c - add %r12d,%r11d # d+=T1 - - and %r9d,%r14d # (a|c)&b - add %r12d,%edx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%edx # h+=Maj(a,b,c) - mov 4*5(%rsi),%r12d - bswap %r12d - mov %r11d,%r13d - mov %r11d,%r14d - mov %eax,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ebx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r11d,%r15d # (f^g)&e - mov %r12d,20(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ecx,%r12d # T1+=h - - mov %edx,%ecx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %edx,%r13d - mov %edx,%r14d - - ror $2,%ecx - ror $13,%r13d - mov %edx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ecx - 
ror $9,%r13d - or %r9d,%r14d # a|c - - xor %r13d,%ecx # h=Sigma0(a) - and %r9d,%r15d # a&c - add %r12d,%r10d # d+=T1 - - and %r8d,%r14d # (a|c)&b - add %r12d,%ecx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ecx # h+=Maj(a,b,c) - mov 4*6(%rsi),%r12d - bswap %r12d - mov %r10d,%r13d - mov %r10d,%r14d - mov %r11d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %eax,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r10d,%r15d # (f^g)&e - mov %r12d,24(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ebx,%r12d # T1+=h - - mov %ecx,%ebx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ecx,%r13d - mov %ecx,%r14d - - ror $2,%ebx - ror $13,%r13d - mov %ecx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ebx - ror $9,%r13d - or %r8d,%r14d # a|c - - xor %r13d,%ebx # h=Sigma0(a) - and %r8d,%r15d # a&c - add %r12d,%r9d # d+=T1 - - and %edx,%r14d # (a|c)&b - add %r12d,%ebx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ebx # h+=Maj(a,b,c) - mov 4*7(%rsi),%r12d - bswap %r12d - mov %r9d,%r13d - mov %r9d,%r14d - mov %r10d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r11d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r9d,%r15d # (f^g)&e - mov %r12d,28(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %eax,%r12d # T1+=h - - mov %ebx,%eax - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ebx,%r13d - mov %ebx,%r14d - - ror $2,%eax - ror $13,%r13d - mov %ebx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%eax - ror $9,%r13d - or %edx,%r14d # a|c - - xor %r13d,%eax # h=Sigma0(a) - and %edx,%r15d # a&c - add %r12d,%r8d # d+=T1 - - and %ecx,%r14d # (a|c)&b - add %r12d,%eax # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%eax # h+=Maj(a,b,c) - mov 4*8(%rsi),%r12d - bswap 
%r12d - mov %r8d,%r13d - mov %r8d,%r14d - mov %r9d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r10d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r8d,%r15d # (f^g)&e - mov %r12d,32(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r11d,%r12d # T1+=h - - mov %eax,%r11d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %eax,%r13d - mov %eax,%r14d - - ror $2,%r11d - ror $13,%r13d - mov %eax,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r11d - ror $9,%r13d - or %ecx,%r14d # a|c - - xor %r13d,%r11d # h=Sigma0(a) - and %ecx,%r15d # a&c - add %r12d,%edx # d+=T1 - - and %ebx,%r14d # (a|c)&b - add %r12d,%r11d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r11d # h+=Maj(a,b,c) - mov 4*9(%rsi),%r12d - bswap %r12d - mov %edx,%r13d - mov %edx,%r14d - mov %r8d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r9d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %edx,%r15d # (f^g)&e - mov %r12d,36(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r10d,%r12d # T1+=h - - mov %r11d,%r10d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r11d,%r13d - mov %r11d,%r14d - - ror $2,%r10d - ror $13,%r13d - mov %r11d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r10d - ror $9,%r13d - or %ebx,%r14d # a|c - - xor %r13d,%r10d # h=Sigma0(a) - and %ebx,%r15d # a&c - add %r12d,%ecx # d+=T1 - - and %eax,%r14d # (a|c)&b - add %r12d,%r10d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r10d # h+=Maj(a,b,c) - mov 4*10(%rsi),%r12d - bswap %r12d - mov %ecx,%r13d - mov %ecx,%r14d - mov %edx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r8d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ecx,%r15d # (f^g)&e - mov %r12d,40(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r9d,%r12d # T1+=h - - 
mov %r10d,%r9d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r10d,%r13d - mov %r10d,%r14d - - ror $2,%r9d - ror $13,%r13d - mov %r10d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r9d - ror $9,%r13d - or %eax,%r14d # a|c - - xor %r13d,%r9d # h=Sigma0(a) - and %eax,%r15d # a&c - add %r12d,%ebx # d+=T1 - - and %r11d,%r14d # (a|c)&b - add %r12d,%r9d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r9d # h+=Maj(a,b,c) - mov 4*11(%rsi),%r12d - bswap %r12d - mov %ebx,%r13d - mov %ebx,%r14d - mov %ecx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %edx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ebx,%r15d # (f^g)&e - mov %r12d,44(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r8d,%r12d # T1+=h - - mov %r9d,%r8d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r9d,%r13d - mov %r9d,%r14d - - ror $2,%r8d - ror $13,%r13d - mov %r9d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r8d - ror $9,%r13d - or %r11d,%r14d # a|c - - xor %r13d,%r8d # h=Sigma0(a) - and %r11d,%r15d # a&c - add %r12d,%eax # d+=T1 - - and %r10d,%r14d # (a|c)&b - add %r12d,%r8d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r8d # h+=Maj(a,b,c) - mov 4*12(%rsi),%r12d - bswap %r12d - mov %eax,%r13d - mov %eax,%r14d - mov %ebx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ecx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %eax,%r15d # (f^g)&e - mov %r12d,48(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %edx,%r12d # T1+=h - - mov %r8d,%edx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r8d,%r13d - mov %r8d,%r14d - - ror $2,%edx - ror $13,%r13d - mov %r8d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%edx - ror $9,%r13d - or %r10d,%r14d # a|c - - xor %r13d,%edx # h=Sigma0(a) - and 
%r10d,%r15d # a&c - add %r12d,%r11d # d+=T1 - - and %r9d,%r14d # (a|c)&b - add %r12d,%edx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%edx # h+=Maj(a,b,c) - mov 4*13(%rsi),%r12d - bswap %r12d - mov %r11d,%r13d - mov %r11d,%r14d - mov %eax,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ebx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r11d,%r15d # (f^g)&e - mov %r12d,52(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ecx,%r12d # T1+=h - - mov %edx,%ecx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %edx,%r13d - mov %edx,%r14d - - ror $2,%ecx - ror $13,%r13d - mov %edx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ecx - ror $9,%r13d - or %r9d,%r14d # a|c - - xor %r13d,%ecx # h=Sigma0(a) - and %r9d,%r15d # a&c - add %r12d,%r10d # d+=T1 - - and %r8d,%r14d # (a|c)&b - add %r12d,%ecx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ecx # h+=Maj(a,b,c) - mov 4*14(%rsi),%r12d - bswap %r12d - mov %r10d,%r13d - mov %r10d,%r14d - mov %r11d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %eax,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r10d,%r15d # (f^g)&e - mov %r12d,56(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ebx,%r12d # T1+=h - - mov %ecx,%ebx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ecx,%r13d - mov %ecx,%r14d - - ror $2,%ebx - ror $13,%r13d - mov %ecx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ebx - ror $9,%r13d - or %r8d,%r14d # a|c - - xor %r13d,%ebx # h=Sigma0(a) - and %r8d,%r15d # a&c - add %r12d,%r9d # d+=T1 - - and %edx,%r14d # (a|c)&b - add %r12d,%ebx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ebx # h+=Maj(a,b,c) - mov 4*15(%rsi),%r12d - bswap %r12d - mov %r9d,%r13d - mov %r9d,%r14d - mov %r10d,%r15d - - ror 
$6,%r13d - ror $11,%r14d - xor %r11d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r9d,%r15d # (f^g)&e - mov %r12d,60(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %eax,%r12d # T1+=h - - mov %ebx,%eax - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ebx,%r13d - mov %ebx,%r14d - - ror $2,%eax - ror $13,%r13d - mov %ebx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%eax - ror $9,%r13d - or %edx,%r14d # a|c - - xor %r13d,%eax # h=Sigma0(a) - and %edx,%r15d # a&c - add %r12d,%r8d # d+=T1 - - and %ecx,%r14d # (a|c)&b - add %r12d,%eax # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%eax # h+=Maj(a,b,c) - jmp .Lrounds_16_xx -.align 4, 0x90 -.Lrounds_16_xx: - mov 4(%rsp),%r13d - mov 56(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 36(%rsp),%r12d - - add 0(%rsp),%r12d - mov %r8d,%r13d - mov %r8d,%r14d - mov %r9d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r10d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r8d,%r15d # (f^g)&e - mov %r12d,0(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r11d,%r12d # T1+=h - - mov %eax,%r11d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %eax,%r13d - mov %eax,%r14d - - ror $2,%r11d - ror $13,%r13d - mov %eax,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r11d - ror $9,%r13d - or %ecx,%r14d # a|c - - xor %r13d,%r11d # h=Sigma0(a) - and %ecx,%r15d # a&c - add %r12d,%edx # d+=T1 - - and %ebx,%r14d # (a|c)&b - add %r12d,%r11d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r11d # h+=Maj(a,b,c) - mov 
8(%rsp),%r13d - mov 60(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 40(%rsp),%r12d - - add 4(%rsp),%r12d - mov %edx,%r13d - mov %edx,%r14d - mov %r8d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r9d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %edx,%r15d # (f^g)&e - mov %r12d,4(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r10d,%r12d # T1+=h - - mov %r11d,%r10d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r11d,%r13d - mov %r11d,%r14d - - ror $2,%r10d - ror $13,%r13d - mov %r11d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r10d - ror $9,%r13d - or %ebx,%r14d # a|c - - xor %r13d,%r10d # h=Sigma0(a) - and %ebx,%r15d # a&c - add %r12d,%ecx # d+=T1 - - and %eax,%r14d # (a|c)&b - add %r12d,%r10d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r10d # h+=Maj(a,b,c) - mov 12(%rsp),%r13d - mov 0(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 44(%rsp),%r12d - - add 8(%rsp),%r12d - mov %ecx,%r13d - mov %ecx,%r14d - mov %edx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r8d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ecx,%r15d # (f^g)&e - mov %r12d,8(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r9d,%r12d # T1+=h - - mov %r10d,%r9d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r10d,%r13d - mov %r10d,%r14d - - ror $2,%r9d - ror $13,%r13d - mov 
%r10d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r9d - ror $9,%r13d - or %eax,%r14d # a|c - - xor %r13d,%r9d # h=Sigma0(a) - and %eax,%r15d # a&c - add %r12d,%ebx # d+=T1 - - and %r11d,%r14d # (a|c)&b - add %r12d,%r9d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r9d # h+=Maj(a,b,c) - mov 16(%rsp),%r13d - mov 4(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 48(%rsp),%r12d - - add 12(%rsp),%r12d - mov %ebx,%r13d - mov %ebx,%r14d - mov %ecx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %edx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ebx,%r15d # (f^g)&e - mov %r12d,12(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r8d,%r12d # T1+=h - - mov %r9d,%r8d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r9d,%r13d - mov %r9d,%r14d - - ror $2,%r8d - ror $13,%r13d - mov %r9d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r8d - ror $9,%r13d - or %r11d,%r14d # a|c - - xor %r13d,%r8d # h=Sigma0(a) - and %r11d,%r15d # a&c - add %r12d,%eax # d+=T1 - - and %r10d,%r14d # (a|c)&b - add %r12d,%r8d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r8d # h+=Maj(a,b,c) - mov 20(%rsp),%r13d - mov 8(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 52(%rsp),%r12d - - add 16(%rsp),%r12d - mov %eax,%r13d - mov %eax,%r14d - mov %ebx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor 
%ecx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %eax,%r15d # (f^g)&e - mov %r12d,16(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %edx,%r12d # T1+=h - - mov %r8d,%edx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r8d,%r13d - mov %r8d,%r14d - - ror $2,%edx - ror $13,%r13d - mov %r8d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%edx - ror $9,%r13d - or %r10d,%r14d # a|c - - xor %r13d,%edx # h=Sigma0(a) - and %r10d,%r15d # a&c - add %r12d,%r11d # d+=T1 - - and %r9d,%r14d # (a|c)&b - add %r12d,%edx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%edx # h+=Maj(a,b,c) - mov 24(%rsp),%r13d - mov 12(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 56(%rsp),%r12d - - add 20(%rsp),%r12d - mov %r11d,%r13d - mov %r11d,%r14d - mov %eax,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ebx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r11d,%r15d # (f^g)&e - mov %r12d,20(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ecx,%r12d # T1+=h - - mov %edx,%ecx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %edx,%r13d - mov %edx,%r14d - - ror $2,%ecx - ror $13,%r13d - mov %edx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ecx - ror $9,%r13d - or %r9d,%r14d # a|c - - xor %r13d,%ecx # h=Sigma0(a) - and %r9d,%r15d # a&c - add %r12d,%r10d # d+=T1 - - and %r8d,%r14d # (a|c)&b - add %r12d,%ecx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ecx # h+=Maj(a,b,c) - mov 28(%rsp),%r13d - mov 16(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor 
%r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 60(%rsp),%r12d - - add 24(%rsp),%r12d - mov %r10d,%r13d - mov %r10d,%r14d - mov %r11d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %eax,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r10d,%r15d # (f^g)&e - mov %r12d,24(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ebx,%r12d # T1+=h - - mov %ecx,%ebx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ecx,%r13d - mov %ecx,%r14d - - ror $2,%ebx - ror $13,%r13d - mov %ecx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ebx - ror $9,%r13d - or %r8d,%r14d # a|c - - xor %r13d,%ebx # h=Sigma0(a) - and %r8d,%r15d # a&c - add %r12d,%r9d # d+=T1 - - and %edx,%r14d # (a|c)&b - add %r12d,%ebx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ebx # h+=Maj(a,b,c) - mov 32(%rsp),%r13d - mov 20(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 0(%rsp),%r12d - - add 28(%rsp),%r12d - mov %r9d,%r13d - mov %r9d,%r14d - mov %r10d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r11d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r9d,%r15d # (f^g)&e - mov %r12d,28(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %eax,%r12d # T1+=h - - mov %ebx,%eax - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ebx,%r13d - mov %ebx,%r14d - - ror $2,%eax - ror $13,%r13d - mov %ebx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%eax - ror $9,%r13d - or %edx,%r14d 
# a|c - - xor %r13d,%eax # h=Sigma0(a) - and %edx,%r15d # a&c - add %r12d,%r8d # d+=T1 - - and %ecx,%r14d # (a|c)&b - add %r12d,%eax # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%eax # h+=Maj(a,b,c) - mov 36(%rsp),%r13d - mov 24(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 4(%rsp),%r12d - - add 32(%rsp),%r12d - mov %r8d,%r13d - mov %r8d,%r14d - mov %r9d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r10d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r8d,%r15d # (f^g)&e - mov %r12d,32(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r11d,%r12d # T1+=h - - mov %eax,%r11d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %eax,%r13d - mov %eax,%r14d - - ror $2,%r11d - ror $13,%r13d - mov %eax,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r11d - ror $9,%r13d - or %ecx,%r14d # a|c - - xor %r13d,%r11d # h=Sigma0(a) - and %ecx,%r15d # a&c - add %r12d,%edx # d+=T1 - - and %ebx,%r14d # (a|c)&b - add %r12d,%r11d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r11d # h+=Maj(a,b,c) - mov 40(%rsp),%r13d - mov 28(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 8(%rsp),%r12d - - add 36(%rsp),%r12d - mov %edx,%r13d - mov %edx,%r14d - mov %r8d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r9d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %edx,%r15d # (f^g)&e - mov %r12d,36(%rsp) - - 
xor %r14d,%r13d # Sigma1(e) - xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r10d,%r12d # T1+=h - - mov %r11d,%r10d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r11d,%r13d - mov %r11d,%r14d - - ror $2,%r10d - ror $13,%r13d - mov %r11d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r10d - ror $9,%r13d - or %ebx,%r14d # a|c - - xor %r13d,%r10d # h=Sigma0(a) - and %ebx,%r15d # a&c - add %r12d,%ecx # d+=T1 - - and %eax,%r14d # (a|c)&b - add %r12d,%r10d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r10d # h+=Maj(a,b,c) - mov 44(%rsp),%r13d - mov 32(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 12(%rsp),%r12d - - add 40(%rsp),%r12d - mov %ecx,%r13d - mov %ecx,%r14d - mov %edx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r8d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ecx,%r15d # (f^g)&e - mov %r12d,40(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r9d,%r12d # T1+=h - - mov %r10d,%r9d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r10d,%r13d - mov %r10d,%r14d - - ror $2,%r9d - ror $13,%r13d - mov %r10d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r9d - ror $9,%r13d - or %eax,%r14d # a|c - - xor %r13d,%r9d # h=Sigma0(a) - and %eax,%r15d # a&c - add %r12d,%ebx # d+=T1 - - and %r11d,%r14d # (a|c)&b - add %r12d,%r9d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r9d # h+=Maj(a,b,c) - mov 48(%rsp),%r13d - mov 36(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr 
$10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 16(%rsp),%r12d - - add 44(%rsp),%r12d - mov %ebx,%r13d - mov %ebx,%r14d - mov %ecx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %edx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %ebx,%r15d # (f^g)&e - mov %r12d,44(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %r8d,%r12d # T1+=h - - mov %r9d,%r8d - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r9d,%r13d - mov %r9d,%r14d - - ror $2,%r8d - ror $13,%r13d - mov %r9d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%r8d - ror $9,%r13d - or %r11d,%r14d # a|c - - xor %r13d,%r8d # h=Sigma0(a) - and %r11d,%r15d # a&c - add %r12d,%eax # d+=T1 - - and %r10d,%r14d # (a|c)&b - add %r12d,%r8d # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%r8d # h+=Maj(a,b,c) - mov 52(%rsp),%r13d - mov 40(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 20(%rsp),%r12d - - add 48(%rsp),%r12d - mov %eax,%r13d - mov %eax,%r14d - mov %ebx,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ecx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %eax,%r15d # (f^g)&e - mov %r12d,48(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %edx,%r12d # T1+=h - - mov %r8d,%edx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %r8d,%r13d - mov %r8d,%r14d - - ror $2,%edx - ror $13,%r13d - mov %r8d,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%edx - ror $9,%r13d - or %r10d,%r14d # a|c - - xor %r13d,%edx # h=Sigma0(a) - and %r10d,%r15d # a&c - add %r12d,%r11d # d+=T1 - - and 
%r9d,%r14d # (a|c)&b - add %r12d,%edx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%edx # h+=Maj(a,b,c) - mov 56(%rsp),%r13d - mov 44(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 24(%rsp),%r12d - - add 52(%rsp),%r12d - mov %r11d,%r13d - mov %r11d,%r14d - mov %eax,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %ebx,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r11d,%r15d # (f^g)&e - mov %r12d,52(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ecx,%r12d # T1+=h - - mov %edx,%ecx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %edx,%r13d - mov %edx,%r14d - - ror $2,%ecx - ror $13,%r13d - mov %edx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ecx - ror $9,%r13d - or %r9d,%r14d # a|c - - xor %r13d,%ecx # h=Sigma0(a) - and %r9d,%r15d # a&c - add %r12d,%r10d # d+=T1 - - and %r8d,%r14d # (a|c)&b - add %r12d,%ecx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ecx # h+=Maj(a,b,c) - mov 60(%rsp),%r13d - mov 48(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 28(%rsp),%r12d - - add 56(%rsp),%r12d - mov %r10d,%r13d - mov %r10d,%r14d - mov %r11d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %eax,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r10d,%r15d # (f^g)&e - mov %r12d,56(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %ebx,%r12d # T1+=h 
- - mov %ecx,%ebx - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ecx,%r13d - mov %ecx,%r14d - - ror $2,%ebx - ror $13,%r13d - mov %ecx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%ebx - ror $9,%r13d - or %r8d,%r14d # a|c - - xor %r13d,%ebx # h=Sigma0(a) - and %r8d,%r15d # a&c - add %r12d,%r9d # d+=T1 - - and %edx,%r14d # (a|c)&b - add %r12d,%ebx # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%ebx # h+=Maj(a,b,c) - mov 0(%rsp),%r13d - mov 52(%rsp),%r12d - - mov %r13d,%r15d - - shr $3,%r13d - ror $7,%r15d - - xor %r15d,%r13d - ror $11,%r15d - - xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) - mov %r12d,%r14d - - shr $10,%r12d - ror $17,%r14d - - xor %r14d,%r12d - ror $2,%r14d - - xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) - - add %r13d,%r12d - - add 32(%rsp),%r12d - - add 60(%rsp),%r12d - mov %r9d,%r13d - mov %r9d,%r14d - mov %r10d,%r15d - - ror $6,%r13d - ror $11,%r14d - xor %r11d,%r15d # f^g - - xor %r14d,%r13d - ror $14,%r14d - and %r9d,%r15d # (f^g)&e - mov %r12d,60(%rsp) - - xor %r14d,%r13d # Sigma1(e) - xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g - add %eax,%r12d # T1+=h - - mov %ebx,%eax - add %r13d,%r12d # T1+=Sigma1(e) - - add %r15d,%r12d # T1+=Ch(e,f,g) - mov %ebx,%r13d - mov %ebx,%r14d - - ror $2,%eax - ror $13,%r13d - mov %ebx,%r15d - add (%rbp,%rdi,4),%r12d # T1+=K[round] - - xor %r13d,%eax - ror $9,%r13d - or %edx,%r14d # a|c - - xor %r13d,%eax # h=Sigma0(a) - and %edx,%r15d # a&c - add %r12d,%r8d # d+=T1 - - and %ecx,%r14d # (a|c)&b - add %r12d,%eax # h+=T1 - - or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14d,%eax # h+=Maj(a,b,c) - cmp $64,%rdi - jb .Lrounds_16_xx - - mov 16*4+0*8(%rsp),%rdi - lea 16*4(%rsi),%rsi - - add 4*0(%rdi),%eax - add 4*1(%rdi),%ebx - add 4*2(%rdi),%ecx - add 4*3(%rdi),%edx - add 4*4(%rdi),%r8d - add 4*5(%rdi),%r9d - add 4*6(%rdi),%r10d - add 4*7(%rdi),%r11d - - cmp 16*4+2*8(%rsp),%rsi - - mov %eax,4*0(%rdi) 
- mov %ebx,4*1(%rdi) - mov %ecx,4*2(%rdi) - mov %edx,4*3(%rdi) - mov %r8d,4*4(%rdi) - mov %r9d,4*5(%rdi) - mov %r10d,4*6(%rdi) - mov %r11d,4*7(%rdi) - jb .Lloop - - mov 16*4+3*8(%rsp),%rsp - pop %rsi # win - pop %rdi # win - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - - ret -SET_SIZE(SHA256TransformBlocks) - -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/module/icp/asm-x86_64/os/windows/sha2/sha512_impl.S b/module/icp/asm-x86_64/os/windows/sha2/sha512_impl.S deleted file mode 100644 index 437dac785d02..000000000000 --- a/module/icp/asm-x86_64/os/windows/sha2/sha512_impl.S +++ /dev/null @@ -1,2104 +0,0 @@ -/* - * ==================================================================== - * Written by Andy Polyakov for the OpenSSL - * project. Rights for redistribution and usage in source and binary - * forms are granted according to the OpenSSL license. - * ==================================================================== - * - * sha256/512_block procedure for x86_64. - * - * 40% improvement over compiler-generated code on Opteron. On EM64T - * sha256 was observed to run >80% faster and sha512 - >40%. 
No magical - * tricks, just straight implementation... I really wonder why gcc - * [being armed with inline assembler] fails to generate as fast code. - * The only thing which is cool about this module is that it's very - * same instruction sequence used for both SHA-256 and SHA-512. In - * former case the instructions operate on 32-bit operands, while in - * latter - on 64-bit ones. All I had to do is to get one flavor right, - * the other one passed the test right away:-) - * - * sha256_block runs in ~1005 cycles on Opteron, which gives you - * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock - * frequency in GHz. sha512_block runs in ~1275 cycles, which results - * in 128*1000/1275=100MBps per GHz. Is there room for improvement? - * Well, if you compare it to IA-64 implementation, which maintains - * X[16] in register bank[!], tends to 4 instructions per CPU clock - * cycle and runs in 1003 cycles, 1275 is very good result for 3-way - * issue Opteron pipeline and X[16] maintained in memory. So that *if* - * there is a way to improve it, *then* the only way would be to try to - * offload X[16] updates to SSE unit, but that would require "deeper" - * loop unroll, which in turn would naturally cause size blow-up, not - * to mention increased complexity! And once again, only *if* it's - * actually possible to noticeably improve overall ILP, instruction - * level parallelism, on a given CPU implementation in this case. - * - * Special note on Intel EM64T. While Opteron CPU exhibits perfect - * perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], - * [currently available] EM64T CPUs apparently are far from it. On the - * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit - * sha256_block:-( This is presumably because 64-bit shifts/rotates - * apparently are not atomic instructions, but implemented in microcode. - */ - -/* - * OpenSolaris OS modifications - * - * Sun elects to use this software under the BSD license. 
- * - * This source originates from OpenSSL file sha512-x86_64.pl at - * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz - * (presumably for future OpenSSL release 0.9.8h), with these changes: - * - * 1. Added perl "use strict" and declared variables. - * - * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from - * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. - * - * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) - * assemblers). Replaced the .picmeup macro with assembler code. - * - * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", - * at the beginning of SHA2_CTX (the next field is 8-byte aligned). - */ - -/* - * This file was generated by a perl script (sha512-x86_64.pl) that were - * used to generate sha256 and sha512 variants from the same code base. - * The comments from the original file have been pasted above. - */ - - -#if 0 -/* ARGSUSED */ -void -SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) -{ -} -#endif - -#define _ASM -#include - -// Windows x64: -// Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3) -// Return: rax (float: xmm0) -// Volatile: rax, rcx, rdx, r8-r11 -// Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 - -// Unix x64: -// Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7) -// Return: rax (float: xmm0) -// Volatile: -// Nonvolatile: rbx, rbp, rsp, r12-r15 - -// outcome: -// rdi -> rcx -// save rdi, rsi. 
- -ENTRY_NP(SHA512TransformBlocks) - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - - push %rdi # win - push %rsi # win - mov %rcx, %rdi # win fix arg1 - mov %rdx, %rsi # win fix arg2 - mov %r8, %rdx # win fix arg3 - - mov %rsp,%rbp # copy %rsp - shl $4,%rdx # num*16 - sub $16*8+4*8,%rsp - lea (%rsi,%rdx,8),%rdx # inp+num*16*8 - and $-64,%rsp # align stack frame - add $8,%rdi # Skip OpenSolaris field, "algotype" - mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg - mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg - mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg - mov %rbp,16*8+3*8(%rsp) # save copy of %rsp - - //.picmeup %rbp - // The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts - // the address of the "next" instruction into the target register - // (%rbp). This generates these 2 instructions: - lea .Llea(%rip),%rbp - //nop // .picmeup generates a nop for mod 8 alignment--not needed here - -.Llea: - lea K512-.(%rbp),%rbp - - mov 8*0(%rdi),%rax - mov 8*1(%rdi),%rbx - mov 8*2(%rdi),%rcx - mov 8*3(%rdi),%rdx - mov 8*4(%rdi),%r8 - mov 8*5(%rdi),%r9 - mov 8*6(%rdi),%r10 - mov 8*7(%rdi),%r11 - jmp .Lloop - -.align 16 -.Lloop: - xor %rdi,%rdi - mov 8*0(%rsi),%r12 - bswap %r12 - mov %r8,%r13 - mov %r8,%r14 - mov %r9,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r10,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r8,%r15 # (f^g)&e - mov %r12,0(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r11,%r12 # T1+=h - - mov %rax,%r11 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rax,%r13 - mov %rax,%r14 - - ror $28,%r11 - ror $34,%r13 - mov %rax,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r11 - ror $5,%r13 - or %rcx,%r14 # a|c - - xor %r13,%r11 # h=Sigma0(a) - and %rcx,%r15 # a&c - add %r12,%rdx # d+=T1 - - and %rbx,%r14 # (a|c)&b - add %r12,%r11 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r11 # h+=Maj(a,b,c) - mov 
8*1(%rsi),%r12 - bswap %r12 - mov %rdx,%r13 - mov %rdx,%r14 - mov %r8,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r9,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rdx,%r15 # (f^g)&e - mov %r12,8(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r10,%r12 # T1+=h - - mov %r11,%r10 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r11,%r13 - mov %r11,%r14 - - ror $28,%r10 - ror $34,%r13 - mov %r11,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r10 - ror $5,%r13 - or %rbx,%r14 # a|c - - xor %r13,%r10 # h=Sigma0(a) - and %rbx,%r15 # a&c - add %r12,%rcx # d+=T1 - - and %rax,%r14 # (a|c)&b - add %r12,%r10 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r10 # h+=Maj(a,b,c) - mov 8*2(%rsi),%r12 - bswap %r12 - mov %rcx,%r13 - mov %rcx,%r14 - mov %rdx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r8,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rcx,%r15 # (f^g)&e - mov %r12,16(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r9,%r12 # T1+=h - - mov %r10,%r9 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r10,%r13 - mov %r10,%r14 - - ror $28,%r9 - ror $34,%r13 - mov %r10,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r9 - ror $5,%r13 - or %rax,%r14 # a|c - - xor %r13,%r9 # h=Sigma0(a) - and %rax,%r15 # a&c - add %r12,%rbx # d+=T1 - - and %r11,%r14 # (a|c)&b - add %r12,%r9 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r9 # h+=Maj(a,b,c) - mov 8*3(%rsi),%r12 - bswap %r12 - mov %rbx,%r13 - mov %rbx,%r14 - mov %rcx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rdx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rbx,%r15 # (f^g)&e - mov %r12,24(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r8,%r12 # T1+=h - - mov %r9,%r8 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r9,%r13 - 
mov %r9,%r14 - - ror $28,%r8 - ror $34,%r13 - mov %r9,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r8 - ror $5,%r13 - or %r11,%r14 # a|c - - xor %r13,%r8 # h=Sigma0(a) - and %r11,%r15 # a&c - add %r12,%rax # d+=T1 - - and %r10,%r14 # (a|c)&b - add %r12,%r8 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r8 # h+=Maj(a,b,c) - mov 8*4(%rsi),%r12 - bswap %r12 - mov %rax,%r13 - mov %rax,%r14 - mov %rbx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rcx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rax,%r15 # (f^g)&e - mov %r12,32(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rdx,%r12 # T1+=h - - mov %r8,%rdx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r8,%r13 - mov %r8,%r14 - - ror $28,%rdx - ror $34,%r13 - mov %r8,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rdx - ror $5,%r13 - or %r10,%r14 # a|c - - xor %r13,%rdx # h=Sigma0(a) - and %r10,%r15 # a&c - add %r12,%r11 # d+=T1 - - and %r9,%r14 # (a|c)&b - add %r12,%rdx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rdx # h+=Maj(a,b,c) - mov 8*5(%rsi),%r12 - bswap %r12 - mov %r11,%r13 - mov %r11,%r14 - mov %rax,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rbx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r11,%r15 # (f^g)&e - mov %r12,40(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rcx,%r12 # T1+=h - - mov %rdx,%rcx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rdx,%r13 - mov %rdx,%r14 - - ror $28,%rcx - ror $34,%r13 - mov %rdx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rcx - ror $5,%r13 - or %r9,%r14 # a|c - - xor %r13,%rcx # h=Sigma0(a) - and %r9,%r15 # a&c - add %r12,%r10 # d+=T1 - - and %r8,%r14 # (a|c)&b - add %r12,%rcx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rcx # h+=Maj(a,b,c) - mov 
8*6(%rsi),%r12 - bswap %r12 - mov %r10,%r13 - mov %r10,%r14 - mov %r11,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rax,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r10,%r15 # (f^g)&e - mov %r12,48(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rbx,%r12 # T1+=h - - mov %rcx,%rbx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rcx,%r13 - mov %rcx,%r14 - - ror $28,%rbx - ror $34,%r13 - mov %rcx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rbx - ror $5,%r13 - or %r8,%r14 # a|c - - xor %r13,%rbx # h=Sigma0(a) - and %r8,%r15 # a&c - add %r12,%r9 # d+=T1 - - and %rdx,%r14 # (a|c)&b - add %r12,%rbx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rbx # h+=Maj(a,b,c) - mov 8*7(%rsi),%r12 - bswap %r12 - mov %r9,%r13 - mov %r9,%r14 - mov %r10,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r11,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r9,%r15 # (f^g)&e - mov %r12,56(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rax,%r12 # T1+=h - - mov %rbx,%rax - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rbx,%r13 - mov %rbx,%r14 - - ror $28,%rax - ror $34,%r13 - mov %rbx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rax - ror $5,%r13 - or %rdx,%r14 # a|c - - xor %r13,%rax # h=Sigma0(a) - and %rdx,%r15 # a&c - add %r12,%r8 # d+=T1 - - and %rcx,%r14 # (a|c)&b - add %r12,%rax # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rax # h+=Maj(a,b,c) - mov 8*8(%rsi),%r12 - bswap %r12 - mov %r8,%r13 - mov %r8,%r14 - mov %r9,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r10,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r8,%r15 # (f^g)&e - mov %r12,64(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r11,%r12 # T1+=h - - mov %rax,%r11 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov 
%rax,%r13 - mov %rax,%r14 - - ror $28,%r11 - ror $34,%r13 - mov %rax,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r11 - ror $5,%r13 - or %rcx,%r14 # a|c - - xor %r13,%r11 # h=Sigma0(a) - and %rcx,%r15 # a&c - add %r12,%rdx # d+=T1 - - and %rbx,%r14 # (a|c)&b - add %r12,%r11 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r11 # h+=Maj(a,b,c) - mov 8*9(%rsi),%r12 - bswap %r12 - mov %rdx,%r13 - mov %rdx,%r14 - mov %r8,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r9,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rdx,%r15 # (f^g)&e - mov %r12,72(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r10,%r12 # T1+=h - - mov %r11,%r10 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r11,%r13 - mov %r11,%r14 - - ror $28,%r10 - ror $34,%r13 - mov %r11,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r10 - ror $5,%r13 - or %rbx,%r14 # a|c - - xor %r13,%r10 # h=Sigma0(a) - and %rbx,%r15 # a&c - add %r12,%rcx # d+=T1 - - and %rax,%r14 # (a|c)&b - add %r12,%r10 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r10 # h+=Maj(a,b,c) - mov 8*10(%rsi),%r12 - bswap %r12 - mov %rcx,%r13 - mov %rcx,%r14 - mov %rdx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r8,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rcx,%r15 # (f^g)&e - mov %r12,80(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r9,%r12 # T1+=h - - mov %r10,%r9 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r10,%r13 - mov %r10,%r14 - - ror $28,%r9 - ror $34,%r13 - mov %r10,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r9 - ror $5,%r13 - or %rax,%r14 # a|c - - xor %r13,%r9 # h=Sigma0(a) - and %rax,%r15 # a&c - add %r12,%rbx # d+=T1 - - and %r11,%r14 # (a|c)&b - add %r12,%r9 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r9 # 
h+=Maj(a,b,c) - mov 8*11(%rsi),%r12 - bswap %r12 - mov %rbx,%r13 - mov %rbx,%r14 - mov %rcx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rdx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rbx,%r15 # (f^g)&e - mov %r12,88(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r8,%r12 # T1+=h - - mov %r9,%r8 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r9,%r13 - mov %r9,%r14 - - ror $28,%r8 - ror $34,%r13 - mov %r9,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r8 - ror $5,%r13 - or %r11,%r14 # a|c - - xor %r13,%r8 # h=Sigma0(a) - and %r11,%r15 # a&c - add %r12,%rax # d+=T1 - - and %r10,%r14 # (a|c)&b - add %r12,%r8 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r8 # h+=Maj(a,b,c) - mov 8*12(%rsi),%r12 - bswap %r12 - mov %rax,%r13 - mov %rax,%r14 - mov %rbx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rcx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rax,%r15 # (f^g)&e - mov %r12,96(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rdx,%r12 # T1+=h - - mov %r8,%rdx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r8,%r13 - mov %r8,%r14 - - ror $28,%rdx - ror $34,%r13 - mov %r8,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rdx - ror $5,%r13 - or %r10,%r14 # a|c - - xor %r13,%rdx # h=Sigma0(a) - and %r10,%r15 # a&c - add %r12,%r11 # d+=T1 - - and %r9,%r14 # (a|c)&b - add %r12,%rdx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rdx # h+=Maj(a,b,c) - mov 8*13(%rsi),%r12 - bswap %r12 - mov %r11,%r13 - mov %r11,%r14 - mov %rax,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rbx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r11,%r15 # (f^g)&e - mov %r12,104(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rcx,%r12 # T1+=h - - mov %rdx,%rcx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # 
T1+=Ch(e,f,g) - mov %rdx,%r13 - mov %rdx,%r14 - - ror $28,%rcx - ror $34,%r13 - mov %rdx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rcx - ror $5,%r13 - or %r9,%r14 # a|c - - xor %r13,%rcx # h=Sigma0(a) - and %r9,%r15 # a&c - add %r12,%r10 # d+=T1 - - and %r8,%r14 # (a|c)&b - add %r12,%rcx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rcx # h+=Maj(a,b,c) - mov 8*14(%rsi),%r12 - bswap %r12 - mov %r10,%r13 - mov %r10,%r14 - mov %r11,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rax,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r10,%r15 # (f^g)&e - mov %r12,112(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rbx,%r12 # T1+=h - - mov %rcx,%rbx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rcx,%r13 - mov %rcx,%r14 - - ror $28,%rbx - ror $34,%r13 - mov %rcx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rbx - ror $5,%r13 - or %r8,%r14 # a|c - - xor %r13,%rbx # h=Sigma0(a) - and %r8,%r15 # a&c - add %r12,%r9 # d+=T1 - - and %rdx,%r14 # (a|c)&b - add %r12,%rbx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rbx # h+=Maj(a,b,c) - mov 8*15(%rsi),%r12 - bswap %r12 - mov %r9,%r13 - mov %r9,%r14 - mov %r10,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r11,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r9,%r15 # (f^g)&e - mov %r12,120(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rax,%r12 # T1+=h - - mov %rbx,%rax - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rbx,%r13 - mov %rbx,%r14 - - ror $28,%rax - ror $34,%r13 - mov %rbx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rax - ror $5,%r13 - or %rdx,%r14 # a|c - - xor %r13,%rax # h=Sigma0(a) - and %rdx,%r15 # a&c - add %r12,%r8 # d+=T1 - - and %rcx,%r14 # (a|c)&b - add %r12,%rax # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - 
add %r14,%rax # h+=Maj(a,b,c) - jmp .Lrounds_16_xx -.align 16 -.Lrounds_16_xx: - mov 8(%rsp),%r13 - mov 112(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 72(%rsp),%r12 - - add 0(%rsp),%r12 - mov %r8,%r13 - mov %r8,%r14 - mov %r9,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r10,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r8,%r15 # (f^g)&e - mov %r12,0(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r11,%r12 # T1+=h - - mov %rax,%r11 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rax,%r13 - mov %rax,%r14 - - ror $28,%r11 - ror $34,%r13 - mov %rax,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r11 - ror $5,%r13 - or %rcx,%r14 # a|c - - xor %r13,%r11 # h=Sigma0(a) - and %rcx,%r15 # a&c - add %r12,%rdx # d+=T1 - - and %rbx,%r14 # (a|c)&b - add %r12,%r11 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r11 # h+=Maj(a,b,c) - mov 16(%rsp),%r13 - mov 120(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 80(%rsp),%r12 - - add 8(%rsp),%r12 - mov %rdx,%r13 - mov %rdx,%r14 - mov %r8,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r9,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rdx,%r15 # (f^g)&e - mov %r12,8(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r10,%r12 # T1+=h - - mov %r11,%r10 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r11,%r13 - mov %r11,%r14 - - ror $28,%r10 - ror $34,%r13 - mov %r11,%r15 - add (%rbp,%rdi,8),%r12 # 
T1+=K[round] - - xor %r13,%r10 - ror $5,%r13 - or %rbx,%r14 # a|c - - xor %r13,%r10 # h=Sigma0(a) - and %rbx,%r15 # a&c - add %r12,%rcx # d+=T1 - - and %rax,%r14 # (a|c)&b - add %r12,%r10 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r10 # h+=Maj(a,b,c) - mov 24(%rsp),%r13 - mov 0(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 88(%rsp),%r12 - - add 16(%rsp),%r12 - mov %rcx,%r13 - mov %rcx,%r14 - mov %rdx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r8,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rcx,%r15 # (f^g)&e - mov %r12,16(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r9,%r12 # T1+=h - - mov %r10,%r9 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r10,%r13 - mov %r10,%r14 - - ror $28,%r9 - ror $34,%r13 - mov %r10,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r9 - ror $5,%r13 - or %rax,%r14 # a|c - - xor %r13,%r9 # h=Sigma0(a) - and %rax,%r15 # a&c - add %r12,%rbx # d+=T1 - - and %r11,%r14 # (a|c)&b - add %r12,%r9 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r9 # h+=Maj(a,b,c) - mov 32(%rsp),%r13 - mov 8(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 96(%rsp),%r12 - - add 24(%rsp),%r12 - mov %rbx,%r13 - mov %rbx,%r14 - mov %rcx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rdx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rbx,%r15 # (f^g)&e - mov %r12,24(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rdx,%r15 # 
Ch(e,f,g)=((f^g)&e)^g - add %r8,%r12 # T1+=h - - mov %r9,%r8 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r9,%r13 - mov %r9,%r14 - - ror $28,%r8 - ror $34,%r13 - mov %r9,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r8 - ror $5,%r13 - or %r11,%r14 # a|c - - xor %r13,%r8 # h=Sigma0(a) - and %r11,%r15 # a&c - add %r12,%rax # d+=T1 - - and %r10,%r14 # (a|c)&b - add %r12,%r8 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r8 # h+=Maj(a,b,c) - mov 40(%rsp),%r13 - mov 16(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 104(%rsp),%r12 - - add 32(%rsp),%r12 - mov %rax,%r13 - mov %rax,%r14 - mov %rbx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rcx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rax,%r15 # (f^g)&e - mov %r12,32(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rdx,%r12 # T1+=h - - mov %r8,%rdx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r8,%r13 - mov %r8,%r14 - - ror $28,%rdx - ror $34,%r13 - mov %r8,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rdx - ror $5,%r13 - or %r10,%r14 # a|c - - xor %r13,%rdx # h=Sigma0(a) - and %r10,%r15 # a&c - add %r12,%r11 # d+=T1 - - and %r9,%r14 # (a|c)&b - add %r12,%rdx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rdx # h+=Maj(a,b,c) - mov 48(%rsp),%r13 - mov 24(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 112(%rsp),%r12 - - add 40(%rsp),%r12 - mov 
%r11,%r13 - mov %r11,%r14 - mov %rax,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rbx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r11,%r15 # (f^g)&e - mov %r12,40(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rcx,%r12 # T1+=h - - mov %rdx,%rcx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rdx,%r13 - mov %rdx,%r14 - - ror $28,%rcx - ror $34,%r13 - mov %rdx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rcx - ror $5,%r13 - or %r9,%r14 # a|c - - xor %r13,%rcx # h=Sigma0(a) - and %r9,%r15 # a&c - add %r12,%r10 # d+=T1 - - and %r8,%r14 # (a|c)&b - add %r12,%rcx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rcx # h+=Maj(a,b,c) - mov 56(%rsp),%r13 - mov 32(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 120(%rsp),%r12 - - add 48(%rsp),%r12 - mov %r10,%r13 - mov %r10,%r14 - mov %r11,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rax,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r10,%r15 # (f^g)&e - mov %r12,48(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rbx,%r12 # T1+=h - - mov %rcx,%rbx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rcx,%r13 - mov %rcx,%r14 - - ror $28,%rbx - ror $34,%r13 - mov %rcx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rbx - ror $5,%r13 - or %r8,%r14 # a|c - - xor %r13,%rbx # h=Sigma0(a) - and %r8,%r15 # a&c - add %r12,%r9 # d+=T1 - - and %rdx,%r14 # (a|c)&b - add %r12,%rbx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rbx # h+=Maj(a,b,c) - mov 64(%rsp),%r13 - mov 40(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - 
xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 0(%rsp),%r12 - - add 56(%rsp),%r12 - mov %r9,%r13 - mov %r9,%r14 - mov %r10,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r11,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r9,%r15 # (f^g)&e - mov %r12,56(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rax,%r12 # T1+=h - - mov %rbx,%rax - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rbx,%r13 - mov %rbx,%r14 - - ror $28,%rax - ror $34,%r13 - mov %rbx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rax - ror $5,%r13 - or %rdx,%r14 # a|c - - xor %r13,%rax # h=Sigma0(a) - and %rdx,%r15 # a&c - add %r12,%r8 # d+=T1 - - and %rcx,%r14 # (a|c)&b - add %r12,%rax # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rax # h+=Maj(a,b,c) - mov 72(%rsp),%r13 - mov 48(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 8(%rsp),%r12 - - add 64(%rsp),%r12 - mov %r8,%r13 - mov %r8,%r14 - mov %r9,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r10,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r8,%r15 # (f^g)&e - mov %r12,64(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r11,%r12 # T1+=h - - mov %rax,%r11 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rax,%r13 - mov %rax,%r14 - - ror $28,%r11 - ror $34,%r13 - mov %rax,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r11 - ror $5,%r13 - or %rcx,%r14 # a|c - - xor %r13,%r11 # h=Sigma0(a) - and %rcx,%r15 # a&c - add %r12,%rdx # d+=T1 - - and %rbx,%r14 # (a|c)&b - add %r12,%r11 # h+=T1 - - or 
%r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r11 # h+=Maj(a,b,c) - mov 80(%rsp),%r13 - mov 56(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 16(%rsp),%r12 - - add 72(%rsp),%r12 - mov %rdx,%r13 - mov %rdx,%r14 - mov %r8,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r9,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rdx,%r15 # (f^g)&e - mov %r12,72(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r10,%r12 # T1+=h - - mov %r11,%r10 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r11,%r13 - mov %r11,%r14 - - ror $28,%r10 - ror $34,%r13 - mov %r11,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r10 - ror $5,%r13 - or %rbx,%r14 # a|c - - xor %r13,%r10 # h=Sigma0(a) - and %rbx,%r15 # a&c - add %r12,%rcx # d+=T1 - - and %rax,%r14 # (a|c)&b - add %r12,%r10 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r10 # h+=Maj(a,b,c) - mov 88(%rsp),%r13 - mov 64(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 24(%rsp),%r12 - - add 80(%rsp),%r12 - mov %rcx,%r13 - mov %rcx,%r14 - mov %rdx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r8,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rcx,%r15 # (f^g)&e - mov %r12,80(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r9,%r12 # T1+=h - - mov %r10,%r9 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r10,%r13 - mov %r10,%r14 - - ror $28,%r9 - ror $34,%r13 - mov %r10,%r15 - add 
(%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r9 - ror $5,%r13 - or %rax,%r14 # a|c - - xor %r13,%r9 # h=Sigma0(a) - and %rax,%r15 # a&c - add %r12,%rbx # d+=T1 - - and %r11,%r14 # (a|c)&b - add %r12,%r9 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r9 # h+=Maj(a,b,c) - mov 96(%rsp),%r13 - mov 72(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 32(%rsp),%r12 - - add 88(%rsp),%r12 - mov %rbx,%r13 - mov %rbx,%r14 - mov %rcx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rdx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rbx,%r15 # (f^g)&e - mov %r12,88(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %r8,%r12 # T1+=h - - mov %r9,%r8 - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r9,%r13 - mov %r9,%r14 - - ror $28,%r8 - ror $34,%r13 - mov %r9,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%r8 - ror $5,%r13 - or %r11,%r14 # a|c - - xor %r13,%r8 # h=Sigma0(a) - and %r11,%r15 # a&c - add %r12,%rax # d+=T1 - - and %r10,%r14 # (a|c)&b - add %r12,%r8 # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%r8 # h+=Maj(a,b,c) - mov 104(%rsp),%r13 - mov 80(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 40(%rsp),%r12 - - add 96(%rsp),%r12 - mov %rax,%r13 - mov %rax,%r14 - mov %rbx,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rcx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %rax,%r15 # (f^g)&e - mov %r12,96(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rcx,%r15 
# Ch(e,f,g)=((f^g)&e)^g - add %rdx,%r12 # T1+=h - - mov %r8,%rdx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %r8,%r13 - mov %r8,%r14 - - ror $28,%rdx - ror $34,%r13 - mov %r8,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rdx - ror $5,%r13 - or %r10,%r14 # a|c - - xor %r13,%rdx # h=Sigma0(a) - and %r10,%r15 # a&c - add %r12,%r11 # d+=T1 - - and %r9,%r14 # (a|c)&b - add %r12,%rdx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rdx # h+=Maj(a,b,c) - mov 112(%rsp),%r13 - mov 88(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 48(%rsp),%r12 - - add 104(%rsp),%r12 - mov %r11,%r13 - mov %r11,%r14 - mov %rax,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rbx,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r11,%r15 # (f^g)&e - mov %r12,104(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rcx,%r12 # T1+=h - - mov %rdx,%rcx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rdx,%r13 - mov %rdx,%r14 - - ror $28,%rcx - ror $34,%r13 - mov %rdx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rcx - ror $5,%r13 - or %r9,%r14 # a|c - - xor %r13,%rcx # h=Sigma0(a) - and %r9,%r15 # a&c - add %r12,%r10 # d+=T1 - - and %r8,%r14 # (a|c)&b - add %r12,%rcx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rcx # h+=Maj(a,b,c) - mov 120(%rsp),%r13 - mov 96(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 56(%rsp),%r12 - - add 
112(%rsp),%r12 - mov %r10,%r13 - mov %r10,%r14 - mov %r11,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %rax,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r10,%r15 # (f^g)&e - mov %r12,112(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rbx,%r12 # T1+=h - - mov %rcx,%rbx - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rcx,%r13 - mov %rcx,%r14 - - ror $28,%rbx - ror $34,%r13 - mov %rcx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rbx - ror $5,%r13 - or %r8,%r14 # a|c - - xor %r13,%rbx # h=Sigma0(a) - and %r8,%r15 # a&c - add %r12,%r9 # d+=T1 - - and %rdx,%r14 # (a|c)&b - add %r12,%rbx # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rbx # h+=Maj(a,b,c) - mov 0(%rsp),%r13 - mov 104(%rsp),%r12 - - mov %r13,%r15 - - shr $7,%r13 - ror $1,%r15 - - xor %r15,%r13 - ror $7,%r15 - - xor %r15,%r13 # sigma0(X[(i+1)&0xf]) - mov %r12,%r14 - - shr $6,%r12 - ror $19,%r14 - - xor %r14,%r12 - ror $42,%r14 - - xor %r14,%r12 # sigma1(X[(i+14)&0xf]) - - add %r13,%r12 - - add 64(%rsp),%r12 - - add 120(%rsp),%r12 - mov %r9,%r13 - mov %r9,%r14 - mov %r10,%r15 - - ror $14,%r13 - ror $18,%r14 - xor %r11,%r15 # f^g - - xor %r14,%r13 - ror $23,%r14 - and %r9,%r15 # (f^g)&e - mov %r12,120(%rsp) - - xor %r14,%r13 # Sigma1(e) - xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g - add %rax,%r12 # T1+=h - - mov %rbx,%rax - add %r13,%r12 # T1+=Sigma1(e) - - add %r15,%r12 # T1+=Ch(e,f,g) - mov %rbx,%r13 - mov %rbx,%r14 - - ror $28,%rax - ror $34,%r13 - mov %rbx,%r15 - add (%rbp,%rdi,8),%r12 # T1+=K[round] - - xor %r13,%rax - ror $5,%r13 - or %rdx,%r14 # a|c - - xor %r13,%rax # h=Sigma0(a) - and %rdx,%r15 # a&c - add %r12,%r8 # d+=T1 - - and %rcx,%r14 # (a|c)&b - add %r12,%rax # h+=T1 - - or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) - lea 1(%rdi),%rdi # round++ - - add %r14,%rax # h+=Maj(a,b,c) - cmp $80,%rdi - jb .Lrounds_16_xx - - mov 16*8+0*8(%rsp),%rdi - lea 16*8(%rsi),%rsi - - add 
8*0(%rdi),%rax - add 8*1(%rdi),%rbx - add 8*2(%rdi),%rcx - add 8*3(%rdi),%rdx - add 8*4(%rdi),%r8 - add 8*5(%rdi),%r9 - add 8*6(%rdi),%r10 - add 8*7(%rdi),%r11 - - cmp 16*8+2*8(%rsp),%rsi - - mov %rax,8*0(%rdi) - mov %rbx,8*1(%rdi) - mov %rcx,8*2(%rdi) - mov %rdx,8*3(%rdi) - mov %r8,8*4(%rdi) - mov %r9,8*5(%rdi) - mov %r10,8*6(%rdi) - mov %r11,8*7(%rdi) - jb .Lloop - - mov 16*8+3*8(%rsp),%rsp - - pop %rsi # win - pop %rdi # win - - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - - ret -SET_SIZE(SHA512TransformBlocks) - -.align 64 -K512: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 
0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/module/lua/setjmp/win_setjmp_x86_64.S b/module/lua/setjmp/win_setjmp_x86_64.S deleted file mode 100644 index 0eb202d2f2e9..000000000000 --- a/module/lua/setjmp/win_setjmp_x86_64.S +++ /dev/null @@ -1,95 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Jorgen Lundman - */ - - // Windows x64: - // Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3) - // Return: rax (float: xmm0) - // Volatile: rax, rcx, rdx, r8-r11 - // Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 - - // Unix x64: - // Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7) - // Return: rax (float: xmm0) - // Volatile: - // Nonvolatile: rbx, rbp, rsp, r12-r15 - - // outcome: - // rdi -> rcx - // save rdi, rsi. 
- - #define ENTRY(x) \ - .text; \ - .align 8; \ - .globl x; \ - .def x; .scl 2; .type 32; .endef ; \ -x: - -#define SET_SIZE(x) - -/* - * Setjmp and longjmp implement non-local gotos using state vectors - * type label_t. - */ -#ifdef __x86_64__ - - ENTRY(setjmp) - nop - movq %rsp, 0(%rcx) - movq %rbp, 8(%rcx) - movq %rbx, 16(%rcx) - movq %r12, 24(%rcx) - movq %r13, 32(%rcx) - movq %r14, 40(%rcx) - movq %r15, 48(%rcx) - movq %rdi, 56(%rcx) - movq %rsi, 64(%rcx) - movq 0(%rsp), %rdx /* return address */ - movq %rdx, 72(%rcx) /* rip */ - xorl %eax, %eax /* return 0 */ - ret - SET_SIZE(setjmp) - - ENTRY(longjmp) - movq 0(%rdi), %rsp - movq 8(%rdi), %rbp - movq 16(%rdi), %rbx - movq 24(%rdi), %r12 - movq 32(%rdi), %r13 - movq 40(%rdi), %r14 - movq 48(%rdi), %r15 - movq 56(%rdi), %rdi - movq 64(%rdi), %rsi - movq 72(%rdi), %rdx /* return address */ - movq %rdx, 0(%rsp) - xorl %eax, %eax - incl %eax /* return 1 */ - ret - SET_SIZE(longjmp) - -#ifdef __ELF__ -.section .note.GNU-stack,"",%progbits -#endif - -#endif /* __x86_64__ */ From 83d972bfbd0fd8aabe66ece2c8859842d55c58a8 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Sun, 27 Nov 2022 18:04:18 +0900 Subject: [PATCH 2/3] Linux assembler changes --- module/icp/algs/aes/aes_impl.c | 31 +++++++- module/icp/algs/aes/aes_impl_aesni.c | 9 ++- module/icp/algs/blake3/blake3_impl.c | 28 +++++-- module/icp/algs/blake3/blake3_impl.h | 1 + module/icp/algs/blake3/blake3_x86-64.c | 22 +++--- module/icp/algs/modes/gcm.c | 75 ++++++++++++++++--- module/icp/algs/modes/gcm_pclmulqdq.c | 3 +- module/icp/algs/sha2/sha2.c | 5 +- module/icp/asm-x86_64/aes/aes_amd64.S | 56 +++++++------- module/icp/asm-x86_64/blake3/blake3_avx2.S | 12 +-- module/icp/asm-x86_64/blake3/blake3_avx512.S | 27 ++----- module/icp/asm-x86_64/blake3/blake3_sse2.S | 29 ++----- module/icp/asm-x86_64/blake3/blake3_sse41.S | 28 ++----- .../icp/asm-x86_64/modes/aesni-gcm-x86_64.S | 63 ++++++---------- module/icp/asm-x86_64/modes/ghash-x86_64.S | 44 +++++------ 
module/icp/asm-x86_64/sha2/sha256_impl.S | 4 +- module/icp/asm-x86_64/sha2/sha512_impl.S | 4 +- module/icp/include/aes/aes_impl.h | 9 ++- module/lua/ldo.c | 8 +- module/lua/setjmp/setjmp_x86_64.S | 22 ++---- module/zcommon/zfs_fletcher.c | 33 +++++--- module/zfs/vdev_raidz_math.c | 29 ++++++- 22 files changed, 305 insertions(+), 237 deletions(-) diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c index 7c92a7c8d136..41406a25fa29 100644 --- a/module/icp/algs/aes/aes_impl.c +++ b/module/icp/algs/aes/aes_impl.c @@ -404,7 +404,8 @@ aes_impl_set(const char *val) return (err); } -#if defined(_KERNEL) && defined(__linux__) +#if defined(_KERNEL) +#if defined(__linux__) || defined(_WIN32) static int icp_aes_impl_set(const char *val, zfs_kernel_param_t *kp) @@ -435,8 +436,34 @@ icp_aes_impl_get(char *buffer, zfs_kernel_param_t *kp) return (cnt); } +#endif /* Linux || Windows */ + +#ifdef _WIN32 +int +win32_icp_aes_impl_set(ZFS_MODULE_PARAM_ARGS) +{ + uint32_t val; + static unsigned char str[1024] = ""; + + *type = ZT_TYPE_STRING; + + if (set == B_FALSE) { + if (aes_impl_initialized) + icp_aes_impl_get(str, NULL); + *ptr = str; + *len = strlen(str); + return (0); + } + + ASSERT3P(ptr, !=, NULL); + + aes_impl_set(*ptr); + + return (0); +} +#endif /* WIN32 */ module_param_call(icp_aes_impl, icp_aes_impl_set, icp_aes_impl_get, NULL, 0644); MODULE_PARM_DESC(icp_aes_impl, "Select aes implementation."); -#endif +#endif /* KERNEL */ diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c index 133c524aab96..ff359586be97 100644 --- a/module/icp/algs/aes/aes_impl_aesni.c +++ b/module/icp/algs/aes/aes_impl_aesni.c @@ -26,15 +26,16 @@ #include #include +#include /* These functions are used to execute AES-NI instructions: */ -extern int rijndael_key_setup_enc_intel(uint32_t rk[], +extern ASMABI int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits); -extern int 
rijndael_key_setup_dec_intel(uint32_t rk[], +extern ASMABI int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits); -extern void aes_encrypt_intel(const uint32_t rk[], int Nr, +extern ASMABI void aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]); -extern void aes_decrypt_intel(const uint32_t rk[], int Nr, +extern ASMABI void aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]); diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c index 4d53126fd359..e03e81e83099 100644 --- a/module/icp/algs/blake3/blake3_impl.c +++ b/module/icp/algs/blake3/blake3_impl.c @@ -272,7 +272,7 @@ blake3_per_cpu_ctx_fini(void) #define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") -#if defined(__linux__) +#if defined(__linux__) || defined(_WIN32) static int blake3_param_get(char *buffer, zfs_kernel_param_t *unused) @@ -306,16 +306,28 @@ blake3_param_set(const char *val, zfs_kernel_param_t *unused) return (blake3_impl_setname(val)); } -#elif defined(_WIN32) +#endif /* Linux || Windows */ -static uint32_t zfs_blake3_impl = 0; +#if defined(_WIN32) -static int -blake3_param_set(ZFS_MODULE_PARAM_ARGS) +int +win32_blake3_param_set(ZFS_MODULE_PARAM_ARGS) { - *ptr = zt->zt_ptr; - *len = sizeof (uint32_t); - *type = ZT_TYPE_INT; + static char str[1024] = ""; + + *type = ZT_TYPE_STRING; + + if (set == B_FALSE) { + if (blake3_initialized) + blake3_param_get(str, NULL); + *ptr = str; + *len = strlen(str); + return (0); + } + + ASSERT3P(ptr, !=, NULL); + + blake3_impl_setname(*ptr); return (0); } diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h index eef74eaa9098..ecb51e3a3010 100644 --- a/module/icp/algs/blake3/blake3_impl.h +++ b/module/icp/algs/blake3/blake3_impl.h @@ -35,6 +35,7 @@ extern "C" { #include #include #include +#include /* * Methods used to define BLAKE3 assembler implementations diff --git 
a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c index 8139789fd779..efe1f6040afe 100644 --- a/module/icp/algs/blake3/blake3_x86-64.c +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -29,20 +29,20 @@ (defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); -extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); -extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, +extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); -static void blake3_compress_in_place_sse2(uint32_t cv[8], +static void ASMABI blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { kfpu_begin(); @@ -95,15 +95,15 @@ const blake3_ops_t blake3_sse2_impl = { (defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); -extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); -extern void 
zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, +extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); @@ -158,7 +158,7 @@ const blake3_ops_t blake3_sse41_impl = { #endif #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, +extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); @@ -190,15 +190,15 @@ const blake3_ops_t blake3_avx2_impl = { #endif #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); -extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); -extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, +extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index 16ef14b8ccaf..6e17514bb1c8 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -59,7 +59,7 @@ boolean_t gcm_avx_can_use_movbe = B_FALSE; static boolean_t gcm_use_avx = B_FALSE; #define 
GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) -extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); +extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *); static inline boolean_t gcm_avx_will_work(void); static inline void gcm_set_avx(boolean_t); @@ -994,7 +994,8 @@ gcm_impl_set(const char *val) return (err); } -#if defined(_KERNEL) && defined(__linux__) +#if defined(_KERNEL) +#if defined(__linux__) || defined(_WIN32) static int icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp) @@ -1031,6 +1032,32 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) return (cnt); } +#endif /* linux || windows */ + +#ifdef _WIN32 +int +win32_icp_gcm_impl_set(ZFS_MODULE_PARAM_ARGS) +{ + uint32_t val; + static unsigned char str[1024] = ""; + + *type = ZT_TYPE_STRING; + + if (set == B_FALSE) { + if (gcm_impl_initialized) + icp_gcm_impl_get(str, NULL); + *ptr = str; + *len = strlen(str); + return (0); + } + + ASSERT3P(ptr, !=, NULL); + + gcm_impl_set(*ptr); + + return (0); +} +#endif module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, NULL, 0644); @@ -1071,19 +1098,19 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); static uint32_t gcm_avx_chunk_size = ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; -extern void clear_fpu_regs_avx(void); -extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); -extern void aes_encrypt_intel(const uint32_t rk[], int nr, +extern void ASMABI clear_fpu_regs_avx(void); +extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst); +extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); -extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); -extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, +extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); +extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, const uint8_t *in, 
size_t len); -extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, +extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); -extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, +extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); static inline boolean_t @@ -1584,6 +1611,36 @@ icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) return (error); } +#ifdef _WIN32 +/* Lives in here to have access to GCM macros */ +int +win32_icp_gcm_avx_set_chunk_size(ZFS_MODULE_PARAM_ARGS) +{ + uint32_t val; + + *type = ZT_TYPE_UINT; + + if (set == B_FALSE) { + *ptr = &gcm_avx_chunk_size; + *len = sizeof (gcm_avx_chunk_size); + return (0); + } + + ASSERT3U(*len, >=, sizeof (gcm_avx_chunk_size)); + + val = *(uint32_t *)(*ptr); + + val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; + + if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) + return (-EINVAL); + + gcm_avx_chunk_size = val; + + return (0); +} +#endif + module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, param_get_uint, &gcm_avx_chunk_size, 0644); diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c index c2c8bc221203..737d2e47ecb7 100644 --- a/module/icp/algs/modes/gcm_pclmulqdq.c +++ b/module/icp/algs/modes/gcm_pclmulqdq.c @@ -26,9 +26,10 @@ #include #include +#include /* These functions are used to execute pclmulqdq based assembly methods */ -extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *); +extern void ASMABI gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *); #include diff --git a/module/icp/algs/sha2/sha2.c b/module/icp/algs/sha2/sha2.c index 151432f1a5df..e6bbe34eaa57 100644 --- a/module/icp/algs/sha2/sha2.c +++ b/module/icp/algs/sha2/sha2.c @@ -48,6 +48,7 @@ #define HAVE_HTONL #endif #include /* for _ILP32 */ +#include static void Encode(uint8_t 
*, uint32_t *, size_t); static void Encode64(uint8_t *, uint64_t *, size_t); @@ -57,8 +58,8 @@ static void Encode64(uint8_t *, uint64_t *, size_t); #define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1) #define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1) -void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); -void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); +void ASMABI SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); +void ASMABI SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); #else static void SHA256Transform(SHA2_CTX *, const uint8_t *); diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S index a0525dd464f5..d5cf4040fb93 100644 --- a/module/icp/asm-x86_64/aes/aes_amd64.S +++ b/module/icp/asm-x86_64/aes/aes_amd64.S @@ -188,13 +188,13 @@ #include void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], - uint32_t ct[4]) { - (void) rk, (void) Nr, (void) pt, (void) ct; + uint32_t ct[4]) { + (void) rk, (void) Nr, (void) pt, (void) ct; } void aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], - uint32_t pt[4]) { - (void) rk, (void) Nr, (void) pt, (void) ct; + uint32_t pt[4]) { + (void) rk, (void) Nr, (void) pt, (void) ct; } @@ -221,23 +221,23 @@ aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], // finite field multiplies by {02}, {04} and {08} -#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]] -#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]] -#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]] +#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) +#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) +#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) // finite field multiplies required in table generation -#define f3(x) [[f2(x)] ^ [x]] -#define f9(x) [[f8(x)] ^ [x]] -#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]] 
-#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]] -#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]] +#define f3(x) ((f2(x)) ^ (x)) +#define f9(x) ((f8(x)) ^ (x)) +#define fb(x) ((f8(x)) ^ (f2(x)) ^ (x)) +#define fd(x) ((f8(x)) ^ (f4(x)) ^ (x)) +#define fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x))) // macros for expanding S-box data -#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)] -#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x] -#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0 +#define u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x)) +#define v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x) +#define w8(x) (x), 0, 0, 0, (x), 0, 0, 0 #define enc_vals(x) \ .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ @@ -693,7 +693,7 @@ aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ -.section .rodata +SECTION_STATIC .align 64 enc_tab: enc_vals(u8) @@ -718,7 +718,7 @@ ENTRY_NP(aes_encrypt_amd64) #else // OpenSolaris OS interface - sub $[4*8], %rsp // Make room on stack to save registers + sub $(4*8), %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer @@ -749,11 +749,11 @@ ENTRY_NP(aes_encrypt_amd64) lea (kptr,%rsi), kptr // Jump based on byte key length * 16: - cmp $[10*16], %esi + cmp $(10*16), %esi je 3f - cmp $[12*16], %esi + cmp $(12*16), %esi je 2f - cmp $[14*16], %esi + cmp $(14*16), %esi je 1f mov $-1, %rax // error jmp 4f @@ -785,7 +785,7 @@ ENTRY_NP(aes_encrypt_amd64) mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 - add $[4*8], %rsp + add $(4*8), %rsp RET SET_SIZE(aes_encrypt_amd64) @@ -799,7 +799,7 @@ ENTRY_NP(aes_encrypt_amd64) * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ -.section .rodata 
+SECTION_STATIC .align 64 dec_tab: dec_vals(v8) @@ -824,7 +824,7 @@ ENTRY_NP(aes_decrypt_amd64) #else // OpenSolaris OS interface - sub $[4*8], %rsp // Make room on stack to save registers + sub $(4*8), %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer @@ -861,11 +861,11 @@ ENTRY_NP(aes_decrypt_amd64) xor rofs+12(%rdi), %edx // Jump based on byte key length * 16: - cmp $[10*16], %esi + cmp $(10*16), %esi je 3f - cmp $[12*16], %esi + cmp $(12*16), %esi je 2f - cmp $[14*16], %esi + cmp $(14*16), %esi je 1f mov $-1, %rax // error jmp 4f @@ -897,11 +897,11 @@ ENTRY_NP(aes_decrypt_amd64) mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 - add $[4*8], %rsp + add $(4*8), %rsp RET SET_SIZE(aes_decrypt_amd64) -#endif /* lint || __lint */ +#endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S index cb08430b81ed..21236eb48b19 100644 --- a/module/icp/asm-x86_64/blake3/blake3_avx2.S +++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S @@ -31,12 +31,9 @@ #include .intel_syntax noprefix -.global zfs_blake3_hash_many_avx2 .text -.type zfs_blake3_hash_many_avx2,@function -.p2align 6 -zfs_blake3_hash_many_avx2: +ENTRY_NP(zfs_blake3_hash_many_avx2) ENDBR push r15 push r14 @@ -1791,13 +1788,10 @@ zfs_blake3_hash_many_avx2: vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b -.size zfs_blake3_hash_many_avx2, . 
- zfs_blake3_hash_many_avx2 +SET_SIZE(zfs_blake3_hash_many_avx2) -#ifdef __APPLE__ -.static_data -#else +SECTION_STATIC .section .rodata -#endif .p2align 6 ADD0: diff --git a/module/icp/asm-x86_64/blake3/blake3_avx512.S b/module/icp/asm-x86_64/blake3/blake3_avx512.S index 960406ea2c01..c5bf431383ba 100644 --- a/module/icp/asm-x86_64/blake3/blake3_avx512.S +++ b/module/icp/asm-x86_64/blake3/blake3_avx512.S @@ -31,17 +31,10 @@ #include .intel_syntax noprefix -.global zfs_blake3_hash_many_avx512 -.global zfs_blake3_compress_in_place_avx512 -.global zfs_blake3_compress_xof_avx512 .text -.type zfs_blake3_hash_many_avx512,@function -.type zfs_blake3_compress_xof_avx512,@function -.type zfs_blake3_compress_in_place_avx512,@function -.p2align 6 -zfs_blake3_hash_many_avx512: +ENTRY_NP(zfs_blake3_hash_many_avx512) ENDBR push r15 push r14 @@ -2397,8 +2390,7 @@ zfs_blake3_hash_many_avx512: vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b -.p2align 6 -zfs_blake3_compress_in_place_avx512: +ENTRY_NP(zfs_blake3_compress_in_place_avx512) ENDBR vmovdqu xmm0, xmmword ptr [rdi] vmovdqu xmm1, xmmword ptr [rdi+0x10] @@ -2479,8 +2471,7 @@ zfs_blake3_compress_in_place_avx512: vmovdqu xmmword ptr [rdi+0x10], xmm1 RET -.p2align 6 -zfs_blake3_compress_xof_avx512: +ENTRY_NP(zfs_blake3_compress_xof_avx512) ENDBR vmovdqu xmm0, xmmword ptr [rdi] vmovdqu xmm1, xmmword ptr [rdi+0x10] @@ -2565,15 +2556,11 @@ zfs_blake3_compress_xof_avx512: vmovdqu xmmword ptr [r9+0x30], xmm3 RET -.size zfs_blake3_hash_many_avx512, . - zfs_blake3_hash_many_avx512 -.size zfs_blake3_compress_in_place_avx512, . - zfs_blake3_compress_in_place_avx512 -.size zfs_blake3_compress_xof_avx512, . 
- zfs_blake3_compress_xof_avx512 +SET_SIZE(zfs_blake3_hash_many_avx512) +SET_SIZE(zfs_blake3_compress_in_place_avx512) +SET_SIZE(zfs_blake3_compress_xof_avx512) -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif +SECTION_STATIC .p2align 6 INDEX0: diff --git a/module/icp/asm-x86_64/blake3/blake3_sse2.S b/module/icp/asm-x86_64/blake3/blake3_sse2.S index c4290aaa8faf..358e3c0c3c9f 100644 --- a/module/icp/asm-x86_64/blake3/blake3_sse2.S +++ b/module/icp/asm-x86_64/blake3/blake3_sse2.S @@ -31,17 +31,10 @@ #include .intel_syntax noprefix -.global zfs_blake3_hash_many_sse2 -.global zfs_blake3_compress_in_place_sse2 -.global zfs_blake3_compress_xof_sse2 -.text -.type zfs_blake3_hash_many_sse2,@function -.type zfs_blake3_compress_in_place_sse2,@function -.type zfs_blake3_compress_xof_sse2,@function +SECTION_TEXT - .p2align 6 -zfs_blake3_hash_many_sse2: +ENTRY_NP(zfs_blake3_hash_many_sse2) ENDBR push r15 push r14 @@ -2038,8 +2031,7 @@ zfs_blake3_hash_many_sse2: movups xmmword ptr [rbx+0x10], xmm1 jmp 4b -.p2align 6 -zfs_blake3_compress_in_place_sse2: +ENTRY_NP(zfs_blake3_compress_in_place_sse2) ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] @@ -2149,8 +2141,7 @@ zfs_blake3_compress_in_place_sse2: movups xmmword ptr [rdi+0x10], xmm1 RET -.p2align 6 -zfs_blake3_compress_xof_sse2: +ENTRY_NP(zfs_blake3_compress_xof_sse2) ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] @@ -2268,15 +2259,11 @@ zfs_blake3_compress_xof_sse2: movups xmmword ptr [r9+0x30], xmm3 RET -.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2 -.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2 -.size zfs_blake3_compress_xof_sse2, . 
- zfs_blake3_compress_xof_sse2 +SET_SIZE(zfs_blake3_hash_many_sse2) +SET_SIZE(zfs_blake3_compress_in_place_sse2) +SET_SIZE(zfs_blake3_compress_xof_sse2) -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif +SECTION_STATIC .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 diff --git a/module/icp/asm-x86_64/blake3/blake3_sse41.S b/module/icp/asm-x86_64/blake3/blake3_sse41.S index 45b90cc9ed89..d17411f3a1f9 100644 --- a/module/icp/asm-x86_64/blake3/blake3_sse41.S +++ b/module/icp/asm-x86_64/blake3/blake3_sse41.S @@ -31,17 +31,10 @@ #include .intel_syntax noprefix -.global zfs_blake3_compress_in_place_sse41 -.global zfs_blake3_compress_xof_sse41 -.global zfs_blake3_hash_many_sse41 .text -.type zfs_blake3_hash_many_sse41,@function -.type zfs_blake3_compress_in_place_sse41,@function -.type zfs_blake3_compress_xof_sse41,@function -.p2align 6 -zfs_blake3_hash_many_sse41: +ENTRY_NP(zfs_blake3_hash_many_sse41) ENDBR push r15 push r14 @@ -1800,8 +1793,7 @@ zfs_blake3_hash_many_sse41: movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b -.p2align 6 -zfs_blake3_compress_in_place_sse41: +ENTRY_NP(zfs_blake3_compress_in_place_sse41) ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] @@ -1899,8 +1891,7 @@ zfs_blake3_compress_in_place_sse41: movups xmmword ptr [rdi], xmm0 movups xmmword ptr [rdi+0x10], xmm1 RET -.p2align 6 -zfs_blake3_compress_xof_sse41: +ENTRY_NP(zfs_blake3_compress_xof_sse41) ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] @@ -2007,15 +1998,12 @@ zfs_blake3_compress_xof_sse41: movups xmmword ptr [r9+0x30], xmm3 RET -.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41 -.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41 -.size zfs_blake3_compress_xof_sse41, . 
- zfs_blake3_compress_xof_sse41 +SET_SIZE(zfs_blake3_hash_many_sse41) +SET_SIZE(zfs_blake3_compress_in_place_sse41) +SET_SIZE(zfs_blake3_compress_xof_sse41) + +SECTION_STATIC -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index cf17b3768712..3836144dc4fc 100644 --- a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -50,14 +50,15 @@ #define _ASM #include +/* Windows userland links with OpenSSL */ +#if !defined (_WIN32) || defined (_KERNEL) + .extern gcm_avx_can_use_movbe .text #ifdef HAVE_MOVBE -.type _aesni_ctr32_ghash_6x,@function -.align 32 -_aesni_ctr32_ghash_6x: +ENTRY_NP(_aesni_ctr32_ghash_6x) .cfi_startproc ENDBR vmovdqu 32(%r11),%xmm2 @@ -369,12 +370,10 @@ _aesni_ctr32_ghash_6x: RET .cfi_endproc -.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +SET_SIZE(_aesni_ctr32_ghash_6x) #endif /* ifdef HAVE_MOVBE */ -.type _aesni_ctr32_ghash_no_movbe_6x,@function -.align 32 -_aesni_ctr32_ghash_no_movbe_6x: +ENTRY_NP(_aesni_ctr32_ghash_no_movbe_6x) .cfi_startproc ENDBR vmovdqu 32(%r11),%xmm2 @@ -698,12 +697,9 @@ _aesni_ctr32_ghash_no_movbe_6x: RET .cfi_endproc -.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x +SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x) -.globl aesni_gcm_decrypt -.type aesni_gcm_decrypt,@function -.align 32 -aesni_gcm_decrypt: +ENTRY_NP(aesni_gcm_decrypt) .cfi_startproc ENDBR xorq %r10,%r10 @@ -818,10 +814,9 @@ aesni_gcm_decrypt: movq %r10,%rax RET .cfi_endproc -.size aesni_gcm_decrypt,.-aesni_gcm_decrypt -.type _aesni_ctr32_6x,@function -.align 32 -_aesni_ctr32_6x: +SET_SIZE(aesni_gcm_decrypt) + +ENTRY_NP(_aesni_ctr32_6x) .cfi_startproc ENDBR vmovdqu 0-128(%rcx),%xmm4 @@ -911,12 +906,9 @@ _aesni_ctr32_6x: vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .cfi_endproc -.size _aesni_ctr32_6x,.-_aesni_ctr32_6x 
+SET_SIZE(_aesni_ctr32_6x) -.globl aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -.align 32 -aesni_gcm_encrypt: +ENTRY_NP(aesni_gcm_encrypt) .cfi_startproc ENDBR xorq %r10,%r10 @@ -1196,7 +1188,9 @@ aesni_gcm_encrypt: movq %r10,%rax RET .cfi_endproc -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +SET_SIZE(aesni_gcm_encrypt) + +#endif /* !_WIN32 || _KERNEL */ /* Some utility routines */ @@ -1204,13 +1198,10 @@ aesni_gcm_encrypt: * clear all fpu registers * void clear_fpu_regs_avx(void); */ -.globl clear_fpu_regs_avx -.type clear_fpu_regs_avx,@function -.align 32 -clear_fpu_regs_avx: +ENTRY_NP(clear_fpu_regs_avx) vzeroall RET -.size clear_fpu_regs_avx,.-clear_fpu_regs_avx +SET_SIZE(clear_fpu_regs_avx) /* * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); @@ -1219,25 +1210,19 @@ clear_fpu_regs_avx: * stores the result at `dst'. The XOR is performed using FPU registers, * so make sure FPU state is saved when running this in the kernel. */ -.globl gcm_xor_avx -.type gcm_xor_avx,@function -.align 32 -gcm_xor_avx: +ENTRY_NP(gcm_xor_avx) movdqu (%rdi), %xmm0 movdqu (%rsi), %xmm1 pxor %xmm1, %xmm0 movdqu %xmm0, (%rsi) RET -.size gcm_xor_avx,.-gcm_xor_avx +SET_SIZE(gcm_xor_avx) /* * Toggle a boolean_t value atomically and return the new value. 
* boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); */ -.globl atomic_toggle_boolean_nv -.type atomic_toggle_boolean_nv,@function -.align 32 -atomic_toggle_boolean_nv: +ENTRY_NP(atomic_toggle_boolean_nv) xorl %eax, %eax lock xorl $1, (%rdi) @@ -1245,9 +1230,10 @@ atomic_toggle_boolean_nv: movl $1, %eax 1: RET -.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv +SET_SIZE(atomic_toggle_boolean_nv) + +SECTION_STATIC -.pushsection .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -1261,7 +1247,6 @@ atomic_toggle_boolean_nv: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 -.popsection /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S index bf3724a23eae..ccdb1937cb73 100644 --- a/module/icp/asm-x86_64/modes/ghash-x86_64.S +++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S @@ -102,12 +102,13 @@ .text -.globl gcm_gmult_clmul -.type gcm_gmult_clmul,@function -.align 16 -gcm_gmult_clmul: +/* Windows userland links with OpenSSL */ +#if !defined (_WIN32) || defined (_KERNEL) +ENTRY_NP(gcm_gmult_clmul) + .cfi_startproc ENDBR + .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 @@ -155,12 +156,10 @@ gcm_gmult_clmul: movdqu %xmm0,(%rdi) RET .cfi_endproc -.size gcm_gmult_clmul,.-gcm_gmult_clmul +SET_SIZE(gcm_gmult_clmul) +#endif /* !_WIN32 || _KERNEL */ -.globl gcm_init_htab_avx -.type gcm_init_htab_avx,@function -.align 32 -gcm_init_htab_avx: +ENTRY_NP(gcm_init_htab_avx) .cfi_startproc ENDBR vzeroupper @@ -269,21 +268,17 @@ gcm_init_htab_avx: vzeroupper RET .cfi_endproc -.size gcm_init_htab_avx,.-gcm_init_htab_avx +SET_SIZE(gcm_init_htab_avx) -.globl gcm_gmult_avx -.type 
gcm_gmult_avx,@function -.align 32 -gcm_gmult_avx: +#if !defined (_WIN32) || defined (_KERNEL) +ENTRY_NP(gcm_gmult_avx) .cfi_startproc ENDBR jmp .L_gmult_clmul .cfi_endproc -.size gcm_gmult_avx,.-gcm_gmult_avx -.globl gcm_ghash_avx -.type gcm_ghash_avx,@function -.align 32 -gcm_ghash_avx: +SET_SIZE(gcm_gmult_avx) + +ENTRY_NP(gcm_ghash_avx) .cfi_startproc ENDBR vzeroupper @@ -658,9 +653,11 @@ gcm_ghash_avx: vzeroupper RET .cfi_endproc -.size gcm_ghash_avx,.-gcm_ghash_avx +SET_SIZE(gcm_ghash_avx) + +#endif /* !_WIN32 || _KERNEL */ -.pushsection .rodata +SECTION_STATIC .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -671,13 +668,13 @@ gcm_ghash_avx: .L7_mask_poly: .long 7,0,450,0 .align 64 -.type .Lrem_4bit,@object +SET_OBJ(.Lrem_4bit) .Lrem_4bit: .long 0,0,0,471859200,0,943718400,0,610271232 .long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 .long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 .long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 -.type .Lrem_8bit,@object +SET_OBJ(.Lrem_8bit) .Lrem_8bit: .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E @@ -714,7 +711,6 @@ gcm_ghash_avx: .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 -.popsection /* Mark the stack non-executable. 
*/ #if defined(__linux__) && defined(__ELF__) diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index 60d34b4a3be0..3b68f72355ba 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -2065,7 +2065,7 @@ SET_SIZE(SHA256TransformBlocks) .section .rodata .align 64 -.type K256,@object +SET_OBJ(K256) K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -2085,6 +2085,6 @@ K256: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 #endif /* !lint && !__lint */ -#ifdef __ELF__ +#if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S index ed7fb362a1ac..48f1f34ca085 100644 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -2066,7 +2066,7 @@ SET_SIZE(SHA512TransformBlocks) .section .rodata .align 64 -.type K512,@object +SET_OBJ(K512) K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc @@ -2110,6 +2110,6 @@ K512: .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif /* !lint && !__lint */ -#ifdef __ELF__ +#if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h index fe5c23974682..66eb4a6c8fb6 100644 --- a/module/icp/include/aes/aes_impl.h +++ b/module/icp/include/aes/aes_impl.h @@ -36,6 +36,7 @@ extern "C" { #include #include +#include /* Similar to sysmacros.h IS_P2ALIGNED, but checks two pointers: */ #define IS_P2ALIGNED2(v, w, a) \ @@ -190,13 +191,13 @@ extern const aes_impl_ops_t aes_generic_impl; extern const aes_impl_ops_t aes_x86_64_impl; /* These functions are used to execute amd64 instructions for AMD or Intel: */ -extern int rijndael_key_setup_enc_amd64(uint32_t rk[], +extern ASMABI int 
rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[], int keyBits); -extern int rijndael_key_setup_dec_amd64(uint32_t rk[], +extern ASMABI int rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[], int keyBits); -extern void aes_encrypt_amd64(const uint32_t rk[], int Nr, +extern ASMABI void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]); -extern void aes_decrypt_amd64(const uint32_t rk[], int Nr, +extern ASMABI void aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]); #endif #if defined(__x86_64) && defined(HAVE_AES) diff --git a/module/lua/ldo.c b/module/lua/ldo.c index 6e292df5ba7c..d101284f0e64 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -25,7 +25,7 @@ #include "ltm.h" #include "lvm.h" #include "lzio.h" - +#include /* Return the number of bytes available on the stack. */ @@ -75,7 +75,7 @@ static intptr_t stack_remaining(void) { #define JMP_BUF_CNT 6 #elif defined(__x86_64__) #ifdef _WIN32 -#define JMP_BUF_CNT 10 // +rsi +rdi see win_setjmp_x86_64.S +#define JMP_BUF_CNT 10 // +rsi +rdi #else #define JMP_BUF_CNT 8 #endif @@ -99,8 +99,8 @@ static intptr_t stack_remaining(void) { typedef struct _label_t { long long unsigned val[JMP_BUF_CNT]; } label_t; -int setjmp(label_t *) __attribute__ ((__nothrow__)); -extern __attribute__((noreturn)) void longjmp(label_t *); +int ASMABI setjmp(label_t *) __attribute__ ((__nothrow__)); +extern __attribute__((noreturn)) void ASMABI longjmp(label_t *); #define LUAI_THROW(L,c) longjmp(&(c)->b) #define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a } diff --git a/module/lua/setjmp/setjmp_x86_64.S b/module/lua/setjmp/setjmp_x86_64.S index 7e13fea05dda..e0c362d9045b 100644 --- a/module/lua/setjmp/setjmp_x86_64.S +++ b/module/lua/setjmp/setjmp_x86_64.S @@ -27,28 +27,16 @@ #include #endif -#ifndef RET -#define RET ret -#endif - -#undef ENTRY -#define ENTRY(x) \ - .text; \ - .align 8; \ - .globl x; \ - .type x, @function; 
\ -x: - -#define SET_SIZE(x) \ - .size x, [.-x] - /* * Setjmp and longjmp implement non-local gotos using state vectors * type label_t. */ #ifdef __x86_64__ - ENTRY(setjmp) +#define _ASM +#include + + ENTRY_NP(setjmp) movq %rsp, 0(%rdi) movq %rbp, 8(%rdi) movq %rbx, 16(%rdi) @@ -62,7 +50,7 @@ x: RET SET_SIZE(setjmp) - ENTRY(longjmp) + ENTRY_NP(longjmp) movq 0(%rdi), %rsp movq 8(%rdi), %rbp movq 16(%rdi), %rbx diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 06bb900ec8de..8e950dbf8e9a 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -892,7 +892,7 @@ zio_abd_checksum_func_t fletcher_4_abd_ops = { #define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") -#if defined(__linux__) +#if defined(__linux__) || defined(_WIN32) static int fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) @@ -921,21 +921,36 @@ fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) return (fletcher_4_impl_set(val)); } -#elif defined(_WIN32) +#endif /* Linux || Windows */ -static uint32_t zfs_fletcher_4_impl = 0; +#ifdef _WIN32 -static int -fletcher_4_param_set(ZFS_MODULE_PARAM_ARGS) +int +win32_fletcher_4_param_set(ZFS_MODULE_PARAM_ARGS) { - *ptr = zt->zt_ptr; - *len = sizeof (uint32_t); - *type = ZT_TYPE_INT; + uint32_t val; + static unsigned char str[1024] = ""; + + *type = ZT_TYPE_STRING; + + if (set == B_FALSE) { + if (fletcher_4_initialized) + fletcher_4_param_get(str, NULL); + *ptr = str; + *len = strlen(str); + return (0); + } + + ASSERT3P(ptr, !=, NULL); + + fletcher_4_impl_set(*ptr); return (0); } -#else +#endif /* WIN32 */ + +#ifdef __FreeBSD__ #include diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index 2980f8acfbd7..4f3f5a70d602 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -633,7 +633,8 @@ vdev_raidz_impl_set(const char *val) return (err); } -#if defined(_KERNEL) && defined(__linux__) +#if defined(_KERNEL) +#if defined(__linux__) 
|| defined(_WIN32) static int zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) @@ -664,6 +665,32 @@ zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) return (cnt); } +#endif /* Linux || Windows */ + +#ifdef _WIN32 +int +win32_zfs_vdev_raidz_impl_set(ZFS_MODULE_PARAM_ARGS) +{ + uint32_t val; + static unsigned char str[1024] = ""; + + *type = ZT_TYPE_STRING; + + if (set == B_FALSE) { + if (raidz_math_initialized) + zfs_vdev_raidz_impl_get(str, NULL); + *ptr = str; + *len = strlen(str); + return (0); + } + + ASSERT3P(ptr, !=, NULL); + + vdev_raidz_impl_set(*ptr); + + return (0); +} +#endif module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, zfs_vdev_raidz_impl_get, NULL, 0644); From 4f95556a3510cca1bf0cbcfab4226204011c2404 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Sun, 27 Nov 2022 18:04:43 +0900 Subject: [PATCH 3/3] Windows assembler changes --- CMakeLists.txt | 2 +- include/os/windows/spl/sys/debug.h | 7 +- include/os/windows/spl/sys/linker_set.h | 2 +- include/os/windows/spl/sys/mod_os.h | 19 ++-- include/os/windows/spl/sys/processor.h | 17 +++ include/os/windows/spl/sys/simd.h | 45 ++++---- include/os/windows/spl/sys/thread.h | 2 +- include/os/windows/zfs/zfs_config.h | 17 ++- lib/libicp/CMakeLists.txt | 17 ++- lib/libzpool/CMakeLists.txt | 8 ++ module/icp/CMakeLists.txt | 18 ++-- module/lua/CMakeLists.txt | 2 +- module/os/windows/spl/spl-processor.c | 16 ++- module/os/windows/zfs/sysctl_os.c | 132 ++++++++++++++---------- module/zfs/CMakeLists.txt | 5 + 15 files changed, 205 insertions(+), 104 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb7c5f8c5b0f..55c2efeb7b16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,7 @@ add_compile_options( -Wno-microsoft-enum-forward-reference -Wno-visibility -Wno-microsoft-anon-tag - -Wno-ignored-attributes + # -Wno-ignored-attributes -Wno-unused-command-line-argument -Wno-unused-local-typedef -Wno-int-to-void-pointer-cast diff --git 
a/include/os/windows/spl/sys/debug.h b/include/os/windows/spl/sys/debug.h index a26de24cc9f9..f3f3dd8992f8 100644 --- a/include/os/windows/spl/sys/debug.h +++ b/include/os/windows/spl/sys/debug.h @@ -62,12 +62,12 @@ #define unlikely #define likely -#define __attribute__(X) #define __maybe_unused #define __printflike(X, Y) #define __unused #define always_inline __forceinline -#define _Noreturn +#define _Noreturn __declspec(noreturn) + #else @@ -88,12 +88,11 @@ #define __unused __attribute__((unused)) #define _Noreturn __attribute__((__noreturn__)) - #endif +extern void _Noreturn panic(const char *fmt, ...); -extern void panic(const char *fmt, ...) __attribute__((__noreturn__)); extern void printBuffer(const char *fmt, ...); diff --git a/include/os/windows/spl/sys/linker_set.h b/include/os/windows/spl/sys/linker_set.h index 31115b643f96..fb02a90f5a11 100644 --- a/include/os/windows/spl/sys/linker_set.h +++ b/include/os/windows/spl/sys/linker_set.h @@ -47,7 +47,7 @@ #define __GLOBL(sym) __asm__(".globl " __XSTRING(sym)) #define __WEAK(sym) __asm__(".weak " __XSTRING(sym)) -#define __weak_symbol __attribute__((__weak__)) +#define __weak_symbol /* __attribute__((__weak__)) */ #if __has_attribute(no_sanitize) && defined(__clang__) #ifdef _KERNEL diff --git a/include/os/windows/spl/sys/mod_os.h b/include/os/windows/spl/sys/mod_os.h index 38235ba593b1..c942d619f677 100644 --- a/include/os/windows/spl/sys/mod_os.h +++ b/include/os/windows/spl/sys/mod_os.h @@ -87,7 +87,6 @@ extern "C" { fn(); \ } -#define module_param_call(a, b, c, d, e) #define module_param_named(a, b, c, d) #define module_init_early(fn) \ @@ -304,10 +303,10 @@ ZT_GET_VALUE(ztunable_t *zt, void **ptr, ULONG *len, ULONG *type) #define ZFS_MODULE_PARAM_CALL_IMPL( \ - scope_prefix, name_prefix, name, perm, args, desc) \ + scope_prefix, name_prefix, name, perm, func, args, desc) \ static ztunable_t zt_ ## name_prefix ## name = { \ - .zt_ptr = &name_prefix ## name, \ - .zt_func = args, \ + .zt_ptr = args, \ + 
.zt_func = func, \ .zt_name = #name_prefix #name, \ .zt_prefix = #scope_prefix, \ .zt_desc = #desc, \ @@ -320,9 +319,17 @@ ZT_GET_VALUE(ztunable_t *zt, void **ptr, ULONG *len, ULONG *type) #define ZFS_MODULE_PARAM_CALL( \ scope_prefix, name_prefix, name, func, _, perm, desc) \ ZFS_MODULE_PARAM_CALL_IMPL(scope_prefix, name_prefix, name, perm, \ - func, desc) + func, &name_prefix ## name, desc) + +#define ZFS_MODULE_VIRTUAL_PARAM_CALL( \ + scope_prefix, name_prefix, name, func, _, perm, desc) \ + ZFS_MODULE_PARAM_CALL_IMPL(scope_prefix, name_prefix, name, perm, \ + win32_ ## func, NULL, desc) -#define ZFS_MODULE_VIRTUAL_PARAM_CALL ZFS_MODULE_PARAM_CALL +#define module_param_call(name, _set, _get, var, mode) \ + extern int win32_ ## _set(ZFS_MODULE_PARAM_ARGS); \ + ZFS_MODULE_PARAM_CALL_IMPL(zfs, /* */, name, ZMOD_RW, \ + win32_ ## _set, var, "xxx") struct zfs_kernel_param_s; typedef struct zfs_kernel_param_s zfs_kernel_param_t; diff --git a/include/os/windows/spl/sys/processor.h b/include/os/windows/spl/sys/processor.h index 5088f327becd..d1110726c522 100644 --- a/include/os/windows/spl/sys/processor.h +++ b/include/os/windows/spl/sys/processor.h @@ -8,4 +8,21 @@ extern uint32_t getcpuid(); typedef int processorid_t; +#define CPUID_FEATURE_PCLMULQDQ (1<<1) +#define CPUID_FEATURE_MOVBE (1<<22) +#define CPUID_FEATURE_AES (1<<25) +#define CPUID_FEATURE_XSAVE (1<<26) +#define CPUID_FEATURE_OSXSAVE (1<<27) +#define CPUID_FEATURE_AVX1_0 (1<<28) + +#define CPUID_FEATURE_SSE (1<<25) +#define CPUID_FEATURE_SSE2 (1<<26) +#define CPUID_FEATURE_SSE3 (1<<0) +#define CPUID_FEATURE_SSSE3 (1<<9) +#define CPUID_FEATURE_SSE4_2 (1<<20) +#define CPUID_FEATURE_SSE4_1 (1<<19) + +#define CPUID_LEAF7_FEATURE_AVX2 (1<<5) +#define CPUID_LEAF7_FEATURE_AVX512F (1<<16) + #endif /* _SPL_PROCESSOR_H */ diff --git a/include/os/windows/spl/sys/simd.h b/include/os/windows/spl/sys/simd.h index 6be5940ad704..22f120bc5191 100644 --- a/include/os/windows/spl/sys/simd.h +++ 
b/include/os/windows/spl/sys/simd.h @@ -68,6 +68,7 @@ #define _SIMD_X86_H #include +#include /* only for __x86 */ #if defined(__x86) @@ -92,11 +93,6 @@ xgetbv(uint32_t c) #endif -#define CPUID_FEATURE_PCLMULQDQ (1<<1) -#define CPUID_FEATURE_AES (1<<25) -#define CPUID_FEATURE_XSAVE (1<<26) -// #define CPUID_FEATURE_AVX (1<<28) - extern uint64_t spl_cpuid_features(void); extern uint64_t spl_cpuid_leaf7_features(void); @@ -109,8 +105,16 @@ extern uint64_t spl_cpuid_leaf7_features(void); #define kfpu_init() (0) #define kfpu_fini() do {} while (0) -#define kfpu_begin() ((void)0) -#define kfpu_end() ((void)0) +extern uint32_t kfpu_state; + +#define kfpu_begin() \ + NTSTATUS saveStatus = STATUS_INVALID_PARAMETER; \ + XSTATE_SAVE SaveState; \ + saveStatus = KeSaveExtendedProcessorState(kfpu_state, &SaveState); + +#define kfpu_end() \ + if (NT_SUCCESS(saveStatus)) \ + KeRestoreExtendedProcessorState(&SaveState); /* * CPUID feature tests for user-space. Linux kernel provides an interface for @@ -284,19 +288,6 @@ CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); #endif /* !defined(_KERNEL) */ -// WIN fix me, no asm for now -#define CPUID_FEATURE_SSE 0 -#define CPUID_FEATURE_SSE2 0 -#define CPUID_FEATURE_SSE3 0 -#define CPUID_FEATURE_SSSE3 0 -#define CPUID_FEATURE_SSE4_1 0 -#define CPUID_FEATURE_SSE4_2 0 -#define CPUID_FEATURE_OSXSAVE 0 -#define CPUID_FEATURE_AVX1_0 0 -#define CPUID_FEATURE_SSE 0 -#define CPUID_FEATURE_SSE 0 -#define CPUID_FEATURE_SSE 0 - /* * Detect register set support @@ -727,6 +718,21 @@ zfs_avx512vbmi_available(void) return (has_avx512 && __zmm_enabled()); } +/* B_TRUE only in a kernel build with HAVE_MOVBE when CPUID reports MOVBE; B_FALSE otherwise */ +static inline boolean_t +zfs_movbe_available(void) +{ +#if defined(_KERNEL) +#if defined(HAVE_MOVBE) + return (!!(spl_cpuid_features() & CPUID_FEATURE_MOVBE)); +#else + return (B_FALSE); +#endif +#else + return (B_FALSE); +#endif +} + #endif /* defined(__x86) */ #endif /* _SIMD_X86_H */ diff --git a/include/os/windows/spl/sys/thread.h b/include/os/windows/spl/sys/thread.h index 8b921578f5bf..d66082ec07f7 100644
--- a/include/os/windows/spl/sys/thread.h +++ b/include/os/windows/spl/sys/thread.h @@ -83,7 +83,7 @@ extern kthread_t *spl_thread_create(caddr_t stk, size_t stksize, #endif #define thread_exit spl_thread_exit -extern void spl_thread_exit(void); +extern void __declspec(noreturn) spl_thread_exit(void); extern kthread_t *spl_current_thread(void); diff --git a/include/os/windows/zfs/zfs_config.h b/include/os/windows/zfs/zfs_config.h index 09d76d2330f2..9d1a76f92803 100644 --- a/include/os/windows/zfs/zfs_config.h +++ b/include/os/windows/zfs/zfs_config.h @@ -64,9 +64,24 @@ #define HAVE_USLEEP 1 /* These control which assembler files to use */ -//#define HAVE_AVX 1 +#define HAVE_SSE2 1 +#define HAVE_SSSE3 1 +#define HAVE_SSE4_1 1 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 #define HAVE_PCLMULQDQ 1 +#define HAVE_MOVBE 1 #define HAVE_AES 1 +#define HAVE_AVX512F 1 +#define HAVE_AVX512CD 1 +#define HAVE_AVX512ER 1 +#define HAVE_AVX512BW 1 +#define HAVE_AVX512DQ 1 +#define HAVE_AVX512VL 1 +#define HAVE_AVX512IFMA 1 +#define HAVE_AVX512VBMI 1 +#define HAVE_AVX512PF 1 + /* Path where the kernel module is installed.
*/ #define KERNEL_MODPREFIX "/Library/Extensions" diff --git a/lib/libicp/CMakeLists.txt b/lib/libicp/CMakeLists.txt index e7a71aed06b9..0a04d2a4bd42 100644 --- a/lib/libicp/CMakeLists.txt +++ b/lib/libicp/CMakeLists.txt @@ -41,11 +41,18 @@ add_library(libicp "${ICP_MODULE_DIR}/spi/kcf_spi.c" "${ICP_MODULE_DIR}/asm-x86_64/aes/aeskey.c" - "${ICP_MODULE_DIR}/asm-x86_64/os/windows/aes/aes_aesni.S" - "${ICP_MODULE_DIR}/asm-x86_64/os/windows/aes/aes_amd64.S" - "${ICP_MODULE_DIR}/asm-x86_64/os/windows/sha2/sha256_impl.S" - "${ICP_MODULE_DIR}/asm-x86_64/os/windows/sha2/sha512_impl.S" - "${ICP_MODULE_DIR}/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S" + "${ICP_MODULE_DIR}/asm-x86_64/aes/aes_aesni.S" + "${ICP_MODULE_DIR}/asm-x86_64/aes/aes_amd64.S" + "${ICP_MODULE_DIR}/asm-x86_64/modes/aesni-gcm-x86_64.S" + "${ICP_MODULE_DIR}/asm-x86_64/modes/gcm_pclmulqdq.S" + "${ICP_MODULE_DIR}/asm-x86_64/modes/ghash-x86_64.S" + "${ICP_MODULE_DIR}/asm-x86_64/sha2/sha256_impl.S" + "${ICP_MODULE_DIR}/asm-x86_64/sha2/sha512_impl.S" + + "${ICP_MODULE_DIR}/asm-x86_64/blake3/blake3_avx2.S" + "${ICP_MODULE_DIR}/asm-x86_64/blake3/blake3_avx512.S" + "${ICP_MODULE_DIR}/asm-x86_64/blake3/blake3_sse2.S" + "${ICP_MODULE_DIR}/asm-x86_64/blake3/blake3_sse41.S" ) # Add windows/assembler sources here too. 
diff --git a/lib/libzpool/CMakeLists.txt b/lib/libzpool/CMakeLists.txt index c065f6e28de0..87fabf722ab3 100644 --- a/lib/libzpool/CMakeLists.txt +++ b/lib/libzpool/CMakeLists.txt @@ -38,6 +38,9 @@ add_library(libzpool "${MODULE_DIR}/zcommon/zfs_comutil.c" "${MODULE_DIR}/zcommon/zfs_deleg.c" "${MODULE_DIR}/zcommon/zfs_fletcher.c" + "${MODULE_DIR}/zcommon/zfs_fletcher_avx512.c" + "${MODULE_DIR}/zcommon/zfs_fletcher_intel.c" + "${MODULE_DIR}/zcommon/zfs_fletcher_sse.c" "${MODULE_DIR}/zcommon/zfs_fletcher_superscalar.c" "${MODULE_DIR}/zcommon/zfs_fletcher_superscalar4.c" "${MODULE_DIR}/zcommon/zfs_namecheck.c" @@ -127,7 +130,12 @@ add_library(libzpool "${MODULE_DIR}/zfs/vdev_queue.c" "${MODULE_DIR}/zfs/vdev_raidz.c" "${MODULE_DIR}/zfs/vdev_raidz_math.c" + "${MODULE_DIR}/zfs/vdev_raidz_math_avx2.c" + "${MODULE_DIR}/zfs/vdev_raidz_math_avx512bw.c" + "${MODULE_DIR}/zfs/vdev_raidz_math_avx512f.c" "${MODULE_DIR}/zfs/vdev_raidz_math_scalar.c" + "${MODULE_DIR}/zfs/vdev_raidz_math_sse2.c" + "${MODULE_DIR}/zfs/vdev_raidz_math_ssse3.c" "${MODULE_DIR}/zfs/vdev_rebuild.c" "${MODULE_DIR}/zfs/vdev_removal.c" "${MODULE_DIR}/zfs/vdev_root.c" diff --git a/module/icp/CMakeLists.txt b/module/icp/CMakeLists.txt index 5fc885c6845b..0c0152dbf2b2 100644 --- a/module/icp/CMakeLists.txt +++ b/module/icp/CMakeLists.txt @@ -31,18 +31,18 @@ wdk_add_library(icpkern api/kcf_ctxops.c api/kcf_mac.c asm-x86_64/aes/aeskey.c - asm-x86_64/os/windows/aes/aes_aesni.S - asm-x86_64/os/windows/aes/aes_amd64.S - asm-x86_64/modes/aesni-gcm-x86_64.S - asm-x86_64/os/windows/modes/gcm_pclmulqdq.S + asm-x86_64/aes/aes_aesni.S + asm-x86_64/aes/aes_amd64.S + asm-x86_64/modes/gcm_pclmulqdq.S asm-x86_64/modes/ghash-x86_64.S + asm-x86_64/modes/aesni-gcm-x86_64.S # asm-x86_64/sha1/sha1-x86_64.S - asm-x86_64/os/windows/sha2/sha256_impl.S - asm-x86_64/os/windows/sha2/sha512_impl.S -# asm-x86_64/blake3/blake3_avx2.S -# asm-x86_64/blake3/blake3_avx512.S + asm-x86_64/sha2/sha256_impl.S + asm-x86_64/sha2/sha512_impl.S + 
asm-x86_64/blake3/blake3_avx2.S + asm-x86_64/blake3/blake3_avx512.S asm-x86_64/blake3/blake3_sse2.S -# asm-x86_64/blake3/blake3_sse41.S + asm-x86_64/blake3/blake3_sse41.S core/kcf_callprov.c core/kcf_mech_tabs.c core/kcf_prov_lib.c diff --git a/module/lua/CMakeLists.txt b/module/lua/CMakeLists.txt index 4de852689ff1..2352c9e3e9ef 100644 --- a/module/lua/CMakeLists.txt +++ b/module/lua/CMakeLists.txt @@ -50,6 +50,6 @@ wdk_add_library(luakern #lundump.h lvm.h lzio.h - setjmp/win_setjmp_x86_64.S + setjmp/setjmp_x86_64.S ) target_link_libraries(luakern PRIVATE splkern) diff --git a/module/os/windows/spl/spl-processor.c b/module/os/windows/spl/spl-processor.c index e1dd52435c6f..0eb4347fabdb 100644 --- a/module/os/windows/spl/spl-processor.c +++ b/module/os/windows/spl/spl-processor.c @@ -27,6 +27,9 @@ #include +/* Holds the flags for KeSaveExtendedProcessorState() in simd.h */ +uint32_t kfpu_state = 0; + #ifdef __x86_64__ /* Should probably use MS cpuid() call */ @@ -47,9 +50,6 @@ #endif -/* Place these in header, or better, use MS versions */ -#define CPUID_FEATURE_XSAVE (1<<26) - static uint64_t _spl_cpuid_features = 0ULL; static uint64_t _spl_cpuid_features_leaf7 = 0ULL; static boolean_t _spl_cpuid_has_xgetbv = B_FALSE; @@ -104,6 +104,16 @@ spl_cpuid_features(void) _spl_cpuid(7, a, b, d, c); _spl_cpuid_features_leaf7 = b | (c << 32); } + xprintf("SPL: CPUID 0x%08llx and leaf7 0x%08llx\n", + _spl_cpuid_features, _spl_cpuid_features_leaf7); + + if (_spl_cpuid_features & CPUID_FEATURE_AVX1_0) + kfpu_state |= XSTATE_MASK_AVX; + if (_spl_cpuid_features_leaf7 & CPUID_LEAF7_FEATURE_AVX2) + kfpu_state |= XSTATE_MASK_AVX; + if (_spl_cpuid_features_leaf7 & CPUID_LEAF7_FEATURE_AVX512F) + kfpu_state |= XSTATE_MASK_AVX512; + } #endif diff --git a/module/os/windows/zfs/sysctl_os.c b/module/os/windows/zfs/sysctl_os.c index d372dccd7a2e..3fb255f50921 100644 --- a/module/os/windows/zfs/sysctl_os.c +++ b/module/os/windows/zfs/sysctl_os.c @@ -81,6 +81,67 @@ 
sysctl_os_close_registry(HANDLE regfd) ZwClose(regfd); } +int +sysctl_os_write_registry(HANDLE regfd, ztunable_t *zt, UNICODE_STRING *entry) +{ + void *val = NULL; + ULONG len = 0; + ULONG type = 0; // Registry type + UNICODE_STRING str = { 0 }; + NTSTATUS Status; + ULONG length; + + ZT_GET_VALUE(zt, &val, &len, &type); + + ASSERT3P(val, !=, NULL); + + if (type == ZT_TYPE_STRING) { + + /* + * STRINGS: from zfs/ZT struct to write out to Registry + * Check how much space the conversion will need, allocate + * buffer + * Convert the string from UTF-8 to UTF-16 + * Assign to Registry update. + */ + Status = RtlUTF8ToUnicodeN(NULL, 0, + &length, val, len); + if (!NT_SUCCESS(Status)) + goto skip; + str.Length = str.MaximumLength = length; + str.Buffer = ExAllocatePoolWithTag(PagedPool, length, + 'ZTST'); + if (str.Buffer == NULL) { + Status = STATUS_NO_MEMORY; + goto skip; + } + + Status = RtlUTF8ToUnicodeN(str.Buffer, + str.MaximumLength, &length, val, len); + str.Length = length; + + len = length; + val = str.Buffer; + + if (!NT_SUCCESS(Status)) + goto skip; + } + + Status = ZwSetValueKey( + regfd, + entry, + 0, + ZT_TYPE_REGISTRY(type), + val, + len); + +skip: + if ((type == ZT_TYPE_STRING) && + str.Buffer != NULL) + ExFreePool(str.Buffer); + + return (Status); +} void sysctl_os_process(PUNICODE_STRING pRegistryPath, ztunable_t *zt) @@ -158,58 +219,8 @@ sysctl_os_process(PUNICODE_STRING pRegistryPath, ztunable_t *zt) &length); if (Status == STATUS_OBJECT_NAME_NOT_FOUND) { - void *val = NULL; - ULONG len = 0; - ULONG type = 0; // Registry type - UNICODE_STRING str = { 0 }; - - ZT_GET_VALUE(zt, &val, &len, &type); - - ASSERT3P(val, !=, NULL); - - if (type == ZT_TYPE_STRING) { - - /* - * STRINGS: from zfs/ZT struct to write out to Registry - * Check how much space convert will need, allocate - * buffer - * Convert ascii -> utf8 the string - * Assign to Registry update.
- */ - Status = RtlUTF8ToUnicodeN(NULL, 0, - &length, val, len); - if (!NT_SUCCESS(Status)) - goto skip; - str.Length = str.MaximumLength = length; - str.Buffer = ExAllocatePoolWithTag(PagedPool, length, - 'ZTST'); - if (str.Buffer == NULL) - goto skip; - - Status = RtlUTF8ToUnicodeN(str.Buffer, - str.MaximumLength, &length, val, len); - str.Length = length; - - len = length; - val = str.Buffer; - - if (!NT_SUCCESS(Status)) - goto skip; - } - // No entry, add it - Status = ZwSetValueKey( - regfd, - &entry, - 0, - ZT_TYPE_REGISTRY(type), - val, - len); - -skip: - if ((type == ZT_TYPE_STRING) && - str.Buffer != NULL) - ExFreePool(str.Buffer); + Status = sysctl_os_write_registry(regfd, zt, &entry); } else { // Has entry in Registry, read it, and update tunable @@ -274,7 +285,7 @@ sysctl_os_process(PUNICODE_STRING pRegistryPath, ztunable_t *zt) /* Get space */ strval = ExAllocatePoolWithTag( - PagedPool, length, 'ZTST'); + PagedPool, length + 1, 'ZTST'); if (strval == NULL) goto failed; @@ -295,6 +306,23 @@ sysctl_os_process(PUNICODE_STRING pRegistryPath, ztunable_t *zt) strval != NULL) { ExFreePoolWithTag(strval, '!SFZ'); } + + + /* + * If the registry exists, it is written to by + * user, the actual value may be changed by the + * _set functions, so we need to call GET again, + * and if it differs, update Registry with real + * (new) value. + * So if its a call-out type, it could have been + * adjusted by the call. + */ + if (zt->zt_func != NULL) { + Status = sysctl_os_write_registry(regfd, + zt, &entry); + } + + } // RD vs RW } diff --git a/module/zfs/CMakeLists.txt b/module/zfs/CMakeLists.txt index a274fe7aa3c4..c68677ee718f 100644 --- a/module/zfs/CMakeLists.txt +++ b/module/zfs/CMakeLists.txt @@ -89,6 +89,11 @@ wdk_add_library(zfskern vdev_raidz.c vdev_raidz_math.c vdev_raidz_math_scalar.c + vdev_raidz_math_avx2.c + vdev_raidz_math_avx512f.c + vdev_raidz_math_avx512bw.c + vdev_raidz_math_sse2.c + vdev_raidz_math_ssse3.c vdev_rebuild.c vdev_removal.c vdev_root.c