From 861095be62d1b8db932f1c4c83bffdb28aebffd6 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Tue, 17 Sep 2024 16:20:25 +0100 Subject: [PATCH] Add scalar AArch64 Keccak-f1600 ASM (#133) * Add first AArch64 Keccak-f1600 ASM Signed-off-by: Hanno Becker * Update scalar Keccak ASM with A55-optimized version This should perform decently on most microarchitectures. Signed-off-by: Hanno Becker * Minor cleanup of auto-generated scalar Keccak-f1600 assembly Signed-off-by: Hanno Becker --------- Signed-off-by: Hanno Becker --- fips202/asm/aarch64/common.i | 12 + .../aarch64/keccak_f1600_x1_scalar_opt_a55.S | 458 ++++++++++++++++++ fips202/asm/asm.h | 16 + fips202/keccakf1600.c | 41 +- fips202/keccakf1600.h | 6 + mk/crypto.mk | 8 +- mk/rules.mk | 5 + 7 files changed, 527 insertions(+), 19 deletions(-) create mode 100644 fips202/asm/aarch64/common.i create mode 100644 fips202/asm/aarch64/keccak_f1600_x1_scalar_opt_a55.S create mode 100644 fips202/asm/asm.h diff --git a/fips202/asm/aarch64/common.i b/fips202/asm/aarch64/common.i new file mode 100644 index 000000000..55e0ed803 --- /dev/null +++ b/fips202/asm/aarch64/common.i @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: MIT + +// ASM_LOAD(dst, symbol): load the address of symbol into register dst (adrp + add). +#if __APPLE__ +#define ASM_LOAD(dst, symbol) \ + adrp dst, symbol @PAGE %% add dst, dst, symbol @PAGEOFF +#else +#define ASM_LOAD(dst, symbol) \ + adrp dst, symbol; \ + add dst, dst, : lo12 : symbol; + +#endif diff --git a/fips202/asm/aarch64/keccak_f1600_x1_scalar_opt_a55.S b/fips202/asm/aarch64/keccak_f1600_x1_scalar_opt_a55.S new file mode 100644 index 000000000..d1ee0d155 --- /dev/null +++ b/fips202/asm/aarch64/keccak_f1600_x1_scalar_opt_a55.S @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including
without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +// Author: Hanno Becker +// Author: Matthias Kannwischer + +#include "config.h" +#if defined(MLKEM_USE_AARCH64_ASM) + +// Needed to provide ASM_LOAD directive +#include "common.i" + +/********************** CONSTANTS *************************/ + .data + .balign 64 +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x26 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the
beginning and end of each round. */ + Aba .req x1 + Abe .req x6 + Abi .req x11 + Abo .req x16 + Abu .req x21 + Aga .req x2 + Age .req x7 + Agi .req x12 + Ago .req x17 + Agu .req x22 + Aka .req x3 + Ake .req x8 + Aki .req x13 + Ako .req x28 + Aku .req x23 + Ama .req x4 + Ame .req x9 + Ami .req x14 + Amo .req x19 + Amu .req x24 + Asa .req x5 + Ase .req x10 + Asi .req x15 + Aso .req x20 + Asu .req x25 + +/************************ MACROS ****************************/ + +#define STACK_LOCS 4 + +#define STACK_SIZE (16*6 + (STACK_LOCS) * 8) +#define STACK_BASE_GPRS (3*8+8) +#define STACK_LOC_INPUT (0*8) +#define STACK_LOC_CONST (1*8) +#define STACK_LOC_COUNT (2*8) +#define STACK_LOC_MISC0 (3*8) + +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro load_state + ldp Aba, Abe, [input_addr, #(1*8*0)] + ldp Abi, Abo, [input_addr, #(1*8*2)] + ldp Abu, Aga, [input_addr, #(1*8*4)] + ldp Age, Agi, [input_addr, #(1*8*6)] + ldp Ago, Agu, [input_addr, #(1*8*8)] + ldp Aka, Ake, [input_addr, #(1*8*10)] + ldp Aki, Ako, [input_addr, #(1*8*12)] + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Ame, Ami, [input_addr, #(1*8*16)] + ldp Amo, Amu, [input_addr, #(1*8*18)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + ldp Asi, Aso, [input_addr, #(1*8*22)] + ldr Asu, [input_addr, #(1*8*24)] +.endm + +.macro 
store_state + stp Aba, Abe, [input_addr, #(1*8*0)] + stp Abi, Abo, [input_addr, #(1*8*2)] + stp Abu, Aga, [input_addr, #(1*8*4)] + stp Age, Agi, [input_addr, #(1*8*6)] + stp Ago, Agu, [input_addr, #(1*8*8)] + stp Aka, Ake, [input_addr, #(1*8*10)] + stp Aki, Ako, [input_addr, #(1*8*12)] + stp Aku, Ama, [input_addr, #(1*8*14)] + stp Ame, Ami, [input_addr, #(1*8*16)] + stp Amo, Amu, [input_addr, #(1*8*18)] + stp Asa, Ase, [input_addr, #(1*8*20)] + stp Asi, Aso, [input_addr, #(1*8*22)] + str Asu, [input_addr, #(1*8*24)] +.endm + +.macro final_rotate + ror Abe, Abe,#(64-21) + ror Abi, Abi,#(64-14) + ror Abu, Abu,#(64-44) + ror Aga, Aga,#(64-3) + ror Age, Age,#(64-45) + ror Agi, Agi,#(64-61) + ror Ago, Ago,#(64-28) + ror Agu, Agu,#(64-20) + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + ror Aki, Aki,#(64-18) + ror Ako, Ako,#(64-1) + ror Aku, Aku,#(64-6) + ror Ama, Ama,#(64-10) + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) + ror Amo, Amo,#(64-27) + ror Amu, Amu,#(64-36) + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) + ror Asu, Asu,#(64-55) +.endm + +#define KECCAK_F1600_ROUNDS 24 + +.text +.balign 16 +.global keccak_f1600_x1_scalar_slothy_opt_a55 +.global _keccak_f1600_x1_scalar_slothy_opt_a55 + +keccak_f1600_x1_scalar_slothy_opt_a55: +_keccak_f1600_x1_scalar_slothy_opt_a55: + alloc_stack + save_gprs + +initial: + ASM_LOAD(const_addr, round_constants) + str const_addr, [sp, #STACK_LOC_CONST] + load_state + str input_addr, [sp, #STACK_LOC_INPUT] // @slothy:writes=STACK_LOC_INPUT + + // (Optimized for Cortex-A55) + // Instructions: 107 + // Expected cycles: 54 + // Expected IPC: 1.98 + // + // ----------------- cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|--- + eor x30, x24, x25 // *..................................................... + eor x27, x9, x10 // *..................................................... 
+ eor x0, x30, x21 // .*.................................................... + eor x26, x27, x6 // .*.................................................... + eor x27, x26, x7 // ..*................................................... + eor x29, x0, x22 // ..*................................................... + eor x26, x29, x23 // ...*.................................................. + eor x29, x4, x5 // ...*.................................................. + eor x30, x29, x1 // ....*................................................. + eor x0, x27, x8 // ....*................................................. + eor x29, x30, x2 // .....*................................................ + eor x30, x19, x20 // .....*................................................ + eor x30, x30, x16 // ......*............................................... + eor x27, x26, x0, ror #63 // ......*............................................... + eor x4, x4, x27 // .......*.............................................. + eor x30, x30, x17 // .......*.............................................. + eor x30, x30, x28 // ........*............................................. + eor x29, x29, x3 // ........*............................................. + eor x0, x0, x30, ror #63 // .........*............................................ + eor x30, x30, x29, ror #63 // .........*............................................ + eor x22, x22, x30 // ..........*........................................... + eor x23, x23, x30 // ..........*........................................... + str x23, [sp, #STACK_LOC_MISC0] // ...........*.......................................... + eor x23, x14, x15 // ...........*.......................................... + eor x14, x14, x0 // ............*......................................... + eor x23, x23, x11 // ............*......................................... + eor x15, x15, x0 // .............*........................................ 
+ eor x1, x1, x27 // .............*........................................ + eor x23, x23, x12 // ..............*....................................... + eor x23, x23, x13 // ...............*...................................... + eor x11, x11, x0 // ...............*...................................... + eor x29, x29, x23, ror #63 // ................*..................................... + eor x23, x23, x26, ror #63 // ................*..................................... + eor x26, x13, x0 // .................*.................................... + eor x13, x28, x23 // .................*.................................... + eor x28, x24, x30 // ..................*................................... + eor x24, x16, x23 // ..................*................................... + eor x16, x21, x30 // ...................*.................................. + eor x21, x25, x30 // ...................*.................................. + eor x30, x19, x23 // ....................*................................. + eor x19, x20, x23 // ....................*................................. + eor x20, x17, x23 // .....................*................................ + eor x17, x12, x0 // .....................*................................ + eor x0, x2, x27 // ......................*............................... + eor x2, x6, x29 // ......................*............................... + eor x6, x8, x29 // .......................*.............................. + bic x8, x28, x13, ror #47 // .......................*.............................. + eor x12, x3, x27 // ........................*............................. + bic x3, x13, x17, ror #19 // ........................*............................. + eor x5, x5, x27 // .........................*............................ + ldr x27, [sp, #STACK_LOC_MISC0] // .........................*............................ + bic x25, x17, x2, ror #5 // ..........................*........................... 
+ eor x9, x9, x29 // ..........................*........................... + eor x23, x25, x5, ror #52 // ...........................*.......................... + eor x3, x3, x2, ror #24 // ...........................*.......................... + eor x8, x8, x17, ror #2 // ............................*......................... + eor x17, x10, x29 // ............................*......................... + bic x25, x12, x22, ror #47 // .............................*........................ + eor x29, x7, x29 // .............................*........................ + bic x10, x4, x27, ror #2 // ..............................*....................... + bic x7, x5, x28, ror #10 // ..............................*....................... + eor x10, x10, x20, ror #50 // ...............................*...................... + eor x13, x7, x13, ror #57 // ...............................*...................... + bic x7, x2, x5, ror #47 // ................................*..................... + eor x2, x25, x24, ror #39 // ................................*..................... + bic x25, x20, x11, ror #57 // .................................*.................... + bic x5, x17, x4, ror #25 // .................................*.................... + eor x25, x25, x17, ror #53 // ..................................*................... + bic x17, x11, x17, ror #60 // ..................................*................... + eor x28, x7, x28, ror #57 // ...................................*.................. + bic x7, x9, x12, ror #42 // ...................................*.................. + eor x7, x7, x22, ror #25 // ....................................*................. + bic x22, x22, x24, ror #56 // ....................................*................. + bic x24, x24, x15, ror #31 // .....................................*................ + eor x22, x22, x15, ror #23 // .....................................*................ 
+ bic x20, x27, x20, ror #48 // ......................................*............... + bic x15, x15, x9, ror #16 // ......................................*............... + eor x12, x15, x12, ror #58 // .......................................*.............. + eor x15, x5, x27, ror #27 // .......................................*.............. + eor x5, x20, x11, ror #41 // ........................................*............. + ldr x11, [sp, #STACK_LOC_CONST] // ........................................*............. + eor x20, x17, x4, ror #21 // .........................................*............ + eor x17, x24, x9, ror #47 // .........................................*............ + mov x24, #1 // ..........................................*........... + bic x9, x0, x16, ror #9 // ..........................................*........... + str x24, [sp, #STACK_LOC_COUNT] // ...........................................*.......... + bic x24, x29, x1, ror #44 // ...........................................*.......... + bic x27, x1, x21, ror #50 // ............................................*......... + bic x4, x26, x29, ror #63 // ............................................*......... + eor x1, x1, x4, ror #21 // .............................................*........ + ldr x11, [x11] // .............................................*........ + bic x4, x21, x30, ror #57 // ..............................................*....... + eor x21, x24, x21, ror #30 // ..............................................*....... + eor x24, x9, x19, ror #44 // ...............................................*...... + bic x9, x14, x6, ror #5 // ...............................................*...... + eor x9, x9, x0, ror #43 // ................................................*..... + bic x0, x6, x0, ror #38 // ................................................*..... + eor x1, x1, x11 // .................................................*.... 
+ eor x11, x4, x26, ror #35 // .................................................*.... + eor x4, x0, x16, ror #47 // ..................................................*... + bic x0, x16, x19, ror #35 // ..................................................*... + eor x16, x27, x30, ror #43 // ...................................................*.. + bic x27, x30, x26, ror #42 // ...................................................*.. + bic x26, x19, x14, ror #41 // ....................................................*. + eor x19, x0, x14, ror #12 // ....................................................*. + eor x14, x26, x6, ror #46 // .....................................................* + eor x6, x27, x29, ror #41 // .....................................................* + + loop: + // (Optimized for Cortex-A55) + // Instructions: 112 + // Expected cycles: 57 + // Expected IPC: 1.96 + // + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + eor x0, x15, x11, ror #52 // *........................................................ + eor x0, x0, x13, ror #48 // .*....................................................... + eor x26, x8, x9, ror #57 // .*....................................................... + eor x27, x0, x14, ror #10 // ..*...................................................... + eor x29, x16, x28, ror #63 // ..*...................................................... + eor x26, x26, x6, ror #51 // ...*..................................................... + eor x30, x23, x22, ror #50 // ...*..................................................... + eor x0, x26, x10, ror #31 // ....*.................................................... + eor x29, x29, x19, ror #37 // ....*.................................................... + eor x27, x27, x12, ror #5 // .....*................................................... 
+ eor x30, x30, x24, ror #34 // .....*................................................... + eor x0, x0, x7, ror #27 // ......*.................................................. + eor x26, x30, x21, ror #26 // ......*.................................................. + eor x26, x26, x25, ror #15 // .......*................................................. + ror x30, x27, #62 // .......*................................................. + eor x30, x30, x26, ror #57 // ........*................................................ + ror x26, x26, #58 // ........*................................................ + eor x16, x30, x16 // .........*............................................... + eor x28, x30, x28, ror #63 // .........*............................................... + str x28, [sp, #STACK_LOC_MISC0] // ..........*.............................................. + eor x29, x29, x17, ror #36 // ..........*.............................................. + eor x28, x1, x2, ror #61 // ...........*............................................. + eor x19, x30, x19, ror #37 // ...........*............................................. + eor x29, x29, x20, ror #2 // ............*............................................ + eor x28, x28, x4, ror #54 // ............*............................................ + eor x26, x26, x0, ror #55 // .............*........................................... + eor x28, x28, x3, ror #39 // .............*........................................... + eor x28, x28, x5, ror #25 // ..............*.......................................... + ror x0, x0, #56 // ..............*.......................................... + eor x0, x0, x29, ror #63 // ...............*......................................... + eor x27, x28, x27, ror #61 // ...............*......................................... + eor x13, x0, x13, ror #46 // ................*........................................ 
+ eor x28, x29, x28, ror #63 // ................*........................................ + eor x29, x30, x20, ror #2 // .................*....................................... + eor x20, x26, x3, ror #39 // .................*....................................... + eor x11, x0, x11, ror #50 // ..................*...................................... + eor x25, x28, x25, ror #9 // ..................*...................................... + eor x3, x28, x21, ror #20 // ...................*..................................... + eor x21, x26, x1 // ...................*..................................... + eor x9, x27, x9, ror #49 // ....................*.................................... + eor x24, x28, x24, ror #28 // ....................*.................................... + eor x1, x30, x17, ror #36 // .....................*................................... + eor x14, x0, x14, ror #8 // .....................*................................... + eor x22, x28, x22, ror #44 // ......................*.................................. + eor x8, x27, x8, ror #56 // ......................*.................................. + eor x17, x27, x7, ror #19 // .......................*................................. + eor x15, x0, x15, ror #62 // .......................*................................. + bic x7, x20, x22, ror #47 // ........................*................................ + eor x4, x26, x4, ror #54 // ........................*................................ + eor x0, x0, x12, ror #3 // .........................*............................... + eor x28, x28, x23, ror #58 // .........................*............................... + eor x23, x26, x2, ror #61 // ..........................*.............................. + eor x26, x26, x5, ror #25 // ..........................*.............................. + eor x2, x7, x16, ror #39 // ...........................*............................. 
+ bic x7, x9, x20, ror #42 // ...........................*............................. + bic x30, x15, x9, ror #16 // ............................*............................ + eor x7, x7, x22, ror #25 // ............................*............................ + eor x12, x30, x20, ror #58 // .............................*........................... + bic x20, x22, x16, ror #56 // .............................*........................... + eor x30, x27, x6, ror #43 // ..............................*.......................... + eor x22, x20, x15, ror #23 // ..............................*.......................... + bic x6, x19, x13, ror #42 // ...............................*......................... + eor x6, x6, x17, ror #41 // ................................*........................ + bic x5, x13, x17, ror #63 // ................................*........................ + eor x5, x21, x5, ror #21 // .................................*....................... + bic x17, x17, x21, ror #44 // .................................*....................... + eor x27, x27, x10, ror #23 // ..................................*...................... + bic x21, x21, x25, ror #50 // ..................................*...................... + bic x20, x27, x4, ror #25 // ...................................*..................... + bic x10, x16, x15, ror #31 // ...................................*..................... + eor x16, x21, x19, ror #43 // ....................................*.................... + eor x21, x17, x25, ror #30 // ....................................*.................... + bic x19, x25, x19, ror #57 // .....................................*................... + ldr x25, [sp, #STACK_LOC_COUNT] // .....................................*................... + eor x17, x10, x9, ror #47 // ......................................*.................. + ldr x9, [sp, #STACK_LOC_CONST] // ......................................*.................. 
+ eor x15, x20, x28, ror #27 // .......................................*................. + bic x20, x4, x28, ror #2 // .......................................*................. + eor x10, x20, x1, ror #50 // ........................................*................ + bic x20, x11, x27, ror #60 // ........................................*................ + eor x20, x20, x4, ror #21 // .........................................*............... + bic x4, x28, x1, ror #48 // .........................................*............... + bic x1, x1, x11, ror #57 // ..........................................*.............. + ldr x28, [x9, w25, UXTW #3] // ..........................................*.............. + ldr x9, [sp, #STACK_LOC_MISC0] // ...........................................*............. + add x25, x25, #1 // ...........................................*............. + str x25, [sp, #STACK_LOC_COUNT] // ............................................*............ + cmp x25, #(KECCAK_F1600_ROUNDS-1) // ............................................*............ + eor x25, x1, x27, ror #53 // .............................................*........... + bic x27, x30, x26, ror #47 // .............................................*........... + eor x1, x5, x28 // ..............................................*.......... + eor x5, x4, x11, ror #41 // ..............................................*.......... + eor x11, x19, x13, ror #35 // ...............................................*......... + bic x13, x26, x24, ror #10 // ...............................................*......... + eor x28, x27, x24, ror #57 // ................................................*........ + bic x27, x24, x9, ror #47 // ................................................*........ + bic x19, x23, x3, ror #9 // .................................................*....... + bic x4, x29, x14, ror #41 // .................................................*....... 
+ eor x24, x19, x29, ror #44 // ..................................................*...... + bic x29, x3, x29, ror #35 // ..................................................*...... + eor x13, x13, x9, ror #57 // ...................................................*..... + eor x19, x29, x14, ror #12 // ...................................................*..... + bic x29, x9, x0, ror #19 // ....................................................*.... + bic x14, x14, x8, ror #5 // ....................................................*.... + eor x9, x14, x23, ror #43 // .....................................................*... + eor x14, x4, x8, ror #46 // .....................................................*... + bic x23, x8, x23, ror #38 // ......................................................*.. + eor x8, x27, x0, ror #2 // ......................................................*.. + eor x4, x23, x3, ror #47 // .......................................................*. + bic x3, x0, x30, ror #5 // .......................................................*. 
+ eor x23, x3, x26, ror #52 // ........................................................* + eor x3, x29, x30, ror #24 // ........................................................* + ble loop + + final_rotate + ldr input_addr, [sp, #STACK_LOC_INPUT] + store_state + + restore_gprs + free_stack + ret + +#endif /* MLKEM_USE_AARCH64_ASM */ diff --git a/fips202/asm/asm.h b/fips202/asm/asm.h new file mode 100644 index 000000000..c25398aa4 --- /dev/null +++ b/fips202/asm/asm.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +#ifndef ASM_H +#define ASM_H + +#include <stdint.h> +#include "params.h" +#include "config.h" + +#ifdef MLKEM_USE_AARCH64_ASM +void keccak_f1600_x1_scalar_slothy_opt_a55(uint64_t *state); + +#define keccak_f1600_x1_asm keccak_f1600_x1_scalar_slothy_opt_a55 +#endif /* MLKEM_USE_AARCH64_ASM */ + + +#endif diff --git a/fips202/keccakf1600.c b/fips202/keccakf1600.c index 8e862ffe0..0e920ca6c 100644 --- a/fips202/keccakf1600.c +++ b/fips202/keccakf1600.c @@ -11,9 +11,31 @@ #include #include "keccakf1600.h" +#include "asm/asm.h" +#include "config.h" + #define NROUNDS 24 #define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) +void KeccakF1600_StateExtractBytes(uint64_t *state, unsigned char *data, unsigned int offset, unsigned int length) +{ + unsigned int i; + for (i = 0; i < length; i++) + { + data[i] = state[(offset + i) >> 3] >> (8 * ((offset + i) & 0x07)); + } +} + +void KeccakF1600_StateXORBytes(uint64_t *state, const unsigned char *data, unsigned int offset, unsigned int length) +{ + unsigned int i; + for (i = 0; i < length; i++) + { + state[(offset + i) >> 3] ^= (uint64_t)data[i] << (8 * ((offset + i) & 0x07)); + } +} + +#if !defined(MLKEM_USE_AARCH64_ASM) static const uint64_t KeccakF_RoundConstants[NROUNDS] = { (uint64_t)0x0000000000000001ULL, @@ -42,24 +64,6 @@ static const uint64_t KeccakF_RoundConstants[NROUNDS] = (uint64_t)0x8000000080008008ULL }; -void KeccakF1600_StateExtractBytes(uint64_t *state, unsigned char *data, unsigned int offset,
unsigned int length) -{ - unsigned int i; - for (i = 0; i < length; i++) - { - data[i] = state[(offset + i) >> 3] >> (8 * ((offset + i) & 0x07)); - } -} - -void KeccakF1600_StateXORBytes(uint64_t *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - unsigned int i; - for (i = 0; i < length; i++) - { - state[(offset + i) >> 3] ^= (uint64_t)data[i] << (8 * ((offset + i) & 0x07)); - } -} - void KeccakF1600_StatePermute(uint64_t *state) { int round; @@ -326,3 +330,4 @@ void KeccakF1600_StatePermute(uint64_t *state) #undef round } +#endif /* !MLKEM_USE_AARCH64_ASM */ diff --git a/fips202/keccakf1600.h b/fips202/keccakf1600.h index 0b3f5bb43..e43acc454 100644 --- a/fips202/keccakf1600.h +++ b/fips202/keccakf1600.h @@ -2,10 +2,16 @@ #ifndef KECCAKF1600_H #define KECCAKF1600_H +#include "asm/asm.h" #include <stdint.h> void KeccakF1600_StateExtractBytes(uint64_t *state, unsigned char *data, unsigned int offset, unsigned int length); void KeccakF1600_StateXORBytes(uint64_t *state, const unsigned char *data, unsigned int offset, unsigned int length); + +#if !defined(MLKEM_USE_AARCH64_ASM) void KeccakF1600_StatePermute(uint64_t *state); +#else +#define KeccakF1600_StatePermute keccak_f1600_x1_asm +#endif #endif diff --git a/mk/crypto.mk b/mk/crypto.mk index 476191ab4..0e9c868e2 100644 --- a/mk/crypto.mk +++ b/mk/crypto.mk @@ -15,9 +15,15 @@ else CPPFLAGS += -Irandombytes endif +FIPS202_SRCS := $(wildcard fips202/*.c) +ifeq ($(OPT),1) + FIPS202_SRCS += $(wildcard fips202/asm/aarch64/*.S) + CPPFLAGS += -DMLKEM_USE_ASM +endif + $(LIB_DIR)/librng.a: $(call OBJS,$(wildcard randombytes/*.c)) $(LIB_DIR)/libnistrng.a: CFLAGS += -Wno-unused-result -O3 -fomit-frame-pointer $(LIB_DIR)/libnistrng.a: $(call OBJS,$(wildcard test/nistrng/*.c)) -$(LIB_DIR)/libfips202.a: $(call OBJS,$(wildcard fips202/*.c)) +$(LIB_DIR)/libfips202.a: $(call OBJS,$(FIPS202_SRCS)) diff --git a/mk/rules.mk b/mk/rules.mk index 0f009f692..9b67bbd37 100644 --- a/mk/rules.mk +++ b/mk/rules.mk @@ -15,6
+15,11 @@ $(BUILD_DIR)/%.c.o: %.c $(CONFIG) $(Q)[ -d $(@D) ] || mkdir -p $(@D) $(Q)$(CC) -c -o $@ $(CFLAGS) $< +$(BUILD_DIR)/%.S.o: %.S $(CONFIG) + $(Q)echo " AS $@" + $(Q)[ -d $(@D) ] || mkdir -p $(@D) + $(Q)$(CC) -c -o $@ $(CFLAGS) $< + $(BUILD_DIR)/mlkem512/%.c.o: %.c $(CONFIG) $(Q)echo " CC $@" $(Q)[ -d $(@D) ] || mkdir -p $(@D)