diff --git a/crypto_sign/dilithium3/m4f/smallntt.S b/crypto_sign/dilithium3/m4f/smallntt.S
deleted file mode 100644
index 747c111c..00000000
--- a/crypto_sign/dilithium3/m4f/smallntt.S
+++ /dev/null
@@ -1,837 +0,0 @@
-#include "macros.i"
-
-.syntax unified
-.cpu cortex-m4
-.thumb
-
-// general macros
-.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
-    ldr.w \a0, [\a, \mem0]
-    ldr.w \a1, [\a, \mem1]
-    ldr.w \a2, [\a, \mem2]
-    ldr.w \a3, [\a, \mem3]
-.endm
-
-.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
-    str.w \a0, [\a, \mem0]
-    str.w \a1, [\a, \mem1]
-    str.w \a2, [\a, \mem2]
-    str.w \a3, [\a, \mem3]
-.endm
-
-.macro montgomery q, qinv, a, tmp
-    smulbt \tmp, \a, \qinv
-    smlabb \tmp, \q, \tmp, \a
-.endm
-
-.macro montgomery_inplace q, qinv, a, tmp
-    smulbt \tmp, \a, \qinv
-    smlabb \a, \q, \tmp, \a
-.endm
-
-.macro doublemontgomery a, tmp, tmp2, q, qinv, montconst
-    smulbb \tmp2, \a, \montconst
-    montgomery \q, \qinv, \tmp2, \tmp
-    smultb \a, \a, \montconst
-    montgomery \q, \qinv, \a, \tmp2
-    pkhtb \a, \tmp2, \tmp, asr#16
-.endm
-
-// #######
-// #######
-// # NTT #
-// #######
-// #######
-
-.macro mul_twiddle tb, a, twiddle, tmp, tmp2, q, qinv
-    smulb\tb \tmp, \a, \twiddle
-    smult\tb \a, \a, \twiddle
-    montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
-    montgomery \q, \qinv, \a, \tmp    // reduce -> result in tmp
-    pkhtb \a, \tmp, \tmp2, asr#16     // combine results from above in one register as 16-bit halves
-.endm
-
-.macro doublebutterfly tb, a0, a1, twiddle, tmp, tmp2, q, qinv
-    smulb\tb \tmp, \a1, \twiddle      // a1_b * twiddle_tb
-    smult\tb \a1, \a1, \twiddle       // a1_t * twiddle_tb
-    montgomery \q, \qinv, \tmp, \tmp2 // reduce -> result in tmp2
-    montgomery \q, \qinv, \a1, \tmp   // reduce -> result in tmp
-    pkhtb \tmp2, \tmp, \tmp2, asr#16  // combine results from above in one register as 16-bit halves
-    usub16 \a1, \a0, \tmp2            // a0 - a1 * twiddle (a0, a1 contain 2 coeffs)
-    uadd16 \a0, \a0, \tmp2            // a0 + a1 * twiddle (a0, a1 contain 2 coeffs)
-.endm
-
-.macro two_doublebutterfly tb1, tb2, a0, a1, a2, a3, twiddle, tmp, tmp2, q, qinv
-    doublebutterfly \tb1, \a0, \a1, \twiddle, \tmp, \tmp2, \q, \qinv
-    doublebutterfly \tb2, \a2, \a3, \twiddle, \tmp, \tmp2, \q, \qinv
-.endm
-
-.macro _3_layer_double_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2
-    // layer 3
-    ldrh.w \twiddle, [\twiddle_ptr], #2
-    two_doublebutterfly b, b, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly b, b, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 2
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 1
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-    two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-.endm
-
-.macro _3_layer_double_CT_16_fp c0, c1, c2, c3, c4, c5, c6, c7, xi01, xi23, xi45, xi67, twiddle, Qprime, Q, tmp, tmp2
-    // layer 3
-    vmov \twiddle, \xi01
-    two_doublebutterfly t, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly t, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 2
-    vmov \twiddle, \xi23
-    two_doublebutterfly b, b, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    two_doublebutterfly t, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 1
-    vmov \twiddle, \xi45
-    two_doublebutterfly b, t, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    vmov \twiddle, \xi67
-    two_doublebutterfly b, t, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-.endm
-
-.global small_ntt_asm
-.type small_ntt_asm, %function
-.align 2
-small_ntt_asm:
-    push {r4-r11, r14}
-    vpush.w {s16}
-
-    poly .req r0
-    twiddle_ptr .req r1
-    poly0 .req r2
-    poly1 .req r3
-    poly2 .req r4
-    poly3 .req r5
-    poly4 .req r6
-    poly5 .req r7
-    poly6 .req r8
-    poly7 .req r9
-    twiddle .req r10
-    qinv .req r11
-    q .req r11
-    tmp .req r12
-    tmp2 .req r14
-
-    movw q, #769
-    movt qinv, #767
-
-    ### LAYER 7+6+5+4
-    .equ distance, 256
-    .equ offset, 32
-    .equ strincr, 4
-    // pre-load twiddle factors to FPU registers
-    vldm twiddle_ptr!, {s8-s15}
-
-
-    add tmp, poly, #strincr*8
-    vmov s16, tmp
-    1:
-    // load a1, a3, ..., a15
-    load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
-    load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
-
-    // 8-NTT on a1, a3, ..., a15
-    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2
-
-    // multiply coeffs by layer 4 twiddles for later use
-    vmov twiddle, s12
-    mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s13
-    mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s14
-    mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s15
-    mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
-
-    vmov s0, poly0 // a1
-    vmov s1, poly1 // a3
-    vmov s2, poly2 // a5
-    vmov s3, poly3 // a7
-    vmov s4, poly4 // a9
-    vmov s5, poly5 // a11
-    vmov s6, poly6 // a13
-    vmov s7, poly7 // a15
-
-    // ----------
-
-    // load a0, a2, ..., a14
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    // 8-NTT on a0, a2, ..., a14
-    _3_layer_double_CT_16_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, qinv, q, tmp, tmp2
-
-    // layer 4 - 1
-    // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
-    vmov tmp2, s1  // load a3
-    vmov s1, poly0 // preserve a0
-    uadd16 poly0, poly1, tmp2
-    usub16 poly1, poly1, tmp2
-
-    vmov tmp2, s3  // load a7
-    vmov s3, poly2 // preserve a4
-    uadd16 poly2, poly3, tmp2
-    usub16 poly3, poly3, tmp2
-
-    vmov tmp2, s5  // load a11
-    vmov s5, poly4 // preserve a8
-    uadd16 poly4, poly5, tmp2
-    usub16 poly5, poly5, tmp2
-
-    vmov tmp2, s7  // load a15
-    vmov s7, poly6 // preserve a12
-    uadd16 poly6, poly7, tmp2
-    usub16 poly7, poly7, tmp2
-
-    str.w poly0, [poly, #1*distance/4]
-    str.w poly1, [poly, #1*distance/4+offset]
-    str.w poly2, [poly, #3*distance/4]
-    str.w poly3, [poly, #3*distance/4+offset]
-    str.w poly4, [poly, #5*distance/4]
-    str.w poly5, [poly, #5*distance/4+offset]
-    str.w poly6, [poly, #7*distance/4]
-    str.w poly7, [poly, #7*distance/4+offset]
-
-    // layer 4 - 2
-    // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
-    vmov tmp2, s1  // load a0
-    vmov poly1, s0 // load a1
-    uadd16 poly0, tmp2, poly1
-    usub16 poly1, tmp2, poly1
-
-    vmov tmp2, s3  // load a4
-    vmov poly3, s2 // load a5
-    uadd16 poly2, tmp2, poly3
-    usub16 poly3, tmp2, poly3
-
-    vmov tmp2, s5  // load a8
-    vmov poly5, s4 // load a9
-    uadd16 poly4, tmp2, poly5
-    usub16 poly5, tmp2, poly5
-
-    vmov tmp2, s7  // load a12
-    vmov poly7, s6 // load a13
-    uadd16 poly6, tmp2, poly7
-    usub16 poly7, tmp2, poly7
-
-    str.w poly1, [poly, #offset]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #2*distance/4+offset]
-    str.w poly4, [poly, #4*distance/4]
-    str.w poly5, [poly, #4*distance/4+offset]
-    str.w poly6, [poly, #6*distance/4]
-    str.w poly7, [poly, #6*distance/4+offset]
-    str.w poly0, [poly], #4
-
-    vmov tmp, s16
-    cmp.w poly, tmp
-    bne.w 1b
-
-    sub.w poly, #8*strincr
-
-    ### LAYER 3+2+1
-
-    .equ distance, distance/16
-    .equ strincr, 32
-
-    add.w tmp, poly, #strincr*16
-    vmov s13, tmp
-
-    2:
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #strincr
-
-    vmov tmp, s13
-    cmp.w poly, tmp
-    bne.w 2b
-
-    vpop.w {s16}
-    pop {r4-r11, pc}
-
-
-.unreq poly
-.unreq twiddle_ptr
-.unreq poly0
-.unreq poly1
-.unreq poly2
-.unreq poly3
-.unreq poly4
-.unreq poly5
-.unreq poly6
-.unreq poly7
-.unreq twiddle
-.unreq qinv
-.unreq q
-.unreq tmp
-.unreq tmp2
-
-// ########
-// ########
-// # INTT #
-// ########
-// ########
-
-.macro doublebutterfly_light a0, a1, tmp, tmp2, q, qinv
-    uadd16 \tmp, \a0, \a1
-    usub16 \a1, \a0, \a1
-    mov.w \a0, \tmp
-.endm
-
-.macro two_doublebutterfly_light a0, a1, a2, a3, tmp, tmp2, q, qinv
-    doublebutterfly_light \a0, \a1, \tmp, \tmp2, \q, \qinv
-    doublebutterfly_light \a2, \a3, \tmp, \tmp2, \q, \qinv
-.endm
-
-.macro _3_layer_double_inv_CT_16_light c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2
-
-    // layer 1
-    sadd16.w \tmp, \c0, \c1  // c0, c1
-    ssub16.w \c1, \c0, \c1
-    sadd16.w \tmp2, \c2, \c3 // c2, c3
-    ssub16.w \c3, \c2, \c3
-
-    sadd16.w \c0, \c4, \c5 // c4, c5
-    ssub16.w \c5, \c4, \c5
-    sadd16.w \c2, \c6, \c7 // c6, c7
-    ssub16.w \c7, \c6, \c7
-    // c4, c6 are free at this point
-
-    // layer 2
-    sadd16.w \c6, \tmp, \tmp2 // c0, c2
-    ssub16.w \tmp2, \tmp, \tmp2
-    sadd16.w \c4, \c0, \c2 // c4, c6
-    ssub16.w \c2, \c0, \c2
-
-    vmov.w \twiddle, \xi12
-    doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free
-    doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv
-    // c0, c6 are free at this point
-
-    // layer 3
-    sadd16.w \c0, \c6, \c4 // c0, c4
-    ssub16.w \c4, \c6, \c4
-
-    vmov.w \twiddle, \xi34
-    doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv
-
-    vmov.w \twiddle, \xi56
-    // this block is one doublebutterfly
-    smulbb \tmp, \c2, \twiddle // c2, c6
-    smultb \c2, \c2, \twiddle
-    montgomery_inplace \q, \qinv, \tmp, \c6
-    montgomery_inplace \q, \qinv, \c2, \c6
-    pkhtb \tmp, \c2, \tmp, asr #16
-    ssub16.w \c6, \tmp2, \tmp
-    sadd16.w \c2, \tmp2, \tmp
-
-    doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
-
-.endm
-
-.macro _3_layer_double_inv_CT_16_light_reduce c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi12, xi34, xi56, twiddle, q, qinv, tmp, tmp2
-
-    // layer 1
-    sadd16.w \tmp, \c0, \c1  // c0, c1
-    ssub16.w \c1, \c0, \c1
-    sadd16.w \tmp2, \c2, \c3 // c2, c3
-    ssub16.w \c3, \c2, \c3
-
-    sadd16.w \c0, \c4, \c5 // c4, c5
-    ssub16.w \c5, \c4, \c5
-    sadd16.w \c2, \c6, \c7 // c6, c7
-    ssub16.w \c7, \c6, \c7
-    // c4, c6 are free at this point
-
-    mov.w \c6, \tmp
-    mov.w \c4, \c0
-
-    // layer 2
-    vmov.w \twiddle, \xi12
-    doublebutterfly b, \c6, \tmp2, \twiddle, \tmp, \c0, \q, \qinv
-    doublebutterfly b, \c4, \c2, \twiddle, \tmp, \c0, \q, \qinv
-    doublebutterfly t, \c1, \c3, \twiddle, \tmp, \c0, \q, \qinv // c0 has been used and c6 still free
-    doublebutterfly t, \c5, \c7, \twiddle, \tmp, \c0, \q, \qinv
-    // c0, c6 are free at this point
-
-    // layer 3
-    sadd16.w \c0, \c6, \c4 // c0, c4
-    ssub16.w \c4, \c6, \c4
-
-    vmov.w \twiddle, \xi34
-    doublebutterfly t, \c1, \c5, \twiddle, \tmp, \c6, \q, \qinv
-
-    vmov.w \twiddle, \xi56
-    // this block is one doublebutterfly
-    smulbb \tmp, \c2, \twiddle // c2, c6
-    smultb \c2, \c2, \twiddle
-    montgomery_inplace \q, \qinv, \tmp, \c6
-    montgomery_inplace \q, \qinv, \c2, \c6
-    pkhtb \tmp, \c2, \tmp, asr #16
-    ssub16.w \c6, \tmp2, \tmp
-    sadd16.w \c2, \tmp2, \tmp
-
-    doublebutterfly t, \c3, \c7, \twiddle, \tmp, \tmp2, \q, \qinv
-
-.endm
-
-.macro _3_layer_double_inv_CT_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qprime, Q, tmp, tmp2
-    // layer 3
-    ldrh.w twiddle, [twiddle_ptr], #2
-    two_doublebutterfly b, b, \c0, \c1, \c2, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-    two_doublebutterfly b, b, \c4, \c5, \c6, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 2
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c2, \c1, \c3, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    two_doublebutterfly b, t, \c4, \c6, \c5, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    // layer 1
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c0, \c4, \c1, \c5, \twiddle, \tmp, \tmp2, \Q, \Qprime
-
-    ldr.w twiddle, [twiddle_ptr], #4
-    two_doublebutterfly b, t, \c2, \c6, \c3, \c7, \twiddle, \tmp, \tmp2, \Q, \Qprime
-.endm
-
-.macro mul_twiddle_barrett_32 tb a, twiddle, Qbar, Q, tmp, tmp2
-    smulb\tb \tmp, \a, \twiddle
-    smmulr.w \tmp2, \tmp, \Qbar
-    mls.w \tmp, \tmp2, \Q, \tmp
-    smult\tb \a, \a, \twiddle
-    smmulr.w \tmp2, \a, \Qbar
-    mls.w \a, \tmp2, \Q, \a
-    pkhbt \a, \tmp, \a, lsl #16
-.endm
-
-.macro _3_layer_double_inv_twist_16 c0, c1, c2, c3, c4, c5, c6, c7, twiddle, twiddle_ptr, Qbar, Q, tmp, tmp2
-
-    movt \Q, #0
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c0, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c1, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c2, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c3, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c4, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c5, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    ldr.w \twiddle, [\twiddle_ptr], #4
-
-    mul_twiddle_barrett_32 b, \c6, \twiddle, \Qbar, \Q, \tmp, \tmp2
-    mul_twiddle_barrett_32 t, \c7, \twiddle, \Qbar, \Q, \tmp, \tmp2
-
-    movt \Q, #767
-
-.endm
-
-.global small_invntt_tomont_asm
-.type small_invntt_tomont_asm, %function
-.align 2
-small_invntt_tomont_asm:
-    push {r4-r11, r14}
-
-    poly .req r0
-    twiddle_ptr .req r1
-    poly0 .req r2
-    poly1 .req r3
-    poly2 .req r4
-    poly3 .req r5
-    poly4 .req r6
-    poly5 .req r7
-    poly6 .req r8
-    poly7 .req r9
-    twiddle .req r10
-    qinv .req r11
-    q .req r11
-    tmp .req r12
-    tmp2 .req r14
-
-    movw q, #769
-    movt qinv, #767
-
-    ### LAYER 7+6+5+4
-    .equ distance, 16
-    .equ offset, 32
-    .equ strincr, 64
-
-    // pre-load twiddle factors to FPU registers
-    vldm twiddle_ptr!, {s8-s15}
-
-    add.w tmp, poly, #8*strincr
-    vmov s8, tmp
-    1:
-    // load a1, a3, ..., a15
-    load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
-    load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
-
-    // NTT on a1, a3, ..., a15
-    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2
-
-    // multiply coeffs by layer 4 twiddles for later use
-    vmov twiddle, s12
-    mul_twiddle b, poly0, twiddle, tmp, tmp2, q, qinv // could be omitted but kept for reduction only
-    mul_twiddle t, poly1, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s13
-    mul_twiddle b, poly2, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly3, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s14
-    mul_twiddle b, poly4, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly5, twiddle, tmp, tmp2, q, qinv
-
-    vmov twiddle, s15
-    mul_twiddle b, poly6, twiddle, tmp, tmp2, q, qinv
-    mul_twiddle t, poly7, twiddle, tmp, tmp2, q, qinv
-
-    vmov s0, poly0 // a1
-    vmov s1, poly1 // a3
-    vmov s2, poly2 // a5
-    vmov s3, poly3 // a7
-    vmov s4, poly4 // a9
-    vmov s5, poly5 // a11
-    vmov s6, poly6 // a13
-    vmov s7, poly7 // a15
-
-    // ----------
-
-    // load a0, a2, ..., a14
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    // NTT on a0, a2, ..., a14
-    _3_layer_double_inv_CT_16_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, twiddle, q, qinv, tmp, tmp2
-
-    // layer 4 - 1
-    // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
-    vmov tmp2, s1  // load a3
-    vmov s1, poly0 // preserve a0
-    uadd16 poly0, poly1, tmp2
-    usub16 poly1, poly1, tmp2
-
-    vmov tmp2, s3  // load a7
-    vmov s3, poly2 // preserve a4
-    uadd16 poly2, poly3, tmp2
-    usub16 poly3, poly3, tmp2
-
-    vmov tmp2, s5  // load a11
-    vmov s5, poly4 // preserve a8
-    uadd16 poly4, poly5, tmp2
-    usub16 poly5, poly5, tmp2
-
-    vmov tmp2, s7  // load a15
-    vmov s7, poly6 // preserve a12
-    uadd16 poly6, poly7, tmp2
-    usub16 poly7, poly7, tmp2
-
-    str.w poly0, [poly, #1*distance/4]
-    str.w poly1, [poly, #1*distance/4+offset]
-    str.w poly2, [poly, #3*distance/4]
-    str.w poly3, [poly, #3*distance/4+offset]
-    str.w poly4, [poly, #5*distance/4]
-    str.w poly5, [poly, #5*distance/4+offset]
-    str.w poly6, [poly, #7*distance/4]
-    str.w poly7, [poly, #7*distance/4+offset]
-
-    // layer 4 - 2
-    // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
-    vmov tmp2, s1  // load a0
-    vmov poly1, s0 // load a1
-    uadd16 poly0, tmp2, poly1
-    usub16 poly1, tmp2, poly1
-
-    vmov tmp2, s3  // load a4
-    vmov poly3, s2 // load a5
-    uadd16 poly2, tmp2, poly3
-    usub16 poly3, tmp2, poly3
-
-    vmov tmp2, s5  // load a8
-    vmov poly5, s4 // load a9
-    uadd16 poly4, tmp2, poly5
-    usub16 poly5, tmp2, poly5
-
-    vmov tmp2, s7  // load a12
-    vmov poly7, s6 // load a13
-    uadd16 poly6, tmp2, poly7
-    usub16 poly7, tmp2, poly7
-
-    str.w poly1, [poly, #offset]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #2*distance/4+offset]
-    str.w poly4, [poly, #4*distance/4]
-    str.w poly5, [poly, #4*distance/4+offset]
-    str.w poly6, [poly, #6*distance/4]
-    str.w poly7, [poly, #6*distance/4+offset]
-    str.w poly0, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
-
-    vmov tmp, s8
-    cmp.w poly, tmp
-    bne.w 1b
-
-    sub.w poly, #8*strincr
-
-    ### LAYER 3+2+1
-    .equ distance, distance*16
-    .equ strincr, 4
-
-    // ITER 0
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    vldm twiddle_ptr!, {s5-s7}
-
-    _3_layer_double_inv_CT_16_light_reduce poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s5, s5, s6, s7, twiddle, q, qinv, tmp, tmp2
-
-    vmov.w s2, poly
-    movw poly, #:lower16:5585133
-    movt poly, #:upper16:5585133
-
-    // twisting
-    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-    vmov.w poly, s2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #4
-
-    // ITER 1-12
-    add.w tmp, poly, #strincr*3*(3+1)
-    vmov s14, tmp
-    3:
-    add.w tmp, poly, #strincr*3
-    vmov s13, tmp
-    2:
-    // polys up to 6q
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-
-    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    vmov.w s2, poly
-    movw poly, #:lower16:5585133
-    movt poly, #:upper16:5585133
-
-    // twisting
-    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-    vmov.w poly, s2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #4
-
-    vmov tmp, s13
-    cmp.w poly, tmp
-    bne.w 2b
-
-    // polys up to 9q
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    vmov.w s2, poly
-    movw poly, #:lower16:5585133
-    movt poly, #:upper16:5585133
-
-    // twisting
-    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-    vmov.w poly, s2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #4
-
-    vmov tmp, s14
-    cmp.w poly, tmp
-    bne.w 3b
-
-    // ITER 13-15
-    add tmp, poly, #3*strincr
-    vmov s13, tmp
-    2:
-    // polys up to 6q
-    load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
-    load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-
-    _3_layer_double_inv_CT_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, qinv, q, tmp, tmp2
-
-    vmov.w s2, poly
-    movw poly, #:lower16:5585133
-    movt poly, #:upper16:5585133
-
-    // twisting
-    _3_layer_double_inv_twist_16 poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle, twiddle_ptr, poly, q, tmp, tmp2
-
-    vmov.w poly, s2
-
-    store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
-    str.w poly1, [poly, #distance/4]
-    str.w poly2, [poly, #2*distance/4]
-    str.w poly3, [poly, #3*distance/4]
-    str.w poly0, [poly], #strincr
-
-    vmov tmp, s13
-    cmp.w poly, tmp
-    bne.w 2b
-
-    pop {r4-r11, pc}
-
-.unreq poly
-.unreq twiddle_ptr
-.unreq poly0
-.unreq poly1
-.unreq poly2
-.unreq poly3
-.unreq poly4
-.unreq poly5
-.unreq poly6
-.unreq poly7
-.unreq twiddle
-.unreq qinv
-.unreq q
-.unreq tmp
-.unreq tmp2
-
-.align 2
-.global small_pointmul_asm
-.type small_pointmul_asm, %function
-small_pointmul_asm:
-    push.w {r4-r11, lr}
-
-    movw r14, #769
-    movt r14, #767
-
-    .equ width, 4
-
-    add.w r12, r2, #64*2
-    _point_mul_16_loop:
-
-    ldr.w r7, [r1, #2*width]
-    ldr.w r8, [r1, #3*width]
-    ldrsh.w r9, [r2, #1*2]
-    ldr.w r5, [r1, #1*width]
-    ldr.w r4, [r1], #4*width
-    ldrsh.w r6, [r2], #2*2
-
-    smultb r10, r4, r6
-    montgomery r14, r14, r10, r11
-    pkhbt r4, r4, r11
-
-
-    neg.w r6, r6
-
-    smultb r10, r5, r6
-    montgomery r14, r14, r10, r11
-    pkhbt r5, r5, r11
-
-    str.w r5, [r0, #1*width]
-    str.w r4, [r0], #2*width
-
-    smultb r10, r7, r9
-    montgomery r14, r14, r10, r11
-    pkhbt r7, r7, r11
-
-    neg.w r9, r9
-
-    smultb r10, r8, r9
-    montgomery r14, r14, r10, r11
-    pkhbt r8, r8, r11
-
-    str.w r8, [r0, #1*width]
-    str.w r7, [r0], #2*width
-
-    cmp.w r2, r12
-    bne.w _point_mul_16_loop
-
-    pop.w {r4-r11, pc}
-
-.align 2
-.global small_asymmetric_mul_asm
-.type small_asymmetric_mul_asm, %function
-small_asymmetric_mul_asm:
-    push.w {r4-r11, lr}
-
-    movw r14, #769
-    movt r14, #767
-    .equ width, 4
-    add.w r12, r0, #256*2
-    _asymmetric_mul_16_loop:
-    ldr.w r7, [r1, #width]
-    ldr.w r4, [r1], #2*width
-    ldr.w r8, [r2, #width]
-    ldr.w r5, [r2], #2*width
-    ldr.w r9, [r3, #width]
-    ldr.w r6, [r3], #2*width
-
-    smuad r10, r4, r6
-    montgomery r14, r14, r10, r6
-    smuadx r11, r4, r5
-    montgomery r14, r14, r11, r10
-
-    pkhtb r10, r10, r6, asr#16
-
-    str.w r10, [r0], #width
-
-    smuad r10, r7, r9
-    montgomery r14, r14, r10, r6
-    smuadx r11, r7, r8
-    montgomery r14, r14, r11, r10
-
-    pkhtb r10, r10, r6, asr#16
-    str.w r10, [r0], #width
-
-
-    cmp.w r0, r12
-    bne.w _asymmetric_mul_16_loop
-
-    pop.w {r4-r11, pc}
\ No newline at end of file
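Reviewer note on the deleted arithmetic, for anyone auditing this removal: the file keeps two 16-bit coefficients packed per 32-bit register and reduces 16x16-bit products with a signed Montgomery reduction modulo q = 769, with r11 holding q in its bottom halfword and 767 = -q^-1 mod 2^16 in its top halfword (769 * 767 = 9 * 2^16 - 1). The following plain-C model of the two-instruction `montgomery` macro (`smulbt` + `smlabb`) is a reading aid only, not code from this repository; the names Q, QINV, and montgomery_reduce are mine.

    #include <assert.h>
    #include <stdint.h>

    #define Q 769
    #define QINV 767 /* Q * QINV == -1 (mod 2^16) */

    /* Model of `montgomery q, qinv, a, tmp`:
     *   smulbt tmp, a, qinv   -> t = (bottom half of a) * QINV
     *   smlabb tmp, q, tmp, a -> r = Q * (bottom half of t) + a
     * The bottom 16 bits of r cancel, so the top half of r holds
     * a * 2^-16 mod Q. */
    static int16_t montgomery_reduce(int32_t a) {
        int16_t t = (int16_t)((int16_t)a * QINV); /* a * QINV mod 2^16 */
        int32_t r = (int32_t)Q * t + a;           /* == 0 mod 2^16 */
        return (int16_t)(r >> 16);                /* a * 2^-16 mod Q */
    }

    int main(void) {
        /* 2^16 == 171 (mod 769) and 9 * 171 == 2 * 769 + 1,
         * so reducing a = 1 must give 2^-16 mod Q = 9. */
        assert(montgomery_reduce(1) == 9);
        return 0;
    }

Each `doublebutterfly` applies this reduction to both packed halves of a coefficient pair. Packing q and qinv into one register is what lets them share r11: `smulbt` reads the top halfword (QINV) and `smlabb` the bottom one (Q), freeing a register for coefficients.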