From 44365aaec2f0e35793ebc137cc06891be09bd2e4 Mon Sep 17 00:00:00 2001 From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com> Date: Thu, 17 Oct 2024 19:21:54 +0800 Subject: [PATCH] init pqm7 Signed-off-by: Thing-han, Lim <15379156+potsrevennil@users.noreply.github.com> --- .github/dependabot.yml | 6 + .github/pull_request_template.md | 7 + .github/workflows/nucleo-f767zi.yml | 21 + .gitignore | 16 + .gitmodules | 9 + Makefile | 23 + README.md | 411 + benchmarks.py | 33 + build_everything.py | 16 + common/aes-encrypt.S | 613 ++ common/aes-keyschedule.S | 851 +++ common/aes-publicinputs.S | 1327 ++++ common/aes-publicinputs.c | 259 + common/aes-publicinputs.h | 62 + common/aes.c | 232 + common/aes.h | 49 + common/aestest.c | 185 + common/crypto_hashblocks_sha512.c | 101 + common/crypto_hashblocks_sha512_inner32.s | 6593 +++++++++++++++++ common/hal-mps2.c | 279 + common/hal-opencm3.c | 245 + common/keccakf1600.S | 1134 +++ common/keccaktest.c | 85 + common/mps2/CMSDK_CM4.h | 1289 ++++ common/mps2/LICENSE.txt | 201 + common/mps2/MPS2.ld | 208 + common/mps2/cmsis_armclang.h | 1467 ++++ common/mps2/cmsis_compiler.h | 283 + common/mps2/cmsis_gcc.h | 2177 ++++++ common/mps2/cmsis_nvic.h | 47 + common/mps2/cmsis_version.h | 39 + common/mps2/core_cm4.h | 2129 ++++++ common/mps2/memory_zones.h | 49 + common/mps2/mpu_armv7.h | 275 + common/mps2/startup_MPS2.S | 206 + common/randombytes.c | 121 + common/test.c | 161 + common/testfast.c | 1 + convert_benchmarks.py | 19 + crypto_kem/ml-kem-1024/m4fspeed/api.h | 20 + crypto_kem/ml-kem-1024/m4fspeed/cbd.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/cbd.h | 1 + crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S | 1 + crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S | 1 + crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S | 1 + crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S | 1 + crypto_kem/ml-kem-1024/m4fspeed/fastntt.S | 1 + crypto_kem/ml-kem-1024/m4fspeed/indcpa.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/indcpa.h | 1 + 
crypto_kem/ml-kem-1024/m4fspeed/kem.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/macros.i | 1 + crypto_kem/ml-kem-1024/m4fspeed/matacc.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/matacc.h | 1 + crypto_kem/ml-kem-1024/m4fspeed/matacc.i | 1 + crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S | 1 + crypto_kem/ml-kem-1024/m4fspeed/ntt.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/ntt.h | 1 + crypto_kem/ml-kem-1024/m4fspeed/params.h | 31 + crypto_kem/ml-kem-1024/m4fspeed/poly.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/poly.h | 1 + crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S | 1 + crypto_kem/ml-kem-1024/m4fspeed/polyvec.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/polyvec.h | 1 + crypto_kem/ml-kem-1024/m4fspeed/reduce.S | 1 + .../ml-kem-1024/m4fspeed/symmetric-fips202.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/symmetric.h | 1 + crypto_kem/ml-kem-1024/m4fspeed/verify.c | 1 + crypto_kem/ml-kem-1024/m4fspeed/verify.h | 1 + crypto_kem/ml-kem-1024/m4fstack/api.h | 1 + crypto_kem/ml-kem-1024/m4fstack/cbd.c | 1 + crypto_kem/ml-kem-1024/m4fstack/cbd.h | 1 + crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S | 1 + crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S | 1 + crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S | 1 + crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S | 1 + crypto_kem/ml-kem-1024/m4fstack/fastntt.S | 1 + crypto_kem/ml-kem-1024/m4fstack/indcpa.c | 1 + crypto_kem/ml-kem-1024/m4fstack/indcpa.h | 1 + crypto_kem/ml-kem-1024/m4fstack/kem.c | 1 + crypto_kem/ml-kem-1024/m4fstack/macros.i | 1 + crypto_kem/ml-kem-1024/m4fstack/matacc.c | 1 + crypto_kem/ml-kem-1024/m4fstack/matacc.h | 1 + crypto_kem/ml-kem-1024/m4fstack/matacc.i | 1 + crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S | 1 + crypto_kem/ml-kem-1024/m4fstack/ntt.c | 1 + crypto_kem/ml-kem-1024/m4fstack/ntt.h | 1 + crypto_kem/ml-kem-1024/m4fstack/params.h | 1 + crypto_kem/ml-kem-1024/m4fstack/poly.c | 1 + crypto_kem/ml-kem-1024/m4fstack/poly.h | 1 + crypto_kem/ml-kem-1024/m4fstack/poly_asm.S | 1 + crypto_kem/ml-kem-1024/m4fstack/polyvec.c | 1 + 
crypto_kem/ml-kem-1024/m4fstack/polyvec.h | 1 + crypto_kem/ml-kem-1024/m4fstack/reduce.S | 1 + .../ml-kem-1024/m4fstack/symmetric-fips202.c | 1 + crypto_kem/ml-kem-1024/m4fstack/symmetric.h | 1 + crypto_kem/ml-kem-1024/m4fstack/verify.c | 1 + crypto_kem/ml-kem-1024/m4fstack/verify.h | 1 + crypto_kem/ml-kem-512/m4fspeed/api.h | 20 + crypto_kem/ml-kem-512/m4fspeed/cbd.c | 112 + crypto_kem/ml-kem-512/m4fspeed/cbd.h | 9 + crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S | 1 + crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S | 1 + crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S | 1 + crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S | 1 + crypto_kem/ml-kem-512/m4fspeed/fastntt.S | 1 + crypto_kem/ml-kem-512/m4fspeed/indcpa.c | 246 + crypto_kem/ml-kem-512/m4fspeed/indcpa.h | 1 + crypto_kem/ml-kem-512/m4fspeed/kem.c | 1 + crypto_kem/ml-kem-512/m4fspeed/macros.i | 1 + crypto_kem/ml-kem-512/m4fspeed/matacc.c | 1 + crypto_kem/ml-kem-512/m4fspeed/matacc.h | 1 + crypto_kem/ml-kem-512/m4fspeed/matacc.i | 1 + crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S | 1 + crypto_kem/ml-kem-512/m4fspeed/ntt.c | 1 + crypto_kem/ml-kem-512/m4fspeed/ntt.h | 1 + crypto_kem/ml-kem-512/m4fspeed/params.h | 32 + crypto_kem/ml-kem-512/m4fspeed/poly.c | 672 ++ crypto_kem/ml-kem-512/m4fspeed/poly.h | 56 + crypto_kem/ml-kem-512/m4fspeed/poly_asm.S | 1 + crypto_kem/ml-kem-512/m4fspeed/polyvec.c | 1 + crypto_kem/ml-kem-512/m4fspeed/polyvec.h | 1 + crypto_kem/ml-kem-512/m4fspeed/reduce.S | 1 + .../ml-kem-512/m4fspeed/symmetric-fips202. 
| 1 + .../ml-kem-512/m4fspeed/symmetric-fips202.c | 1 + crypto_kem/ml-kem-512/m4fspeed/symmetric.h | 1 + crypto_kem/ml-kem-512/m4fspeed/verify.c | 1 + crypto_kem/ml-kem-512/m4fspeed/verify.h | 1 + crypto_kem/ml-kem-512/m4fstack/api.h | 1 + crypto_kem/ml-kem-512/m4fstack/cbd.c | 1 + crypto_kem/ml-kem-512/m4fstack/cbd.h | 1 + crypto_kem/ml-kem-512/m4fstack/cmov_int16.S | 1 + crypto_kem/ml-kem-512/m4fstack/fastaddsub.S | 1 + crypto_kem/ml-kem-512/m4fstack/fastbasemul.S | 1 + crypto_kem/ml-kem-512/m4fstack/fastinvntt.S | 1 + crypto_kem/ml-kem-512/m4fstack/fastntt.S | 1 + crypto_kem/ml-kem-512/m4fstack/indcpa.c | 211 + crypto_kem/ml-kem-512/m4fstack/indcpa.h | 1 + crypto_kem/ml-kem-512/m4fstack/kem.c | 1 + crypto_kem/ml-kem-512/m4fstack/macros.i | 1 + crypto_kem/ml-kem-512/m4fstack/matacc.c | 1 + crypto_kem/ml-kem-512/m4fstack/matacc.h | 1 + crypto_kem/ml-kem-512/m4fstack/matacc.i | 1 + crypto_kem/ml-kem-512/m4fstack/matacc_asm.S | 1 + crypto_kem/ml-kem-512/m4fstack/ntt.c | 1 + crypto_kem/ml-kem-512/m4fstack/ntt.h | 1 + crypto_kem/ml-kem-512/m4fstack/params.h | 1 + crypto_kem/ml-kem-512/m4fstack/poly.c | 637 ++ crypto_kem/ml-kem-512/m4fstack/poly.h | 54 + crypto_kem/ml-kem-512/m4fstack/poly_asm.S | 1 + crypto_kem/ml-kem-512/m4fstack/polyvec.c | 1 + crypto_kem/ml-kem-512/m4fstack/polyvec.h | 1 + crypto_kem/ml-kem-512/m4fstack/reduce.S | 1 + .../ml-kem-512/m4fstack/symmetric-fips202.c | 1 + crypto_kem/ml-kem-512/m4fstack/symmetric.h | 1 + crypto_kem/ml-kem-512/m4fstack/verify.c | 1 + crypto_kem/ml-kem-512/m4fstack/verify.h | 1 + crypto_kem/ml-kem-768/m4fspeed/api.h | 20 + crypto_kem/ml-kem-768/m4fspeed/cbd.c | 55 + crypto_kem/ml-kem-768/m4fspeed/cbd.h | 8 + crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S | 15 + crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S | 60 + crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S | 193 + crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S | 356 + crypto_kem/ml-kem-768/m4fspeed/fastntt.S | 265 + crypto_kem/ml-kem-768/m4fspeed/indcpa.c | 244 + 
crypto_kem/ml-kem-768/m4fspeed/indcpa.h | 22 + crypto_kem/ml-kem-768/m4fspeed/kem.c | 159 + crypto_kem/ml-kem-768/m4fspeed/macros.i | 60 + crypto_kem/ml-kem-768/m4fspeed/matacc.c | 121 + crypto_kem/ml-kem-768/m4fspeed/matacc.h | 63 + crypto_kem/ml-kem-768/m4fspeed/matacc.i | 301 + crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S | 377 + crypto_kem/ml-kem-768/m4fspeed/ntt.c | 106 + crypto_kem/ml-kem-768/m4fspeed/ntt.h | 11 + crypto_kem/ml-kem-768/m4fspeed/params.h | 31 + crypto_kem/ml-kem-768/m4fspeed/poly.c | 654 ++ crypto_kem/ml-kem-768/m4fspeed/poly.h | 53 + crypto_kem/ml-kem-768/m4fspeed/poly_asm.S | 246 + crypto_kem/ml-kem-768/m4fspeed/polyvec.c | 212 + crypto_kem/ml-kem-768/m4fspeed/polyvec.h | 24 + crypto_kem/ml-kem-768/m4fspeed/reduce.S | 140 + .../ml-kem-768/m4fspeed/symmetric-fips202.c | 71 + crypto_kem/ml-kem-768/m4fspeed/symmetric.h | 29 + crypto_kem/ml-kem-768/m4fspeed/verify.c | 51 + crypto_kem/ml-kem-768/m4fspeed/verify.h | 10 + crypto_kem/ml-kem-768/m4fstack/api.h | 1 + crypto_kem/ml-kem-768/m4fstack/cbd.c | 1 + crypto_kem/ml-kem-768/m4fstack/cbd.h | 1 + crypto_kem/ml-kem-768/m4fstack/cmov_int16.S | 1 + crypto_kem/ml-kem-768/m4fstack/fastaddsub.S | 1 + crypto_kem/ml-kem-768/m4fstack/fastbasemul.S | 207 + crypto_kem/ml-kem-768/m4fstack/fastinvntt.S | 360 + crypto_kem/ml-kem-768/m4fstack/fastntt.S | 1 + crypto_kem/ml-kem-768/m4fstack/indcpa.c | 211 + crypto_kem/ml-kem-768/m4fstack/indcpa.h | 1 + crypto_kem/ml-kem-768/m4fstack/kem.c | 1 + crypto_kem/ml-kem-768/m4fstack/macros.i | 1 + crypto_kem/ml-kem-768/m4fstack/matacc.c | 43 + crypto_kem/ml-kem-768/m4fstack/matacc.h | 26 + crypto_kem/ml-kem-768/m4fstack/matacc.i | 197 + crypto_kem/ml-kem-768/m4fstack/matacc_asm.S | 118 + crypto_kem/ml-kem-768/m4fstack/ntt.c | 1 + crypto_kem/ml-kem-768/m4fstack/ntt.h | 1 + crypto_kem/ml-kem-768/m4fstack/params.h | 1 + crypto_kem/ml-kem-768/m4fstack/poly.c | 618 ++ crypto_kem/ml-kem-768/m4fstack/poly.h | 51 + crypto_kem/ml-kem-768/m4fstack/poly_asm.S | 198 + 
crypto_kem/ml-kem-768/m4fstack/polyvec.c | 1 + crypto_kem/ml-kem-768/m4fstack/polyvec.h | 1 + crypto_kem/ml-kem-768/m4fstack/reduce.S | 1 + .../ml-kem-768/m4fstack/symmetric-fips202.c | 1 + crypto_kem/ml-kem-768/m4fstack/symmetric.h | 1 + crypto_kem/ml-kem-768/m4fstack/verify.c | 1 + crypto_kem/ml-kem-768/m4fstack/verify.h | 1 + crypto_sign/dilithium2/m4f/api.h | 26 + crypto_sign/dilithium2/m4f/basemul_257.S | 91 + crypto_sign/dilithium2/m4f/config.h | 7 + crypto_sign/dilithium2/m4f/fnt_257.S | 145 + crypto_sign/dilithium2/m4f/ifnt_257.S | 306 + crypto_sign/dilithium2/m4f/macros.i | 191 + crypto_sign/dilithium2/m4f/macros_fnt.i | 158 + crypto_sign/dilithium2/m4f/ntt.S | 402 + crypto_sign/dilithium2/m4f/ntt.h | 13 + crypto_sign/dilithium2/m4f/packing.c | 390 + crypto_sign/dilithium2/m4f/packing.h | 68 + crypto_sign/dilithium2/m4f/params.h | 83 + crypto_sign/dilithium2/m4f/pointwise_mont.h | 13 + crypto_sign/dilithium2/m4f/pointwise_mont.s | 128 + crypto_sign/dilithium2/m4f/poly.c | 863 +++ crypto_sign/dilithium2/m4f/poly.h | 84 + crypto_sign/dilithium2/m4f/polyvec.c | 429 ++ crypto_sign/dilithium2/m4f/polyvec.h | 99 + crypto_sign/dilithium2/m4f/reduce.h | 29 + crypto_sign/dilithium2/m4f/rounding.c | 102 + crypto_sign/dilithium2/m4f/rounding.h | 19 + crypto_sign/dilithium2/m4f/sign.c | 391 + crypto_sign/dilithium2/m4f/sign.h | 37 + crypto_sign/dilithium2/m4f/smallntt.h | 31 + crypto_sign/dilithium2/m4f/smallpoly.c | 84 + crypto_sign/dilithium2/m4f/smallpoly.h | 39 + crypto_sign/dilithium2/m4f/symmetric-shake.c | 28 + crypto_sign/dilithium2/m4f/symmetric.h | 65 + crypto_sign/dilithium2/m4f/vector.h | 22 + crypto_sign/dilithium2/m4f/vector.s | 263 + crypto_sign/dilithium2/m4fstack/api.h | 1 + crypto_sign/dilithium2/m4fstack/config.h | 1 + crypto_sign/dilithium2/m4fstack/macros.i | 1 + .../dilithium2/m4fstack/macros_smallntt.i | 91 + crypto_sign/dilithium2/m4fstack/ntt.S | 1 + crypto_sign/dilithium2/m4fstack/ntt.h | 1 + crypto_sign/dilithium2/m4fstack/packing.c | 1 + 
crypto_sign/dilithium2/m4fstack/packing.h | 1 + crypto_sign/dilithium2/m4fstack/params.h | 1 + .../dilithium2/m4fstack/pointwise_mont.h | 1 + .../dilithium2/m4fstack/pointwise_mont.s | 1 + crypto_sign/dilithium2/m4fstack/poly.c | 1 + crypto_sign/dilithium2/m4fstack/poly.h | 1 + crypto_sign/dilithium2/m4fstack/polyvec.c | 1 + crypto_sign/dilithium2/m4fstack/polyvec.h | 1 + crypto_sign/dilithium2/m4fstack/reduce.h | 79 + crypto_sign/dilithium2/m4fstack/rounding.c | 1 + crypto_sign/dilithium2/m4fstack/rounding.h | 1 + crypto_sign/dilithium2/m4fstack/sign.c | 484 ++ crypto_sign/dilithium2/m4fstack/sign.h | 1 + crypto_sign/dilithium2/m4fstack/smallntt.h | 47 + .../dilithium2/m4fstack/smallntt_769.S | 691 ++ crypto_sign/dilithium2/m4fstack/smallpoly.c | 83 + crypto_sign/dilithium2/m4fstack/smallpoly.h | 27 + crypto_sign/dilithium2/m4fstack/stack.c | 715 ++ crypto_sign/dilithium2/m4fstack/stack.h | 69 + .../dilithium2/m4fstack/symmetric-shake.c | 1 + crypto_sign/dilithium2/m4fstack/symmetric.h | 1 + crypto_sign/dilithium2/m4fstack/vector.h | 1 + crypto_sign/dilithium2/m4fstack/vector.s | 1 + crypto_sign/dilithium3/m4f/api.h | 1 + crypto_sign/dilithium3/m4f/config.h | 7 + crypto_sign/dilithium3/m4f/macros.i | 1 + crypto_sign/dilithium3/m4f/macros_smallntt.i | 98 + crypto_sign/dilithium3/m4f/ntt.S | 1 + crypto_sign/dilithium3/m4f/ntt.h | 1 + crypto_sign/dilithium3/m4f/packing.c | 1 + crypto_sign/dilithium3/m4f/packing.h | 1 + crypto_sign/dilithium3/m4f/params.h | 1 + crypto_sign/dilithium3/m4f/pointwise_mont.h | 1 + crypto_sign/dilithium3/m4f/pointwise_mont.s | 1 + crypto_sign/dilithium3/m4f/poly.c | 1 + crypto_sign/dilithium3/m4f/poly.h | 1 + crypto_sign/dilithium3/m4f/polyvec.c | 1 + crypto_sign/dilithium3/m4f/polyvec.h | 1 + crypto_sign/dilithium3/m4f/reduce.h | 1 + crypto_sign/dilithium3/m4f/rounding.c | 1 + crypto_sign/dilithium3/m4f/rounding.h | 1 + crypto_sign/dilithium3/m4f/sign.c | 1 + crypto_sign/dilithium3/m4f/sign.h | 1 + crypto_sign/dilithium3/m4f/smallntt.h | 
48 + crypto_sign/dilithium3/m4f/smallntt_769.S | 681 ++ crypto_sign/dilithium3/m4f/smallpoly.c | 1 + crypto_sign/dilithium3/m4f/smallpoly.h | 1 + crypto_sign/dilithium3/m4f/symmetric-shake.c | 1 + crypto_sign/dilithium3/m4f/symmetric.h | 1 + crypto_sign/dilithium3/m4f/vector.h | 1 + crypto_sign/dilithium3/m4f/vector.s | 1 + crypto_sign/dilithium3/m4fstack/api.h | 1 + crypto_sign/dilithium3/m4fstack/config.h | 1 + crypto_sign/dilithium3/m4fstack/macros.i | 1 + .../dilithium3/m4fstack/macros_smallntt.i | 1 + crypto_sign/dilithium3/m4fstack/ntt.S | 1 + crypto_sign/dilithium3/m4fstack/ntt.h | 1 + crypto_sign/dilithium3/m4fstack/packing.c | 1 + crypto_sign/dilithium3/m4fstack/packing.h | 1 + crypto_sign/dilithium3/m4fstack/params.h | 1 + .../dilithium3/m4fstack/pointwise_mont.h | 1 + .../dilithium3/m4fstack/pointwise_mont.s | 1 + crypto_sign/dilithium3/m4fstack/poly.c | 1 + crypto_sign/dilithium3/m4fstack/poly.h | 1 + crypto_sign/dilithium3/m4fstack/polyvec.c | 1 + crypto_sign/dilithium3/m4fstack/polyvec.h | 1 + crypto_sign/dilithium3/m4fstack/reduce.h | 1 + crypto_sign/dilithium3/m4fstack/rounding.c | 1 + crypto_sign/dilithium3/m4fstack/rounding.h | 1 + crypto_sign/dilithium3/m4fstack/sign.c | 1 + crypto_sign/dilithium3/m4fstack/sign.h | 1 + crypto_sign/dilithium3/m4fstack/smallntt.h | 1 + .../dilithium3/m4fstack/smallntt_769.S | 1 + crypto_sign/dilithium3/m4fstack/smallpoly.c | 1 + crypto_sign/dilithium3/m4fstack/smallpoly.h | 1 + crypto_sign/dilithium3/m4fstack/stack.c | 1 + crypto_sign/dilithium3/m4fstack/stack.h | 1 + .../dilithium3/m4fstack/symmetric-shake.c | 1 + crypto_sign/dilithium3/m4fstack/symmetric.h | 1 + crypto_sign/dilithium3/m4fstack/vector.h | 1 + crypto_sign/dilithium3/m4fstack/vector.s | 1 + crypto_sign/dilithium5/m4f/api.h | 1 + crypto_sign/dilithium5/m4f/basemul_257.S | 1 + crypto_sign/dilithium5/m4f/config.h | 7 + crypto_sign/dilithium5/m4f/fnt_257.S | 1 + crypto_sign/dilithium5/m4f/ifnt_257.S | 1 + crypto_sign/dilithium5/m4f/macros.i | 1 + 
crypto_sign/dilithium5/m4f/macros_fnt.i | 1 + crypto_sign/dilithium5/m4f/ntt.S | 1 + crypto_sign/dilithium5/m4f/ntt.h | 1 + crypto_sign/dilithium5/m4f/packing.c | 1 + crypto_sign/dilithium5/m4f/packing.h | 1 + crypto_sign/dilithium5/m4f/params.h | 1 + crypto_sign/dilithium5/m4f/pointwise_mont.h | 1 + crypto_sign/dilithium5/m4f/pointwise_mont.s | 1 + crypto_sign/dilithium5/m4f/poly.c | 1 + crypto_sign/dilithium5/m4f/poly.h | 1 + crypto_sign/dilithium5/m4f/polyvec.c | 1 + crypto_sign/dilithium5/m4f/polyvec.h | 1 + crypto_sign/dilithium5/m4f/reduce.h | 1 + crypto_sign/dilithium5/m4f/rounding.c | 1 + crypto_sign/dilithium5/m4f/rounding.h | 1 + crypto_sign/dilithium5/m4f/sign.c | 1 + crypto_sign/dilithium5/m4f/sign.h | 1 + crypto_sign/dilithium5/m4f/smallntt.h | 1 + crypto_sign/dilithium5/m4f/smallpoly.c | 1 + crypto_sign/dilithium5/m4f/smallpoly.h | 1 + crypto_sign/dilithium5/m4f/symmetric-shake.c | 1 + crypto_sign/dilithium5/m4f/symmetric.h | 1 + crypto_sign/dilithium5/m4f/vector.h | 1 + crypto_sign/dilithium5/m4f/vector.s | 1 + crypto_sign/dilithium5/m4fstack/api.h | 1 + crypto_sign/dilithium5/m4fstack/config.h | 1 + crypto_sign/dilithium5/m4fstack/macros.i | 1 + .../dilithium5/m4fstack/macros_smallntt.i | 1 + crypto_sign/dilithium5/m4fstack/ntt.S | 1 + crypto_sign/dilithium5/m4fstack/ntt.h | 1 + crypto_sign/dilithium5/m4fstack/packing.c | 1 + crypto_sign/dilithium5/m4fstack/packing.h | 1 + crypto_sign/dilithium5/m4fstack/params.h | 1 + .../dilithium5/m4fstack/pointwise_mont.h | 1 + .../dilithium5/m4fstack/pointwise_mont.s | 1 + crypto_sign/dilithium5/m4fstack/poly.c | 1 + crypto_sign/dilithium5/m4fstack/poly.h | 1 + crypto_sign/dilithium5/m4fstack/polyvec.c | 1 + crypto_sign/dilithium5/m4fstack/polyvec.h | 1 + crypto_sign/dilithium5/m4fstack/reduce.h | 1 + crypto_sign/dilithium5/m4fstack/rounding.c | 1 + crypto_sign/dilithium5/m4fstack/rounding.h | 1 + crypto_sign/dilithium5/m4fstack/sign.c | 1 + crypto_sign/dilithium5/m4fstack/sign.h | 1 + 
crypto_sign/dilithium5/m4fstack/smallntt.h | 1 + .../dilithium5/m4fstack/smallntt_769.S | 1 + crypto_sign/dilithium5/m4fstack/smallpoly.c | 1 + crypto_sign/dilithium5/m4fstack/smallpoly.h | 1 + crypto_sign/dilithium5/m4fstack/stack.c | 1 + crypto_sign/dilithium5/m4fstack/stack.h | 1 + .../dilithium5/m4fstack/symmetric-shake.c | 1 + crypto_sign/dilithium5/m4fstack/symmetric.h | 1 + crypto_sign/dilithium5/m4fstack/vector.h | 1 + crypto_sign/dilithium5/m4fstack/vector.s | 1 + hostside/host_unidirectional.py | 16 + interface.py | 110 + ldscripts/devices.data | 5 + libopencm3 | 1 + mk/config.mk | 3 + mk/crypto.mk | 29 + mk/nucleo-f767zi.mk | 31 + mk/opencm3.mk | 111 + mk/tests.mk | 29 + mupq | 1 + requirements.txt | 2 + skiplist.py | 250 + slothy | 1 + test.py | 14 + testvectors.py | 13 + 407 files changed, 37922 insertions(+) create mode 100644 .github/dependabot.yml create mode 100644 .github/pull_request_template.md create mode 100644 .github/workflows/nucleo-f767zi.yml create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 Makefile create mode 100644 README.md create mode 100755 benchmarks.py create mode 100755 build_everything.py create mode 100644 common/aes-encrypt.S create mode 100644 common/aes-keyschedule.S create mode 100644 common/aes-publicinputs.S create mode 100644 common/aes-publicinputs.c create mode 100644 common/aes-publicinputs.h create mode 100644 common/aes.c create mode 100644 common/aes.h create mode 100644 common/aestest.c create mode 100644 common/crypto_hashblocks_sha512.c create mode 100644 common/crypto_hashblocks_sha512_inner32.s create mode 100644 common/hal-mps2.c create mode 100644 common/hal-opencm3.c create mode 100644 common/keccakf1600.S create mode 100644 common/keccaktest.c create mode 100644 common/mps2/CMSDK_CM4.h create mode 100644 common/mps2/LICENSE.txt create mode 100644 common/mps2/MPS2.ld create mode 100644 common/mps2/cmsis_armclang.h create mode 100644 common/mps2/cmsis_compiler.h create mode 
100644 common/mps2/cmsis_gcc.h create mode 100644 common/mps2/cmsis_nvic.h create mode 100644 common/mps2/cmsis_version.h create mode 100644 common/mps2/core_cm4.h create mode 100644 common/mps2/memory_zones.h create mode 100644 common/mps2/mpu_armv7.h create mode 100644 common/mps2/startup_MPS2.S create mode 100644 common/randombytes.c create mode 100644 common/test.c create mode 120000 common/testfast.c create mode 100755 convert_benchmarks.py create mode 100644 crypto_kem/ml-kem-1024/m4fspeed/api.h create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/cbd.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/cbd.h create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/fastntt.S create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/indcpa.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/indcpa.h create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/kem.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/macros.i create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/matacc.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/matacc.h create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/matacc.i create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/ntt.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/ntt.h create mode 100644 crypto_kem/ml-kem-1024/m4fspeed/params.h create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/poly.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/poly.h create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/polyvec.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/polyvec.h create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/reduce.S create mode 120000 
crypto_kem/ml-kem-1024/m4fspeed/symmetric-fips202.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/symmetric.h create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/verify.c create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/verify.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/api.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/cbd.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/cbd.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S create mode 120000 crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S create mode 120000 crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S create mode 120000 crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S create mode 120000 crypto_kem/ml-kem-1024/m4fstack/fastntt.S create mode 120000 crypto_kem/ml-kem-1024/m4fstack/indcpa.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/indcpa.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/kem.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/macros.i create mode 120000 crypto_kem/ml-kem-1024/m4fstack/matacc.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/matacc.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/matacc.i create mode 120000 crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S create mode 120000 crypto_kem/ml-kem-1024/m4fstack/ntt.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/ntt.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/params.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/poly.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/poly.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/poly_asm.S create mode 120000 crypto_kem/ml-kem-1024/m4fstack/polyvec.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/polyvec.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/reduce.S create mode 120000 crypto_kem/ml-kem-1024/m4fstack/symmetric-fips202.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/symmetric.h create mode 120000 crypto_kem/ml-kem-1024/m4fstack/verify.c create mode 120000 crypto_kem/ml-kem-1024/m4fstack/verify.h create 
mode 100644 crypto_kem/ml-kem-512/m4fspeed/api.h create mode 100644 crypto_kem/ml-kem-512/m4fspeed/cbd.c create mode 100644 crypto_kem/ml-kem-512/m4fspeed/cbd.h create mode 120000 crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S create mode 120000 crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S create mode 120000 crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S create mode 120000 crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S create mode 120000 crypto_kem/ml-kem-512/m4fspeed/fastntt.S create mode 100644 crypto_kem/ml-kem-512/m4fspeed/indcpa.c create mode 120000 crypto_kem/ml-kem-512/m4fspeed/indcpa.h create mode 120000 crypto_kem/ml-kem-512/m4fspeed/kem.c create mode 120000 crypto_kem/ml-kem-512/m4fspeed/macros.i create mode 120000 crypto_kem/ml-kem-512/m4fspeed/matacc.c create mode 120000 crypto_kem/ml-kem-512/m4fspeed/matacc.h create mode 120000 crypto_kem/ml-kem-512/m4fspeed/matacc.i create mode 120000 crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S create mode 120000 crypto_kem/ml-kem-512/m4fspeed/ntt.c create mode 120000 crypto_kem/ml-kem-512/m4fspeed/ntt.h create mode 100644 crypto_kem/ml-kem-512/m4fspeed/params.h create mode 100644 crypto_kem/ml-kem-512/m4fspeed/poly.c create mode 100644 crypto_kem/ml-kem-512/m4fspeed/poly.h create mode 120000 crypto_kem/ml-kem-512/m4fspeed/poly_asm.S create mode 120000 crypto_kem/ml-kem-512/m4fspeed/polyvec.c create mode 120000 crypto_kem/ml-kem-512/m4fspeed/polyvec.h create mode 120000 crypto_kem/ml-kem-512/m4fspeed/reduce.S create mode 120000 crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202. 
create mode 120000 crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.c create mode 120000 crypto_kem/ml-kem-512/m4fspeed/symmetric.h create mode 120000 crypto_kem/ml-kem-512/m4fspeed/verify.c create mode 120000 crypto_kem/ml-kem-512/m4fspeed/verify.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/api.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/cbd.c create mode 120000 crypto_kem/ml-kem-512/m4fstack/cbd.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/cmov_int16.S create mode 120000 crypto_kem/ml-kem-512/m4fstack/fastaddsub.S create mode 120000 crypto_kem/ml-kem-512/m4fstack/fastbasemul.S create mode 120000 crypto_kem/ml-kem-512/m4fstack/fastinvntt.S create mode 120000 crypto_kem/ml-kem-512/m4fstack/fastntt.S create mode 100644 crypto_kem/ml-kem-512/m4fstack/indcpa.c create mode 120000 crypto_kem/ml-kem-512/m4fstack/indcpa.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/kem.c create mode 120000 crypto_kem/ml-kem-512/m4fstack/macros.i create mode 120000 crypto_kem/ml-kem-512/m4fstack/matacc.c create mode 120000 crypto_kem/ml-kem-512/m4fstack/matacc.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/matacc.i create mode 120000 crypto_kem/ml-kem-512/m4fstack/matacc_asm.S create mode 120000 crypto_kem/ml-kem-512/m4fstack/ntt.c create mode 120000 crypto_kem/ml-kem-512/m4fstack/ntt.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/params.h create mode 100644 crypto_kem/ml-kem-512/m4fstack/poly.c create mode 100644 crypto_kem/ml-kem-512/m4fstack/poly.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/poly_asm.S create mode 120000 crypto_kem/ml-kem-512/m4fstack/polyvec.c create mode 120000 crypto_kem/ml-kem-512/m4fstack/polyvec.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/reduce.S create mode 120000 crypto_kem/ml-kem-512/m4fstack/symmetric-fips202.c create mode 120000 crypto_kem/ml-kem-512/m4fstack/symmetric.h create mode 120000 crypto_kem/ml-kem-512/m4fstack/verify.c create mode 120000 crypto_kem/ml-kem-512/m4fstack/verify.h create mode 100644 
crypto_kem/ml-kem-768/m4fspeed/api.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/cbd.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/cbd.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S create mode 100644 crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S create mode 100644 crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S create mode 100644 crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S create mode 100644 crypto_kem/ml-kem-768/m4fspeed/fastntt.S create mode 100644 crypto_kem/ml-kem-768/m4fspeed/indcpa.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/indcpa.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/kem.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/macros.i create mode 100644 crypto_kem/ml-kem-768/m4fspeed/matacc.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/matacc.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/matacc.i create mode 100644 crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S create mode 100644 crypto_kem/ml-kem-768/m4fspeed/ntt.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/ntt.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/params.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/poly.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/poly.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/poly_asm.S create mode 100644 crypto_kem/ml-kem-768/m4fspeed/polyvec.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/polyvec.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/reduce.S create mode 100644 crypto_kem/ml-kem-768/m4fspeed/symmetric-fips202.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/symmetric.h create mode 100644 crypto_kem/ml-kem-768/m4fspeed/verify.c create mode 100644 crypto_kem/ml-kem-768/m4fspeed/verify.h create mode 120000 crypto_kem/ml-kem-768/m4fstack/api.h create mode 120000 crypto_kem/ml-kem-768/m4fstack/cbd.c create mode 120000 crypto_kem/ml-kem-768/m4fstack/cbd.h create mode 120000 crypto_kem/ml-kem-768/m4fstack/cmov_int16.S create mode 120000 
crypto_kem/ml-kem-768/m4fstack/fastaddsub.S create mode 100644 crypto_kem/ml-kem-768/m4fstack/fastbasemul.S create mode 100644 crypto_kem/ml-kem-768/m4fstack/fastinvntt.S create mode 120000 crypto_kem/ml-kem-768/m4fstack/fastntt.S create mode 100644 crypto_kem/ml-kem-768/m4fstack/indcpa.c create mode 120000 crypto_kem/ml-kem-768/m4fstack/indcpa.h create mode 120000 crypto_kem/ml-kem-768/m4fstack/kem.c create mode 120000 crypto_kem/ml-kem-768/m4fstack/macros.i create mode 100644 crypto_kem/ml-kem-768/m4fstack/matacc.c create mode 100644 crypto_kem/ml-kem-768/m4fstack/matacc.h create mode 100644 crypto_kem/ml-kem-768/m4fstack/matacc.i create mode 100644 crypto_kem/ml-kem-768/m4fstack/matacc_asm.S create mode 120000 crypto_kem/ml-kem-768/m4fstack/ntt.c create mode 120000 crypto_kem/ml-kem-768/m4fstack/ntt.h create mode 120000 crypto_kem/ml-kem-768/m4fstack/params.h create mode 100644 crypto_kem/ml-kem-768/m4fstack/poly.c create mode 100644 crypto_kem/ml-kem-768/m4fstack/poly.h create mode 100644 crypto_kem/ml-kem-768/m4fstack/poly_asm.S create mode 120000 crypto_kem/ml-kem-768/m4fstack/polyvec.c create mode 120000 crypto_kem/ml-kem-768/m4fstack/polyvec.h create mode 120000 crypto_kem/ml-kem-768/m4fstack/reduce.S create mode 120000 crypto_kem/ml-kem-768/m4fstack/symmetric-fips202.c create mode 120000 crypto_kem/ml-kem-768/m4fstack/symmetric.h create mode 120000 crypto_kem/ml-kem-768/m4fstack/verify.c create mode 120000 crypto_kem/ml-kem-768/m4fstack/verify.h create mode 100644 crypto_sign/dilithium2/m4f/api.h create mode 100644 crypto_sign/dilithium2/m4f/basemul_257.S create mode 100644 crypto_sign/dilithium2/m4f/config.h create mode 100644 crypto_sign/dilithium2/m4f/fnt_257.S create mode 100644 crypto_sign/dilithium2/m4f/ifnt_257.S create mode 100644 crypto_sign/dilithium2/m4f/macros.i create mode 100644 crypto_sign/dilithium2/m4f/macros_fnt.i create mode 100644 crypto_sign/dilithium2/m4f/ntt.S create mode 100644 crypto_sign/dilithium2/m4f/ntt.h create mode 100644 
crypto_sign/dilithium2/m4f/packing.c create mode 100644 crypto_sign/dilithium2/m4f/packing.h create mode 100644 crypto_sign/dilithium2/m4f/params.h create mode 100644 crypto_sign/dilithium2/m4f/pointwise_mont.h create mode 100644 crypto_sign/dilithium2/m4f/pointwise_mont.s create mode 100644 crypto_sign/dilithium2/m4f/poly.c create mode 100644 crypto_sign/dilithium2/m4f/poly.h create mode 100644 crypto_sign/dilithium2/m4f/polyvec.c create mode 100644 crypto_sign/dilithium2/m4f/polyvec.h create mode 100644 crypto_sign/dilithium2/m4f/reduce.h create mode 100644 crypto_sign/dilithium2/m4f/rounding.c create mode 100644 crypto_sign/dilithium2/m4f/rounding.h create mode 100644 crypto_sign/dilithium2/m4f/sign.c create mode 100644 crypto_sign/dilithium2/m4f/sign.h create mode 100644 crypto_sign/dilithium2/m4f/smallntt.h create mode 100644 crypto_sign/dilithium2/m4f/smallpoly.c create mode 100644 crypto_sign/dilithium2/m4f/smallpoly.h create mode 100644 crypto_sign/dilithium2/m4f/symmetric-shake.c create mode 100644 crypto_sign/dilithium2/m4f/symmetric.h create mode 100644 crypto_sign/dilithium2/m4f/vector.h create mode 100644 crypto_sign/dilithium2/m4f/vector.s create mode 120000 crypto_sign/dilithium2/m4fstack/api.h create mode 120000 crypto_sign/dilithium2/m4fstack/config.h create mode 120000 crypto_sign/dilithium2/m4fstack/macros.i create mode 100644 crypto_sign/dilithium2/m4fstack/macros_smallntt.i create mode 120000 crypto_sign/dilithium2/m4fstack/ntt.S create mode 120000 crypto_sign/dilithium2/m4fstack/ntt.h create mode 120000 crypto_sign/dilithium2/m4fstack/packing.c create mode 120000 crypto_sign/dilithium2/m4fstack/packing.h create mode 120000 crypto_sign/dilithium2/m4fstack/params.h create mode 120000 crypto_sign/dilithium2/m4fstack/pointwise_mont.h create mode 120000 crypto_sign/dilithium2/m4fstack/pointwise_mont.s create mode 120000 crypto_sign/dilithium2/m4fstack/poly.c create mode 120000 crypto_sign/dilithium2/m4fstack/poly.h create mode 120000 
crypto_sign/dilithium2/m4fstack/polyvec.c create mode 120000 crypto_sign/dilithium2/m4fstack/polyvec.h create mode 100644 crypto_sign/dilithium2/m4fstack/reduce.h create mode 120000 crypto_sign/dilithium2/m4fstack/rounding.c create mode 120000 crypto_sign/dilithium2/m4fstack/rounding.h create mode 100644 crypto_sign/dilithium2/m4fstack/sign.c create mode 120000 crypto_sign/dilithium2/m4fstack/sign.h create mode 100644 crypto_sign/dilithium2/m4fstack/smallntt.h create mode 100644 crypto_sign/dilithium2/m4fstack/smallntt_769.S create mode 100644 crypto_sign/dilithium2/m4fstack/smallpoly.c create mode 100644 crypto_sign/dilithium2/m4fstack/smallpoly.h create mode 100644 crypto_sign/dilithium2/m4fstack/stack.c create mode 100644 crypto_sign/dilithium2/m4fstack/stack.h create mode 120000 crypto_sign/dilithium2/m4fstack/symmetric-shake.c create mode 120000 crypto_sign/dilithium2/m4fstack/symmetric.h create mode 120000 crypto_sign/dilithium2/m4fstack/vector.h create mode 120000 crypto_sign/dilithium2/m4fstack/vector.s create mode 120000 crypto_sign/dilithium3/m4f/api.h create mode 100644 crypto_sign/dilithium3/m4f/config.h create mode 120000 crypto_sign/dilithium3/m4f/macros.i create mode 100644 crypto_sign/dilithium3/m4f/macros_smallntt.i create mode 120000 crypto_sign/dilithium3/m4f/ntt.S create mode 120000 crypto_sign/dilithium3/m4f/ntt.h create mode 120000 crypto_sign/dilithium3/m4f/packing.c create mode 120000 crypto_sign/dilithium3/m4f/packing.h create mode 120000 crypto_sign/dilithium3/m4f/params.h create mode 120000 crypto_sign/dilithium3/m4f/pointwise_mont.h create mode 120000 crypto_sign/dilithium3/m4f/pointwise_mont.s create mode 120000 crypto_sign/dilithium3/m4f/poly.c create mode 120000 crypto_sign/dilithium3/m4f/poly.h create mode 120000 crypto_sign/dilithium3/m4f/polyvec.c create mode 120000 crypto_sign/dilithium3/m4f/polyvec.h create mode 120000 crypto_sign/dilithium3/m4f/reduce.h create mode 120000 crypto_sign/dilithium3/m4f/rounding.c create mode 120000 
crypto_sign/dilithium3/m4f/rounding.h create mode 120000 crypto_sign/dilithium3/m4f/sign.c create mode 120000 crypto_sign/dilithium3/m4f/sign.h create mode 100644 crypto_sign/dilithium3/m4f/smallntt.h create mode 100644 crypto_sign/dilithium3/m4f/smallntt_769.S create mode 120000 crypto_sign/dilithium3/m4f/smallpoly.c create mode 120000 crypto_sign/dilithium3/m4f/smallpoly.h create mode 120000 crypto_sign/dilithium3/m4f/symmetric-shake.c create mode 120000 crypto_sign/dilithium3/m4f/symmetric.h create mode 120000 crypto_sign/dilithium3/m4f/vector.h create mode 120000 crypto_sign/dilithium3/m4f/vector.s create mode 120000 crypto_sign/dilithium3/m4fstack/api.h create mode 120000 crypto_sign/dilithium3/m4fstack/config.h create mode 120000 crypto_sign/dilithium3/m4fstack/macros.i create mode 120000 crypto_sign/dilithium3/m4fstack/macros_smallntt.i create mode 120000 crypto_sign/dilithium3/m4fstack/ntt.S create mode 120000 crypto_sign/dilithium3/m4fstack/ntt.h create mode 120000 crypto_sign/dilithium3/m4fstack/packing.c create mode 120000 crypto_sign/dilithium3/m4fstack/packing.h create mode 120000 crypto_sign/dilithium3/m4fstack/params.h create mode 120000 crypto_sign/dilithium3/m4fstack/pointwise_mont.h create mode 120000 crypto_sign/dilithium3/m4fstack/pointwise_mont.s create mode 120000 crypto_sign/dilithium3/m4fstack/poly.c create mode 120000 crypto_sign/dilithium3/m4fstack/poly.h create mode 120000 crypto_sign/dilithium3/m4fstack/polyvec.c create mode 120000 crypto_sign/dilithium3/m4fstack/polyvec.h create mode 120000 crypto_sign/dilithium3/m4fstack/reduce.h create mode 120000 crypto_sign/dilithium3/m4fstack/rounding.c create mode 120000 crypto_sign/dilithium3/m4fstack/rounding.h create mode 120000 crypto_sign/dilithium3/m4fstack/sign.c create mode 120000 crypto_sign/dilithium3/m4fstack/sign.h create mode 120000 crypto_sign/dilithium3/m4fstack/smallntt.h create mode 120000 crypto_sign/dilithium3/m4fstack/smallntt_769.S create mode 120000 
crypto_sign/dilithium3/m4fstack/smallpoly.c create mode 120000 crypto_sign/dilithium3/m4fstack/smallpoly.h create mode 120000 crypto_sign/dilithium3/m4fstack/stack.c create mode 120000 crypto_sign/dilithium3/m4fstack/stack.h create mode 120000 crypto_sign/dilithium3/m4fstack/symmetric-shake.c create mode 120000 crypto_sign/dilithium3/m4fstack/symmetric.h create mode 120000 crypto_sign/dilithium3/m4fstack/vector.h create mode 120000 crypto_sign/dilithium3/m4fstack/vector.s create mode 120000 crypto_sign/dilithium5/m4f/api.h create mode 120000 crypto_sign/dilithium5/m4f/basemul_257.S create mode 100644 crypto_sign/dilithium5/m4f/config.h create mode 120000 crypto_sign/dilithium5/m4f/fnt_257.S create mode 120000 crypto_sign/dilithium5/m4f/ifnt_257.S create mode 120000 crypto_sign/dilithium5/m4f/macros.i create mode 120000 crypto_sign/dilithium5/m4f/macros_fnt.i create mode 120000 crypto_sign/dilithium5/m4f/ntt.S create mode 120000 crypto_sign/dilithium5/m4f/ntt.h create mode 120000 crypto_sign/dilithium5/m4f/packing.c create mode 120000 crypto_sign/dilithium5/m4f/packing.h create mode 120000 crypto_sign/dilithium5/m4f/params.h create mode 120000 crypto_sign/dilithium5/m4f/pointwise_mont.h create mode 120000 crypto_sign/dilithium5/m4f/pointwise_mont.s create mode 120000 crypto_sign/dilithium5/m4f/poly.c create mode 120000 crypto_sign/dilithium5/m4f/poly.h create mode 120000 crypto_sign/dilithium5/m4f/polyvec.c create mode 120000 crypto_sign/dilithium5/m4f/polyvec.h create mode 120000 crypto_sign/dilithium5/m4f/reduce.h create mode 120000 crypto_sign/dilithium5/m4f/rounding.c create mode 120000 crypto_sign/dilithium5/m4f/rounding.h create mode 120000 crypto_sign/dilithium5/m4f/sign.c create mode 120000 crypto_sign/dilithium5/m4f/sign.h create mode 120000 crypto_sign/dilithium5/m4f/smallntt.h create mode 120000 crypto_sign/dilithium5/m4f/smallpoly.c create mode 120000 crypto_sign/dilithium5/m4f/smallpoly.h create mode 120000 crypto_sign/dilithium5/m4f/symmetric-shake.c 
create mode 120000 crypto_sign/dilithium5/m4f/symmetric.h create mode 120000 crypto_sign/dilithium5/m4f/vector.h create mode 120000 crypto_sign/dilithium5/m4f/vector.s create mode 120000 crypto_sign/dilithium5/m4fstack/api.h create mode 120000 crypto_sign/dilithium5/m4fstack/config.h create mode 120000 crypto_sign/dilithium5/m4fstack/macros.i create mode 120000 crypto_sign/dilithium5/m4fstack/macros_smallntt.i create mode 120000 crypto_sign/dilithium5/m4fstack/ntt.S create mode 120000 crypto_sign/dilithium5/m4fstack/ntt.h create mode 120000 crypto_sign/dilithium5/m4fstack/packing.c create mode 120000 crypto_sign/dilithium5/m4fstack/packing.h create mode 120000 crypto_sign/dilithium5/m4fstack/params.h create mode 120000 crypto_sign/dilithium5/m4fstack/pointwise_mont.h create mode 120000 crypto_sign/dilithium5/m4fstack/pointwise_mont.s create mode 120000 crypto_sign/dilithium5/m4fstack/poly.c create mode 120000 crypto_sign/dilithium5/m4fstack/poly.h create mode 120000 crypto_sign/dilithium5/m4fstack/polyvec.c create mode 120000 crypto_sign/dilithium5/m4fstack/polyvec.h create mode 120000 crypto_sign/dilithium5/m4fstack/reduce.h create mode 120000 crypto_sign/dilithium5/m4fstack/rounding.c create mode 120000 crypto_sign/dilithium5/m4fstack/rounding.h create mode 120000 crypto_sign/dilithium5/m4fstack/sign.c create mode 120000 crypto_sign/dilithium5/m4fstack/sign.h create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt.h create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt_769.S create mode 120000 crypto_sign/dilithium5/m4fstack/smallpoly.c create mode 120000 crypto_sign/dilithium5/m4fstack/smallpoly.h create mode 120000 crypto_sign/dilithium5/m4fstack/stack.c create mode 120000 crypto_sign/dilithium5/m4fstack/stack.h create mode 120000 crypto_sign/dilithium5/m4fstack/symmetric-shake.c create mode 120000 crypto_sign/dilithium5/m4fstack/symmetric.h create mode 120000 crypto_sign/dilithium5/m4fstack/vector.h create mode 120000 
crypto_sign/dilithium5/m4fstack/vector.s create mode 100755 hostside/host_unidirectional.py create mode 100644 interface.py create mode 100644 ldscripts/devices.data create mode 160000 libopencm3 create mode 100644 mk/config.mk create mode 100644 mk/crypto.mk create mode 100644 mk/nucleo-f767zi.mk create mode 100644 mk/opencm3.mk create mode 100644 mk/tests.mk create mode 160000 mupq create mode 100644 requirements.txt create mode 100644 skiplist.py create mode 160000 slothy create mode 100755 test.py create mode 100755 testvectors.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..4c35da0 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "gitsubmodule" + directory: '/' + schedule: + interval: "monthly" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..216e642 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,7 @@ +- [ ] PR changes testvectors +- [ ] Tests pass in qemu +- [ ] Testvectors pass in qemu +- [ ] Tests pass on Nucleo-L4R5ZI +- [ ] Testvectors pass on Nucleo-L4R5ZI +- [ ] Updated Benchmarks +- [ ] Updated Skiplist entries diff --git a/.github/workflows/nucleo-f767zi.yml b/.github/workflows/nucleo-f767zi.yml new file mode 100644 index 0000000..9e78ef4 --- /dev/null +++ b/.github/workflows/nucleo-f767zi.yml @@ -0,0 +1,21 @@ +name: stm32f4discovery build +on: + push: + branches: + - master + pull_request: + branches: [ "master" ] +jobs: + build-all: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Install Toolchain + uses: carlosperate/arm-none-eabi-gcc-action@v1.9.1 + with: + release: 13.3.Rel1 + - name: Build All (nucleo-f767zi) + run: make PLATFORM=nucleo-f767zi -j2 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4740396 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +*.o 
+*.bin +*.elf +*.a +*.d +*.log* +venv/ +testvectors/ +benchmarks/ +__pycache__/ +bin/ +obj/ +elf/ +bin-host/ +compile_commands.json +.vscode diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..cd5c612 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,9 @@ +[submodule "libopencm3"] + path = libopencm3 + url = https://github.com/libopencm3/libopencm3.git +[submodule "mupq"] + path = mupq + url = https://github.com/mupq/mupq.git +[submodule "slothy"] + path = slothy + url = https://github.com/slothy-optimizer/slothy diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..67dd7e6 --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +.PHONY: all +all: tests tests-bin + +include mupq/mk/config.mk +include mk/config.mk +include mk/crypto.mk +include mupq/mk/host-crypto.mk +include mupq/mk/rules.mk +include mupq/mk/schemes.mk +include mk/tests.mk + +.PHONY: clean libclean + +clean: + rm -rf elf/ + rm -rf bin/ + rm -rf bin-host/ + rm -rf obj/ + rm -rf testvectors/ + rm -rf benchmarks/ + +.SECONDARY: diff --git a/README.md b/README.md new file mode 100644 index 0000000..2c0ec88 --- /dev/null +++ b/README.md @@ -0,0 +1,411 @@ +# pqm4 +Collection of post-quantum cryptographic algorithms for the ARM Cortex-M4 + +## Introduction +The **pqm4** library, benchmarking and testing framework started as a result of the +[PQCRYPTO](https://pqcrypto.eu.org) project funded by the European Commission in the H2020 program. +It currently contains implementations of post-quantum key-encapsulation mechanisms +and post-quantum signature schemes targeting the ARM Cortex-M4 family of microcontrollers. 
+The design goals of the library are to offer +* automated functional testing on a widely available development board; +* automated generation of test vectors and comparison against output + of a reference implementation running host-side (i.e., on the computer the + development board is connected to); +* automated benchmarking for speed, stack usage, and code-size; +* automated profiling of cycles spent in symmetric primitives (SHA-2, SHA-3, AES); +* integration of clean implementations from [PQClean](https://github.com/PQClean/PQClean); and +* easy integration of new schemes and implementations into the framework. + +## Previous NIST PQC + +The master branch of **pqm4** contains schemes that are either [selected for standardization by NIST](https://csrc.nist.gov/Projects/post-quantum-cryptography/selected-algorithms-2022), +part of the [4th round of the NIST PQC standardization process](https://csrc.nist.gov/Projects/post-quantum-cryptography/round-4-submissions), +or part of the [first round of additional signatures of the NIST PQC standardization process](https://csrc.nist.gov/projects/pqc-dig-sig/round-1-additional-signatures). + +Implementations for previous NIST PQC rounds are available here: +- Round 3: https://github.com/mupq/pqm4/releases/tag/Round3 +- Round 2: https://github.com/mupq/pqm4/releases/tag/Round2 +- Round 1: https://github.com/mupq/pqm4/releases/tag/Round1 + +## Changes in Round 2 +For the second round of the NIST PQC process, **pqm4** was extended (see [#78](https://github.com/mupq/pqm4/pull/78)) with the following features: +- common code was moved to [mupq](https://github.com/mupq/mupq) for reuse in [pqriscv](https://github.com/mupq/pqriscv), +- much simpler build process, +- automated profiling of cycles spent in symmetric primitives (SHA-2, SHA-3, AES), +- reporting of code-size, +- integration of clean implementations from [PQClean](https://github.com/PQClean/PQClean). 
+ +## Changes in Round 3 +For the third round of the NIST PQC process, **pqm4** was extended with the following features: +- overhaul of the build process to support multiple target boards, and +- use of the QEMU simulator to measure stack usage of larger schemes. + +## Changes in Round 4 / Round 1 of Additional signatures +For the fourth round of the NIST PQC process **pqm4** was extended with the following features: +- Switch to the Nucleo-L4R5ZI board as the default board for measurements, and +- an overhaul of the console output. + +## Schemes included in pqm4 + +For most of the schemes there are multiple implementations. +The naming scheme for these implementations is as follows: +* `clean`: clean reference implementation from [PQClean](https://github.com/PQClean/PQClean), +* `ref`: the reference implementation submitted to NIST (will be replaced by `clean` in the long term), +* `opt`: an optimized implementation in plain C (e.g., the optimized implementation submitted to NIST), +* `m4`: an implementation with Cortex-M4 specific optimizations (typically in assembly). +* `m4f`: an implementation with Cortex-M4F specific optimizations (typically assembly using floating-point registers). + +## Setup/Installation +The testing and benchmarking framework of **pqm4** targets several development +boards, all featuring an ARM Cortex-M4 chip: + +* `nucleo-l4r5zi` (default): The [NUCLEO-L4R5ZI board](https://www.st.com/en/evaluation-tools/nucleo-l4r5zi.html) + featuring 2MB of Flash and 640KB of RAM. This board does not require a + separate USB serial interface converter. +* `stm32f4discovery`: The [STM32F4 Discovery board](https://www.st.com/en/evaluation-tools/stm32f4discovery.html) + featuring 1MB of Flash, and 192KB of RAM. Connecting the + development board to the host computer requires a mini-USB cable and a USB-TTL + converter together with a 2-pin dupont / jumper cable. 
+* `nucleo-l476rg`: The [NUCLEO-L476RG board](https://www.st.com/en/evaluation-tools/nucleo-l476rg.html) + featuring 1MB of Flash and 128KB of RAM. This board does not require a + separate USB serial interface converter. +* `cw308t-stm32f3`: The ChipWhisperer [CW308-STM32F3 target board](https://rtfm.newae.com/Targets/UFO%20Targets/CW308T-STM32F/) + (in the F3 configuration) featuring 256KB of Flash and 40KB of RAM. +* `mps2-an386`: The ARM MPS2(+) FPGA prototyping board when used with the + ARM-Cortex M4 bitstream (see [ARM AN386](https://developer.arm.com/documentation/dai0386/c)) + featuring two 4MB RAM blocks, one used in lieu of Flash, one as RAM. This board + can also be simulated with the QEMU 5.2 simulator (the cycle counts are, + however, meaningless in this case). + +### Installing the ARM toolchain +The **pqm4** build system assumes that you have the [arm-none-eabi toolchain](https://developer.arm.com/downloads/-/arm-gnu-toolchain-downloads) +installed. All benchmarks are performed using this toolchain. +On most Linux systems, the correct toolchain gets installed when you install the `arm-none-eabi-gcc` (or `gcc-arm-none-eabi`) package. +On some Linux distributions, you will also have to explicitly install `libnewlib-arm-none-eabi`. + +### Installing stlink +To flash binaries onto most development boards, **pqm4** is using [stlink](https://github.com/texane/stlink). +Depending on your operating system, stlink may be available in your package manager -- if not, please +refer to the stlink GitHub page for instructions on how to [compile it from source](https://github.com/texane/stlink/blob/master/doc/compiling.md) +(in that case, be careful to use libusb-1.0.0-dev, not libusb-0.1). + +### Installing OpenOCD +For the `nucleo-l4r5zi` board [OpenOCD](http://openocd.org) (tested with version 0.12) is used for flashing binaries. 
+Depending on your operating system, OpenOCD may be available in your package manager -- if not, please +refer to the OpenOCD README for instructions on how to [compile it from source](http://openocd.org/doc-release/README). + +### Python3 +The benchmarking scripts used in **pqm4** require Python >= 3.8. + +### Installing pyserial +The host-side Python code for most platforms requires the [pyserial](https://github.com/pyserial/pyserial) module. +Your package repository might offer `python3-serial` (Debian, Ubuntu) or `python-pyserial` (Arch) or `python3-pyserial` (Fedora, openSUSE) or `pyserial` (Slack, CentOS, Gentoo) or `py3-pyserial` (Alpine) directly. +Alternatively, this can be easily installed from PyPI by calling `pip3 install -r requirements.txt`. +If you do not have `pip3` installed yet, you can typically find it as `python3-pip` (Debian, Ubuntu) or `python-pip` (Arch) using your package manager. + +### Installing ChipWhisperer +The host-side Python code for the `cw308t-stm32f3` board requires the [chipwhisperer](https://chipwhisperer.readthedocs.io/en/latest/installing.html#install-repo-pypi) module. +If you don't target this board, you can skip the installation. + +### Installing QEMU >=5.2 +The `mps2-an386` platform is simulated with the [QEMU](https://www.qemu.org/) +ARM system emulator. You'll need at least version 5.2, which is fairly +recent at the time of writing and may not be available on your favourite Linux +distro. If you don't target this platform, you can skip the installation. + +### Connecting the STM32F4 Discovery board to the host +Connect the board to your host machine using the mini-USB port. +This provides it with power, and allows you to flash binaries onto the board. +It should show up in `lsusb` as `STMicroelectronics ST-LINK/V2`. + +If you are using a UART-USB connector that has a PL2303 chip on board (which appears to be the most common), +the driver should be loaded in your kernel by default. 
If it is not, it is typically called `pl2303`. +On macOS, you will still need to [install it](http://www.prolific.com.tw/US/ShowProduct.aspx?p_id=229&pcid=41) (and reboot). +When you plug in the device, it should show up as `Prolific Technology, Inc. PL2303 Serial Port` when you type `lsusb`. + +Using dupont / jumper cables, connect the `TX`/`TXD` pin of the USB connector to the `PA3` pin on the board, and connect `RX`/`RXD` to `PA2`. +Depending on your setup, you may also want to connect the `GND` pins. + +### Downloading pqm4 and libopencm3 +Finally, obtain the **pqm4** library and the submodules: +``` +git clone --recursive https://github.com/mupq/pqm4.git +``` + +Now you may pick your platform and compile the code (adapt the `PLATFORM` +variable to your chosen platform and the number of threads in `-j4` to your PC accordingly): +``` +make -j4 PLATFORM=stm32f4discovery +``` + +## API documentation +The **pqm4** library uses the NIST/SUPERCOP/[PQClean +API](https://github.com/PQClean/PQClean). It is mandated for all included +schemes. 
+ +KEMs need to define `CRYPTO_SECRETKEYBYTES`, `CRYPTO_PUBLICKEYBYTES`, `CRYPTO_BYTES`, and `CRYPTO_CIPHERTEXTBYTES` and implement +```c +int crypto_kem_keypair(unsigned char *pk, unsigned char *sk); +int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); +int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); +``` + +Signature schemes need to define `CRYPTO_SECRETKEYBYTES`, `CRYPTO_PUBLICKEYBYTES`, and `CRYPTO_BYTES` and implement +```c +int crypto_sign_keypair(unsigned char *pk, unsigned char *sk); +int crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *msg, size_t len, + const unsigned char *sk); +int crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); +``` + + +## Running tests and benchmarks +The build system compiles six binaries for each implementation which can be used to test and benchmark the schemes. For example, for the reference implementation of [ML-KEM-768](https://pq-crystals.org/kyber/) the following binaries are assembled: + - `bin/crypto_kem_ml-kem-768_m4_test.bin` tests if the scheme works as expected. For KEMs this tests if Alice and Bob derive the same shared key and for signature schemes it tests if a generated signature can be verified correctly. Several failure cases are also checked, see [mupq/crypto_kem/test.c](https://github.com/mupq/mupq/blob/master/crypto_kem/test.c) and [mupq/crypto_sign/test.c](https://github.com/mupq/mupq/blob/master/crypto_sign/test.c) for details. + - `bin/crypto_kem_ml-kem-768_m4_speed.bin` measures the runtime of `crypto_kem_keypair`, `crypto_kem_enc`, and `crypto_kem_dec` for KEMs and `crypto_sign_keypair`, `crypto_sign`, and `crypto_sign_open` for signatures. See [mupq/crypto_kem/speed.c](https://github.com/mupq/mupq/blob/master/crypto_kem/speed.c) and [mupq/crypto_sign/speed.c](https://github.com/mupq/mupq/blob/master/crypto_sign/speed.c). 
+ - `bin/crypto_kem_ml-kem-768_m4_hashing.bin` measures the cycles spent in SHA-2, SHA-3, and AES of `crypto_kem_keypair`, `crypto_kem_enc`, and `crypto_kem_dec` for KEMs and `crypto_sign_keypair`, `crypto_sign`, and `crypto_sign_open` for signatures. See [mupq/crypto_kem/hashing.c](https://github.com/mupq/mupq/blob/master/crypto_kem/hashing.c) and [mupq/crypto_sign/hashing.c](https://github.com/mupq/mupq/blob/master/crypto_sign/hashing.c). + - `bin/crypto_kem_ml-kem-768_m4_stack.bin` measures the stack consumption of each of the procedures involved. The memory allocated outside of the procedures (e.g., public keys, private keys, ciphertexts, signatures) is not included. See [mupq/crypto_kem/stack.c](https://github.com/mupq/mupq/blob/master/crypto_kem/stack.c) and [mupq/crypto_sign/stack.c](https://github.com/mupq/mupq/blob/master/crypto_sign/stack.c). + - `bin/crypto_kem_ml-kem-768_m4_testvectors.bin` uses a deterministic random number generator to generate testvectors for the implementation. These can be used to cross-check different implementations of the same scheme. See [mupq/crypto_kem/testvectors.c](https://github.com/mupq/mupq/blob/master/crypto_kem/testvectors.c) and [mupq/crypto_sign/testvectors.c](https://github.com/mupq/mupq/blob/master/crypto_sign/testvectors.c). +- `bin-host/crypto_kem_ml-kem-768_m4_testvectors` uses the same deterministic random number generator to create the testvectors on your host. See [mupq/crypto_kem/testvectors-host.c](https://github.com/mupq/mupq/blob/master/crypto_kem/testvectors-host.c) and [mupq/crypto_sign/testvectors-host.c](https://github.com/mupq/mupq/blob/master/crypto_sign/testvectors-host.c). +- An `elf` file for each binary is generated in the `elf/` folder if desired. + +The `elf` files or binaries can be flashed to your board using an appropriate +tool. For example, the `stm32f4discovery` platform uses `st-flash`, e.g., `st-flash write bin/crypto_kem_ml-kem-768_m4_test.bin 0x8000000`. 
To receive the output, run `python3 hostside/host_unidirectional.py`. + +If you target the `mps2-an386` platform, you can also run the `elf` file using +the QEMU ARM emulator: +``` +qemu-system-arm -M mps2-an386 -nographic -semihosting -kernel elf/crypto_kem_ml-kem-512_m4_test.elf +``` +The emulator should exit automatically when the test / benchmark completes. If +you run into an error, you can exit QEMU by pressing CTRL+A and then X. + +The **pqm4** framework automates testing and benchmarking for all schemes using Python3 scripts: +- `python3 test.py`: flashes all test binaries to the boards and checks that no errors occur. +- `python3 testvectors.py`: flashes all testvector binaries to the boards and writes the testvectors to `testvectors/`. Additionally, it executes the reference implementations on your host machine. Afterwards, it checks the testvectors of different implementations of the same scheme for consistency. +- `python3 benchmarks.py`: flashes the stack and speed binaries and writes the results to `benchmarks/stack/` and `benchmarks/speed/`. You may want to execute this several times for certain schemes for which the execution time varies significantly. + +The scripts take a number of command line arguments, which you'll need to adapt: +- `--platform <platform>` or `-p <platform>`: Sets the target platform (default `stm32f4discovery`). +- `--opt {speed,size,debug}` or `-o {speed,size,debug}`: Sets optimization flags for compilation (default `speed`). +- `--lto` or `-l`: Use link-time optimization during compilation. +- `--no-aio`: Disable all-in-one compilation. + +If you change any of these values, you'll need to run `make clean` (the build +system will remind you). + +In case you don't want to include all schemes, pass a list of schemes you want to include to any of the scripts, e.g., `python3 test.py ml-kem-768 sphincs-shake256-128f-simple`. +In case you want to exclude certain schemes pass `--exclude`, e.g., `python3 test.py --exclude saber`. 
+ +The benchmark results (in `benchmarks/`) created by +`python3 benchmarks.py` can be automatically converted to a markdown table using `python3 convert_benchmarks.py md` or to csv using `python3 convert_benchmarks.py csv`. + +## Benchmarks +The current benchmark results can be found in [benchmarks.csv](benchmarks.csv) or [benchmarks.md](benchmarks.md). + +All cycle counts were obtained at 24MHz to avoid wait cycles due to the speed of the memory controller. +For most schemes we report minimum, maximum, and average cycle counts of 100 executions. +For some particularly slow schemes we reduce the number of executions; the number of +executions is reported in parentheses. + +The numbers were obtained with `arm-none-eabi-gcc (Arm GNU Toolchain 11.3.Rel1) 11.3.1 20220712` from [Arm](https://developer.arm.com/downloads/-/arm-gnu-toolchain-downloads). + +The code-size measurements only include the code that is provided by the scheme implementation, i.e., exclude common code like hashing or C standard library functions. +The measurements are performed with `arm-none-eabi-size`. +The size contributions to the `.text`, `.data`, and `.bss` sections are also listed separately. + + +## Adding new schemes and implementations +The **pqm4** build system is designed to make it very easy to add new schemes +and implementations, if these implementations follow the NIST/SUPERCOP/PQClean API. + +In case you want to contribute a reference implementation, please open a pull request to [PQClean](https://github.com/PQClean/PQClean). +In case you want to contribute an optimized C implementation, please open a pull request to [mupq](https://github.com/mupq/mupq). +In case you want to add an implementation optimized for the Cortex-M4, please open a pull request here. + +In the following we consider the example of adding an M4-optimized implementation +of [NewHope-512-CPA-KEM](https://newhopecrypto.org) to **pqm4**: + +1. 
Create a subdirectory for the new scheme under `crypto_kem/`; in the following we assume that this subdirectory is called `newhope512cpa`. +1. Create a subdirectory `m4` under `crypto_kem/newhope512cpa/`. +1. Copy all files of the implementation into this new subdirectory `crypto_kem/newhope512cpa/m4/`, + except for the file implementing the `randombytes` function (typically `PQCgenKAT_kem.c`). + +The procedure for adding a signature scheme is the same, except that it starts with creating a +new subdirectory under `crypto_sign/`. + +### Using optimized FIPS202 (Keccak, SHA3, SHAKE) + Many schemes submitted to NIST use SHA-3, SHAKE or cSHAKE for hashing. + This is why **pqm4** comes with highly optimized Keccak code that is accessible + from all KEM and signature implementations. + Functions from the FIPS202 standard are defined in `mupq/common/fips202.h` as follows: + + ```c + void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen); + void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state); + void shake128(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen); + + void shake128_inc_init(shake128incctx *state); + void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen); + void shake128_inc_finalize(shake128incctx *state); + void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state); + + void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen); + void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state); + void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen); + + void shake256_inc_init(shake256incctx *state); + void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen); + void shake256_inc_finalize(shake256incctx *state); + void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state); + + void sha3_256_inc_init(sha3_256incctx *state); + void 
sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen); + void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state); + + void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen); + + void sha3_512_inc_init(sha3_512incctx *state); + void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen); + void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state); + + void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen); + ``` + + Functions from the related publication SP 800-185 (cSHAKE) are defined in `mupq/common/sp800-185.h`: + + ```c + void cshake128_inc_init(shake128incctx *state, const uint8_t *name, size_t namelen, const uint8_t *cstm, size_t cstmlen); + void cshake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen); + void cshake128_inc_finalize(shake128incctx *state); + void cshake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state); + + void cshake128(uint8_t *output, size_t outlen, const uint8_t *name, size_t namelen, const uint8_t *cstm, size_t cstmlen, const uint8_t *input, size_t inlen); + + void cshake256_inc_init(shake256incctx *state, const uint8_t *name, size_t namelen, const uint8_t *cstm, size_t cstmlen); + void cshake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen); + void cshake256_inc_finalize(shake256incctx *state); + void cshake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state); + + void cshake256(uint8_t *output, size_t outlen, const uint8_t *name, size_t namelen, const uint8_t* cstm, size_t cstmlen, const uint8_t *input, size_t inlen); + ``` + + Implementations that want to make use of these optimized routines simply include + `fips202.h` (or `sp800-185.h`). The API for `sha3_256` and `sha3_512` follows the + [SUPERCOP hash API](https://bench.cr.yp.to/call-hash.html). + The API for `shake128` and `shake256` is very similar, except that it supports variable-length output. 
+ The SHAKE functions are also accessible via the absorb-squeezeblocks functions, which offer incremental + output generation (but not incremental input handling). + The variants with `_inc_` offer both incremental input handling and output generation. + +## Using optimized SHA-2 + + Some schemes submitted to NIST use SHA-224, SHA-256, SHA-384, or SHA-512 for hashing. + We've experimented with assembly-optimized SHA-512, but found that the speed-up + achievable with this compared to the C implementation from + [SUPERCOP](https://bench.cr.yp.to/) is negligible + when compiled using `arm-none-eabi-gcc-8.3.0`. + For older compiler versions (e.g. `5.4.1`) hand-optimized assembly implementations + were significantly faster. + We've therefore decided to only include a C version of the SHA-2 variants. + The available functions are: + ```c + void sha224_inc_init(sha224ctx *state); + void sha224_inc_blocks(sha224ctx *state, const uint8_t *in, size_t inblocks); + void sha224_inc_finalize(uint8_t *out, sha224ctx *state, const uint8_t *in, size_t inlen); + void sha224(uint8_t *out, const uint8_t *in, size_t inlen); + + void sha256_inc_init(sha256ctx *state); + void sha256_inc_blocks(sha256ctx *state, const uint8_t *in, size_t inblocks); + void sha256_inc_finalize(uint8_t *out, sha256ctx *state, const uint8_t *in, size_t inlen); + void sha256(uint8_t *out, const uint8_t *in, size_t inlen); + + void sha384_inc_init(sha384ctx *state); + void sha384_inc_blocks(sha384ctx *state, const uint8_t *in, size_t inblocks); + void sha384_inc_finalize(uint8_t *out, sha384ctx *state, const uint8_t *in, size_t inlen); + void sha384(uint8_t *out, const uint8_t *in, size_t inlen); + + void sha512_inc_init(sha512ctx *state); + void sha512_inc_blocks(sha512ctx *state, const uint8_t *in, size_t inblocks); + void sha512_inc_finalize(uint8_t *out, sha512ctx *state, const uint8_t *in, size_t inlen); + void sha512(uint8_t *out, const uint8_t *in, size_t inlen); + ``` + Implementations can use these by 
including `sha2.h`. + +## Using optimized AES + + Some schemes submitted to NIST make use of AES as a subroutine. + We included assembly-optimized implementations of AES-128 and AES-256 in ECB mode and in CTR mode. + + Up until January 2021, pqm4 relied on the [t-table implementation](https://github.com/Ko-/aes-armcortexm) by Schwabe and Stoffelen published at [SAC2016](https://eprint.iacr.org/2016/714.pdf). + On Cortex-M4 platforms with a data cache, this implementation may be vulnerable to cache attacks. + Hence, pqm4 is now using the [bitsliced implementation](https://github.com/aadomn/aes) by Adomnicai and Peyrin published in [TCHES2021/1](https://eprint.iacr.org/2020/1123.pdf). + + The functions that can be used are stated in `common/aes.h` as follows: + ```c + void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key); + void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key); + void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx); + void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx); + + void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key); + void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key); + void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx); + void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx); + ``` + Implementations can use these by including `aes.h`. + + Some post-quantum schemes use AES with only public inputs (e.g., Kyber and FrodoKEM) and, consequently, do not need a constant-time AES implementation. + As those schemes would be unfairly penalized by switching to a slower constant-time implementation, we additionally provide the t-table implementation.
+ The functions that can be used are stated in `common/aes-publicinputs.h` as follows: + ```c + void aes128_ecb_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key); + void aes128_ctr_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key); + void aes128_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx_publicinputs *ctx); + void aes128_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx_publicinputs *ctx); + + void aes192_ecb_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key); + void aes192_ctr_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key); + void aes192_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx_publicinputs *ctx); + void aes192_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx_publicinputs *ctx); + + void aes256_ecb_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key); + void aes256_ctr_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key); + void aes256_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx_publicinputs *ctx); + void aes256_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx_publicinputs *ctx); + ``` + +## Bibliography + +When referring to this framework in academic literature, please consider using the following bibTeX excerpt: + +``` +@misc{PQM4, + title = {{PQM4}: Post-quantum crypto library for the {ARM} {Cortex-M4}}, + author = {Matthias J. Kannwischer and Richard Petri and Joost Rijneveld and Peter Schwabe and Ko Stoffelen}, + note = {\url{https://github.com/mupq/pqm4}} +} +``` + +**Please note** however, that pqm4 does not author the implementations that +are included in pqm4. 
Most of the implementations that are included in the +collection originate from original research projects. Moreover, many +implementations have been swapped out over the years. When comparing or +improving implementations, please consider not only pqm4, but also cite +the publication corresponding to the implementation. + +Sometimes it might not be entirely clear which paper to cite. Feel free to +open an issue so that we can help you find it. + +## License +Different parts of **pqm4** have different licenses. +Each subdirectory containing implementations contains a LICENSE or COPYING file stating +under what license that specific implementation is released. +The files in common contain licensing information at the top of the file (and +are currently either public domain or MIT). + +All other code in this repository is dual-licensed under [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) and under the conditions of [CC0](https://creativecommons.org/publicdomain/zero/1.0/).
+ diff --git a/benchmarks.py b/benchmarks.py new file mode 100755 index 0000000..f1b9118 --- /dev/null +++ b/benchmarks.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +from mupq import mupq +from interface import parse_arguments, get_platform +import sys + +if __name__ == "__main__": + args, rest = parse_arguments() + platform, settings = get_platform(args) + with platform: + schemes = [s for s in rest if s not in ['--nostack', + '--nospeed', + '--nohashing', + '--nosize']] + if "--nostack" not in rest: + test = mupq.StackBenchmark(settings, platform) + if test.test_all(schemes): + sys.exit(1) + + if "--nospeed" not in rest: + test = mupq.SpeedBenchmark(settings, platform) + if test.test_all(schemes): + sys.exit(1) + + if "--nohashing" not in rest: + test = mupq.HashingBenchmark(settings, platform) + if test.test_all(schemes): + sys.exit(1) + + if "--nosize" not in rest: + test = mupq.SizeBenchmark(settings, platform) + if test.test_all(schemes): + sys.exit(1) diff --git a/build_everything.py b/build_everything.py new file mode 100755 index 0000000..2194789 --- /dev/null +++ b/build_everything.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +""" +Builds all of the binaries without flashing them. +""" +import sys + +from interface import parse_arguments, get_platform +from mupq import mupq + + +if __name__ == "__main__": + args, rest = parse_arguments() + platform, settings = get_platform(args) + with platform: + mupq.BuildAll(settings).test_all(rest) diff --git a/common/aes-encrypt.S b/common/aes-encrypt.S new file mode 100644 index 0000000..2f19ff7 --- /dev/null +++ b/common/aes-encrypt.S @@ -0,0 +1,613 @@ +/****************************************************************************** +* Assembly fixsliced implementation of AES-128 and AES-256 (encryption only). 
+* +* Fully-fixsliced implementation runs faster than the semi-fixsliced variant +* at the cost of a larger code size. +* +* See the paper at https://eprint.iacr.org/2020/1123.pdf for more details. +* +* @author Alexandre Adomnicai, Nanyang Technological University, Singapore +* alexandre.adomnicai@ntu.edu.sg +* +* @date October 2020 +******************************************************************************/ + +.syntax unified +.thumb + +/****************************************************************************** +* Macro to compute the SWAPMOVE technique: swap the bits in 'in1' masked by 'm' +* by the bits in 'in0' masked by 'm << n' and put the results in 'out0', 'out1' +******************************************************************************/ +.macro swpmv out0, out1, in0, in1, m, n, tmp + eor \tmp, \in1, \in0, lsr \n + and \tmp, \m + eor \out1, \in1, \tmp + eor \out0, \in0, \tmp, lsl \n +.endm + +/****************************************************************************** +* Rotate all bytes in 'in' by 'n0' bits to the rights and put the results in +* 'out'. 'm' refers to the appropriate bitmask and 'n1' = 8-'n0'. +******************************************************************************/ +.macro byteror out, in, m, n0, n1, tmp + and \out, \m, \in, lsr \n0 + bic \tmp, \in, \m, ror \n1 + orr \out, \out, \tmp, lsl \n1 +.endm + +/****************************************************************************** +* Compute the MixColumns for rounds i st i%4 == 0 or 2. +* Between the two versions, only the masks and the shifts for the 'byteror' are +* differing. 
+******************************************************************************/ +.macro mc_0_2 m, n0, n1, n2, n3 + byteror r14, r1, \m, \n0, \n1, r9 // r14 <- BYTE_ROR_n0(S0) + eor r4, r1, r14, ror #8 // r4 <- S0 ^ (BYTE_ROR_6(S0) >>> 8) + movw r1, #0x0f0f + movt r1, #0x0f0f // r1 <- 0x0f0f0f0f (for BYTE_ROR) + byteror r5, r11, \m, \n0, \n1, r9 // r5 <- BYTE_ROR_n0(S7) + eor r10, r11, r5, ror #8 // r10<- S7 ^ BYTE_ROR_n0(S7 >>> 8) + byteror r11, r10, r1, 4, 4, r9 // r11<- BYTE_ROR_4(r10) + eor r11, r4, r11, ror #16 // r11<- BYTE_ROR_4(r10) ^ (r10 >>> 16) + eor r11, r11, r5, ror #8 // r11<- S'7 + byteror r5, r2, \m, \n0, \n1, r9 // r5 <- BYTE_ROR_n0(S6) + eor r2, r2, r5, ror #8 // r2 <- S6 ^ BYTE_ROR_n0(S6 >>> 8) + eor r10, r10, r5, ror #8 // r10<- r10 ^ (BYTE_ROR_n0(S6) >>> 8) + byteror r5, r2, r1, 4, 4, r9 // r5 <- BYTE_ROR_4(r2) + eor r10, r10, r5, ror #16 // r10<- r10 ^ (r5 >>> 16) + eor r10, r10, r4 // r10<- S'6 + byteror r5, r0, \m, \n0, \n1, r9 // r5 <- BYTE_ROR_n0(S5) + eor r0, r0, r5, ror #8 // r0 <- S5 ^ BYTE_ROR_6(S5 >>> 8) + eor r9, r2, r5, ror #8 // r9 <- r2 ^ (BYTE_ROR_n0(S5) >>> 8) + byteror r5, r0, r1, 4, 4, r2 // r5 <- BYTE_ROR_4(r0) + eor r9, r9, r5, ror #16 // r9 <- S'5 + byteror r5, r8, \m, \n0, \n1, r2 // r5 <- BYTE_ROR_n0(S4) + eor r2, r8, r5, ror #8 // r2 <- S4 ^ BYTE_ROR_6(S4 >>> 8) + eor r8, r0, r5, ror #8 // r8 <- r0 ^ (BYTE_ROR_n0(S4) >>> 8) + byteror r5, r2, r1, 4, 4, r0 // r5 <- BYTE_ROR_4(r2) + eor r8, r8, r5, ror #16 // r8 <- r8 ^ (r5 >>> 16) + eor r8, r8, r4 // r8 <- S'4 + byteror r5, r7, \m, \n0, \n1, r0 // r5 <- BYTE_ROR_n0(S3) + eor r0, r7, r5, ror #8 // r0 <- S3 ^ BYTE_ROR_6(S3 >>> 8) + eor r7, r2, r5, ror #8 // r2 ^ (BYTE_ROR_n0(S3) >>> 8) + byteror r5, r0, r1, 4, 4, r2 // r5 <- BYTE_ROR_4(r0) + eor r7, r7, r5, ror #16 // r7 <- r7 ^ (r5 >>> 16) + eor r7, r7, r4 // r7 <- S'3 + byteror r5, r6, \m, \n0, \n1, r2 // r5 <- BYTE_ROR_n0(S2) + eor r2, r6, r5, ror #8 // r2 <- S2 ^ BYTE_ROR_6(S2 >>> 8) + eor r6, r0, r5, ror #8 // r6 <- r0 
^ (BYTE_ROR_n0(S2) >>> 8) + byteror r5, r2, r1, 4, 4, r0 // r5 <- BYTE_ROR_4(r2) + eor r6, r6, r5, ror #16 // r6 <- S'2 + byteror r5, r3, \m, \n0, \n1, r0 // r5 <- BYTE_ROR_n0(S1) + eor r0, r3, r5, ror #8 // r0 <- S1 ^ BYTE_ROR_6(S1 >>> 8) + eor r3, r2, r5, ror #8 // r3 <- r0 ^ (BYTE_ROR_n0(S1) >>> 8) + byteror r5, r0, r1, 4, 4, r2 // r5 <- BYTE_ROR_4(r0) + eor r5, r3, r5, ror #16 // r5 <- S'1 + eor r14, r0, r14, ror #8 // r14<- r0 ^ (BYTE_ROR_n0(S0) >>> 8) + byteror r0, r4, r1, 4, 4, r2 // r0 <- BYTE_ROR_4(r4) + eor r4, r14, r0, ror #16 // r4 <- S'0 +.endm + +/****************************************************************************** +* Packs two 128-bit input blocs stored in r4-r7 and r8-r11, respectively, into +* the 256-bit internal state where the bits are packed as follows: +* r4 = b_24 b_56 b_88 b_120 || ... || b_0 b_32 b_64 b_96 +* r5 = b_25 b_57 b_89 b_121 || ... || b_1 b_33 b_65 b_97 +* r6 = b_26 b_58 b_90 b_122 || ... || b_2 b_34 b_66 b_98 +* r7 = b_27 b_59 b_91 b_123 || ... || b_3 b_35 b_67 b_99 +* r8 = b_28 b_60 b_92 b_124 || ... || b_4 b_36 b_68 b_100 +* r9 = b_29 b_61 b_93 b_125 || ... || b_5 b_37 b_69 b_101 +* r10 = b_30 b_62 b_94 b_126 || ... || b_6 b_38 b_70 b_102 +* r11 = b_31 b_63 b_95 b_127 || ... 
|| b_7 b_39 b_71 b_103 +******************************************************************************/ +.align 2 +packing: + movw r3, #0x0f0f + movt r3, #0x0f0f // r3 <- 0x0f0f0f0f (mask for SWAPMOVE) + eor r2, r3, r3, lsl #2 // r2 <- 0x33333333 (mask for SWAPMOVE) + eor r1, r2, r2, lsl #1 // r1 <- 0x55555555 (mask for SWAPMOVE) + swpmv r8, r4, r8, r4, r1, #1, r12 + swpmv r9, r5, r9, r5, r1, #1, r12 + swpmv r10, r6, r10, r6, r1, #1, r12 + swpmv r11, r7, r11, r7, r1, #1, r12 + swpmv r0, r4, r5, r4, r2, #2, r12 + swpmv r9, r5, r9, r8, r2, #2, r12 + swpmv r7, r8, r7, r6, r2, #2, r12 + swpmv r11, r2, r11, r10, r2, #2, r12 + swpmv r8, r4, r8, r4, r3, #4, r12 + swpmv r10, r6, r7, r0, r3, #4, r12 + swpmv r11, r7, r11, r9, r3, #4, r12 + swpmv r9, r5, r2, r5, r3, #4, r12 + bx lr + +/****************************************************************************** +* Unpacks the 256-bit internal state in two 128-bit blocs. +******************************************************************************/ +.align 2 +unpacking: + movw r3, #0x0f0f + movt r3, #0x0f0f // r3 <- 0x0f0f0f0f (mask for SWAPMOVE) + swpmv r2, r5, r9, r5, r3, #4, r12 + swpmv r11, r9, r11, r7, r3, #4, r12 + swpmv r7, r1, r10, r6, r3, #4, r12 + swpmv r8, r4, r8, r4, r3, #4, r12 + eor r3, r3, r3, lsl #2 // r3 <- 0x33333333 (mask for SWAPMOVE) + swpmv r11, r10,r11, r2, r3, #2, r12 + swpmv r7, r6, r7, r8, r3, #2, r12 + swpmv r9, r8, r9, r5, r3, #2, r12 + swpmv r5, r4, r1, r4, r3, #2, r12 + eor r1, r3, r3, lsl #1 // r1 <- 0x55555555 (mask for SWAPMOVE) + swpmv r8, r4, r8, r4, r1, #1, r12 + swpmv r9, r5,r9, r5, r1, #1, r12 + swpmv r10, r6, r10, r6, r1, #1, r12 + swpmv r11, r7, r11, r7, r1, #1, r12 + bx lr + +/****************************************************************************** +* Subroutine that computes the AddRoundKey and the S-box. 
+* Credits to https://github.com/Ko-/aes-armcortexm for the S-box implementation +******************************************************************************/ +.align 2 +ark_sbox: + // add round key + ldr.w r1, [sp, #48] + ldmia r1!, {r0,r2,r3,r12} + eor r4, r0 + eor r5, r2 + eor r6, r3 + eor r7, r12 + ldmia r1!, {r0,r2,r3,r12} + eor r8, r0 + eor r9, r2 + eor r10, r3 + eor r11, r12 + str.w r1, [sp, #48] + str r14, [sp, #52] + // sbox: credits to https://github.com/Ko-/aes-armcortexm + eor r1, r7, r9 //Exec y14 = U3 ^ U5; into r1 + eor r3, r4, r10 //Exec y13 = U0 ^ U6; into r3 + eor r2, r3, r1 //Exec y12 = y13 ^ y14; into r2 + eor r0, r8, r2 //Exec t1 = U4 ^ y12; into r0 + eor r14, r0, r9 //Exec y15 = t1 ^ U5; into r14 + and r12, r2, r14 //Exec t2 = y12 & y15; into r12 + eor r8, r14, r11 //Exec y6 = y15 ^ U7; into r8 + eor r0, r0, r5 //Exec y20 = t1 ^ U1; into r0 + str.w r2, [sp, #44] //Store r2/y12 on stack + eor r2, r4, r7 //Exec y9 = U0 ^ U3; into r2 + str r0, [sp, #40] //Store r0/y20 on stack + eor r0, r0, r2 //Exec y11 = y20 ^ y9; into r0 + str r2, [sp, #36] //Store r2/y9 on stack + and r2, r2, r0 //Exec t12 = y9 & y11; into r2 + str r8, [sp, #32] //Store r8/y6 on stack + eor r8, r11, r0 //Exec y7 = U7 ^ y11; into r8 + eor r9, r4, r9 //Exec y8 = U0 ^ U5; into r9 + eor r6, r5, r6 //Exec t0 = U1 ^ U2; into r6 + eor r5, r14, r6 //Exec y10 = y15 ^ t0; into r5 + str r14, [sp, #28] //Store r14/y15 on stack + eor r14, r5, r0 //Exec y17 = y10 ^ y11; into r14 + str.w r1, [sp, #24] //Store r1/y14 on stack + and r1, r1, r14 //Exec t13 = y14 & y17; into r1 + eor r1, r1, r2 //Exec t14 = t13 ^ t12; into r1 + str r14, [sp, #20] //Store r14/y17 on stack + eor r14, r5, r9 //Exec y19 = y10 ^ y8; into r14 + str.w r5, [sp, #16] //Store r5/y10 on stack + and r5, r9, r5 //Exec t15 = y8 & y10; into r5 + eor r2, r5, r2 //Exec t16 = t15 ^ t12; into r2 + eor r5, r6, r0 //Exec y16 = t0 ^ y11; into r5 + str.w r0, [sp, #12] //Store r0/y11 on stack + eor r0, r3, r5 //Exec y21 = y13 ^ 
y16; into r0 + str r3, [sp, #8] //Store r3/y13 on stack + and r3, r3, r5 //Exec t7 = y13 & y16; into r3 + str r5, [sp, #4] //Store r5/y16 on stack + str r11, [sp, #0] //Store r11/U7 on stack + eor r5, r4, r5 //Exec y18 = U0 ^ y16; into r5 + eor r6, r6, r11 //Exec y1 = t0 ^ U7; into r6 + eor r7, r6, r7 //Exec y4 = y1 ^ U3; into r7 + and r11, r7, r11 //Exec t5 = y4 & U7; into r11 + eor r11, r11, r12 //Exec t6 = t5 ^ t2; into r11 + eor r11, r11, r2 //Exec t18 = t6 ^ t16; into r11 + eor r14, r11, r14 //Exec t22 = t18 ^ y19; into r14 + eor r4, r6, r4 //Exec y2 = y1 ^ U0; into r4 + and r11, r4, r8 //Exec t10 = y2 & y7; into r11 + eor r11, r11, r3 //Exec t11 = t10 ^ t7; into r11 + eor r2, r11, r2 //Exec t20 = t11 ^ t16; into r2 + eor r2, r2, r5 //Exec t24 = t20 ^ y18; into r2 + eor r10, r6, r10 //Exec y5 = y1 ^ U6; into r10 + and r11, r10, r6 //Exec t8 = y5 & y1; into r11 + eor r3, r11, r3 //Exec t9 = t8 ^ t7; into r3 + eor r3, r3, r1 //Exec t19 = t9 ^ t14; into r3 + eor r3, r3, r0 //Exec t23 = t19 ^ y21; into r3 + eor r0, r10, r9 //Exec y3 = y5 ^ y8; into r0 + ldr r11, [sp, #32] //Load y6 into r11 + and r5, r0, r11 //Exec t3 = y3 & y6; into r5 + eor r12, r5, r12 //Exec t4 = t3 ^ t2; into r12 + ldr r5, [sp, #40] //Load y20 into r5 + str r7, [sp, #32] //Store r7/y4 on stack + eor r12, r12, r5 //Exec t17 = t4 ^ y20; into r12 + eor r1, r12, r1 //Exec t21 = t17 ^ t14; into r1 + and r12, r1, r3 //Exec t26 = t21 & t23; into r12 + eor r5, r2, r12 //Exec t27 = t24 ^ t26; into r5 + eor r12, r14, r12 //Exec t31 = t22 ^ t26; into r12 + eor r1, r1, r14 //Exec t25 = t21 ^ t22; into r1 + and r7, r1, r5 //Exec t28 = t25 & t27; into r7 + eor r14, r7, r14 //Exec t29 = t28 ^ t22; into r14 + and r4, r14, r4 //Exec z14 = t29 & y2; into r4 + and r8, r14, r8 //Exec z5 = t29 & y7; into r8 + eor r7, r3, r2 //Exec t30 = t23 ^ t24; into r7 + and r12, r12, r7 //Exec t32 = t31 & t30; into r12 + eor r12, r12, r2 //Exec t33 = t32 ^ t24; into r12 + eor r7, r5, r12 //Exec t35 = t27 ^ t33; into r7 + and 
r2, r2, r7 //Exec t36 = t24 & t35; into r2 + eor r5, r5, r2 //Exec t38 = t27 ^ t36; into r5 + and r5, r14, r5 //Exec t39 = t29 & t38; into r5 + eor r1, r1, r5 //Exec t40 = t25 ^ t39; into r1 + eor r5, r14, r1 //Exec t43 = t29 ^ t40; into r5 + ldr.w r7, [sp, #4] //Load y16 into r7 + and r7, r5, r7 //Exec z3 = t43 & y16; into r7 + eor r8, r7, r8 //Exec tc12 = z3 ^ z5; into r8 + str r8, [sp, #40] //Store r8/tc12 on stack + ldr r8, [sp, #8] //Load y13 into r8 + and r8, r5, r8 //Exec z12 = t43 & y13; into r8 + and r10, r1, r10 //Exec z13 = t40 & y5; into r10 + and r6, r1, r6 //Exec z4 = t40 & y1; into r6 + eor r6, r7, r6 //Exec tc6 = z3 ^ z4; into r6 + eor r3, r3, r12 //Exec t34 = t23 ^ t33; into r3 + eor r3, r2, r3 //Exec t37 = t36 ^ t34; into r3 + eor r1, r1, r3 //Exec t41 = t40 ^ t37; into r1 + ldr.w r5, [sp, #16] //Load y10 into r5 + and r2, r1, r5 //Exec z8 = t41 & y10; into r2 + and r9, r1, r9 //Exec z17 = t41 & y8; into r9 + str r9, [sp, #16] //Store r9/z17 on stack + eor r5, r12, r3 //Exec t44 = t33 ^ t37; into r5 + ldr r9, [sp, #28] //Load y15 into r9 + ldr.w r7, [sp, #44] //Load y12 into r7 + and r9, r5, r9 //Exec z0 = t44 & y15; into r9 + and r7, r5, r7 //Exec z9 = t44 & y12; into r7 + and r0, r3, r0 //Exec z10 = t37 & y3; into r0 + and r3, r3, r11 //Exec z1 = t37 & y6; into r3 + eor r3, r3, r9 //Exec tc5 = z1 ^ z0; into r3 + eor r3, r6, r3 //Exec tc11 = tc6 ^ tc5; into r3 + ldr r11, [sp, #32] //Load y4 into r11 + ldr.w r5, [sp, #20] //Load y17 into r5 + and r11, r12, r11 //Exec z11 = t33 & y4; into r11 + eor r14, r14, r12 //Exec t42 = t29 ^ t33; into r14 + eor r1, r14, r1 //Exec t45 = t42 ^ t41; into r1 + and r5, r1, r5 //Exec z7 = t45 & y17; into r5 + eor r6, r5, r6 //Exec tc8 = z7 ^ tc6; into r6 + ldr r5, [sp, #24] //Load y14 into r5 + str r4, [sp, #32] //Store r4/z14 on stack + and r1, r1, r5 //Exec z16 = t45 & y14; into r1 + ldr r5, [sp, #12] //Load y11 into r5 + ldr r4, [sp, #36] //Load y9 into r4 + and r5, r14, r5 //Exec z6 = t42 & y11; into r5 + eor 
r5, r5, r6 //Exec tc16 = z6 ^ tc8; into r5 + and r4, r14, r4 //Exec z15 = t42 & y9; into r4 + eor r14, r4, r5 //Exec tc20 = z15 ^ tc16; into r14 + eor r4, r4, r1 //Exec tc1 = z15 ^ z16; into r4 + eor r1, r0, r4 //Exec tc2 = z10 ^ tc1; into r1 + eor r0, r1, r11 //Exec tc21 = tc2 ^ z11; into r0 + eor r7, r7, r1 //Exec tc3 = z9 ^ tc2; into r7 + eor r1, r7, r5 //Exec S0 = tc3 ^ tc16; into r1 + eor r7, r7, r3 //Exec S3 = tc3 ^ tc11; into r7 + eor r3, r7, r5 //Exec S1 = S3 ^ tc16 ^ 1; into r3 + eor r11, r10, r4 //Exec tc13 = z13 ^ tc1; into r11 + ldr.w r4, [sp, #0] //Load U7 into r4 + and r12, r12, r4 //Exec z2 = t33 & U7; into r12 + eor r9, r9, r12 //Exec tc4 = z0 ^ z2; into r9 + eor r12, r8, r9 //Exec tc7 = z12 ^ tc4; into r12 + eor r2, r2, r12 //Exec tc9 = z8 ^ tc7; into r2 + eor r2, r6, r2 //Exec tc10 = tc8 ^ tc9; into r2 + ldr.w r4, [sp, #32] //Load z14 into r4 + eor r12, r4, r2 //Exec tc17 = z14 ^ tc10; into r12 + eor r0, r0, r12 //Exec S5 = tc21 ^ tc17; into r0 + eor r6, r12, r14 //Exec tc26 = tc17 ^ tc20; into r6 + ldr.w r4, [sp, #16] //Load z17 into r4 + ldr r12, [sp, #40] //Load tc12 into r12 + eor r6, r6, r4 //Exec S2 = tc26 ^ z17 ^ 1; into r6 + eor r12, r9, r12 //Exec tc14 = tc4 ^ tc12; into r12 + eor r14, r11, r12 //Exec tc18 = tc13 ^ tc14; into r14 + eor r2, r2, r14 //Exec S6 = tc10 ^ tc18 ^ 1; into r2 + eor r11, r8, r14 //Exec S7 = z12 ^ tc18 ^ 1; into r11 + ldr r14, [sp, #52] // restore link register + eor r8, r12, r7 //Exec S4 = tc14 ^ S3; into r8 + bx lr + // [('r0', 'S5'), ('r1', 'S0'), ('r2', 'S6'), ('r3', 'S1'), + // ('r6', 'S2'),('r7', 'S3'), ('r8', 'S4'), ('r11', 'S7')] + +/****************************************************************************** +* Computation of the MixColumns transformation in the fixsliced representation. +* For fully-fixsliced implementations, it is used for rounds i s.t. (i%4) == 0. +* For semi-fixsliced implementations, it is used for rounds i s.t. (i%2) == 0. 
+******************************************************************************/ +.align 2 +mixcolumns_0: + str r14, [sp, #52] // store link register + movw r12, #0x0303 + movt r12, #0x0303 + mc_0_2 r12, 6, 2, 26, 18 + ldr r14, [sp, #52] // restore link register + bx lr + +/****************************************************************************** +* Computation of the MixColumns transformation in the fixsliced representation. +* For fully-fixsliced implementations only, for round i s.t. (i%4) == 1. +******************************************************************************/ +.align 2 +mixcolumns_1: + str r14, [sp, #52] // store link register + movw r14, #0x0f0f + movt r14, #0x0f0f // r14<- 0x0f0f0f0f (mask for BYTE_ROR_4) + and r5, r14, r1, lsr #4 // r5 <- (S0 >> 4) & 0x0f0f0f0f + and r9, r14, r1 // r9 <- S0 & 0x0f0f0f0f + orr r5, r5, r9, lsl #4 // r5 <- BYTE_ROR_4(S0) + eor r4, r1, r5, ror #8 // r4 <- S0 ^ (BYTE_ROR_4(S0) >>> 8) + mov.w r1, r5, ror #8 // r1 <- (BYTE_ROR_4(S0) >>> 8) + and r5, r14, r11, lsr #4 // r5 <- (S7 >> 4) & 0x0f0f0f0f + and r9, r14, r11 // r9 <- S7 & 0x0f0f0f0f + orr r5, r5, r9, lsl #4 // r5 <- BYTE_ROR_4(S7) + eor r12, r11, r5, ror #8 // r12<- S7 ^ (BYTE_ROR_4(S7) >>> 8) + eor r10, r4, r12 // r10<- r4 ^ r12 + eor r11, r10 // r11<- S7 ^ r4 ^ r12 + eor r11, r11, r12, ror #16 // r11<- r11 ^ (r12 >>> 16) + and r5, r14, r2, lsr #4 // r5 <- (S6 >> 4) & 0x0f0f0f0f + and r9, r14, r2 // r9 <- S6 & 0x0f0f0f0f + orr r5, r5, r9, lsl #4 // r5 <- BYTE_ROR_4(S6) + eor r10, r10, r5, ror #8 // r10<- r10 ^ (BYTE_ROR_4(S6) >>> 8) + eor r12, r2, r5, ror #8 // r12<- S6 ^ (BYTE_ROR_4(S6) >>> 8) + eor r10, r10, r12, ror #16 // r10<- r10 ^ (r12 >>> 16) + and r5, r14, r0, lsr #4 // r5 <- (S5 >> 4) & 0x0f0f0f0f + and r9, r14, r0 // r9 <- S5 & 0x0f0f0f0f + orr r5, r5, r9, lsl #4 // r5 <- BYTE_ROR_4(S5) + eor r9, r12, r5, ror #8 // r9 <- r12 ^ (BYTE_ROR_4(S5) >>> 8) + eor r12, r0, r5, ror #8 // r12<- S5 ^ (BYTE_ROR_4(S5) >>> 8) + eor r9, r9, r12, ror #16 // 
r9 <- (r9 ^ r12 >>> 16) + eor r0, r4, r12 // r0 <- r12 ^ S0 ^ (BYTE_ROR_4(S0) >>> 8) + and r5, r14, r8, lsr #4 // r5 <- (S4 >> 4) & 0x0f0f0f0f + and r2, r14, r8 // r2 <- S4 & 0x0f0f0f0f + orr r2, r5, r2, lsl #4 // r2 <- BYTE_ROR_4(S4) + eor r0, r0, r2, ror #8 // r0 <- r0 ^ (BYTE_ROR_4(S4) >>> 8) + eor r2, r8, r2, ror #8 // r2 <- S4 ^ (BYTE_ROR_4(S4) >>> 8) + eor r8, r0, r2, ror #16 // r8 <- r0 ^ (r2 >>> 16) + eor r2, r4 // r2 <- r2 ^ S0 ^ (BYTE_ROR_4(S0) >>> 8) + and r5, r14, r7, lsr #4 // r5 <- (S3 >> 4) & 0x0f0f0f0f + and r0, r14, r7 // r0 <- S3 & 0x0f0f0f0f + orr r0, r5, r0, lsl #4 // r0 <- BYTE_ROR_4(S3) + eor r2, r2, r0, ror #8 // r2 <- r2 ^ (BYTE_ROR_4(S3) >>> 8) + eor r0, r7, r0, ror #8 // r0 <- S3 ^ (BYTE_ROR_4(S3) >>> 8) + eor r7, r2, r0, ror #16 // r7 <- r2 ^ (r0 >>> 16) + and r5, r14, r6, lsr #4 // r5 <- (S2 >> 4) & 0x0f0f0f0f + and r2, r14, r6 // r2 <- S2 & 0x0f0f0f0f + orr r2, r5, r2, lsl #4 // r2 <- BYTE_ROR_4(S2) + eor r0, r0, r2, ror #8 // r0 <- r0 ^ (BYTE_ROR_4(S2) >>> 8) + eor r2, r6, r2, ror #8 // r2 <- S2 ^ (BYTE_ROR_4(S2) >>> 8) + eor r6, r0, r2, ror #16 // r6 <- r0 ^ (r2 >>> 16) + and r5, r14, r3, lsr #4 // r5 <- (S1 >> 4) & 0x0f0f0f0f + and r0, r14, r3 // r0 <- S1 & 0x0f0f0f0f + orr r0, r5, r0, lsl #4 // r0 <- BYTE_ROR_4(S1) + ldr r14, [sp, #52] // restore link register + eor r2, r2, r0, ror #8 // r2 <- r2 ^ (BYTE_ROR_4(S1) >>> 8) + eor r0, r3, r0, ror #8 // r0 <- S1 ^ (BYTE_ROR_4(S1) >>> 8) + eor r5, r2, r0, ror #16 // r5 <- r2 <- (r0 >>> 16) + eor r1, r0, r1 // r1 <- r0 ^ BYTE_ROR_4(S0) >>> 8 + eor r4, r1, r4, ror #16 // r4 <- r4 ^ (r0 >>> 16) + bx lr + +/****************************************************************************** +* Computation of the MixColumns transformation in the fixsliced representation. +* For fully-fixsliced implementations only, for rounds i s.t. (i%4) == 2. 
+******************************************************************************/ +.align 2 +mixcolumns_2: + str r14, [sp, #52] // store link register + movw r12, #0x3f3f + movt r12, #0x3f3f + mc_0_2 r12, 2, 6, 30, 22 + ldr r14, [sp, #52] // restore link register + bx lr + +/****************************************************************************** +* Computation of the MixColumns transformation in the fixsliced representation. +* For fully-fixsliced implementations, it is used for rounds i s.t. (i%4) == 3. +* For semi-fixsliced implementations, it is used for rounds i s.t. (i%2) == 1. +* Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm. +******************************************************************************/ +.align 2 +mixcolumns_3: + eor r12, r11, r11, ror #8 // r12<- S7 ^ (S7 >>> 8) + eor r4, r1, r1, ror #8 // r4 <- S0 ^ (S0 >>> 8) + eor r11, r4, r11, ror #8 // r11<- S0 ^ (S0 >>> 8) ^ (S7 >>> 8) + eor r11, r11, r12, ror #16 // r11<- r11 ^ (S7 >>> 16) ^ (S7 >>> 24) + eor r10, r12, r2, ror #8 // r10<- S7 ^ (S7 >>> 8) ^ (S6 >>> 8) + eor r12, r2, r2, ror #8 // r12<- S6 ^ (S6 >>> 8) + eor r10, r10, r12, ror #16 // r10<- r10 ^ (S6 >>> 16) ^ (S6 >>> 24) + eor r10, r4 // r10<- r10 ^ S0 ^ (S0 >>> 8) + eor r9, r12, r0, ror #8 // r9 <- S6 ^ (S6 >>> 8) ^ (S5 >>> 8) + eor r12, r0, r0, ror #8 // r12<- S5 ^ (S5 >>> 8) + eor r9, r9, r12, ror #16 // r9 <- r9 ^ (S5 >>> 16) ^ (S5 >>> 24) + eor r2, r8, r8, ror #8 // r2 <- S4 ^ (S4 >>> 8) + eor r8, r12, r8, ror #8 // r8 <- S5 ^ (S5 >>> 8) ^ (S4 >>> 8) + eor r8, r4 // r8 <- r8 ^ S0 ^ (S0 >>> 8) + eor r8, r8, r2, ror #16 // r8 <- r8 ^ (S4 >>> 16) ^ (S4 >>> 24) + eor r12, r7, r7, ror #8 // r12<- S3 ^ (S3 >>> 8) + eor r7, r2, r7, ror #8 // r7 <- S4 ^ (S4 >>> 8) ^ (S3 >>> 8) + eor r7, r4 // r7 <- r7 ^ S0 ^ (S0 >>> 8) + eor r7, r7, r12, ror #16 // r7 <- r7 ^ (S3 >>> 16) ^ (S3 >>> 24) + eor r2, r6, r6, ror #8 // r2 <- S2 ^ (S2 >>> 8) + eor r6, r12, r6, ror #8 // r6 <- S3 ^ (S3 >>> 8) ^ (S2 >>> 8) + eor 
r6, r6, r2, ror #16 // r6 <- r6 ^ (S2 >>> 16) ^ (S2 >>> 24) + eor r12, r3, r3, ror #8 // r12<- S1 ^ (S1 >>> 8) + eor r5, r2, r3, ror #8 // r5 <- S2 ^ (S2 >>> 8) ^ (S1 >>> 8) + eor r5, r5, r12, ror #16 // r5 <- r5 ^ (S1 >>> 16) ^ (S1 >>> 24) + eor r4, r12, r4, ror #16 // r4 <- S1 ^ (S1 >>> 8) ^ (r4 >>> 16) + eor r4, r4, r1, ror #8 // r4 <- r4 ^ (S0 >>> 8) + bx lr + +/****************************************************************************** +* Applies the ShiftRows transformation twice (i.e. SR^2) on the internal state. +******************************************************************************/ +.align 2 +double_shiftrows: + movw r10, #0x0f00 + movt r10, #0x0f00 // r10<- 0x0f000f00 (mask) + swpmv r0, r0, r0, r0, r10, #4, r12 + swpmv r1, r1, r1, r1, r10, #4, r12 + swpmv r2, r2, r2, r2, r10, #4, r12 + swpmv r3, r3, r3, r3, r10, #4, r12 + swpmv r6, r6, r6, r6, r10, #4, r12 + swpmv r7, r7, r7, r7, r10, #4, r12 + swpmv r8, r8, r8, r8, r10, #4, r12 + swpmv r11, r11, r11, r11, r10, #4, r12 + bx lr + +/****************************************************************************** +* Fully-fixsliced implementation of AES-128. +* +* Two blocks are encrypted in parallel, without any operating mode. +* +* Note that additional 4 bytes are allocated on the stack as the function takes +* 5 arguments as input. 
+******************************************************************************/ +@ void aes128_encrypt_ffs(u8* ctext, u8* ctext_bis, const u8* ptext, +@ const u8* ptext_bis, const u32* rkey); +.global aes128_encrypt_ffs +.type aes128_encrypt_ffs,%function +.align 2 +aes128_encrypt_ffs: + push {r0-r12,r14} + sub.w sp, #56 // allow space on the stack for tmp var + ldr.w r4, [r2] // load the 1st 128-bit blocks in r4-r7 + ldr r5, [r2, #4] + ldr r6, [r2, #8] + ldr r7, [r2, #12] + ldr.w r8, [r3] // load the 2nd 128-bit blocks in r8-r11 + ldr r9, [r3, #4] + ldr r10,[r3, #8] + ldr r11,[r3, #12] + ldr.w r1, [sp, #112] // load 'rkey' argument from the stack + str.w r1, [sp, #48] // store it there for 'add_round_key' + bl packing // pack the 2 input blocks + bl ark_sbox // ark + sbox (round 0) + bl mixcolumns_0 // mixcolumns (round 0) + bl ark_sbox // ark + sbox (round 1) + bl mixcolumns_1 // mixcolumns (round 1) + bl ark_sbox // ark + sbox (round 2) + bl mixcolumns_2 // mixcolumns (round 2) + bl ark_sbox // ark + sbox (round 3) + bl mixcolumns_3 // mixcolumns (round 3) + bl ark_sbox // ark + sbox (round 4) + bl mixcolumns_0 // mixcolumns (round 4) + bl ark_sbox // ark + sbox (round 5) + bl mixcolumns_1 // mixcolumns (round 5) + bl ark_sbox // ark + sbox (round 6) + bl mixcolumns_2 // mixcolumns (round 6) + bl ark_sbox // ark + sbox (round 7) + bl mixcolumns_3 // mixcolumns (round 7) + bl ark_sbox // ark + sbox (round 8) + bl mixcolumns_0 // mixcolumns (round 8) + bl ark_sbox // ark + sbox (round 9) + bl double_shiftrows // to resynchronize with the classical rep + ldr r14, [sp, #48] // --------------------------------------- + ldmia r14!, {r4,r5,r10,r12} // + eor r4, r1 // + eor r5, r3 // + eor r6, r10 // + eor r7, r12 // Last add_round_key + ldmia r14!, {r1,r3,r10,r12} // + eor r8, r1 // + eor r9, r0, r3 // + eor r10, r2 // + eor r11, r12 // --------------------------------------- + bl unpacking // unpack the internal state + ldrd r0, r1, [sp, #56] // restore the addr to 
store the ciphertext + add.w sp, #64 // restore the stack pointer + str.w r4, [r0] // store the ciphertext + str r5, [r0, #4] + str r6, [r0, #8] + str r7, [r0, #12] + str.w r8, [r1] // store the ciphertext + str r9, [r1, #4] + str r10,[r1, #8] + str r11,[r1, #12] + pop {r2-r12, r14} // restore context + bx lr + +/****************************************************************************** +* Fully-fixsliced implementation of AES-256. +* +* Two blocks are encrypted in parallel, without any operating mode. +* +* Note that additional 4 bytes are allocated on the stack as the function takes +* 5 arguments as input. +******************************************************************************/ +@ void aes256_encrypt_ffs(u8* ctext, u8* ctext_bis, const u8* ptext, +@ const u8* ptext_bis, const u32* rkey); +.global aes256_encrypt_ffs +.type aes256_encrypt_ffs,%function +.align 2 +aes256_encrypt_ffs: + push {r0-r12,r14} + sub.w sp, #56 // allow space on the stack for tmp var + ldr.w r4, [r2] // load the 1st 128-bit blocks in r4-r7 + ldr r5, [r2, #4] + ldr r6, [r2, #8] + ldr r7, [r2, #12] + ldr.w r8, [r3] // load the 2nd 128-bit blocks in r8-r11 + ldr r9, [r3, #4] + ldr r10,[r3, #8] + ldr r11,[r3, #12] + ldr.w r1, [sp, #112] // load 'rkey' argument from the stack + str.w r1, [sp, #48] // store it there for 'add_round_key' + bl packing // pack the 2 input blocks + bl ark_sbox // ark + sbox (round 0) + bl mixcolumns_0 // mixcolumns (round 0) + bl ark_sbox // ark + sbox (round 1) + bl mixcolumns_1 // mixcolumns (round 1) + bl ark_sbox // ark + sbox (round 2) + bl mixcolumns_2 // mixcolumns (round 2) + bl ark_sbox // ark + sbox (round 3) + bl mixcolumns_3 // mixcolumns (round 3) + bl ark_sbox // ark + sbox (round 4) + bl mixcolumns_0 // mixcolumns (round 4) + bl ark_sbox // ark + sbox (round 5) + bl mixcolumns_1 // mixcolumns (round 5) + bl ark_sbox // ark + sbox (round 6) + bl mixcolumns_2 // mixcolumns (round 6) + bl ark_sbox // ark + sbox (round 7) + bl mixcolumns_3 // 
mixcolumns (round 7) + bl ark_sbox // ark + sbox (round 8) + bl mixcolumns_0 // mixcolumns (round 8) + bl ark_sbox // ark + sbox (round 9) + bl mixcolumns_1 // mixcolumns (round 9) + bl ark_sbox // ark + sbox (round 10) + bl mixcolumns_2 // mixcolumns (round 10) + bl ark_sbox // ark + sbox (round 11) + bl mixcolumns_3 // mixcolumns (round 11) + bl ark_sbox // ark + sbox (round 12) + bl mixcolumns_0 // mixcolumns (round 12) + bl ark_sbox // ark + sbox (round 13) + bl double_shiftrows // to resynchronize with the classical rep + ldr r14, [sp, #48] // --------------------------------------- + ldmia r14!, {r4,r5,r10,r12} // + eor r4, r1 // + eor r5, r3 // + eor r6, r10 // + eor r7, r12 // Last add_round_key + ldmia r14!, {r1,r3,r10,r12} // + eor r8, r1 // + eor r9, r0, r3 // + eor r10, r2 // + eor r11, r12 // --------------------------------------- + bl unpacking // unpack the internal state + ldrd r0, r1, [sp, #56] // restore the addr to store the ciphertext + add.w sp, #64 // restore the stack pointer + str.w r4, [r0] // store the ciphertext + str r5, [r0, #4] + str r6, [r0, #8] + str r7, [r0, #12] + str.w r8, [r1] // store the ciphertext + str r9, [r1, #4] + str r10,[r1, #8] + str r11,[r1, #12] + pop {r2-r12, r14} // restore context + bx lr \ No newline at end of file diff --git a/common/aes-keyschedule.S b/common/aes-keyschedule.S new file mode 100644 index 0000000..246bc5f --- /dev/null +++ b/common/aes-keyschedule.S @@ -0,0 +1,851 @@ +/****************************************************************************** +* ARM assembly implementations of the AES-128 and AES-256 key schedule to +* match fixslicing. +* Note that those implementations are fully bitsliced and do not rely on any +* Look-Up Table (LUT). +* +* See the paper at https://eprint.iacr.org/2020/1123.pdf for more details.
+* +* @author Alexandre Adomnicai, Nanyang Technological University, Singapore +* alexandre.adomnicai@ntu.edu.sg +* +* @date October 2020 +******************************************************************************/ + +.syntax unified +.thumb + +/****************************************************************************** +* Macro to compute the SWAPMOVE technique: swap the bits in 'in1' masked by 'm' +* by the bits in 'in0' masked by 'm << n' and put the results in 'out0', 'out1' +******************************************************************************/ +.macro swpmv out0, out1, in0, in1, m, n, tmp + eor \tmp, \in1, \in0, lsr \n + and \tmp, \m + eor \out1, \in1, \tmp + eor \out0, \in0, \tmp, lsl \n +.endm + +/****************************************************************************** +* Packing routine. Note that it is the same as the one used in the encryption +* function so some code size could be saved by merging the two files. +******************************************************************************/ +.align 2 +packing: + movw r3, #0x0f0f + movt r3, #0x0f0f // r3 <- 0x0f0f0f0f (mask for SWAPMOVE) + eor r2, r3, r3, lsl #2 // r2 <- 0x33333333 (mask for SWAPMOVE) + eor r1, r2, r2, lsl #1 // r1 <- 0x55555555 (mask for SWAPMOVE) + swpmv r8, r4, r8, r4, r1, #1, r12 + swpmv r9, r5, r9, r5, r1, #1, r12 + swpmv r10, r6, r10, r6, r1, #1, r12 + swpmv r11, r7, r11, r7, r1, #1, r12 + swpmv r0, r4, r5, r4, r2, #2, r12 + swpmv r9, r5, r9, r8, r2, #2, r12 + swpmv r7, r8, r7, r6, r2, #2, r12 + swpmv r11, r2, r11, r10, r2, #2, r12 + swpmv r8, r4, r8, r4, r3, #4, r12 + swpmv r10, r6, r7, r0, r3, #4, r12 + swpmv r11, r7, r11, r9, r3, #4, r12 + swpmv r9, r5, r2, r5, r3, #4, r12 + bx lr + +/****************************************************************************** +* Subroutine that computes S-box. Note that the same code is used in the +* encryption function, so some code size could be saved by merging the 2 files. 
+* Credits to https://github.com/Ko-/aes-armcortexm. +******************************************************************************/ +.align 2 +sbox: + str r14, [sp, #52] + eor r1, r7, r9 //Exec y14 = U3 ^ U5; into r1 + eor r3, r4, r10 //Exec y13 = U0 ^ U6; into r3 + eor r2, r3, r1 //Exec y12 = y13 ^ y14; into r2 + eor r0, r8, r2 //Exec t1 = U4 ^ y12; into r0 + eor r14, r0, r9 //Exec y15 = t1 ^ U5; into r14 + and r12, r2, r14 //Exec t2 = y12 & y15; into r12 + eor r8, r14, r11 //Exec y6 = y15 ^ U7; into r8 + eor r0, r0, r5 //Exec y20 = t1 ^ U1; into r0 + str.w r2, [sp, #44] //Store r2/y12 on stack + eor r2, r4, r7 //Exec y9 = U0 ^ U3; into r2 + str r0, [sp, #40] //Store r0/y20 on stack + eor r0, r0, r2 //Exec y11 = y20 ^ y9; into r0 + str r2, [sp, #36] //Store r2/y9 on stack + and r2, r2, r0 //Exec t12 = y9 & y11; into r2 + str r8, [sp, #32] //Store r8/y6 on stack + eor r8, r11, r0 //Exec y7 = U7 ^ y11; into r8 + eor r9, r4, r9 //Exec y8 = U0 ^ U5; into r9 + eor r6, r5, r6 //Exec t0 = U1 ^ U2; into r6 + eor r5, r14, r6 //Exec y10 = y15 ^ t0; into r5 + str r14, [sp, #28] //Store r14/y15 on stack + eor r14, r5, r0 //Exec y17 = y10 ^ y11; into r14 + str.w r1, [sp, #24] //Store r1/y14 on stack + and r1, r1, r14 //Exec t13 = y14 & y17; into r1 + eor r1, r1, r2 //Exec t14 = t13 ^ t12; into r1 + str r14, [sp, #20] //Store r14/y17 on stack + eor r14, r5, r9 //Exec y19 = y10 ^ y8; into r14 + str.w r5, [sp, #16] //Store r5/y10 on stack + and r5, r9, r5 //Exec t15 = y8 & y10; into r5 + eor r2, r5, r2 //Exec t16 = t15 ^ t12; into r2 + eor r5, r6, r0 //Exec y16 = t0 ^ y11; into r5 + str.w r0, [sp, #12] //Store r0/y11 on stack + eor r0, r3, r5 //Exec y21 = y13 ^ y16; into r0 + str r3, [sp, #8] //Store r3/y13 on stack + and r3, r3, r5 //Exec t7 = y13 & y16; into r3 + str r5, [sp, #4] //Store r5/y16 on stack + str r11, [sp, #0] //Store r11/U7 on stack + eor r5, r4, r5 //Exec y18 = U0 ^ y16; into r5 + eor r6, r6, r11 //Exec y1 = t0 ^ U7; into r6 + eor r7, r6, r7 //Exec y4 = y1 ^ 
U3; into r7 + and r11, r7, r11 //Exec t5 = y4 & U7; into r11 + eor r11, r11, r12 //Exec t6 = t5 ^ t2; into r11 + eor r11, r11, r2 //Exec t18 = t6 ^ t16; into r11 + eor r14, r11, r14 //Exec t22 = t18 ^ y19; into r14 + eor r4, r6, r4 //Exec y2 = y1 ^ U0; into r4 + and r11, r4, r8 //Exec t10 = y2 & y7; into r11 + eor r11, r11, r3 //Exec t11 = t10 ^ t7; into r11 + eor r2, r11, r2 //Exec t20 = t11 ^ t16; into r2 + eor r2, r2, r5 //Exec t24 = t20 ^ y18; into r2 + eor r10, r6, r10 //Exec y5 = y1 ^ U6; into r10 + and r11, r10, r6 //Exec t8 = y5 & y1; into r11 + eor r3, r11, r3 //Exec t9 = t8 ^ t7; into r3 + eor r3, r3, r1 //Exec t19 = t9 ^ t14; into r3 + eor r3, r3, r0 //Exec t23 = t19 ^ y21; into r3 + eor r0, r10, r9 //Exec y3 = y5 ^ y8; into r0 + ldr r11, [sp, #32] //Load y6 into r11 + and r5, r0, r11 //Exec t3 = y3 & y6; into r5 + eor r12, r5, r12 //Exec t4 = t3 ^ t2; into r12 + ldr r5, [sp, #40] //Load y20 into r5 + str r7, [sp, #32] //Store r7/y4 on stack + eor r12, r12, r5 //Exec t17 = t4 ^ y20; into r12 + eor r1, r12, r1 //Exec t21 = t17 ^ t14; into r1 + and r12, r1, r3 //Exec t26 = t21 & t23; into r12 + eor r5, r2, r12 //Exec t27 = t24 ^ t26; into r5 + eor r12, r14, r12 //Exec t31 = t22 ^ t26; into r12 + eor r1, r1, r14 //Exec t25 = t21 ^ t22; into r1 + and r7, r1, r5 //Exec t28 = t25 & t27; into r7 + eor r14, r7, r14 //Exec t29 = t28 ^ t22; into r14 + and r4, r14, r4 //Exec z14 = t29 & y2; into r4 + and r8, r14, r8 //Exec z5 = t29 & y7; into r8 + eor r7, r3, r2 //Exec t30 = t23 ^ t24; into r7 + and r12, r12, r7 //Exec t32 = t31 & t30; into r12 + eor r12, r12, r2 //Exec t33 = t32 ^ t24; into r12 + eor r7, r5, r12 //Exec t35 = t27 ^ t33; into r7 + and r2, r2, r7 //Exec t36 = t24 & t35; into r2 + eor r5, r5, r2 //Exec t38 = t27 ^ t36; into r5 + and r5, r14, r5 //Exec t39 = t29 & t38; into r5 + eor r1, r1, r5 //Exec t40 = t25 ^ t39; into r1 + eor r5, r14, r1 //Exec t43 = t29 ^ t40; into r5 + ldr.w r7, [sp, #4] //Load y16 into r7 + and r7, r5, r7 //Exec z3 = t43 & y16; 
into r7 + eor r8, r7, r8 //Exec tc12 = z3 ^ z5; into r8 + str r8, [sp, #40] //Store r8/tc12 on stack + ldr r8, [sp, #8] //Load y13 into r8 + and r8, r5, r8 //Exec z12 = t43 & y13; into r8 + and r10, r1, r10 //Exec z13 = t40 & y5; into r10 + and r6, r1, r6 //Exec z4 = t40 & y1; into r6 + eor r6, r7, r6 //Exec tc6 = z3 ^ z4; into r6 + eor r3, r3, r12 //Exec t34 = t23 ^ t33; into r3 + eor r3, r2, r3 //Exec t37 = t36 ^ t34; into r3 + eor r1, r1, r3 //Exec t41 = t40 ^ t37; into r1 + ldr.w r5, [sp, #16] //Load y10 into r5 + and r2, r1, r5 //Exec z8 = t41 & y10; into r2 + and r9, r1, r9 //Exec z17 = t41 & y8; into r9 + str r9, [sp, #16] //Store r9/z17 on stack + eor r5, r12, r3 //Exec t44 = t33 ^ t37; into r5 + ldr r9, [sp, #28] //Load y15 into r9 + ldr.w r7, [sp, #44] //Load y12 into r7 + and r9, r5, r9 //Exec z0 = t44 & y15; into r9 + and r7, r5, r7 //Exec z9 = t44 & y12; into r7 + and r0, r3, r0 //Exec z10 = t37 & y3; into r0 + and r3, r3, r11 //Exec z1 = t37 & y6; into r3 + eor r3, r3, r9 //Exec tc5 = z1 ^ z0; into r3 + eor r3, r6, r3 //Exec tc11 = tc6 ^ tc5; into r3 + ldr r11, [sp, #32] //Load y4 into r11 + ldr.w r5, [sp, #20] //Load y17 into r5 + and r11, r12, r11 //Exec z11 = t33 & y4; into r11 + eor r14, r14, r12 //Exec t42 = t29 ^ t33; into r14 + eor r1, r14, r1 //Exec t45 = t42 ^ t41; into r1 + and r5, r1, r5 //Exec z7 = t45 & y17; into r5 + eor r6, r5, r6 //Exec tc8 = z7 ^ tc6; into r6 + ldr r5, [sp, #24] //Load y14 into r5 + str r4, [sp, #32] //Store r4/z14 on stack + and r1, r1, r5 //Exec z16 = t45 & y14; into r1 + ldr r5, [sp, #12] //Load y11 into r5 + ldr r4, [sp, #36] //Load y9 into r4 + and r5, r14, r5 //Exec z6 = t42 & y11; into r5 + eor r5, r5, r6 //Exec tc16 = z6 ^ tc8; into r5 + and r4, r14, r4 //Exec z15 = t42 & y9; into r4 + eor r14, r4, r5 //Exec tc20 = z15 ^ tc16; into r14 + eor r4, r4, r1 //Exec tc1 = z15 ^ z16; into r4 + eor r1, r0, r4 //Exec tc2 = z10 ^ tc1; into r1 + eor r0, r1, r11 //Exec tc21 = tc2 ^ z11; into r0 + eor r7, r7, r1 //Exec tc3 
= z9 ^ tc2; into r7 + eor r1, r7, r5 //Exec S0 = tc3 ^ tc16; into r1 + eor r7, r7, r3 //Exec S3 = tc3 ^ tc11; into r7 + eor r3, r7, r5 //Exec S1 = S3 ^ tc16 ^ 1; into r3 + eor r11, r10, r4 //Exec tc13 = z13 ^ tc1; into r11 + ldr.w r4, [sp, #0] //Load U7 into r4 + and r12, r12, r4 //Exec z2 = t33 & U7; into r12 + eor r9, r9, r12 //Exec tc4 = z0 ^ z2; into r9 + eor r12, r8, r9 //Exec tc7 = z12 ^ tc4; into r12 + eor r2, r2, r12 //Exec tc9 = z8 ^ tc7; into r2 + eor r2, r6, r2 //Exec tc10 = tc8 ^ tc9; into r2 + ldr.w r4, [sp, #32] //Load z14 into r4 + eor r12, r4, r2 //Exec tc17 = z14 ^ tc10; into r12 + eor r0, r0, r12 //Exec S5 = tc21 ^ tc17; into r0 + eor r6, r12, r14 //Exec tc26 = tc17 ^ tc20; into r6 + ldr.w r4, [sp, #16] //Load z17 into r4 + ldr r12, [sp, #40] //Load tc12 into r12 + eor r6, r6, r4 //Exec S2 = tc26 ^ z17 ^ 1; into r6 + eor r12, r9, r12 //Exec tc14 = tc4 ^ tc12; into r12 + eor r14, r11, r12 //Exec tc18 = tc13 ^ tc14; into r14 + eor r2, r2, r14 //Exec S6 = tc10 ^ tc18 ^ 1; into r2 + eor r11, r8, r14 //Exec S7 = z12 ^ tc18 ^ 1; into r11 + ldr r14, [sp, #52] // restore link register + eor r8, r12, r7 //Exec S4 = tc14 ^ S3; into r8 + bx lr + // [('r0', 'S5'), ('r1', 'S0'), ('r2', 'S6'), ('r3', 'S1'), + // ('r6', 'S2'),('r7', 'S3'), ('r8', 'S4'), ('r11', 'S7')] + +/****************************************************************************** +* Subroutine that XORs the columns after the S-box during the AES-128 key +* schedule round function, for rounds i such that (i % 4) == 0. +* Note that the code size could be reduced at the cost of some instructions +* since some redundant code is applied on different registers. 
+******************************************************************************/ +.align 2 +aes128_xorcolumns_rotword: + ldr r12, [sp, #56] // restore 'rkeys' address + ldr.w r5, [r12, #28] // load rkey word of rkey from prev round + movw r4, #0xc0c0 + movt r4, #0xc0c0 // r4 <- 0xc0c0c0c0 + eor r11, r5, r11, ror #2 // r11<- r5 ^ (r11 >>> 2) + bic r11, r4, r11 // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 + orr r11, r11, r9 // r11<- r11 | r9 + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c + orr r11, r11, r9 // r11<- r11 | r9 + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 + orr r11, r11, r9 // r11<- r11 | r9 + mvn r9, r5 // NOT omitted in sbox + ldr.w r5, [r12, #24] // load rkey word of rkey from prev round + str r9, [r12, #28] // store new rkey word after NOT + str r11, [r12, #60] // store new rkey word in 'rkeys' + eor r10, r5, r2, ror #2 // r10<- r5 ^ (r2 >>> 2) + bic r10, r4, r10 // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 + orr r10, r10, r9 // r10<- r10 | r9 + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c + orr r10, r10, r9 // r10<- r10 | r9 + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 + orr r10, r10, r9 // r10<- r10 | r9 + mvn r9, r5 // NOT omitted in sbox + ldr.w r2, [r12, #20] // load rkey word of rkey from prev round + str r9, [r12, #24] // store new rkey word after NOT + str r10, [r12, #56] // store new rkey word in 'rkeys' + eor r9, r2, r0, ror #2 // r9 <- r2 ^ (r0 >>> 2) + and r9, r4, r9 // r9 <- r9 & 0xc0c0c0c0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r9, r9, r0 //
r9 <- r9 | r0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r9, r9, r0 // r9 <- r9 | r0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r9, r9, r0 // r9 <- r9 | r0 + ldr.w r2, [r12, #16] // load rkey word of rkey from prev round + str.w r9, [r12, #52] // store new rkey word in 'rkeys' + eor r8, r2, r8, ror #2 // r8 <- r2 ^ (r8 >>> 2) + and r8, r4, r8 // r8 <- r8 & 0xc0c0c0c0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r8, r8, r0 // r8 <- r8 | r0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r8, r8, r0 // r8 <- r8 | r0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r8, r8, r0 // r8 <- r8 | r0 + ldr.w r2, [r12, #12] // load rkey word of rkey from prev round + str.w r8, [r12, #48] // store new rkey word in 'rkeys' + eor r7, r2, r7, ror #2 // r7 <- r2 ^ (r7 >>> 2) + and r7, r4, r7 // r7 <- r7 & 0xc0c0c0c0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r7, r7, r0 // r7 <- r7 | r0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r7, r7, r0 // r7 <- r7 | r0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r7, r7, r0 // r7 <- r7 | r0 + ldr.w r2, [r12, #8] // load rkey word of rkey from prev round + str.w r7, [r12, #44] // store new rkey word in 'rkeys' + eor r6, r2, r6, ror #2 // r6 <- r2 ^ (r6 >>> 2) + bic r6, r4, r6 // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r6, r6, r0 // r6 <- r6 | r0 + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r6, r6, 
r0 // r6 <- r6 | r0 + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r6, r6, r0 // r6 <- r6 | r0 + mvn r0, r2 // NOT omitted in sbox + ldr.w r2, [r12, #4] // load rkey word of rkey from prev round + str.w r0, [r12, #8] // store new rkey word after NOT + str.w r6, [r12, #40] // store new rkey word in 'rkeys' + eor r5, r2, r3, ror #2 // r5 <- r2 ^ (r3 >>> 2) + bic r5, r4, r5 // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r5, r5, r0 // r5 <- r5 | r0 + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r5, r5, r0 // r5 <- r5 | r0 + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r5, r5, r0 // r5 <- r5 | r0 + mvn r0, r2 // NOT omitted in sbox + ldr.w r2, [r12], #32 // load rkey word of rkey from prev round + str.w r0, [r12, #-28] // store new rkey word after NOT + str.w r5, [r12, #4] // store new rkey word in 'rkeys' + eor r3, r2, r1, ror #2 // r3 <- r2 ^ (r1 >>> 2) + and r3, r4, r3 // r3 <- r3 & 0xc0c0c0c0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r3, r3, r0 // r3 <- r3 | r0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r3, r3, r0 // r3 <- r3 | r0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r4, r3, r0 // r4 <- r3 | r0 + str.w r4, [r12] + str.w r12, [sp, #56] // store the new rkeys address on the stack + bx lr + +/****************************************************************************** +* Subroutine that XORs the columns after the S-box during the AES-256 key +* schedule round function, for rounds i such that (i % 4) == 0. +* Differs from 'aes128_xorcolumns_rotword' by the rkeys' indexes to be involved +* in XORs. 
+******************************************************************************/ +.align 2 +aes256_xorcolumns_rotword: + ldr r12, [sp, #56] // restore 'rkeys' address + ldr.w r5, [r12, #28] // load rkey word of rkey from prev round + movw r4, #0xc0c0 + movt r4, #0xc0c0 // r4 <- 0xc0c0c0c0 + eor r11, r5, r11, ror #2 // r11<- r5 ^ (r11 >>> 2) + bic r11, r4, r11 // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 + orr r11, r11, r9 // r11<- r11 | r9 + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c + orr r11, r11, r9 // r11<- r11 | r9 + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 + orr r11, r11, r9 // r11<- r11 | r9 + mvn r9, r5 // NOT omitted in sbox + ldr.w r5, [r12, #24] // load rkey word of rkey from prev round + str r9, [r12, #28] // store new rkey word after NOT + str r11, [r12, #92] // store new rkey word in 'rkeys' + eor r10, r5, r2, ror #2 // r10<- r5 ^ (r2 >>> 2) + bic r10, r4, r10 // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 + orr r10, r10, r9 // r10<- r10 | r9 + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c + orr r10, r10, r9 // r10<- r10 | r9 + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 + orr r10, r10, r9 // r10<- r10 | r9 + mvn r9, r5 // NOT omitted in sbox + ldr.w r2, [r12, #20] // load rkey word of rkey from prev round + str r9, [r12, #24] // store new rkey word after NOT + str r10, [r12, #88] // store new rkey word in 'rkeys' + eor r9, r2, r0, ror #2 // r9 <- r2 ^ (r0 >>> 2) + and r9, r4, r9 // r9 <- r9 & 0xc0c0c0c0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r9, r9, r0 //
r9 <- r9 | r0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r9, r9, r0 // r9 <- r9 | r0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r9, r9, r0 // r9 <- r9 | r0 + ldr.w r2, [r12, #16] // load rkey word of rkey from prev round + str.w r9, [r12, #84] // store new rkey word in 'rkeys' + eor r8, r2, r8, ror #2 // r8 <- r2 ^ (r8 >>> 2) + and r8, r4, r8 // r8 <- r8 & 0xc0c0c0c0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r8, r8, r0 // r8 <- r8 | r0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r8, r8, r0 // r8 <- r8 | r0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r8, r8, r0 // r8 <- r8 | r0 + ldr.w r2, [r12, #12] // load rkey word of rkey from prev round + str.w r8, [r12, #80] // store new rkey word in 'rkeys' + eor r7, r2, r7, ror #2 // r7 <- r2 ^ (r7 >>> 2) + and r7, r4, r7 // r7 <- r7 & 0xc0c0c0c0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r7, r7, r0 // r7 <- r7 | r0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r7, r7, r0 // r7 <- r7 | r0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r7, r7, r0 // r7 <- r7 | r0 + ldr.w r2, [r12, #8] // load rkey word of rkey from prev round + str.w r7, [r12, #76] // store new rkey word in 'rkeys' + eor r6, r2, r6, ror #2 // r6 <- r2 ^ (r6 >>> 2) + bic r6, r4, r6 // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r6, r6, r0 // r6 <- r6 | r0 + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r6, r6, 
r0 // r6 <- r6 | r0 + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r6, r6, r0 // r6 <- r6 | r0 + mvn r0, r2 // NOT omitted in sbox + ldr.w r2, [r12, #4] // load rkey word of rkey from prev round + str.w r0, [r12, #8] // store new rkey word after NOT + str.w r6, [r12, #72] // store new rkey word in 'rkeys' + eor r5, r2, r3, ror #2 // r5 <- r2 ^ (r3 >>> 2) + bic r5, r4, r5 // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r5, r5, r0 // r5 <- r5 | r0 + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r5, r5, r0 // r5 <- r5 | r0 + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r5, r5, r0 // r5 <- r5 | r0 + mvn r0, r2 // NOT omitted in sbox + ldr.w r2, [r12], #32 // load rkey word of rkey from prev round + str.w r0, [r12, #-28] // store new rkey word after NOT + str.w r5, [r12, #36] // store new rkey word in 'rkeys' + eor r3, r2, r1, ror #2 // r3 <- r2 ^ (r1 >>> 2) + and r3, r4, r3 // r3 <- r3 & 0xc0c0c0c0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r3, r3, r0 // r3 <- r3 | r0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r3, r3, r0 // r3 <- r3 | r0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r4, r3, r0 // r4 <- r3 | r0 + str.w r4, [r12, #32] + str.w r12, [sp, #56] // store the new rkeys address on the stack + bx lr + +/****************************************************************************** +* Subroutine that XORs the columns after the S-box during the AES-256 key +* schedule round function, for rounds i such that (i % 4) == 0. 
+* It differs from 'aes256_xorcolumns_rotword' by the omission of the rotword +* operation (i.e. 'ror #26' instead of 'ror #2'). +******************************************************************************/ +.align 2 +aes256_xorcolumns: + ldr r12, [sp, #56] // restore 'rkeys' address + ldr.w r5, [r12, #28] // load rkey word of rkey from prev round + movw r4, #0xc0c0 + movt r4, #0xc0c0 // r4 <- 0xc0c0c0c0 + eor r11, r5, r11, ror #26 // r11<- r5 ^ (r11 >>> 26) + bic r11, r4, r11 // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 + orr r11, r11, r9 // r11<- r11 | r9 + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c + orr r11, r11, r9 // r11<- r11 | r9 + eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) + and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 + orr r11, r11, r9 // r11<- r11 | r9 + mvn r9, r5 // NOT omitted in sbox + ldr.w r5, [r12, #24] // load rkey word of rkey from prev round + str r9, [r12, #28] // store new rkey word after NOT + str r11, [r12, #92] // store new rkey word in 'rkeys' + eor r10, r5, r2, ror #26 // r10<- r5 ^ (r2 >>> 26) + bic r10, r4, r10 // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 + orr r10, r10, r9 // r10<- r10 | r9 + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c + orr r10, r10, r9 // r10<- r10 | r9 + eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) + and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 + orr r10, r10, r9 // r10<- r10 | r9 + mvn r9, r5 // NOT omitted in sbox + ldr.w r2, [r12, #20] // load rkey word of rkey from prev round + str r9, [r12, #24] // store new rkey word after NOT + str r10, [r12, #88] // store new rkey word in 'rkeys' + eor r9, r2, r0, ror #26 // r9 <- r2 ^ (r0 >>> 26) + and r9, r4, r9 // r9 <- r9 &
0xc0c0c0c0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r9, r9, r0 // r9 <- r9 | r0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r9, r9, r0 // r9 <- r9 | r0 + eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r9, r9, r0 // r9 <- r9 | r0 + ldr.w r2, [r12, #16] // load rkey word of rkey from prev round + str.w r9, [r12, #84] // store new rkey word in 'rkeys' + eor r8, r2, r8, ror #26 // r8 <- r2 ^ (r8 >>> 26) + and r8, r4, r8 // r8 <- r8 & 0xc0c0c0c0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r8, r8, r0 // r8 <- r8 | r0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r8, r8, r0 // r8 <- r8 | r0 + eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r8, r8, r0 // r8 <- r8 | r0 + ldr.w r2, [r12, #12] // load rkey word of rkey from prev round + str.w r8, [r12, #80] // store new rkey word in 'rkeys' + eor r7, r2, r7, ror #26 // r7 <- r2 ^ (r7 >>> 26) + and r7, r4, r7 // r7 <- r7 & 0xc0c0c0c0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r7, r7, r0 // r7 <- r7 | r0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r7, r7, r0 // r7 <- r7 | r0 + eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r7, r7, r0 // r7 <- r7 | r0 + ldr.w r2, [r12, #8] // load rkey word of rkey from prev round + str.w r7, [r12, #76] // store new rkey word in 'rkeys' + eor r6, r2, r6, ror #26 // r6 <- r2 ^ (r6 >>> 26) + bic r6, r4, r6 // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r6, 
r6, r0 // r6 <- r6 | r0 + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r6, r6, r0 // r6 <- r6 | r0 + eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r6, r6, r0 // r6 <- r6 | r0 + mvn r0, r2 // NOT omitted in sbox + ldr.w r2, [r12, #4] // load rkey word of rkey from prev round + str.w r0, [r12, #8] // store new rkey word after NOT + str.w r6, [r12, #72] // store new rkey word in 'rkeys' + eor r5, r2, r3, ror #26 // r5 <- r2 ^ (r3 >>> 26) + bic r5, r4, r5 // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox) + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r5, r5, r0 // r5 <- r5 | r0 + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r5, r5, r0 // r5 <- r5 | r0 + eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r5, r5, r0 // r5 <- r5 | r0 + mvn r0, r2 // NOT omitted in sbox + ldr.w r2, [r12], #32 // load rkey word of rkey from prev round + str.w r0, [r12, #-28] // store new rkey word after NOT + str.w r5, [r12, #36] // store new rkey word in 'rkeys' + eor r3, r2, r1, ror #26 // r3 <- r2 ^ (r1 >>> 26) + and r3, r4, r3 // r3 <- r3 & 0xc0c0c0c0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 + orr r3, r3, r0 // r3 <- r3 | r0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c + orr r3, r3, r0 // r3 <- r3 | r0 + eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) + and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 + orr r4, r3, r0 // r4 <- r3 | r0 + str.w r4, [r12, #32] + str.w r12, [sp, #56] // store the new rkeys address on the stack + bx lr + +/****************************************************************************** +* Applies ShiftRows^(-1) on a round key to match the fixsliced representation. 
+******************************************************************************/ +.align 2 +inv_shiftrows_1: + ldr.w r2, [r12, #-32]! // r12 -= 32; r2 <- first of the 8 words in the 32 bytes below the incoming r12 + str r14, [sp, #52] // store link register (r14 is reused as a mask below) + movw r1, #8 // loop counter: 8 words + movw r14, #0x0300 + movt r14, #0x0c0f // r14<- 0x0c0f0300 for ShiftRows^[-1] (passed to the swpmv macro below) +loop_inv_sr_1: + movw r3, #0x3300 + movt r3, #0x3300 // r3 <- 0x33003300 for ShiftRows^[-1] (reloaded each pass: r3 also receives the result) + swpmv r2, r2, r2, r2, r14, 4, r0 + eor r0, r2, r2, lsr #2 // r0 <- r2 ^ (r2 >> 2) + and r0, r3 // r0 <- r0 & 0x33003300 + eor r2, r2, r0 // r2 <- r2 ^ r0 + eor r3, r2, r0, lsl #2 // r3 <- r2 ^ (r0 << 2), the transformed word + ldr.w r2, [r12, #4]! // advance r12 by 4; r2 <- next word + str.w r3, [r12, #-4] // overwrite the word just processed + subs r1, #1 + bne loop_inv_sr_1 + ldr r14, [sp, #52] // restore link register (r12 is back at its entry value) + bx lr + +/****************************************************************************** +* Applies ShiftRows^(-2) on a round key to match the fixsliced representation. +* Only needed for the fully-fixsliced (ffs) representation. +******************************************************************************/ +.align 2 +inv_shiftrows_2: + ldr.w r2, [r12, #-32]! // r12 -= 32; r2 <- first of the 8 words in the 32 bytes below the incoming r12 + str r14, [sp, #52] // store link register (r14 is reused as a mask below) + movw r1, #8 // loop counter: 8 words + movw r14, #0x0f00 + movt r14, #0x0f00 // r14<- 0x0f000f00 for ShiftRows^[-2] +loop_inv_sr_2: + eor r0, r2, r2, lsr #4 // r0 <- r2 ^ (r2 >> 4) + and r0, r14 // r0 <- r0 & 0x0f000f00 + eor r2, r2, r0 // r2 <- r2 ^ r0 + eor r3, r2, r0, lsl #4 // r3 <- r2 ^ (r0 << 4), the transformed word + ldr.w r2, [r12, #4]! // advance r12 by 4; r2 <- next word + str.w r3, [r12, #-4] // overwrite the word just processed + subs r1, #1 + bne loop_inv_sr_2 + ldr r14, [sp, #52] // restore link register (r12 is back at its entry value) + bx lr + +/****************************************************************************** +* Applies ShiftRows^(-3) on a round key to match the fixsliced representation. +* Only needed for the fully-fixsliced (ffs) representation. +******************************************************************************/ +.align 2 +inv_shiftrows_3: + ldr.w r2, [r12, #-32]! // r12 -= 32; r2 <- first of the 8 words in the 32 bytes below the incoming r12
+ str r14, [sp, #52] // store link register (r14 is reused as a mask below) + movw r1, #8 // loop counter: 8 words + movw r14, #0x0c00 + movt r14, #0x030f // r14<- 0x030f0c00 for ShiftRows^[-3] (passed to the swpmv macro below) +loop_inv_sr_3: + movw r3, #0x3300 + movt r3, #0x3300 // r3 <- 0x33003300 for ShiftRows^[-3] (reloaded each pass: r3 also receives the result) + swpmv r2, r2, r2, r2, r14, 4, r0 + eor r0, r2, r2, lsr #2 // r0 <- r2 ^ (r2 >> 2) + and r0, r3 // r0 <- r0 & 0x33003300 + eor r2, r2, r0 // r2 <- r2 ^ r0 + eor r3, r2, r0, lsl #2 // r3 <- r2 ^ (r0 << 2), the transformed word + ldr.w r2, [r12, #4]! // advance r12 by 4; r2 <- next word + str.w r3, [r12, #-4] // overwrite the word just processed + subs r1, #1 + bne loop_inv_sr_3 + ldr r14, [sp, #52] // restore link register (r12 is back at its entry value) + bx lr + +/****************************************************************************** +* Fully bitsliced AES-128 key schedule to match the fully-fixsliced (ffs) +* representation. The same 16-byte 'key' is loaded into both r4-r7 and r8-r11 +* before packing, so the two blocks packed together use the same key. +******************************************************************************/ +@ void aes128_keyschedule_ffs(u32* rkeys, const u8* key); +.global aes128_keyschedule_ffs +.type aes128_keyschedule_ffs,%function +.align 2 +aes128_keyschedule_ffs: + push {r0-r12,r14} // save context; after the sub below the saved r0 ('rkeys') sits at [sp, #56] + sub.w sp, #56 // allow space on the stack for tmp var (inv_shiftrows_* keep lr at [sp, #52]) + ldr.w r4, [r1] // load the 128-bit key in r4-r7 + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + ldr.w r8, [r1] // load the same 128-bit key in r8-r11 + ldr r9, [r1, #4] + ldr r10,[r1, #8] + ldr r11,[r1, #12] + bl packing // pack the master key + ldr.w r0, [sp, #56] // restore 'rkeys' address + stm r0, {r4-r11} // store the packed master key in 'rkeys' + bl sbox // apply the sbox to the master key + eor r11, r11, #0x00000300 // add the 1st rconst + bl aes128_xorcolumns_rotword + bl sbox // apply the sbox for the 2nd round key + eor r2, r2, #0x00000300 // add the 2nd rconst + bl aes128_xorcolumns_rotword + bl inv_shiftrows_1 // ShiftRows^(-1) on the 32 bytes just below r12 (presumably the previous rkey - confirm in aes128_xorcolumns_rotword) + bl sbox // apply the sbox for the 3rd round key + eor r0, r0, #0x00000300 // add the 3rd rconst + bl aes128_xorcolumns_rotword + bl inv_shiftrows_2 + bl sbox // apply the sbox for the 4th round key + eor r8, r8, #0x00000300 // add the 4th rconst + bl
aes128_xorcolumns_rotword + bl inv_shiftrows_3 + bl sbox // apply the sbox for the 5th round key + eor r7, r7, #0x00000300 // add the 5th rconst + bl aes128_xorcolumns_rotword + bl sbox // apply the sbox for the 6th round key + eor r6, r6, #0x00000300 // add the 6th rconst + bl aes128_xorcolumns_rotword + bl inv_shiftrows_1 + bl sbox // apply the sbox for the 7th round key + eor r3, r3, #0x00000300 // add the 7th rconst + bl aes128_xorcolumns_rotword + bl inv_shiftrows_2 + bl sbox // apply the sbox for the 8th round key + eor r1, r1, #0x00000300 // add the 8th rconst + bl aes128_xorcolumns_rotword + bl inv_shiftrows_3 + bl sbox // apply the sbox for the 9th round key + eor r11, r11, #0x00000300 // add the 9th rconst (0x1b has four bits set: same registers as the 1st, 2nd, 4th and 5th rconsts) + eor r2, r2, #0x00000300 // add the 9th rconst + eor r8, r8, #0x00000300 // add the 9th rconst + eor r7, r7, #0x00000300 // add the 9th rconst + bl aes128_xorcolumns_rotword + bl sbox // apply the sbox for the 10th round key + eor r2, r2, #0x00000300 // add the 10th rconst (0x36 has four bits set: same registers as the 2nd, 3rd, 5th and 6th rconsts) + eor r0, r0, #0x00000300 // add the 10th rconst + eor r7, r7, #0x00000300 // add the 10th rconst + eor r6, r6, #0x00000300 // add the 10th rconst + bl aes128_xorcolumns_rotword + bl inv_shiftrows_1 + mvn r5, r5 // add the NOT for the last rkey + mvn r6, r6 // add the NOT for the last rkey + mvn r10, r10 // add the NOT for the last rkey + mvn r11, r11 // add the NOT for the last rkey + strd r5, r6, [r12, #4] // words 1-2 of the 32 bytes at r12 + strd r10, r11, [r12, #24] // words 6-7 of the 32 bytes at r12 + ldrd r0, r1, [r12, #-316] // -316 = 4 - 10*32: words 1-2, ten 32-byte rkeys earlier + ldrd r2, r3, [r12, #-296] // -296 = 24 - 10*32: words 6-7, ten 32-byte rkeys earlier + mvn r0, r0 // remove the NOT for the key whitening + mvn r1, r1 // remove the NOT for the key whitening + mvn r2, r2 // remove the NOT for the key whitening + mvn r3, r3 // remove the NOT for the key whitening + strd r0, r1, [r12, #-316] + strd r2, r3, [r12, #-296] + add.w sp, #56 // restore stack + pop {r0-r12, r14} // restore context + bx lr + +/****************************************************************************** +* Fully bitsliced AES-256 key schedule to match the fully-fixsliced (ffs) +* representation.
Each 16-byte key half is loaded into both r4-r7 and r8-r11 +* before packing, so the two blocks packed together use the same key. +******************************************************************************/ +@ void aes256_keyschedule_ffs(u32* rkeys, const u8* key); +.global aes256_keyschedule_ffs +.type aes256_keyschedule_ffs,%function +.align 2 +aes256_keyschedule_ffs: + push {r0-r12,r14} // save context; after the sub below the saved r0/r1 sit at [sp, #56]/[sp, #60] + sub.w sp, #56 // allow space on the stack for tmp var + ldr.w r4, [r1] // load the 128 first key bits in r4-r7 + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + ldr.w r8, [r1] // load the 128 first key bits in r8-r11 + ldr r9, [r1, #4] + ldr r10,[r1, #8] + ldr r11,[r1, #12] + bl packing // pack the first half of the master key + ldrd r0,r1, [sp, #56] // restore 'rkeys' and 'key' addresses + stm r0, {r4-r11} // store the packed first key half at 'rkeys' + add.w r1, #16 // points to the 128 last bits of the key + ldr.w r4, [r1] // load the 128 last key bits in r4-r7 + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + ldr.w r8, [r1] // load the 128 last key bits in r8-r11 + ldr r9, [r1, #4] + ldr r10,[r1, #8] + ldr r11,[r1, #12] + bl packing // pack the second half of the master key + ldr.w r0, [sp, #56] // restore 'rkeys' address + add.w r0, #32 // points to the second 32-byte slot of 'rkeys' + stm r0, {r4-r11} // store the packed second key half in that slot + bl sbox // apply the sbox to the master key + eor r11, r11, #0x00000300 // add the 1st rconst + bl aes256_xorcolumns_rotword + bl sbox // apply the sbox for the next round key + bl aes256_xorcolumns + bl inv_shiftrows_1 + bl sbox // apply the sbox for the next round key + eor r2, r2, #0x00000300 // add the 2nd rconst + bl aes256_xorcolumns_rotword + bl inv_shiftrows_2 + bl sbox // apply the sbox for the next round key + bl aes256_xorcolumns + bl inv_shiftrows_3 + bl sbox // apply the sbox for the next round key + eor r0, r0, #0x00000300 // add the 3rd rconst + bl aes256_xorcolumns_rotword + bl sbox // apply the sbox for the next round key + bl aes256_xorcolumns + bl
inv_shiftrows_1 + bl sbox // apply the sbox for the next round key + eor r8, r8, #0x00000300 // add the 4th rconst + bl aes256_xorcolumns_rotword + bl inv_shiftrows_2 + bl sbox // apply the sbox for the next round key + bl aes256_xorcolumns + bl inv_shiftrows_3 + bl sbox // apply the sbox for the next round key + eor r7, r7, #0x00000300 // add the 5th rconst + bl aes256_xorcolumns_rotword + bl sbox // apply the sbox for the next round key + bl aes256_xorcolumns + bl inv_shiftrows_1 + bl sbox // apply the sbox for the next round key + eor r6, r6, #0x00000300 // add the 6th rconst + bl aes256_xorcolumns_rotword + bl inv_shiftrows_2 + bl sbox // apply the sbox for the next round key + bl aes256_xorcolumns + bl inv_shiftrows_3 + bl sbox // apply the sbox for the next round key + eor r3, r3, #0x00000300 // add the 7th rconst + bl aes256_xorcolumns_rotword + add r12, #32 // step r12 one 32-byte slot forward: inv_shiftrows_1 rewrites the 32 bytes just below r12 + bl inv_shiftrows_1 + mvn r5, r5 // add the NOT for the last rkey + mvn r6, r6 // add the NOT for the last rkey + mvn r10, r10 // add the NOT for the last rkey + mvn r11, r11 // add the NOT for the last rkey + ldrd r0, r1, [r12, #-28] // -28 = 4 - 32: words 1-2 of the 32-byte slot before r12 + ldrd r2, r3, [r12, #-8] // -8 = 24 - 32: words 6-7 of the 32-byte slot before r12 + strd r5, r6, [r12, #4] // words 1-2 of the 32 bytes at r12 + strd r10, r11, [r12, #24] // words 6-7 of the 32 bytes at r12 + mvn r0, r0 // add the NOT for the penultimate rkey + mvn r1, r1 // add the NOT for the penultimate rkey + mvn r2, r2 // add the NOT for the penultimate rkey + mvn r3, r3 // add the NOT for the penultimate rkey + ldrd r5, r6, [r12, #-444] // -444 = 4 - 14*32: words 1-2, fourteen 32-byte rkeys earlier + ldrd r10, r11, [r12, #-424] // -424 = 24 - 14*32: words 6-7, fourteen 32-byte rkeys earlier + strd r0, r1, [r12, #-28] + strd r2, r3, [r12, #-8] + mvn r5, r5 // remove the NOT for the key whitening + mvn r6, r6 // remove the NOT for the key whitening + mvn r10, r10 // remove the NOT for the key whitening + mvn r11, r11 // remove the NOT for the key whitening + strd r5, r6, [r12, #-444] + strd r10, r11, [r12, #-424] + add.w sp, #56 // restore stack + pop {r0-r12, r14} // restore context + bx lr \ No newline at end of file diff --git a/common/aes-publicinputs.S b/common/aes-publicinputs.S new file mode 100644 index 0000000..9205d29 --- /dev/null +++
b/common/aes-publicinputs.S @@ -0,0 +1,1327 @@ +.syntax unified +.thumb + +.section .data.aestable +.global AES_Te0 +.type AES_Te0,%object +.align 2 +AES_Te0: +.word 0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b +.word 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5 +.word 0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b +.word 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76 +.word 0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d +.word 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0 +.word 0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf +.word 0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0 +.word 0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26 +.word 0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc +.word 0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1 +.word 0x71e29371, 0xd8ab73d8, 0x31625331, 0x152a3f15 +.word 0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3 +.word 0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a +.word 0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2 +.word 0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75 +.word 0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a +.word 0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0 +.word 0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3 +.word 0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784 +.word 0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced +.word 0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b +.word 0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39 +.word 0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf +.word 0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb +.word 0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485 +.word 0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f +.word 0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8 +.word 0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f +.word 0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5 +.word 0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321 +.word 0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2 +.word 0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec +.word 0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917 
+.word 0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d +.word 0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573 +.word 0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc +.word 0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388 +.word 0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14 +.word 0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db +.word 0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a +.word 0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c +.word 0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662 +.word 0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79 +.word 0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d +.word 0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9 +.word 0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea +.word 0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808 +.word 0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e +.word 0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6 +.word 0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f +.word 0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a +.word 0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66 +.word 0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e +.word 0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9 +.word 0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e +.word 0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311 +.word 0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794 +.word 0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9 +.word 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf +.word 0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d +.word 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868 +.word 0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f +.word 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16 +.size AES_Te0,.-AES_Te0 + +.section .text.aes128 +@ void aes128_keyexp_publicinputs_asm(const uint8_t *key, +@ uint8_t *rk) { +.global aes128_keyexp_publicinputs_asm +.type aes128_keyexp_publicinputs_asm,%function +.align 2 +aes128_keyexp_publicinputs_asm: + + //function prologue, preserve registers + push {r4-r11} + + //load key + //pointer may be non-aligned, so avoid using 
ldm/stm + ldr r4, [r0, #0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ldr r7, [r0, #12] + + //load table address once + ldr r3, =AES_Te0 + + //round 1 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x00000001 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[4] + eor r5, r4 //rk[5] + eor r6, r5 //rk[6] + eor r7, r6 //rk[7] + + //write to memory + str r4, [r1, #0] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + + //round 2 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x00000002 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[8] + eor r5, r4 //rk[9] + eor r6, r5 //rk[10] + eor r7, r6 //rk[11] + + //write to memory + str r4, [r1, #16] + str r5, [r1, #20] + str r6, [r1, #24] + str r7, [r1, #28] + + //round 3 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x00000004 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[12] + eor r5, r4 //rk[13] + eor r6, r5 //rk[14] + eor r7, r6 //rk[15] + + //write to memory + str r4, [r1, #32] + str r5, [r1, #36] + str r6, [r1, #40] + str r7, [r1, #44] + + //round 4 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x00000008 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl 
#16 + eor r4, r4, r11, lsl #24 //rk[16] + eor r5, r4 //rk[17] + eor r6, r5 //rk[18] + eor r7, r6 //rk[19] + + //write to memory + str r4, [r1, #48] + str r5, [r1, #52] + str r6, [r1, #56] + str r7, [r1, #60] + + //round 5 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x00000010 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[20] + eor r5, r4 //rk[21] + eor r6, r5 //rk[22] + eor r7, r6 //rk[23] + + //write to memory + str r4, [r1, #64] + str r5, [r1, #68] + str r6, [r1, #72] + str r7, [r1, #76] + + //round 6 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x00000020 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[24] + eor r5, r4 //rk[25] + eor r6, r5 //rk[26] + eor r7, r6 //rk[27] + + //write to memory + str r4, [r1, #80] + str r5, [r1, #84] + str r6, [r1, #88] + str r7, [r1, #92] + + //round 7 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x00000040 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[28] + eor r5, r4 //rk[29] + eor r6, r5 //rk[30] + eor r7, r6 //rk[31] + + //write to memory + str r4, [r1, #96] + str r5, [r1, #100] + str r6, [r1, #104] + str r7, [r1, #108] + + //round 8 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, 
#0x00000080 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[32] + eor r5, r4 //rk[33] + eor r6, r5 //rk[34] + eor r7, r6 //rk[35] + + //write to memory + str r4, [r1, #112] + str r5, [r1, #116] + str r6, [r1, #120] + str r7, [r1, #124] + + add r1, #128 //rebase rk by 128 bytes; rounds 9-10 store at offsets 0-28 from the new base + + //round 9 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x0000001B //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[36] + eor r5, r4 //rk[37] + eor r6, r5 //rk[38] + eor r7, r6 //rk[39] + + //write to memory + str r4, [r1, #0] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + + //round 10 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r3, r8, lsl #2] + ldrb r9, [r3, r9, lsl #2] + ldrb r10, [r3, r10, lsl #2] + ldrb r11, [r3, r11, lsl #2] + + eor r4, #0x00000036 //rcon + eor r4, r4, r8 + eor r4, r4, r9, lsl #8 + eor r4, r4, r10, lsl #16 + eor r4, r4, r11, lsl #24 //rk[40] + eor r5, r4 //rk[41] + eor r6, r5 //rk[42] + eor r7, r6 //rk[43] + + //write to memory + str r4, [r1, #16] + str r5, [r1, #20] + str r6, [r1, #24] + str r7, [r1, #28] + + //function epilogue, restore state + pop {r4-r11} + bx lr +.size aes128_keyexp_publicinputs_asm,.-aes128_keyexp_publicinputs_asm + +.macro aesencrypt_oddround + ldr r8, [r14], #4 //r8-r11 <- next 4 round-key words; r14 advances by 16 in total + ldr r9, [r14], #4 + ldr r10, [r14], #4 + ldr r11, [r14], #4 + + uxtb r0, r4 //byte 0 of each input word r4-r7 is the table index + uxtb r1, r5 + uxtb r2, r6 + uxtb r3, r7 + ldr r0, [r12, r0, lsl #2] //32-bit entry of the table at r12 (the encrypt routines load AES_Te0 there) + ldr
r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + eor r8, r8, r0, ror #16 //fold the rotated entries into the output words r8-r11 + eor r9, r9, r1, ror #16 + eor r10, r10, r2, ror #16 + eor r11, r11, r3, ror #16 + + uxtb r0, r5, ror #8 //byte 1, taken one input word ahead (r5,r6,r7,r4) + uxtb r1, r6, ror #8 + uxtb r2, r7, ror #8 + uxtb r3, r4, ror #8 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + eor r8, r8, r0, ror #8 + eor r9, r9, r1, ror #8 + eor r10, r10, r2, ror #8 + eor r11, r11, r3, ror #8 + + uxtb r0, r6, ror #16 //byte 2, taken two input words ahead (r6,r7,r4,r5) + uxtb r1, r7, ror #16 + uxtb r2, r4, ror #16 + uxtb r3, r5, ror #16 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + eor r8, r0 //entries for byte 2 are used unrotated + eor r9, r1 + eor r10, r2 + eor r11, r3 + + uxtb r0, r7, ror #24 //byte 3, taken three input words ahead (r7,r4,r5,r6) + uxtb r1, r4, ror #24 + uxtb r2, r5, ror #24 + uxtb r3, r6, ror #24 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + eor r8, r8, r0, ror #24 + eor r9, r9, r1, ror #24 + eor r10, r10, r2, ror #24 + eor r11, r11, r3, ror #24 +.endm + +.macro aesencrypt_evenround + ldr r4, [r14], #4 //same steps as aesencrypt_oddround with roles swapped: input r8-r11, output r4-r7 + ldr r5, [r14], #4 + ldr r6, [r14], #4 + ldr r7, [r14], #4 + + uxtb r0, r8 + uxtb r1, r9 + uxtb r2, r10 + uxtb r3, r11 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + eor r4, r4, r0, ror #16 + eor r5, r5, r1, ror #16 + eor r6, r6, r2, ror #16 + eor r7, r7, r3, ror #16 + + uxtb r0, r9, ror #8 + uxtb r1, r10, ror #8 + uxtb r2, r11, ror #8 + uxtb r3, r8, ror #8 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + eor r4, r4, r0, ror #8 + eor r5, r5, r1, ror #8 + eor r6, r6, r2, ror #8 + eor r7, r7, r3, ror #8 + + uxtb r0, r10, ror #16 + uxtb r1, r11, ror #16 + uxtb r2, r8, ror #16 + uxtb r3, r9, ror #16 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + eor r4, r0 + eor r5, r1 + eor r6, r2 + eor r7, r3 + + uxtb r0, r11, ror #24 + uxtb r1, r8, ror #24 + uxtb r2, r9, ror #24 + uxtb r3, r10, ror #24 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + eor r4, r4, r0, ror #24 + eor r5, r5, r1, ror #24 + eor r6, r6, r2, ror #24
+ eor r7, r7, r3, ror #24 +.endm + +.macro aesencrypt_finalround + uxtb r0, r11, ror #24 //input r8-r11, output r4-r7; no round key is loaded here, the caller adds it + uxtb r1, r8, ror #24 + uxtb r2, r9, ror #24 + uxtb r3, r10, ror #24 + ldr r4, [r12, r0, lsl #2] //output words start as the byte-3 lookups; only bits 7:0 survive the bfi's below + ldr r5, [r12, r1, lsl #2] + ldr r6, [r12, r2, lsl #2] + ldr r7, [r12, r3, lsl #2] + + uxtb r0, r10, ror #16 + uxtb r1, r11, ror #16 + uxtb r2, r8, ror #16 + uxtb r3, r9, ror #16 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + bfi r4, r0, #24, #8 //bits 31:24 <- low byte of the byte-2 lookup + bfi r5, r1, #24, #8 + bfi r6, r2, #24, #8 + bfi r7, r3, #24, #8 + + uxtb r0, r8 + uxtb r1, r9 + uxtb r2, r10 + uxtb r3, r11 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + bfi r4, r0, #8, #8 //bits 15:8 <- low byte of the byte-0 lookup + bfi r5, r1, #8, #8 + bfi r6, r2, #8, #8 + bfi r7, r3, #8, #8 + + uxtb r0, r9, ror #8 + uxtb r1, r10, ror #8 + uxtb r2, r11, ror #8 + uxtb r3, r8, ror #8 + ldr r0, [r12, r0, lsl #2] + ldr r1, [r12, r1, lsl #2] + ldr r2, [r12, r2, lsl #2] + ldr r3, [r12, r3, lsl #2] + bfi r4, r0, #16, #8 //bits 23:16 <- low byte of the byte-1 lookup + bfi r5, r1, #16, #8 + bfi r6, r2, #16, #8 + bfi r7, r3, #16, #8 +.endm + +@ void aes128_encrypt_publicinputs_asm(const uint8_t *rk, +@ const uint8_t *in, uint8_t *out) { +.global aes128_encrypt_publicinputs_asm +.type aes128_encrypt_publicinputs_asm,%function +.align 2 +aes128_encrypt_publicinputs_asm: + + //function prologue, preserve registers and free r2 + push {r2,r4-r12,r14} //r2 ('out') is the lowest register pushed, so it sits at [sp] + + //load input + ldr r4, [r1, #0] + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + //r1 now free to overwrite + //load key + ldr r8, [r0], #4 + ldr r9, [r0], #4 + ldr r10, [r0], #4 + ldr r11, [r0], #4 + mov.w r14, r0 //r14 <- rk pointer past the 4 words just loaded; the round macros read through it + + //load table address once + ldr r12, =AES_Te0 + + //initial addroundkey + eor r4, r8 + eor r5, r9 + eor r6, r10 + eor r7, r11 //state is in r4-r7; 9 table rounds (4 odd/even pairs and one odd) then the final round follow + +.rept 4 + aesencrypt_oddround + aesencrypt_evenround +.endr + aesencrypt_oddround + aesencrypt_finalround + + //rk[40]-rk[43] + ldr r0, [r14, #0] + ldr r1, [r14, #4] + ldr r2, [r14, #8] + ldr r3, [r14,
#12] + + eor r0, r0, r4, ror #8 //rotate the assembled word right by 8 and add the last round key + eor r1, r1, r5, ror #8 + eor r2, r2, r6, ror #8 + pop.w {r4} //r4 <- 'out' pointer (the r2 saved by the prologue push) + eor r3, r3, r7, ror #8 + + //write output + str r0, [r4, #0] + str r1, [r4, #4] + str r2, [r4, #8] + str r3, [r4, #12] + + //function epilogue, restore state + pop {r4-r12,r14} + bx lr +.size aes128_encrypt_publicinputs_asm,.-aes128_encrypt_publicinputs_asm + +.section .text.aes192 +@ void aes192_keyexp_publicinputs_asm(const uint8_t *key, +@ uint8_t *rk) { +.global aes192_keyexp_publicinputs_asm +.type aes192_keyexp_publicinputs_asm,%function +.align 2 +aes192_keyexp_publicinputs_asm: + + //function prologue, preserve registers + push {r4-r11} + + //load key + ldr r2, [r0, #0] + ldr r3, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r6, [r0, #16] + ldr r7, [r0, #20] + + //load table address once + ldr r0, =AES_Te0 + + //round 1 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r0, r8, lsl #2] + ldrb r9, [r0, r9, lsl #2] + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + + eor r2, #0x00000001 //rcon + eor r2, r2, r8 + eor r2, r2, r9, lsl #8 + eor r2, r2, r10, lsl #16 + eor r2, r2, r11, lsl #24 //rk[6] + eor r3, r2 //rk[7] + eor r4, r3 //rk[8] + eor r5, r4 //rk[9] + eor r6, r5 //rk[10] + eor r7, r6 //rk[11] + + //write to memory + str r2, [r1, #0] + str r3, [r1, #4] + str r4, [r1, #8] + str r5, [r1, #12] + str r6, [r1, #16] + str r7, [r1, #20] + + //round 2 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r0, r8, lsl #2] + ldrb r9, [r0, r9, lsl #2] + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + + eor r2, #0x00000002 //rcon + eor r2, r2, r8 + eor r2, r2, r9, lsl #8 + eor r2, r2, r10, lsl #16 + eor r2, r2, r11, lsl #24 //rk[12] + eor r3, r2 //rk[13] + eor r4, r3 //rk[14] + eor r5, r4 //rk[15] + eor r6, r5 //rk[16] + eor r7, r6 //rk[17] + + //write to memory + str r2, [r1, #24] + str r3, [r1, #28] + str r4, [r1, #32] + str r5, [r1, #36] + str r6,
[r1, #40] + str r7, [r1, #44] + + //round 3 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r0, r8, lsl #2] + ldrb r9, [r0, r9, lsl #2] + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + + eor r2, #0x00000004 //rcon + eor r2, r2, r8 + eor r2, r2, r9, lsl #8 + eor r2, r2, r10, lsl #16 + eor r2, r2, r11, lsl #24 //rk[18] + eor r3, r2 //rk[19] + eor r4, r3 //rk[20] + eor r5, r4 //rk[21] + eor r6, r5 //rk[22] + eor r7, r6 //rk[23] + + //write to memory + str r2, [r1, #48] + str r3, [r1, #52] + str r4, [r1, #56] + str r5, [r1, #60] + str r6, [r1, #64] + str r7, [r1, #68] + + //round 4 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r0, r8, lsl #2] + ldrb r9, [r0, r9, lsl #2] + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + + eor r2, #0x00000008 //rcon + eor r2, r2, r8 + eor r2, r2, r9, lsl #8 + eor r2, r2, r10, lsl #16 + eor r2, r2, r11, lsl #24 //rk[24] + eor r3, r2 //rk[25] + eor r4, r3 //rk[26] + eor r5, r4 //rk[27] + eor r6, r5 //rk[28] + eor r7, r6 //rk[29] + + //write to memory + str r2, [r1, #72] + str r3, [r1, #76] + str r4, [r1, #80] + str r5, [r1, #84] + str r6, [r1, #88] + str r7, [r1, #92] + + //round 5 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r0, r8, lsl #2] + ldrb r9, [r0, r9, lsl #2] + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + + eor r2, #0x00000010 //rcon + eor r2, r2, r8 + eor r2, r2, r9, lsl #8 + eor r2, r2, r10, lsl #16 + eor r2, r2, r11, lsl #24 //rk[30] + eor r3, r2 //rk[31] + eor r4, r3 //rk[32] + eor r5, r4 //rk[33] + eor r6, r5 //rk[34] + eor r7, r6 //rk[35] + + //write to memory + str r2, [r1, #96] + str r3, [r1, #100] + str r4, [r1, #104] + str r5, [r1, #108] + str r6, [r1, #112] + str r7, [r1, #116] + + add r1, #120 + + //round 6 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r0, r8, lsl #2] + ldrb 
r9, [r0, r9, lsl #2] + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + + eor r2, #0x00000020 //rcon + eor r2, r2, r8 + eor r2, r2, r9, lsl #8 + eor r2, r2, r10, lsl #16 + eor r2, r2, r11, lsl #24 //rk[36] + eor r3, r2 //rk[37] + eor r4, r3 //rk[38] + eor r5, r4 //rk[39] + eor r6, r5 //rk[40] + eor r7, r6 //rk[41] + + //write to memory + str r2, [r1, #0] + str r3, [r1, #4] + str r4, [r1, #8] + str r5, [r1, #12] + str r6, [r1, #16] + str r7, [r1, #20] + + //round 7 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r0, r8, lsl #2] + ldrb r9, [r0, r9, lsl #2] + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + + eor r2, #0x00000040 //rcon + eor r2, r2, r8 + eor r2, r2, r9, lsl #8 + eor r2, r2, r10, lsl #16 + eor r2, r2, r11, lsl #24 //rk[42] + eor r3, r2 //rk[43] + eor r4, r3 //rk[44] + eor r5, r4 //rk[45] + eor r6, r5 //rk[46] + eor r7, r6 //rk[47] + + //write to memory + str r2, [r1, #24] + str r3, [r1, #28] + str r4, [r1, #32] + str r5, [r1, #36] + str r6, [r1, #40] + str r7, [r1, #44] + + //round 8 + uxtb r8, r7, ror #8 + uxtb r9, r7, ror #16 + uxtb r10, r7, ror #24 + uxtb r11, r7 + + ldrb r8, [r0, r8, lsl #2] + ldrb r9, [r0, r9, lsl #2] + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + + eor r2, #0x00000080 //rcon + eor r2, r2, r8 + eor r2, r2, r9, lsl #8 + eor r2, r2, r10, lsl #16 + eor r2, r2, r11, lsl #24 //rk[48] + eor r3, r2 //rk[49] + eor r4, r3 //rk[50] + eor r5, r4 //rk[51] + //write to memory + str r2, [r1, #48] + str r3, [r1, #52] + str r4, [r1, #56] + str r5, [r1, #60] + + //function epilogue, restore state + pop {r4-r11} + bx lr +.size aes192_keyexp_publicinputs_asm,.-aes192_keyexp_publicinputs_asm + +@ void aes192_encrypt_publicinputs_asm(const uint8_t *rk, +@ const uint8_t *in, uint8_t *out) { +.global aes192_encrypt_publicinputs_asm +.type aes192_encrypt_publicinputs_asm,%function +.align 2 +aes192_encrypt_publicinputs_asm: + + //function prologue, preserve registers 
and free r2 + push {r2,r4-r12,r14} //r2 ('out') is the lowest register pushed, so it sits at [sp] + + //load input + ldr r4, [r1, #0] + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + //r1 now free to overwrite + //load key + ldr r8, [r0], #4 + ldr r9, [r0], #4 + ldr r10, [r0], #4 + ldr r11, [r0], #4 + mov.w r14, r0 //r14 <- rk pointer past the 4 words just loaded; the aesencrypt_* round macros read through it + + //load table address once + ldr r12, =AES_Te0 + + //initial addroundkey + eor r4, r8 + eor r5, r9 + eor r6, r10 + eor r7, r11 //state is in r4-r7; 11 table rounds (5 odd/even pairs and one odd) then the final round follow + +.rept 5 + aesencrypt_oddround + aesencrypt_evenround +.endr + aesencrypt_oddround + aesencrypt_finalround + + //rk[48]-rk[51] + ldr r0, [r14, #0] + ldr r1, [r14, #4] + ldr r2, [r14, #8] + ldr r3, [r14, #12] + + eor r0, r0, r4, ror #8 //rotate the word built by aesencrypt_finalround right by 8 and add the last round key + eor r1, r1, r5, ror #8 + eor r2, r2, r6, ror #8 + pop.w {r4} //r4 <- 'out' pointer (the r2 saved by the prologue push) + eor r3, r3, r7, ror #8 + + //write output + str r0, [r4, #0] + str r1, [r4, #4] + str r2, [r4, #8] + str r3, [r4, #12] + + //function epilogue, restore state + pop {r4-r12,r14} + bx lr +.size aes192_encrypt_publicinputs_asm,.-aes192_encrypt_publicinputs_asm + +.section .text.aes256 +@ void aes256_keyexp_publicinputs_asm(const uint8_t *key, +@ uint8_t *rk) { +.global aes256_keyexp_publicinputs_asm +.type aes256_keyexp_publicinputs_asm,%function +.align 2 +aes256_keyexp_publicinputs_asm: + + //function prologue, preserve registers + push {r4-r12,r14} + + //load key + ldr r2, [r0, #0] + ldr r3, [r0, #4] + ldr r4, [r0, #8] + ldr r5, [r0, #12] + ldr r6, [r0, #16] + ldr r7, [r0, #20] + ldr r8, [r0, #24] + ldr r9, [r0, #28] + + //load table address once + ldr r0, =AES_Te0 + + //round 1 + uxtb r10, r9, ror #8 + uxtb r11, r9, ror #16 + uxtb r12, r9, ror #24 + uxtb r14, r9 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r2, #0x00000001 //rcon + eor r2, r2, r10 + eor r2, r2, r11, lsl #8 + eor r2, r2, r12, lsl #16 + eor r2, r2, r14, lsl #24 //rk[8] + eor r3, r2 //rk[9] + eor r4, r3 //rk[10] + eor r5, r4 //rk[11] + + uxtb r10, r5, ror #16 + uxtb r11, r5, ror #8 + uxtb r12, r5 + uxtb r14, r5, ror #24 + + ldrb r10,
[r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r6, r6, r10, lsl #16 + eor r6, r6, r11, lsl #8 + eor r6, r12 + eor r6, r6, r14, lsl #24 //rk[12] + eor r7, r6 //rk[13] + eor r8, r7 //rk[14] + eor r9, r8 //rk[15] + + //write to memory + str r2, [r1, #0] + str r3, [r1, #4] + str r4, [r1, #8] + str r5, [r1, #12] + str r6, [r1, #16] + str r7, [r1, #20] + str r8, [r1, #24] + str r9, [r1, #28] + + //round 2 + uxtb r10, r9, ror #8 + uxtb r11, r9, ror #16 + uxtb r12, r9, ror #24 + uxtb r14, r9 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r2, #0x00000002 //rcon + eor r2, r2, r10 + eor r2, r2, r11, lsl #8 + eor r2, r2, r12, lsl #16 + eor r2, r2, r14, lsl #24 //rk[16] + eor r3, r2 //rk[17] + eor r4, r3 //rk[18] + eor r5, r4 //rk[19] + + uxtb r10, r5, ror #16 + uxtb r11, r5, ror #8 + uxtb r12, r5 + uxtb r14, r5, ror #24 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r6, r6, r10, lsl #16 + eor r6, r6, r11, lsl #8 + eor r6, r12 + eor r6, r6, r14, lsl #24 //rk[20] + eor r7, r6 //rk[21] + eor r8, r7 //rk[22] + eor r9, r8 //rk[23] + + //write to memory + str r2, [r1, #32] + str r3, [r1, #36] + str r4, [r1, #40] + str r5, [r1, #44] + str r6, [r1, #48] + str r7, [r1, #52] + str r8, [r1, #56] + str r9, [r1, #60] + + //round 3 + uxtb r10, r9, ror #8 + uxtb r11, r9, ror #16 + uxtb r12, r9, ror #24 + uxtb r14, r9 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r2, #0x00000004 //rcon + eor r2, r2, r10 + eor r2, r2, r11, lsl #8 + eor r2, r2, r12, lsl #16 + eor r2, r2, r14, lsl #24 //rk[24] + eor r3, r2 //rk[25] + eor r4, r3 //rk[26] + eor r5, r4 //rk[27] + + uxtb r10, r5, ror #16 + uxtb r11, r5, ror #8 + uxtb r12, r5 + uxtb r14, r5, ror #24 + + ldrb r10, [r0, r10, lsl #2] + ldrb 
r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r6, r6, r10, lsl #16 + eor r6, r6, r11, lsl #8 + eor r6, r12 + eor r6, r6, r14, lsl #24 //rk[28] + eor r7, r6 //rk[29] + eor r8, r7 //rk[30] + eor r9, r8 //rk[31] + + //write to memory + str r2, [r1, #64] + str r3, [r1, #68] + str r4, [r1, #72] + str r5, [r1, #76] + str r6, [r1, #80] + str r7, [r1, #84] + str r8, [r1, #88] + str r9, [r1, #92] + + //round 4 + uxtb r10, r9, ror #8 + uxtb r11, r9, ror #16 + uxtb r12, r9, ror #24 + uxtb r14, r9 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r2, #0x00000008 //rcon + eor r2, r2, r10 + eor r2, r2, r11, lsl #8 + eor r2, r2, r12, lsl #16 + eor r2, r2, r14, lsl #24 //rk[32] + eor r3, r2 //rk[33] + eor r4, r3 //rk[34] + eor r5, r4 //rk[35] + + uxtb r10, r5, ror #16 + uxtb r11, r5, ror #8 + uxtb r12, r5 + uxtb r14, r5, ror #24 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r6, r6, r10, lsl #16 + eor r6, r6, r11, lsl #8 + eor r6, r12 + eor r6, r6, r14, lsl #24 //rk[36] + eor r7, r6 //rk[37] + eor r8, r7 //rk[38] + eor r9, r8 //rk[39] + + //write to memory + str r2, [r1, #96] + str r3, [r1, #100] + str r4, [r1, #104] + str r5, [r1, #108] + str r6, [r1, #112] + str r7, [r1, #116] + str r8, [r1, #120] + str r9, [r1, #124] + + add r1, #128 + + //round 5 + uxtb r10, r9, ror #8 + uxtb r11, r9, ror #16 + uxtb r12, r9, ror #24 + uxtb r14, r9 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r2, #0x00000010 //rcon + eor r2, r2, r10 + eor r2, r2, r11, lsl #8 + eor r2, r2, r12, lsl #16 + eor r2, r2, r14, lsl #24 //rk[40] + eor r3, r2 //rk[41] + eor r4, r3 //rk[42] + eor r5, r4 //rk[43] + + uxtb r10, r5, ror #16 + uxtb r11, r5, ror #8 + uxtb r12, r5 + uxtb r14, r5, ror #24 + + ldrb r10, [r0, r10, lsl #2] + 
ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r6, r6, r10, lsl #16 + eor r6, r6, r11, lsl #8 + eor r6, r12 + eor r6, r6, r14, lsl #24 //rk[44] + eor r7, r6 //rk[45] + eor r8, r7 //rk[46] + eor r9, r8 //rk[47] + + //write to memory + str r2, [r1, #0] + str r3, [r1, #4] + str r4, [r1, #8] + str r5, [r1, #12] + str r6, [r1, #16] + str r7, [r1, #20] + str r8, [r1, #24] + str r9, [r1, #28] + + //round 6 + uxtb r10, r9, ror #8 + uxtb r11, r9, ror #16 + uxtb r12, r9, ror #24 + uxtb r14, r9 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r2, #0x00000020 //rcon + eor r2, r2, r10 + eor r2, r2, r11, lsl #8 + eor r2, r2, r12, lsl #16 + eor r2, r2, r14, lsl #24 //rk[48] + eor r3, r2 //rk[49] + eor r4, r3 //rk[50] + eor r5, r4 //rk[51] + + uxtb r10, r5, ror #16 + uxtb r11, r5, ror #8 + uxtb r12, r5 + uxtb r14, r5, ror #24 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r6, r6, r10, lsl #16 + eor r6, r6, r11, lsl #8 + eor r6, r12 + eor r6, r6, r14, lsl #24 //rk[52] + eor r7, r6 //rk[53] + eor r8, r7 //rk[54] + eor r9, r8 //rk[55] + + //write to memory + str r2, [r1, #32] + str r3, [r1, #36] + str r4, [r1, #40] + str r5, [r1, #44] + str r6, [r1, #48] + str r7, [r1, #52] + str r8, [r1, #56] + str r9, [r1, #60] + + //round 7 + uxtb r10, r9, ror #8 + uxtb r11, r9, ror #16 + uxtb r12, r9, ror #24 + uxtb r14, r9 + + ldrb r10, [r0, r10, lsl #2] + ldrb r11, [r0, r11, lsl #2] + ldrb r12, [r0, r12, lsl #2] + ldrb r14, [r0, r14, lsl #2] + + eor r2, #0x00000040 //rcon + eor r2, r2, r10 + eor r2, r2, r11, lsl #8 + eor r2, r2, r12, lsl #16 + eor r2, r2, r14, lsl #24 //rk[56] + eor r3, r2 //rk[57] + eor r4, r3 //rk[58] + eor r5, r4 //rk[59] + + //write to memory + str r2, [r1, #64] + str r3, [r1, #68] + str r4, [r1, #72] + str r5, [r1, #76] + + //function epilogue, restore state + pop 
{r4-r12,r14}
+    bx lr
+.size aes256_keyexp_publicinputs_asm,.-aes256_keyexp_publicinputs_asm
+
+.align 2
+.ltorg
+
+@ void aes256_encrypt_publicinputs_asm(const uint8_t *rk,
+@     const uint8_t *in, uint8_t *out) {
+@
+@ Encrypts one 16-byte block read from `in` (r1) with the round keys at
+@ `rk` (r0) and stores the 16-byte result at `out` (r2).
+@ Register use, as set up below:
+@   r4-r7   state words (loaded from `in`)
+@   r8-r11  first four round-key words
+@   r12     address of the AES_Te0 table
+@   r14     round-key cursor (rk + 16 after the first four loads)
+@ r2 is pushed together with the callee-saved registers and popped back
+@ into r4 just before the output stores, so r2 is free as a scratch
+@ register in between.
+@ The aesencrypt_oddround / aesencrypt_evenround / aesencrypt_finalround
+@ macros are defined outside this excerpt.
+.global aes256_encrypt_publicinputs_asm
+.type aes256_encrypt_publicinputs_asm,%function
+.align 2
+aes256_encrypt_publicinputs_asm:
+
+    //function prologue, preserve registers and free r2
+    push {r2,r4-r12,r14}
+
+    //load input
+    ldr r4, [r1, #0]
+    ldr r5, [r1, #4]
+    ldr r6, [r1, #8]
+    ldr r7, [r1, #12]
+    //r1 now free to overwrite
+    //load key
+    ldr r8, [r0], #4
+    ldr r9, [r0], #4
+    ldr r10, [r0], #4
+    ldr r11, [r0], #4
+    mov.w r14, r0
+
+    //load table address once
+    ldr r12, =AES_Te0
+
+    //initial addroundkey
+    eor r4, r8
+    eor r5, r9
+    eor r6, r10
+    eor r7, r11
+
+    //14 round macros in total: 6 x (odd, even), one more odd, then final
+.rept 6
+    aesencrypt_oddround
+    aesencrypt_evenround
+.endr
+    aesencrypt_oddround
+    aesencrypt_finalround
+
+    //rk[56]-rk[59]
+    ldr r0, [r14, #0]
+    ldr r1, [r14, #4]
+    ldr r2, [r14, #8]
+    ldr r3, [r14, #12]
+
+    //last key addition: each state word is rotated right by 8 bits as it is XORed in
+    eor r0, r0, r4, ror #8
+    eor r1, r1, r5, ror #8
+    eor r2, r2, r6, ror #8
+    //r4 is no longer needed as state: recover the saved out pointer (pushed as r2)
+    pop.w {r4}
+    eor r3, r3, r7, ror #8
+
+    //write output
+    str r0, [r4, #0]
+    str r1, [r4, #4]
+    str r2, [r4, #8]
+    str r3, [r4, #12]
+
+    //function epilogue, restore state
+    pop {r4-r12,r14}
+    bx lr
+.size aes256_encrypt_publicinputs_asm,.-aes256_encrypt_publicinputs_asm
diff --git a/common/aes-publicinputs.c b/common/aes-publicinputs.c
new file mode 100644
index 0000000..b216562
--- /dev/null
+++ b/common/aes-publicinputs.c
@@ -0,0 +1,259 @@
+/*
+ * AES implementation based on code from BearSSL (https://bearssl.org/)
+ * by Thomas Pornin.
+ *
+ *
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+
+#include "aes-publicinputs.h"
+
+#ifdef PROFILE_HASHING
+#include "hal.h"
+extern unsigned long long hash_cycles;
+#endif
+
+/* Assembly back-ends: key expansion and single-block encryption per key size. */
+extern void aes128_keyexp_publicinputs_asm(const uint8_t *key, uint8_t *rk);
+extern void aes192_keyexp_publicinputs_asm(const uint8_t *key, uint8_t *rk);
+extern void aes256_keyexp_publicinputs_asm(const uint8_t *key, uint8_t *rk);
+extern void aes128_encrypt_publicinputs_asm(const uint8_t *rk, const uint8_t *in, uint8_t *out);
+extern void aes192_encrypt_publicinputs_asm(const uint8_t *rk, const uint8_t *in, uint8_t *out);
+extern void aes256_encrypt_publicinputs_asm(const uint8_t *rk, const uint8_t *in, uint8_t *out);
+
+
+/* Reverse the byte order of a 32-bit word: swap the bytes inside each
+ * half-word, then swap the two half-words. */
+static inline uint32_t br_swap32(uint32_t x) {
+    x = ((x & (uint32_t)0x00FF00FF) << 8)
+        | ((x >> 8) & (uint32_t)0x00FF00FF);
+    return (x << 16) | (x >> 16);
+}
+
+
+/* Add one to *x, where *x holds the counter in byte-swapped order
+ * relative to the native word (wraps modulo 2^32). */
+static inline void inc1_be(uint32_t *x) {
+    uint32_t t = br_swap32(*x) + 1;
+    *x = br_swap32(t);
+}
+
+
+/* Encrypt `nblocks` consecutive AES_BLOCKBYTES-sized blocks from `in` to
+ * `out`, one call to `aes_encrypt_asm` per block, all with the same
+ * expanded key `rkeys`. */
+static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const uint64_t *rkeys, void (*aes_encrypt_asm)(const uint8_t *, const uint8_t *, uint8_t *)) {
+    unsigned int i;
+    for (i = 0; i < nblocks; ++i) {
+        aes_encrypt_asm((uint8_t *)rkeys, in, out);
+        in += AES_BLOCKBYTES;
+        out += AES_BLOCKBYTES;
+    }
+}
+
+
+/* Write `outlen` bytes of counter-mode output to `out`.
+ * The counter block is the AESCTR_NONCEBYTES bytes of `iv` followed by a
+ * 32-bit word that starts at zero and is advanced with inc1_be() after
+ * each full block. While more than one block is still wanted, blocks are
+ * encrypted straight into `out`; the last 1..AES_BLOCKBYTES bytes go
+ * through `buf` so that exactly `outlen` bytes are written. */
+static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const uint64_t *rkeys, void (*aes_encrypt_asm)(const uint8_t *, const uint8_t *, uint8_t *)) {
+    uint32_t ivw[4] = {0};
+    uint8_t buf[AES_BLOCKBYTES];
+    size_t i;
+
+    memcpy(ivw, iv, AESCTR_NONCEBYTES);
+
+    while (outlen > AES_BLOCKBYTES) {
+        aes_encrypt_asm((uint8_t *)rkeys, (uint8_t *)ivw, out);
+        inc1_be(ivw + 3);
+        out += AES_BLOCKBYTES;
+        outlen -= AES_BLOCKBYTES;
+    }
+    if (outlen > 0) {
+        aes_encrypt_asm((unsigned char *)rkeys, (unsigned char *)ivw, buf);
+        for (i = 0; i < outlen; i++) {
+            out[i] = buf[i];
+        }
+    }
+}
+
+
+/* Fill r->sk_exp for AES-128: the raw key is copied to the start of the
+ * buffer and the assembly key expansion writes its output directly after
+ * it (byte offset AES128_KEYBYTES). With PROFILE_HASHING defined, the
+ * elapsed hal_get_time() delta is added to hash_cycles. */
+static void aes128_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    memcpy((uint8_t *)r->sk_exp, key, AES128_KEYBYTES);
+    aes128_keyexp_publicinputs_asm(key, ((uint8_t *)r->sk_exp) + AES128_KEYBYTES);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+/* ECB and CTR share the same expanded-key layout; both entry points
+ * forward to aes128_keyexp_publicinputs(). */
+void aes128_ecb_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key) {
+    aes128_keyexp_publicinputs(r, key);
+}
+
+void aes128_ctr_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key) {
+    aes128_keyexp_publicinputs(r, key);
+}
+
+/* AES-192 counterpart of aes128_keyexp_publicinputs(): raw key first,
+ * assembly output from byte offset AES192_KEYBYTES. */
+static void aes192_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    memcpy((uint8_t *)r->sk_exp, key, AES192_KEYBYTES);
+    aes192_keyexp_publicinputs_asm(key, ((uint8_t *)r->sk_exp) + AES192_KEYBYTES);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+
+void aes192_ecb_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key) {
+    aes192_keyexp_publicinputs(r, key);
+}
+
+void aes192_ctr_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key) {
+    aes192_keyexp_publicinputs(r, key);
+}
+
+
+/* AES-256 counterpart of aes128_keyexp_publicinputs(): raw key first,
+ * assembly output from byte offset AES256_KEYBYTES. */
+static void aes256_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    memcpy((uint8_t *)r->sk_exp, key, AES256_KEYBYTES);
+    aes256_keyexp_publicinputs_asm(key, ((uint8_t *)r->sk_exp) + AES256_KEYBYTES);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+void aes256_ecb_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key) {
+    aes256_keyexp_publicinputs(r, key);
+}
+
+void aes256_ctr_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key) {
+    aes256_keyexp_publicinputs(r, key);
+}
+
+
+/* Public ECB/CTR entry points. Each one forwards to aes_ecb()/aes_ctr()
+ * with the matching single-block assembly routine and, when
+ * PROFILE_HASHING is defined, adds the elapsed hal_get_time() delta to
+ * hash_cycles. */
+void aes128_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ecb(out, in, nblocks, ctx->sk_exp, aes128_encrypt_publicinputs_asm);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+void aes128_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ctr(out, outlen, iv, ctx->sk_exp, aes128_encrypt_publicinputs_asm);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+void aes192_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ecb(out, in, nblocks, ctx->sk_exp, aes192_encrypt_publicinputs_asm);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+void aes192_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ctr(out, outlen, iv, ctx->sk_exp, aes192_encrypt_publicinputs_asm);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+void aes256_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ecb(out, in, nblocks, ctx->sk_exp, aes256_encrypt_publicinputs_asm);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+void aes256_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ctr(out, outlen, iv, ctx->sk_exp, aes256_encrypt_publicinputs_asm);
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+#endif
+}
+
+void aes128_ctx_release_publicinputs(aes128ctx_publicinputs *r) {
+    // no-op for mupq's basic AES operation
+    // this is required for compatibility with code from PQClean
+    // see https://github.com/PQClean/PQClean/pull/198
+    (void) r;
+}
+
+void aes192_ctx_release_publicinputs(aes192ctx_publicinputs *r) {
+    // no-op for mupq's basic AES operation
+    // this is required for compatibility with code from PQClean
+    // see https://github.com/PQClean/PQClean/pull/198
+    (void) r;
+}
+
+void aes256_ctx_release_publicinputs(aes256ctx_publicinputs *r) {
+    // no-op for mupq's basic AES operation
+    // this is required for compatibility with code from PQClean
+    // see https://github.com/PQClean/PQClean/pull/198
+    (void) r;
+}
+
diff --git a/common/aes-publicinputs.h b/common/aes-publicinputs.h
new file mode 100644
index 0000000..13fbadd
--- /dev/null
+++ b/common/aes-publicinputs.h
@@ -0,0 +1,62 @@
+#ifndef AES_PUBLICINPUTS_H
+#define AES_PUBLICINPUTS_H
+
+#include
+#include
+
+/* Sizes in bytes. */
+#define AES128_KEYBYTES 16
+#define AES192_KEYBYTES 24
+#define AES256_KEYBYTES 32
+#define AESCTR_NONCEBYTES 12
+#define AES_BLOCKBYTES 16
+
+/* Expanded-key contexts. The implementation stores the raw key at the
+ * start of sk_exp and the assembly key-expansion output right after it.
+ * NOTE(review): the array sizes are taken as given here; the rationale
+ * for 88/104/120 words is not visible in this file. */
+typedef struct {
+    uint64_t sk_exp[88];
+} aes128ctx_publicinputs;
+
+typedef struct {
+    uint64_t sk_exp[104];
+} aes192ctx_publicinputs;
+
+typedef struct {
+    uint64_t sk_exp[120];
+} aes256ctx_publicinputs;
+
+
+
+/** Initializes the context **/
+void aes128_ecb_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key);
+
+void aes128_ctr_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key);
+
+void aes128_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx_publicinputs *ctx);
+
+void aes128_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx_publicinputs *ctx);
+
+void aes128_ctx_release_publicinputs(aes128ctx_publicinputs *r);
+
+/** Initializes the context **/
+void aes192_ecb_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key);
+
+void aes192_ctr_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key);
+
+void aes192_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx_publicinputs *ctx);
+
+void aes192_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx_publicinputs *ctx);
+
+void aes192_ctx_release_publicinputs(aes192ctx_publicinputs *r);
+
+
+/** Initializes the context **/
+void aes256_ecb_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key);
+
+void aes256_ctr_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key);
+
+void aes256_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx_publicinputs *ctx);
+
+void aes256_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx_publicinputs *ctx);
+
+void aes256_ctx_release_publicinputs(aes256ctx_publicinputs *r);
+
+
+#endif
diff --git a/common/aes.c b/common/aes.c
new file mode 100644
index 0000000..ff84540
--- /dev/null
+++ b/common/aes.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+#include
+#include
+#include "aes.h"
+
+#ifdef PROFILE_HASHING
+#include "hal.h"
+extern unsigned long long hash_cycles;
+#endif
+
+/* Assembly key schedules. */
+extern void aes128_keyschedule_ffs(uint32_t* rkeys, const uint8_t* key);
+extern void aes256_keyschedule_ffs(uint32_t* rkeys, const uint8_t* key);
+
+/* Assembly encryption routines: each call takes two input blocks
+ * (ptext, ptext_bis) and two output blocks (ctext, ctext_bis). */
+extern void aes256_encrypt_ffs(uint8_t* ctext, uint8_t* ctext_bis, const uint8_t* ptext,
+                const uint8_t* ptext_bis, const uint32_t* rkey);
+
+extern void aes128_encrypt_ffs(uint8_t* ctext, uint8_t* ctext_bis, const uint8_t* ptext,
+                const uint8_t* ptext_bis, const uint32_t* rkey);
+
+
+/* Reverse the byte order of a 32-bit word: swap the bytes inside each
+ * half-word, then swap the two half-words. */
+static inline uint32_t br_swap32(uint32_t x) {
+    x = ((x & (uint32_t)0x00FF00FF) << 8)
+        | ((x >> 8) & (uint32_t)0x00FF00FF);
+    return (x << 16) | (x >> 16);
+}
+
+/* Add one to *x, where *x holds the counter in byte-swapped order
+ * relative to the native word (wraps modulo 2^32). */
+static inline void inc1_be(uint32_t *x) {
+    uint32_t t = br_swap32(*x) + 1;
+    *x = br_swap32(t);
+}
+
+/* Same as inc1_be() but advances the counter by two; used because every
+ * call to the encryption routine consumes two counter values. */
+static inline void inc2_be(uint32_t *x) {
+    uint32_t t = br_swap32(*x) + 2;
+    *x = br_swap32(t);
+}
+
+/* Run the AES-128 key schedule on `key` and store the result in r->sk_exp.
+ * With PROFILE_HASHING defined, the elapsed hal_get_time() delta is added
+ * to hash_cycles (this applies to every public function in this file). */
+void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+
+    aes128_keyschedule_ffs(r->sk_exp, key);
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+
+/* Encrypt `nblocks` 16-byte blocks from `in` to `out` in ECB mode.
+ * Blocks are processed in pairs; an odd final block is paired with a
+ * dummy block whose ciphertext is discarded. */
+void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks,
+                const aes128ctx *ctx){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    uint8_t buf0[AES_BLOCKBYTES];
+    /* Dummy second-lane plaintext for an odd final block. Zeroed so the
+     * two-block routine never reads indeterminate stack bytes. */
+    uint8_t buf1[AES_BLOCKBYTES] = {0};
+
+    while(nblocks > 0){
+        if(nblocks >= 2){
+            aes128_encrypt_ffs(out, out+AES_BLOCKBYTES, in, in+AES_BLOCKBYTES, ctx->sk_exp);
+            out += AES_BLOCKBYTES*2;
+            in += AES_BLOCKBYTES*2;
+            nblocks -= 2;
+        } else {
+            /* last block: real data in lane one, dummy in lane two */
+            aes128_encrypt_ffs(out, buf0, in, buf1, ctx->sk_exp);
+            nblocks--;
+        }
+    }
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+
+/* Identical to aes128_ecb_keyexp(): both modes use the same key schedule. */
+void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+
+    aes128_keyschedule_ffs(r->sk_exp, key);
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+
+
+/* Write `outlen` bytes of AES-128 counter-mode output to `out`.
+ * Two counter blocks are kept: ivw1 starts at counter 0 and ivw2 at
+ * counter 1 (nonce = first AESCTR_NONCEBYTES bytes of `iv`); both advance
+ * by two per call so that each call yields two consecutive blocks.
+ * The tail (at most two blocks) is routed through buf1/buf2 as needed so
+ * that exactly `outlen` bytes are written. */
+void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv,
+                const aes128ctx *ctx){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    uint32_t ivw1[4] = {0};
+    uint32_t ivw2[4] = {0};
+    uint8_t buf1[AES_BLOCKBYTES];
+    uint8_t buf2[AES_BLOCKBYTES];
+    size_t i;
+
+    memcpy(ivw1, iv, AESCTR_NONCEBYTES);
+    memcpy(ivw2, iv, AESCTR_NONCEBYTES);
+    inc1_be(ivw2 + 3);
+
+
+    while (outlen > 2*AES_BLOCKBYTES) {
+        aes128_encrypt_ffs(out, out+AES_BLOCKBYTES, (uint8_t *)ivw1, (uint8_t *)ivw2, ctx->sk_exp);
+        inc2_be(ivw1 + 3);
+        inc2_be(ivw2 + 3);
+        out += AES_BLOCKBYTES*2;
+        outlen -= AES_BLOCKBYTES*2;
+    }
+    if (outlen >= AES_BLOCKBYTES) {
+        /* 16..32 bytes left: first block directly, the rest via buf2 */
+        aes128_encrypt_ffs(out, buf2, (uint8_t *)ivw1, (uint8_t *)ivw2, ctx->sk_exp);
+        out += AES_BLOCKBYTES;
+        outlen -= AES_BLOCKBYTES;
+        for (i = 0; i < outlen; i++) {
+            out[i] = buf2[i];
+        }
+    } else if (outlen > 0) {
+        /* fewer than 16 bytes left: only part of the first block is used */
+        aes128_encrypt_ffs(buf1, buf2, (uint8_t *)ivw1, (uint8_t *)ivw2, ctx->sk_exp);
+        for (i = 0; i < outlen; i++) {
+            out[i] = buf1[i];
+        }
+    }
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+/* No-op: the context owns no resources. */
+void aes128_ctx_release(aes128ctx *r){
+    (void) r;
+}
+
+
+/* Run the AES-256 key schedule on `key` and store the result in r->sk_exp. */
+void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+
+    aes256_keyschedule_ffs(r->sk_exp, key);
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+
+/* AES-256 counterpart of aes128_ecb(): pairs of blocks per call, odd
+ * final block paired with a zeroed dummy whose ciphertext is discarded. */
+void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+
+    uint8_t buf0[AES_BLOCKBYTES];
+    /* Zeroed dummy plaintext, see aes128_ecb(). */
+    uint8_t buf1[AES_BLOCKBYTES] = {0};
+
+    while(nblocks > 0){
+        if(nblocks >= 2){
+            aes256_encrypt_ffs(out, out+AES_BLOCKBYTES, in, in+AES_BLOCKBYTES, ctx->sk_exp);
+            out += 2*AES_BLOCKBYTES;
+            in += 2*AES_BLOCKBYTES;
+            nblocks -= 2;
+        } else {
+            aes256_encrypt_ffs(out, buf0, in, buf1, ctx->sk_exp);
+            nblocks--;
+        }
+    }
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+
+/* Identical to aes256_ecb_keyexp(): both modes use the same key schedule. */
+void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    aes256_keyschedule_ffs(r->sk_exp, key);
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+
+/* AES-256 counterpart of aes128_ctr(); same counter layout and tail
+ * handling. */
+void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv,
+                const aes256ctx *ctx){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    uint32_t ivw1[4] = {0};
+    uint32_t ivw2[4] = {0};
+    uint8_t buf1[AES_BLOCKBYTES];
+    uint8_t buf2[AES_BLOCKBYTES];
+    size_t i;
+
+    memcpy(ivw1, iv, AESCTR_NONCEBYTES);
+    memcpy(ivw2, iv, AESCTR_NONCEBYTES);
+    inc1_be(ivw2 + 3);
+
+
+    while (outlen > 2*AES_BLOCKBYTES) {
+        aes256_encrypt_ffs(out, out+AES_BLOCKBYTES, (uint8_t *)ivw1, (uint8_t *)ivw2, ctx->sk_exp);
+        inc2_be(ivw1 + 3);
+        inc2_be(ivw2 + 3);
+        out += AES_BLOCKBYTES*2;
+        outlen -= AES_BLOCKBYTES*2;
+    }
+    if (outlen >= AES_BLOCKBYTES) {
+        /* 16..32 bytes left: first block directly, the rest via buf2 */
+        aes256_encrypt_ffs(out, buf2, (uint8_t *)ivw1, (uint8_t *)ivw2, ctx->sk_exp);
+        out += AES_BLOCKBYTES;
+        outlen -= AES_BLOCKBYTES;
+        for (i = 0; i < outlen; i++) {
+            out[i] = buf2[i];
+        }
+    } else if (outlen > 0) {
+        /* fewer than 16 bytes left: only part of the first block is used */
+        aes256_encrypt_ffs(buf1, buf2, (uint8_t *)ivw1, (uint8_t *)ivw2, ctx->sk_exp);
+        for (i = 0; i < outlen; i++) {
+            out[i] = buf1[i];
+        }
+    }
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+
+/* No-op: the context owns no resources. */
+void aes256_ctx_release(aes256ctx *r){
+    (void) r;
+}
\ No newline at end of file
diff --git a/common/aes.h b/common/aes.h
new file mode 100644
index 0000000..49e9433
--- /dev/null
+++ b/common/aes.h
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+#ifndef AES_H
+#define AES_H
+
+#include
+#include
+
+/* Sizes in bytes. */
+#define AES128_KEYBYTES 16
+#define AES192_KEYBYTES 24
+#define AES256_KEYBYTES 32
+#define AESCTR_NONCEBYTES 12
+#define AES_BLOCKBYTES 16
+
+/* Expanded AES-128 key: 2 * 11 blocks of AES_BLOCKBYTES, held as 32-bit words. */
+typedef struct {
+    uint32_t sk_exp[2*11*AES_BLOCKBYTES/sizeof(uint32_t)];
+} aes128ctx;
+
+
+/* Expanded AES-256 key: 2 * 15 blocks of AES_BLOCKBYTES, held as 32-bit words. */
+typedef struct {
+    uint32_t sk_exp[2*15*AES_BLOCKBYTES/sizeof(uint32_t)];
+} aes256ctx;
+
+
+
+/** Initializes the context **/
+void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key);
+
+void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key);
+
+void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx);
+
+void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx);
+
+void aes128_ctx_release(aes128ctx *r);
+
+
+/** Initializes the context **/
+void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key);
+
+void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key);
+
+void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx);
+
+void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx);
+
+void aes256_ctx_release(aes256ctx *r);
+
+
+#endif
diff --git a/common/aestest.c b/common/aestest.c
new file mode 100644
index 0000000..bfce91f
--- /dev/null
+++ b/common/aestest.c
@@ -0,0 +1,185 @@
+#include "randombytes.h"
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+#include
+#include
+
+/* ECB plaintext: 47 characters plus the terminating NUL = 3 blocks. */
+const unsigned char msg[48] = "The quick brown fox jumps over the lazy dog!!!!";
+
+/* Expected CTR output for `key`/`nonce` below (67 bytes = 4 blocks + 3,
+ * so the partial-block tail path is exercised). */
+const unsigned char stream128[67] = {
+    0x36, 0x88, 0x7b, 0x28, 0x99, 0x8f, 0x4d, 0x2b, 0x37, 0xff, 0x06, 0x63, 0xfc, 0x5c, 0xef, 0x2f,
+    0x43, 0xeb, 0xeb, 0x7e, 0xc1, 0x58, 0xe4, 0xb4, 0x27, 0x78, 0x2e, 0xa7, 0x90, 0x00, 0x09, 0x91,
+    0xcd, 0x0e, 0x40, 0x18, 0x8b, 0x2b, 0x52, 0x1e, 0x8d, 0xfb, 0x0e, 0x7b, 0x80, 0xe7, 0xb6, 0xd4,
+    0xba, 0x48, 0xf5, 0x19, 0xb4, 0xbf, 0xbf, 0xb1, 0x52, 0x6f, 0x12, 0xe2, 0x7b, 0x61, 0x90, 0x0d,
+    0x95, 0x83, 0x84};
+
+/* NOTE(review): stream192 and ct192 are not referenced by test() below. */
+const unsigned char stream192[67] = {
+    0xb0, 0x13, 0x08, 0x01, 0xfa, 0x88, 0x28, 0x98, 0xac, 0x77, 0x72, 0xaf, 0x2a, 0x8d, 0x17, 0xf3,
+    0xba, 0x37, 0xd6, 0x8e, 0x01, 0x5b, 0x25, 0xb5, 0x51, 0x40, 0xf9, 0x55, 0x73, 0x89, 0xbe, 0xd3,
+    0x5f, 0x3c, 0xde, 0xc2, 0x6b, 0xf6, 0xfc, 0x26, 0x97, 0x22, 0xfa, 0x8e, 0xcb, 0x62, 0xe8, 0xa5,
+    0x03, 0xd2, 0x8a, 0x7e, 0xaf, 0xe3, 0xe5, 0x63, 0x2e, 0xb8, 0x48, 0x20, 0x1c, 0x48, 0xd5, 0xc2,
+    0x27, 0x1e, 0x1e};
+
+const unsigned char stream256[67] = {
+    0x12, 0x39, 0x28, 0xd0, 0xda, 0xd1, 0xfd, 0xe7, 0x64, 0x74, 0x10, 0x5a, 0x29, 0x17, 0x3c, 0x62,
+    0x05, 0xde, 0x4c, 0x98, 0x22, 0x9b, 0xad, 0x16, 0x32, 0x75, 0xbf, 0x26, 0xe5, 0x84, 0x7e, 0xc8,
+    0x4e, 0x73, 0x68, 0xce, 0x9a, 0x11, 0xb6, 0x55, 0x53, 0x05, 0x39, 0xa1, 0xa7, 0x1f, 0x16, 0x55,
+    0x4a, 0xd3, 0x6c, 0xc6, 0x2c, 0xb4, 0x55, 0x9f, 0x5f, 0xa3, 0xe8, 0x39, 0xfa, 0x9d, 0x96, 0xb6,
+    0xb7, 0xc9, 0xc5};
+
+/* Expected ECB ciphertexts of `msg` under `key`. */
+const unsigned char ct128[48] = {
+0x10, 0xdc, 0x43, 0x2b, 0x15, 0x11, 0x81, 0x36, 0x3f, 0x00, 0x51, 0x74, 0x81, 0x7c, 0x22, 0x87,
+0x3a, 0x3b, 0xfe, 0xd7, 0xb9, 0xa6, 0xf2, 0x3c, 0x81, 0x00, 0x63, 0xef, 0xe5, 0xb8, 0xbd, 0x36,
+0x11, 0xcc, 0xc9, 0xdf, 0x2b, 0xea, 0xbc, 0xe6, 0x11, 0x1c, 0x34, 0x79, 0xf9, 0x6b, 0x47, 0x7b};
+
+const unsigned char ct192[48] = {
+0x63, 0xc6, 0xde, 0x28, 0x36, 0xb4, 0x29, 0xbf, 0xbe, 0x9d, 0x15, 0x8e, 0x83, 0x04, 0xa3, 0x18,
+0x34, 0x79, 0xe8, 0x02, 0x8a, 0x34, 0x50, 0x7c, 0xa9, 0x08, 0x48, 0x47, 0xee, 0x90, 0x79, 0x13,
+0x66, 0x2d, 0xa4, 0xf1, 0x3e, 0x8b, 0x76, 0xa9, 0x50, 0xf7, 0x6e, 0xa8, 0xbf, 0x29, 0xaf, 0x84};
+
+const unsigned char ct256[48] = {
+0xdb, 0xb5, 0x44, 0x70, 0x68, 0xe6, 0xad, 0x6a, 0x09, 0xdf, 0xa6, 0xef, 0x85, 0x73, 0xff, 0xc0,
+0xc2, 0x91, 0x38, 0xbd, 0xd7, 0xd0, 0x22, 0x7e, 0x79, 0x71, 0xa1, 0x98, 0x6f, 0xd5, 0x80, 0xa8,
+0x1e, 0x97, 0xd7, 0x6d, 0xd2, 0x6b, 0x0e, 0x7b, 0x79, 0x76, 0x75, 0x86, 0xa5, 0x2f, 0x76, 0x0b};
+
+/* AES256_KEYBYTES bytes of key material; the AES-128 calls are handed
+ * the same array. */
+const unsigned char key[AES256_KEYBYTES] = {
+    0x66, 0xd9, 0xb7, 0x60, 0x0e, 0xda, 0xaa, 0x81, 0x42, 0xa2, 0xd6, 0x3d, 0x8f, 0x51, 0x6c, 0x6f,
+    0xb6, 0xdf, 0x5b, 0x97, 0xf3, 0xf1, 0xf7, 0x0e, 0xeb, 0xe0, 0x40, 0x4d, 0xc5, 0x24, 0xa1, 0xfa};
+const unsigned char nonce[AESCTR_NONCEBYTES] = {
+    0x9d, 0x2d, 0x3e, 0x6e, 0x48, 0x5c, 0xf6, 0x6b, 0xb2, 0xb9, 0x25, 0xf4};
+
+/* Known-answer test for AES-128/256 in CTR and ECB mode.
+ * Returns 0 when every output matches its vector, 1 otherwise; each
+ * mismatch is also reported through hal_send_str(). */
+static int test(void)
+{
+    unsigned char ct[67];
+    int r = 0;
+    aes128ctx ctx128_ecb, ctx128_ctr;
+    aes256ctx ctx256_ecb, ctx256_ctr;
+
+    aes128_ecb_keyexp(&ctx128_ecb, key);
+    aes256_ecb_keyexp(&ctx256_ecb, key);
+    aes128_ctr_keyexp(&ctx128_ctr, key);
+    aes256_ctr_keyexp(&ctx256_ctr, key);
+
+    aes128_ctr(ct, 67, nonce, &ctx128_ctr);
+    if(memcmp(ct, stream128, 67)) {
+        hal_send_str("ERROR AES128CTR output does not match test vector.\n");
+        r = 1;
+    }
+
+    aes256_ctr(ct, 67, nonce, &ctx256_ctr);
+    if(memcmp(ct, stream256, 67)) {
+        hal_send_str("ERROR AES256CTR output does not match test vector.\n");
+        r = 1;
+    }
+
+
+    aes128_ecb(ct, msg, sizeof(msg) / AES_BLOCKBYTES, &ctx128_ecb);
+    if(memcmp(ct, ct128, 48)) {
+        hal_send_str("ERROR AES128ECB output does not match test vector.\n");
+        r = 1;
+    }
+
+
+    aes256_ecb(ct, msg, sizeof(msg) / AES_BLOCKBYTES, &ctx256_ecb);
+    if(memcmp(ct, ct256, 48)) {
+        hal_send_str("ERROR AES256ECB output does not match test vector.\n");
+        r = 1;
+    }
+
+    aes128_ctx_release(&ctx128_ecb);
+    aes256_ctx_release(&ctx256_ecb);
+    aes128_ctx_release(&ctx128_ctr);
+    aes256_ctx_release(&ctx256_ctr);
+
+    return r;
+}
+
+
+/* Time the four key expansions, then ECB and CTR for input sizes that
+ * double from one block up to sizeof pt, and report each measurement as
+ * one hal_send_str() line. */
+static void bench(void)
+{
+    char str[100];
+    unsigned char ct[1024*32];
+    /* Zeroed so the ECB benchmarks never read indeterminate stack bytes. */
+    unsigned char pt[1024*32] = {0};
+    uint64_t t0, t1;
+    aes128ctx ctx128_ecb, ctx128_ctr;
+    aes256ctx ctx256_ecb, ctx256_ctr;
+    hal_send_str("-");
+    /* All messages are formatted with snprintf() bounded by sizeof str:
+     * the long per-size lines can exceed 100 characters for extreme
+     * counter values. Arguments are cast to the exact types the
+     * conversion specifiers expect. */
+    t0 = hal_get_time();
+    aes128_ecb_keyexp(&ctx128_ecb, key);
+    t1 = hal_get_time();
+    snprintf(str, sizeof str, "aes128_ecb_keyexp: %llu cycles", (unsigned long long)(t1-t0));
+    hal_send_str(str);
+
+    t0 = hal_get_time();
+    aes256_ecb_keyexp(&ctx256_ecb, key);
+    t1 = hal_get_time();
+    snprintf(str, sizeof str, "aes256_ecb_keyexp: %llu cycles", (unsigned long long)(t1-t0));
+    hal_send_str(str);
+
+    t0 = hal_get_time();
+    aes128_ctr_keyexp(&ctx128_ctr, key);
+    t1 = hal_get_time();
+    snprintf(str, sizeof str, "aes128_ctr_keyexp: %llu cycles", (unsigned long long)(t1-t0));
+    hal_send_str(str);
+
+    t0 = hal_get_time();
+    aes256_ctr_keyexp(&ctx256_ctr, key);
+    t1 = hal_get_time();
+    snprintf(str, sizeof str, "aes256_ctr_keyexp: %llu cycles", (unsigned long long)(t1-t0));
+    hal_send_str(str);
+
+    hal_send_str("-");
+    for(size_t blocks=1; blocks <= sizeof pt/16; blocks <<= 1){
+        t0 = hal_get_time();
+        aes128_ecb(ct, pt, blocks, &ctx128_ecb);
+        t1 = hal_get_time();
+        snprintf(str, sizeof str, "aes128_ecb: %llu cycles for %u bytes (%.2f cycles/byte) -- w/o key expansion", (unsigned long long)(t1-t0), (unsigned)(blocks*AES_BLOCKBYTES), (double)(t1-t0)/(blocks*AES_BLOCKBYTES));
+        hal_send_str(str);
+    }
+    hal_send_str("-");
+    for(size_t blocks=1; blocks <= sizeof pt/16; blocks <<= 1){
+        t0 = hal_get_time();
+        aes256_ecb(ct, pt, blocks, &ctx256_ecb);
+        t1 = hal_get_time();
+        snprintf(str, sizeof str, "aes256_ecb: %llu cycles for %u bytes (%.2f cycles/byte) -- w/o key expansion", (unsigned long long)(t1-t0), (unsigned)(blocks*AES_BLOCKBYTES), (double)(t1-t0)/(blocks*AES_BLOCKBYTES));
+        hal_send_str(str);
+    }
+    hal_send_str("-");
+    for(size_t blocks=1; blocks <= sizeof pt/16; blocks <<= 1){
+        t0 = hal_get_time();
+        aes128_ctr(ct, blocks*AES_BLOCKBYTES, nonce, &ctx128_ctr);
+        t1 = hal_get_time();
+        snprintf(str, sizeof str, "aes128_ctr: %llu cycles for %u bytes (%.2f cycles/byte) -- w/o key expansion", (unsigned long long)(t1-t0), (unsigned)(blocks*AES_BLOCKBYTES), (double)(t1-t0)/(blocks*AES_BLOCKBYTES));
+        hal_send_str(str);
+    }
+    hal_send_str("-");
+    for(size_t blocks=1; blocks <= sizeof pt/16; blocks <<= 1){
+        t0 = hal_get_time();
+        aes256_ctr(ct, blocks*AES_BLOCKBYTES, nonce, &ctx256_ctr);
+        t1 = hal_get_time();
+        snprintf(str, sizeof str, "aes256_ctr: %llu cycles for %u bytes (%.2f cycles/byte) -- w/o key expansion", (unsigned long long)(t1-t0), (unsigned)(blocks*AES_BLOCKBYTES), (double)(t1-t0)/(blocks*AES_BLOCKBYTES));
+        hal_send_str(str);
+    }
+}
+
+
+/* Entry point: run the known-answer tests, then the benchmarks, framing
+ * the output with the "===" and "###" marker lines. */
+int main(void)
+{
+    hal_setup(CLOCK_BENCHMARK);
+    hal_send_str("===");
+    if(test()){
+        hal_send_str("ERR");
+    } else {
+        hal_send_str("ALL GOOD!");
+    }
+    bench();
+    hal_send_str("###");
+    return 0;
+}
\ No newline at end of file
diff --git a/common/crypto_hashblocks_sha512.c b/common/crypto_hashblocks_sha512.c
new file mode 100644
index 0000000..b1c4664
--- /dev/null
+++ b/common/crypto_hashblocks_sha512.c
@@ -0,0 +1,101 @@
+#include "crypto_hashblocks_sha512.h"
+#include
+
+/* Table of 80 64-bit constants handed to the assembly inner routine. */
+static const uint64_t constants[80] = {
+  0x428a2f98d728ae22ULL
+, 0x7137449123ef65cdULL
+, 0xb5c0fbcfec4d3b2fULL
+, 0xe9b5dba58189dbbcULL
+, 0x3956c25bf348b538ULL
+, 0x59f111f1b605d019ULL
+, 0x923f82a4af194f9bULL
+, 0xab1c5ed5da6d8118ULL
+, 0xd807aa98a3030242ULL
+, 0x12835b0145706fbeULL
+, 0x243185be4ee4b28cULL
+, 0x550c7dc3d5ffb4e2ULL
+, 0x72be5d74f27b896fULL
+, 0x80deb1fe3b1696b1ULL
+, 0x9bdc06a725c71235ULL
+, 0xc19bf174cf692694ULL
+, 0xe49b69c19ef14ad2ULL
+, 0xefbe4786384f25e3ULL
+, 0x0fc19dc68b8cd5b5ULL
+, 0x240ca1cc77ac9c65ULL
+, 0x2de92c6f592b0275ULL
+, 0x4a7484aa6ea6e483ULL
+, 0x5cb0a9dcbd41fbd4ULL
+, 0x76f988da831153b5ULL
+, 0x983e5152ee66dfabULL
+, 0xa831c66d2db43210ULL
+, 0xb00327c898fb213fULL
+, 0xbf597fc7beef0ee4ULL
+, 0xc6e00bf33da88fc2ULL
+, 0xd5a79147930aa725ULL
+, 0x06ca6351e003826fULL
+, 0x142929670a0e6e70ULL
+, 0x27b70a8546d22ffcULL
+, 0x2e1b21385c26c926ULL
+, 0x4d2c6dfc5ac42aedULL
+, 0x53380d139d95b3dfULL
+, 0x650a73548baf63deULL
+, 0x766a0abb3c77b2a8ULL
+, 0x81c2c92e47edaee6ULL
+, 0x92722c851482353bULL
+, 0xa2bfe8a14cf10364ULL
+, 0xa81a664bbc423001ULL
+, 0xc24b8b70d0f89791ULL
+, 0xc76c51a30654be30ULL
+, 0xd192e819d6ef5218ULL
+, 0xd69906245565a910ULL
+, 0xf40e35855771202aULL
+, 0x106aa07032bbd1b8ULL
+, 0x19a4c116b8d2d0c8ULL
+, 0x1e376c085141ab53ULL
+, 0x2748774cdf8eeb99ULL
+, 0x34b0bcb5e19b48a8ULL
+, 0x391c0cb3c5c95a63ULL
+, 0x4ed8aa4ae3418acbULL
+, 0x5b9cca4f7763e373ULL
+, 0x682e6ff3d6b2b8a3ULL
+, 0x748f82ee5defb2fcULL
+, 0x78a5636f43172f60ULL
+, 0x84c87814a1f0ab72ULL
+, 0x8cc702081a6439ecULL
+, 0x90befffa23631e28ULL
+, 0xa4506cebde82bde9ULL
+, 0xbef9a3f7b2c67915ULL
+, 0xc67178f2e372532bULL
+, 0xca273eceea26619cULL
+, 0xd186b8c721c0c207ULL
+, 0xeada7dd6cde0eb1eULL
+, 0xf57d4f7fee6ed178ULL
+, 0x06f067aa72176fbaULL
+, 0x0a637dc5a2c898a6ULL
+, 0x113f9804bef90daeULL
+, 0x1b710b35131c471bULL
+, 0x28db77f523047d84ULL
+, 0x32caab7b40c72493ULL
+, 0x3c9ebe0a15c9bebcULL
+, 0x431d67c49c100d4cULL
+, 0x4cc5d4becb3e42b6ULL
+, 0x597f299cfc657e2aULL
+, 0x5fcb6fab3ad6faecULL
+, 0x6c44198c4a475817ULL
+};
+
+
+#define CUTOFF 32768 /* must be multiple of 128 */
+
+extern int crypto_hashblocks_sha512_m4nofpu_inner(unsigned char *,const unsigned char *,unsigned int,const uint64_t *);
+
+/* Feed `inlen` bytes at `in` to the assembly inner routine, at most
+ * CUTOFF bytes per call (its length parameter is an unsigned int).
+ * If fewer than 128 bytes remain after the full CUTOFF chunks, that
+ * count is returned without another call; otherwise the result of the
+ * last inner call is returned. */
+int crypto_hashblocks_sha512(unsigned char *statebytes,const unsigned char *in,unsigned long long inlen)
+{
+  while (inlen >= CUTOFF) {
+    crypto_hashblocks_sha512_m4nofpu_inner(statebytes,in,CUTOFF,constants); /* returns 0 */
+    in += CUTOFF;
+    inlen -= CUTOFF;
+  }
+  if (inlen < 128) return inlen;
+  return
crypto_hashblocks_sha512_m4nofpu_inner(statebytes,in,inlen,constants); +} diff --git a/common/crypto_hashblocks_sha512_inner32.s b/common/crypto_hashblocks_sha512_inner32.s new file mode 100644 index 0000000..a900501 --- /dev/null +++ b/common/crypto_hashblocks_sha512_inner32.s @@ -0,0 +1,6593 @@ + +# qhasm: int32 input_0 + +# qhasm: int32 input_1 + +# qhasm: int32 input_2 + +# qhasm: int32 input_3 + +# qhasm: stack32 input_4 + +# qhasm: stack32 input_5 + +# qhasm: stack32 input_6 + +# qhasm: stack32 input_7 + +# qhasm: int32 caller_r4 + +# qhasm: int32 caller_r5 + +# qhasm: int32 caller_r6 + +# qhasm: int32 caller_r7 + +# qhasm: int32 caller_r8 + +# qhasm: int32 caller_r9 + +# qhasm: int32 caller_r10 + +# qhasm: int32 caller_r11 + +# qhasm: int32 caller_r12 + +# qhasm: int32 caller_r14 + +# qhasm: startcode +.text +.arch armv7 +.fpu fpv4-sp-d16 +.syntax unified + +# qhasm: int32 two13 + +# qhasm: int32 two23 + +# qhasm: int32 two24 + +# qhasm: int32 two25 + +# qhasm: int32 lotmp + +# qhasm: int32 lotmp2 + +# qhasm: int32 hitmp + +# qhasm: int32 hitmp2 + +# qhasm: int32 lou0 + +# qhasm: int32 lou1 + +# qhasm: int32 lou2 + +# qhasm: int32 lou3 + +# qhasm: int32 lou4 + +# qhasm: int32 lou5 + +# qhasm: int32 hiu0 + +# qhasm: int32 hiu1 + +# qhasm: int32 hiu2 + +# qhasm: int32 hiu3 + +# qhasm: int32 hiu4 + +# qhasm: int32 hiu5 + +# qhasm: stack32 hid0 + +# qhasm: stack32 hid1 + +# qhasm: stack32 hid2 + +# qhasm: stack32 hid3 + +# qhasm: stack32 hid4 + +# qhasm: stack32 hid5 + +# qhasm: stack32 hid6 + +# qhasm: stack32 hid7 + +# qhasm: stack32 hid8 + +# qhasm: stack32 hid9 + +# qhasm: stack32 hid10 + +# qhasm: stack32 hid11 + +# qhasm: stack32 hid12 + +# qhasm: stack32 hid13 + +# qhasm: stack32 hid14 + +# qhasm: stack32 hid15 + +# qhasm: stack32 lod0 + +# qhasm: stack32 lod1 + +# qhasm: stack32 lod2 + +# qhasm: stack32 lod3 + +# qhasm: stack32 lod4 + +# qhasm: stack32 lod5 + +# qhasm: stack32 lod6 + +# qhasm: stack32 lod7 + +# qhasm: stack32 lod8 + +# qhasm: stack32 
lod9 + +# qhasm: stack32 lod10 + +# qhasm: stack32 lod11 + +# qhasm: stack32 lod12 + +# qhasm: stack32 lod13 + +# qhasm: stack32 lod14 + +# qhasm: stack32 lod15 + +# qhasm: stack32 him0 + +# qhasm: stack32 him1 + +# qhasm: stack32 him2 + +# qhasm: stack32 him3 + +# qhasm: stack32 him4 + +# qhasm: stack32 him5 + +# qhasm: stack32 him6 + +# qhasm: stack32 him7 + +# qhasm: stack32 him8 + +# qhasm: stack32 him9 + +# qhasm: stack32 him10 + +# qhasm: stack32 him11 + +# qhasm: stack32 him12 + +# qhasm: stack32 him13 + +# qhasm: stack32 him14 + +# qhasm: stack32 him15 + +# qhasm: stack32 lom0 + +# qhasm: stack32 lom1 + +# qhasm: stack32 lom2 + +# qhasm: stack32 lom3 + +# qhasm: stack32 lom4 + +# qhasm: stack32 lom5 + +# qhasm: stack32 lom6 + +# qhasm: stack32 lom7 + +# qhasm: stack32 lom8 + +# qhasm: stack32 lom9 + +# qhasm: stack32 lom10 + +# qhasm: stack32 lom11 + +# qhasm: stack32 lom12 + +# qhasm: stack32 lom13 + +# qhasm: stack32 lom14 + +# qhasm: stack32 lom15 + +# qhasm: stack32 o0 + +# qhasm: stack32 o1 + +# qhasm: stack32 o2 + +# qhasm: stack32 o3 + +# qhasm: stack32 o4 + +# qhasm: rpushenter crypto_hashblocks_sha512_m4nofpu_inner +.p2align 2 +.global crypto_hashblocks_sha512_m4nofpu_inner +.type crypto_hashblocks_sha512_m4nofpu_inner,%function +.thumb +.thumb_func +crypto_hashblocks_sha512_m4nofpu_inner: +push {r4,r5,r6,r7,r8,r9,r10,r11,r14} +sub.w sp,sp,#288 + +# qhasm: o0 = input_0 +# asm 1: str o0=stack32#1 +# asm 2: str o0=[sp,#0] +# copy-collector input: str r0,[sp,#0] + +# qhasm: o1 = input_1 +# asm 1: str o1=stack32#2 +# asm 2: str o1=[sp,#4] +# copy-collector input: str r1,[sp,#4] + +# qhasm: input_1 = input_2 - 128 +# asm 1: sub >input_1=int32#2,input_1=r1,o2=stack32#3 +# asm 2: str o2=[sp,#8] +# copy-collector input: str r1,[sp,#8] + +# qhasm: o3 = input_3 +# asm 1: str o3=stack32#4 +# asm 2: str o3=[sp,#12] +# copy-collector input: str r3,[sp,#12] + +# qhasm: hiu0 = mem32[input_0] +# asm 1: ldr 
>hiu0=int32#2,[hiu0=r1,[lou0=int32#3,[lou0=r2,[hiu1=int32#4,[hiu1=r3,[lou1=int32#5,[lou1=r4,[hiu2=int32#6,[hiu2=r5,[lou2=int32#7,[lou2=r6,[hiu3=int32#8,[hiu3=r7,[lou3=int32#9,[lou3=r8,[lou0=int32#3,lou0=r2,hiu0=int32#2,hiu0=r1,lou1=int32#5,lou1=r4,hiu1=int32#4,hiu1=r3,lou2=int32#7,lou2=r6,hiu2=int32#6,hiu2=r5,lou3=int32#9,lou3=r8,hiu3=int32#8,hiu3=r7,lom0=stack32#5 +# asm 2: str lom0=[sp,#16] +# copy-collector input: str r2,[sp,#16] + +# qhasm: him0 = hiu0 +# asm 1: str him0=stack32#6 +# asm 2: str him0=[sp,#20] +# copy-collector input: str r1,[sp,#20] + +# qhasm: lom1 = lou1 +# asm 1: str lom1=stack32#7 +# asm 2: str lom1=[sp,#24] +# copy-collector input: str r4,[sp,#24] + +# qhasm: him1 = hiu1 +# asm 1: str him1=stack32#8 +# asm 2: str him1=[sp,#28] +# copy-collector input: str r3,[sp,#28] + +# qhasm: lom2 = lou2 +# asm 1: str lom2=stack32#9 +# asm 2: str lom2=[sp,#32] +# copy-collector input: str r6,[sp,#32] + +# qhasm: him2 = hiu2 +# asm 1: str him2=stack32#10 +# asm 2: str him2=[sp,#36] +# copy-collector input: str r5,[sp,#36] + +# qhasm: lom3 = lou3 +# asm 1: str lom3=stack32#11 +# asm 2: str lom3=[sp,#40] +# copy-collector input: str r8,[sp,#40] + +# qhasm: him3 = hiu3 +# asm 1: str him3=stack32#12 +# asm 2: str him3=[sp,#44] +# copy-collector input: str r7,[sp,#44] + +# qhasm: lod0 = lou0 +# asm 1: str lod0=stack32#13 +# asm 2: str lod0=[sp,#48] +# copy-collector input: str r2,[sp,#48] + +# qhasm: hid0 = hiu0 +# asm 1: str hid0=stack32#14 +# asm 2: str hid0=[sp,#52] +# copy-collector input: str r1,[sp,#52] + +# qhasm: lod1 = lou1 +# asm 1: str lod1=stack32#15 +# asm 2: str lod1=[sp,#56] +# copy-collector input: str r4,[sp,#56] + +# qhasm: hid1 = hiu1 +# asm 1: str hid1=stack32#16 +# asm 2: str hid1=[sp,#60] +# copy-collector input: str r3,[sp,#60] + +# qhasm: lod2 = lou2 +# asm 1: str lod2=stack32#17 +# asm 2: str lod2=[sp,#64] +# copy-collector input: str r6,[sp,#64] + +# qhasm: hid2 = hiu2 +# asm 1: str hid2=stack32#18 +# asm 2: str hid2=[sp,#68] +# 
copy-collector input: str r5,[sp,#68] + +# qhasm: lod3 = lou3 +# asm 1: str lod3=stack32#19 +# asm 2: str lod3=[sp,#72] +# copy-collector input: str r8,[sp,#72] + +# qhasm: hid3 = hiu3 +# asm 1: str hid3=stack32#20 +# asm 2: str hid3=[sp,#76] +# copy-collector input: str r7,[sp,#76] + +# qhasm: hiu0 = mem32[input_0+32] +# asm 1: ldr >hiu0=int32#2,[hiu0=r1,[lou0=int32#3,[lou0=r2,[hiu1=int32#4,[hiu1=r3,[lou1=int32#5,[lou1=r4,[hiu2=int32#6,[hiu2=r5,[lou2=int32#7,[lou2=r6,[hiu3=int32#8,[hiu3=r7,[lou3=int32#1,[lou3=r0,[lou0=int32#3,lou0=r2,hiu0=int32#2,hiu0=r1,lou1=int32#5,lou1=r4,hiu1=int32#4,hiu1=r3,lou2=int32#7,lou2=r6,hiu2=int32#6,hiu2=r5,lou3=int32#1,lou3=r0,hiu3=int32#8,hiu3=r7,lom4=stack32#21 +# asm 2: str lom4=[sp,#80] +# copy-collector input: str r2,[sp,#80] + +# qhasm: him4 = hiu0 +# asm 1: str him4=stack32#22 +# asm 2: str him4=[sp,#84] +# copy-collector input: str r1,[sp,#84] + +# qhasm: lom5 = lou1 +# asm 1: str lom5=stack32#23 +# asm 2: str lom5=[sp,#88] +# copy-collector input: str r4,[sp,#88] + +# qhasm: him5 = hiu1 +# asm 1: str him5=stack32#24 +# asm 2: str him5=[sp,#92] +# copy-collector input: str r3,[sp,#92] + +# qhasm: lom6 = lou2 +# asm 1: str lom6=stack32#25 +# asm 2: str lom6=[sp,#96] +# copy-collector input: str r6,[sp,#96] + +# qhasm: him6 = hiu2 +# asm 1: str him6=stack32#26 +# asm 2: str him6=[sp,#100] +# copy-collector input: str r5,[sp,#100] + +# qhasm: lom7 = lou3 +# asm 1: str lom7=stack32#27 +# asm 2: str lom7=[sp,#104] +# copy-collector input: str r0,[sp,#104] + +# qhasm: him7 = hiu3 +# asm 1: str him7=stack32#28 +# asm 2: str him7=[sp,#108] +# copy-collector input: str r7,[sp,#108] + +# qhasm: lod4 = lou0 +# asm 1: str lod4=stack32#29 +# asm 2: str lod4=[sp,#112] +# copy-collector input: str r2,[sp,#112] + +# qhasm: hid4 = hiu0 +# asm 1: str hid4=stack32#30 +# asm 2: str hid4=[sp,#116] +# copy-collector input: str r1,[sp,#116] + +# qhasm: lod5 = lou1 +# asm 1: str lod5=stack32#31 +# asm 2: str lod5=[sp,#120] +# copy-collector input: 
str r4,[sp,#120] + +# qhasm: hid5 = hiu1 +# asm 1: str hid5=stack32#32 +# asm 2: str hid5=[sp,#124] +# copy-collector input: str r3,[sp,#124] + +# qhasm: lod6 = lou2 +# asm 1: str lod6=stack32#33 +# asm 2: str lod6=[sp,#128] +# copy-collector input: str r6,[sp,#128] + +# qhasm: hid6 = hiu2 +# asm 1: str hid6=stack32#34 +# asm 2: str hid6=[sp,#132] +# copy-collector input: str r5,[sp,#132] + +# qhasm: lod7 = lou3 +# asm 1: str lod7=stack32#35 +# asm 2: str lod7=[sp,#136] +# copy-collector input: str r0,[sp,#136] + +# qhasm: hid7 = hiu3 +# asm 1: str hid7=stack32#36 +# asm 2: str hid7=[sp,#140] +# copy-collector input: str r7,[sp,#140] + +# qhasm: mainloop: +# copy-collector output starts +strd r2,r1,[sp,#80] +strd r4,r3,[sp,#88] +strd r6,r5,[sp,#96] +strd r0,r7,[sp,#104] +strd r2,r1,[sp,#112] +strd r4,r3,[sp,#120] +strd r6,r5,[sp,#128] +strd r0,r7,[sp,#136] +# copy-collector output ends +._mainloop: + +# qhasm: input_0 = o1 +# asm 1: ldr >input_0=int32#1,input_0=r0,hiu0=int32#2,[hiu0=r1,[lou0=int32#3,[lou0=r2,[hiu1=int32#4,[hiu1=r3,[lou1=int32#5,[lou1=r4,[hiu2=int32#6,[hiu2=r5,[lou2=int32#7,[lou2=r6,[hiu3=int32#8,[hiu3=r7,[lou3=int32#9,[lou3=r8,[lou0=int32#3,lou0=r2,hiu0=int32#2,hiu0=r1,lou1=int32#5,lou1=r4,hiu1=int32#4,hiu1=r3,lou2=int32#7,lou2=r6,hiu2=int32#6,hiu2=r5,lou3=int32#9,lou3=r8,hiu3=int32#8,hiu3=r7,lod8=stack32#37 +# asm 2: str lod8=[sp,#144] +# copy-collector input: str r2,[sp,#144] + +# qhasm: hid8 = hiu0 +# asm 1: str hid8=stack32#38 +# asm 2: str hid8=[sp,#148] +# copy-collector input: str r1,[sp,#148] + +# qhasm: lod9 = lou1 +# asm 1: str lod9=stack32#39 +# asm 2: str lod9=[sp,#152] +# copy-collector input: str r4,[sp,#152] + +# qhasm: hid9 = hiu1 +# asm 1: str hid9=stack32#40 +# asm 2: str hid9=[sp,#156] +# copy-collector input: str r3,[sp,#156] + +# qhasm: lod10 = lou2 +# asm 1: str lod10=stack32#41 +# asm 2: str lod10=[sp,#160] +# copy-collector input: str r6,[sp,#160] + +# qhasm: hid10 = hiu2 +# asm 1: str hid10=stack32#42 +# asm 2: str 
hid10=[sp,#164] +# copy-collector input: str r5,[sp,#164] + +# qhasm: lod11 = lou3 +# asm 1: str lod11=stack32#43 +# asm 2: str lod11=[sp,#168] +# copy-collector input: str r8,[sp,#168] + +# qhasm: hid11 = hiu3 +# asm 1: str hid11=stack32#44 +# asm 2: str hid11=[sp,#172] +# copy-collector input: str r7,[sp,#172] + +# qhasm: hiu0 = mem32[input_0]; input_0 += 4 +# asm 1: ldr >hiu0=int32#2,[hiu0=r1,[lou0=int32#3,[lou0=r2,[hiu1=int32#4,[hiu1=r3,[lou1=int32#5,[lou1=r4,[hiu2=int32#6,[hiu2=r5,[lou2=int32#7,[lou2=r6,[hiu3=int32#8,[hiu3=r7,[lou3=int32#9,[lou3=r8,[lou0=int32#3,lou0=r2,hiu0=int32#2,hiu0=r1,lou1=int32#5,lou1=r4,hiu1=int32#4,hiu1=r3,lou2=int32#7,lou2=r6,hiu2=int32#6,hiu2=r5,lou3=int32#9,lou3=r8,hiu3=int32#8,hiu3=r7,lod12=stack32#45 +# asm 2: str lod12=[sp,#176] +# copy-collector input: str r2,[sp,#176] + +# qhasm: hid12 = hiu0 +# asm 1: str hid12=stack32#46 +# asm 2: str hid12=[sp,#180] +# copy-collector input: str r1,[sp,#180] + +# qhasm: lod13 = lou1 +# asm 1: str lod13=stack32#47 +# asm 2: str lod13=[sp,#184] +# copy-collector input: str r4,[sp,#184] + +# qhasm: hid13 = hiu1 +# asm 1: str hid13=stack32#48 +# asm 2: str hid13=[sp,#188] +# copy-collector input: str r3,[sp,#188] + +# qhasm: lod14 = lou2 +# asm 1: str lod14=stack32#49 +# asm 2: str lod14=[sp,#192] +# copy-collector input: str r6,[sp,#192] + +# qhasm: hid14 = hiu2 +# asm 1: str hid14=stack32#50 +# asm 2: str hid14=[sp,#196] +# copy-collector input: str r5,[sp,#196] + +# qhasm: lod15 = lou3 +# asm 1: str lod15=stack32#51 +# asm 2: str lod15=[sp,#200] +# copy-collector input: str r8,[sp,#200] + +# qhasm: hid15 = hiu3 +# asm 1: str hid15=stack32#52 +# asm 2: str hid15=[sp,#204] +# copy-collector input: str r7,[sp,#204] + +# qhasm: hiu0 = mem32[input_0]; input_0 += 4 +# asm 1: ldr 
>hiu0=int32#2,[hiu0=r1,[lou0=int32#3,[lou0=r2,[hiu1=int32#4,[hiu1=r3,[lou1=int32#5,[lou1=r4,[hiu2=int32#6,[hiu2=r5,[lou2=int32#7,[lou2=r6,[hiu3=int32#8,[hiu3=r7,[lou3=int32#9,[lou3=r8,[lou0=int32#3,lou0=r2,hiu0=int32#2,hiu0=r1,lou1=int32#5,lou1=r4,hiu1=int32#4,hiu1=r3,lou2=int32#7,lou2=r6,hiu2=int32#6,hiu2=r5,lou3=int32#9,lou3=r8,hiu3=int32#8,hiu3=r7,lom8=stack32#53 +# asm 2: str lom8=[sp,#208] +# copy-collector input: str r2,[sp,#208] + +# qhasm: him8 = hiu0 +# asm 1: str him8=stack32#54 +# asm 2: str him8=[sp,#212] +# copy-collector input: str r1,[sp,#212] + +# qhasm: lom9 = lou1 +# asm 1: str lom9=stack32#55 +# asm 2: str lom9=[sp,#216] +# copy-collector input: str r4,[sp,#216] + +# qhasm: him9 = hiu1 +# asm 1: str him9=stack32#56 +# asm 2: str him9=[sp,#220] +# copy-collector input: str r3,[sp,#220] + +# qhasm: lom10 = lou2 +# asm 1: str lom10=stack32#57 +# asm 2: str lom10=[sp,#224] +# copy-collector input: str r6,[sp,#224] + +# qhasm: him10 = hiu2 +# asm 1: str him10=stack32#58 +# asm 2: str him10=[sp,#228] +# copy-collector input: str r5,[sp,#228] + +# qhasm: lom11 = lou3 +# asm 1: str lom11=stack32#59 +# asm 2: str lom11=[sp,#232] +# copy-collector input: str r8,[sp,#232] + +# qhasm: him11 = hiu3 +# asm 1: str him11=stack32#60 +# asm 2: str him11=[sp,#236] +# copy-collector input: str r7,[sp,#236] + +# qhasm: hiu0 = mem32[input_0]; input_0 += 4 +# asm 1: ldr >hiu0=int32#2,[hiu0=r1,[lou0=int32#3,[lou0=r2,[hiu1=int32#4,[hiu1=r3,[lou1=int32#5,[lou1=r4,[hiu2=int32#6,[hiu2=r5,[lou2=int32#7,[lou2=r6,[hiu3=int32#8,[hiu3=r7,[lou3=int32#9,[lou3=r8,[lou0=int32#3,lou0=r2,hiu0=int32#2,hiu0=r1,lou1=int32#5,lou1=r4,hiu1=int32#4,hiu1=r3,lou2=int32#7,lou2=r6,hiu2=int32#6,hiu2=r5,lou3=int32#9,lou3=r8,hiu3=int32#8,hiu3=r7,lom12=stack32#61 +# asm 2: str lom12=[sp,#240] +# copy-collector input: str r2,[sp,#240] + +# qhasm: him12 = hiu0 +# asm 1: str him12=stack32#62 +# asm 2: str him12=[sp,#244] +# copy-collector input: str r1,[sp,#244] + +# qhasm: lom13 = lou1 +# asm 1: str 
lom13=stack32#63 +# asm 2: str lom13=[sp,#248] +# copy-collector input: str r4,[sp,#248] + +# qhasm: him13 = hiu1 +# asm 1: str him13=stack32#64 +# asm 2: str him13=[sp,#252] +# copy-collector input: str r3,[sp,#252] + +# qhasm: lom14 = lou2 +# asm 1: str lom14=stack32#65 +# asm 2: str lom14=[sp,#256] +# copy-collector input: str r6,[sp,#256] + +# qhasm: him14 = hiu2 +# asm 1: str him14=stack32#66 +# asm 2: str him14=[sp,#260] +# copy-collector input: str r5,[sp,#260] + +# qhasm: lom15 = lou3 +# asm 1: str lom15=stack32#67 +# asm 2: str lom15=[sp,#264] +# copy-collector input: str r8,[sp,#264] + +# qhasm: him15 = hiu3 +# asm 1: str him15=stack32#68 +# asm 2: str him15=[sp,#268] +# copy-collector input: str r7,[sp,#268] + +# qhasm: o1 = input_0 +# asm 1: str o1=stack32#2 +# asm 2: str o1=[sp,#4] +# copy-collector input: str r0,[sp,#4] + +# qhasm: input_0 = 80 simple +# asm 1: mov >input_0=int32#1,80 +# asm 2: mov >input_0=r0,80 +# copy-collector output starts +strd r2,r1,[sp,#240] +strd r4,r3,[sp,#248] +strd r6,r5,[sp,#256] +strd r8,r7,[sp,#264] +str r0,[sp,#4] +# copy-collector output ends +mov r0,80 + +# qhasm: o4 = input_0 +# asm 1: str o4=stack32#69 +# asm 2: str o4=[sp,#272] +# copy-collector input: str r0,[sp,#272] + +# qhasm: innerloop: +# copy-collector output starts +str r0,[sp,#272] +# copy-collector output ends +._innerloop: + +# qhasm: input_0 = o3 +# asm 1: ldr >input_0=int32#1,input_0=r0,lou0=int32#2,lou0=r1,hiu0=int32#3,hiu0=r2,lou1=int32#4,lou1=r3,hiu1=int32#5,hiu1=r4,lou2=int32#6,lou2=r5,hiu2=int32#7,hiu2=r6,lou3=int32#8,lou3=r7,hiu3=int32#9,hiu3=r8,lou4=int32#10,lou4=r9,hiu4=int32#11,hiu4=r10,lou5=int32#12,lou5=r11,hiu5=int32#13,hiu5=r12,two23=int32#14,0x800000 +# asm 2: mov >two23=r14,0x800000 +# copy-collector output starts +ldr r0,[sp,#12] +ldr r1,[sp,#72] +ldr r2,[sp,#76] +ldr r3,[sp,#112] +ldr r4,[sp,#116] +ldr r5,[sp,#120] +ldr r6,[sp,#124] +ldr r7,[sp,#128] +ldr r8,[sp,#132] +ldr r9,[sp,#136] +ldr r10,[sp,#140] +ldr r11,[sp,#144] +ldr 
r12,[sp,#148] +# copy-collector output ends +mov r14,0x800000 + +# qhasm: carry? lou4 += lou5 +# asm 1: adds >lou4=int32#10,lou4=r9,hiu4=int32#11,hiu4=r10,lotmp=int32#12,[lotmp=r11,[hitmp=int32#13,[hitmp=r12,[lou4=int32#10,lou4=r9,hiu4=int32#11,hiu4=r10,lotmp=int32#13,>hitmp=int32#12,lotmp=r12,>hitmp=r11,> 18) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 14) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 18) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,> 14) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,lou4=int32#10,lou4=r9,hiu4=int32#11,hiu4=r10,lotmp=int32#6,lotmp=r5,lotmp2=int32#4,lotmp2=r3,lotmp=int32#4,lotmp=r3,lou4=int32#4,lou4=r3,hitmp=int32#6,hitmp=r5,hitmp2=int32#5,hitmp2=r4,hitmp=int32#5,hitmp=r4,hiu4=int32#5,hiu4=r4,lou0=int32#2,lou0=r1,hiu0=int32#3,hiu0=r2,lod3=stack32#4 +# asm 2: str lod3=[sp,#12] +# copy-collector input: str r1,[sp,#12] + +# qhasm: hid3 = hiu0 +# asm 1: str hid3=stack32#19 +# asm 2: str hid3=[sp,#72] +# copy-collector input: str r2,[sp,#72] + +# qhasm: lou1 = lod0 +# asm 1: ldr >lou1=int32#6,lou1=r5,hiu1=int32#7,hiu1=r6,lou2=int32#8,lou2=r7,hiu2=int32#9,hiu2=r8,lou3=int32#10,lou3=r9,hiu3=int32#11,hiu3=r10,two25=int32#12,0x2000000 +# asm 2: mov >two25=r11,0x2000000 +# copy-collector output starts +str r1,[sp,#12] +str r2,[sp,#72] +ldr r5,[sp,#48] +ldr r6,[sp,#52] +ldr r7,[sp,#56] +ldr r8,[sp,#60] +ldr r9,[sp,#64] +ldr r10,[sp,#68] +# copy-collector output ends +mov r11,0x2000000 + +# qhasm: hitmp lotmp = lou1 * two25 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 2) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 28) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 2) +# asm 1: eor >hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,> 28) +# asm 1: eor 
>hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,lou4=int32#4,lou4=r3,hiu4=int32#5,hiu4=r4,lotmp=int32#12,lotmp=r11,lotmp=int32#6,lotmp=r5,lotmp2=int32#8,lotmp2=r7,lotmp=int32#6,lotmp=r5,lou4=int32#4,lou4=r3,hitmp=int32#6,hitmp=r5,hitmp=int32#6,hitmp=r5,hitmp2=int32#7,hitmp2=r6,hitmp=int32#6,hitmp=r5,hiu4=int32#5,hiu4=r4,lod7=stack32#17 +# asm 2: str lod7=[sp,#64] +# copy-collector input: str r3,[sp,#64] + +# qhasm: hid7 = hiu4 +# asm 1: str hid7=stack32#18 +# asm 2: str hid7=[sp,#68] +# copy-collector input: str r4,[sp,#68] + +# qhasm: lou1 = lod4 +# asm 1: ldr >lou1=int32#4,lou1=r3,hiu1=int32#5,hiu1=r4,lou2=int32#6,lou2=r5,hiu2=int32#7,hiu2=r6,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lou5=int32#12,lou5=r11,hiu5=int32#13,hiu5=r12,two23=int32#14,0x800000 +# asm 2: mov >two23=r14,0x800000 +# copy-collector output starts +strd r3,r4,[sp,#64] +ldr r3,[sp,#112] +ldr r4,[sp,#116] +ldr r5,[sp,#120] +ldr r6,[sp,#124] +ldr.w r7,[sp,#128] +ldr r8,[sp,#132] +ldr r11,[sp,#152] +ldr r12,[sp,#156] +# copy-collector output ends +mov r14,0x800000 + +# qhasm: carry? 
lou4 += lou5 +# asm 1: adds >lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#12,[lotmp=r11,[hitmp=int32#13,[hitmp=r12,[lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#13,>hitmp=int32#12,lotmp=r12,>hitmp=r11,> 18) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 14) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 18) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,> 14) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#4,lotmp=r3,lotmp2=int32#2,lotmp2=r1,lotmp=int32#2,lotmp=r1,lou4=int32#2,lou4=r1,hitmp=int32#4,hitmp=r3,hitmp2=int32#3,hitmp2=r2,hitmp=int32#3,hitmp=r2,hiu4=int32#3,hiu4=r2,lou3=int32#4,lou3=r3,hiu3=int32#5,hiu3=r4,lod2=stack32#33 +# asm 2: str lod2=[sp,#128] +# copy-collector input: str r3,[sp,#128] + +# qhasm: hid2 = hiu3 +# asm 1: str hid2=stack32#34 +# asm 2: str hid2=[sp,#132] +# copy-collector input: str r4,[sp,#132] + +# qhasm: lou0 = lod7 +# asm 1: ldr >lou0=int32#6,lou0=r5,hiu0=int32#7,hiu0=r6,lou1=int32#8,lou1=r7,hiu1=int32#9,hiu1=r8,lou2=int32#10,lou2=r9,hiu2=int32#11,hiu2=r10,two25=int32#12,0x2000000 +# asm 2: mov >two25=r11,0x2000000 +# copy-collector output starts +strd r3,r4,[sp,#128] +ldr r5,[sp,#64] +ldr r6,[sp,#68] +ldr.w r7,[sp,#48] +ldr r8,[sp,#52] +ldr r9,[sp,#56] +ldr r10,[sp,#60] +# copy-collector output ends +mov r11,0x2000000 + +# qhasm: hitmp lotmp = lou0 * two25 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 2) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 28) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 2) +# asm 1: eor >hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,> 28) +# asm 1: eor 
>hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,lou4=int32#2,lou4=r1,hiu4=int32#3,hiu4=r2,lotmp=int32#12,lotmp=r11,lotmp=int32#6,lotmp=r5,lotmp2=int32#8,lotmp2=r7,lotmp=int32#6,lotmp=r5,lou4=int32#2,lou4=r1,hitmp=int32#6,hitmp=r5,hitmp=int32#6,hitmp=r5,hitmp2=int32#7,hitmp2=r6,hitmp=int32#6,hitmp=r5,hiu4=int32#3,hiu4=r2,lod6=stack32#15 +# asm 2: str lod6=[sp,#56] +# copy-collector input: str r1,[sp,#56] + +# qhasm: hid6 = hiu4 +# asm 1: str hid6=stack32#16 +# asm 2: str hid6=[sp,#60] +# copy-collector input: str r2,[sp,#60] + +# qhasm: lou0 = lod3 +# asm 1: ldr >lou0=int32#2,lou0=r1,hiu0=int32#3,hiu0=r2,lou1=int32#6,lou1=r5,hiu1=int32#7,hiu1=r6,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lou5=int32#12,lou5=r11,hiu5=int32#13,hiu5=r12,two23=int32#14,0x800000 +# asm 2: mov >two23=r14,0x800000 +# copy-collector output starts +strd r1,r2,[sp,#56] +ldr r1,[sp,#12] +ldr r2,[sp,#72] +ldr r5,[sp,#112] +ldr r6,[sp,#116] +ldr.w r7,[sp,#120] +ldr r8,[sp,#124] +ldr r11,[sp,#160] +ldr r12,[sp,#164] +# copy-collector output ends +mov r14,0x800000 + +# qhasm: carry? 
lou4 += lou5 +# asm 1: adds >lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#12,[lotmp=r11,[hitmp=int32#13,[hitmp=r12,[lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#13,>hitmp=int32#12,lotmp=r12,>hitmp=r11,> 18) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 14) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 18) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,> 14) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#2,lotmp=r1,lotmp2=int32#4,lotmp2=r3,lotmp=int32#2,lotmp=r1,lou4=int32#2,lou4=r1,hitmp=int32#3,hitmp=r2,hitmp2=int32#4,hitmp2=r3,hitmp=int32#3,hitmp=r2,hiu4=int32#3,hiu4=r2,lou2=int32#4,lou2=r3,hiu2=int32#5,hiu2=r4,lod1=stack32#31 +# asm 2: str lod1=[sp,#120] +# copy-collector input: str r3,[sp,#120] + +# qhasm: hid1 = hiu2 +# asm 1: str hid1=stack32#32 +# asm 2: str hid1=[sp,#124] +# copy-collector input: str r4,[sp,#124] + +# qhasm: lou0 = lod6 +# asm 1: ldr >lou0=int32#6,lou0=r5,hiu0=int32#7,hiu0=r6,lou1=int32#8,lou1=r7,hiu1=int32#9,hiu1=r8,lou3=int32#10,lou3=r9,hiu3=int32#11,hiu3=r10,two25=int32#12,0x2000000 +# asm 2: mov >two25=r11,0x2000000 +# copy-collector output starts +strd r3,r4,[sp,#120] +ldr r5,[sp,#56] +ldr r6,[sp,#60] +ldr.w r7,[sp,#64] +ldr r8,[sp,#68] +ldr r9,[sp,#48] +ldr r10,[sp,#52] +# copy-collector output ends +mov r11,0x2000000 + +# qhasm: hitmp lotmp = lou0 * two25 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 2) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 28) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 2) +# asm 1: eor >hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,> 28) +# asm 1: eor 
>hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,lou4=int32#2,lou4=r1,hiu4=int32#3,hiu4=r2,lotmp=int32#12,lotmp=r11,lotmp=int32#6,lotmp=r5,lotmp2=int32#8,lotmp2=r7,lotmp=int32#6,lotmp=r5,lou4=int32#2,lou4=r1,hitmp=int32#6,hitmp=r5,hitmp=int32#6,hitmp=r5,hitmp2=int32#7,hitmp2=r6,hitmp=int32#6,hitmp=r5,hiu4=int32#3,hiu4=r2,lod5=stack32#13 +# asm 2: str lod5=[sp,#48] +# copy-collector input: str r1,[sp,#48] + +# qhasm: hid5 = hiu4 +# asm 1: str hid5=stack32#14 +# asm 2: str hid5=[sp,#52] +# copy-collector input: str r2,[sp,#52] + +# qhasm: lou0 = lod2 +# asm 1: ldr >lou0=int32#2,lou0=r1,hiu0=int32#3,hiu0=r2,lou1=int32#6,lou1=r5,hiu1=int32#7,hiu1=r6,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lou5=int32#12,lou5=r11,hiu5=int32#13,hiu5=r12,two23=int32#14,0x800000 +# asm 2: mov >two23=r14,0x800000 +# copy-collector output starts +strd r1,r2,[sp,#48] +ldr r1,[sp,#128] +ldr r2,[sp,#132] +ldr r5,[sp,#12] +ldr r6,[sp,#72] +ldr.w r7,[sp,#112] +ldr r8,[sp,#116] +ldr r11,[sp,#168] +ldr r12,[sp,#172] +# copy-collector output ends +mov r14,0x800000 + +# qhasm: carry? 
lou4 += lou5 +# asm 1: adds >lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#12,[lotmp=r11,[hitmp=int32#13,[hitmp=r12,[lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#13,>hitmp=int32#12,lotmp=r12,>hitmp=r11,> 18) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 14) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 18) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,> 14) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#2,lotmp=r1,lotmp2=int32#4,lotmp2=r3,lotmp=int32#2,lotmp=r1,lou4=int32#2,lou4=r1,hitmp=int32#3,hitmp=r2,hitmp2=int32#4,hitmp2=r3,hitmp=int32#3,hitmp=r2,hiu4=int32#3,hiu4=r2,lou3=int32#4,lou3=r3,hiu3=int32#5,hiu3=r4,lod0=stack32#29 +# asm 2: str lod0=[sp,#112] +# copy-collector input: str r3,[sp,#112] + +# qhasm: hid0 = hiu3 +# asm 1: str hid0=stack32#30 +# asm 2: str hid0=[sp,#116] +# copy-collector input: str r4,[sp,#116] + +# qhasm: lou0 = lod5 +# asm 1: ldr >lou0=int32#6,lou0=r5,hiu0=int32#7,hiu0=r6,lou1=int32#8,lou1=r7,hiu1=int32#9,hiu1=r8,lou2=int32#10,lou2=r9,hiu2=int32#11,hiu2=r10,two25=int32#12,0x2000000 +# asm 2: mov >two25=r11,0x2000000 +# copy-collector output starts +strd r3,r4,[sp,#112] +ldr r5,[sp,#48] +ldr r6,[sp,#52] +ldr.w r7,[sp,#56] +ldr r8,[sp,#60] +ldr r9,[sp,#64] +ldr r10,[sp,#68] +# copy-collector output ends +mov r11,0x2000000 + +# qhasm: hitmp lotmp = lou0 * two25 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 2) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 28) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 2) +# asm 1: eor >hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,> 28) +# asm 1: eor 
>hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,lou4=int32#2,lou4=r1,hiu4=int32#3,hiu4=r2,lotmp=int32#12,lotmp=r11,lotmp=int32#6,lotmp=r5,lotmp2=int32#8,lotmp2=r7,lotmp=int32#6,lotmp=r5,lou4=int32#2,lou4=r1,hitmp=int32#6,hitmp=r5,hitmp=int32#6,hitmp=r5,hitmp2=int32#7,hitmp2=r6,hitmp=int32#6,hitmp=r5,hiu4=int32#3,hiu4=r2,lod4=stack32#70 +# asm 2: str lod4=[sp,#276] +# copy-collector input: str r1,[sp,#276] + +# qhasm: hid4 = hiu4 +# asm 1: str hid4=stack32#71 +# asm 2: str hid4=[sp,#280] +# copy-collector input: str r2,[sp,#280] + +# qhasm: lou0 = lod1 +# asm 1: ldr >lou0=int32#2,lou0=r1,hiu0=int32#3,hiu0=r2,lou1=int32#6,lou1=r5,hiu1=int32#7,hiu1=r6,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lou5=int32#12,lou5=r11,hiu5=int32#13,hiu5=r12,two23=int32#14,0x800000 +# asm 2: mov >two23=r14,0x800000 +# copy-collector output starts +strd r1,r2,[sp,#276] +ldr r1,[sp,#120] +ldr r2,[sp,#124] +ldr r5,[sp,#128] +ldr r6,[sp,#132] +ldr.w r7,[sp,#12] +ldr r8,[sp,#72] +ldr r11,[sp,#176] +ldr r12,[sp,#180] +# copy-collector output ends +mov r14,0x800000 + +# qhasm: carry? 
lou4 += lou5 +# asm 1: adds >lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#12,[lotmp=r11,[hitmp=int32#13,[hitmp=r12,[lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#13,>hitmp=int32#12,lotmp=r12,>hitmp=r11,> 18) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 14) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 18) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,> 14) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#2,lotmp=r1,lotmp2=int32#4,lotmp2=r3,lotmp=int32#2,lotmp=r1,lou4=int32#2,lou4=r1,hitmp=int32#3,hitmp=r2,hitmp2=int32#4,hitmp2=r3,hitmp=int32#3,hitmp=r2,hiu4=int32#3,hiu4=r2,lou2=int32#4,lou2=r3,hiu2=int32#5,hiu2=r4,lod7=stack32#35 +# asm 2: str lod7=[sp,#136] +# copy-collector input: str r3,[sp,#136] + +# qhasm: hid7 = hiu2 +# asm 1: str hid7=stack32#36 +# asm 2: str hid7=[sp,#140] +# copy-collector input: str r4,[sp,#140] + +# qhasm: lou0 = lod4 +# asm 1: ldr >lou0=int32#6,lou0=r5,hiu0=int32#7,hiu0=r6,lou1=int32#8,lou1=r7,hiu1=int32#9,hiu1=r8,lou3=int32#10,lou3=r9,hiu3=int32#11,hiu3=r10,two25=int32#12,0x2000000 +# asm 2: mov >two25=r11,0x2000000 +# copy-collector output starts +strd r3,r4,[sp,#136] +ldr r5,[sp,#276] +ldr r6,[sp,#280] +ldr.w r7,[sp,#48] +ldr r8,[sp,#52] +ldr r9,[sp,#56] +ldr r10,[sp,#60] +# copy-collector output ends +mov r11,0x2000000 + +# qhasm: hitmp lotmp = lou0 * two25 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 2) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 28) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 2) +# asm 1: eor >hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,> 28) +# asm 1: eor 
>hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,lou4=int32#2,lou4=r1,hiu4=int32#3,hiu4=r2,lotmp=int32#12,lotmp=r11,lotmp=int32#6,lotmp=r5,lotmp2=int32#8,lotmp2=r7,lotmp=int32#6,lotmp=r5,lou4=int32#2,lou4=r1,hitmp=int32#6,hitmp=r5,hitmp=int32#6,hitmp=r5,hitmp2=int32#7,hitmp2=r6,hitmp=int32#6,hitmp=r5,hiu4=int32#3,hiu4=r2,lod3=stack32#19 +# asm 2: str lod3=[sp,#72] +# copy-collector input: str r1,[sp,#72] + +# qhasm: hid3 = hiu4 +# asm 1: str hid3=stack32#20 +# asm 2: str hid3=[sp,#76] +# copy-collector input: str r2,[sp,#76] + +# qhasm: lou0 = lod0 +# asm 1: ldr >lou0=int32#2,lou0=r1,hiu0=int32#3,hiu0=r2,lou1=int32#6,lou1=r5,hiu1=int32#7,hiu1=r6,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lou5=int32#12,lou5=r11,hiu5=int32#13,hiu5=r12,two23=int32#14,0x800000 +# asm 2: mov >two23=r14,0x800000 +# copy-collector output starts +strd r1,r2,[sp,#72] +ldr r1,[sp,#112] +ldr r2,[sp,#116] +ldr r5,[sp,#120] +ldr r6,[sp,#124] +ldr.w r7,[sp,#128] +ldr r8,[sp,#132] +ldr r11,[sp,#184] +ldr r12,[sp,#188] +# copy-collector output ends +mov r14,0x800000 + +# qhasm: carry? 
lou4 += lou5 +# asm 1: adds >lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#12,[lotmp=r11,[hitmp=int32#13,[hitmp=r12,[lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#13,>hitmp=int32#12,lotmp=r12,>hitmp=r11,> 18) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 14) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 18) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,> 14) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#2,lotmp=r1,lotmp2=int32#4,lotmp2=r3,lotmp=int32#2,lotmp=r1,lou4=int32#2,lou4=r1,hitmp=int32#3,hitmp=r2,hitmp2=int32#4,hitmp2=r3,hitmp=int32#3,hitmp=r2,hiu4=int32#3,hiu4=r2,lou3=int32#4,lou3=r3,hiu3=int32#5,hiu3=r4,lod6=stack32#33 +# asm 2: str lod6=[sp,#128] +# copy-collector input: str r3,[sp,#128] + +# qhasm: hid6 = hiu3 +# asm 1: str hid6=stack32#34 +# asm 2: str hid6=[sp,#132] +# copy-collector input: str r4,[sp,#132] + +# qhasm: lou0 = lod3 +# asm 1: ldr >lou0=int32#6,lou0=r5,hiu0=int32#7,hiu0=r6,lou1=int32#8,lou1=r7,hiu1=int32#9,hiu1=r8,lou2=int32#10,lou2=r9,hiu2=int32#11,hiu2=r10,two25=int32#12,0x2000000 +# asm 2: mov >two25=r11,0x2000000 +# copy-collector output starts +strd r3,r4,[sp,#128] +ldr r5,[sp,#72] +ldr r6,[sp,#76] +ldr.w r7,[sp,#276] +ldr r8,[sp,#280] +ldr r9,[sp,#48] +ldr r10,[sp,#52] +# copy-collector output ends +mov r11,0x2000000 + +# qhasm: hitmp lotmp = lou0 * two25 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 2) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 28) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 2) +# asm 1: eor >hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,> 28) +# asm 1: eor 
>hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,lou4=int32#2,lou4=r1,hiu4=int32#3,hiu4=r2,lotmp=int32#12,lotmp=r11,lotmp=int32#6,lotmp=r5,lotmp2=int32#8,lotmp2=r7,lotmp=int32#6,lotmp=r5,lou4=int32#2,lou4=r1,hitmp=int32#6,hitmp=r5,hitmp=int32#6,hitmp=r5,hitmp2=int32#7,hitmp2=r6,hitmp=int32#6,hitmp=r5,hiu4=int32#3,hiu4=r2,lod2=stack32#17 +# asm 2: str lod2=[sp,#64] +# copy-collector input: str r1,[sp,#64] + +# qhasm: hid2 = hiu4 +# asm 1: str hid2=stack32#18 +# asm 2: str hid2=[sp,#68] +# copy-collector input: str r2,[sp,#68] + +# qhasm: lou0 = lod7 +# asm 1: ldr >lou0=int32#2,lou0=r1,hiu0=int32#3,hiu0=r2,lou1=int32#6,lou1=r5,hiu1=int32#7,hiu1=r6,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lou5=int32#12,lou5=r11,hiu5=int32#13,hiu5=r12,two23=int32#14,0x800000 +# asm 2: mov >two23=r14,0x800000 +# copy-collector output starts +strd r1,r2,[sp,#64] +ldr r1,[sp,#136] +ldr r2,[sp,#140] +ldr r5,[sp,#112] +ldr r6,[sp,#116] +ldr.w r7,[sp,#120] +ldr r8,[sp,#124] +ldr r11,[sp,#192] +ldr r12,[sp,#196] +# copy-collector output ends +mov r14,0x800000 + +# qhasm: carry? 
lou4 += lou5 +# asm 1: adds >lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#12,[lotmp=r11,[hitmp=int32#13,[hitmp=r12,[lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#13,>hitmp=int32#12,lotmp=r12,>hitmp=r11,> 18) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 14) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 18) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,> 14) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#2,lotmp=r1,lotmp2=int32#4,lotmp2=r3,lotmp=int32#2,lotmp=r1,lou4=int32#2,lou4=r1,hitmp=int32#3,hitmp=r2,hitmp2=int32#4,hitmp2=r3,hitmp=int32#3,hitmp=r2,hiu4=int32#3,hiu4=r2,lou2=int32#4,lou2=r3,hiu2=int32#5,hiu2=r4,lod5=stack32#31 +# asm 2: str lod5=[sp,#120] +# copy-collector input: str r3,[sp,#120] + +# qhasm: hid5 = hiu2 +# asm 1: str hid5=stack32#32 +# asm 2: str hid5=[sp,#124] +# copy-collector input: str r4,[sp,#124] + +# qhasm: lou0 = lod2 +# asm 1: ldr >lou0=int32#6,lou0=r5,hiu0=int32#7,hiu0=r6,lou1=int32#8,lou1=r7,hiu1=int32#9,hiu1=r8,lou3=int32#10,lou3=r9,hiu3=int32#11,hiu3=r10,two25=int32#12,0x2000000 +# asm 2: mov >two25=r11,0x2000000 +# copy-collector output starts +strd r3,r4,[sp,#120] +ldr r5,[sp,#64] +ldr r6,[sp,#68] +ldr.w r7,[sp,#72] +ldr r8,[sp,#76] +ldr r9,[sp,#276] +ldr r10,[sp,#280] +# copy-collector output ends +mov r11,0x2000000 + +# qhasm: hitmp lotmp = lou0 * two25 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 2) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 28) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 2) +# asm 1: eor >hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,> 28) +# asm 1: eor 
>hitmp=int32#13,hitmp=r12,hitmp=int32#13,hitmp=r12,lou4=int32#2,lou4=r1,hiu4=int32#3,hiu4=r2,lotmp=int32#12,lotmp=r11,lotmp=int32#6,lotmp=r5,lotmp2=int32#8,lotmp2=r7,lotmp=int32#6,lotmp=r5,lou4=int32#2,lou4=r1,hitmp=int32#6,hitmp=r5,hitmp=int32#6,hitmp=r5,hitmp2=int32#7,hitmp2=r6,hitmp=int32#6,hitmp=r5,hiu4=int32#3,hiu4=r2,lod1=stack32#15 +# asm 2: str lod1=[sp,#56] +# copy-collector input: str r1,[sp,#56] + +# qhasm: hid1 = hiu4 +# asm 1: str hid1=stack32#16 +# asm 2: str hid1=[sp,#60] +# copy-collector input: str r2,[sp,#60] + +# qhasm: lou0 = lod6 +# asm 1: ldr >lou0=int32#2,lou0=r1,hiu0=int32#3,hiu0=r2,lou1=int32#6,lou1=r5,hiu1=int32#7,hiu1=r6,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lou5=int32#12,lou5=r11,hiu5=int32#13,hiu5=r12,two23=int32#14,0x800000 +# asm 2: mov >two23=r14,0x800000 +# copy-collector output starts +strd r1,r2,[sp,#56] +ldr r1,[sp,#128] +ldr r2,[sp,#132] +ldr r5,[sp,#136] +ldr r6,[sp,#140] +ldr.w r7,[sp,#112] +ldr r8,[sp,#116] +ldr r11,[sp,#200] +ldr r12,[sp,#204] +# copy-collector output ends +mov r14,0x800000 + +# qhasm: carry? 
lou4 += lou5 +# asm 1: adds >lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#12,[lotmp=r11,[hitmp=int32#13,[hitmp=r12,[lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#13,>hitmp=int32#12,lotmp=r12,>hitmp=r11,> 18) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 14) +# asm 1: eor >lotmp=int32#13,lotmp=r12,lotmp=int32#13,lotmp=r12,> 18) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,> 14) +# asm 1: eor >hitmp=int32#12,hitmp=r11,hitmp=int32#12,hitmp=r11,lou4=int32#8,lou4=r7,hiu4=int32#9,hiu4=r8,lotmp=int32#2,lotmp=r1,lotmp2=int32#4,lotmp2=r3,lotmp=int32#2,lotmp=r1,lou4=int32#2,lou4=r1,hitmp=int32#3,hitmp=r2,hitmp2=int32#4,hitmp2=r3,hitmp=int32#3,hitmp=r2,hiu4=int32#3,hiu4=r2,lou3=int32#4,lou3=r3,hiu3=int32#5,hiu3=r4,lod4=stack32#29 +# asm 2: str lod4=[sp,#112] +# copy-collector input: str r3,[sp,#112] + +# qhasm: hid4 = hiu3 +# asm 1: str hid4=stack32#30 +# asm 2: str hid4=[sp,#116] +# copy-collector input: str r4,[sp,#116] + +# qhasm: lou0 = lod1 +# asm 1: ldr >lou0=int32#4,lou0=r3,hiu0=int32#5,hiu0=r4,lou1=int32#6,lou1=r5,hiu1=int32#7,hiu1=r6,lou2=int32#8,lou2=r7,hiu2=int32#9,hiu2=r8,two25=int32#10,0x2000000 +# asm 2: mov >two25=r9,0x2000000 +# copy-collector output starts +strd r3,r4,[sp,#112] +ldr r3,[sp,#56] +ldr r4,[sp,#60] +ldr r5,[sp,#64] +ldr r6,[sp,#68] +ldr.w r7,[sp,#72] +ldr r8,[sp,#76] +# copy-collector output ends +mov r9,0x2000000 + +# qhasm: hitmp lotmp = lou0 * two25 +# asm 1: umull >lotmp=int32#12,>hitmp=int32#11,lotmp=r11,>hitmp=r10,> 2) +# asm 1: eor >lotmp=int32#10,lotmp=r9,lotmp=int32#10,lotmp=r9,> 28) +# asm 1: eor >lotmp=int32#10,lotmp=r9,lotmp=int32#10,lotmp=r9,> 2) +# asm 1: eor >hitmp=int32#11,hitmp=r10,hitmp=int32#11,hitmp=r10,> 28) +# asm 1: eor 
>hitmp=int32#11,hitmp=r10,hitmp=int32#11,hitmp=r10,lou4=int32#2,lou4=r1,hiu4=int32#3,hiu4=r2,lotmp=int32#10,lotmp=r9,lotmp=int32#4,lotmp=r3,lotmp2=int32#6,lotmp2=r5,lotmp=int32#4,lotmp=r3,lou4=int32#2,lou4=r1,hitmp=int32#4,hitmp=r3,hitmp=int32#4,hitmp=r3,hitmp2=int32#5,hitmp2=r4,hitmp=int32#4,hitmp=r3,hiu4=int32#3,hiu4=r2,lod0=stack32#13 +# asm 2: str lod0=[sp,#48] +# copy-collector input: str r1,[sp,#48] + +# qhasm: hid0 = hiu4 +# asm 1: str hid0=stack32#14 +# asm 2: str hid0=[sp,#52] +# copy-collector input: str r2,[sp,#52] + +# qhasm: o3 = input_0 +# asm 1: str o3=stack32#4 +# asm 2: str o3=[sp,#12] +# copy-collector input: str r0,[sp,#12] + +# qhasm: input_0 = o4 +# asm 1: ldr >input_0=int32#1,input_0=r0,input_0=int32#1,input_0=r0,o4=stack32#69 +# asm 2: str o4=[sp,#272] +# copy-collector input: str r0,[sp,#272] + +# qhasm: =? input_0 - 8 +# asm 1: cmp two24=int32#1,0x1000000 +# asm 2: mov >two24=r0,0x1000000 +mov r0,0x1000000 + +# qhasm: two13 = 0x2000 simple +# asm 1: mov >two13=int32#2,0x2000 +# asm 2: mov >two13=r1,0x2000 +mov r1,0x2000 + +# qhasm: lou0 = lod8 +# asm 1: ldr >lou0=int32#3,lou0=r2,hiu0=int32#4,hiu0=r3,lou1=int32#5,lou1=r4,hiu1=int32#6,hiu1=r5,lou2=int32#7,lou2=r6,hiu2=int32#8,hiu2=r7,lou3=int32#9,lou3=r8,hiu3=int32#10,hiu3=r9,lotmp=int32#12,>hitmp=int32#11,lotmp=r11,>hitmp=r10,> 6) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 29) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 6) +# asm 1: eor >hitmp=int32#11,hitmp=r10,> 29) +# asm 1: eor >hitmp=int32#11,hitmp=r10,hitmp=int32#11,hitmp=r10,lou0=int32#3,lou0=r2,hiu0=int32#4,hiu0=r3,lotmp=int32#12,>hitmp=int32#11,lotmp=r11,>hitmp=r10,> 1) +# asm 1: eors >lotmp=int32#12,lotmp=r11,> 1) +# asm 1: eors >hitmp=int32#11,hitmp=r10,lotmp=int32#12,lotmp=r11,>7) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,>7) +# asm 1: eor 
>hitmp=int32#11,hitmp=r10,lou0=int32#3,lou0=r2,hiu0=int32#4,hiu0=r3,lou0=int32#3,lou0=r2,hiu0=int32#4,hiu0=r3,lod9=stack32#39 +# asm 2: str lod9=[sp,#152] +# copy-collector input: str r8,[sp,#152] + +# qhasm: hid9 = hiu3 +# asm 1: str hid9=stack32#40 +# asm 2: str hid9=[sp,#156] +# copy-collector input: str r9,[sp,#156] + +# qhasm: lou3 = lom15 +# asm 1: ldr >lou3=int32#9,lou3=r8,hiu3=int32#10,hiu3=r9,lom15=stack32#37 +# asm 2: str lom15=[sp,#144] +# copy-collector input: str r2,[sp,#144] + +# qhasm: him15 = hiu0 +# asm 1: str him15=stack32#38 +# asm 2: str him15=[sp,#148] +# copy-collector input: str r3,[sp,#148] + +# qhasm: hitmp lotmp = hiu3 * two13 +# asm 1: umull >lotmp=int32#12,>hitmp=int32#11,lotmp=r11,>hitmp=r10,> 6) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 29) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,> 6) +# asm 1: eor >hitmp=int32#11,hitmp=r10,> 29) +# asm 1: eor >hitmp=int32#11,hitmp=r10,hitmp=int32#11,hitmp=r10,lou1=int32#5,lou1=r4,hiu1=int32#6,hiu1=r5,lou4=int32#11,lou4=r10,hiu4=int32#12,hiu4=r11,lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 1) +# asm 1: eors >lotmp=int32#14,lotmp=r14,> 1) +# asm 1: eors >hitmp=int32#13,hitmp=r12,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >hitmp=int32#13,hitmp=r12,lou1=int32#5,lou1=r4,hiu1=int32#6,hiu1=r5,lou5=int32#13,lou5=r12,hiu5=int32#14,hiu5=r14,lou1=int32#5,lou1=r4,hiu1=int32#6,hiu1=r5,lod10=stack32#41 +# asm 2: str lod10=[sp,#160] +# copy-collector input: str r12,[sp,#160] + +# qhasm: hid10 = hiu5 +# asm 1: str hid10=stack32#42 +# asm 2: str hid10=[sp,#164] +# copy-collector input: str r14,[sp,#164] + +# qhasm: lom9 = lou1 +# asm 1: str lom9=stack32#55 +# asm 2: str lom9=[sp,#216] +# copy-collector input: str r4,[sp,#216] + +# qhasm: him9 = hiu1 +# asm 1: str him9=stack32#56 +# asm 2: str him9=[sp,#220] +# copy-collector input: str r5,[sp,#220] + +# qhasm: hitmp lotmp = hiu0 * two13 +# 
asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 6) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,> 29) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,> 6) +# asm 1: eor >hitmp=int32#13,hitmp=r12,> 29) +# asm 1: eor >hitmp=int32#3,hitmp=r2,hitmp=int32#3,hitmp=r2,lou4=int32#4,lou4=r3,hiu4=int32#3,hiu4=r2,lou0=int32#11,lou0=r10,hiu0=int32#12,hiu0=r11,lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 1) +# asm 1: eors >lotmp=int32#14,lotmp=r14,> 1) +# asm 1: eors >hitmp=int32#13,hitmp=r12,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >hitmp=int32#13,hitmp=r12,lou4=int32#4,lou4=r3,hiu4=int32#3,hiu4=r2,lou5=int32#13,lou5=r12,hiu5=int32#14,hiu5=r14,lou4=int32#4,lou4=r3,hiu4=int32#3,hiu4=r2,lod11=stack32#43 +# asm 2: str lod11=[sp,#168] +# copy-collector input: str r12,[sp,#168] + +# qhasm: hid11 = hiu5 +# asm 1: str hid11=stack32#44 +# asm 2: str hid11=[sp,#172] +# copy-collector input: str r14,[sp,#172] + +# qhasm: lom10 = lou4 +# asm 1: str lom10=stack32#57 +# asm 2: str lom10=[sp,#224] +# copy-collector input: str r3,[sp,#224] + +# qhasm: him10 = hiu4 +# asm 1: str him10=stack32#58 +# asm 2: str him10=[sp,#228] +# copy-collector input: str r2,[sp,#228] + +# qhasm: hitmp lotmp = hiu1 * two13 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 6) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,> 29) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,> 6) +# asm 1: eor >hitmp=int32#13,hitmp=r12,> 29) +# asm 1: eor >hitmp=int32#5,hitmp=r4,hitmp=int32#5,hitmp=r4,lou0=int32#6,lou0=r5,hiu0=int32#5,hiu0=r4,lou1=int32#11,lou1=r10,hiu1=int32#12,hiu1=r11,lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 1) +# asm 1: eors >lotmp=int32#14,lotmp=r14,> 1) +# asm 1: eors >hitmp=int32#13,hitmp=r12,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor 
>hitmp=int32#13,hitmp=r12,lou0=int32#6,lou0=r5,hiu0=int32#5,hiu0=r4,lou5=int32#13,lou5=r12,hiu5=int32#14,hiu5=r14,lou0=int32#6,lou0=r5,hiu0=int32#5,hiu0=r4,lod12=stack32#45 +# asm 2: str lod12=[sp,#176] +# copy-collector input: str r12,[sp,#176] + +# qhasm: hid12 = hiu5 +# asm 1: str hid12=stack32#46 +# asm 2: str hid12=[sp,#180] +# copy-collector input: str r14,[sp,#180] + +# qhasm: lom11 = lou0 +# asm 1: str lom11=stack32#59 +# asm 2: str lom11=[sp,#232] +# copy-collector input: str r5,[sp,#232] + +# qhasm: him11 = hiu0 +# asm 1: str him11=stack32#60 +# asm 2: str him11=[sp,#236] +# copy-collector input: str r4,[sp,#236] + +# qhasm: hitmp lotmp = hiu4 * two13 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 6) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,> 29) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,> 6) +# asm 1: eor >hitmp=int32#13,hitmp=r12,> 29) +# asm 1: eor >hitmp=int32#4,hitmp=r3,hitmp=int32#3,hitmp=r2,lou1=int32#4,lou1=r3,hiu1=int32#3,hiu1=r2,lou4=int32#11,lou4=r10,hiu4=int32#12,hiu4=r11,lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 1) +# asm 1: eors >lotmp=int32#14,lotmp=r14,> 1) +# asm 1: eors >hitmp=int32#13,hitmp=r12,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >hitmp=int32#13,hitmp=r12,lou1=int32#4,lou1=r3,hiu1=int32#3,hiu1=r2,lou5=int32#13,lou5=r12,hiu5=int32#14,hiu5=r14,lou1=int32#4,lou1=r3,hiu1=int32#3,hiu1=r2,lod13=stack32#47 +# asm 2: str lod13=[sp,#184] +# copy-collector input: str r12,[sp,#184] + +# qhasm: hid13 = hiu5 +# asm 1: str hid13=stack32#48 +# asm 2: str hid13=[sp,#188] +# copy-collector input: str r14,[sp,#188] + +# qhasm: lom12 = lou1 +# asm 1: str lom12=stack32#61 +# asm 2: str lom12=[sp,#240] +# copy-collector input: str r3,[sp,#240] + +# qhasm: him12 = hiu1 +# asm 1: str him12=stack32#62 +# asm 2: str him12=[sp,#244] +# copy-collector input: str r2,[sp,#244] + +# qhasm: hitmp lotmp = hiu0 * 
two13 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 6) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,> 29) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,> 6) +# asm 1: eor >hitmp=int32#13,hitmp=r12,> 29) +# asm 1: eor >hitmp=int32#6,hitmp=r5,hitmp=int32#5,hitmp=r4,lou4=int32#6,lou4=r5,hiu4=int32#5,hiu4=r4,lou0=int32#11,lou0=r10,hiu0=int32#12,hiu0=r11,lod14=stack32#49 +# asm 2: str lod14=[sp,#192] +# copy-collector input: str r6,[sp,#192] + +# qhasm: hid14 = hiu2 +# asm 1: str hid14=stack32#50 +# asm 2: str hid14=[sp,#196] +# copy-collector input: str r7,[sp,#196] + +# qhasm: hitmp lotmp = hiu0 * two24 +# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,lotmp=r14,>hitmp=r12,> 1) +# asm 1: eors >lotmp=int32#14,lotmp=r14,> 1) +# asm 1: eors >hitmp=int32#13,hitmp=r12,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >lotmp=int32#14,lotmp=r14,lotmp=int32#14,lotmp=r14,>7) +# asm 1: eor >hitmp=int32#13,hitmp=r12,lou4=int32#6,lou4=r5,hiu4=int32#5,hiu4=r4,lou4=int32#6,lou4=r5,hiu4=int32#5,hiu4=r4,lom13=stack32#63 +# asm 2: str lom13=[sp,#248] +# copy-collector input: str r5,[sp,#248] + +# qhasm: him13 = hiu4 +# asm 1: str him13=stack32#64 +# asm 2: str him13=[sp,#252] +# copy-collector input: str r4,[sp,#252] + +# qhasm: hitmp lotmp = hiu1 * two13 +# asm 1: umull >lotmp=int32#8,>hitmp=int32#7,lotmp=r7,>hitmp=r6,> 6) +# asm 1: eor >lotmp=int32#8,lotmp=r7,lotmp=int32#8,lotmp=r7,> 29) +# asm 1: eor >lotmp=int32#8,lotmp=r7,lotmp=int32#8,lotmp=r7,> 6) +# asm 1: eor >hitmp=int32#7,hitmp=r6,> 29) +# asm 1: eor >hitmp=int32#4,hitmp=r3,hitmp=int32#3,hitmp=r2,lou0=int32#4,lou0=r3,hiu0=int32#3,hiu0=r2,lou1=int32#7,lou1=r6,hiu1=int32#8,hiu1=r7,lod15=stack32#51 +# asm 2: str lod15=[sp,#200] +# copy-collector input: str r8,[sp,#200] + +# qhasm: hid15 = hiu3 +# asm 1: str hid15=stack32#52 +# asm 2: str hid15=[sp,#204] +# copy-collector input: str r9,[sp,#204] + +# qhasm: hitmp lotmp = hiu1 * two24 +# asm 1: umull 
>lotmp=int32#12,>hitmp=int32#11,lotmp=r11,>hitmp=r10,> 1) +# asm 1: eors >lotmp=int32#12,lotmp=r11,> 1) +# asm 1: eors >hitmp=int32#11,hitmp=r10,lotmp=int32#12,lotmp=r11,>7) +# asm 1: eor >lotmp=int32#12,lotmp=r11,lotmp=int32#12,lotmp=r11,>7) +# asm 1: eor >hitmp=int32#11,hitmp=r10,lou0=int32#4,lou0=r3,hiu0=int32#3,hiu0=r2,lou0=int32#4,lou0=r3,hiu0=int32#3,hiu0=r2,lom14=stack32#65 +# asm 2: str lom14=[sp,#256] +# copy-collector input: str r3,[sp,#256] + +# qhasm: him14 = hiu0 +# asm 1: str him14=stack32#66 +# asm 2: str him14=[sp,#260] +# copy-collector input: str r2,[sp,#260] + +# qhasm: hitmp lotmp = hiu4 * two13 +# asm 1: umull >lotmp=int32#4,>hitmp=int32#3,lotmp=r3,>hitmp=r2,> 6) +# asm 1: eor >lotmp=int32#2,lotmp=r1,lotmp=int32#2,lotmp=r1,> 29) +# asm 1: eor >lotmp=int32#2,lotmp=r1,lotmp=int32#2,lotmp=r1,> 6) +# asm 1: eor >hitmp=int32#3,hitmp=r2,> 29) +# asm 1: eor >hitmp=int32#3,hitmp=r2,hitmp=int32#3,hitmp=r2,lou1=int32#2,lou1=r1,hiu1=int32#3,hiu1=r2,lou0=int32#4,lou0=r3,hiu0=int32#5,hiu0=r4,lou2=int32#6,lou2=r5,hiu2=int32#7,hiu2=r6,lom8=stack32#53 +# asm 2: str lom8=[sp,#208] +# copy-collector input: str r5,[sp,#208] + +# qhasm: him8 = hiu2 +# asm 1: str him8=stack32#54 +# asm 2: str him8=[sp,#212] +# copy-collector input: str r6,[sp,#212] + +# qhasm: hitmp lotmp = hiu0 * two24 +# asm 1: umull >lotmp=int32#9,>hitmp=int32#8,lotmp=r8,>hitmp=r7,> 1) +# asm 1: eors >lotmp=int32#1,lotmp=r0,> 1) +# asm 1: eors >hitmp=int32#8,hitmp=r7,lotmp=int32#1,lotmp=r0,>7) +# asm 1: eor >lotmp=int32#1,lotmp=r0,lotmp=int32#1,lotmp=r0,>7) +# asm 1: eor >hitmp=int32#8,hitmp=r7,lou1=int32#1,lou1=r0,hiu1=int32#2,hiu1=r1,lou1=int32#1,lou1=r0,hiu1=int32#2,hiu1=r1,lom15=stack32#67 +# asm 2: str lom15=[sp,#264] +# copy-collector input: str r0,[sp,#264] + +# qhasm: him15 = hiu1 +# asm 1: str him15=stack32#68 +# asm 2: str him15=[sp,#268] +# copy-collector input: str r1,[sp,#268] + +# qhasm: lod8 = lou0 +# asm 1: str lod8=stack32#37 +# asm 2: str lod8=[sp,#144] +# copy-collector input: 
str r3,[sp,#144] + +# qhasm: hid8 = hiu0 +# asm 1: str hid8=stack32#38 +# asm 2: str hid8=[sp,#148] +# copy-collector input: str r4,[sp,#148] + +# qhasm: goto innerloop +# copy-collector output starts +strd r0,r1,[sp,#264] +strd r3,r4,[sp,#144] +# copy-collector output ends +b ._innerloop + +# qhasm: nearend: +._nearend: + +# qhasm: lou0 = lom8 +# asm 1: ldr >lou0=int32#1,lou0=r0,hiu0=int32#2,hiu0=r1,lou1=int32#3,lou1=r2,hiu1=int32#4,hiu1=r3,lou2=int32#5,lou2=r4,hiu2=int32#6,hiu2=r5,lou3=int32#7,lou3=r6,hiu3=int32#8,hiu3=r7,lod8=stack32#37 +# asm 2: str lod8=[sp,#144] +# copy-collector input: str r0,[sp,#144] + +# qhasm: hid8 = hiu0 +# asm 1: str hid8=stack32#38 +# asm 2: str hid8=[sp,#148] +# copy-collector input: str r1,[sp,#148] + +# qhasm: lod9 = lou1 +# asm 1: str lod9=stack32#39 +# asm 2: str lod9=[sp,#152] +# copy-collector input: str r2,[sp,#152] + +# qhasm: hid9 = hiu1 +# asm 1: str hid9=stack32#40 +# asm 2: str hid9=[sp,#156] +# copy-collector input: str r3,[sp,#156] + +# qhasm: lod10 = lou2 +# asm 1: str lod10=stack32#41 +# asm 2: str lod10=[sp,#160] +# copy-collector input: str r4,[sp,#160] + +# qhasm: hid10 = hiu2 +# asm 1: str hid10=stack32#42 +# asm 2: str hid10=[sp,#164] +# copy-collector input: str r5,[sp,#164] + +# qhasm: lod11 = lou3 +# asm 1: str lod11=stack32#43 +# asm 2: str lod11=[sp,#168] +# copy-collector input: str r6,[sp,#168] + +# qhasm: hid11 = hiu3 +# asm 1: str hid11=stack32#44 +# asm 2: str hid11=[sp,#172] +# copy-collector input: str r7,[sp,#172] + +# qhasm: lou0 = lom12 +# asm 1: ldr >lou0=int32#1,lou0=r0,hiu0=int32#2,hiu0=r1,lou1=int32#3,lou1=r2,hiu1=int32#4,hiu1=r3,lou2=int32#5,lou2=r4,hiu2=int32#6,hiu2=r5,lou3=int32#7,lou3=r6,hiu3=int32#8,hiu3=r7,lod12=stack32#45 +# asm 2: str lod12=[sp,#176] +# copy-collector input: str r0,[sp,#176] + +# qhasm: hid12 = hiu0 +# asm 1: str hid12=stack32#46 +# asm 2: str hid12=[sp,#180] +# copy-collector input: str r1,[sp,#180] + +# qhasm: lod13 = lou1 +# asm 1: str lod13=stack32#47 +# asm 2: str 
lod13=[sp,#184] +# copy-collector input: str r2,[sp,#184] + +# qhasm: hid13 = hiu1 +# asm 1: str hid13=stack32#48 +# asm 2: str hid13=[sp,#188] +# copy-collector input: str r3,[sp,#188] + +# qhasm: lod14 = lou2 +# asm 1: str lod14=stack32#49 +# asm 2: str lod14=[sp,#192] +# copy-collector input: str r4,[sp,#192] + +# qhasm: hid14 = hiu2 +# asm 1: str hid14=stack32#50 +# asm 2: str hid14=[sp,#196] +# copy-collector input: str r5,[sp,#196] + +# qhasm: lod15 = lou3 +# asm 1: str lod15=stack32#51 +# asm 2: str lod15=[sp,#200] +# copy-collector input: str r6,[sp,#200] + +# qhasm: hid15 = hiu3 +# asm 1: str hid15=stack32#52 +# asm 2: str hid15=[sp,#204] +# copy-collector input: str r7,[sp,#204] + +# qhasm: goto innerloop +# copy-collector output starts +ldr r0,[sp,#208] +ldr r1,[sp,#212] +ldr r2,[sp,#216] +ldr r3,[sp,#220] +ldr r4,[sp,#224] +ldr r5,[sp,#228] +ldr r6,[sp,#232] +ldr r7,[sp,#236] +strd r0,r1,[sp,#144] +strd r2,r3,[sp,#152] +strd r4,r5,[sp,#160] +strd r6,r7,[sp,#168] +ldr r0,[sp,#240] +ldr r1,[sp,#244] +ldr r2,[sp,#248] +ldr r3,[sp,#252] +ldr r4,[sp,#256] +ldr r5,[sp,#260] +ldr r6,[sp,#264] +ldr r7,[sp,#268] +strd r0,r1,[sp,#176] +strd r2,r3,[sp,#184] +strd r4,r5,[sp,#192] +strd r6,r7,[sp,#200] +# copy-collector output ends +b ._innerloop + +# qhasm: endinnerloop: +._endinnerloop: + +# qhasm: input_0 = o3 +# asm 1: ldr >input_0=int32#1,input_0=r0,input_0=int32#1,input_0=r0,o3=stack32#4 +# asm 2: str o3=[sp,#12] +# copy-collector input: str r0,[sp,#12] + +# qhasm: lou0 = lod0 +# asm 1: ldr 
>lou0=int32#1,lou0=r0,hiu0=int32#2,hiu0=r1,lou1=int32#3,lou1=r2,hiu1=int32#4,hiu1=r3,lou2=int32#5,lou2=r4,hiu2=int32#6,hiu2=r5,lou3=int32#7,lou3=r6,hiu3=int32#8,hiu3=r7,lotmp=int32#9,lotmp=r8,hitmp=int32#10,hitmp=r9,lou0=int32#1,lou0=r0,hiu0=int32#2,hiu0=r1,lotmp=int32#9,lotmp=r8,hitmp=int32#10,hitmp=r9,lou1=int32#3,lou1=r2,hiu1=int32#4,hiu1=r3,lotmp=int32#9,lotmp=r8,hitmp=int32#10,hitmp=r9,lou2=int32#5,lou2=r4,hiu2=int32#6,hiu2=r5,lotmp=int32#9,lotmp=r8,hitmp=int32#10,hitmp=r9,lou3=int32#7,lou3=r6,hiu3=int32#8,hiu3=r7,lom0=stack32#5 +# asm 2: str lom0=[sp,#16] +# copy-collector input: str r0,[sp,#16] + +# qhasm: him0 = hiu0 +# asm 1: str him0=stack32#6 +# asm 2: str him0=[sp,#20] +# copy-collector input: str r1,[sp,#20] + +# qhasm: lom1 = lou1 +# asm 1: str lom1=stack32#7 +# asm 2: str lom1=[sp,#24] +# copy-collector input: str r2,[sp,#24] + +# qhasm: him1 = hiu1 +# asm 1: str him1=stack32#8 +# asm 2: str him1=[sp,#28] +# copy-collector input: str r3,[sp,#28] + +# qhasm: lom2 = lou2 +# asm 1: str lom2=stack32#9 +# asm 2: str lom2=[sp,#32] +# copy-collector input: str r4,[sp,#32] + +# qhasm: him2 = hiu2 +# asm 1: str him2=stack32#10 +# asm 2: str him2=[sp,#36] +# copy-collector input: str r5,[sp,#36] + +# qhasm: lom3 = lou3 +# asm 1: str lom3=stack32#11 +# asm 2: str lom3=[sp,#40] +# copy-collector input: str r6,[sp,#40] + +# qhasm: him3 = hiu3 +# asm 1: str him3=stack32#12 +# asm 2: str him3=[sp,#44] +# copy-collector input: str r7,[sp,#44] + +# qhasm: lod0 = lou0 +# asm 1: str lod0=stack32#13 +# asm 2: str lod0=[sp,#48] +# copy-collector input: str r0,[sp,#48] + +# qhasm: hid0 = hiu0 +# asm 1: str hid0=stack32#14 +# asm 2: str hid0=[sp,#52] +# copy-collector input: str r1,[sp,#52] + +# qhasm: lod1 = lou1 +# asm 1: str lod1=stack32#15 +# asm 2: str lod1=[sp,#56] +# copy-collector input: str r2,[sp,#56] + +# qhasm: hid1 = hiu1 +# asm 1: str hid1=stack32#16 +# asm 2: str hid1=[sp,#60] +# copy-collector input: str r3,[sp,#60] + +# qhasm: lod2 = lou2 +# asm 1: str 
lod2=stack32#17 +# asm 2: str lod2=[sp,#64] +# copy-collector input: str r4,[sp,#64] + +# qhasm: hid2 = hiu2 +# asm 1: str hid2=stack32#18 +# asm 2: str hid2=[sp,#68] +# copy-collector input: str r5,[sp,#68] + +# qhasm: lod3 = lou3 +# asm 1: str lod3=stack32#19 +# asm 2: str lod3=[sp,#72] +# copy-collector input: str r6,[sp,#72] + +# qhasm: hid3 = hiu3 +# asm 1: str hid3=stack32#20 +# asm 2: str hid3=[sp,#76] +# copy-collector input: str r7,[sp,#76] + +# qhasm: lou0 = lod4 +# asm 1: ldr >lou0=int32#1,lou0=r0,hiu0=int32#2,hiu0=r1,lou1=int32#3,lou1=r2,hiu1=int32#4,hiu1=r3,lou2=int32#5,lou2=r4,hiu2=int32#6,hiu2=r5,lou3=int32#7,lou3=r6,hiu3=int32#8,hiu3=r7,lotmp=int32#9,lotmp=r8,hitmp=int32#10,hitmp=r9,lou0=int32#1,lou0=r0,hiu0=int32#2,hiu0=r1,lotmp=int32#9,lotmp=r8,hitmp=int32#10,hitmp=r9,lou1=int32#3,lou1=r2,hiu1=int32#4,hiu1=r3,lotmp=int32#9,lotmp=r8,hitmp=int32#10,hitmp=r9,lou2=int32#5,lou2=r4,hiu2=int32#6,hiu2=r5,lotmp=int32#9,lotmp=r8,hitmp=int32#10,hitmp=r9,lou3=int32#7,lou3=r6,hiu3=int32#8,hiu3=r7,lom4=stack32#21 +# asm 2: str lom4=[sp,#80] +# copy-collector input: str r0,[sp,#80] + +# qhasm: him4 = hiu0 +# asm 1: str him4=stack32#22 +# asm 2: str him4=[sp,#84] +# copy-collector input: str r1,[sp,#84] + +# qhasm: lom5 = lou1 +# asm 1: str lom5=stack32#23 +# asm 2: str lom5=[sp,#88] +# copy-collector input: str r2,[sp,#88] + +# qhasm: him5 = hiu1 +# asm 1: str him5=stack32#24 +# asm 2: str him5=[sp,#92] +# copy-collector input: str r3,[sp,#92] + +# qhasm: lom6 = lou2 +# asm 1: str lom6=stack32#25 +# asm 2: str lom6=[sp,#96] +# copy-collector input: str r4,[sp,#96] + +# qhasm: him6 = hiu2 +# asm 1: str him6=stack32#26 +# asm 2: str him6=[sp,#100] +# copy-collector input: str r5,[sp,#100] + +# qhasm: lom7 = lou3 +# asm 1: str lom7=stack32#27 +# asm 2: str lom7=[sp,#104] +# copy-collector input: str r6,[sp,#104] + +# qhasm: him7 = hiu3 +# asm 1: str him7=stack32#28 +# asm 2: str him7=[sp,#108] +# copy-collector input: str r7,[sp,#108] + +# qhasm: lod4 = lou0 +# asm 
1: str lod4=stack32#29 +# asm 2: str lod4=[sp,#112] +# copy-collector input: str r0,[sp,#112] + +# qhasm: hid4 = hiu0 +# asm 1: str hid4=stack32#30 +# asm 2: str hid4=[sp,#116] +# copy-collector input: str r1,[sp,#116] + +# qhasm: lod5 = lou1 +# asm 1: str lod5=stack32#31 +# asm 2: str lod5=[sp,#120] +# copy-collector input: str r2,[sp,#120] + +# qhasm: hid5 = hiu1 +# asm 1: str hid5=stack32#32 +# asm 2: str hid5=[sp,#124] +# copy-collector input: str r3,[sp,#124] + +# qhasm: lod6 = lou2 +# asm 1: str lod6=stack32#33 +# asm 2: str lod6=[sp,#128] +# copy-collector input: str r4,[sp,#128] + +# qhasm: hid6 = hiu2 +# asm 1: str hid6=stack32#34 +# asm 2: str hid6=[sp,#132] +# copy-collector input: str r5,[sp,#132] + +# qhasm: lod7 = lou3 +# asm 1: str lod7=stack32#35 +# asm 2: str lod7=[sp,#136] +# copy-collector input: str r6,[sp,#136] + +# qhasm: hid7 = hiu3 +# asm 1: str hid7=stack32#36 +# asm 2: str hid7=[sp,#140] +# copy-collector input: str r7,[sp,#140] + +# qhasm: input_0 = o2 +# asm 1: ldr >input_0=int32#1,input_0=r0,input_0=int32#1,input_0=r0,o2=stack32#3 +# asm 2: str o2=[sp,#8] +# copy-collector input: str r0,[sp,#8] + +# qhasm: goto mainloop if !unsigned< +# copy-collector output starts +str r0,[sp,#8] +# copy-collector output ends +bhs ._mainloop + +# qhasm: endmainloop: +._endmainloop: + +# qhasm: input_1 = o0 +# asm 1: ldr 
>input_1=int32#2,input_1=r1,lou0=int32#3,lou0=r2,hiu0=int32#4,hiu0=r3,lou1=int32#5,lou1=r4,hiu1=int32#6,hiu1=r5,lou2=int32#7,lou2=r6,hiu2=int32#8,hiu2=r7,lou3=int32#9,lou3=r8,hiu3=int32#10,hiu3=r9,lou0=int32#3,lou0=r2,hiu0=int32#4,hiu0=r3,lou1=int32#5,lou1=r4,hiu1=int32#6,hiu1=r5,lou2=int32#7,lou2=r6,hiu2=int32#8,hiu2=r7,lou3=int32#9,lou3=r8,hiu3=int32#10,hiu3=r9,lou0=int32#3,lou0=r2,hiu0=int32#4,hiu0=r3,lou1=int32#5,lou1=r4,hiu1=int32#6,hiu1=r5,lou2=int32#7,lou2=r6,hiu2=int32#8,hiu2=r7,lou3=int32#9,lou3=r8,hiu3=int32#10,hiu3=r9,lou0=int32#3,lou0=r2,hiu0=int32#4,hiu0=r3,lou1=int32#5,lou1=r4,hiu1=int32#6,hiu1=r5,lou2=int32#7,lou2=r6,hiu2=int32#8,hiu2=r7,lou3=int32#9,lou3=r8,hiu3=int32#10,hiu3=r9,input_0=int32#1,input_0=r0, +#include + +#define BAUD 38400 + +/* Default clock on the MPS2 boards seems to be 25MHz */ +#ifndef SYSTEM_CLOCK +#define SYSTEM_CLOCK 25000000UL +#endif + +/* The startup file calls a SystemInit function. */ +void SystemInit(void) +{ + /* Enable the FPU */ + SCB->CPACR |= ((3UL << 10 * 2) | /* set CP10 Full Access */ + (3UL << 11 * 2)); /* set CP11 Full Access */ + /* Enable UART */ + /* TODO: Validate this on a *real* MPS2 board (works in QEMU) */ + CMSDK_GPIO0->ALTFUNCSET |= 1u; + CMSDK_GPIO0->ALTFUNCSET |= 2u; + CMSDK_UART0->BAUDDIV = SYSTEM_CLOCK / BAUD; + CMSDK_UART0->CTRL |= 1 << CMSDK_UART_CTRL_RXEN_Pos; + CMSDK_UART0->CTRL |= 1 << CMSDK_UART_CTRL_TXEN_Pos; + /* Enable SysTick Timer */ + SysTick->LOAD = 0xFFFFFFu; + NVIC_SetPriority(SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); + NVIC_EnableIRQ(SysTick_IRQn); + SysTick->VAL = 0UL; + SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk | SysTick_CTRL_TICKINT_Msk | SysTick_CTRL_ENABLE_Msk; +} + +static volatile unsigned long long overflowcnt = 0; + +/* SysTick Interrupt */ +void SysTick_Handler(void) +{ + ++overflowcnt; +} + +uint64_t hal_get_time() +{ + while (1) { + unsigned long long before = overflowcnt; + unsigned long long result = (before + 1) * 16777216llu - SysTick->VAL; + if (overflowcnt 
== before) { + return result; + } + } +} + +void hal_setup(const enum clock_mode clock) +{ + (void) clock; +} + +static inline void uart_putc(int c) +{ + while(CMSDK_UART0->STATE & CMSDK_UART_STATE_TXBF_Msk); + CMSDK_UART0->DATA = c & 0xFFu; +} + +void hal_send_str(const char* in) +{ + const char* cur = in; + while (*cur) { + uart_putc(*cur); + cur += 1; + } + uart_putc('\n'); +} + +#if !defined(NO_SEMIHOSTING_EXIT) +// TODO(dsprenkels) Currently, we only exit the QEMU host when the program +// exits successfully. We should also populate some interrupt handlers that +// occur on errors and/or other exceptions. + +// These two syscall values are used at the end of the program, when we want +// to tell the QEMU host that we are done. I took them from +// . +static const uint32_t REPORT_EXCEPTION = 0x18; +static const uint32_t ApplicationExit = 0x20026; + +// Do a system call towards QEMU or the debugger. +static uint32_t semihosting_syscall(uint32_t nr, const uint32_t arg) { + __asm__ volatile ( + "mov r0, %[nr]\n" + "mov r1, %[arg]\n" + "bkpt 0xAB\n" + "mov %[nr], r0\n" + : [nr] "+r" (nr) : [arg] "r" (arg) : "0", "1"); + return nr; +} + +// Register a destructor that will call QEMU, telling it that the program +// has exited successfully. 
+static void __attribute__ ((destructor)) semihosting_exit(void) { + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void NMI_Handler(void) { + hal_send_str("NMI_Handler"); + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void HardFault_Handler(void) { + hal_send_str("HardFault_Handler"); + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void MemManage_Handler(void) { + hal_send_str("MemManage_Handler"); + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void BusFault_Handler(void) { + hal_send_str("BusFault_Handler"); + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void UsageFault_Handler(void) { + hal_send_str("UsageFault_Handler"); + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void SVC_Handler(void) { + hal_send_str("SVC_Handler"); + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void DebugMon_Handler(void) { + hal_send_str("DebugMon_Handler"); + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void PendSV_Handler(void) { + hal_send_str("PendSV_Handler"); + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +void Default_Handler(void) { + semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); +} + +#endif /* !defined(NO_SEMIHOSTING_EXIT) */ + +/* End of BSS is where the heap starts (defined in the linker script) */ +extern char end; +static char* heap_end = &end; + +void* __wrap__sbrk (int incr) +{ + char* prev_heap_end; + + prev_heap_end = heap_end; + heap_end += incr; + + return (void *) prev_heap_end; +} + +size_t hal_get_stack_size(void) +{ + register char* cur_stack; + __asm__ volatile ("mov %0, sp" : "=r" (cur_stack)); + return cur_stack - heap_end; +} + +const uint32_t stackpattern = 0xDEADBEEFlu; + +static void* last_sp = NULL; + +void hal_spraystack(void) +{ + + char* _heap_end = heap_end; + asm volatile ("mov %0, sp\n" + ".L%=:\n\t" + "str %2, [%1], #4\n\t" + "cmp %1, %0\n\t" + "blt .L%=\n\t" + : "+r" (last_sp), "+r" 
(_heap_end) : "r" (stackpattern) : "cc", "memory"); +} + +size_t hal_checkstack(void) +{ + size_t result = 0; + asm volatile("sub %0, %1, %2\n" + ".L%=:\n\t" + "ldr ip, [%2], #4\n\t" + "cmp ip, %3\n\t" + "ite eq\n\t" + "subeq %0, #4\n\t" + "bne .LE%=\n\t" + "cmp %2, %1\n\t" + "blt .L%=\n\t" + ".LE%=:\n" + : "+r"(result) : "r" (last_sp), "r" (heap_end), "r" (stackpattern) : "ip", "cc"); + return result; +} + +/* Implement some system calls to shut up the linker warnings */ + +#include +#undef errno +extern int errno; + +int __wrap__open(char *file, int flags, int mode) +{ + (void) file; + (void) flags; + (void) mode; + errno = ENOSYS; + return -1; +} + +int __wrap__close(int fd) +{ + errno = ENOSYS; + (void) fd; + return -1; +} + +#include + +int __wrap__fstat(int fd, struct stat* buf) +{ + (void) fd; + (void) buf; + errno = ENOSYS; + return -1; +} + +int __wrap__getpid(void) +{ + errno = ENOSYS; + return -1; +} + +int __wrap__isatty(int file) +{ + (void) file; + errno = ENOSYS; + return 0; +} + +int __wrap__kill(int pid, int sig) +{ + (void) pid; + (void) sig; + errno = ENOSYS; + return -1; +} + +int __wrap__lseek(int fd, int ptr, int dir) +{ + (void) fd; + (void) ptr; + (void) dir; + errno = ENOSYS; + return -1; +} + +int __wrap__read(int fd, char* ptr, int len) +{ + (void) fd; + (void) ptr; + (void) len; + errno = ENOSYS; + return -1; +} + +int __wrap__write(int fd, const char* ptr, int len) +{ + (void) fd; + (void) ptr; + (void) len; + errno = ENOSYS; + return -1; +} diff --git a/common/hal-opencm3.c b/common/hal-opencm3.c new file mode 100644 index 0000000..a72f44a --- /dev/null +++ b/common/hal-opencm3.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: Apache-2.0 +#include "hal.h" +#include + +#define SERIAL_BAUD 38400 + +#include +#include +#include +#include +#include +#include +#include + +#if defined(STM32F767ZI) +#include +#include + +#define SERIAL_GPIO GPIOD +#define SERIAL_USART USART3 +#define SERIAL_PINS (GPIO8 | GPIO9) +#define STM32 + +#else +#error 
Unsupported libopencm3 board +#endif + +#define _RCC_CAT(A, B) A##_##B +#define RCC_ID(NAME) _RCC_CAT(RCC, NAME) + +__attribute__((unused)) static uint32_t _clock_freq; + +static void clock_setup(enum clock_mode clock) { +#if defined(STM32F7) + switch (clock) { + case CLOCK_BENCHMARK: + rcc_clock_setup_hsi(&rcc_3v3[RCC_CLOCK_3V3_24MHZ]); + break; + case CLOCK_FAST: + default: + rcc_clock_setup_hsi(&rcc_3v3[RCC_CLOCK_3V3_216MHZ]); + break; + } + + rcc_periph_clock_enable(RCC_RNG); + flash_art_enable(); + flash_prefetch_enable(); +#else +#error Unsupported platform +#endif +} + +void usart_setup() { +#if defined(STM32F7) + rcc_periph_clock_enable(RCC_GPIOD); + rcc_periph_clock_enable(RCC_USART3); +#else +#error Unsupported platform +#endif + +#if defined(STM32F7) + gpio_set_output_options(SERIAL_GPIO, GPIO_OTYPE_OD, GPIO_OSPEED_100MHZ, + SERIAL_PINS); + gpio_set_af(SERIAL_GPIO, GPIO_AF7, SERIAL_PINS); + gpio_mode_setup(SERIAL_GPIO, GPIO_MODE_AF, GPIO_PUPD_PULLUP, SERIAL_PINS); + usart_set_baudrate(SERIAL_USART, SERIAL_BAUD); + usart_set_databits(SERIAL_USART, 8); + usart_set_stopbits(SERIAL_USART, USART_STOPBITS_1); + usart_set_mode(SERIAL_USART, USART_MODE_TX_RX); + usart_set_parity(SERIAL_USART, USART_PARITY_NONE); + usart_set_flow_control(SERIAL_USART, USART_FLOWCONTROL_NONE); + usart_disable_rx_interrupt(SERIAL_USART); + usart_disable_tx_interrupt(SERIAL_USART); + usart_enable(SERIAL_USART); +#endif +} + +void systick_setup() { + /* Systick is always the same on libopencm3 */ + systick_set_clocksource(STK_CSR_CLKSOURCE_AHB); + systick_set_reload(0xFFFFFFu); + systick_interrupt_enable(); + systick_counter_enable(); +} +static volatile unsigned long long overflowcnt = 0; +void hal_setup(const enum clock_mode clock) { + clock_setup(clock); + usart_setup(); + systick_setup(); + rng_enable(); + + // wait for the first systick overflow + // improves reliability of the benchmarking scripts since it makes it much + // less likely that the host will miss the start of the 
output + unsigned long long old = overflowcnt; + while (old == overflowcnt) + ; +} + +void hal_send_str(const char *in) { + const char *cur = in; + while (*cur) { + usart_send_blocking(SERIAL_USART, *cur); + cur += 1; + } + usart_send_blocking(SERIAL_USART, '\n'); +} + +void sys_tick_handler(void) { ++overflowcnt; } + +uint64_t hal_get_time() { + while (true) { + unsigned long long before = overflowcnt; + unsigned long long result = + (before + 1) * 16777216llu - systick_get_value(); + if (overflowcnt == before) { + return result; + } + } +} + +/* End of BSS is where the heap starts (defined in the linker script) */ +extern char end; +static char *heap_end = &end; + +void *__wrap__sbrk(int incr) { + char *prev_heap_end; + + prev_heap_end = heap_end; + heap_end += incr; + + return (void *)prev_heap_end; +} + +size_t hal_get_stack_size(void) { + register char *cur_stack; + asm volatile("mov %0, sp" : "=r"(cur_stack)); + return cur_stack - heap_end; +} + +const uint32_t stackpattern = 0xDEADBEEFlu; + +static void *last_sp = NULL; + +void hal_spraystack(void) { + + char *_heap_end = heap_end; + asm volatile("mov %0, sp\n" + ".L%=:\n\t" + "str %2, [%1], #4\n\t" + "cmp %1, %0\n\t" + "blt .L%=\n\t" + : "+r"(last_sp), "+r"(_heap_end) + : "r"(stackpattern) + : "cc", "memory"); +} + +size_t hal_checkstack(void) { + size_t result = 0; + asm volatile("sub %0, %1, %2\n" + ".L%=:\n\t" + "ldr ip, [%2], #4\n\t" + "cmp ip, %3\n\t" + "ite eq\n\t" + "subeq %0, #4\n\t" + "bne .LE%=\n\t" + "cmp %2, %1\n\t" + "blt .L%=\n\t" + ".LE%=:\n" + : "+r"(result) + : "r"(last_sp), "r"(heap_end), "r"(stackpattern) + : "ip", "cc"); + return result; +} + +/* Implement some system calls to shut up the linker warnings */ + +#include +#undef errno +extern int errno; + +int __wrap__open(char *file, int flags, int mode) { + (void)file; + (void)flags; + (void)mode; + errno = ENOSYS; + return -1; +} + +int __wrap__close(int fd) { + errno = ENOSYS; + (void)fd; + return -1; +} + +#include + +int 
__wrap__fstat(int fd, struct stat *buf) {
+  (void)fd;
+  (void)buf;
+  errno = ENOSYS;
+  return -1;
+}
+
+int __wrap__getpid(void) {
+  errno = ENOSYS;
+  return -1;
+}
+
+/* Unlike the other stubs this one returns 0, not -1, after setting errno. */
+int __wrap__isatty(int file) {
+  (void)file;
+  errno = ENOSYS;
+  return 0;
+}
+
+int __wrap__kill(int pid, int sig) {
+  (void)pid;
+  (void)sig;
+  errno = ENOSYS;
+  return -1;
+}
+
+int __wrap__lseek(int fd, int ptr, int dir) {
+  (void)fd;
+  (void)ptr;
+  (void)dir;
+  errno = ENOSYS;
+  return -1;
+}
+
+int __wrap__read(int fd, char *ptr, int len) {
+  (void)fd;
+  (void)ptr;
+  (void)len;
+  errno = ENOSYS;
+  return -1;
+}
+
+int __wrap__write(int fd, const char *ptr, int len) {
+  (void)fd;
+  (void)ptr;
+  (void)len;
+  errno = ENOSYS;
+  return -1;
+}
diff --git a/common/keccakf1600.S b/common/keccakf1600.S
new file mode 100644
index 0000000..7075ed0
--- /dev/null
+++ b/common/keccakf1600.S
@@ -0,0 +1,1134 @@
+@
+@ Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+@ Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+@ denoted as "the implementer".
+@ Additional optimizations by Alexandre Adomnicai.
+@
+@ For more information, feedback or questions, please refer to our websites:
+@ http://keccak.noekeon.org/
+@ http://keyak.noekeon.org/
+@ http://ketje.noekeon.org/
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+@
+
+@ WARNING: These functions work only on little endian CPU with ARMv7m architecture (ARM Cortex-M3, ...).
+
+
+    .thumb
+    .syntax unified
+.text
+
+    @ Credit: Henry S.
Warren, Hacker's Delight, Addison-Wesley, 2002
+    @
+    @ toBitInterleaving x0, x1, s0, s1, t, over
+    @ Split the 64-bit value x1:x0 (x0 = low word) into its bit-interleaved
+    @ form: the even-position bits of x0 and x1 are packed into the low and
+    @ high halfword that go to s0, the odd-position bits likewise to s1.
+    @   over == 0 : the packed words are XORed into the current s0 / s1
+    @   over != 0 : s0 / s1 are overwritten
+    @ t is a scratch register; x0 and x1 are only read. The final eors
+    @ updates the condition flags.
+.macro toBitInterleaving x0,x1,s0,s1,t,over
+
+    @ even bits of x0 -> bits 0..15 of s0
+    and     \t,\x0,#0x55555555
+    orr     \t,\t,\t, LSR #1
+    and     \t,\t,#0x33333333
+    orr     \t,\t,\t, LSR #2
+    and     \t,\t,#0x0F0F0F0F
+    orr     \t,\t,\t, LSR #4
+    and     \t,\t,#0x00FF00FF
+    bfi     \t,\t,#8, #8
+    .if \over != 0
+    lsr     \s0,\t, #8
+    .else
+    eor     \s0,\s0,\t, LSR #8
+    .endif
+
+    @ even bits of x1 -> bits 16..31 of s0
+    and     \t,\x1,#0x55555555
+    orr     \t,\t,\t, LSR #1
+    and     \t,\t,#0x33333333
+    orr     \t,\t,\t, LSR #2
+    and     \t,\t,#0x0F0F0F0F
+    orr     \t,\t,\t, LSR #4
+    and     \t,\t,#0x00FF00FF
+    orr     \t,\t,\t, LSR #8
+    eor     \s0,\s0,\t, LSL #16
+
+    @ odd bits of x0 -> bits 0..15 of s1
+    and     \t,\x0,#0xAAAAAAAA
+    orr     \t,\t,\t, LSL #1
+    and     \t,\t,#0xCCCCCCCC
+    orr     \t,\t,\t, LSL #2
+    and     \t,\t,#0xF0F0F0F0
+    orr     \t,\t,\t, LSL #4
+    and     \t,\t,#0xFF00FF00
+    orr     \t,\t,\t, LSL #8
+    .if \over != 0
+    lsr     \s1,\t, #16
+    .else
+    eor     \s1,\s1,\t, LSR #16
+    .endif
+
+    @ odd bits of x1 -> bits 16..31 of s1
+    and     \t,\x1,#0xAAAAAAAA
+    orr     \t,\t,\t, LSL #1
+    and     \t,\t,#0xCCCCCCCC
+    orr     \t,\t,\t, LSL #2
+    and     \t,\t,#0xF0F0F0F0
+    orr     \t,\t,\t, LSL #4
+    and     \t,\t,#0xFF00FF00
+    orr     \t,\t,\t, LSL #8
+    bfc     \t, #0, #16
+    eors    \s1,\s1,\t
+    .endm
+
+    @ Credit: Henry S.
Warren, Hacker's Delight, Addison-Wesley, 2002
+    @
+    @ fromBitInterleaving x0, x1, t
+    @ Inverse of the bit-interleaved split: on entry x0 / x1 hold the two
+    @ interleaved words of a lane, on exit x0 / x1 hold the plain low / high
+    @ 32-bit words. First the halfwords are regrouped between the two
+    @ registers, then each register is bit-shuffled with the four
+    @ swap steps (8, 4, 2, 1) quoted in the comments below.
+    @ t is a scratch register; x0 and x1 are modified in place and the
+    @ movs / eors instructions update the condition flags.
+.macro fromBitInterleaving x0, x1, t
+
+    movs    \t, \x0                 @ t = x0@
+    bfi     \x0, \x1, #16, #16      @ x0 = (x0 & 0x0000FFFF) | (x1 << 16)@
+    bfc     \x1, #0, #16            @ x1 = (t >> 16) | (x1 & 0xFFFF0000)@
+    orr     \x1, \x1, \t, LSR #16
+
+    eor     \t, \x0, \x0, LSR #8    @ t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL@ x0 = x0 ^ t ^ (t << 8)@
+    and     \t, #0x0000FF00
+    eors    \x0, \x0, \t
+    eor     \x0, \x0, \t, LSL #8
+
+    eor     \t, \x0, \x0, LSR #4    @ t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL@ x0 = x0 ^ t ^ (t << 4)@
+    and     \t, #0x00F000F0
+    eors    \x0, \x0, \t
+    eor     \x0, \x0, \t, LSL #4
+
+    eor     \t, \x0, \x0, LSR #2    @ t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL@ x0 = x0 ^ t ^ (t << 2)@
+    and     \t, #0x0C0C0C0C
+    eors    \x0, \x0, \t
+    eor     \x0, \x0, \t, LSL #2
+
+    eor     \t, \x0, \x0, LSR #1    @ t = (x0 ^ (x0 >> 1)) & 0x22222222UL@ x0 = x0 ^ t ^ (t << 1)@
+    and     \t, #0x22222222
+    eors    \x0, \x0, \t
+    eor     \x0, \x0, \t, LSL #1
+
+    eor     \t, \x1, \x1, LSR #8    @ t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL@ x1 = x1 ^ t ^ (t << 8)@
+    and     \t, #0x0000FF00
+    eors    \x1, \x1, \t
+    eor     \x1, \x1, \t, LSL #8
+
+    eor     \t, \x1, \x1, LSR #4    @ t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL@ x1 = x1 ^ t ^ (t << 4)@
+    and     \t, #0x00F000F0
+    eors    \x1, \x1, \t
+    eor     \x1, \x1, \t, LSL #4
+
+    eor     \t, \x1, \x1, LSR #2    @ t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL@ x1 = x1 ^ t ^ (t << 2)@
+    and     \t, #0x0C0C0C0C
+    eors    \x1, \x1, \t
+    eor     \x1, \x1, \t, LSL #2
+
+    eor     \t, \x1, \x1, LSR #1    @ t = (x1 ^ (x1 >> 1)) & 0x22222222UL@ x1 = x1 ^ t ^ (t << 1)@
+    and     \t, #0x22222222
+    eors    \x1, \x1, \t
+    eor     \x1, \x1, \t, LSL #1
+    .endm
+
+@ --- offsets in state
+@ Byte offsets from the state pointer; every lane occupies two consecutive
+@ 32-bit words, suffixed 0 and 1.
+.equ Aba0, 0*4
+.equ Aba1, 1*4
+.equ Abe0, 2*4
+.equ Abe1, 3*4
+.equ Abi0, 4*4
+.equ Abi1, 5*4
+.equ Abo0, 6*4
+.equ Abo1, 7*4
+.equ Abu0, 8*4
+.equ Abu1, 9*4
+.equ Aga0, 10*4
+.equ Aga1, 11*4
+.equ Age0, 12*4
+.equ Age1, 13*4
+.equ Agi0, 14*4
+.equ Agi1, 15*4
+.equ Ago0, 16*4
+.equ Ago1, 17*4
+.equ Agu0, 18*4
+.equ Agu1, 19*4
+.equ Aka0, 20*4
+.equ Aka1, 21*4
+.equ Ake0, 22*4
+.equ Ake1, 23*4
+.equ Aki0, 24*4
+.equ Aki1, 25*4 +.equ Ako0, 26*4 +.equ Ako1, 27*4 +.equ Aku0, 28*4 +.equ Aku1, 29*4 +.equ Ama0, 30*4 +.equ Ama1, 31*4 +.equ Ame0, 32*4 +.equ Ame1, 33*4 +.equ Ami0, 34*4 +.equ Ami1, 35*4 +.equ Amo0, 36*4 +.equ Amo1, 37*4 +.equ Amu0, 38*4 +.equ Amu1, 39*4 +.equ Asa0, 40*4 +.equ Asa1, 41*4 +.equ Ase0, 42*4 +.equ Ase1, 43*4 +.equ Asi0, 44*4 +.equ Asi1, 45*4 +.equ Aso0, 46*4 +.equ Aso1, 47*4 +.equ Asu0, 48*4 +.equ Asu1, 49*4 + +@ --- offsets on stack +.equ mDa0, 0*4 +.equ mDa1, 1*4 +.equ mDo0, 2*4 +.equ mDo1, 3*4 +.equ mDi0, 4*4 +.equ mRC , 5*4 +.equ mSize, 6*4 + +/****************************************************************************** + * Bitwise exclusive-OR where both operands are misaligned (i.e. src1 and src2 + * are rotated by rot1 and rot2, respectively). + * The output result is also misaligned (i.e. dst is rotated by rot1-rot2). + * - dst destination register + * - src1-src2 source registers + * - rot1-rot2 rotation values + *****************************************************************************/ +.macro eorror dst, src1, src2, rot1, rot2 +.if \rot1 >= \rot2 + eor \dst, \src1, \src2, ror \rot1-\rot2 +.else + eor \dst, \src1, \src2, ror 32+\rot1-\rot2 +.endif +.endm + + +/****************************************************************************** + * Bit clear instruction where both operands are misaligned (i.e. src1 and src2 + * are rotated by rot1 and rot2, respectively). + * The output result is also misaligned (i.e. dst is rotated by rot1-rot2). + * - dst destination register + * - src1-src2 source registers + * - rot1-rot2 rotation values + *****************************************************************************/ +.macro bicror dst, src1, src2, rot1, rot2 +.if \rot1 >= \rot2 + bic \dst, \src1, \src2, ror \rot1-\rot2 +.else + bic \dst, \src1, \src2, ror 32+\rot1-\rot2 +.endif +.endm + + +/****************************************************************************** + * Load 5 words from memory and XOR them all together. 
It is used to compute
+ * the parity columns for the Theta step.
+ * Note that all operands may be misaligned (i.e. rotated by a certain amount
+ * of bits), as well as the result.
+ * The five words are loaded relative to the state pointer in r0.
+ * Clobbers r1, r5, r11 and r12 (used to hold words 2 to 5).
+ * - dst           destination register (also receives the first word)
+ * - src1-src5     memory offsets, relative to r0, of the words to load
+ * - rot1-rot5     rotation values
+ *****************************************************************************/
+.macro xor5 dst, src1, src2, src3, src4, src5, rot1, rot2, rot3, rot4, rot5
+    ldr.w   \dst, [r0, #\src1]
+    ldr.w   r1, [r0, #\src2]
+    ldr.w   r5, [r0, #\src3]
+    ldr     r11, [r0, #\src4]
+    ldr     r12, [r0, #\src5]
+    eorror  \dst, \dst, r1, \rot1, \rot2
+    eorror  \dst, \dst, r5, \rot1, \rot3
+    eorror  \dst, \dst, r11, \rot1, \rot4
+    eorror  \dst, \dst, r12, \rot1, \rot5
+.endm
+
+
+/******************************************************************************
+ * Same as xor5, except that a previous result is stored to memory after the
+ * loads from memory. This allows to have the str instruction for free.
+ * The store goes to [stradr, #strofs]; the callers in this file pass either
+ * sp (stack slot) or r0 (state word) as stradr.
+ * - dst           destination register (also receives the first word)
+ * - src1-src5     memory offsets, relative to r0, of the words to load
+ * - rot1-rot5     rotation values
+ * - strreg        register from previous calculations to be stored in memory
+ * - stradr        register holding the base address to store `strreg`
+ * - strofs        memory offset, relative to stradr, for the str instruction
+ *****************************************************************************/
+.macro xor5str dst, src1, src2, src3, src4, src5, rot1, rot2, rot3, rot4, rot5, strreg, stradr, strofs
+    ldr.w   \dst, [r0, #\src1]
+    ldr.w   r1, [r0, #\src2]
+    ldr.w   r5, [r0, #\src3]
+    ldr     r11, [r0, #\src4]
+    ldr     r12, [r0, #\src5]
+    str.w   \strreg, [\stradr, #\strofs]
+    eorror  \dst, \dst, r1, \rot1, \rot2
+    eorror  \dst, \dst, r5, \rot1, \rot3
+    eorror  \dst, \dst, r11, \rot1, \rot4
+    eorror  \dst, \dst, r12, \rot1, \rot5
+.endm
+
+
+/******************************************************************************
+ * Exclusive-OR where the 2nd operand is rotated by 1 bit to the left.
+ * The rotation is emitted as `ror #(rot-1)`, so rot = 32 means a plain
+ * rotate-left by 1.
+ * - dst           destination register
+ * - src1-src2     source registers
+ * - rot           differential rotation btw src1 & src2 (i.e. rot=rot1-rot2)
+ *****************************************************************************/
+.macro xorrol dst, src1, src2, rot
+    eor     \dst, \src1, \src2, ror \rot-1
+.endm
+
+
+/******************************************************************************
+ * Bitslice implementation of the Chi step with misaligned operands.
+ * Computes r1 = src1 ^ (~src2 & src3), each operand taken with its rotation,
+ * and stores r1 at [r0, #resofs]. r1 is clobbered.
+ * - resofs        memory offset within the internal state to store the result
+ * - src1-src3     source registers
+ * - rot1-rot3     rotation values
+ *****************************************************************************/
+.macro xandnotlazystr resofs, src1, src2, src3, rot1, rot2, rot3
+    bicror  r1, \src3, \src2, \rot3, \rot2
+    eorror  r1, r1, \src1, \rot3, \rot1
+    str.w   r1, [r0, #\resofs]
+.endm
+
+
+/******************************************************************************
+ * Same as xandnotlazystr but without the str instruction which will be carried
+ * out later in order to take advantage of future ldr instructions.
+ * The result is left in r1.
+ * - src1-src3     source registers
+ * - rot1-rot3     rotation values
+ *****************************************************************************/
+.macro xandnotlazy src1, src2, src3, rot1, rot2, rot3
+    bicror  r1, \src3, \src2, \rot3, \rot2
+    eorror  r1, r1, \src1, \rot3, \rot1
+.endm
+
+
+/******************************************************************************
+ * Same as xandnotlazystr with an additional rotation in order to explicitly
+ * compute the Rho step. It is useful in KeccakRound3 in order to return to the
+ * classical representation every 4 rounds.
+ * The extra rotation (ror #32-rot3) is only emitted when rot3 > 0.
+ * - resofs        memory offset within the internal state to store the result
+ * - src1-src3     source registers
+ * - rot1-rot3     rotation values
+ *****************************************************************************/
+.macro xandnotstr resofs, src1, src2, src3, rot1, rot2, rot3
+    bicror  r1, \src3, \src2, \rot3, \rot2
+    eorror  r1, r1, \src1, \rot3, \rot1
+.if \rot3 > 0
+    ror     r1, r1, #32-\rot3
+.endif
+    str.w   r1, [r0, #\resofs]
+.endm
+
+
+/******************************************************************************
+ * Same as xandnotstr but without the str instruction which will be carried
+ * out later in order to take advantage of future ldr instructions.
+ * The result is left in r1.
+ * - src1-src3     source registers
+ * - rot1-rot3     rotation values
+ *****************************************************************************/
+.macro xandnot src1, src2, src3, rot1, rot2, rot3
+    bicror  r1, \src3, \src2, \rot3, \rot2
+    eorror  r1, r1, \src1, \rot3, \rot1
+.if \rot3 > 0
+    ror     r1, r1, #32-\rot3
+.endif
+.endm
+
+
+/******************************************************************************
+ * Same as xandnot followed by the Iota step. Note that the source registers
+ * are not specified since they are always r3, r4 and r5.
+ * The round-constant pointer is read from the stack slot mRC; r1, r4 and r5
+ * are clobbered. When last == 1 the pointer is advanced by 32 bytes, written
+ * back to mRC, and the word it then points to is loaded into r7 and compared
+ * with 0xFF, leaving the flags for the caller.
+ * Note the parameter order: rot3 comes before rot2.
+ * - out           output reg (useful to store the result in the next round)
+ * - rot2-rot3     rotation values
+ * - rcofs         memory offset to load the round constant
+ * - last          Boolean to indicate whether it is the last round of the
+ *                 quadruple round routine
+ *****************************************************************************/
+.macro xandnotiota out, rot3, rot2, rcofs, last
+    bicror  r5, r5, r4, \rot3, \rot2
+    ldr     r1, [sp, #mRC]
+    ldr     r4, [r1, #\rcofs]
+.if \last == 1
+    ldr     r7, [r1, #32]!
+ str r1, [sp, #mRC] + cmp r7, #0xFF +.endif +.if \rot3 > 0 + eor r3, r3, r5, ror 32-\rot3 +.else + eor.w r3, r3, r5 +.endif + eor.w \out, r4, r3 +.endm + + +/****************************************************************************** + * Add the parity bits to the state registers r3-r7. If the state registers are + * not properly aligned due to previous lazy rotations, use the barrel shifter + * to fix the misalignment when adding the parity bits. + * - par1-par5 registers containing the parity bits + * - dly1-dly5 rotation values to compute the (delayed) Rho step + *****************************************************************************/ +.macro addparity par1, dly1, par2, dly2, par3, dly3, par4, dly4, par5, dly5 +.if \dly1 > 0 + eor r3, \par1, r3, ror 32-\dly1 +.else + eor.w r3, \par1, r3 +.endif +.if \dly2 > 0 + eor r4, \par2, r4, ror 32-\dly2 +.else + eor.w r4, \par2, r4 +.endif +.if \dly3 > 0 + eor r5, \par3, r5, ror 32-\dly3 +.else + eor.w r5, \par3, r5 +.endif +.if \dly4 > 0 + eor r6, \par4, r6, ror 32-\dly4 +.else + eor.w r6, \par4, r6 +.endif +.if \dly5 > 0 + eor r7, \par5, r7, ror 32-\dly5 +.else + eor.w r7, \par5, r7 +.endif +.endm + + +/****************************************************************************** + * Apply Theta, Pi, Chi and Iota steps to half a plane (i.e. 5 32-bit words) of + * the internal state. + * Note that the Rho step is calculated if and only if \lazy == 0, otherwise it + * is delayed until the next round using ''lazy reductions'' thanks to the + * inline barrel shifter. 
+ * The five words are loaded into r3-r7. Results for words 2 to 5 are stored
+ * back in place at offsets src2-src5; the result for word 1 (including the
+ * round constant) is left in \reg and is not stored by this macro.
+ * Before any of that, the value currently in r1 (last result of the previous
+ * macro) is stored at [r0, #strofs].
+ * - src1-src5     memory offsets, relative to r0, of the words to process
+ * - par1-par5     registers containing the parity bits
+ * - rot2-rot5     rotation values to compute the current Rho step
+ * - dly1-dly5     rotation values to compute the delayed Rho step
+ * - ofs           memory offset to load the round constant (see xandnotiota)
+ * - last          passed to xandnotiota: 1 on the last round of the
+ *                 quadruple round routine
+ * - lazy          1 to delay the Rho rotations, 0 to apply them explicitly
+ * - strofs        memory offset, relative to r0, at which r1 is stored
+ * - reg           output reg related to the Iota step (to be stored later)
+ *****************************************************************************/
+.macro KeccakThetaRhoPiChiIota src1, par1, dly1, \
+                                src2, par2, rot2, dly2, \
+                                src3, par3, rot3, dly3, \
+                                src4, par4, rot4, dly4, \
+                                src5, par5, rot5, dly5, \
+                                ofs, last, lazy, strofs, reg
+    ldr.w   r3, [r0, #\src1]
+    ldr     r4, [r0, #\src2]
+    ldr     r5, [r0, #\src3]
+    ldr     r6, [r0, #\src4]
+    ldr     r7, [r0, #\src5]
+    str.w   r1, [r0, #\strofs]
+    addparity \par1, \dly1, \par2, \dly2, \par3, \dly3, \par4, \dly4, \par5, \dly5
+.if \lazy == 1
+    xandnotlazystr \src2, r4, r5, r6, \rot2, \rot3, \rot4
+    xandnotlazystr \src3, r5, r6, r7, \rot3, \rot4, \rot5
+    xandnotlazystr \src4, r6, r7, r3, \rot4, \rot5, 0
+    xandnotlazystr \src5, r7, r3, r4, \rot5, 0, \rot2
+.else
+    xandnotstr \src2, r4, r5, r6, \rot2, \rot3, \rot4
+    xandnotstr \src3, r5, r6, r7, \rot3, \rot4, \rot5
+    xandnotstr \src4, r6, r7, r3, \rot4, \rot5, 0
+    xandnotstr \src5, r7, r3, r4, \rot5, 0, \rot2
+.endif
+    xandnotiota \reg, \rot3, \rot2, \ofs, \last
+.endm
+
+
+/******************************************************************************
+ * Apply Theta, Pi, and Chi steps to half a plane (i.e. 5 32-bit words) of the
+ * internal state.
+ * Note that the Rho step is calculated if and only if \lazy == 0, otherwise it
+ * is delayed until the next round using ''lazy reductions'' thanks to the
+ * inline barrel shifter.
+ * The five words are loaded into r3-r7. Results 1 to 4 are stored at offsets
+ * dst1-dst4; the fifth result is left in r1 and is NOT stored here (dst5 is
+ * accepted but unused in the macro body) -- it is written by the next macro
+ * through that macro's strofs argument.
+ * - src1-src5     memory offsets, relative to r0, of the words to process
+ * - dst1-dst5     memory offsets to store the output registers
+ * - par1-par5     registers containing the parity bits
+ * - rot1-rot5     rotation values to compute the current Rho step
+ * - dly1-dly5     rotation values to compute the delayed Rho step
+ * - lazy          Boolean to indicate whether lazy rotations are used or not
+ * - strofs        memory offset, relative to r0, at which the value currently
+ *                 in r1 (the last output of the previous macro) is stored
+ *****************************************************************************/
+.macro KeccakThetaRhoPiChi src1, dst1, par1, rot1, dly1, \
+                            src2, dst2, par2, rot2, dly2, \
+                            src3, dst3, par3, rot3, dly3, \
+                            src4, dst4, par4, rot4, dly4, \
+                            src5, dst5, par5, rot5, dly5, \
+                            lazy, strofs
+    ldr.w   r3, [r0, #\src1]
+    ldr.w   r4, [r0, #\src2]
+    ldr.w   r5, [r0, #\src3]
+    ldr.w   r6, [r0, #\src4]
+    ldr.w   r7, [r0, #\src5]
+    str.w   r1, [r0, #\strofs]
+    addparity \par1, \dly1, \par2, \dly2, \par3, \dly3, \par4, \dly4, \par5, \dly5
+.if \lazy == 1
+    xandnotlazystr \dst1, r3, r4, r5, \rot1, \rot2, \rot3
+    xandnotlazystr \dst2, r4, r5, r6, \rot2, \rot3, \rot4
+    xandnotlazystr \dst3, r5, r6, r7, \rot3, \rot4, \rot5
+    xandnotlazystr \dst4, r6, r7, r3, \rot4, \rot5, \rot1
+    xandnotlazy r7, r3, r4, \rot5, \rot1, \rot2
+.else
+    xandnotstr \dst1, r3, r4, r5, \rot1, \rot2, \rot3
+    xandnotstr \dst2, r4, r5, r6, \rot2, \rot3, \rot4
+    xandnotstr \dst3, r5, r6, r7, \rot3, \rot4, \rot5
+    xandnotstr \dst4, r6, r7, r3, \rot4, \rot5, \rot1
+    xandnot r7, r3, r4, \rot5, \rot1, \rot2
+.endif
+.endm
+
+
+/******************************************************************************
+ * 1st round of the 4 unrolled rounds routine due to in-place processing.
+ * At the beginning of such rounds, the internal state is expected to match the
+ * classical representation (i.e. without transition and no delayed Rho step).
+ *****************************************************************************/ +.macro KeccakRound0 + xor5 r3, Abu0, Agu0, Aku0, Amu0, Asu0, 0, 0, 0, 0, 0 + xor5 r7, Abe1, Age1, Ake1, Ame1, Ase1, 0, 0, 0, 0, 0 + xorrol r6, r3, r7, 32 + xor5str r4, Abi1, Agi1, Aki1, Ami1, Asi1, 0, 0, 0, 0, 0, r6, sp, mDa0 + eor.w r6, r3, r4 + xor5str r3, Abo0, Ago0, Ako0, Amo0, Aso0, 0, 0, 0, 0, 0, r6, sp, mDo1 + eor.w r2, r7, r3 + xor5 r7, Aba0, Aga0, Aka0, Ama0, Asa0, 0, 0, 0, 0, 0 + xorrol r10, r7, r4, 32 + xor5 r4, Abo1, Ago1, Ako1, Amo1, Aso1, 0, 0, 0, 0, 0 + eor r14, r4, r7 + xor5 r7, Abe0, Age0, Ake0, Ame0, Ase0, 0, 0, 0, 0, 0 + xorrol r6, r7, r4, 32 + xor5str r4, Abu1, Agu1, Aku1, Amu1, Asu1, 0, 0, 0, 0, 0, r6, sp, mDi0 + eor.w r8, r4, r7 + xor5str r7, Abi0, Agi0, Aki0, Ami0, Asi0, 0, 0, 0, 0, 0, r8, sp, mDa1 + xorrol r9, r7, r4, 32 + xor5str r4, Aba1, Aga1, Aka1, Ama1, Asa1, 0, 0, 0, 0, 0, r9, sp, mDo0 + eor r11, r4, r7 + xorrol r12, r3, r4, 32 + KeccakThetaRhoPiChi Abo0, Aka1, r9, 14, 0, \ + Agu0, Ame1, r12, 10, 0, \ + Aka1, Asi1, r8, 2, 0, \ + Ame1, Abo0, r11, 23, 0, \ + Asi1, Agu0, r2, 31, 0, \ + 1, Aka1 + KeccakThetaRhoPiChi Abe0, Asa1, r10, 0, 0, \ + Agi1, Abe0, r2, 3, 0, \ + Ako0, Agi1, r9, 12, 0, \ + Amu1, Ako0, r14, 4, 0, \ + Asa1, Amu1, r8, 9, 0, \ + 1, Agu0 + ldr r8, [sp, #mDa0] + KeccakThetaRhoPiChi Abu1, Aga0, r14, 14, 0, \ + Aga0, Ake0, r8, 18, 0, \ + Ake0, Ami1, r10, 5, 0, \ + Ami1, Aso0, r2, 8, 0, \ + Aso0, Abu1, r9, 28, 0, \ + 1, Amu1 + KeccakThetaRhoPiChi Abi1, Ama0, r2, 31, 0, \ + Ago0, Ase1, r9, 27, 0, \ + Aku0, Abi1, r12, 19, 0, \ + Ama0, Ago0, r8, 20, 0, \ + Ase1, Aku0, r11, 1, 0, \ + 1, Abu1 + ldr r9, [sp, #mDo1] + KeccakThetaRhoPiChiIota Aba0, r8, 0, \ + Age0, r10, 22, 0, \ + Aki1, r2, 22, 0, \ + Amo1, r9, 11, 0, \ + Asu0, r12, 7, 0, \ + 0, 0, 1, Aku0, r1 + ldr.w r2, [sp, #mDi0] + KeccakThetaRhoPiChi Abo1, Aka0, r9, 14, 0, \ + Agu1, Ame0, r14, 10, 0, \ + Aka0, Asi0, r8, 1, 0, \ + Ame0, Abo1, r10, 22, 0, \ + Asi0, Agu1, r2, 30, 0, \ + 1, Aba0 + 
KeccakThetaRhoPiChi Abe1, Asa0, r11, 1, 0, \ + Agi0, Abe1, r2, 3, 0, \ + Ako1, Agi0, r9, 13, 0, \ + Amu0, Ako1, r12, 4, 0, \ + Asa0, Amu0, r8, 9, 0, \ + 1, Agu1 + ldr r8, [sp, #mDa1] + KeccakThetaRhoPiChi Abu0, Aga1, r12, 13, 0, \ + Aga1, Ake1, r8, 18, 0, \ + Ake1, Ami0, r11, 5, 0, \ + Ami0, Aso1, r2, 7, 0, \ + Aso1, Abu0, r9, 28, 0, \ + 1, Amu0 + KeccakThetaRhoPiChi Abi0, Ama1, r2, 31, 0, \ + Ago1, Ase0, r9, 28, 0, \ + Aku1, Abi0, r14, 20, 0, \ + Ama1, Ago1, r8, 21, 0, \ + Ase0, Aku1, r10, 1, 0, \ + 1, Abu0 + ldr r9, [sp, #mDo0] + KeccakThetaRhoPiChiIota Aba1, r8, 0, \ + Age1, r11, 22, 0, \ + Aki0, r2, 21, 0, \ + Amo0, r9, 10, 0, \ + Asu1, r14, 7, 0, \ + 4, 0, 1, Aku1, r14 +.endm + + + +/****************************************************************************** + * 2nd round of the 4 unrolled rounds routine due to in-place processing. + *****************************************************************************/ +.macro KeccakRound1 + xor5str r3, Asu0, Agu0, Amu0, Abu1, Aku1, 22, 10, 3, 18, 28, r14, r0, Aba1 + xor5 r7, Age1, Ame0, Abe0, Ake1, Ase1, 10, 22, 4, 7, 20 + ror r3, 32-22 + xorrol r6, r3, r7, 32-10 + xor5str r4, Aki0, Asi0, Agi1, Ami0, Abi1, 7, 30, 9, 28, 1, r6, sp, mDa0 + eor r6, r3, r4, ror 32-7 + xor5str r3, Amo1, Abo0, Ako1, Aso0, Ago1, 0, 14, 1, 14, 31, r6, sp, mDo1 + eor r2, r3, r7, ror 32-10 + xor5 r7, Aba0, Aka1, Asa0, Aga0, Ama1, 0, 2, 13, 5, 20 + xorrol r10, r7, r4, 32-7 + xor5 r4, Amo0, Abo1, Ako0, Aso1, Ago0, 0, 14, 0, 13, 31 + eor r14, r4, r7 + xor5 r7, Age0, Ame1, Abe1, Ake0, Ase0, 11, 23, 4, 8, 21 + ror r7, 32-11 + xorrol r6, r7, r4, 32 + xor5str r4, Asu1, Agu1, Amu1, Abu0, Aku0, 22, 10, 3, 18, 27, r6, sp, mDi0 + eor r8, r7, r4, ror 32-22 + xor5str r7, Aki1, Asi1, Agi0, Ami1, Abi0, 7, 31, 9, 28, 1, r8, sp, mDa1 + ror r7, 32-7 + xorrol r9, r7, r4, 32-22 + xor5str r4, Aba1, Aka0, Asa1, Aga1, Ama0, 0, 1, 12, 5, 19, r9, sp, mDo0 + eor r11, r4, r7 + xorrol r12, r3, r4, 32 + KeccakThetaRhoPiChi Amo1, Asa1, r9, 14, 0, \ + Agu0, Ake1, r12, 
10, 10, \ + Asa1, Abi1, r8, 2, 12, \ + Ake1, Amo1, r11, 23, 7, \ + Abi1, Agu0, r2, 31, 1, \ + 1, Asa1 + KeccakThetaRhoPiChi Age0, Ama0, r10, 0, 11, \ + Asi0, Age0, r2, 3, 30, \ + Ako1, Asi0, r9, 12, 1, \ + Abu0, Ako1, r14, 4, 18, \ + Ama0, Abu0, r8, 9, 19, \ + 1, Agu0 + ldr r8, [sp, #mDa0] + KeccakThetaRhoPiChi Asu1, Aka1, r14, 14, 22, \ + Aka1, Abe1, r8, 18, 2, \ + Abe1, Ami0, r10, 5, 4, \ + Ami0, Ago1, r2, 8, 28, \ + Ago1, Asu1, r9, 28, 31, \ + 1, Abu0 + KeccakThetaRhoPiChi Aki0, Aga0, r2, 31, 7, \ + Abo0, Ase1, r9, 27, 14, \ + Amu0, Aki0, r12, 19, 3, \ + Aga0, Abo0, r8, 20, 5, \ + Ase1, Amu0, r11, 1, 20, \ + 1, Asu1 + ldr r9, [sp, #mDo1] + KeccakThetaRhoPiChiIota Aba0, r8, 0, \ + Ame1, r10, 22, 23, \ + Agi1, r2, 22, 9, \ + Aso1, r9, 11, 13, \ + Aku1, r12, 7, 28, \ + 8, 0, 1, Amu0, r1 + ldr.w r2, [sp, #mDi0] + KeccakThetaRhoPiChi Amo0, Asa0, r9, 14, 0, \ + Agu1, Ake0, r14, 10, 10, \ + Asa0, Abi0, r8, 1, 13, \ + Ake0, Amo0, r10, 22, 8, \ + Abi0, Agu1, r2, 30, 1, \ + 1, Aba0 + KeccakThetaRhoPiChi Age1, Ama1, r11, 1, 10, \ + Asi1, Age1, r2, 3, 31, \ + Ako0, Asi1, r9, 13, 0, \ + Abu1, Ako0, r12, 4, 18, \ + Ama1, Abu1, r8, 9, 20, \ + 1, Agu1 + ldr r8, [sp, #mDa1] + KeccakThetaRhoPiChi Asu0, Aka0, r12, 13, 22, \ + Aka0, Abe0, r8, 18, 1, \ + Abe0, Ami1, r11, 5, 4, \ + Ami1, Ago0, r2, 7, 28, \ + Ago0, Asu0, r9, 28, 31, \ + 1, Abu1 + KeccakThetaRhoPiChi Aki1, Aga1, r2, 31, 7, \ + Abo1, Ase0, r9, 28, 14, \ + Amu1, Aki1, r14, 20, 3, \ + Aga1, Abo1, r8, 21, 5, \ + Ase0, Amu1, r10, 1, 21, \ + 1, Asu0 + ldr r9, [sp, #mDo0] + KeccakThetaRhoPiChiIota Aba1, r8, 0, \ + Ame0, r11, 22, 22, \ + Agi0, r2, 21, 9, \ + Aso0, r9, 10, 14, \ + Aku0, r14, 7, 27, \ + 12, 0, 1, Amu1, r14 +.endm + +/****************************************************************************** + * 3rd round of the 4 unrolled rounds routine due to in-place processing. 
+ *****************************************************************************/ +.macro KeccakRound2 + xor5str r3, Aku1, Agu0, Abu1, Asu1, Amu1, 22, 10, 3, 18, 28, r14, r0, Aba1 + xor5 r7, Ame0, Ake0, Age0, Abe0, Ase1, 10, 22, 4, 7, 20 + ror r3, 32-22 + xorrol r6, r3, r7, 32-10 + xor5str r4, Agi0, Abi0, Asi0, Ami1, Aki0, 7, 30, 9, 28, 1, r6, sp, mDa0 + eor r6, r3, r4, ror 32-7 + xor5str r3, Aso1, Amo1, Ako0, Ago1, Abo1, 0, 14, 1, 14, 31, r6, sp, mDo1 + eor r2, r3, r7, ror 32-10 + xor5 r7, Aba0, Asa1, Ama1, Aka1, Aga1, 0, 2, 13, 5, 20 + xorrol r10, r7, r4, 32-7 + xor5 r4, Aso0, Amo0, Ako1, Ago0, Abo0, 0, 14, 0, 13, 31 + eor r14, r4, r7 + xor5 r7, Ame1, Ake1, Age1, Abe1, Ase0, 11, 23, 4, 8, 21 + ror r7, 32-11 + xorrol r6, r7, r4, 32 + xor5str r4, Aku0, Agu1, Abu0, Asu0, Amu0, 22, 10, 3, 18, 27, r6, sp, mDi0 + eor r8, r7, r4, ror 32-22 + xor5str r7, Agi1, Abi1, Asi1, Ami0, Aki1, 7, 31, 9, 28, 1, r8, sp, mDa1 + ror r7, 32-7 + xorrol r9, r7, r4, 32-22 + xor5str r4, Aba1, Asa0, Ama0, Aka0, Aga0, 0, 1, 12, 5, 19, r9, sp, mDo0 + eor r11, r4, r7 + xorrol r12, r3, r4, 32 + KeccakThetaRhoPiChi Aso1, Ama0, r9, 14, 0, \ + Agu0, Abe0, r12, 10, 10, \ + Ama0, Aki0, r8, 2, 12, \ + Abe0, Aso1, r11, 23, 7, \ + Aki0, Agu0, r2, 31, 1, \ + 1, Ama0 + KeccakThetaRhoPiChi Ame1, Aga0, r10, 0, 11, \ + Abi0, Ame1, r2, 3, 30, \ + Ako0, Abi0, r9, 12, 1, \ + Asu0, Ako0, r14, 4, 18, \ + Aga0, Asu0, r8, 9, 19, \ + 1, Agu0 + ldr r8, [sp, #mDa0] + KeccakThetaRhoPiChi Aku0, Asa1, r14, 14, 22, \ + Asa1, Age1, r8, 18, 2, \ + Age1, Ami1, r10, 5, 4, \ + Ami1, Abo1, r2, 8, 28, \ + Abo1, Aku0, r9, 28, 31, \ + 1, Asu0 + KeccakThetaRhoPiChi Agi0, Aka1, r2, 31, 7, \ + Amo1, Ase1, r9, 27, 14, \ + Abu1, Agi0, r12, 19, 3, \ + Aka1, Amo1, r8, 20, 5, \ + Ase1, Abu1, r11, 1, 20, \ + 1, Aku0 + ldr r9, [sp, #mDo1] + KeccakThetaRhoPiChiIota Aba0, r8, 0, \ + Ake1, r10,22, 23, \ + Asi0, r2, 22, 9, \ + Ago0, r9, 11, 13, \ + Amu1, r12, 7, 28, \ + 16, 0, 1, Abu1, r1 + ldr.w r2, [sp, #mDi0] + KeccakThetaRhoPiChi Aso0, 
Ama1, r9, 14, 0, \ + Agu1, Abe1, r14, 10, 10, \ + Ama1, Aki1, r8, 1, 13, \ + Abe1, Aso0, r10, 22, 8, \ + Aki1, Agu1, r2, 30, 1, \ + 1, Aba0 + KeccakThetaRhoPiChi Ame0, Aga1, r11, 1, 10, \ + Abi1, Ame0, r2, 3, 31, \ + Ako1, Abi1, r9, 13, 0, \ + Asu1, Ako1, r12, 4, 18, \ + Aga1, Asu1, r8, 9, 20, \ + 1, Agu1 + ldr r8, [sp, #mDa1] + KeccakThetaRhoPiChi Aku1, Asa0, r12, 13, 22, \ + Asa0, Age0, r8, 18, 1, \ + Age0, Ami0, r11, 5, 4, \ + Ami0, Abo0, r2, 7, 28, \ + Abo0, Aku1, r9, 28, 31, \ + 1, Asu1 + KeccakThetaRhoPiChi Agi1, Aka0, r2, 31, 7, \ + Amo0, Ase0, r9, 28, 14, \ + Abu0, Agi1, r14, 20, 3, \ + Aka0, Amo0, r8, 21, 5, \ + Ase0, Abu0, r10, 1, 21, \ + 1, Aku1 + ldr r9, [sp, #mDo0] + KeccakThetaRhoPiChiIota Aba1, r8, 0, \ + Ake0, r11, 22, 22, \ + Asi1, r2, 21, 9, \ + Ago1, r9, 10, 14, \ + Amu0, r14, 7, 27, \ + 20, 0, 1, Abu0, r14 + +.endm + + +/****************************************************************************** + * 4th round of the 4 unrolled rounds routine due to in-place processing. + * Note that the Rho step is *not* delayed so that the internal state is + * compliant w/ the classical representation at the end of the routine. 
+ *****************************************************************************/ +.macro KeccakRound3 + xor5str r3, Amu1, Agu0, Asu1, Aku0, Abu0, 22, 10, 3, 18, 28, r14, r0, Aba1 + xor5 r7, Ake0, Abe1, Ame1, Age0, Ase1, 10, 22, 4, 7, 20 + ror r3, 32-22 + xorrol r6, r3, r7, 32-10 + xor5str r4, Asi1, Aki1, Abi0, Ami0, Agi0, 7, 30, 9, 28, 1, r6, sp, mDa0 + eor r6, r3, r4, ror 32-7 + xor5str r3, Ago0, Aso1, Ako1, Abo1, Amo0, 0, 14, 1, 14, 31, r6, sp, mDo1 + eor r2, r3, r7, ror 32-10 + xor5 r7, Aba0, Ama0, Aga1, Asa1, Aka0, 0, 2, 13, 5, 20 + xorrol r10, r7, r4, 32-7 + xor5 r4, Ago1, Aso0, Ako0, Abo0, Amo1, 0, 14, 0, 13, 31 + eor r14, r4, r7 + xor5 r7, Ake1, Abe0, Ame0, Age1, Ase0, 11, 23, 4, 8, 21 + ror r7, #32-11 + xorrol r6, r7, r4, 32 + xor5str r4, Amu0, Agu1, Asu0, Aku1, Abu1, 22, 10, 3, 18, 27, r6, sp, mDi0 + eor r8, r7, r4, ror 32-22 + xor5str r7, Asi0, Aki0, Abi1, Ami1, Agi1, 7, 31, 9, 28, 1, r8, sp, mDa1 + ror r7, 32-7 + xorrol r9, r7, r4, 32-22 + xor5str r4, Aba1, Ama1, Aga0, Asa0, Aka1, 0, 1, 12, 5, 19, r9, sp, mDo0 + eor r11, r4, r7 + xorrol r12, r3, r4, 32 + KeccakThetaRhoPiChi Ago0, Aga0, r9, 14, 0, \ + Agu0, Age0, r12, 10, 10, \ + Aga0, Agi0, r8, 2, 12, \ + Age0, Ago0, r11, 23, 7, \ + Agi0, Agu0, r2, 31, 1, \ + 0, Aga0 + KeccakThetaRhoPiChi Ake1, Aka1, r10, 0, 11, \ + Aki1, Ake1, r2, 3, 30, \ + Ako1, Aki1, r9, 12, 1, \ + Aku1, Ako1, r14, 4, 18, \ + Aka1, Aku1, r8, 9, 19, \ + 0, Agu0 + ldr r8, [sp, #mDa0] + KeccakThetaRhoPiChi Amu0, Ama0, r14, 14, 22, \ + Ama0, Ame0, r8, 18, 2, \ + Ame0, Ami0, r10, 5, 4, \ + Ami0, Amo0, r2, 8, 28, \ + Amo0, Amu0, r9, 28, 31, \ + 0, Aku1 + KeccakThetaRhoPiChi Asi1, Asa1, r2, 31, 7, \ + Aso1, Ase1, r9, 27, 14, \ + Asu1, Asi1, r12, 19, 3, \ + Asa1, Aso1, r8, 20, 5, \ + Ase1, Asu1, r11, 1, 20, \ + 0, Amu0 + ldr r9, [sp, #mDo1] + KeccakThetaRhoPiChiIota Aba0, r8, 0, \ + Abe0, r10, 22, 23, \ + Abi0, r2, 22, 9, \ + Abo0, r9, 11, 13, \ + Abu0, r12, 7, 28, \ + 24, 0, 0, Asu1, r1 + ldr.w r2, [sp, #mDi0] + KeccakThetaRhoPiChi Ago1, 
Aga1, r9, 14, 0, \ + Agu1, Age1, r14, 10, 10, \ + Aga1, Agi1, r8, 1, 13, \ + Age1, Ago1, r10, 22, 8, \ + Agi1, Agu1, r2, 30, 1, \ + 0, Aba0 + KeccakThetaRhoPiChi Ake0, Aka0, r11, 1, 10, \ + Aki0, Ake0, r2, 3, 31, \ + Ako0, Aki0, r9, 13, 0, \ + Aku0, Ako0, r12, 4, 18, \ + Aka0, Aku0, r8, 9, 20, \ + 0, Agu1 + ldr r8, [sp, #mDa1] + KeccakThetaRhoPiChi Amu1, Ama1, r12, 13, 22, \ + Ama1, Ame1, r8, 18, 1, \ + Ame1, Ami1, r11, 5, 4, \ + Ami1, Amo1, r2, 7, 28, \ + Amo1, Amu1, r9, 28, 31, \ + 0, Aku0 + KeccakThetaRhoPiChi Asi0, Asa0, r2, 31, 7, \ + Aso0, Ase0, r9, 28, 14, \ + Asu0, Asi0, r14, 20, 3, \ + Asa0, Aso0, r8, 21, 5, \ + Ase0, Asu0, r10, 1, 21, \ + 0, Amu1 + ldr r9, [sp, #mDo0] + KeccakThetaRhoPiChiIota Aba1, r8, 0, \ + Abe1, r11, 22, 22, \ + Abi1, r2, 21, 9, \ + Abo1, r9, 10, 14, \ + Abu1, r14, 7, 27, \ + 28, 1, 0, Asu0, r1 + str.w r1, [r0, #Aba1] +.endm + + +@---------------------------------------------------------------------------- +@ +@ void KeccakF1600_Initialize( void ) +@ +.align 8 +.global KeccakF1600_Initialize +KeccakF1600_Initialize: + bx lr + + + +@---------------------------------------------------------------------------- +@ +@ void KeccakF1600_StateXORBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) +@ +.align 8 +.global KeccakF1600_StateXORBytes +KeccakF1600_StateXORBytes: + cbz r3, KeccakF1600_StateXORBytes_Exit1 + push {r4 - r8, lr} @ then + bic r4, r2, #7 @ offset &= ~7 + adds r0, r0, r4 @ add whole lane offset to state pointer + ands r2, r2, #7 @ offset &= 7 (part not lane aligned) + beq KeccakF1600_StateXORBytes_CheckLanes @ .if offset != 0 + movs r4, r3 @ then, do remaining bytes in first lane + rsb r5, r2, #8 @ max size in lane = 8 - offset + cmp r4, r5 + ble KeccakF1600_StateXORBytes_BytesAlign + movs r4, r5 +KeccakF1600_StateXORBytes_BytesAlign: + sub r8, r3, r4 @ size left + movs r3, r4 + bl __KeccakF1600_StateXORBytesInLane + mov r3, r8 +KeccakF1600_StateXORBytes_CheckLanes: + lsrs r2, r3, #3 @ .if 
length >= 8 + beq KeccakF1600_StateXORBytes_Bytes + mov r8, r3 @ keep length; the lane helper clobbers r3 + bl __KeccakF1600_StateXORLanes + and r3, r8, #7 @ bytes left after the whole lanes +KeccakF1600_StateXORBytes_Bytes: + cbz r3, KeccakF1600_StateXORBytes_Exit + movs r2, #0 @ trailing bytes start at offset 0 of the lane + bl __KeccakF1600_StateXORBytesInLane +KeccakF1600_StateXORBytes_Exit: + pop {r4 - r8, pc} +KeccakF1600_StateXORBytes_Exit1: + bx lr + + +@---------------------------------------------------------------------------- +@ +@ __KeccakF1600_StateXORLanes +@ +@ Input: +@ r0 state pointer +@ r1 data pointer +@ r2 laneCount +@ +@ Output: +@ r0 state pointer next lane +@ r1 data pointer next byte to input +@ +@ Changed: r2-r7 +@ +.align 8 +__KeccakF1600_StateXORLanes: +__KeccakF1600_StateXORLanes_LoopAligned: + ldr r4, [r1], #4 @ low word of the input lane + ldr r5, [r1], #4 @ high word of the input lane + ldrd r6, r7, [r0] @ current state lane + toBitInterleaving r4, r5, r6, r7, r3, 0 + strd r6, r7, [r0], #8 @ write lane back, advance to next lane + subs r2, r2, #1 + bne __KeccakF1600_StateXORLanes_LoopAligned + bx lr + + +@---------------------------------------------------------------------------- +@ +@ __KeccakF1600_StateXORBytesInLane +@ +@ Input: +@ r0 state pointer +@ r1 data pointer +@ r2 offset in lane +@ r3 length +@ +@ Output: +@ r0 state pointer next lane +@ r1 data pointer next byte to input +@ +@ Changed: r2-r7 +@ +.align 8 +__KeccakF1600_StateXORBytesInLane: + movs r4, #0 + movs r5, #0 + push { r4 - r5 } @ zeroed 8-byte scratch lane on the stack + add r2, r2, sp @ r2 = address of scratch byte at offset +__KeccakF1600_StateXORBytesInLane_Loop: + ldrb r5, [r1], #1 + strb r5, [r2], #1 + subs r3, r3, #1 + bne __KeccakF1600_StateXORBytesInLane_Loop + pop { r4 - r5 } @ reload the scratch lane as two words + ldrd r6, r7, [r0] + toBitInterleaving r4, r5, r6, r7, r3, 0 + strd r6, r7, [r0], #8 + bx lr + + + + +@---------------------------------------------------------------------------- +@ +@ void KeccakF1600_StateExtractBytes(void *state, unsigned char *data, unsigned int offset, unsigned int length) +@ +.align 8 +.global KeccakF1600_StateExtractBytes +KeccakF1600_StateExtractBytes: + cbz r3, KeccakF1600_StateExtractBytes_Exit1 @ .if length != 0 + push {r4 - r8, lr} @ then + bic r4, r2, #7 @ offset
&= ~7 + adds r0, r0, r4 @ add whole lane offset to state pointer + ands r2, r2, #7 @ offset &= 7 (part not lane aligned) + beq KeccakF1600_StateExtractBytes_CheckLanes @ .if offset != 0 + movs r4, r3 @ then, do remaining bytes in first lane + rsb r5, r2, #8 @ max size in lane = 8 - offset + cmp r4, r5 + ble KeccakF1600_StateExtractBytes_BytesAlign + movs r4, r5 +KeccakF1600_StateExtractBytes_BytesAlign: + sub r8, r3, r4 @ size left + movs r3, r4 + bl __KeccakF1600_StateExtractBytesInLane + mov r3, r8 +KeccakF1600_StateExtractBytes_CheckLanes: + lsrs r2, r3, #3 @ .if length >= 8 + beq KeccakF1600_StateExtractBytes_Bytes + mov r8, r3 @ keep length; the lane helper clobbers r3 + bl __KeccakF1600_StateExtractLanes + and r3, r8, #7 @ bytes left after the whole lanes +KeccakF1600_StateExtractBytes_Bytes: + cbz r3, KeccakF1600_StateExtractBytes_Exit + movs r2, #0 @ trailing bytes start at offset 0 of the lane + bl __KeccakF1600_StateExtractBytesInLane +KeccakF1600_StateExtractBytes_Exit: + pop {r4 - r8, pc} +KeccakF1600_StateExtractBytes_Exit1: + bx lr + + +@---------------------------------------------------------------------------- +@ +@ __KeccakF1600_StateExtractLanes +@ +@ Input: +@ r0 state pointer +@ r1 data pointer +@ r2 laneCount +@ +@ Output: +@ r0 state pointer next lane +@ r1 data pointer next byte to output +@ +@ Changed: r2-r5 +@ +.align 8 +__KeccakF1600_StateExtractLanes: +__KeccakF1600_StateExtractLanes_LoopAligned: + ldrd r4, r5, [r0], #8 @ load state lane, advance to next lane + fromBitInterleaving r4, r5, r3 + str r4, [r1], #4 @ store low word to data + subs r2, r2, #1 + str r5, [r1], #4 @ store high word to data (flags from subs kept) + bne __KeccakF1600_StateExtractLanes_LoopAligned + bx lr + + +@---------------------------------------------------------------------------- +@ +@ __KeccakF1600_StateExtractBytesInLane +@ +@ Input: +@ r0 state pointer +@ r1 data pointer +@ r2 offset in lane +@ r3 length +@ +@ Output: +@ r0 state pointer next lane +@ r1 data pointer next byte to output +@ +@ Changed: r2-r6 +@ +.align 8 +__KeccakF1600_StateExtractBytesInLane: + ldrd r4, r5, [r0], #8 + fromBitInterleaving r4, r5, r6 + push {r4, r5} @ spill the converted lane to an 8-byte stack scratch + add r2, sp, r2 @ r2 = address of scratch byte at offset +__KeccakF1600_StateExtractBytesInLane_Loop: +
ldrb r4, [r2], #1 + subs r3, r3, #1 + strb r4, [r1], #1 + bne __KeccakF1600_StateExtractBytesInLane_Loop + add sp, #8 + bx lr + + + +.align 8 +KeccakF1600_StatePermute_RoundConstantsWithTerminator: + @ 0 1 + .long 0x00000001, 0x00000000 + .long 0x00000000, 0x00000089 + .long 0x00000000, 0x8000008b + .long 0x00000000, 0x80008080 + + .long 0x00000001, 0x0000008b + .long 0x00000001, 0x00008000 + .long 0x00000001, 0x80008088 + .long 0x00000001, 0x80000082 + + .long 0x00000000, 0x0000000b + .long 0x00000000, 0x0000000a + .long 0x00000001, 0x00008082 + .long 0x00000000, 0x00008003 + + .long 0x00000001, 0x0000808b + .long 0x00000001, 0x8000000b + .long 0x00000001, 0x8000008a + .long 0x00000001, 0x80000081 + + .long 0x00000000, 0x80000081 + .long 0x00000000, 0x80000008 + .long 0x00000000, 0x00000083 + .long 0x00000000, 0x80008003 + + .long 0x00000001, 0x80008088 + .long 0x00000000, 0x80000088 + .long 0x00000001, 0x00008000 + .long 0x00000000, 0x80008082 + + .long 0x000000FF @terminator + +@---------------------------------------------------------------------------- +@ +@ void KeccakF1600_StatePermute( void *state ) +@ +.align 8 +.global KeccakF1600_StatePermute +KeccakF1600_StatePermute: + adr r1, KeccakF1600_StatePermute_RoundConstantsWithTerminator + push { r4 - r12, lr } + sub sp, #mSize + str r1, [sp, #mRC] +KeccakF1600_StatePermute_RoundLoop: + KeccakRound0 + KeccakRound1 + KeccakRound2 + KeccakRound3 + bne KeccakF1600_StatePermute_RoundLoop + add sp, #mSize + pop { r4 - r12, pc } + diff --git a/common/keccaktest.c b/common/keccaktest.c new file mode 100644 index 0000000..389bb9c --- /dev/null +++ b/common/keccaktest.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +#include "randombytes.h" +#include +#include +#include +#include +#include + +#include + + +#include +#include + +const unsigned char msg1[] = { + 0x84u, 0xb6u, 0x0cu, 0xb3u, 0x72u, 0x0bu, 0xf2u, 0x97u, 0x48u, 0x48u, 0x3cu, 0xf7u, 0xabu, 0xd0u, 0xd1u, + 0xf1u, 0xd9u, 0x38u, 0x04u, 
0x59u, 0xdfu, 0xa9u, 0x68u, 0x46u, 0x0cu, 0x86u, 0xe5u, 0xd1u, 0xa5u, 0x4fu, + 0x0bu, 0x19u, 0xdau, 0xc6u, 0xa7u, 0x8bu, 0xf9u, 0x50u, 0x94u, 0x60u, 0xe2u, 0x9du, 0xd4u, 0x66u, 0xbbu, + 0x8bu, 0xdfu, 0x04u, 0xe5u, 0x48u, 0x3bu, 0x78u, 0x2eu, 0xb7u, 0x4du, 0x64u, 0x48u, 0x16u, 0x6fu, 0x89u, + 0x7au, 0xddu, 0x43u, 0xd2u, 0x95u, 0xe9u, 0x46u, 0x94u, 0x2au, 0xd9u, 0xa8u, 0x14u, 0xfau, 0xb9u, 0x5bu, + 0x4au, 0xaeu, 0xdeu, 0x6au, 0xe4u, 0xc8u, 0x10u, 0x8cu, 0x8eu, 0xdau, 0xefu, 0xf9u, 0x71u, 0xf5u, 0x8fu, + 0x7cu, 0xf9u, 0x65u, 0x66u, 0xc9u, 0xdcu, 0x9bu, 0x68u, 0x12u, 0x58u, 0x6bu, 0x70u, 0xd5u, 0xbcu, 0x78u, + 0xe2u, 0xf8u, 0x29u, 0xecu, 0x8eu, 0x17u, 0x9au, 0x6cu, 0xd8u, 0x1du, 0x22u, 0x4bu, 0x16u, 0x11u, 0x75u, + 0xfdu, 0x3au, 0x33u, 0xaau, 0xcfu, 0xb1u, 0x48u, 0x3fu, +}; + +const unsigned char md1[] = { + 0x88u, 0x14u, 0x63u, 0x0au, 0x39u, 0xdcu, 0xb9u, 0x97u, 0x92u, 0xccu, 0x4eu, + 0x08u, 0xcau, 0xe5u, 0xddu, 0x07u, 0x89u, 0x73u, 0xd1u, 0x5cu, 0xd1u, 0x9fu, + 0x17u, 0xbau, 0xcfu, 0x04u, 0xdeu, 0xdau, 0x9eu, 0x62u, 0xc4u, 0x5fu, +}; + +static int test(void) +{ + unsigned char buf[32]; + int r = 0; + sha3_256(buf, msg1, sizeof(msg1)); + if(memcmp(buf, md1, sizeof(buf))) { + hal_send_str("ERROR SHA3-256 output does not match test vector.\n"); + r = 1; + } + return r; +} + +static int bench(void) +{ + char str[128]; + unsigned char msg[1024*32]; + unsigned char md[1024*32]; + uint64_t t0, t1; +#define TESTMD(MD) \ + hal_send_str("-"); \ + t0 = hal_get_time(); \ + MD(md, msg, sizeof(msg)); \ + t1 = hal_get_time(); \ + sprintf(str, #MD": %llu cycles", t1-t0); \ + hal_send_str(str) + TESTMD(sha3_256); + TESTMD(sha3_384); + TESTMD(sha3_512); +#define TESTXOF(XOF) \ + hal_send_str("-"); \ + t0 = hal_get_time(); \ + XOF(md, sizeof(md), msg, sizeof(msg)); \ + t1 = hal_get_time(); \ + sprintf(str, #XOF": %llu cycles", t1-t0); \ + hal_send_str(str) + TESTXOF(shake128); + TESTXOF(shake256); + return 0; +} + +int main(void) +{ + hal_setup(CLOCK_BENCHMARK); + hal_send_str("==="); 
+ if(test()){ + hal_send_str("ERR"); + } else { + hal_send_str("ALL GOOD!"); + } + bench(); + hal_send_str("###"); + return 0; +} diff --git a/common/mps2/CMSDK_CM4.h b/common/mps2/CMSDK_CM4.h new file mode 100644 index 0000000..7a6fd8e --- /dev/null +++ b/common/mps2/CMSDK_CM4.h @@ -0,0 +1,1289 @@ +/* MPS2 CMSIS Library +* +* Copyright (c) 2006-2018 ARM Limited +* SPDX-License-Identifier: BSD-3-Clause +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* 1. Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimer. +* +* 2. Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimer in the documentation +* and/or other materials provided with the distribution. +* +* 3. Neither the name of the copyright holder nor the names of its contributors +* may be used to endorse or promote products derived from this software without +* specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. 
+******************************************************************************* +* @file CMSDK_CM4.h +* @brief CMSIS Cortex-M4 Core Peripheral Access Layer Header File for +* Device CMSDK_CM4 +* +*******************************************************************************/ + + +#ifndef CMSDK_CM4_H +#define CMSDK_CM4_H + +#ifdef __cplusplus +extern "C" { +#endif + + +/* ------------------------- Interrupt Number Definition ------------------------ */ + +typedef enum IRQn { + /* ------------------- Cortex-M4 Processor Exceptions Numbers ------------------- */ + NonMaskableInt_IRQn = -14, /* 2 Non Maskable Interrupt */ + HardFault_IRQn = -13, /* 3 HardFault Interrupt */ + MemoryManagement_IRQn = -12, /* 4 Memory Management Interrupt */ + BusFault_IRQn = -11, /* 5 Bus Fault Interrupt */ + UsageFault_IRQn = -10, /* 6 Usage Fault Interrupt */ + SVCall_IRQn = -5, /* 11 SV Call Interrupt */ + DebugMonitor_IRQn = -4, /* 12 Debug Monitor Interrupt */ + PendSV_IRQn = -2, /* 14 Pend SV Interrupt */ + SysTick_IRQn = -1, /* 15 System Tick Interrupt */ + + /****** CMSDK Specific Interrupt Numbers *********************************************************/ + UARTRX0_IRQn = 0, /*!< UART 0 RX Interrupt */ + UARTTX0_IRQn = 1, /*!< UART 0 TX Interrupt */ + UARTRX1_IRQn = 2, /*!< UART 1 RX Interrupt */ + UARTTX1_IRQn = 3, /*!< UART 1 TX Interrupt */ + UARTRX2_IRQn = 4, /*!< UART 2 RX Interrupt */ + UARTTX2_IRQn = 5, /*!< UART 2 TX Interrupt */ + PORT0_ALL_IRQn = 6, /*!< Port 0 combined Interrupt */ + PORT1_ALL_IRQn = 7, /*!< Port 1 combined Interrupt */ + TIMER0_IRQn = 8, /*!< TIMER 0 Interrupt */ + TIMER1_IRQn = 9, /*!< TIMER 1 Interrupt */ + DUALTIMER_IRQn = 10, /*!< Dual Timer Interrupt */ + SPI_IRQn = 11, /*!< SPI Interrupt */ + UARTOVF_IRQn = 12, /*!< UART 0,1,2 Overflow Interrupt */ + ETHERNET_IRQn = 13, /*!< Ethernet Interrupt */ + I2S_IRQn = 14, /*!< I2S Interrupt */ + TSC_IRQn = 15, /*!< Touch Screen Interrupt */ + PORT2_ALL_IRQn = 16, /*!< Port 2 combined Interrupt */ +
PORT3_ALL_IRQn = 17, /*!< Port 3 combined Interrupt */ + UARTRX3_IRQn = 18, /*!< UART 3 RX Interrupt */ + UARTTX3_IRQn = 19, /*!< UART 3 TX Interrupt */ + UARTRX4_IRQn = 20, /*!< UART 4 RX Interrupt */ + UARTTX4_IRQn = 21, /*!< UART 4 TX Interrupt */ + ADCSPI_IRQn = 22, /*!< SHIELD ADC SPI Interrupt */ + SHIELDSPI_IRQn = 23, /*!< SHIELD SPI Combined Interrupt */ + PORT0_0_IRQn = 24, /*!< GPIO Port 0 pin 0 Interrupt */ + PORT0_1_IRQn = 25, /*!< GPIO Port 0 pin 1 Interrupt */ + PORT0_2_IRQn = 26, /*!< GPIO Port 0 pin 2 Interrupt */ + PORT0_3_IRQn = 27, /*!< GPIO Port 0 pin 3 Interrupt */ + PORT0_4_IRQn = 28, /*!< GPIO Port 0 pin 4 Interrupt */ + PORT0_5_IRQn = 29, /*!< GPIO Port 0 pin 5 Interrupt */ + PORT0_6_IRQn = 30, /*!< GPIO Port 0 pin 6 Interrupt */ + PORT0_7_IRQn = 31, /*!< GPIO Port 0 pin 7 Interrupt */ +} IRQn_Type; + + +/* + * ========================================================================== + * ----------- Processor and Core Peripheral Section ------------------------ + * ========================================================================== + */ + +/* Configuration of the Cortex-M4 Processor and Core Peripherals */ +#define __CM4_REV 0x0001 /*!< Core Revision r0p1 */ +#define __NVIC_PRIO_BITS 3 /*!< Number of Bits used for Priority Levels */ +#define __Vendor_SysTickConfig 0 /*!< Set to 1 if different SysTick Config is used */ +#define __MPU_PRESENT 1 /*!< MPU present or not */ +#define __FPU_PRESENT 1 /*!< FPU present or not */ + +/*@}*/ /* end of group CMSDK_CM4_CMSIS */ + + +#include "core_cm4.h" /* Cortex-M4 processor and core peripherals */ + + +/******************************************************************************/ +/* Device Specific Peripheral registers structures */ +/******************************************************************************/ +/** @addtogroup CMSDK_CM4_Peripherals CMSDK_CM4 Peripherals + CMSDK_CM4 Device Specific Peripheral registers structures + @{ +*/ + +#if defined ( __CC_ARM ) +#pragma anon_unions 
+#endif + +/*------------- Universal Asynchronous Receiver Transmitter (UART) -----------*/ +/** @addtogroup CMSDK_UART CMSDK Universal Asynchronous Receiver/Transmitter + memory mapped structure for CMSDK_UART + @{ +*/ +typedef struct { + __IO uint32_t DATA; /*!< Offset: 0x000 Data Register (R/W) */ + __IO uint32_t STATE; /*!< Offset: 0x004 Status Register (R/W) */ + __IO uint32_t CTRL; /*!< Offset: 0x008 Control Register (R/W) */ + union { + __I uint32_t INTSTATUS; /*!< Offset: 0x00C Interrupt Status Register (R/ ) */ + __O uint32_t INTCLEAR; /*!< Offset: 0x00C Interrupt Clear Register ( /W) */ + }; + __IO uint32_t BAUDDIV; /*!< Offset: 0x010 Baudrate Divider Register (R/W) */ + +} CMSDK_UART_TypeDef; + +/* CMSDK_UART DATA Register Definitions */ + +#define CMSDK_UART_DATA_Pos 0 /*!< CMSDK_UART_DATA_Pos: DATA Position */ +#define CMSDK_UART_DATA_Msk (0xFFul << CMSDK_UART_DATA_Pos) /*!< CMSDK_UART DATA: DATA Mask */ + +#define CMSDK_UART_STATE_RXOR_Pos 3 /*!< CMSDK_UART STATE: RXOR Position */ +#define CMSDK_UART_STATE_RXOR_Msk (0x1ul << CMSDK_UART_STATE_RXOR_Pos) /*!< CMSDK_UART STATE: RXOR Mask */ + +#define CMSDK_UART_STATE_TXOR_Pos 2 /*!< CMSDK_UART STATE: TXOR Position */ +#define CMSDK_UART_STATE_TXOR_Msk (0x1ul << CMSDK_UART_STATE_TXOR_Pos) /*!< CMSDK_UART STATE: TXOR Mask */ + +#define CMSDK_UART_STATE_RXBF_Pos 1 /*!< CMSDK_UART STATE: RXBF Position */ +#define CMSDK_UART_STATE_RXBF_Msk (0x1ul << CMSDK_UART_STATE_RXBF_Pos) /*!< CMSDK_UART STATE: RXBF Mask */ + +#define CMSDK_UART_STATE_TXBF_Pos 0 /*!< CMSDK_UART STATE: TXBF Position */ +#define CMSDK_UART_STATE_TXBF_Msk (0x1ul << CMSDK_UART_STATE_TXBF_Pos ) /*!< CMSDK_UART STATE: TXBF Mask */ + +#define CMSDK_UART_CTRL_HSTM_Pos 6 /*!< CMSDK_UART CTRL: HSTM Position */ +#define CMSDK_UART_CTRL_HSTM_Msk (0x01ul << CMSDK_UART_CTRL_HSTM_Pos) /*!< CMSDK_UART CTRL: HSTM Mask */ + +#define CMSDK_UART_CTRL_RXORIRQEN_Pos 5 /*!< CMSDK_UART CTRL: RXORIRQEN Position */ +#define CMSDK_UART_CTRL_RXORIRQEN_Msk (0x01ul << 
CMSDK_UART_CTRL_RXORIRQEN_Pos) /*!< CMSDK_UART CTRL: RXORIRQEN Mask */ + +#define CMSDK_UART_CTRL_TXORIRQEN_Pos 4 /*!< CMSDK_UART CTRL: TXORIRQEN Position */ +#define CMSDK_UART_CTRL_TXORIRQEN_Msk (0x01ul << CMSDK_UART_CTRL_TXORIRQEN_Pos) /*!< CMSDK_UART CTRL: TXORIRQEN Mask */ + +#define CMSDK_UART_CTRL_RXIRQEN_Pos 3 /*!< CMSDK_UART CTRL: RXIRQEN Position */ +#define CMSDK_UART_CTRL_RXIRQEN_Msk (0x01ul << CMSDK_UART_CTRL_RXIRQEN_Pos) /*!< CMSDK_UART CTRL: RXIRQEN Mask */ + +#define CMSDK_UART_CTRL_TXIRQEN_Pos 2 /*!< CMSDK_UART CTRL: TXIRQEN Position */ +#define CMSDK_UART_CTRL_TXIRQEN_Msk (0x01ul << CMSDK_UART_CTRL_TXIRQEN_Pos) /*!< CMSDK_UART CTRL: TXIRQEN Mask */ + +#define CMSDK_UART_CTRL_RXEN_Pos 1 /*!< CMSDK_UART CTRL: RXEN Position */ +#define CMSDK_UART_CTRL_RXEN_Msk (0x01ul << CMSDK_UART_CTRL_RXEN_Pos) /*!< CMSDK_UART CTRL: RXEN Mask */ + +#define CMSDK_UART_CTRL_TXEN_Pos 0 /*!< CMSDK_UART CTRL: TXEN Position */ +#define CMSDK_UART_CTRL_TXEN_Msk (0x01ul << CMSDK_UART_CTRL_TXEN_Pos) /*!< CMSDK_UART CTRL: TXEN Mask */ + +#define CMSDK_UART_INTSTATUS_RXORIRQ_Pos 3 /*!< CMSDK_UART CTRL: RXORIRQ Position */ +#define CMSDK_UART_CTRL_RXORIRQ_Msk (0x01ul << CMSDK_UART_INTSTATUS_RXORIRQ_Pos) /*!< CMSDK_UART CTRL: RXORIRQ Mask */ + +#define CMSDK_UART_CTRL_TXORIRQ_Pos 2 /*!< CMSDK_UART CTRL: TXORIRQ Position */ +#define CMSDK_UART_CTRL_TXORIRQ_Msk (0x01ul << CMSDK_UART_CTRL_TXORIRQ_Pos) /*!< CMSDK_UART CTRL: TXORIRQ Mask */ + +#define CMSDK_UART_CTRL_RXIRQ_Pos 1 /*!< CMSDK_UART CTRL: RXIRQ Position */ +#define CMSDK_UART_CTRL_RXIRQ_Msk (0x01ul << CMSDK_UART_CTRL_RXIRQ_Pos) /*!< CMSDK_UART CTRL: RXIRQ Mask */ + +#define CMSDK_UART_CTRL_TXIRQ_Pos 0 /*!< CMSDK_UART CTRL: TXIRQ Position */ +#define CMSDK_UART_CTRL_TXIRQ_Msk (0x01ul << CMSDK_UART_CTRL_TXIRQ_Pos) /*!< CMSDK_UART CTRL: TXIRQ Mask */ + +#define CMSDK_UART_BAUDDIV_Pos 0 /*!< CMSDK_UART BAUDDIV: BAUDDIV Position */ +#define CMSDK_UART_BAUDDIV_Msk (0xFFFFFul << CMSDK_UART_BAUDDIV_Pos) /*!< CMSDK_UART BAUDDIV: 
BAUDDIV Mask */ + +/*@}*/ /* end of group CMSDK_UART */ + + +/*----------------------------- Timer (TIMER) -------------------------------*/ +/** @addtogroup CMSDK_TIMER CMSDK Timer + @{ +*/ +typedef struct { + __IO uint32_t CTRL; /*!< Offset: 0x000 Control Register (R/W) */ + __IO uint32_t VALUE; /*!< Offset: 0x004 Current Value Register (R/W) */ + __IO uint32_t RELOAD; /*!< Offset: 0x008 Reload Value Register (R/W) */ + union { + __I uint32_t INTSTATUS; /*!< Offset: 0x00C Interrupt Status Register (R/ ) */ + __O uint32_t INTCLEAR; /*!< Offset: 0x00C Interrupt Clear Register ( /W) */ + }; + +} CMSDK_TIMER_TypeDef; + +/* CMSDK_TIMER CTRL Register Definitions */ + +#define CMSDK_TIMER_CTRL_IRQEN_Pos 3 /*!< CMSDK_TIMER CTRL: IRQEN Position */ +#define CMSDK_TIMER_CTRL_IRQEN_Msk (0x01ul << CMSDK_TIMER_CTRL_IRQEN_Pos) /*!< CMSDK_TIMER CTRL: IRQEN Mask */ + +#define CMSDK_TIMER_CTRL_SELEXTCLK_Pos 2 /*!< CMSDK_TIMER CTRL: SELEXTCLK Position */ +#define CMSDK_TIMER_CTRL_SELEXTCLK_Msk (0x01ul << CMSDK_TIMER_CTRL_SELEXTCLK_Pos) /*!< CMSDK_TIMER CTRL: SELEXTCLK Mask */ + +#define CMSDK_TIMER_CTRL_SELEXTEN_Pos 1 /*!< CMSDK_TIMER CTRL: SELEXTEN Position */ +#define CMSDK_TIMER_CTRL_SELEXTEN_Msk (0x01ul << CMSDK_TIMER_CTRL_SELEXTEN_Pos) /*!< CMSDK_TIMER CTRL: SELEXTEN Mask */ + +#define CMSDK_TIMER_CTRL_EN_Pos 0 /*!< CMSDK_TIMER CTRL: EN Position */ +#define CMSDK_TIMER_CTRL_EN_Msk (0x01ul << CMSDK_TIMER_CTRL_EN_Pos) /*!< CMSDK_TIMER CTRL: EN Mask */ + +#define CMSDK_TIMER_VAL_CURRENT_Pos 0 /*!< CMSDK_TIMER VALUE: CURRENT Position */ +#define CMSDK_TIMER_VAL_CURRENT_Msk (0xFFFFFFFFul << CMSDK_TIMER_VAL_CURRENT_Pos) /*!< CMSDK_TIMER VALUE: CURRENT Mask */ + +#define CMSDK_TIMER_RELOAD_VAL_Pos 0 /*!< CMSDK_TIMER RELOAD: RELOAD Position */ +#define CMSDK_TIMER_RELOAD_VAL_Msk (0xFFFFFFFFul << CMSDK_TIMER_RELOAD_VAL_Pos) /*!< CMSDK_TIMER RELOAD: RELOAD Mask */ + +#define CMSDK_TIMER_INTSTATUS_Pos 0 /*!< CMSDK_TIMER INTSTATUS: INTSTATUSPosition */ +#define CMSDK_TIMER_INTSTATUS_Msk 
(0x01ul << CMSDK_TIMER_INTSTATUS_Pos) /*!< CMSDK_TIMER INTSTATUS: INTSTATUSMask */ + +#define CMSDK_TIMER_INTCLEAR_Pos 0 /*!< CMSDK_TIMER INTCLEAR: INTCLEAR Position */ +#define CMSDK_TIMER_INTCLEAR_Msk (0x01ul << CMSDK_TIMER_INTCLEAR_Pos) /*!< CMSDK_TIMER INTCLEAR: INTCLEAR Mask */ + +/*@}*/ /* end of group CMSDK_TIMER */ + + +/*------------- Timer (TIM) --------------------------------------------------*/ +// Timer (TIM) + +/** @addtogroup CMSDK_DualTIMER CMSDK Dual Timer + @{ +*/ + +typedef struct { + __IO uint32_t Timer1Load; /* Offset: 0x000 (R/W) Timer 1 Load */ + __I uint32_t Timer1Value; /* Offset: 0x004 (R/ ) Timer 1 Counter Current Value */ + __IO uint32_t Timer1Control; /* Offset: 0x008 (R/W) Timer 1 Control */ + /* TimerEn: Timer Enable */ + /* TimerMode: Timer Mode */ + /* <0=> Freerunning-mode */ + /* <1=> Periodic mode */ + /* IntEnable: Interrupt Enable */ + /* TimerPre: Timer Prescale */ + /* <0=> / 1 */ + /* <1=> / 16 */ + /* <2=> / 256 */ + /* <3=> Undefined! */ + /* TimerSize: Timer Size */ + /* <0=> 16-bit counter */ + /* <1=> 32-bit counter */ + /* OneShot: One-shoot mode */ + /* <0=> Wrapping mode */ + /* <1=> One-shot mode */ + /* */ + __O uint32_t Timer1IntClr; /* Offset: 0x00C ( /W) Timer 1 Interrupt Clear */ + __I uint32_t Timer1RIS; /* Offset: 0x010 (R/ ) Timer 1 Raw Interrupt Status */ + __I uint32_t Timer1MIS; /* Offset: 0x014 (R/ ) Timer 1 Masked Interrupt Status */ + __IO uint32_t Timer1BGLoad; /* Offset: 0x018 (R/W) Background Load Register */ + uint32_t RESERVED0; + __IO uint32_t Timer2Load; /* Offset: 0x020 (R/W) Timer 2 Load */ + __I uint32_t Timer2Value; /* Offset: 0x024 (R/ ) Timer 2 Counter Current Value */ + __IO uint32_t Timer2Control; /* Offset: 0x028 (R/W) Timer 2 Control */ + /* TimerEn: Timer Enable */ + /* TimerMode: Timer Mode */ + /* <0=> Freerunning-mode */ + /* <1=> Periodic mode */ + /* IntEnable: Interrupt Enable */ + /* TimerPre: Timer Prescale */ + /* <0=> / 1 */ + /* <1=> / 16 */ + /* <2=> / 256 */ + /* <3=> 
Undefined! */ + /* TimerSize: Timer Size */ + /* <0=> 16-bit counter */ + /* <1=> 32-bit counter */ + /* OneShot: One-shoot mode */ + /* <0=> Wrapping mode */ + /* <1=> One-shot mode */ + /* */ + __O uint32_t Timer2IntClr; /* Offset: 0x02C ( /W) Timer 2 Interrupt Clear */ + __I uint32_t Timer2RIS; /* Offset: 0x030 (R/ ) Timer 2 Raw Interrupt Status */ + __I uint32_t Timer2MIS; /* Offset: 0x034 (R/ ) Timer 2 Masked Interrupt Status */ + __IO uint32_t Timer2BGLoad; /* Offset: 0x038 (R/W) Background Load Register */ + uint32_t RESERVED1[945]; + __IO uint32_t ITCR; /* Offset: 0xF00 (R/W) Integration Test Control Register */ + __O uint32_t ITOP; /* Offset: 0xF04 ( /W) Integration Test Output Set Register */ +} CMSDK_DUALTIMER_BOTH_TypeDef; + +#define CMSDK_DUALTIMER1_LOAD_Pos 0 /*!< CMSDK_DUALTIMER1 LOAD: LOAD Position */ +#define CMSDK_DUALTIMER1_LOAD_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER1_LOAD_Pos) /*!< CMSDK_DUALTIMER1 LOAD: LOAD Mask */ + +#define CMSDK_DUALTIMER1_VALUE_Pos 0 /*!< CMSDK_DUALTIMER1 VALUE: VALUE Position */ +#define CMSDK_DUALTIMER1_VALUE_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER1_VALUE_Pos) /*!< CMSDK_DUALTIMER1 VALUE: VALUE Mask */ + +#define CMSDK_DUALTIMER1_CTRL_EN_Pos 7 /*!< CMSDK_DUALTIMER1 CTRL_EN: CTRL Enable Position */ +#define CMSDK_DUALTIMER1_CTRL_EN_Msk (0x1ul << CMSDK_DUALTIMER1_CTRL_EN_Pos) /*!< CMSDK_DUALTIMER1 CTRL_EN: CTRL Enable Mask */ + +#define CMSDK_DUALTIMER1_CTRL_MODE_Pos 6 /*!< CMSDK_DUALTIMER1 CTRL_MODE: CTRL MODE Position */ +#define CMSDK_DUALTIMER1_CTRL_MODE_Msk (0x1ul << CMSDK_DUALTIMER1_CTRL_MODE_Pos) /*!< CMSDK_DUALTIMER1 CTRL_MODE: CTRL MODE Mask */ + +#define CMSDK_DUALTIMER1_CTRL_INTEN_Pos 5 /*!< CMSDK_DUALTIMER1 CTRL_INTEN: CTRL Int Enable Position */ +#define CMSDK_DUALTIMER1_CTRL_INTEN_Msk (0x1ul << CMSDK_DUALTIMER1_CTRL_INTEN_Pos) /*!< CMSDK_DUALTIMER1 CTRL_INTEN: CTRL Int Enable Mask */ + +#define CMSDK_DUALTIMER1_CTRL_PRESCALE_Pos 2 /*!< CMSDK_DUALTIMER1 CTRL_PRESCALE: CTRL PRESCALE Position */ +#define 
CMSDK_DUALTIMER1_CTRL_PRESCALE_Msk (0x3ul << CMSDK_DUALTIMER1_CTRL_PRESCALE_Pos) /*!< CMSDK_DUALTIMER1 CTRL_PRESCALE: CTRL PRESCALE Mask */ + +#define CMSDK_DUALTIMER1_CTRL_SIZE_Pos 1 /*!< CMSDK_DUALTIMER1 CTRL_SIZE: CTRL SIZE Position */ +#define CMSDK_DUALTIMER1_CTRL_SIZE_Msk (0x1ul << CMSDK_DUALTIMER1_CTRL_SIZE_Pos) /*!< CMSDK_DUALTIMER1 CTRL_SIZE: CTRL SIZE Mask */ + +#define CMSDK_DUALTIMER1_CTRL_ONESHOOT_Pos 0 /*!< CMSDK_DUALTIMER1 CTRL_ONESHOOT: CTRL ONESHOOT Position */ +#define CMSDK_DUALTIMER1_CTRL_ONESHOOT_Msk (0x1ul << CMSDK_DUALTIMER1_CTRL_ONESHOOT_Pos) /*!< CMSDK_DUALTIMER1 CTRL_ONESHOOT: CTRL ONESHOOT Mask */ + +#define CMSDK_DUALTIMER1_INTCLR_Pos 0 /*!< CMSDK_DUALTIMER1 INTCLR: INT Clear Position */ +#define CMSDK_DUALTIMER1_INTCLR_Msk (0x1ul << CMSDK_DUALTIMER1_INTCLR_Pos) /*!< CMSDK_DUALTIMER1 INTCLR: INT Clear Mask */ + +#define CMSDK_DUALTIMER1_RAWINTSTAT_Pos 0 /*!< CMSDK_DUALTIMER1 RAWINTSTAT: Raw Int Status Position */ +#define CMSDK_DUALTIMER1_RAWINTSTAT_Msk (0x1ul << CMSDK_DUALTIMER1_RAWINTSTAT_Pos) /*!< CMSDK_DUALTIMER1 RAWINTSTAT: Raw Int Status Mask */ + +#define CMSDK_DUALTIMER1_MASKINTSTAT_Pos 0 /*!< CMSDK_DUALTIMER1 MASKINTSTAT: Mask Int Status Position */ +#define CMSDK_DUALTIMER1_MASKINTSTAT_Msk (0x1ul << CMSDK_DUALTIMER1_MASKINTSTAT_Pos) /*!< CMSDK_DUALTIMER1 MASKINTSTAT: Mask Int Status Mask */ + +#define CMSDK_DUALTIMER1_BGLOAD_Pos 0 /*!< CMSDK_DUALTIMER1 BGLOAD: Background Load Position */ +#define CMSDK_DUALTIMER1_BGLOAD_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER1_BGLOAD_Pos) /*!< CMSDK_DUALTIMER1 BGLOAD: Background Load Mask */ + +#define CMSDK_DUALTIMER2_LOAD_Pos 0 /*!< CMSDK_DUALTIMER2 LOAD: LOAD Position */ +#define CMSDK_DUALTIMER2_LOAD_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER2_LOAD_Pos) /*!< CMSDK_DUALTIMER2 LOAD: LOAD Mask */ + +#define CMSDK_DUALTIMER2_VALUE_Pos 0 /*!< CMSDK_DUALTIMER2 VALUE: VALUE Position */ +#define CMSDK_DUALTIMER2_VALUE_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER2_VALUE_Pos) /*!< CMSDK_DUALTIMER2 VALUE: VALUE Mask 
*/ + +#define CMSDK_DUALTIMER2_CTRL_EN_Pos 7 /*!< CMSDK_DUALTIMER2 CTRL_EN: CTRL Enable Position */ +#define CMSDK_DUALTIMER2_CTRL_EN_Msk (0x1ul << CMSDK_DUALTIMER2_CTRL_EN_Pos) /*!< CMSDK_DUALTIMER2 CTRL_EN: CTRL Enable Mask */ + +#define CMSDK_DUALTIMER2_CTRL_MODE_Pos 6 /*!< CMSDK_DUALTIMER2 CTRL_MODE: CTRL MODE Position */ +#define CMSDK_DUALTIMER2_CTRL_MODE_Msk (0x1ul << CMSDK_DUALTIMER2_CTRL_MODE_Pos) /*!< CMSDK_DUALTIMER2 CTRL_MODE: CTRL MODE Mask */ + +#define CMSDK_DUALTIMER2_CTRL_INTEN_Pos 5 /*!< CMSDK_DUALTIMER2 CTRL_INTEN: CTRL Int Enable Position */ +#define CMSDK_DUALTIMER2_CTRL_INTEN_Msk (0x1ul << CMSDK_DUALTIMER2_CTRL_INTEN_Pos) /*!< CMSDK_DUALTIMER2 CTRL_INTEN: CTRL Int Enable Mask */ + +#define CMSDK_DUALTIMER2_CTRL_PRESCALE_Pos 2 /*!< CMSDK_DUALTIMER2 CTRL_PRESCALE: CTRL PRESCALE Position */ +#define CMSDK_DUALTIMER2_CTRL_PRESCALE_Msk (0x3ul << CMSDK_DUALTIMER2_CTRL_PRESCALE_Pos) /*!< CMSDK_DUALTIMER2 CTRL_PRESCALE: CTRL PRESCALE Mask */ + +#define CMSDK_DUALTIMER2_CTRL_SIZE_Pos 1 /*!< CMSDK_DUALTIMER2 CTRL_SIZE: CTRL SIZE Position */ +#define CMSDK_DUALTIMER2_CTRL_SIZE_Msk (0x1ul << CMSDK_DUALTIMER2_CTRL_SIZE_Pos) /*!< CMSDK_DUALTIMER2 CTRL_SIZE: CTRL SIZE Mask */ + +#define CMSDK_DUALTIMER2_CTRL_ONESHOOT_Pos 0 /*!< CMSDK_DUALTIMER2 CTRL_ONESHOOT: CTRL ONESHOOT Position */ +#define CMSDK_DUALTIMER2_CTRL_ONESHOOT_Msk (0x1ul << CMSDK_DUALTIMER2_CTRL_ONESHOOT_Pos) /*!< CMSDK_DUALTIMER2 CTRL_ONESHOOT: CTRL ONESHOOT Mask */ + +#define CMSDK_DUALTIMER2_INTCLR_Pos 0 /*!< CMSDK_DUALTIMER2 INTCLR: INT Clear Position */ +#define CMSDK_DUALTIMER2_INTCLR_Msk (0x1ul << CMSDK_DUALTIMER2_INTCLR_Pos) /*!< CMSDK_DUALTIMER2 INTCLR: INT Clear Mask */ + +#define CMSDK_DUALTIMER2_RAWINTSTAT_Pos 0 /*!< CMSDK_DUALTIMER2 RAWINTSTAT: Raw Int Status Position */ +#define CMSDK_DUALTIMER2_RAWINTSTAT_Msk (0x1ul << CMSDK_DUALTIMER2_RAWINTSTAT_Pos) /*!< CMSDK_DUALTIMER2 RAWINTSTAT: Raw Int Status Mask */ + +#define CMSDK_DUALTIMER2_MASKINTSTAT_Pos 0 /*!< CMSDK_DUALTIMER2 
MASKINTSTAT: Mask Int Status Position */ +#define CMSDK_DUALTIMER2_MASKINTSTAT_Msk (0x1ul << CMSDK_DUALTIMER2_MASKINTSTAT_Pos) /*!< CMSDK_DUALTIMER2 MASKINTSTAT: Mask Int Status Mask */ + +#define CMSDK_DUALTIMER2_BGLOAD_Pos 0 /*!< CMSDK_DUALTIMER2 BGLOAD: Background Load Position */ +#define CMSDK_DUALTIMER2_BGLOAD_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER2_BGLOAD_Pos) /*!< CMSDK_DUALTIMER2 BGLOAD: Background Load Mask */ + +typedef struct { + __IO uint32_t TimerLoad; /* Offset: 0x000 (R/W) Timer Load */ + __I uint32_t TimerValue; /* Offset: 0x004 (R/ ) Timer Counter Current Value */ + __IO uint32_t TimerControl; /* Offset: 0x008 (R/W) Timer Control */ + /* TimerEn: Timer Enable */ + /* TimerMode: Timer Mode */ + /* <0=> Freerunning-mode */ + /* <1=> Periodic mode */ + /* IntEnable: Interrupt Enable */ + /* TimerPre: Timer Prescale */ + /* <0=> / 1 */ + /* <1=> / 16 */ + /* <2=> / 256 */ + /* <3=> Undefined! */ + /* TimerSize: Timer Size */ + /* <0=> 16-bit counter */ + /* <1=> 32-bit counter */ + /* OneShot: One-shot mode */ + /* <0=> Wrapping mode */ + /* <1=> One-shot mode */ + /* */ + __O uint32_t TimerIntClr; /* Offset: 0x00C ( /W) Timer Interrupt Clear */ + __I uint32_t TimerRIS; /* Offset: 0x010 (R/ ) Timer Raw Interrupt Status */ + __I uint32_t TimerMIS; /* Offset: 0x014 (R/ ) Timer Masked Interrupt Status */ + __IO uint32_t TimerBGLoad; /* Offset: 0x018 (R/W) Background Load Register */ +} CMSDK_DUALTIMER_SINGLE_TypeDef; + +#define CMSDK_DUALTIMER_LOAD_Pos 0 /*!< CMSDK_DUALTIMER LOAD: LOAD Position */ +#define CMSDK_DUALTIMER_LOAD_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER_LOAD_Pos) /*!< CMSDK_DUALTIMER LOAD: LOAD Mask */ + +#define CMSDK_DUALTIMER_VALUE_Pos 0 /*!< CMSDK_DUALTIMER VALUE: VALUE Position */ +#define CMSDK_DUALTIMER_VALUE_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER_VALUE_Pos) /*!< CMSDK_DUALTIMER VALUE: VALUE Mask */ + +#define CMSDK_DUALTIMER_CTRL_EN_Pos 7 /*!< CMSDK_DUALTIMER CTRL_EN: CTRL Enable Position */ +#define CMSDK_DUALTIMER_CTRL_EN_Msk (0x1ul <<
CMSDK_DUALTIMER_CTRL_EN_Pos) /*!< CMSDK_DUALTIMER CTRL_EN: CTRL Enable Mask */ + +#define CMSDK_DUALTIMER_CTRL_MODE_Pos 6 /*!< CMSDK_DUALTIMER CTRL_MODE: CTRL MODE Position */ +#define CMSDK_DUALTIMER_CTRL_MODE_Msk (0x1ul << CMSDK_DUALTIMER_CTRL_MODE_Pos) /*!< CMSDK_DUALTIMER CTRL_MODE: CTRL MODE Mask */ + +#define CMSDK_DUALTIMER_CTRL_INTEN_Pos 5 /*!< CMSDK_DUALTIMER CTRL_INTEN: CTRL Int Enable Position */ +#define CMSDK_DUALTIMER_CTRL_INTEN_Msk (0x1ul << CMSDK_DUALTIMER_CTRL_INTEN_Pos) /*!< CMSDK_DUALTIMER CTRL_INTEN: CTRL Int Enable Mask */ + +#define CMSDK_DUALTIMER_CTRL_PRESCALE_Pos 2 /*!< CMSDK_DUALTIMER CTRL_PRESCALE: CTRL PRESCALE Position */ +#define CMSDK_DUALTIMER_CTRL_PRESCALE_Msk (0x3ul << CMSDK_DUALTIMER_CTRL_PRESCALE_Pos) /*!< CMSDK_DUALTIMER CTRL_PRESCALE: CTRL PRESCALE Mask */ + +#define CMSDK_DUALTIMER_CTRL_SIZE_Pos 1 /*!< CMSDK_DUALTIMER CTRL_SIZE: CTRL SIZE Position */ +#define CMSDK_DUALTIMER_CTRL_SIZE_Msk (0x1ul << CMSDK_DUALTIMER_CTRL_SIZE_Pos) /*!< CMSDK_DUALTIMER CTRL_SIZE: CTRL SIZE Mask */ + +#define CMSDK_DUALTIMER_CTRL_ONESHOOT_Pos 0 /*!< CMSDK_DUALTIMER CTRL_ONESHOOT: CTRL ONESHOOT Position */ +#define CMSDK_DUALTIMER_CTRL_ONESHOOT_Msk (0x1ul << CMSDK_DUALTIMER_CTRL_ONESHOOT_Pos) /*!< CMSDK_DUALTIMER CTRL_ONESHOOT: CTRL ONESHOOT Mask */ + +#define CMSDK_DUALTIMER_INTCLR_Pos 0 /*!< CMSDK_DUALTIMER INTCLR: INT Clear Position */ +#define CMSDK_DUALTIMER_INTCLR_Msk (0x1ul << CMSDK_DUALTIMER_INTCLR_Pos) /*!< CMSDK_DUALTIMER INTCLR: INT Clear Mask */ + +#define CMSDK_DUALTIMER_RAWINTSTAT_Pos 0 /*!< CMSDK_DUALTIMER RAWINTSTAT: Raw Int Status Position */ +#define CMSDK_DUALTIMER_RAWINTSTAT_Msk (0x1ul << CMSDK_DUALTIMER_RAWINTSTAT_Pos) /*!< CMSDK_DUALTIMER RAWINTSTAT: Raw Int Status Mask */ + +#define CMSDK_DUALTIMER_MASKINTSTAT_Pos 0 /*!< CMSDK_DUALTIMER MASKINTSTAT: Mask Int Status Position */ +#define CMSDK_DUALTIMER_MASKINTSTAT_Msk (0x1ul << CMSDK_DUALTIMER_MASKINTSTAT_Pos) /*!< CMSDK_DUALTIMER MASKINTSTAT: Mask Int Status Mask */ + 
+#define CMSDK_DUALTIMER_BGLOAD_Pos 0 /*!< CMSDK_DUALTIMER BGLOAD: Background Load Position */ +#define CMSDK_DUALTIMER_BGLOAD_Msk (0xFFFFFFFFul << CMSDK_DUALTIMER_BGLOAD_Pos) /*!< CMSDK_DUALTIMER BGLOAD: Background Load Mask */ + +/*@}*/ /* end of group CMSDK_DualTIMER */ + + +/*-------------------- General Purpose Input Output (GPIO) -------------------*/ +/** @addtogroup CMSDK_GPIO CMSDK GPIO + @{ +*/ +typedef struct { + __IO uint32_t DATA; /* Offset: 0x000 (R/W) DATA Register */ + __IO uint32_t DATAOUT; /* Offset: 0x004 (R/W) Data Output Latch Register */ + uint32_t RESERVED0[2]; + __IO uint32_t OUTENABLESET; /* Offset: 0x010 (R/W) Output Enable Set Register */ + __IO uint32_t OUTENABLECLR; /* Offset: 0x014 (R/W) Output Enable Clear Register */ + __IO uint32_t ALTFUNCSET; /* Offset: 0x018 (R/W) Alternate Function Set Register */ + __IO uint32_t ALTFUNCCLR; /* Offset: 0x01C (R/W) Alternate Function Clear Register */ + __IO uint32_t INTENSET; /* Offset: 0x020 (R/W) Interrupt Enable Set Register */ + __IO uint32_t INTENCLR; /* Offset: 0x024 (R/W) Interrupt Enable Clear Register */ + __IO uint32_t INTTYPESET; /* Offset: 0x028 (R/W) Interrupt Type Set Register */ + __IO uint32_t INTTYPECLR; /* Offset: 0x02C (R/W) Interrupt Type Clear Register */ + __IO uint32_t INTPOLSET; /* Offset: 0x030 (R/W) Interrupt Polarity Set Register */ + __IO uint32_t INTPOLCLR; /* Offset: 0x034 (R/W) Interrupt Polarity Clear Register */ + union { + __I uint32_t INTSTATUS; /* Offset: 0x038 (R/ ) Interrupt Status Register */ + __O uint32_t INTCLEAR; /* Offset: 0x038 ( /W) Interrupt Clear Register */ + }; + uint32_t RESERVED1[241]; + __IO uint32_t LB_MASKED[256]; /* Offset: 0x400 - 0x7FC Lower byte Masked Access Register (R/W) */ + __IO uint32_t UB_MASKED[256]; /* Offset: 0x800 - 0xBFC Upper byte Masked Access Register (R/W) */ +} CMSDK_GPIO_TypeDef; + +#define CMSDK_GPIO_DATA_Pos 0 /*!< CMSDK_GPIO DATA: DATA Position */ +#define CMSDK_GPIO_DATA_Msk (0xFFFFul << CMSDK_GPIO_DATA_Pos) /*!< 
CMSDK_GPIO DATA: DATA Mask */ + +#define CMSDK_GPIO_DATAOUT_Pos 0 /*!< CMSDK_GPIO DATAOUT: DATAOUT Position */ +#define CMSDK_GPIO_DATAOUT_Msk (0xFFFFul << CMSDK_GPIO_DATAOUT_Pos) /*!< CMSDK_GPIO DATAOUT: DATAOUT Mask */ + +#define CMSDK_GPIO_OUTENSET_Pos 0 /*!< CMSDK_GPIO OUTEN: OUTEN Position */ +#define CMSDK_GPIO_OUTENSET_Msk (0xFFFFul << CMSDK_GPIO_OUTENSET_Pos) /*!< CMSDK_GPIO OUTEN: OUTEN Mask */ + +#define CMSDK_GPIO_OUTENCLR_Pos 0 /*!< CMSDK_GPIO OUTEN: OUTEN Position */ +#define CMSDK_GPIO_OUTENCLR_Msk (0xFFFFul << CMSDK_GPIO_OUTENCLR_Pos) /*!< CMSDK_GPIO OUTEN: OUTEN Mask */ + +#define CMSDK_GPIO_ALTFUNCSET_Pos 0 /*!< CMSDK_GPIO ALTFUNC: ALTFUNC Position */ +#define CMSDK_GPIO_ALTFUNCSET_Msk (0xFFFFul << CMSDK_GPIO_ALTFUNCSET_Pos) /*!< CMSDK_GPIO ALTFUNC: ALTFUNC Mask */ + +#define CMSDK_GPIO_ALTFUNCCLR_Pos 0 /*!< CMSDK_GPIO ALTFUNC: ALTFUNC Position */ +#define CMSDK_GPIO_ALTFUNCCLR_Msk (0xFFFFul << CMSDK_GPIO_ALTFUNCCLR_Pos) /*!< CMSDK_GPIO ALTFUNC: ALTFUNC Mask */ + +#define CMSDK_GPIO_INTENSET_Pos 0 /*!< CMSDK_GPIO INTEN: INTEN Position */ +#define CMSDK_GPIO_INTENSET_Msk (0xFFFFul << CMSDK_GPIO_INTENSET_Pos) /*!< CMSDK_GPIO INTEN: INTEN Mask */ + +#define CMSDK_GPIO_INTENCLR_Pos 0 /*!< CMSDK_GPIO INTEN: INTEN Position */ +#define CMSDK_GPIO_INTENCLR_Msk (0xFFFFul << CMSDK_GPIO_INTENCLR_Pos) /*!< CMSDK_GPIO INTEN: INTEN Mask */ + +#define CMSDK_GPIO_INTTYPESET_Pos 0 /*!< CMSDK_GPIO INTTYPE: INTTYPE Position */ +#define CMSDK_GPIO_INTTYPESET_Msk (0xFFFFul << CMSDK_GPIO_INTTYPESET_Pos) /*!< CMSDK_GPIO INTTYPE: INTTYPE Mask */ + +#define CMSDK_GPIO_INTTYPECLR_Pos 0 /*!< CMSDK_GPIO INTTYPE: INTTYPE Position */ +#define CMSDK_GPIO_INTTYPECLR_Msk (0xFFFFul << CMSDK_GPIO_INTTYPECLR_Pos) /*!< CMSDK_GPIO INTTYPE: INTTYPE Mask */ + +#define CMSDK_GPIO_INTPOLSET_Pos 0 /*!< CMSDK_GPIO INTPOL: INTPOL Position */ +#define CMSDK_GPIO_INTPOLSET_Msk (0xFFFFul << CMSDK_GPIO_INTPOLSET_Pos) /*!< CMSDK_GPIO INTPOL: INTPOL Mask */ + +#define CMSDK_GPIO_INTPOLCLR_Pos 0 /*!< CMSDK_GPIO INTPOL: INTPOL 
Position */ +#define CMSDK_GPIO_INTPOLCLR_Msk (0xFFFFul << CMSDK_GPIO_INTPOLCLR_Pos) /*!< CMSDK_GPIO INTPOL: INTPOL Mask */ + +#define CMSDK_GPIO_INTSTATUS_Pos 0 /*!< CMSDK_GPIO INTSTATUS: INTSTATUS Position */ +#define CMSDK_GPIO_INTSTATUS_Msk (0xFFul << CMSDK_GPIO_INTSTATUS_Pos) /*!< CMSDK_GPIO INTSTATUS: INTSTATUS Mask */ + +#define CMSDK_GPIO_INTCLEAR_Pos 0 /*!< CMSDK_GPIO INTCLEAR: INTCLEAR Position */ +#define CMSDK_GPIO_INTCLEAR_Msk (0xFFul << CMSDK_GPIO_INTCLEAR_Pos) /*!< CMSDK_GPIO INTCLEAR: INTCLEAR Mask */ + +#define CMSDK_GPIO_MASKLOWBYTE_Pos 0 /*!< CMSDK_GPIO MASKLOWBYTE: MASKLOWBYTE Position */ +#define CMSDK_GPIO_MASKLOWBYTE_Msk (0x00FFul << CMSDK_GPIO_MASKLOWBYTE_Pos) /*!< CMSDK_GPIO MASKLOWBYTE: MASKLOWBYTE Mask */ + +#define CMSDK_GPIO_MASKHIGHBYTE_Pos 0 /*!< CMSDK_GPIO MASKHIGHBYTE: MASKHIGHBYTE Position */ +#define CMSDK_GPIO_MASKHIGHBYTE_Msk (0xFF00ul << CMSDK_GPIO_MASKHIGHBYTE_Pos) /*!< CMSDK_GPIO MASKHIGHBYTE: MASKHIGHBYTE Mask */ + +/*@}*/ /* end of group CMSDK_GPIO */ + + +/*------------- System Control (SYSCON) --------------------------------------*/ +/** @addtogroup CMSDK_SYSCON CMSDK System Control + @{ +*/ +typedef struct { + __IO uint32_t REMAP; /* Offset: 0x000 (R/W) Remap Control Register */ + __IO uint32_t PMUCTRL; /* Offset: 0x004 (R/W) PMU Control Register */ + __IO uint32_t RESETOP; /* Offset: 0x008 (R/W) Reset Option Register */ + __IO uint32_t EMICTRL; /* Offset: 0x00C (R/W) EMI Control Register */ + __IO uint32_t RSTINFO; /* Offset: 0x010 (R/W) Reset Information Register */ +} CMSDK_SYSCON_TypeDef; + +#define CMSDK_SYSCON_REMAP_Pos 0 +#define CMSDK_SYSCON_REMAP_Msk (0x01ul << CMSDK_SYSCON_REMAP_Pos) /*!< CMSDK_SYSCON MEME_CTRL: REMAP Mask */ + +#define CMSDK_SYSCON_PMUCTRL_EN_Pos 0 +#define CMSDK_SYSCON_PMUCTRL_EN_Msk (0x01ul << CMSDK_SYSCON_PMUCTRL_EN_Pos) /*!< CMSDK_SYSCON PMUCTRL: PMUCTRL ENABLE Mask */ + +#define CMSDK_SYSCON_LOCKUPRST_RESETOP_Pos 0 +#define CMSDK_SYSCON_LOCKUPRST_RESETOP_Msk (0x01ul << 
CMSDK_SYSCON_LOCKUPRST_RESETOP_Pos) /*!< CMSDK_SYSCON SYS_CTRL: LOCKUP RESET ENABLE Mask */ + +#define CMSDK_SYSCON_EMICTRL_SIZE_Pos 24 +#define CMSDK_SYSCON_EMICTRL_SIZE_Msk (0x00001ul << CMSDK_SYSCON_EMICTRL_SIZE_Pos) /*!< CMSDK_SYSCON EMICTRL: SIZE Mask */ + +#define CMSDK_SYSCON_EMICTRL_TACYC_Pos 16 +#define CMSDK_SYSCON_EMICTRL_TACYC_Msk (0x00007ul << CMSDK_SYSCON_EMICTRL_TACYC_Pos) /*!< CMSDK_SYSCON EMICTRL: TURNAROUNDCYCLE Mask */ + +#define CMSDK_SYSCON_EMICTRL_WCYC_Pos 8 +#define CMSDK_SYSCON_EMICTRL_WCYC_Msk (0x00003ul << CMSDK_SYSCON_EMICTRL_WCYC_Pos) /*!< CMSDK_SYSCON EMICTRL: WRITECYCLE Mask */ + +#define CMSDK_SYSCON_EMICTRL_RCYC_Pos 0 +#define CMSDK_SYSCON_EMICTRL_RCYC_Msk (0x00007ul << CMSDK_SYSCON_EMICTRL_RCYC_Pos) /*!< CMSDK_SYSCON EMICTRL: READCYCLE Mask */ + +#define CMSDK_SYSCON_RSTINFO_SYSRESETREQ_Pos 0 +#define CMSDK_SYSCON_RSTINFO_SYSRESETREQ_Msk (0x00001ul << CMSDK_SYSCON_RSTINFO_SYSRESETREQ_Pos) /*!< CMSDK_SYSCON RSTINFO: SYSRESETREQ Mask */ + +#define CMSDK_SYSCON_RSTINFO_WDOGRESETREQ_Pos 1 +#define CMSDK_SYSCON_RSTINFO_WDOGRESETREQ_Msk (0x00001ul << CMSDK_SYSCON_RSTINFO_WDOGRESETREQ_Pos) /*!< CMSDK_SYSCON RSTINFO: WDOGRESETREQ Mask */ + +#define CMSDK_SYSCON_RSTINFO_LOCKUPRESET_Pos 2 +#define CMSDK_SYSCON_RSTINFO_LOCKUPRESET_Msk (0x00001ul << CMSDK_SYSCON_RSTINFO_LOCKUPRESET_Pos) /*!< CMSDK_SYSCON RSTINFO: LOCKUPRESET Mask */ + +/*@}*/ /* end of group CMSDK_SYSCON */ + +/*------------- PL230 uDMA (PL230) --------------------------------------*/ +/** @addtogroup CMSDK_PL230 CMSDK uDMA controller + @{ +*/ +typedef struct { + __I uint32_t DMA_STATUS; /* Offset: 0x000 (R/W) DMA status Register */ + __O uint32_t DMA_CFG; /* Offset: 0x004 ( /W) DMA configuration Register */ + __IO uint32_t CTRL_BASE_PTR; /* Offset: 0x008 (R/W) Channel Control Data Base Pointer Register */ + __I uint32_t ALT_CTRL_BASE_PTR; /* Offset: 0x00C (R/ ) Channel Alternate Control Data Base Pointer Register */ + __I uint32_t DMA_WAITONREQ_STATUS; /* Offset: 0x010 (R/ ) 
Channel Wait On Request Status Register */ + __O uint32_t CHNL_SW_REQUEST; /* Offset: 0x014 ( /W) Channel Software Request Register */ + __IO uint32_t CHNL_USEBURST_SET; /* Offset: 0x018 (R/W) Channel UseBurst Set Register */ + __O uint32_t CHNL_USEBURST_CLR; /* Offset: 0x01C ( /W) Channel UseBurst Clear Register */ + __IO uint32_t CHNL_REQ_MASK_SET; /* Offset: 0x020 (R/W) Channel Request Mask Set Register */ + __O uint32_t CHNL_REQ_MASK_CLR; /* Offset: 0x024 ( /W) Channel Request Mask Clear Register */ + __IO uint32_t CHNL_ENABLE_SET; /* Offset: 0x028 (R/W) Channel Enable Set Register */ + __O uint32_t CHNL_ENABLE_CLR; /* Offset: 0x02C ( /W) Channel Enable Clear Register */ + __IO uint32_t CHNL_PRI_ALT_SET; /* Offset: 0x030 (R/W) Channel Primary-Alternate Set Register */ + __O uint32_t CHNL_PRI_ALT_CLR; /* Offset: 0x034 ( /W) Channel Primary-Alternate Clear Register */ + __IO uint32_t CHNL_PRIORITY_SET; /* Offset: 0x038 (R/W) Channel Priority Set Register */ + __O uint32_t CHNL_PRIORITY_CLR; /* Offset: 0x03C ( /W) Channel Priority Clear Register */ + uint32_t RESERVED0[3]; + __IO uint32_t ERR_CLR; /* Offset: 0x04C (R/W) Bus Error Clear Register */ + +} CMSDK_PL230_TypeDef; + +#define PL230_DMA_CHNL_BITS 0 + +#define CMSDK_PL230_DMA_STATUS_MSTREN_Pos 0 /*!< CMSDK_PL230 DMA STATUS: MSTREN Position */ +#define CMSDK_PL230_DMA_STATUS_MSTREN_Msk (0x00000001ul << CMSDK_PL230_DMA_STATUS_MSTREN_Pos) /*!< CMSDK_PL230 DMA STATUS: MSTREN Mask */ + +#define CMSDK_PL230_DMA_STATUS_STATE_Pos 4 /*!< CMSDK_PL230 DMA STATUS: STATE Position */ +#define CMSDK_PL230_DMA_STATUS_STATE_Msk (0x0000000Ful << CMSDK_PL230_DMA_STATUS_STATE_Pos) /*!< CMSDK_PL230 DMA STATUS: STATE Mask */ + +#define CMSDK_PL230_DMA_STATUS_CHNLS_MINUS1_Pos 16 /*!< CMSDK_PL230 DMA STATUS: CHNLS_MINUS1 Position */ +#define CMSDK_PL230_DMA_STATUS_CHNLS_MINUS1_Msk (0x0000001Ful << CMSDK_PL230_DMA_STATUS_CHNLS_MINUS1_Pos) /*!< CMSDK_PL230 DMA STATUS: CHNLS_MINUS1 Mask */ + +#define 
CMSDK_PL230_DMA_STATUS_TEST_STATUS_Pos 28 /*!< CMSDK_PL230 DMA STATUS: TEST_STATUS Position */ +#define CMSDK_PL230_DMA_STATUS_TEST_STATUS_Msk (0x0000000Ful << CMSDK_PL230_DMA_STATUS_TEST_STATUS_Pos) /*!< CMSDK_PL230 DMA STATUS: TEST_STATUS Mask */ + +#define CMSDK_PL230_DMA_CFG_MSTREN_Pos 0 /*!< CMSDK_PL230 DMA CFG: MSTREN Position */ +#define CMSDK_PL230_DMA_CFG_MSTREN_Msk (0x00000001ul << CMSDK_PL230_DMA_CFG_MSTREN_Pos) /*!< CMSDK_PL230 DMA CFG: MSTREN Mask */ + +#define CMSDK_PL230_DMA_CFG_CPCCACHE_Pos 7 /*!< CMSDK_PL230 DMA CFG: CPCCACHE Position */ +#define CMSDK_PL230_DMA_CFG_CPCCACHE_Msk (0x00000001ul << CMSDK_PL230_DMA_CFG_CPCCACHE_Pos) /*!< CMSDK_PL230 DMA CFG: CPCCACHE Mask */ + +#define CMSDK_PL230_DMA_CFG_CPCBUF_Pos 6 /*!< CMSDK_PL230 DMA CFG: CPCBUF Position */ +#define CMSDK_PL230_DMA_CFG_CPCBUF_Msk (0x00000001ul << CMSDK_PL230_DMA_CFG_CPCBUF_Pos) /*!< CMSDK_PL230 DMA CFG: CPCBUF Mask */ + +#define CMSDK_PL230_DMA_CFG_CPCPRIV_Pos 5 /*!< CMSDK_PL230 DMA CFG: CPCPRIV Position */ +#define CMSDK_PL230_DMA_CFG_CPCPRIV_Msk (0x00000001ul << CMSDK_PL230_DMA_CFG_CPCPRIV_Pos) /*!< CMSDK_PL230 DMA CFG: CPCPRIV Mask */ + +#define CMSDK_PL230_CTRL_BASE_PTR_Pos (PL230_DMA_CHNL_BITS + 5) /*!< CMSDK_PL230 STATUS: BASE_PTR Position */ +#define CMSDK_PL230_CTRL_BASE_PTR_Msk (0x0FFFFFFFul << CMSDK_PL230_CTRL_BASE_PTR_Pos) /*!< CMSDK_PL230 STATUS: BASE_PTR Mask */ + +#define CMSDK_PL230_ALT_CTRL_BASE_PTR_Pos 0 /*!< CMSDK_PL230 ALT_CTRL_BASE_PTR: ALT_CTRL_BASE_PTR Position */ +#define CMSDK_PL230_ALT_CTRL_BASE_PTR_Msk (0xFFFFFFFFul << CMSDK_PL230_ALT_CTRL_BASE_PTR_Pos) /*!< CMSDK_PL230 ALT_CTRL_BASE_PTR: ALT_CTRL_BASE_PTR Mask */ + +#define CMSDK_PL230_DMA_WAITONREQ_STATUS_Pos 0 /*!< CMSDK_PL230 DMA_WAITONREQ_STATUS: DMA_WAITONREQ_STATUS Position */ +#define CMSDK_PL230_DMA_WAITONREQ_STATUS_Msk (0xFFFFFFFFul << CMSDK_PL230_DMA_WAITONREQ_STATUS_Pos) /*!< CMSDK_PL230 DMA_WAITONREQ_STATUS: DMA_WAITONREQ_STATUS Mask */ + +#define CMSDK_PL230_CHNL_SW_REQUEST_Pos 0 /*!< CMSDK_PL230 CHNL_SW_REQUEST: CHNL_SW_REQUEST Position 
*/ +#define CMSDK_PL230_CHNL_SW_REQUEST_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_SW_REQUEST_Pos) /*!< CMSDK_PL230 CHNL_SW_REQUEST: CHNL_SW_REQUEST Mask */ + +#define CMSDK_PL230_CHNL_USEBURST_SET_Pos 0 /*!< CMSDK_PL230 CHNL_USEBURST: SET Position */ +#define CMSDK_PL230_CHNL_USEBURST_SET_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_USEBURST_SET_Pos) /*!< CMSDK_PL230 CHNL_USEBURST: SET Mask */ + +#define CMSDK_PL230_CHNL_USEBURST_CLR_Pos 0 /*!< CMSDK_PL230 CHNL_USEBURST: CLR Position */ +#define CMSDK_PL230_CHNL_USEBURST_CLR_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_USEBURST_CLR_Pos) /*!< CMSDK_PL230 CHNL_USEBURST: CLR Mask */ + +#define CMSDK_PL230_CHNL_REQ_MASK_SET_Pos 0 /*!< CMSDK_PL230 CHNL_REQ_MASK: SET Position */ +#define CMSDK_PL230_CHNL_REQ_MASK_SET_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_REQ_MASK_SET_Pos) /*!< CMSDK_PL230 CHNL_REQ_MASK: SET Mask */ + +#define CMSDK_PL230_CHNL_REQ_MASK_CLR_Pos 0 /*!< CMSDK_PL230 CHNL_REQ_MASK: CLR Position */ +#define CMSDK_PL230_CHNL_REQ_MASK_CLR_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_REQ_MASK_CLR_Pos) /*!< CMSDK_PL230 CHNL_REQ_MASK: CLR Mask */ + +#define CMSDK_PL230_CHNL_ENABLE_SET_Pos 0 /*!< CMSDK_PL230 CHNL_ENABLE: SET Position */ +#define CMSDK_PL230_CHNL_ENABLE_SET_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_ENABLE_SET_Pos) /*!< CMSDK_PL230 CHNL_ENABLE: SET Mask */ + +#define CMSDK_PL230_CHNL_ENABLE_CLR_Pos 0 /*!< CMSDK_PL230 CHNL_ENABLE: CLR Position */ +#define CMSDK_PL230_CHNL_ENABLE_CLR_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_ENABLE_CLR_Pos) /*!< CMSDK_PL230 CHNL_ENABLE: CLR Mask */ + +#define CMSDK_PL230_CHNL_PRI_ALT_SET_Pos 0 /*!< CMSDK_PL230 CHNL_PRI_ALT: SET Position */ +#define CMSDK_PL230_CHNL_PRI_ALT_SET_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_PRI_ALT_SET_Pos) /*!< CMSDK_PL230 CHNL_PRI_ALT: SET Mask */ + +#define CMSDK_PL230_CHNL_PRI_ALT_CLR_Pos 0 /*!< CMSDK_PL230 CHNL_PRI_ALT: CLR Position */ +#define CMSDK_PL230_CHNL_PRI_ALT_CLR_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_PRI_ALT_CLR_Pos) /*!< CMSDK_PL230 CHNL_PRI_ALT: CLR Mask */ + 
+#define CMSDK_PL230_CHNL_PRIORITY_SET_Pos 0 /*!< CMSDK_PL230 CHNL_PRIORITY: SET Position */ +#define CMSDK_PL230_CHNL_PRIORITY_SET_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_PRIORITY_SET_Pos) /*!< CMSDK_PL230 CHNL_PRIORITY: SET Mask */ + +#define CMSDK_PL230_CHNL_PRIORITY_CLR_Pos 0 /*!< CMSDK_PL230 CHNL_PRIORITY: CLR Position */ +#define CMSDK_PL230_CHNL_PRIORITY_CLR_Msk (0xFFFFFFFFul << CMSDK_PL230_CHNL_PRIORITY_CLR_Pos) /*!< CMSDK_PL230 CHNL_PRIORITY: CLR Mask */ + +#define CMSDK_PL230_ERR_CLR_Pos 0 /*!< CMSDK_PL230 ERR: CLR Position */ +#define CMSDK_PL230_ERR_CLR_Msk (0x00000001ul << CMSDK_PL230_ERR_CLR_Pos) /*!< CMSDK_PL230 ERR: CLR Mask */ + + +/*@}*/ /* end of group CMSDK_PL230 */ + + +/*------------- PrimeCell UART (PL110) --------------------------------------*/ +/** @addtogroup CMSDK_PL110 CMSDK PrimeCell UART + @{ +*/ + +typedef struct { + __IO uint32_t UARTDR; // Data + // OE: Overrun error + // BE: Break error + // PE: Parity error + // FE: Framing error + // DATA: Received or Transmitting data (0..255) + // + union { + __I uint32_t UARTRSR; // Receive Status + // OE: Overrun error + // BE: Break error + // PE: Parity error + // FE: Framing error + // + __O uint32_t UARTECR; // Error Clear + // OE: Overrun error + // BE: Break error + // PE: Parity error + // FE: Framing error + // + }; + uint32_t RESERVED0[4]; + __IO uint32_t UARTFR; // Flags + // RI: Ring indicator + // TXFE: Transmit FIFO empty + // RXFF: Receive FIFO full + // TXFF: Transmit FIFO full + // RXFE: Receive FIFO empty + // BUSY: UART busy + // DCD: Data carrier detect + // DSR: Data set ready + // CTS: Clear to send + // + uint32_t RESERVED1; + __IO uint32_t UARTILPR; // IrDA Low-power Counter + // ILPDVSR: 8-bit low-power divisor value (0..255) + // + __IO uint32_t UARTIBRD; // Integer Baud Rate + // BAUD DIVINT: Integer baud rate divisor (0..65535) + // + __IO uint32_t UARTFBRD; // Fractional Baud Rate + // BAUD DIVFRAC: Fractional baud rate divisor (0..63) + // + __IO uint32_t 
UARTLCR_H; // Line Control + // SPS: Stick parity select + // WLEN: Word length + // <0=> 5 bits + // <1=> 6 bits + // <2=> 7 bits + // <3=> 8 bits + // FEN: Enable FIFOs + // STP2: Two stop bits select + // EPS: Even parity select + // PEN: Parity enable + // BRK: Send break + // + __IO uint32_t UARTCR; // Control + // CTSEn: CTS hardware flow control enable + // RTSEn: RTS hardware flow control enable + // Out2: Complement of Out2 modem status output + // Out1: Complement of Out1 modem status output + // RTS: Request to send + // DTR: Data transmit ready + // RXE: Receive enable + // TXE: Transmit enable + // LBE: Loop-back enable + // SIRLP: IrDA SIR low power mode + // SIREN: SIR enable + // UARTEN: UART enable + // + __IO uint32_t UARTIFLS; // Interrupt FIFO Level Select + // RXIFLSEL: Receive interrupt FIFO level select + // <0=> >= 1/8 full + // <1=> >= 1/4 full + // <2=> >= 1/2 full + // <3=> >= 3/4 full + // <4=> >= 7/8 full + // <5=> reserved + // <6=> reserved + // <7=> reserved + // TXIFLSEL: Transmit interrupt FIFO level select + // <0=> <= 1/8 full + // <1=> <= 1/4 full + // <2=> <= 1/2 full + // <3=> <= 3/4 full + // <4=> <= 7/8 full + // <5=> reserved + // <6=> reserved + // <7=> reserved + // + __IO uint32_t UARTIMSC; // Interrupt Mask Set / Clear + // OEIM: Overrun error interrupt mask + // BEIM: Break error interrupt mask + // PEIM: Parity error interrupt mask + // FEIM: Framing error interrupt mask + // RTIM: Receive interrupt mask + // TXIM: Transmit interrupt mask + // RXIM: Receive interrupt mask + // DSRMIM: nUARTDSR modem interrupt mask + // DCDMIM: nUARTDCD modem interrupt mask + // CTSMIM: nUARTCTS modem interrupt mask + // RIMIM: nUARTRI modem interrupt mask + // + __IO uint32_t UARTRIS; // Raw Interrupt Status + // OERIS: Overrun error interrupt status + // BERIS: Break error interrupt status + // PERIS: Parity error interrupt status + // FERIS: Framing error interrupt status + // RTRIS: Receive timeout interrupt status + // TXRIS: 
Transmit interrupt status + // RXRIS: Receive interrupt status + // DSRRMIS: nUARTDSR modem interrupt status + // DCDRMIS: nUARTDCD modem interrupt status + // CTSRMIS: nUARTCTS modem interrupt status + // RIRMIS: nUARTRI modem interrupt status + // + __IO uint32_t UARTMIS; // Masked Interrupt Status + // OEMIS: Overrun error masked interrupt status + // BEMIS: Break error masked interrupt status + // PEMIS: Parity error masked interrupt status + // FEMIS: Framing error masked interrupt status + // RTMIS: Receive timeout masked interrupt status + // TXMIS: Transmit masked interrupt status + // RXMIS: Receive masked interrupt status + // DSRMMIS: nUARTDSR modem masked interrupt status + // DCDMMIS: nUARTDCD modem masked interrupt status + // CTSMMIS: nUARTCTS modem masked interrupt status + // RIMMIS: nUARTRI modem masked interrupt status + // + __O uint32_t UARTICR; // Interrupt Clear + // OEIC: Overrun error interrupt clear + // BEIC: Break error interrupt clear + // PEIC: Parity error interrupt clear + // FEIC: Framing error interrupt clear + // RTIC: Receive timeout interrupt clear + // TXIC: Transmit interrupt clear + // RXIC: Receive interrupt clear + // DSRMIC: nUARTDSR modem interrupt clear + // DCDMIC: nUARTDCD modem interrupt clear + // CTSMIC: nUARTCTS modem interrupt clear + // RIMIC: nUARTRI modem interrupt clear + // + __IO uint32_t UARTDMACR; // DMA Control + // DMAONERR: DMA on error + // TXDMAE: Transmit DMA enable + // RXDMAE: Receive DMA enable + // +} PL110_UART_TypeDef; + +#define CMSDK_PL110_DATAOVRRUN_Pos 11 /*!< CMSDK_PL110 DATAOVRRUN: Data Overrun Position */ +#define CMSDK_PL110_DATAOVRRUN_Msk (0x1ul << CMSDK_PL110_DATAOVRRUN_Pos) /*!< CMSDK_PL110 DATAOVRRUN: Data Overrun Mask */ + +#define CMSDK_PL110_DATABREAKERR_Pos 10 /*!< CMSDK_PL110 DATABREAKERR: Data Break Error Position */ +#define CMSDK_PL110_DATABREAKERR_Msk (0x1ul << CMSDK_PL110_DATABREAKERR_Pos) /*!< CMSDK_PL110 DATABREAKERR: Data Break Error Mask */ + +#define 
CMSDK_PL110_DATAPARITYERR_Pos 9 /*!< CMSDK_PL110 DATAPARITYERR: Data Parity Error Position */ +#define CMSDK_PL110_DATAPARITYERR_Msk (0x1ul << CMSDK_PL110_DATAPARITYERR_Pos) /*!< CMSDK_PL110 DATAPARITYERR: Data Parity Error Mask */ + +#define CMSDK_PL110_DATAFRAMEERR_Pos 8 /*!< CMSDK_PL110 DATAFRAMEERR: Data Frame Error Position */ +#define CMSDK_PL110_DATAFRAMEERR_Msk (0x1ul << CMSDK_PL110_DATAFRAMEERR_Pos) /*!< CMSDK_PL110 DATAFRAMEERR: Data Frame Error Mask */ + +#define CMSDK_PL110_RECOVRRUN_Pos 3 /*!< CMSDK_PL110 RECOVRRUN: Receive Overrun Position */ +#define CMSDK_PL110_RECOVRRUN_Msk (0x1ul << CMSDK_PL110_RECOVRRUN_Pos) /*!< CMSDK_PL110 RECOVRRUN: Receive Overrun Mask */ + +#define CMSDK_PL110_RECBREAKERR_Pos 2 /*!< CMSDK_PL110 RECBREAKERR: Receive Break Error Position */ +#define CMSDK_PL110_RECBREAKERR_Msk (0x1ul << CMSDK_PL110_RECBREAKERR_Pos) /*!< CMSDK_PL110 RECBREAKERR: Receive Break Error Mask */ + +#define CMSDK_PL110_RECPARITYERR_Pos 1 /*!< CMSDK_PL110 RECPARITYERR: Receive Parity Error Position */ +#define CMSDK_PL110_RECPARITYERR_Msk (0x1ul << CMSDK_PL110_RECPARITYERR_Pos) /*!< CMSDK_PL110 RECPARITYERR: Receive Parity Error Mask */ + +#define CMSDK_PL110_RECFRAMEERR_Pos 0 /*!< CMSDK_PL110 RECFRAMEERR: Receive Frame Error Position */ +#define CMSDK_PL110_RECFRAMEERR_Msk (0x1ul << CMSDK_PL110_RECFRAMEERR_Pos) /*!< CMSDK_PL110 RECFRAMEERR: Receive Frame Error Mask */ + +#define CMSDK_PL110_ERRCLROVRRUN_Pos 3 /*!< CMSDK_PL110 ERRCLROVRRUN: Clear Overrun Position */ +#define CMSDK_PL110_ERRCLROVRRUN_Msk (0x1ul << CMSDK_PL110_ERRCLROVRRUN_Pos) /*!< CMSDK_PL110 ERRCLROVRRUN: Clear Overrun Mask */ + +#define CMSDK_PL110_ERRCLRBREAKERR_Pos 2 /*!< CMSDK_PL110 ERRCLRBREAKERR: Clear Break Error Position */ +#define CMSDK_PL110_ERRCLRBREAKERR_Msk (0x1ul << CMSDK_PL110_ERRCLRBREAKERR_Pos) /*!< CMSDK_PL110 ERRCLRBREAKERR: Clear Break Error Mask */ + +#define CMSDK_PL110_ERRCLRPARITYERR_Pos 1 /*!< CMSDK_PL110 ERRCLRPARITYERR: Clear Parity Error Position */ 
+#define CMSDK_PL110_ERRCLRPARITYERR_Msk (0x1ul << CMSDK_PL110_ERRCLRPARITYERR_Pos) /*!< CMSDK_PL110 ERRCLRPARITYERR: Clear Parity Error Mask */ + +#define CMSDK_PL110_ERRCLRFRAMEERR_Pos 0 /*!< CMSDK_PL110 ERRCLRFRAMEERR: Clear Frame Error Position */ +#define CMSDK_PL110_ERRCLRFRAMEERR_Msk (0x1ul << CMSDK_PL110_ERRCLRFRAMEERR_Pos) /*!< CMSDK_PL110 ERRCLRFRAMEERR: Clear Frame Error Mask */ + +#define CMSDK_PL110_FLAG_RINGIND_Pos 8 /*!< CMSDK_PL110 FLAG_RINGIND: Ring Indicator Position */ +#define CMSDK_PL110_FLAG_RINGIND_Msk (0x1ul << CMSDK_PL110_FLAG_RINGIND_Pos) /*!< CMSDK_PL110 FLAG_RINGIND: Ring Indicator Mask */ + +#define CMSDK_PL110_FLAG_TXFEMPTY_Pos 7 /*!< CMSDK_PL110 FLAG_TXFEMPTY: Transmit FIFO Empty Position */ +#define CMSDK_PL110_FLAG_TXFEMPTY_Msk (0x1ul << CMSDK_PL110_FLAG_TXFEMPTY_Pos) /*!< CMSDK_PL110 FLAG_TXFEMPTY: Transmit FIFO Empty Mask */ + +#define CMSDK_PL110_FLAG_RXFFULL_Pos 6 /*!< CMSDK_PL110 FLAG_RXFFULL: Receive FIFO Full Position */ +#define CMSDK_PL110_FLAG_RXFFULL_Msk (0x1ul << CMSDK_PL110_FLAG_RXFFULL_Pos) /*!< CMSDK_PL110 FLAG_RXFFULL: Receive FIFO Full Mask */ + +#define CMSDK_PL110_FLAG_TXFFULL_Pos 5 /*!< CMSDK_PL110 FLAG_TXFFULL: Transmit FIFO Full Position */ +#define CMSDK_PL110_FLAG_TXFFULL_Msk (0x1ul << CMSDK_PL110_FLAG_TXFFULL_Pos) /*!< CMSDK_PL110 FLAG_TXFFULL: Transmit FIFO Full Mask */ + +#define CMSDK_PL110_FLAG_RXFEMPTY_Pos 4 /*!< CMSDK_PL110 FLAG_RXFEMPTY: Receive FIFO Empty Position */ +#define CMSDK_PL110_FLAG_RXFEMPTY_Msk (0x1ul << CMSDK_PL110_FLAG_RXFEMPTY_Pos) /*!< CMSDK_PL110 FLAG_RXFEMPTY: Receive FIFO Empty Mask */ + +#define CMSDK_PL110_FLAG_UARTBUSY_Pos 3 /*!< CMSDK_PL110 FLAG_UARTBUSY: UART Busy Position */ +#define CMSDK_PL110_FLAG_UARTBUSY_Msk (0x1ul << CMSDK_PL110_FLAG_UARTBUSY_Pos) /*!< CMSDK_PL110 FLAG_UARTBUSY: UART Busy Mask */ + +#define CMSDK_PL110_FLAG_CARRIERDETECT_Pos 2 /*!< CMSDK_PL110 FLAG_CARRIERDETECT: Carrier Detect Position */ +#define CMSDK_PL110_FLAG_CARRIERDETECT_Msk (0x1ul << 
CMSDK_PL110_FLAG_CARRIERDETECT_Pos) /*!< CMSDK_PL110 FLAG_CARRIERDETECT: Carrier Detect Mask */ + +#define CMSDK_PL110_FLAG_DATASETREADY_Pos 1 /*!< CMSDK_PL110 FLAG_DATASETREADY: Data Set Ready Position */ +#define CMSDK_PL110_FLAG_DATASETREADY_Msk (0x1ul << CMSDK_PL110_FLAG_DATASETREADY_Pos) /*!< CMSDK_PL110 FLAG_DATASETREADY: Data Set Ready Mask */ + +#define CMSDK_PL110_FLAG_CLR2SEND_Pos 0 /*!< CMSDK_PL110 FLAG_CLR2SEND: Clear To Send Position */ +#define CMSDK_PL110_FLAG_CLR2SEND_Msk (0x1ul << CMSDK_PL110_FLAG_CLR2SEND_Pos) /*!< CMSDK_PL110 FLAG_CLR2SEND: Clear To Send Mask */ + +#define CMSDK_PL110_IRDALOWPOWERCOUNT_Pos 0 /*!< CMSDK_PL110 IRDALOWPOWERCOUNT: IrDA 8-bit low-power divisor value Position */ +#define CMSDK_PL110_IRDALOWPOWERCOUNT_Msk (0xFFul << CMSDK_PL110_IRDALOWPOWERCOUNT_Pos) /*!< CMSDK_PL110 IRDALOWPOWERCOUNT: IrDA 8-bit low-power divisor value Mask */ + +#define CMSDK_PL110_INTDIVIDER_Pos 0 /*!< CMSDK_PL110 INTDIVIDER: Integer Divider Position */ +#define CMSDK_PL110_INTDIVIDER_Msk (0xFFFFul << CMSDK_PL110_INTDIVIDER_Pos) /*!< CMSDK_PL110 INTDIVIDER: Integer Divider Mask */ + +#define CMSDK_PL110_FRACTDIVIDER_Pos 0 /*!< CMSDK_PL110 FRACTDIVIDER: Fractional Divider Position */ +#define CMSDK_PL110_FRACTDIVIDER_Msk (0x3Ful << CMSDK_PL110_FRACTDIVIDER_Pos) /*!< CMSDK_PL110 FRACTDIVIDER: Fractional Divider Mask */ + +#define CMSDK_PL110_STICKPARITYSEL_Pos 7 /*!< CMSDK_PL110 STICKPARITYSEL: Stick parity select Position */ +#define CMSDK_PL110_STICKPARITYSEL_Msk (0x1ul << CMSDK_PL110_STICKPARITYSEL_Pos) /*!< CMSDK_PL110 STICKPARITYSEL: Stick parity select Mask */ + +#define CMSDK_PL110_WORDLEN_Pos 5 /*!< CMSDK_PL110 WORDLEN: Word Length Select Position */ +#define CMSDK_PL110_WORDLEN_Msk (0x3ul << CMSDK_PL110_WORDLEN_Pos) /*!< CMSDK_PL110 WORDLEN: Word Length Select Mask */ + +#define CMSDK_PL110_ENFIFOS_Pos 4 /*!< CMSDK_PL110 ENFIFOS: Enable FIFOs Position */ +#define CMSDK_PL110_ENFIFOS_Msk (0x1ul << CMSDK_PL110_ENFIFOS_Pos) /*!< CMSDK_PL110 
ENFIFOS: Enable FIFOs Mask */ + +#define CMSDK_PL110_2STOPBITS_Pos 3 /*!< CMSDK_PL110 2STOPBITS: Two Stop Bits Select Position */ +#define CMSDK_PL110_2STOPBITS_Msk (0x1ul << CMSDK_PL110_2STOPBITS_Pos) /*!< CMSDK_PL110 2STOPBITS: Two Stop Bits Select Mask */ + +#define CMSDK_PL110_EVENPARITY_Pos 2 /*!< CMSDK_PL110 EVENPARITY: Even Parity Select Position */ +#define CMSDK_PL110_EVENPARITY_Msk (0x1ul << CMSDK_PL110_EVENPARITY_Pos) /*!< CMSDK_PL110 EVENPARITY: Even Parity Select Mask */ + +#define CMSDK_PL110_PARITYEN_Pos 1 /*!< CMSDK_PL110 PARITYEN: Parity Enable Position */ +#define CMSDK_PL110_PARITYEN_Msk (0x1ul << CMSDK_PL110_PARITYEN_Pos) /*!< CMSDK_PL110 PARITYEN: Parity Enable Mask */ + +#define CMSDK_PL110_SENDBREAK_Pos 0 /*!< CMSDK_PL110 SENDBREAK: Send Break Position */ +#define CMSDK_PL110_SENDBREAK_Msk (0x1ul << CMSDK_PL110_SENDBREAK_Pos) /*!< CMSDK_PL110 SENDBREAK: Send Break Mask */ + +#define CMSDK_PL110_CTS_FLOWCTRL_Pos 15 /*!< CMSDK_PL110 CTS_FLOWCTRL: Enable CTS Flow Control Position */ +#define CMSDK_PL110_CTS_FLOWCTRL_Msk (0x1ul << CMSDK_PL110_CTS_FLOWCTRL_Pos) /*!< CMSDK_PL110 CTS_FLOWCTRL: Enable CTS Flow Control Mask */ + +#define CMSDK_PL110_RTS_FLOWCTRL_Pos 14 /*!< CMSDK_PL110 RTS_FLOWCTRL: Enable RTS Flow Control Position */ +#define CMSDK_PL110_RTS_FLOWCTRL_Msk (0x1ul << CMSDK_PL110_RTS_FLOWCTRL_Pos) /*!< CMSDK_PL110 RTS_FLOWCTRL: Enable RTS Flow Control Mask */ + +#define CMSDK_PL110_OUT2_Pos 13 /*!< CMSDK_PL110 OUT2: Complement of Out2 modem status output Position */ +#define CMSDK_PL110_OUT2_Msk (0x1ul << CMSDK_PL110_OUT2_Pos) /*!< CMSDK_PL110 OUT2: Complement of Out2 modem status output Mask */ + +#define CMSDK_PL110_OUT1_Pos 12 /*!< CMSDK_PL110 OUT1: Complement of Out1 modem status output Position */ +#define CMSDK_PL110_OUT1_Msk (0x1ul << CMSDK_PL110_OUT1_Pos) /*!< CMSDK_PL110 OUT1: Complement of Out1 modem status output Mask */ + +#define CMSDK_PL110_REQ2SEND_Pos 11 /*!< CMSDK_PL110 REQ2SEND: Request To Send Position */ +#define 
CMSDK_PL110_REQ2SEND_Msk (0x1ul << CMSDK_PL110_REQ2SEND_Pos) /*!< CMSDK_PL110 REQ2SEND: Request To Send Mask */ + +#define CMSDK_PL110_DATATRANSREADY_Pos 10 /*!< CMSDK_PL110 DATATRANSREADY: Transmit Ready Position */ +#define CMSDK_PL110_DATATRANSREADY_Msk (0x1ul << CMSDK_PL110_DATATRANSREADY_Pos) /*!< CMSDK_PL110 DATATRANSREADY: Transmit Ready Mask */ + +#define CMSDK_PL110_RXEN_Pos 9 /*!< CMSDK_PL110 RXEN: Receive Enable Position */ +#define CMSDK_PL110_RXEN_Msk (0x1ul << CMSDK_PL110_RXEN_Pos) /*!< CMSDK_PL110 RXEN: Receive Enable Mask */ + +#define CMSDK_PL110_TXEN_Pos 8 /*!< CMSDK_PL110 TXEN: Transmit Enable Position */ +#define CMSDK_PL110_TXEN_Msk (0x1ul << CMSDK_PL110_TXEN_Pos) /*!< CMSDK_PL110 TXEN: Transmit Enable Mask */ + +#define CMSDK_PL110_LOOPBACKEN_Pos 7 /*!< CMSDK_PL110 LOOPBACKEN: Loopback Enable Position */ +#define CMSDK_PL110_LOOPBACKEN_Msk (0x1ul << CMSDK_PL110_LOOPBACKEN_Pos) /*!< CMSDK_PL110 LOOPBACKEN: Loopback Enable Mask */ + +#define CMSDK_PL110_IRDASIRLPM_Pos 2 /*!< CMSDK_PL110 IRDASIRLPM: IRDA SIR Low Power Position */ +#define CMSDK_PL110_IRDASIRLPM_Msk (0x1ul << CMSDK_PL110_IRDASIRLPM_Pos) /*!< CMSDK_PL110 IRDASIRLPM: IRDA SIR Low Power Mask */ + +#define CMSDK_PL110_SIREN_Pos 1 /*!< CMSDK_PL110 SIREN: SIR Enable Position */ +#define CMSDK_PL110_SIREN_Msk (0x1ul << CMSDK_PL110_SIREN_Pos) /*!< CMSDK_PL110 SIREN: SIR Enable Mask */ + +#define CMSDK_PL110_UARTEN_Pos 0 /*!< CMSDK_PL110 UARTEN: UART Enable Position */ +#define CMSDK_PL110_UARTEN_Msk (0x1ul << CMSDK_PL110_UARTEN_Pos) /*!< CMSDK_PL110 UARTEN: UART Enable Mask */ + +#define CMSDK_PL110_RECINTFIFOLEVEL_Pos 3 /*!< CMSDK_PL110 RECINTFIFOLEVEL: Set Receive Int FIFO Level Position */ +#define CMSDK_PL110_RECINTFIFOLEVEL_Msk (0x7ul << CMSDK_PL110_RECINTFIFOLEVEL_Pos) /*!< CMSDK_PL110 RECINTFIFOLEVEL: Set Receive Int FIFO Level Mask */ + +#define CMSDK_PL110_TRANSINTFIFOLEVEL_Pos 0 /*!< CMSDK_PL110 TRANSINTFIFOLEVEL: Set Transmit Int FIFO Level Position */ +#define 
CMSDK_PL110_TRANSINTFIFOLEVEL_Msk (0x7ul << CMSDK_PL110_TRANSINTFIFOLEVEL_Pos) /*!< CMSDK_PL110 TRANSINTFIFOLEVEL: Set Transmit Int FIFO Level Mask */ + +#define CMSDK_PL110_SETMASK_OVRRUNERRINT_Pos 10 /*!< CMSDK_PL110 SETMASK_OVRRUNERRINT: Set Overrun Error Int Mask Position */ +#define CMSDK_PL110_SETMASK_OVRRUNERRINT_Msk (0x1ul << CMSDK_PL110_SETMASK_OVRRUNERRINT_Pos) /*!< CMSDK_PL110 SETMASK_OVRRUNERRINT: Set Overrun Error Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_BREAKERRINT_Pos 9 /*!< CMSDK_PL110 SETMASK_BREAKERRINT: Set Break Error Int Mask Position */ +#define CMSDK_PL110_SETMASK_BREAKERRINT_Msk (0x1ul << CMSDK_PL110_SETMASK_BREAKERRINT_Pos) /*!< CMSDK_PL110 SETMASK_BREAKERRINT: Set Break Error Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_PARITYERRINT_Pos 8 /*!< CMSDK_PL110 SETMASK_PARITYERRINT: Set Parity Error Int Mask Position */ +#define CMSDK_PL110_SETMASK_PARITYERRINT_Msk (0x1ul << CMSDK_PL110_SETMASK_PARITYERRINT_Pos) /*!< CMSDK_PL110 SETMASK_PARITYERRINT: Set Parity Error Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_FRAMEERRINT_Pos 7 /*!< CMSDK_PL110 SETMASK_FRAMEERRINT: Set Frame Error Int Mask Position */ +#define CMSDK_PL110_SETMASK_FRAMEERRINT_Msk (0x1ul << CMSDK_PL110_SETMASK_FRAMEERRINT_Pos) /*!< CMSDK_PL110 SETMASK_FRAMEERRINT: Set Frame Error Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_RECTRANSINT_Pos 6 /*!< CMSDK_PL110 SETMASK_RECTRANSINT: Set Transmit Receive Comb Int Mask Position */ +#define CMSDK_PL110_SETMASK_RECTRANSINT_Msk (0x1ul << CMSDK_PL110_SETMASK_RECTRANSINT_Pos) /*!< CMSDK_PL110 SETMASK_RECTRANSINT: Set Transmit Receive Comb Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_TRANSINT_Pos 5 /*!< CMSDK_PL110 SETMASK_TRANSINT: Set Transmit Int Mask Position */ +#define CMSDK_PL110_SETMASK_TRANSINT_Msk (0x1ul << CMSDK_PL110_SETMASK_TRANSINT_Pos) /*!< CMSDK_PL110 SETMASK_TRANSINT: Set Transmit Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_RECINT_Pos 4 /*!< CMSDK_PL110 SETMASK_RECINT: Set Receive Int Mask Position */ 
+#define CMSDK_PL110_SETMASK_RECINT_Msk (0x1ul << CMSDK_PL110_SETMASK_RECINT_Pos) /*!< CMSDK_PL110 SETMASK_RECINT: Set Receive Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_UART_DSRMODINT_Pos 3 /*!< CMSDK_PL110 SETMASK_UART_DSRMODINT: Set Data Set Ready Modem Int Mask Position */ +#define CMSDK_PL110_SETMASK_UART_DSRMODINT_Msk (0x1ul << CMSDK_PL110_SETMASK_UART_DSRMODINT_Pos) /*!< CMSDK_PL110 SETMASK_UART_DSRMODINT: Set Data Set Ready Modem Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_UART_DCDMODINT_Pos 2 /*!< CMSDK_PL110 SETMASK_UART_DCDMODINT: Set Data Carrier Detect Modem Int Mask Position */ +#define CMSDK_PL110_SETMASK_UART_DCDMODINT_Msk (0x1ul << CMSDK_PL110_SETMASK_UART_DCDMODINT_Pos) /*!< CMSDK_PL110 SETMASK_UART_DCDMODINT: Set Data Carrier Detect Modem Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_UART_CTSMODINT_Pos 1 /*!< CMSDK_PL110 SETMASK_UART_CTSMODINT: Set Clear To Send Modem Int Mask Position */ +#define CMSDK_PL110_SETMASK_UART_CTSMODINT_Msk (0x1ul << CMSDK_PL110_SETMASK_UART_CTSMODINT_Pos) /*!< CMSDK_PL110 SETMASK_UART_CTSMODINT: Set Clear To Send Modem Int Mask Mask */ + +#define CMSDK_PL110_SETMASK_UART_RIMODINT_Pos 0 /*!< CMSDK_PL110 SETMASK_UART_RIMODINT: Set nUARTRI Modem Int Mask Position */ +#define CMSDK_PL110_SETMASK_UART_RIMODINT_Msk (0x1ul << CMSDK_PL110_SETMASK_UART_RIMODINT_Pos) /*!< CMSDK_PL110 SETMASK_UART_RIMODINT: Set nUARTRI Modem Int Mask Mask */ + +#define CMSDK_PL110_RAWINTSTAT_OVRRUNERRINT_Pos 10 /*!< CMSDK_PL110 RAWINTSTAT_OVRRUNERRINT: Raw Overrun Error Int Status Mask Position */ +#define CMSDK_PL110_RAWINTSTAT_OVRRUNERRINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_OVRRUNERRINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_OVRRUNERRINT: Raw Overrun Error Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_BREAKERRINT_Pos 9 /*!< CMSDK_PL110 RAWINTSTAT_BREAKERRINT: Raw Break Error Int Status Mask Position */ +#define CMSDK_PL110_RAWINTSTAT_BREAKERRINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_BREAKERRINT_Pos) /*!< CMSDK_PL110 
RAWINTSTAT_BREAKERRINT: Raw Break Error Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_PARITYERRINT_Pos 8 /*!< CMSDK_PL110 RAWINTSTAT_PARITYERRINT: Raw Parity Error Int Status Mask Position */ +#define CMSDK_PL110_RAWINTSTAT_PARITYERRINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_PARITYERRINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_PARITYERRINT: Raw Parity Error Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_FRAMEERRINT_Pos 7 /*!< CMSDK_PL110 RAWINTSTAT_FRAMEERRINT: Raw Frame Error Int Status Mask Position */ +#define CMSDK_PL110_RAWINTSTAT_FRAMEERRINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_FRAMEERRINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_FRAMEERRINT: Raw Frame Error Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_RECTRANSINT_Pos 6 /*!< CMSDK_PL110 RAWINTSTAT_RECTRANSINT: Raw Transmit Receive Comb Int Status Position */ +#define CMSDK_PL110_RAWINTSTAT_RECTRANSINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_RECTRANSINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_RECTRANSINT: Raw Transmit Receive Comb Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_TRANSINT_Pos 5 /*!< CMSDK_PL110 RAWINTSTAT_TRANSINT: Raw Transmit Int Status Position */ +#define CMSDK_PL110_RAWINTSTAT_TRANSINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_TRANSINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_TRANSINT: Raw Transmit Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_RECINT_Pos 4 /*!< CMSDK_PL110 RAWINTSTAT_RECINT: Raw Receive Int Status Position */ +#define CMSDK_PL110_RAWINTSTAT_RECINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_RECINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_RECINT: Raw Receive Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_UART_DSRMODINT_Pos 3 /*!< CMSDK_PL110 RAWINTSTAT_UART_DSRMODINT: Raw Data Set Ready Int Status Position */ +#define CMSDK_PL110_RAWINTSTAT_UART_DSRMODINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_UART_DSRMODINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_UART_DSRMODINT: Raw Data Set Ready Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_UART_DCDMODINT_Pos 2 /*!< CMSDK_PL110 RAWINTSTAT_UART_DCDMODINT: 
Raw Data Carrier Detect Int Status Position */ +#define CMSDK_PL110_RAWINTSTAT_UART_DCDMODINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_UART_DCDMODINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_UART_DCDMODINT: Raw Data Carrier Detect Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_UART_CTSMODINT_Pos 1 /*!< CMSDK_PL110 RAWINTSTAT_UART_CTSMODINT: Raw Clear To Send Int Status Position */ +#define CMSDK_PL110_RAWINTSTAT_UART_CTSMODINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_UART_CTSMODINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_UART_CTSMODINT: Raw Clear To Send Int Status Mask */ + +#define CMSDK_PL110_RAWINTSTAT_UART_RIMODINT_Pos 0 /*!< CMSDK_PL110 RAWINTSTAT_UART_RIMODINT: Raw nUARTRI Modem Int Status Position */ +#define CMSDK_PL110_RAWINTSTAT_UART_RIMODINT_Msk (0x1ul << CMSDK_PL110_RAWINTSTAT_UART_RIMODINT_Pos) /*!< CMSDK_PL110 RAWINTSTAT_UART_RIMODINT: Raw nUARTRI Modem Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_OVRRUNERRINT_Pos 10 /*!< CMSDK_PL110 MSKINTSTAT_OVRRUNERRINT: Masked Overrun Error Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_OVRRUNERRINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_OVRRUNERRINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_OVRRUNERRINT: Masked Overrun Error Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_BREAKERRINT_Pos 9 /*!< CMSDK_PL110 MSKINTSTAT_BREAKERRINT: Masked Break Error Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_BREAKERRINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_BREAKERRINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_BREAKERRINT: Masked Break Error Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_PARITYERRINT_Pos 8 /*!< CMSDK_PL110 MSKINTSTAT_PARITYERRINT: Masked Parity Error Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_PARITYERRINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_PARITYERRINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_PARITYERRINT: Masked Parity Error Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_FRAMEERRINT_Pos 7 /*!< CMSDK_PL110 MSKINTSTAT_FRAMEERRINT: Masked Frame Error Int Status Position */ +#define 
CMSDK_PL110_MSKINTSTAT_FRAMEERRINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_FRAMEERRINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_FRAMEERRINT: Masked Frame Error Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_RECTRANSINT_Pos 6 /*!< CMSDK_PL110 MSKINTSTAT_RECTRANSINT: Masked Transmit Receive Comb Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_RECTRANSINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_RECTRANSINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_RECTRANSINT: Masked Transmit Receive Comb Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_TRANSINT_Pos 5 /*!< CMSDK_PL110 MSKINTSTAT_TRANSINT: Masked Transmit Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_TRANSINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_TRANSINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_TRANSINT: Masked Transmit Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_RECINT_Pos 4 /*!< CMSDK_PL110 MSKINTSTAT_RECINT: Masked Receive Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_RECINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_RECINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_RECINT: Masked Receive Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_UART_DSRMODINT_Pos 3 /*!< CMSDK_PL110 MSKINTSTAT_UART_DSRMODINT: Masked Data Set Ready Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_UART_DSRMODINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_UART_DSRMODINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_UART_DSRMODINT: Masked Data Set Ready Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_UART_DCDMODINT_Pos 2 /*!< CMSDK_PL110 MSKINTSTAT_UART_DCDMODINT: Masked Data Carrier Detect Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_UART_DCDMODINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_UART_DCDMODINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_UART_DCDMODINT: Masked Data Carrier Detect Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_UART_CTSMODINT_Pos 1 /*!< CMSDK_PL110 MSKINTSTAT_UART_CTSMODINT: Masked Clear To Send Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_UART_CTSMODINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_UART_CTSMODINT_Pos) /*!< 
CMSDK_PL110 MSKINTSTAT_UART_CTSMODINT: Masked Clear To Send Int Status Mask */ + +#define CMSDK_PL110_MSKINTSTAT_UART_RIMODINT_Pos 0 /*!< CMSDK_PL110 MSKINTSTAT_UART_RIMODINT: Masked nUARTRI Modem Int Status Position */ +#define CMSDK_PL110_MSKINTSTAT_UART_RIMODINT_Msk (0x1ul << CMSDK_PL110_MSKINTSTAT_UART_RIMODINT_Pos) /*!< CMSDK_PL110 MSKINTSTAT_UART_RIMODINT: Masked nUARTRI Modem Int Status Mask */ + +#define CMSDK_PL110_INTCLR_OVRRUNERRINT_Pos 10 /*!< CMSDK_PL110 INTCLR_OVRRUNERRINT: Clear Overrun Error Int Position */ +#define CMSDK_PL110_INTCLR_OVRRUNERRINT_Msk (0x1ul << CMSDK_PL110_INTCLR_OVRRUNERRINT_Pos) /*!< CMSDK_PL110 INTCLR_OVRRUNERRINT: Clear Overrun Error Int Mask */ + +#define CMSDK_PL110_INTCLR_BREAKERRINT_Pos 9 /*!< CMSDK_PL110 INTCLR_BREAKERRINT: Clear Break Error Int Position */ +#define CMSDK_PL110_INTCLR_BREAKERRINT_Msk (0x1ul << CMSDK_PL110_INTCLR_BREAKERRINT_Pos) /*!< CMSDK_PL110 INTCLR_BREAKERRINT: Clear Break Error Int Mask */ + +#define CMSDK_PL110_INTCLR_PARITYERRINT_Pos 8 /*!< CMSDK_PL110 INTCLR_PARITYERRINT: Clear Parity Error Int Position */ +#define CMSDK_PL110_INTCLR_PARITYERRINT_Msk (0x1ul << CMSDK_PL110_INTCLR_PARITYERRINT_Pos) /*!< CMSDK_PL110 INTCLR_PARITYERRINT: Clear Parity Error Int Mask */ + +#define CMSDK_PL110_INTCLR_FRAMEERRINT_Pos 7 /*!< CMSDK_PL110 INTCLR_FRAMEERRINT: Clear Frame Error Int Position */ +#define CMSDK_PL110_INTCLR_FRAMEERRINT_Msk (0x1ul << CMSDK_PL110_INTCLR_FRAMEERRINT_Pos) /*!< CMSDK_PL110 INTCLR_FRAMEERRINT: Clear Frame Error Int Mask */ + +#define CMSDK_PL110_INTCLR_RECTRANSINT_Pos 6 /*!< CMSDK_PL110 INTCLR_RECTRANSINT: Clear Receive Transmit Comb Int Position */ +#define CMSDK_PL110_INTCLR_RECTRANSINT_Msk (0x1ul << CMSDK_PL110_INTCLR_RECTRANSINT_Pos) /*!< CMSDK_PL110 INTCLR_RECTRANSINT: Clear Receive Transmit Comb Int Mask */ + +#define CMSDK_PL110_INTCLR_TRANSINT_Pos 5 /*!< CMSDK_PL110 INTCLR_TRANSINT: Clear Transmit Int Position */ +#define CMSDK_PL110_INTCLR_TRANSINT_Msk (0x1ul << 
CMSDK_PL110_INTCLR_TRANSINT_Pos) /*!< CMSDK_PL110 INTCLR_TRANSINT: Clear Transmit Int Mask */ + +#define CMSDK_PL110_INTCLR_RECINT_Pos 4 /*!< CMSDK_PL110 INTCLR_RECINT: Clear Receive Int Position */ +#define CMSDK_PL110_INTCLR_RECINT_Msk (0x1ul << CMSDK_PL110_INTCLR_RECINT_Pos) /*!< CMSDK_PL110 INTCLR_RECINT: Clear Receive Int Mask */ + +#define CMSDK_PL110_INTCLR_UART_DSRMODINT_Pos 3 /*!< CMSDK_PL110 INTCLR_UART_DSRMODINT: Clear Data Set Ready Int Position */ +#define CMSDK_PL110_INTCLR_UART_DSRMODINT_Msk (0x1ul << CMSDK_PL110_INTCLR_UART_DSRMODINT_Pos) /*!< CMSDK_PL110 INTCLR_UART_DSRMODINT: Clear Data Set Ready Int Mask */ + +#define CMSDK_PL110_INTCLR_UART_DCDMODINT_Pos 2 /*!< CMSDK_PL110 INTCLR_UART_DCDMODINT: Clear Data Carrier Detect Int Position */ +#define CMSDK_PL110_INTCLR_UART_DCDMODINT_Msk (0x1ul << CMSDK_PL110_INTCLR_UART_DCDMODINT_Pos) /*!< CMSDK_PL110 INTCLR_UART_DCDMODINT: Clear Data Carrier Detect Int Mask */ + +#define CMSDK_PL110_INTCLR_UART_CTSMODINT_Pos 1 /*!< CMSDK_PL110 INTCLR_UART_CTSMODINT: Clear Clear To Send Int Position */ +#define CMSDK_PL110_INTCLR_UART_CTSMODINT_Msk (0x1ul << CMSDK_PL110_INTCLR_UART_CTSMODINT_Pos) /*!< CMSDK_PL110 INTCLR_UART_CTSMODINT: Clear Clear To Send Int Mask */ + +#define CMSDK_PL110_INTCLR_UART_RIMODINT_Pos 0 /*!< CMSDK_PL110 INTCLR_UART_RIMODINT: Clear nUARTRI Modem Int Position */ +#define CMSDK_PL110_INTCLR_UART_RIMODINT_Msk (0x1ul << CMSDK_PL110_INTCLR_UART_RIMODINT_Pos) /*!< CMSDK_PL110 INTCLR_UART_RIMODINT: Clear nUARTRI Modem Int Mask */ + +#define CMSDK_PL110_DMA_ERR_Pos 2 /*!< CMSDK_PL110 DMA_ERR: DMA Error Position */ +#define CMSDK_PL110_DMA_ERR_Msk (0x1ul << CMSDK_PL110_DMA_ERR_Pos) /*!< CMSDK_PL110 DMA_ERR: DMA Error Mask */ + +#define CMSDK_PL110_DMA_TRANS_EN_Pos 1 /*!< CMSDK_PL110 DMA_TRANS_EN: DMA Transmit Enable Position */ +#define CMSDK_PL110_DMA_TRANS_EN_Msk (0x1ul << CMSDK_PL110_DMA_TRANS_EN_Pos) /*!< CMSDK_PL110 DMA_TRANS_EN: DMA Transmit Enable Mask */ + +#define 
CMSDK_PL110_DMA_REC_EN_Pos 0 /*!< CMSDK_PL110 DMA_REC_EN: DMA Receive Enable Position */ +#define CMSDK_PL110_DMA_REC_EN_Msk (0x1ul << CMSDK_PL110_DMA_REC_EN_Pos) /*!< CMSDK_PL110 DMA_REC_EN: DMA Receive Enable Mask */ + + +/*@}*/ /* end of group CMSDK_PL110 */ + +/*------------------- Watchdog ----------------------------------------------*/ +/** @addtogroup CMSDK_Watchdog CMSDK Watchdog + @{ +*/ +typedef struct { + + __IO uint32_t LOAD; /* Offset: 0x000 (R/W) Watchdog Load Register */ + __I uint32_t VALUE; /* Offset: 0x004 (R/ ) Watchdog Value Register */ + __IO uint32_t CTRL; /* Offset: 0x008 (R/W) Watchdog Control Register */ + /* RESEN: Reset enable */ + /* INTEN: Interrupt enable */ + /* */ + __O uint32_t INTCLR; /* Offset: 0x00C ( /W) Watchdog Clear Interrupt Register */ + __I uint32_t RAWINTSTAT; /* Offset: 0x010 (R/ ) Watchdog Raw Interrupt Status Register */ + __I uint32_t MASKINTSTAT; /* Offset: 0x014 (R/ ) Watchdog Interrupt Status Register */ + uint32_t RESERVED0[762]; + __IO uint32_t LOCK; /* Offset: 0xC00 (R/W) Watchdog Lock Register */ + uint32_t RESERVED1[191]; + __IO uint32_t ITCR; /* Offset: 0xF00 (R/W) Watchdog Integration Test Control Register */ + __O uint32_t ITOP; /* Offset: 0xF04 ( /W) Watchdog Integration Test Output Set Register */ +} CMSDK_WATCHDOG_TypeDef; + +#define CMSDK_Watchdog_LOAD_Pos 0 /*!< CMSDK_Watchdog LOAD: LOAD Position */ +#define CMSDK_Watchdog_LOAD_Msk (0xFFFFFFFFul << CMSDK_Watchdog_LOAD_Pos) /*!< CMSDK_Watchdog LOAD: LOAD Mask */ + +#define CMSDK_Watchdog_VALUE_Pos 0 /*!< CMSDK_Watchdog VALUE: VALUE Position */ +#define CMSDK_Watchdog_VALUE_Msk (0xFFFFFFFFul << CMSDK_Watchdog_VALUE_Pos) /*!< CMSDK_Watchdog VALUE: VALUE Mask */ + +#define CMSDK_Watchdog_CTRL_RESEN_Pos 1 /*!< CMSDK_Watchdog CTRL_RESEN: Enable Reset Output Position */ +#define CMSDK_Watchdog_CTRL_RESEN_Msk (0x1ul << CMSDK_Watchdog_CTRL_RESEN_Pos) /*!< CMSDK_Watchdog CTRL_RESEN: Enable Reset Output Mask */ + +#define CMSDK_Watchdog_CTRL_INTEN_Pos 0 /*!< 
CMSDK_Watchdog CTRL_INTEN: Int Enable Position */ +#define CMSDK_Watchdog_CTRL_INTEN_Msk (0x1ul << CMSDK_Watchdog_CTRL_INTEN_Pos) /*!< CMSDK_Watchdog CTRL_INTEN: Int Enable Mask */ + +#define CMSDK_Watchdog_INTCLR_Pos 0 /*!< CMSDK_Watchdog INTCLR: Int Clear Position */ +#define CMSDK_Watchdog_INTCLR_Msk (0x1ul << CMSDK_Watchdog_INTCLR_Pos) /*!< CMSDK_Watchdog INTCLR: Int Clear Mask */ + +#define CMSDK_Watchdog_RAWINTSTAT_Pos 0 /*!< CMSDK_Watchdog RAWINTSTAT: Raw Int Status Position */ +#define CMSDK_Watchdog_RAWINTSTAT_Msk (0x1ul << CMSDK_Watchdog_RAWINTSTAT_Pos) /*!< CMSDK_Watchdog RAWINTSTAT: Raw Int Status Mask */ + +#define CMSDK_Watchdog_MASKINTSTAT_Pos 0 /*!< CMSDK_Watchdog MASKINTSTAT: Mask Int Status Position */ +#define CMSDK_Watchdog_MASKINTSTAT_Msk (0x1ul << CMSDK_Watchdog_MASKINTSTAT_Pos) /*!< CMSDK_Watchdog MASKINTSTAT: Mask Int Status Mask */ + +#define CMSDK_Watchdog_LOCK_Pos 0 /*!< CMSDK_Watchdog LOCK: LOCK Position */ +#define CMSDK_Watchdog_LOCK_Msk (0x1ul << CMSDK_Watchdog_LOCK_Pos) /*!< CMSDK_Watchdog LOCK: LOCK Mask */ + +#define CMSDK_Watchdog_INTEGTESTEN_Pos 0 /*!< CMSDK_Watchdog INTEGTESTEN: Integration Test Enable Position */ +#define CMSDK_Watchdog_INTEGTESTEN_Msk (0x1ul << CMSDK_Watchdog_INTEGTESTEN_Pos) /*!< CMSDK_Watchdog INTEGTESTEN: Integration Test Enable Mask */ + +#define CMSDK_Watchdog_INTEGTESTOUTSET_Pos 1 /*!< CMSDK_Watchdog INTEGTESTOUTSET: Integration Test Output Set Position */ +#define CMSDK_Watchdog_INTEGTESTOUTSET_Msk (0x1ul << CMSDK_Watchdog_INTEGTESTOUTSET_Pos) /*!< CMSDK_Watchdog INTEGTESTOUTSET: Integration Test Output Set Mask */ + +/*@}*/ /* end of group CMSDK_Watchdog */ + +/*------------------- PrimeCell APB GPIO --------------------------------------*/ +/** @addtogroup CMSDK_PL061 CMSDK APB GPIO + @{ +*/ +typedef struct { + + __IO uint32_t DATA[256]; + __IO uint32_t DIR; + __IO uint32_t INTSENSE; + __IO uint32_t INTBOTHEDGE; + __IO uint32_t INTEVENT; + __IO uint32_t INTMASK; + __O uint32_t RAWINTSTAT; + __O 
uint32_t MASKINTSTAT; + __O uint32_t INTCLR; + __IO uint32_t MODECTRL; + +} APBGPIO_TypeDef; + +#define CMSDK_PL061_DATA_Pos 0 /*!< CMSDK_PL061 DATA: DATA Position */ +#define CMSDK_PL061_DATA_Msk (0xFFFFFFFFul << CMSDK_PL061_DATA_Pos) /*!< CMSDK_PL061 DATA: DATA Mask */ + +#define CMSDK_PL061_DIR_Pos 0 /*!< CMSDK_PL061 DIR: Data Direction Position */ +#define CMSDK_PL061_DIR_Msk (0x1ul << CMSDK_PL061_DIR_Pos) /*!< CMSDK_PL061 DIR: Data Direction Mask */ + +#define CMSDK_PL061_INTSENSE_Pos 0 /*!< CMSDK_PL061 INTSENSE: INT SENSE Position */ +#define CMSDK_PL061_INTSENSE_Msk (0x1ul << CMSDK_PL061_INTSENSE_Pos) /*!< CMSDK_PL061 INTSENSE: INT SENSE Mask */ + +#define CMSDK_PL061_INTBOTHEDGE_Pos 0 /*!< CMSDK_PL061 INTBOTHEDGE: INT BOTH EDGE Position */ +#define CMSDK_PL061_INTBOTHEDGE_Msk (0x1ul << CMSDK_PL061_INTBOTHEDGE_Pos) /*!< CMSDK_PL061 INTBOTHEDGE: INT BOTH EDGE Mask */ + +#define CMSDK_PL061_INTEVENT_Pos 0 /*!< CMSDK_PL061 INTEVENT: INT EVENT Position */ +#define CMSDK_PL061_INTEVENT_Msk (0x1ul << CMSDK_PL061_INTEVENT_Pos) /*!< CMSDK_PL061 INTEVENT: INT EVENT Mask */ + +#define CMSDK_PL061_INTMASK_Pos 0 /*!< CMSDK_PL061 INTMASK: INT MASK Position */ +#define CMSDK_PL061_INTMASK_Msk (0x1ul << CMSDK_PL061_INTMASK_Pos) /*!< CMSDK_PL061 INTMASK: INT MASK Mask */ + +#define CMSDK_PL061_RAWINTSTAT_Pos 0 /*!< CMSDK_PL061 RAWINTSTAT: Raw Int Status Position */ +#define CMSDK_PL061_RAWINTSTAT_Msk (0x1ul << CMSDK_PL061_RAWINTSTAT_Pos) /*!< CMSDK_PL061 RAWINTSTAT: Raw Int Status Mask */ + +#define CMSDK_PL061_MASKINTSTAT_Pos 0 /*!< CMSDK_PL061 MASKINTSTAT: Mask Int Status Position */ +#define CMSDK_PL061_MASKINTSTAT_Msk (0x1ul << CMSDK_PL061_MASKINTSTAT_Pos) /*!< CMSDK_PL061 MASKINTSTAT: Mask Int Status Mask */ + +#define CMSDK_PL061_INTCLR_Pos 0 /*!< CMSDK_PL061 INTCLR: Int Clear Position */ +#define CMSDK_PL061_INTCLR_Msk (0x1ul << CMSDK_PL061_INTCLR_Pos) /*!< CMSDK_PL061 INTCLR: Int Clear Mask */ + +#define CMSDK_PL061_MODECTRL_HWEN_Pos 0 /*!< CMSDK_PL061 
MODECTRL_HWEN: Mode Control Hardware Enable Position */ +#define CMSDK_PL061_MODECTRL_HWEN_Msk (0x1ul << CMSDK_PL061_MODECTRL_HWEN_Pos) /*!< CMSDK_PL061 MODECTRL_HWEN: Mode Control Hardware Enable Mask */ + + +/*@}*/ /* end of group CMSDK_PL061 */ + + +#if defined ( __CC_ARM ) +#pragma no_anon_unions +#endif + +/*@}*/ /* end of group CMSDK_CM4_Peripherals */ + + +/******************************************************************************/ +/* Peripheral memory map */ +/******************************************************************************/ +/** @addtogroup CMSDK_CM4_MemoryMap CMSDK_CM4 Memory Mapping + @{ +*/ + +/* Peripheral and SRAM base address */ +#define CMSDK_FLASH_BASE (0x00000000UL) /*!< (FLASH ) Base Address */ +#define CMSDK_SRAM_BASE (0x20000000UL) /*!< (SRAM ) Base Address */ +#define CMSDK_PERIPH_BASE (0x40000000UL) /*!< (Peripheral) Base Address */ + +/* Base addresses */ +#define CMSDK_RAM_BASE (0x20000000UL) +#define CMSDK_APB_BASE (0x40000000UL) +#define CMSDK_AHB_BASE (0x40010000UL) + +/* APB peripherals */ +#define CMSDK_TIMER0_BASE (CMSDK_APB_BASE + 0x0000UL) +#define CMSDK_TIMER1_BASE (CMSDK_APB_BASE + 0x1000UL) +#define CMSDK_DUALTIMER_BASE (CMSDK_APB_BASE + 0x2000UL) +#define CMSDK_DUALTIMER_1_BASE (CMSDK_DUALTIMER_BASE) +#define CMSDK_DUALTIMER_2_BASE (CMSDK_DUALTIMER_BASE + 0x20UL) +#define CMSDK_UART0_BASE (CMSDK_APB_BASE + 0x4000UL) +#define CMSDK_UART1_BASE (CMSDK_APB_BASE + 0x5000UL) +#define CMSDK_UART2_BASE (CMSDK_APB_BASE + 0x6000UL) +#define CMSDK_UART3_BASE (CMSDK_APB_BASE + 0x7000UL) +#define CMSDK_WATCHDOG_BASE (CMSDK_APB_BASE + 0x8000UL) +#define CMSDK_UART4_BASE (CMSDK_APB_BASE + 0x9000UL) +#define CMSDK_PL230_BASE (CMSDK_APB_BASE + 0xF000UL) + +/* AHB peripherals */ +#define CMSDK_GPIO0_BASE (CMSDK_AHB_BASE + 0x0000UL) +#define CMSDK_GPIO1_BASE (CMSDK_AHB_BASE + 0x1000UL) +#define CMSDK_GPIO2_BASE (CMSDK_AHB_BASE + 0x2000UL) +#define CMSDK_GPIO3_BASE (CMSDK_AHB_BASE + 0x3000UL) +#define CMSDK_SYSCTRL_BASE 
(CMSDK_AHB_BASE + 0xF000UL) +/*@}*/ /* end of group CMSDK_CM4_MemoryMap */ + + +/******************************************************************************/ +/* Peripheral declaration */ +/******************************************************************************/ +/** @addtogroup CMSDK_CM4_PeripheralDecl CMSDK_CM4 Peripheral Declaration + @{ +*/ + +#define CMSDK_UART0 ((CMSDK_UART_TypeDef *) CMSDK_UART0_BASE ) +#define CMSDK_UART1 ((CMSDK_UART_TypeDef *) CMSDK_UART1_BASE ) +#define CMSDK_UART2 ((CMSDK_UART_TypeDef *) CMSDK_UART2_BASE ) +#define CMSDK_UART3 ((CMSDK_UART_TypeDef *) CMSDK_UART3_BASE ) +#define CMSDK_UART4 ((CMSDK_UART_TypeDef *) CMSDK_UART4_BASE ) +#define CMSDK_TIMER0 ((CMSDK_TIMER_TypeDef *) CMSDK_TIMER0_BASE ) +#define CMSDK_TIMER1 ((CMSDK_TIMER_TypeDef *) CMSDK_TIMER1_BASE ) +#define CMSDK_DUALTIMER ((CMSDK_DUALTIMER_BOTH_TypeDef *) CMSDK_DUALTIMER_BASE ) +#define CMSDK_DUALTIMER1 ((CMSDK_DUALTIMER_SINGLE_TypeDef *) CMSDK_DUALTIMER_1_BASE ) +#define CMSDK_DUALTIMER2 ((CMSDK_DUALTIMER_SINGLE_TypeDef *) CMSDK_DUALTIMER_2_BASE ) +#define CMSDK_WATCHDOG ((CMSDK_WATCHDOG_TypeDef *) CMSDK_WATCHDOG_BASE ) +#define CMSDK_DMA ((CMSDK_PL230_TypeDef *) CMSDK_PL230_BASE ) +#define CMSDK_GPIO0 ((CMSDK_GPIO_TypeDef *) CMSDK_GPIO0_BASE ) +#define CMSDK_GPIO1 ((CMSDK_GPIO_TypeDef *) CMSDK_GPIO1_BASE ) +#define CMSDK_GPIO2 ((CMSDK_GPIO_TypeDef *) CMSDK_GPIO2_BASE ) +#define CMSDK_GPIO3 ((CMSDK_GPIO_TypeDef *) CMSDK_GPIO3_BASE ) +#define CMSDK_SYSCON ((CMSDK_SYSCON_TypeDef *) CMSDK_SYSCTRL_BASE ) +/*@}*/ /* end of group CMSDK_CM4_PeripheralDecl */ + +/*@}*/ /* end of group CMSDK_CM4_Definitions */ + +#ifdef __cplusplus +} +#endif + +#endif /* CMSDK_CM4_H */ diff --git a/common/mps2/LICENSE.txt b/common/mps2/LICENSE.txt new file mode 100644 index 0000000..8dada3e --- /dev/null +++ b/common/mps2/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 
DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/common/mps2/MPS2.ld b/common/mps2/MPS2.ld new file mode 100644 index 0000000..55b8716 --- /dev/null +++ b/common/mps2/MPS2.ld @@ -0,0 +1,208 @@ +/* + * MPS2 CMSIS Library + */ +/* + * Copyright (c) 2009-2019 ARM Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * This file is derivative of CMSIS V5.00 gcc_arm.ld + */ +/* Linker script for mbed FVP Cortex-M on MPS2 */ + +/* Linker script to configure memory regions. */ +/* NOTE: no separate VECTORS region is defined here; the vector table is + * placed in FLASH by the .isr_vector output section below. + */ + +#include "memory_zones.h" +#include "cmsis_nvic.h" + +#if !defined(MBED_CONF_TARGET_BOOT_STACK_SIZE) + #define MBED_CONF_TARGET_BOOT_STACK_SIZE 0x400 +#endif + +MEMORY +{ + FLASH (rx) : ORIGIN = ZBT_SRAM1_START, LENGTH = ZBT_SRAM1_SIZE + RAM (rwx) : ORIGIN = ZBT_SRAM2_START, LENGTH = ZBT_SRAM2_SIZE +} + +/* Linker script to place sections and symbol values. The memory regions + * FLASH and RAM it places them into are defined in the MEMORY block above. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __etext + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +STACK_SIZE = MBED_CONF_TARGET_BOOT_STACK_SIZE; + +/* Size of the vector table in SRAM */ +M_VECTOR_RAM_SIZE = NVIC_VECTORS_SIZE; + +SECTIONS +{ + .isr_vector : + { + __vector_table = .; + KEEP(*(.vector_table)) + . = ALIGN(8); + } > FLASH + + .text : + { + .
= ALIGN(8); + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + .data : + { +#ifndef DATA_IN_FLASH + PROVIDE(__etext = LOADADDR(.data)); + . = ALIGN(8); +#endif + __data_start__ = .; + *(vtable) + *(.data) + *(.data*) + + . = ALIGN(8); + /* preinit data */ + PROVIDE (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE (__preinit_array_end = .); + + . = ALIGN(8); + /* init data */ + PROVIDE (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE (__init_array_end = .); + + + . = ALIGN(8); + /* fini data */ + PROVIDE (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE (__fini_array_end = .); + + . = ALIGN(8); + /* All data end */ + __data_end__ = .; + +#ifdef DATA_IN_FLASH + } > FLASH +#else + } > RAM AT > FLASH +#endif + + .bss : + { + . = ALIGN(8); + __START_BSS = .; + __bss_start__ = .; + *(.bss) + *(.bss*) + *(COMMON) + . = ALIGN(8); + __bss_end__ = .; + __END_BSS = .; + +#ifdef DATA_IN_FLASH + } > FLASH +#else + } > RAM +#endif + + bss_size = __bss_end__ - __bss_start__; + + .heap (COPY): + { + . = ALIGN(8); + __end__ = .; + PROVIDE(end = .); + __HeapBase = .; + *(.heap*) + .
= ORIGIN(RAM) + LENGTH(RAM) - STACK_SIZE; + __HeapLimit = .; + __heap_limit = .; /* Add for _sbrk */ + } > RAM + + /* Set stack top to end of RAM; the stack limit lies STACK_SIZE bytes + * below it (this script has no stack_dummy section) */ + __StackTop = ORIGIN(RAM) + LENGTH(RAM); + __StackLimit = __StackTop - STACK_SIZE; + PROVIDE(__stack = __StackTop); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") + +} /* End of sections */ diff --git a/common/mps2/cmsis_armclang.h b/common/mps2/cmsis_armclang.h new file mode 100644 index 0000000..90de9db --- /dev/null +++ b/common/mps2/cmsis_armclang.h @@ -0,0 +1,1467 @@ +/**************************************************************************//** + * @file cmsis_armclang.h + * @brief CMSIS compiler armclang (Arm Compiler 6) header file + * @version V5.3.1 + * @date 26. March 2020 + ******************************************************************************/ +/* + * Copyright (c) 2009-2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +/*lint -esym(9058, IRQn)*/ /* disable MISRA 2012 Rule 2.4 for IRQn */ + +#ifndef __CMSIS_ARMCLANG_H +#define __CMSIS_ARMCLANG_H + +#pragma clang system_header /* treat file as system include file */ + +#ifndef __ARM_COMPAT_H +#include <arm_compat.h> /* Compatibility header for Arm Compiler 5 intrinsics */ +#endif + +/* CMSIS compiler specific defines */ +#ifndef __ASM + #define __ASM __asm +#endif +#ifndef __INLINE + #define __INLINE __inline +#endif +#ifndef __STATIC_INLINE + #define __STATIC_INLINE static __inline +#endif +#ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline +#endif +#ifndef __NO_RETURN + #define __NO_RETURN __attribute__((__noreturn__)) +#endif +#ifndef __USED + #define __USED __attribute__((used)) +#endif +#ifndef __WEAK + #define __WEAK __attribute__((weak)) +#endif +#ifndef __PACKED + #define __PACKED __attribute__((packed, aligned(1))) +#endif +#ifndef __PACKED_STRUCT + #define __PACKED_STRUCT struct __attribute__((packed, aligned(1))) +#endif +#ifndef __PACKED_UNION + #define __PACKED_UNION union __attribute__((packed, aligned(1))) +#endif +#ifndef __UNALIGNED_UINT32 /* deprecated */ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wpacked" +/*lint -esym(9058, T_UINT32)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32 */ + struct __attribute__((packed)) T_UINT32 { uint32_t v; }; + #pragma clang diagnostic pop + #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v) +#endif +#ifndef __UNALIGNED_UINT16_WRITE + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wpacked" +/*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */ + __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; }; + #pragma clang diagnostic pop + #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val)) +#endif +#ifndef __UNALIGNED_UINT16_READ + #pragma clang diagnostic push + #pragma clang diagnostic ignored
"-Wpacked" +/*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */ + __PACKED_STRUCT T_UINT16_READ { uint16_t v; }; + #pragma clang diagnostic pop + #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v) +#endif +#ifndef __UNALIGNED_UINT32_WRITE + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wpacked" +/*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */ + __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; }; + #pragma clang diagnostic pop + #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val)) +#endif +#ifndef __UNALIGNED_UINT32_READ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wpacked" +/*lint -esym(9058, T_UINT32_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_READ */ + __PACKED_STRUCT T_UINT32_READ { uint32_t v; }; + #pragma clang diagnostic pop + #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v) +#endif +#ifndef __ALIGNED + #define __ALIGNED(x) __attribute__((aligned(x))) +#endif +#ifndef __RESTRICT + #define __RESTRICT __restrict +#endif +#ifndef __COMPILER_BARRIER + #define __COMPILER_BARRIER() __ASM volatile("":::"memory") +#endif + +/* ######################### Startup and Lowlevel Init ######################## */ + +#ifndef __PROGRAM_START +#define __PROGRAM_START __main +#endif + +#ifndef __INITIAL_SP +#define __INITIAL_SP Image$$ARM_LIB_STACK$$ZI$$Limit +#endif + +#ifndef __STACK_LIMIT +#define __STACK_LIMIT Image$$ARM_LIB_STACK$$ZI$$Base +#endif + +#ifndef __VECTOR_TABLE +#define __VECTOR_TABLE __Vectors +#endif + +#ifndef __VECTOR_TABLE_ATTRIBUTE +#define __VECTOR_TABLE_ATTRIBUTE __attribute__((used, section("RESET"))) +#endif + +/* ########################### Core Function Access ########################### */ +/** \ingroup CMSIS_Core_FunctionInterface + \defgroup CMSIS_Core_RegAccFunctions CMSIS Core
Register Access Functions + @{ + */ + +/** + \brief Enable IRQ Interrupts + \details Enables IRQ interrupts by clearing the I-bit in the CPSR. + Can only be executed in Privileged modes. + */ +/* intrinsic void __enable_irq(); see arm_compat.h */ + + +/** + \brief Disable IRQ Interrupts + \details Disables IRQ interrupts by setting the I-bit in the CPSR. + Can only be executed in Privileged modes. + */ +/* intrinsic void __disable_irq(); see arm_compat.h */ + + +/** + \brief Get Control Register + \details Returns the content of the Control Register. + \return Control Register value + */ +__STATIC_FORCEINLINE uint32_t __get_CONTROL(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, control" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Control Register (non-secure) + \details Returns the content of the non-secure Control Register when in secure mode. + \return non-secure Control Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, control_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Control Register + \details Writes the given value to the Control Register. + \param [in] control Control Register value to set + */ +__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control) +{ + __ASM volatile ("MSR control, %0" : : "r" (control) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Control Register (non-secure) + \details Writes the given value to the non-secure Control Register when in secure state. + \param [in] control Control Register value to set + */ +__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control) +{ + __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory"); +} +#endif + + +/** + \brief Get IPSR Register + \details Returns the content of the IPSR Register. 
+ \return IPSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_IPSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, ipsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get APSR Register + \details Returns the content of the APSR Register. + \return APSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_APSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, apsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get xPSR Register + \details Returns the content of the xPSR Register. + \return xPSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_xPSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, xpsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get Process Stack Pointer + \details Returns the current value of the Process Stack Pointer (PSP). + \return PSP Register value + */ +__STATIC_FORCEINLINE uint32_t __get_PSP(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, psp" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Process Stack Pointer (non-secure) + \details Returns the current value of the non-secure Process Stack Pointer (PSP) when in secure state. + \return PSP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, psp_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Process Stack Pointer + \details Assigns the given value to the Process Stack Pointer (PSP). + \param [in] topOfProcStack Process Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack) +{ + __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : ); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Process Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Process Stack Pointer (PSP) when in secure state. 
+ \param [in] topOfProcStack Process Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack) +{ + __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : ); +} +#endif + + +/** + \brief Get Main Stack Pointer + \details Returns the current value of the Main Stack Pointer (MSP). + \return MSP Register value + */ +__STATIC_FORCEINLINE uint32_t __get_MSP(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, msp" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Main Stack Pointer (non-secure) + \details Returns the current value of the non-secure Main Stack Pointer (MSP) when in secure state. + \return MSP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, msp_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Main Stack Pointer + \details Assigns the given value to the Main Stack Pointer (MSP). + \param [in] topOfMainStack Main Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack) +{ + __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : ); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Main Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state. + \param [in] topOfMainStack Main Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack) +{ + __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : ); +} +#endif + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Stack Pointer (non-secure) + \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state. 
+ \return SP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, sp_ns" : "=r" (result) ); + return(result); +} + + +/** + \brief Set Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state. + \param [in] topOfStack Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack) +{ + __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : ); +} +#endif + + +/** + \brief Get Priority Mask + \details Returns the current state of the priority mask bit from the Priority Mask Register. + \return Priority Mask value + */ +__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, primask" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Priority Mask (non-secure) + \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state. + \return Priority Mask value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, primask_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Priority Mask + \details Assigns the given value to the Priority Mask Register. + \param [in] priMask Priority Mask + */ +__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask) +{ + __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Priority Mask (non-secure) + \details Assigns the given value to the non-secure Priority Mask Register when in secure state. 
+ \param [in] priMask Priority Mask + */ +__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask) +{ + __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory"); +} +#endif + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) +/** + \brief Enable FIQ + \details Enables FIQ interrupts by clearing the F-bit in the CPSR. + Can only be executed in Privileged modes. + */ +#define __enable_fault_irq __enable_fiq /* see arm_compat.h */ + + +/** + \brief Disable FIQ + \details Disables FIQ interrupts by setting the F-bit in the CPSR. + Can only be executed in Privileged modes. + */ +#define __disable_fault_irq __disable_fiq /* see arm_compat.h */ + + +/** + \brief Get Base Priority + \details Returns the current value of the Base Priority register. + \return Base Priority register value + */ +__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, basepri" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Base Priority (non-secure) + \details Returns the current value of the non-secure Base Priority register when in secure state. + \return Base Priority register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Base Priority + \details Assigns the given value to the Base Priority register. 
+ \param [in] basePri Base Priority value to set + */ +__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri) +{ + __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Base Priority (non-secure) + \details Assigns the given value to the non-secure Base Priority register when in secure state. + \param [in] basePri Base Priority value to set + */ +__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri) +{ + __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory"); +} +#endif + + +/** + \brief Set Base Priority with condition + \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled, + or the new value increases the BASEPRI priority level. + \param [in] basePri Base Priority value to set + */ +__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri) +{ + __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory"); +} + + +/** + \brief Get Fault Mask + \details Returns the current value of the Fault Mask register. + \return Fault Mask register value + */ +__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, faultmask" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Fault Mask (non-secure) + \details Returns the current value of the non-secure Fault Mask register when in secure state. + \return Fault Mask register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Fault Mask + \details Assigns the given value to the Fault Mask register. 
+ \param [in] faultMask Fault Mask value to set + */ +__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask) +{ + __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Fault Mask (non-secure) + \details Assigns the given value to the non-secure Fault Mask register when in secure state. + \param [in] faultMask Fault Mask value to set + */ +__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask) +{ + __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory"); +} +#endif + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) + +/** + \brief Get Process Stack Pointer Limit + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is returned always in non-secure + mode. + + \details Returns the current value of the Process Stack Pointer Limit (PSPLIM). 
+ \return PSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void) +{ +#if (!((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure PSPLIM is RAZ/WI (read-as-zero, writes ignored) + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, psplim" : "=r" (result) ); + return result; +#endif +} + +#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Process Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is returned always in non-secure + mode. + + \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state. + \return PSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void) +{ +#if (!((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) ) + // without main extensions, the non-secure PSPLIM is RAZ/WI (read-as-zero, writes ignored) + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, psplim_ns" : "=r" (result) ); + return result; +#endif +} +#endif + + +/** + \brief Set Process Stack Pointer Limit + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored in non-secure + mode. + + \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
+ \param [in] ProcStackPtrLimit Process Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit) +{ +#if (!((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure PSPLIM is RAZ/WI (read-as-zero, writes ignored) + (void)ProcStackPtrLimit; +#else + __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit)); +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Process Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored in non-secure + mode. + + \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state. + \param [in] ProcStackPtrLimit Process Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit) +{ +#if (!((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) ) + // without main extensions, the non-secure PSPLIM is RAZ/WI (read-as-zero, writes ignored) + (void)ProcStackPtrLimit; +#else + __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit)); +#endif +} +#endif + + +/** + \brief Get Main Stack Pointer Limit + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is returned always. + + \details Returns the current value of the Main Stack Pointer Limit (MSPLIM).
+ \return MSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void) +{ +#if (!((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure MSPLIM is RAZ/WI (read-as-zero, writes ignored) + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, msplim" : "=r" (result) ); + return result; +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Main Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is returned always. + + \details Returns the current value of the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state. + \return MSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void) +{ +#if (!((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) ) + // without main extensions, the non-secure MSPLIM is RAZ/WI (read-as-zero, writes ignored) + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) ); + return result; +#endif +} +#endif + + +/** + \brief Set Main Stack Pointer Limit + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored. + + \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM).
+ \param [in] MainStackPtrLimit Main Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit) +{ +#if (!((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure MSPLIM is RAZ/WI (read-as-zero, writes ignored) + (void)MainStackPtrLimit; +#else + __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit)); +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Main Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored. + + \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state. + \param [in] MainStackPtrLimit Main Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit) +{ +#if (!((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) ) + // without main extensions, the non-secure MSPLIM is RAZ/WI (read-as-zero, writes ignored) + (void)MainStackPtrLimit; +#else + __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit)); +#endif +} +#endif + +#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) */ + +/** + \brief Get FPSCR + \details Returns the current value of the Floating Point Status/Control register.
+ \return Floating Point Status/Control register value + */ +#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \ + (defined (__FPU_USED ) && (__FPU_USED == 1U)) ) +#define __get_FPSCR (uint32_t)__builtin_arm_get_fpscr +#else +#define __get_FPSCR() ((uint32_t)0U) +#endif + +/** + \brief Set FPSCR + \details Assigns the given value to the Floating Point Status/Control register. + \param [in] fpscr Floating Point Status/Control value to set + */ +#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \ + (defined (__FPU_USED ) && (__FPU_USED == 1U)) ) +#define __set_FPSCR __builtin_arm_set_fpscr +#else +#define __set_FPSCR(x) ((void)(x)) +#endif + + +/*@} end of CMSIS_Core_RegAccFunctions */ + + +/* ########################## Core Instruction Access ######################### */ +/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface + Access to dedicated instructions + @{ +*/ + +/* Define macros for porting to both thumb1 and thumb2. + * For thumb1, use low register (r0-r7), specified by constraint "l" + * Otherwise, use general registers, specified by constraint "r" */ +#if defined (__thumb__) && !defined (__thumb2__) +#define __CMSIS_GCC_OUT_REG(r) "=l" (r) +#define __CMSIS_GCC_RW_REG(r) "+l" (r) +#define __CMSIS_GCC_USE_REG(r) "l" (r) +#else +#define __CMSIS_GCC_OUT_REG(r) "=r" (r) +#define __CMSIS_GCC_RW_REG(r) "+r" (r) +#define __CMSIS_GCC_USE_REG(r) "r" (r) +#endif + +/** + \brief No Operation + \details No Operation does nothing. This instruction can be used for code alignment purposes. + */ +#define __NOP __builtin_arm_nop + +/** + \brief Wait For Interrupt + \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs. + */ +#define __WFI __builtin_arm_wfi + + +/** + \brief Wait For Event + \details Wait For Event is a hint instruction that permits the processor to enter + a low-power state until one of a number of events occurs. 
+ */ +#define __WFE __builtin_arm_wfe + + +/** + \brief Send Event + \details Send Event is a hint instruction. It causes an event to be signaled to the CPU. + */ +#define __SEV __builtin_arm_sev + + +/** + \brief Instruction Synchronization Barrier + \details Instruction Synchronization Barrier flushes the pipeline in the processor, + so that all instructions following the ISB are fetched from cache or memory, + after the instruction has been completed. + */ +#define __ISB() __builtin_arm_isb(0xF) + +/** + \brief Data Synchronization Barrier + \details Acts as a special kind of Data Memory Barrier. + It completes when all explicit memory accesses before this instruction complete. + */ +#define __DSB() __builtin_arm_dsb(0xF) + + +/** + \brief Data Memory Barrier + \details Ensures the apparent order of the explicit memory operations before + and after the instruction, without ensuring their completion. + */ +#define __DMB() __builtin_arm_dmb(0xF) + + +/** + \brief Reverse byte order (32 bit) + \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412. + \param [in] value Value to reverse + \return Reversed value + */ +#define __REV(value) __builtin_bswap32(value) + + +/** + \brief Reverse byte order (16 bit) + \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856. + \param [in] value Value to reverse + \return Reversed value + */ +#define __REV16(value) __ROR(__REV(value), 16) + + +/** + \brief Reverse byte order (16 bit) + \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000. + \param [in] value Value to reverse + \return Reversed value + */ +#define __REVSH(value) (int16_t)__builtin_bswap16(value) + + +/** + \brief Rotate Right in unsigned value (32 bit) + \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits. 
+ \param [in] op1 Value to rotate + \param [in] op2 Number of Bits to rotate + \return Rotated value + */ +__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2) +{ + op2 %= 32U; + if (op2 == 0U) + { + return op1; + } + return (op1 >> op2) | (op1 << (32U - op2)); +} + + +/** + \brief Breakpoint + \details Causes the processor to enter Debug state. + Debug tools can use this to investigate system state when the instruction at a particular address is reached. + \param [in] value is ignored by the processor. + If required, a debugger can use it to store additional information about the breakpoint. + */ +#define __BKPT(value) __ASM volatile ("bkpt "#value) + + +/** + \brief Reverse bit order of value + \details Reverses the bit order of the given value. + \param [in] value Value to reverse + \return Reversed value + */ +#define __RBIT __builtin_arm_rbit + +/** + \brief Count leading zeros + \details Counts the number of leading zeros of a data value. + \param [in] value Value to count the leading zeros + \return number of leading zeros in value + */ +__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value) +{ + /* Even though __builtin_clz produces a CLZ instruction on ARM, formally + __builtin_clz(0) is undefined behaviour, so handle this case specially. + This guarantees ARM-compatible results if happening to compile on a non-ARM + target, and ensures the compiler doesn't decide to activate any + optimisations using the logic "value was passed to __builtin_clz, so it + is non-zero". + ARM Compiler 6.10 and possibly earlier will optimise this test away, leaving a + single CLZ instruction. 
+ */ + if (value == 0U) + { + return 32U; + } + return __builtin_clz(value); +} + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) + +/** + \brief LDR Exclusive (8 bit) + \details Executes a exclusive LDR instruction for 8 bit value. + \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +#define __LDREXB (uint8_t)__builtin_arm_ldrex + + +/** + \brief LDR Exclusive (16 bit) + \details Executes a exclusive LDR instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +#define __LDREXH (uint16_t)__builtin_arm_ldrex + + +/** + \brief LDR Exclusive (32 bit) + \details Executes a exclusive LDR instruction for 32 bit values. + \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +#define __LDREXW (uint32_t)__builtin_arm_ldrex + + +/** + \brief STR Exclusive (8 bit) + \details Executes a exclusive STR instruction for 8 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +#define __STREXB (uint32_t)__builtin_arm_strex + + +/** + \brief STR Exclusive (16 bit) + \details Executes a exclusive STR instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +#define __STREXH (uint32_t)__builtin_arm_strex + + +/** + \brief STR Exclusive (32 bit) + \details Executes a exclusive STR instruction for 32 bit values. 
+ \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +#define __STREXW (uint32_t)__builtin_arm_strex + + +/** + \brief Remove the exclusive lock + \details Removes the exclusive lock which is created by LDREX. + */ +#define __CLREX __builtin_arm_clrex + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) + +/** + \brief Signed Saturate + \details Saturates a signed value. + \param [in] value Value to be saturated + \param [in] sat Bit position to saturate to (1..32) + \return Saturated value + */ +#define __SSAT __builtin_arm_ssat + + +/** + \brief Unsigned Saturate + \details Saturates an unsigned value. + \param [in] value Value to be saturated + \param [in] sat Bit position to saturate to (0..31) + \return Saturated value + */ +#define __USAT __builtin_arm_usat + + +/** + \brief Rotate Right with Extend (32 bit) + \details Moves each bit of a bitstring right by one bit. + The carry input is shifted in at the left end of the bitstring. + \param [in] value Value to rotate + \return Rotated value + */ +__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value) +{ + uint32_t result; + + __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return(result); +} + + +/** + \brief LDRT Unprivileged (8 bit) + \details Executes a Unprivileged LDRT instruction for 8 bit value. 
+ \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) ); + return ((uint8_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDRT Unprivileged (16 bit) + \details Executes a Unprivileged LDRT instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) ); + return ((uint16_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDRT Unprivileged (32 bit) + \details Executes a Unprivileged LDRT instruction for 32 bit values. + \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) ); + return(result); +} + + +/** + \brief STRT Unprivileged (8 bit) + \details Executes a Unprivileged STRT instruction for 8 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr) +{ + __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) ); +} + + +/** + \brief STRT Unprivileged (16 bit) + \details Executes a Unprivileged STRT instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr) +{ + __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) ); +} + + +/** + \brief STRT Unprivileged (32 bit) + \details Executes a Unprivileged STRT instruction for 32 bit values. 
+ \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr) +{ + __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) ); +} + +#else /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) */ + +/** + \brief Signed Saturate + \details Saturates a signed value. + \param [in] value Value to be saturated + \param [in] sat Bit position to saturate to (1..32) + \return Saturated value + */ +__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat) +{ + if ((sat >= 1U) && (sat <= 32U)) + { + const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U); + const int32_t min = -1 - max ; + if (val > max) + { + return max; + } + else if (val < min) + { + return min; + } + } + return val; +} + +/** + \brief Unsigned Saturate + \details Saturates an unsigned value. 
+ \param [in] value Value to be saturated + \param [in] sat Bit position to saturate to (0..31) + \return Saturated value + */ +__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat) +{ + if (sat <= 31U) + { + const uint32_t max = ((1U << sat) - 1U); + if (val > (int32_t)max) + { + return max; + } + else if (val < 0) + { + return 0U; + } + } + return (uint32_t)val; +} + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) + +/** + \brief Load-Acquire (8 bit) + \details Executes a LDAB instruction for 8 bit value. + \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint8_t) result); +} + + +/** + \brief Load-Acquire (16 bit) + \details Executes a LDAH instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint16_t) result); +} + + +/** + \brief Load-Acquire (32 bit) + \details Executes a LDA instruction for 32 bit values. 
+ \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return(result); +} + + +/** + \brief Store-Release (8 bit) + \details Executes a STLB instruction for 8 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr) +{ + __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Store-Release (16 bit) + \details Executes a STLH instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr) +{ + __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Store-Release (32 bit) + \details Executes a STL instruction for 32 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr) +{ + __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Load-Acquire Exclusive (8 bit) + \details Executes a LDAB exclusive instruction for 8 bit value. + \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +#define __LDAEXB (uint8_t)__builtin_arm_ldaex + + +/** + \brief Load-Acquire Exclusive (16 bit) + \details Executes a LDAH exclusive instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +#define __LDAEXH (uint16_t)__builtin_arm_ldaex + + +/** + \brief Load-Acquire Exclusive (32 bit) + \details Executes a LDA exclusive instruction for 32 bit values. 
+ \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +#define __LDAEX (uint32_t)__builtin_arm_ldaex + + +/** + \brief Store-Release Exclusive (8 bit) + \details Executes a STLB exclusive instruction for 8 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +#define __STLEXB (uint32_t)__builtin_arm_stlex + + +/** + \brief Store-Release Exclusive (16 bit) + \details Executes a STLH exclusive instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +#define __STLEXH (uint32_t)__builtin_arm_stlex + + +/** + \brief Store-Release Exclusive (32 bit) + \details Executes a STL exclusive instruction for 32 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +#define __STLEX (uint32_t)__builtin_arm_stlex + +#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) || \ + (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1)) ) */ + +/*@}*/ /* end of group CMSIS_Core_InstructionInterface */ + + +/* ################### Compiler specific Intrinsics ########################### */ +/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics + Access to dedicated SIMD instructions + @{ +*/ + +#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) + +#define __SADD8 __builtin_arm_sadd8 +#define __QADD8 __builtin_arm_qadd8 +#define __SHADD8 __builtin_arm_shadd8 +#define __UADD8 __builtin_arm_uadd8 +#define __UQADD8 __builtin_arm_uqadd8 +#define __UHADD8 __builtin_arm_uhadd8 +#define __SSUB8 __builtin_arm_ssub8 +#define __QSUB8 __builtin_arm_qsub8 +#define __SHSUB8 __builtin_arm_shsub8 +#define __USUB8 __builtin_arm_usub8 +#define __UQSUB8 
__builtin_arm_uqsub8 +#define __UHSUB8 __builtin_arm_uhsub8 +#define __SADD16 __builtin_arm_sadd16 +#define __QADD16 __builtin_arm_qadd16 +#define __SHADD16 __builtin_arm_shadd16 +#define __UADD16 __builtin_arm_uadd16 +#define __UQADD16 __builtin_arm_uqadd16 +#define __UHADD16 __builtin_arm_uhadd16 +#define __SSUB16 __builtin_arm_ssub16 +#define __QSUB16 __builtin_arm_qsub16 +#define __SHSUB16 __builtin_arm_shsub16 +#define __USUB16 __builtin_arm_usub16 +#define __UQSUB16 __builtin_arm_uqsub16 +#define __UHSUB16 __builtin_arm_uhsub16 +#define __SASX __builtin_arm_sasx +#define __QASX __builtin_arm_qasx +#define __SHASX __builtin_arm_shasx +#define __UASX __builtin_arm_uasx +#define __UQASX __builtin_arm_uqasx +#define __UHASX __builtin_arm_uhasx +#define __SSAX __builtin_arm_ssax +#define __QSAX __builtin_arm_qsax +#define __SHSAX __builtin_arm_shsax +#define __USAX __builtin_arm_usax +#define __UQSAX __builtin_arm_uqsax +#define __UHSAX __builtin_arm_uhsax +#define __USAD8 __builtin_arm_usad8 +#define __USADA8 __builtin_arm_usada8 +#define __SSAT16 __builtin_arm_ssat16 +#define __USAT16 __builtin_arm_usat16 +#define __UXTB16 __builtin_arm_uxtb16 +#define __UXTAB16 __builtin_arm_uxtab16 +#define __SXTB16 __builtin_arm_sxtb16 +#define __SXTAB16 __builtin_arm_sxtab16 +#define __SMUAD __builtin_arm_smuad +#define __SMUADX __builtin_arm_smuadx +#define __SMLAD __builtin_arm_smlad +#define __SMLADX __builtin_arm_smladx +#define __SMLALD __builtin_arm_smlald +#define __SMLALDX __builtin_arm_smlaldx +#define __SMUSD __builtin_arm_smusd +#define __SMUSDX __builtin_arm_smusdx +#define __SMLSD __builtin_arm_smlsd +#define __SMLSDX __builtin_arm_smlsdx +#define __SMLSLD __builtin_arm_smlsld +#define __SMLSLDX __builtin_arm_smlsldx +#define __SEL __builtin_arm_sel +#define __QADD __builtin_arm_qadd +#define __QSUB __builtin_arm_qsub + +#define __PKHBT(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0x0000FFFFUL) | \ + ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) ) + +#define 
__PKHTB(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0xFFFF0000UL) | \ + ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL) ) + +#define __SXTB16_RORn(ARG1, ARG2) __SXTB16(__ROR(ARG1, ARG2)) + +__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3) +{ + int32_t result; + + __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (result): "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +#endif /* (__ARM_FEATURE_DSP == 1) */ +/*@} end of group CMSIS_SIMD_intrinsics */ + + +#endif /* __CMSIS_ARMCLANG_H */ diff --git a/common/mps2/cmsis_compiler.h b/common/mps2/cmsis_compiler.h new file mode 100644 index 0000000..adbf296 --- /dev/null +++ b/common/mps2/cmsis_compiler.h @@ -0,0 +1,283 @@ +/**************************************************************************//** + * @file cmsis_compiler.h + * @brief CMSIS compiler generic header file + * @version V5.1.0 + * @date 09. October 2018 + ******************************************************************************/ +/* + * Copyright (c) 2009-2018 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __CMSIS_COMPILER_H +#define __CMSIS_COMPILER_H + +#include <stdint.h> + +/* + * Arm Compiler 4/5 + */ +#if defined ( __CC_ARM ) + #include "cmsis_armcc.h" + + +/* + * Arm Compiler 6.6 LTM (armclang) + */ +#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) && (__ARMCC_VERSION < 6100100) + #include "cmsis_armclang_ltm.h" + + /* + * Arm Compiler 6.10.1 and above (armclang) + */ +#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) + #include "cmsis_armclang.h" + + +/* + * GNU Compiler + */ +#elif defined ( __GNUC__ ) + #include "cmsis_gcc.h" + + +/* + * IAR Compiler + */ +#elif defined ( __ICCARM__ ) + #include <cmsis_iccarm.h> + + +/* + * TI Arm Compiler + */ +#elif defined ( __TI_ARM__ ) + #include <cmsis_ccs.h> + + #ifndef __ASM + #define __ASM __asm + #endif + #ifndef __INLINE + #define __INLINE inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __STATIC_INLINE + #endif + #ifndef __NO_RETURN + #define __NO_RETURN __attribute__((noreturn)) + #endif + #ifndef __USED + #define __USED __attribute__((used)) + #endif + #ifndef __WEAK + #define __WEAK __attribute__((weak)) + #endif + #ifndef __PACKED + #define __PACKED __attribute__((packed)) + #endif + #ifndef __PACKED_STRUCT + #define __PACKED_STRUCT struct __attribute__((packed)) + #endif + #ifndef __PACKED_UNION + #define __PACKED_UNION union __attribute__((packed)) + #endif + #ifndef __UNALIGNED_UINT32 /* deprecated */ + struct __attribute__((packed)) T_UINT32 { uint32_t v; }; + #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v) + #endif + #ifndef __UNALIGNED_UINT16_WRITE + __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; }; + #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void*)(addr))->v) = (val)) + #endif + #ifndef __UNALIGNED_UINT16_READ + __PACKED_STRUCT T_UINT16_READ { uint16_t v; }; + #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+ #endif + #ifndef __UNALIGNED_UINT32_WRITE + __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; }; + #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val)) + #endif + #ifndef __UNALIGNED_UINT32_READ + __PACKED_STRUCT T_UINT32_READ { uint32_t v; }; + #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v) + #endif + #ifndef __ALIGNED + #define __ALIGNED(x) __attribute__((aligned(x))) + #endif + #ifndef __RESTRICT + #define __RESTRICT __restrict + #endif + #ifndef __COMPILER_BARRIER + #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored. + #define __COMPILER_BARRIER() (void)0 + #endif + + +/* + * TASKING Compiler + */ +#elif defined ( __TASKING__ ) + /* + * The CMSIS functions have been implemented as intrinsics in the compiler. + * Please use "carm -?i" to get an up to date list of all intrinsics, + * Including the CMSIS ones. + */ + + #ifndef __ASM + #define __ASM __asm + #endif + #ifndef __INLINE + #define __INLINE inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __STATIC_INLINE + #endif + #ifndef __NO_RETURN + #define __NO_RETURN __attribute__((noreturn)) + #endif + #ifndef __USED + #define __USED __attribute__((used)) + #endif + #ifndef __WEAK + #define __WEAK __attribute__((weak)) + #endif + #ifndef __PACKED + #define __PACKED __packed__ + #endif + #ifndef __PACKED_STRUCT + #define __PACKED_STRUCT struct __packed__ + #endif + #ifndef __PACKED_UNION + #define __PACKED_UNION union __packed__ + #endif + #ifndef __UNALIGNED_UINT32 /* deprecated */ + struct __packed__ T_UINT32 { uint32_t v; }; + #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v) + #endif + #ifndef __UNALIGNED_UINT16_WRITE + __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; }; + #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void 
*)(addr))->v) = (val)) + #endif + #ifndef __UNALIGNED_UINT16_READ + __PACKED_STRUCT T_UINT16_READ { uint16_t v; }; + #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v) + #endif + #ifndef __UNALIGNED_UINT32_WRITE + __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; }; + #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val)) + #endif + #ifndef __UNALIGNED_UINT32_READ + __PACKED_STRUCT T_UINT32_READ { uint32_t v; }; + #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v) + #endif + #ifndef __ALIGNED + #define __ALIGNED(x) __align(x) + #endif + #ifndef __RESTRICT + #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored. + #define __RESTRICT + #endif + #ifndef __COMPILER_BARRIER + #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored. + #define __COMPILER_BARRIER() (void)0 + #endif + + +/* + * COSMIC Compiler + */ +#elif defined ( __CSMC__ ) + #include <cmsis_csm.h> + + #ifndef __ASM + #define __ASM _asm + #endif + #ifndef __INLINE + #define __INLINE inline + #endif + #ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline + #endif + #ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __STATIC_INLINE + #endif + #ifndef __NO_RETURN + // NO RETURN is automatically detected hence no warning here + #define __NO_RETURN + #endif + #ifndef __USED + #warning No compiler specific solution for __USED. __USED is ignored.
+ #define __USED + #endif + #ifndef __WEAK + #define __WEAK __weak + #endif + #ifndef __PACKED + #define __PACKED @packed + #endif + #ifndef __PACKED_STRUCT + #define __PACKED_STRUCT @packed struct + #endif + #ifndef __PACKED_UNION + #define __PACKED_UNION @packed union + #endif + #ifndef __UNALIGNED_UINT32 /* deprecated */ + @packed struct T_UINT32 { uint32_t v; }; + #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v) + #endif + #ifndef __UNALIGNED_UINT16_WRITE + __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; }; + #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val)) + #endif + #ifndef __UNALIGNED_UINT16_READ + __PACKED_STRUCT T_UINT16_READ { uint16_t v; }; + #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v) + #endif + #ifndef __UNALIGNED_UINT32_WRITE + __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; }; + #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val)) + #endif + #ifndef __UNALIGNED_UINT32_READ + __PACKED_STRUCT T_UINT32_READ { uint32_t v; }; + #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v) + #endif + #ifndef __ALIGNED + #warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored. + #define __ALIGNED(x) + #endif + #ifndef __RESTRICT + #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored. + #define __RESTRICT + #endif + #ifndef __COMPILER_BARRIER + #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored. + #define __COMPILER_BARRIER() (void)0 + #endif + + +#else + #error Unknown compiler. 
+#endif + + +#endif /* __CMSIS_COMPILER_H */ + diff --git a/common/mps2/cmsis_gcc.h b/common/mps2/cmsis_gcc.h new file mode 100644 index 0000000..a2778f5 --- /dev/null +++ b/common/mps2/cmsis_gcc.h @@ -0,0 +1,2177 @@ +/**************************************************************************//** + * @file cmsis_gcc.h + * @brief CMSIS compiler GCC header file + * @version V5.3.0 + * @date 26. March 2020 + ******************************************************************************/ +/* + * Copyright (c) 2009-2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __CMSIS_GCC_H +#define __CMSIS_GCC_H + +/* ignore some GCC warnings */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-conversion" +#pragma GCC diagnostic ignored "-Wconversion" +#pragma GCC diagnostic ignored "-Wunused-parameter" + +/* Fallback for __has_builtin */ +#ifndef __has_builtin + #define __has_builtin(x) (0) +#endif + +/* CMSIS compiler specific defines */ +#ifndef __ASM + #define __ASM __asm +#endif +#ifndef __INLINE + #define __INLINE inline +#endif +#ifndef __STATIC_INLINE + #define __STATIC_INLINE static inline +#endif +#ifndef __STATIC_FORCEINLINE + #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline +#endif +#ifndef __NO_RETURN + #define __NO_RETURN __attribute__((__noreturn__)) +#endif +#ifndef __USED + #define __USED __attribute__((used)) +#endif +#ifndef __WEAK + #define __WEAK __attribute__((weak)) +#endif +#ifndef __PACKED + #define __PACKED __attribute__((packed, aligned(1))) +#endif +#ifndef __PACKED_STRUCT + #define __PACKED_STRUCT struct __attribute__((packed, aligned(1))) +#endif +#ifndef __PACKED_UNION + #define __PACKED_UNION union __attribute__((packed, aligned(1))) +#endif +#ifndef __UNALIGNED_UINT32 /* deprecated */ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + struct __attribute__((packed)) T_UINT32 { uint32_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v) +#endif +#ifndef __UNALIGNED_UINT16_WRITE + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val)) +#endif +#ifndef __UNALIGNED_UINT16_READ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored 
"-Wattributes" + __PACKED_STRUCT T_UINT16_READ { uint16_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v) +#endif +#ifndef __UNALIGNED_UINT32_WRITE + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val)) +#endif +#ifndef __UNALIGNED_UINT32_READ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wpacked" + #pragma GCC diagnostic ignored "-Wattributes" + __PACKED_STRUCT T_UINT32_READ { uint32_t v; }; + #pragma GCC diagnostic pop + #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v) +#endif +#ifndef __ALIGNED + #define __ALIGNED(x) __attribute__((aligned(x))) +#endif +#ifndef __RESTRICT + #define __RESTRICT __restrict +#endif +#ifndef __COMPILER_BARRIER + #define __COMPILER_BARRIER() __ASM volatile("":::"memory") +#endif + +/* ######################### Startup and Lowlevel Init ######################## */ + +#ifndef __PROGRAM_START + +/** + \brief Initializes data and bss sections + \details This default implementations initialized all data and additional bss + sections relying on .copy.table and .zero.table specified properly + in the used linker script. 
+ + */ +__STATIC_FORCEINLINE __NO_RETURN void __cmsis_start(void) +{ + extern void _start(void) __NO_RETURN; + + typedef struct { + uint32_t const* src; + uint32_t* dest; + uint32_t wlen; + } __copy_table_t; + + typedef struct { + uint32_t* dest; + uint32_t wlen; + } __zero_table_t; + + extern const __copy_table_t __copy_table_start__; + extern const __copy_table_t __copy_table_end__; + extern const __zero_table_t __zero_table_start__; + extern const __zero_table_t __zero_table_end__; + + for (__copy_table_t const* pTable = &__copy_table_start__; pTable < &__copy_table_end__; ++pTable) { /* one entry per region to copy */ + for(uint32_t i=0u; i < pTable->wlen; ++i) { /* wlen is a count of 32-bit words */ + pTable->dest[i] = pTable->src[i]; + } + } + + for (__zero_table_t const* pTable = &__zero_table_start__; pTable < &__zero_table_end__; ++pTable) { /* one entry per region to clear */ + for(uint32_t i=0u; i < pTable->wlen; ++i) { /* wlen is a count of 32-bit words */ + pTable->dest[i] = 0u; + } + } + + _start(); +} + +#define __PROGRAM_START __cmsis_start +#endif + +#ifndef __INITIAL_SP +#define __INITIAL_SP __StackTop +#endif + +#ifndef __STACK_LIMIT +#define __STACK_LIMIT __StackLimit +#endif + +#ifndef __VECTOR_TABLE +#define __VECTOR_TABLE __Vectors +#endif + +#ifndef __VECTOR_TABLE_ATTRIBUTE +#define __VECTOR_TABLE_ATTRIBUTE __attribute__((used, section(".vectors"))) +#endif + +/* ########################### Core Function Access ########################### */ +/** \ingroup CMSIS_Core_FunctionInterface + \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions + @{ + */ + +/** + \brief Enable IRQ Interrupts + \details Enables IRQ interrupts by clearing the I-bit in the CPSR. + Can only be executed in Privileged modes. + */ +__STATIC_FORCEINLINE void __enable_irq(void) +{ + __ASM volatile ("cpsie i" : : : "memory"); +} + + +/** + \brief Disable IRQ Interrupts + \details Disables IRQ interrupts by setting the I-bit in the CPSR. + Can only be executed in Privileged modes.
+ */ +__STATIC_FORCEINLINE void __disable_irq(void) +{ + __ASM volatile ("cpsid i" : : : "memory"); +} + + +/** + \brief Get Control Register + \details Returns the content of the Control Register. + \return Control Register value + */ +__STATIC_FORCEINLINE uint32_t __get_CONTROL(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, control" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Control Register (non-secure) + \details Returns the content of the non-secure Control Register when in secure mode. + \return non-secure Control Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, control_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Control Register + \details Writes the given value to the Control Register. + \param [in] control Control Register value to set + */ +__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control) +{ + __ASM volatile ("MSR control, %0" : : "r" (control) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Control Register (non-secure) + \details Writes the given value to the non-secure Control Register when in secure state. + \param [in] control Control Register value to set + */ +__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control) +{ + __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory"); +} +#endif + + +/** + \brief Get IPSR Register + \details Returns the content of the IPSR Register. + \return IPSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_IPSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, ipsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get APSR Register + \details Returns the content of the APSR Register. 
+ \return APSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_APSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, apsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get xPSR Register + \details Returns the content of the xPSR Register. + \return xPSR Register value + */ +__STATIC_FORCEINLINE uint32_t __get_xPSR(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, xpsr" : "=r" (result) ); + return(result); +} + + +/** + \brief Get Process Stack Pointer + \details Returns the current value of the Process Stack Pointer (PSP). + \return PSP Register value + */ +__STATIC_FORCEINLINE uint32_t __get_PSP(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, psp" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Process Stack Pointer (non-secure) + \details Returns the current value of the non-secure Process Stack Pointer (PSP) when in secure state. + \return PSP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, psp_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Process Stack Pointer + \details Assigns the given value to the Process Stack Pointer (PSP). + \param [in] topOfProcStack Process Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack) +{ + __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : ); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Process Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Process Stack Pointer (PSP) when in secure state. 
+ \param [in] topOfProcStack Process Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack) +{ + __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : ); +} +#endif + + +/** + \brief Get Main Stack Pointer + \details Returns the current value of the Main Stack Pointer (MSP). + \return MSP Register value + */ +__STATIC_FORCEINLINE uint32_t __get_MSP(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, msp" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Main Stack Pointer (non-secure) + \details Returns the current value of the non-secure Main Stack Pointer (MSP) when in secure state. + \return MSP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, msp_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Main Stack Pointer + \details Assigns the given value to the Main Stack Pointer (MSP). + \param [in] topOfMainStack Main Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack) +{ + __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : ); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Main Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state. + \param [in] topOfMainStack Main Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack) +{ + __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : ); +} +#endif + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Stack Pointer (non-secure) + \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state. 
+ \return Non-secure SP Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, sp_ns" : "=r" (result) ); + return(result); +} + + +/** + \brief Set Stack Pointer (non-secure) + \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state. + \param [in] topOfStack Non-secure Stack Pointer value to set + */ +__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack) +{ + __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : ); +} +#endif + + +/** + \brief Get Priority Mask + \details Returns the current state of the priority mask bit from the Priority Mask Register. + \return Priority Mask value + */ +__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, primask" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Priority Mask (non-secure) + \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3. + \return Non-secure Priority Mask value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, primask_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Priority Mask + \details Assigns the given value to the Priority Mask Register. The MSR is issued with a "memory" clobber, so the compiler does not reorder memory accesses across it. + \param [in] priMask Priority Mask value to set + */ +__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask) +{ + __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Priority Mask (non-secure) + \details Assigns the given value to the non-secure Priority Mask Register when in secure state.
+ \param [in] priMask Non-secure Priority Mask value to set + */ +__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask) +{ + __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory"); +} +#endif + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) +/** + \brief Enable FIQ + \details Enables fault interrupts by executing CPSIE F, which clears the special-purpose register FAULTMASK. + Can only be executed in Privileged modes. + */ +__STATIC_FORCEINLINE void __enable_fault_irq(void) +{ + __ASM volatile ("cpsie f" : : : "memory"); +} + + +/** + \brief Disable FIQ + \details Disables fault interrupts by executing CPSID F, which sets the special-purpose register FAULTMASK. + Can only be executed in Privileged modes. + */ +__STATIC_FORCEINLINE void __disable_fault_irq(void) +{ + __ASM volatile ("cpsid f" : : : "memory"); +} + + +/** + \brief Get Base Priority + \details Returns the current value of the Base Priority register. + \return Base Priority register value + */ +__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, basepri" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Base Priority (non-secure) + \details Returns the current value of the non-secure Base Priority register when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3. + \return Non-secure Base Priority register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Base Priority + \details Assigns the given value to the Base Priority register.
+ \param [in] basePri Base Priority value to set + */ +__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri) +{ + __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Base Priority (non-secure) + \details Assigns the given value to the non-secure Base Priority register when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3. + \param [in] basePri Non-secure Base Priority value to set + */ +__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri) +{ + __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory"); +} +#endif + + +/** + \brief Set Base Priority with condition + \details Assigns the given value to the Base Priority register (through the basepri_max alias) only if BASEPRI masking is disabled, + or the new value increases the BASEPRI priority level. + \param [in] basePri Base Priority value to set + */ +__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri) +{ + __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory"); +} + + +/** + \brief Get Fault Mask + \details Returns the current value of the Fault Mask register. + \return Fault Mask register value + */ +__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, faultmask" : "=r" (result) ); + return(result); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Fault Mask (non-secure) + \details Returns the current value of the non-secure Fault Mask register when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3. + \return Non-secure Fault Mask register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void) +{ + uint32_t result; + + __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) ); + return(result); +} +#endif + + +/** + \brief Set Fault Mask + \details Assigns the given value to the Fault Mask register.
+ \param [in] faultMask Fault Mask value to set + */ +__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask) +{ + __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory"); +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Fault Mask (non-secure) + \details Assigns the given value to the non-secure Fault Mask register when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3. + \param [in] faultMask Non-secure Fault Mask value to set + */ +__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask) +{ + __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory"); +} +#endif + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) + +/** + \brief Get Process Stack Pointer Limit + Devices without ARMv8-M Main Extensions (e.g. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is always returned in non-secure + mode. + + \details Returns the current value of the Process Stack Pointer Limit (PSPLIM). + \return PSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure PSPLIM is RAZ/WI + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, psplim" : "=r" (result) ); + return result; +#endif +} + +#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Process Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (e.g. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is always returned.
+ + \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state. + \return Non-secure PSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))) + // without main extensions, the non-secure PSPLIM is RAZ/WI + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, psplim_ns" : "=r" (result) ); + return result; +#endif +} +#endif + + +/** + \brief Set Process Stack Pointer Limit + Devices without ARMv8-M Main Extensions (e.g. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored in non-secure + mode. + + \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM). + \param [in] ProcStackPtrLimit Process Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure PSPLIM is RAZ/WI + (void)ProcStackPtrLimit; +#else + __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit)); +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Process Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (e.g. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored. + + \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+ \param [in] ProcStackPtrLimit Non-secure Process Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))) + // without main extensions, the non-secure PSPLIM is RAZ/WI + (void)ProcStackPtrLimit; +#else + __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit)); +#endif +} +#endif + + +/** + \brief Get Main Stack Pointer Limit + Devices without ARMv8-M Main Extensions (e.g. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is always returned in non-secure + mode. + + \details Returns the current value of the Main Stack Pointer Limit (MSPLIM). + \return MSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure MSPLIM is RAZ/WI + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, msplim" : "=r" (result) ); + return result; +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Get Main Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (e.g. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence zero is always returned. + + \details Returns the current value of the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state. + \return Non-secure MSPLIM Register value + */ +__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))) + // without main extensions, the non-secure MSPLIM is RAZ/WI + return 0U; +#else + uint32_t result; + __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) ); + return result; +#endif +} +#endif + + +/** + \brief Set Main Stack Pointer Limit + Devices without ARMv8-M Main Extensions (e.g.
Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored in non-secure + mode. + + \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM). + \param [in] MainStackPtrLimit Main Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \ + (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3))) + // without main extensions, the non-secure MSPLIM is RAZ/WI + (void)MainStackPtrLimit; +#else + __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit)); +#endif +} + + +#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) +/** + \brief Set Main Stack Pointer Limit (non-secure) + Devices without ARMv8-M Main Extensions (e.g. Cortex-M23) lack the non-secure + Stack Pointer Limit register hence the write is silently ignored. + + \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state. + \param [in] MainStackPtrLimit Non-secure Main Stack Pointer Limit value to set + */ +__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit) +{ +#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))) + // without main extensions, the non-secure MSPLIM is RAZ/WI + (void)MainStackPtrLimit; +#else + __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit)); +#endif +} +#endif + +#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */ + + +/** + \brief Get FPSCR + \details Returns the current value of the Floating Point Status/Control register.
+ \return Floating Point Status/Control register value (0 when __FPU_PRESENT and __FPU_USED are not both 1) + */ +__STATIC_FORCEINLINE uint32_t __get_FPSCR(void) +{ +#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \ + (defined (__FPU_USED ) && (__FPU_USED == 1U)) ) +#if __has_builtin(__builtin_arm_get_fpscr) +// Re-enable using built-in when GCC has been fixed +// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2) + /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */ + return __builtin_arm_get_fpscr(); +#else + uint32_t result; + + __ASM volatile ("VMRS %0, fpscr" : "=r" (result) ); + return(result); +#endif +#else + return(0U); +#endif +} + + +/** + \brief Set FPSCR + \details Assigns the given value to the Floating Point Status/Control register. The value is ignored when __FPU_PRESENT and __FPU_USED are not both 1. + \param [in] fpscr Floating Point Status/Control value to set + */ +__STATIC_FORCEINLINE void __set_FPSCR(uint32_t fpscr) +{ +#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \ + (defined (__FPU_USED ) && (__FPU_USED == 1U)) ) +#if __has_builtin(__builtin_arm_set_fpscr) +// Re-enable using built-in when GCC has been fixed +// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2) + /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */ + __builtin_arm_set_fpscr(fpscr); +#else + __ASM volatile ("VMSR fpscr, %0" : : "r" (fpscr) : "vfpcc", "memory"); +#endif +#else + (void)fpscr; +#endif +} + + +/*@} end of CMSIS_Core_RegAccFunctions */ + + +/* ########################## Core Instruction Access ######################### */ +/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface + Access to dedicated instructions + @{ +*/ + +/* Define macros for porting to both thumb1 and thumb2.
+ * For thumb1, use low registers (r0-r7), specified by constraint "l" + * Otherwise, use general registers, specified by constraint "r" */ +#if defined (__thumb__) && !defined (__thumb2__) +#define __CMSIS_GCC_OUT_REG(r) "=l" (r) +#define __CMSIS_GCC_RW_REG(r) "+l" (r) +#define __CMSIS_GCC_USE_REG(r) "l" (r) +#else +#define __CMSIS_GCC_OUT_REG(r) "=r" (r) +#define __CMSIS_GCC_RW_REG(r) "+r" (r) +#define __CMSIS_GCC_USE_REG(r) "r" (r) +#endif + +/** + \brief No Operation + \details No Operation does nothing. This instruction can be used for code alignment purposes. + */ +#define __NOP() __ASM volatile ("nop") + +/** + \brief Wait For Interrupt + \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs. + */ +#define __WFI() __ASM volatile ("wfi":::"memory") + + +/** + \brief Wait For Event + \details Wait For Event is a hint instruction that permits the processor to enter + a low-power state until one of a number of events occurs. + */ +#define __WFE() __ASM volatile ("wfe":::"memory") + + +/** + \brief Send Event + \details Send Event is a hint instruction. It causes an event to be signaled to the CPU. + */ +#define __SEV() __ASM volatile ("sev") + + +/** + \brief Instruction Synchronization Barrier + \details Instruction Synchronization Barrier flushes the pipeline in the processor, + so that all instructions following the ISB are fetched from cache or memory, + after the ISB instruction has completed. + */ +__STATIC_FORCEINLINE void __ISB(void) +{ + __ASM volatile ("isb 0xF":::"memory"); +} + + +/** + \brief Data Synchronization Barrier + \details Acts as a special kind of Data Memory Barrier. + It completes when all explicit memory accesses before this instruction complete.
+ */ +__STATIC_FORCEINLINE void __DSB(void) +{ + __ASM volatile ("dsb 0xF":::"memory"); +} + + +/** + \brief Data Memory Barrier + \details Ensures the apparent order of the explicit memory operations before + and after the instruction, without ensuring their completion. + */ +__STATIC_FORCEINLINE void __DMB(void) +{ + __ASM volatile ("dmb 0xF":::"memory"); +} + + +/** + \brief Reverse byte order (32 bit) + \details Reverses the byte order in an unsigned 32-bit integer value. For example, 0x12345678 becomes 0x78563412. + \param [in] value Value to reverse + \return Reversed value + */ +__STATIC_FORCEINLINE uint32_t __REV(uint32_t value) +{ +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) + return __builtin_bswap32(value); +#else + uint32_t result; + + __ASM ("rev %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return result; +#endif +} + + +/** + \brief Reverse byte order (16 bit) + \details Reverses the byte order within each halfword of a 32-bit word. For example, 0x12345678 becomes 0x34127856. + \param [in] value Value to reverse + \return Reversed value + */ +__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value) +{ + uint32_t result; + + __ASM ("rev16 %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return result; +} + + +/** + \brief Reverse byte order (16 bit, signed result) + \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+ \param [in] value Value to reverse + \return Reversed value + */ +__STATIC_FORCEINLINE int16_t __REVSH(int16_t value) +{ +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + return (int16_t)__builtin_bswap16(value); +#else + int16_t result; + + __ASM ("revsh %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return result; +#endif +} + + +/** + \brief Rotate Right in unsigned value (32 bit) + \details Rotates op1 right by op2 bits. The rotate count is reduced modulo 32, so a count that is a multiple of 32 returns op1 unchanged. + \param [in] op1 Value to rotate + \param [in] op2 Number of bits to rotate (taken modulo 32) + \return Rotated value + */ +__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2) +{ + op2 %= 32U; + if (op2 == 0U) + { + return op1; + } + return (op1 >> op2) | (op1 << (32U - op2)); +} + + +/** + \brief Breakpoint + \details Causes the processor to enter Debug state. + Debug tools can use this to investigate system state when the instruction at a particular address is reached. + \param [in] value is ignored by the processor. + If required, a debugger can use it to store additional information about the breakpoint. + */ +#define __BKPT(value) __ASM volatile ("bkpt "#value) + + +/** + \brief Reverse bit order of value + \details Reverses the bit order of the given value.
+ \param [in] value Value to reverse + \return Reversed value + */ +__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value) +{ + uint32_t result; + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) + __ASM ("rbit %0, %1" : "=r" (result) : "r" (value) ); +#else + uint32_t s = (4U /*sizeof(value)*/ * 8U) - 1U; /* extra shift needed at end */ + + result = value; /* result will be the reversed bits of value; first get LSB of value */ + for (value >>= 1U; value != 0U; value >>= 1U) + { + result <<= 1U; + result |= value & 1U; + s--; + } + result <<= s; /* shift when value's highest bits are zero */ +#endif + return result; +} + + +/** + \brief Count leading zeros + \details Counts the number of leading zeros of a data value. + \param [in] value Value to count the leading zeros + \return number of leading zeros in value (32 when value is 0) + */ +__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value) +{ + /* Even though __builtin_clz produces a CLZ instruction on ARM, formally + __builtin_clz(0) is undefined behaviour, so handle this case specially. + This guarantees ARM-compatible results if happening to compile on a non-ARM + target, and ensures the compiler doesn't decide to activate any + optimisations using the logic "value was passed to __builtin_clz, so it + is non-zero". + ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a + single CLZ instruction. + */ + if (value == 0U) + { + return 32U; + } + return __builtin_clz(value); +} + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) +/** + \brief LDR Exclusive (8 bit) + \details Executes an exclusive LDR instruction (LDREXB) for 8 bit value.
+ \param [in] addr Pointer to data + \return value of type uint8_t at (*addr) + */ +__STATIC_FORCEINLINE uint8_t __LDREXB(volatile uint8_t *addr) +{ + uint32_t result; + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + __ASM volatile ("ldrexb %0, %1" : "=r" (result) : "Q" (*addr) ); +#else + /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not + accepted by assembler. So has to use following less efficient pattern. + */ + __ASM volatile ("ldrexb %0, [%1]" : "=r" (result) : "r" (addr) : "memory" ); +#endif + return ((uint8_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDR Exclusive (16 bit) + \details Executes an exclusive LDR instruction (LDREXH) for 16 bit values. + \param [in] addr Pointer to data + \return value of type uint16_t at (*addr) + */ +__STATIC_FORCEINLINE uint16_t __LDREXH(volatile uint16_t *addr) +{ + uint32_t result; + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + __ASM volatile ("ldrexh %0, %1" : "=r" (result) : "Q" (*addr) ); +#else + /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not + accepted by assembler. So has to use following less efficient pattern. + */ + __ASM volatile ("ldrexh %0, [%1]" : "=r" (result) : "r" (addr) : "memory" ); +#endif + return ((uint16_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDR Exclusive (32 bit) + \details Executes an exclusive LDR instruction (LDREX) for 32 bit values. + \param [in] addr Pointer to data + \return value of type uint32_t at (*addr) + */ +__STATIC_FORCEINLINE uint32_t __LDREXW(volatile uint32_t *addr) +{ + uint32_t result; + + __ASM volatile ("ldrex %0, %1" : "=r" (result) : "Q" (*addr) ); + return(result); +} + + +/** + \brief STR Exclusive (8 bit) + \details Executes an exclusive STR instruction (STREXB) for 8 bit values.
+ \param [in] value Value to store + \param [in] addr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STREXB(uint8_t value, volatile uint8_t *addr) +{ + uint32_t result; + + __ASM volatile ("strexb %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) ); + return(result); +} + + +/** + \brief STR Exclusive (16 bit) + \details Executes an exclusive STR instruction (STREXH) for 16 bit values. + \param [in] value Value to store + \param [in] addr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STREXH(uint16_t value, volatile uint16_t *addr) +{ + uint32_t result; + + __ASM volatile ("strexh %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) ); + return(result); +} + + +/** + \brief STR Exclusive (32 bit) + \details Executes an exclusive STR instruction (STREX) for 32 bit values. + \param [in] value Value to store + \param [in] addr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STREXW(uint32_t value, volatile uint32_t *addr) +{ + uint32_t result; + + __ASM volatile ("strex %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" (value) ); + return(result); +} + + +/** + \brief Remove the exclusive lock + \details Removes the exclusive lock which is created by LDREX.
+ */ +__STATIC_FORCEINLINE void __CLREX(void) +{ + __ASM volatile ("clrex" ::: "memory"); +} + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) +/** + \brief Signed Saturate + \details Saturates a signed value using the SSAT instruction. + \param [in] ARG1 Value to be saturated + \param [in] ARG2 Bit position to saturate to (1..32); passed to the instruction as an immediate ("I" constraint) + \return Saturated value + */ +#define __SSAT(ARG1, ARG2) \ +__extension__ \ +({ \ + int32_t __RES, __ARG1 = (ARG1); \ + __ASM volatile ("ssat %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \ + __RES; \ + }) + + +/** + \brief Unsigned Saturate + \details Saturates an unsigned value using the USAT instruction. + \param [in] ARG1 Value to be saturated + \param [in] ARG2 Bit position to saturate to (0..31); passed to the instruction as an immediate ("I" constraint) + \return Saturated value + */ +#define __USAT(ARG1, ARG2) \ + __extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1); \ + __ASM volatile ("usat %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \ + __RES; \ + }) + + +/** + \brief Rotate Right with Extend (32 bit) + \details Moves each bit of a bitstring right by one bit. + The carry input is shifted in at the left end of the bitstring. + \param [in] value Value to rotate + \return Rotated value + */ +__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value) +{ + uint32_t result; + + __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) ); + return(result); +} + + +/** + \brief LDRT Unprivileged (8 bit) + \details Executes an unprivileged LDRBT instruction for 8 bit value.
+ \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr) +{ + uint32_t result; + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) ); +#else + /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not + accepted by assembler. So has to use following less efficient pattern. + */ + __ASM volatile ("ldrbt %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" ); +#endif + return ((uint8_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDRT Unprivileged (16 bit) + \details Executes an unprivileged LDRHT instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr) +{ + uint32_t result; + +#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) ); +#else + /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not + accepted by assembler. So has to use following less efficient pattern. + */ + __ASM volatile ("ldrht %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" ); +#endif + return ((uint16_t) result); /* Add explicit type cast here */ +} + + +/** + \brief LDRT Unprivileged (32 bit) + \details Executes an unprivileged LDRT instruction for 32 bit values. + \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) ); + return(result); +} + + +/** + \brief STRT Unprivileged (8 bit) + \details Executes an unprivileged STRBT instruction for 8 bit values.
+ \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr) +{ + __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) ); +} + + +/** + \brief STRT Unprivileged (16 bit) + \details Executes an unprivileged STRHT instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr) +{ + __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) ); +} + + +/** + \brief STRT Unprivileged (32 bit) + \details Executes an unprivileged STRT instruction for 32 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr) +{ + __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) ); +} + +#else /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */ + +/** + \brief Signed Saturate + \details Saturates a signed value (software fallback, no SSAT instruction is used). + \param [in] val Value to be saturated + \param [in] sat Bit position to saturate to (1..32); val is returned unchanged for any other sat + \return Saturated value + */ +__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat) +{ + if ((sat >= 1U) && (sat <= 32U)) + { + const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U); + const int32_t min = -1 - max ; + if (val > max) + { + return max; + } + else if (val < min) + { + return min; + } + } + return val; +} + +/** + \brief Unsigned Saturate + \details Saturates an unsigned value.
+ \param [in] val Value to be saturated + \param [in] sat Bit position to saturate to (0..31); val is returned unchanged (cast to uint32_t) for any larger sat + \return Saturated value + */ +__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat) +{ + if (sat <= 31U) + { + const uint32_t max = ((1U << sat) - 1U); + if (val > (int32_t)max) + { + return max; + } + else if (val < 0) + { + return 0U; + } + } + return (uint32_t)val; +} + +#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \ + (defined (__ARM_ARCH_7EM__ ) && (__ARM_ARCH_7EM__ == 1)) || \ + (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) ) */ + + +#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) +/** + \brief Load-Acquire (8 bit) + \details Executes an LDAB instruction for 8 bit value. + \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint8_t) result); +} + + +/** + \brief Load-Acquire (16 bit) + \details Executes an LDAH instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint16_t) result); +} + + +/** + \brief Load-Acquire (32 bit) + \details Executes an LDA instruction for 32 bit values. + \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return(result); +} + + +/** + \brief Store-Release (8 bit) + \details Executes an STLB instruction for 8 bit values.
+ \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr) +{ + __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Store-Release (16 bit) + \details Executes an STLH instruction for 16 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr) +{ + __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Store-Release (32 bit) + \details Executes an STL instruction for 32 bit values. + \param [in] value Value to store + \param [in] ptr Pointer to location + */ +__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr) +{ + __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); +} + + +/** + \brief Load-Acquire Exclusive (8 bit) + \details Executes an LDAEXB (load-acquire exclusive) instruction for 8 bit value. + \param [in] ptr Pointer to data + \return value of type uint8_t at (*ptr) + */ +__STATIC_FORCEINLINE uint8_t __LDAEXB(volatile uint8_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldaexb %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint8_t) result); +} + + +/** + \brief Load-Acquire Exclusive (16 bit) + \details Executes an LDAEXH (load-acquire exclusive) instruction for 16 bit values. + \param [in] ptr Pointer to data + \return value of type uint16_t at (*ptr) + */ +__STATIC_FORCEINLINE uint16_t __LDAEXH(volatile uint16_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldaexh %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return ((uint16_t) result); +} + + +/** + \brief Load-Acquire Exclusive (32 bit) + \details Executes an LDAEX (load-acquire exclusive) instruction for 32 bit values.
+ \param [in] ptr Pointer to data + \return value of type uint32_t at (*ptr) + */ +__STATIC_FORCEINLINE uint32_t __LDAEX(volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("ldaex %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" ); + return(result); +} + + +/** + \brief Store-Release Exclusive (8 bit) + \details Executes a STLEXB instruction for 8 bit values. The status output uses an early-clobber ("=&r") register. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STLEXB(uint8_t value, volatile uint8_t *ptr) +{ + uint32_t result; + + __ASM volatile ("stlexb %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); + return(result); +} + + +/** + \brief Store-Release Exclusive (16 bit) + \details Executes a STLEXH instruction for 16 bit values. The status output uses an early-clobber ("=&r") register. + \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STLEXH(uint16_t value, volatile uint16_t *ptr) +{ + uint32_t result; + + __ASM volatile ("stlexh %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); + return(result); +} + + +/** + \brief Store-Release Exclusive (32 bit) + \details Executes a STLEX instruction for 32 bit values.
+ \param [in] value Value to store + \param [in] ptr Pointer to location + \return 0 Function succeeded + \return 1 Function failed + */ +__STATIC_FORCEINLINE uint32_t __STLEX(uint32_t value, volatile uint32_t *ptr) +{ + uint32_t result; + + __ASM volatile ("stlex %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" ); + return(result); +} + +#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \ + (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1)) ) */ + +/*@}*/ /* end of group CMSIS_Core_InstructionInterface */ + + +/* ################### Compiler specific Intrinsics ########################### */ +/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics + Access to dedicated SIMD instructions + @{ +*/ + +#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)) + +/* Each wrapper in this section emits exactly the one instruction named in its asm string, with op1/op2 passed in registers, and returns the destination register. Of the ADD8 forms below only __SADD8 and __UADD8 use `__ASM volatile`. NOTE(review): presumably because those instructions also update the APSR.GE flags that __SEL consumes -- confirm against the Arm Architecture Reference Manual. */ +__STATIC_FORCEINLINE uint32_t __SADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + + +__STATIC_FORCEINLINE
uint32_t __SSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + + +/* ADD16/SUB16 wrappers: same one-instruction pattern as the SUB8 wrappers above (two register inputs, one register result). Only __SADD16, __UADD16, __SSUB16 and __USUB16 use `__ASM volatile`; the Q/SH/UQ/UH variants use plain `__ASM`. */ +__STATIC_FORCEINLINE uint32_t __SADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqadd16 %0, %1, %2" :
"=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +/* ASX wrappers: __SASX uses `__ASM volatile`, __QASX and __SHASX use plain `__ASM`. */ +__STATIC_FORCEINLINE uint32_t __SASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UASX(uint32_t op1,
uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHASX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +/* SAX wrappers: __SSAX and __USAX use `__ASM volatile`; __QSAX, __SHSAX, __UQSAX and __UHSAX use plain `__ASM`. */ +__STATIC_FORCEINLINE uint32_t __SSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +/* USAD8 takes two register operands; USADA8 passes op3 as an additional third register input. Both use plain (non-volatile) `__ASM`. */ +__STATIC_FORCEINLINE uint32_t __USAD8(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result); +} + +/* __SSAT16 / __USAT16 are statement-expression macros: ARG2 is passed through the "I" (immediate) constraint, so it must be a compile-time constant; ARG1 is evaluated once into __ARG1. Both declare a "cc" clobber. */ +#define __SSAT16(ARG1, ARG2) \ +({ \ + int32_t __RES, __ARG1 = (ARG1); \ + __ASM volatile ("ssat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \ + __RES; \ + }) + +#define __USAT16(ARG1, ARG2) \ +({ \ + uint32_t __RES, __ARG1 = (ARG1); \ + __ASM volatile ("usat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \ + __RES; \ + }) + +__STATIC_FORCEINLINE uint32_t __UXTB16(uint32_t op1) +{ + uint32_t result; + + __ASM ("uxtb16 %0, %1" : "=r" (result) : "r" (op1)); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1) +{ + uint32_t result; + + __ASM ("sxtb16 %0, %1" : "=r" (result) : "r" (op1)); + return(result); +} + +/* Applies SXTB16 to op1 rotated right by `rotate` bits. The single-instruction "sxtb16 ..., ROR #n" form takes its rotation through the "i" (immediate) constraint, so it is emitted only when `rotate` is a compile-time constant equal to 8, 16 or 24. Every other case (including a run-time `rotate`, which the immediate constraint cannot accept) rotates in C and then calls __SXTB16. */ +__STATIC_FORCEINLINE uint32_t __SXTB16_RORn(uint32_t op1, uint32_t rotate) +{ + uint32_t result; + + if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U))) + { + __ASM ("sxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (rotate) ); + } + else + { + rotate %= 32U; /* keep both shift counts below the 32-bit word size */ + result = __SXTB16((rotate == 0U) ? op1 : ((op1 >> rotate) | (op1 << (32U - rotate)))); + } + + return result; +} + +__STATIC_FORCEINLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMUAD (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result;
+ + __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +/* 64-bit accumulate wrappers (__SMLALD, __SMLALDX, __SMLSLD, __SMLSLDX): the uint64_t accumulator is viewed through a union as two 32-bit words, which are tied in and out of the asm with the "0"/"1" matching constraints. The __ARMEB__ branch swaps which word is bound to operand %0 and which to %1. */ +__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc) +{ + union llreg_u{ + uint32_t w32[2]; + uint64_t w64; + } llr; + llr.w64 = acc; + +#ifndef __ARMEB__ /* Little endian */ + __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) ); +#else /* Big endian */ + __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) ); +#endif + + return(llr.w64); +} + +__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc) +{ + union llreg_u{ + uint32_t w32[2]; + uint64_t w64; + } llr; + llr.w64 = acc; + +#ifndef __ARMEB__ /* Little endian */ + __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) ); +#else /* Big endian */ + __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) ); +#endif + + return(llr.w64); +} + +__STATIC_FORCEINLINE uint32_t __SMUSD (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3) +{ + uint32_t result; + + __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result)
: "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +__STATIC_FORCEINLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc) +{ + union llreg_u{ + uint32_t w32[2]; + uint64_t w64; + } llr; + llr.w64 = acc; + +#ifndef __ARMEB__ /* Little endian */ + __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) ); +#else /* Big endian */ + __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) ); +#endif + + return(llr.w64); +} + +__STATIC_FORCEINLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc) +{ + union llreg_u{ + uint32_t w32[2]; + uint64_t w64; + } llr; + llr.w64 = acc; + +#ifndef __ARMEB__ /* Little endian */ + __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) ); +#else /* Big endian */ + __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) ); +#endif + + return(llr.w64); +} + +/* __SEL, __QADD and __QSUB are all `__ASM volatile` wrappers around a single instruction; __QADD/__QSUB take and return int32_t, unlike the uint32_t wrappers around them. */ +__STATIC_FORCEINLINE uint32_t __SEL (uint32_t op1, uint32_t op2) +{ + uint32_t result; + + __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE int32_t __QADD( int32_t op1, int32_t op2) +{ + int32_t result; + + __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +__STATIC_FORCEINLINE int32_t __QSUB( int32_t op1, int32_t op2) +{ + int32_t result; + + __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) ); + return(result); +} + +/* The asm-based __PKHBT/__PKHTB definitions inside this `#if 0` are compiled out. */ +#if 0 +#define __PKHBT(ARG1,ARG2,ARG3) \ +({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + __ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \ + __RES; \ + }) + +#define __PKHTB(ARG1,ARG2,ARG3) \ +({ \ + uint32_t
__RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + if (ARG3 == 0) \ + __ASM ("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \ + else \ + __ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \ + __RES; \ + }) +#endif + +/* Active definitions: plain-C expressions used in place of the asm forms disabled by the `#if 0` above. __PKHBT keeps the low halfword of ARG1 and the high halfword of (ARG2 << ARG3); __PKHTB keeps the high halfword of ARG1 and the low halfword of (ARG2 >> ARG3). Each argument is expanded exactly once. NOTE(review): __PKHTB shifts a uint32_t right (zero fill) whereas the disabled asm form used `asr` -- confirm no caller relies on sign fill. */ +#define __PKHBT(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0x0000FFFFUL) | \ + ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) ) + +#define __PKHTB(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0xFFFF0000UL) | \ + ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL) ) + +/* Single `smmla` instruction with three int32_t register inputs; plain (non-volatile) `__ASM`. */ +__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3) +{ + int32_t result; + + __ASM ("smmla %0, %1, %2, %3" : "=r" (result): "r" (op1), "r" (op2), "r" (op3) ); + return(result); +} + +#endif /* (__ARM_FEATURE_DSP == 1) */ +/*@} end of group CMSIS_SIMD_intrinsics */ + + +#pragma GCC diagnostic pop + +#endif /* __CMSIS_GCC_H */ diff --git a/common/mps2/cmsis_nvic.h b/common/mps2/cmsis_nvic.h new file mode 100644 index 0000000..d71cfe6 --- /dev/null +++ b/common/mps2/cmsis_nvic.h @@ -0,0 +1,47 @@ +/* MPS2 CMSIS Library +* +* Copyright (c) 2006-2018 ARM Limited +* SPDX-License-Identifier: BSD-3-Clause +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are met: +* +* 1. Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimer. +* +* 2. Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimer in the documentation +* and/or other materials provided with the distribution. +* +* 3. Neither the name of the copyright holder nor the names of its contributors +* may be used to endorse or promote products derived from this software without +* specific prior written permission.
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef MBED_CMSIS_NVIC_H +#define MBED_CMSIS_NVIC_H + +#include "memory_zones.h" + +#define NVIC_NUM_VECTORS (16 + 48) +#define NVIC_RAM_VECTOR_ADDRESS ZBT_SRAM2_START // Location of vectors in RAM + +/* + * Size of the whole vector table in bytes. Each vector is on 32 bits. + */ +#define NVIC_VECTORS_SIZE (NVIC_NUM_VECTORS * 4) + +#endif diff --git a/common/mps2/cmsis_version.h b/common/mps2/cmsis_version.h new file mode 100644 index 0000000..2f048e4 --- /dev/null +++ b/common/mps2/cmsis_version.h @@ -0,0 +1,39 @@ +/**************************************************************************//** + * @file cmsis_version.h + * @brief CMSIS Core(M) Version definitions + * @version V5.0.4 + * @date 23. July 2019 + ******************************************************************************/ +/* + * Copyright (c) 2009-2019 ARM Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined ( __ICCARM__ ) + #pragma system_include /* treat file as system include file for MISRA check */ +#elif defined (__clang__) + #pragma clang system_header /* treat file as system include file */ +#endif + +#ifndef __CMSIS_VERSION_H +#define __CMSIS_VERSION_H + +/* CMSIS Version definitions */ +#define __CM_CMSIS_VERSION_MAIN ( 5U) /*!< [31:16] CMSIS Core(M) main version */ +#define __CM_CMSIS_VERSION_SUB ( 4U) /*!< [15:0] CMSIS Core(M) sub version */ +#define __CM_CMSIS_VERSION ((__CM_CMSIS_VERSION_MAIN << 16U) | \ + __CM_CMSIS_VERSION_SUB ) /*!< CMSIS Core(M) version number */ +#endif diff --git a/common/mps2/core_cm4.h b/common/mps2/core_cm4.h new file mode 100644 index 0000000..4e0e886 --- /dev/null +++ b/common/mps2/core_cm4.h @@ -0,0 +1,2129 @@ +/**************************************************************************//** + * @file core_cm4.h + * @brief CMSIS Cortex-M4 Core Peripheral Access Layer Header File + * @version V5.1.1 + * @date 27. March 2020 + ******************************************************************************/ +/* + * Copyright (c) 2009-2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined ( __ICCARM__ ) + #pragma system_include /* treat file as system include file for MISRA check */ +#elif defined (__clang__) + #pragma clang system_header /* treat file as system include file */ +#endif + +#ifndef __CORE_CM4_H_GENERIC +#define __CORE_CM4_H_GENERIC + +#include <stdint.h> /* fixed-width integer types (uint32_t, uint8_t) used by the register definitions in this header */ + +#ifdef __cplusplus + extern "C" { +#endif + +/** + \page CMSIS_MISRA_Exceptions MISRA-C:2004 Compliance Exceptions + CMSIS violates the following MISRA-C:2004 rules: + + \li Required Rule 8.5, object/function definition in header file.<br>
+ Function definitions in header files are used to allow 'inlining'. + + \li Required Rule 18.4, declaration of union type or object of union type: '{...}'.<br>
+ Unions are used for effective representation of core registers. + + \li Advisory Rule 19.7, Function-like macro defined.<br>
+ Function-like macros are used to allow more efficient code. + */ + + +/******************************************************************************* + * CMSIS definitions + ******************************************************************************/ +/** + \ingroup Cortex_M4 + @{ + */ + +#include "cmsis_version.h" + +/* CMSIS CM4 definitions */ +#define __CM4_CMSIS_VERSION_MAIN (__CM_CMSIS_VERSION_MAIN) /*!< \deprecated [31:16] CMSIS HAL main version */ +#define __CM4_CMSIS_VERSION_SUB (__CM_CMSIS_VERSION_SUB) /*!< \deprecated [15:0] CMSIS HAL sub version */ +#define __CM4_CMSIS_VERSION ((__CM4_CMSIS_VERSION_MAIN << 16U) | \ + __CM4_CMSIS_VERSION_SUB ) /*!< \deprecated CMSIS HAL version number */ + +#define __CORTEX_M (4U) /*!< Cortex-M Core */ + +/** __FPU_USED indicates whether an FPU is used or not. + For this, __FPU_PRESENT has to be checked prior to making use of FPU specific registers and functions. + + Each toolchain branch below sets __FPU_USED to 1U only when that compiler's FPU macro is defined and __FPU_PRESENT == 1U, and to 0U when the compiler's FPU macro is not defined. + A compiler FPU macro without __FPU_PRESENT == 1U is a hard #error, except in the (__ARMCC_VERSION >= 6010050) branch, which only issues a #warning and continues with __FPU_USED 0U. +*/ +#if defined ( __CC_ARM ) + #if defined __TARGET_FPU_VFP + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) + #if defined __ARM_FP + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __GNUC__ ) + #if defined (__VFP_FP__) && !defined(__SOFTFP__) + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __ICCARM__ ) + #if defined __ARMVFP__ + #if
defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __TI_ARM__ ) + #if defined __TI_VFP_SUPPORT__ + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __TASKING__ ) + #if defined __FPU_VFP__ + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#elif defined ( __CSMC__ ) + #if ( __CSMC__ & 0x400U) + #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U) + #define __FPU_USED 1U + #else + #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)" + #define __FPU_USED 0U + #endif + #else + #define __FPU_USED 0U + #endif + +#endif + +/* NOTE: the toolchain #if/#elif chain that ends above has no final #else, so __FPU_USED remains undefined for a compiler it does not list. */ +#include "cmsis_compiler.h" /* CMSIS compiler specific defines */ + + +#ifdef __cplusplus +} +#endif + +#endif /* __CORE_CM4_H_GENERIC */ + +#ifndef __CMSIS_GENERIC + +#ifndef __CORE_CM4_H_DEPENDANT +#define __CORE_CM4_H_DEPENDANT + +#ifdef __cplusplus + extern "C" { +#endif + +/* check device defines and use defaults: the fallbacks are applied only when __CHECK_DEVICE_DEFINES is defined, and each one also emits a #warning */ +#if defined __CHECK_DEVICE_DEFINES + #ifndef __CM4_REV + #define __CM4_REV 0x0000U + #warning "__CM4_REV not defined in device header file; using default!" + #endif + + #ifndef __FPU_PRESENT + #define __FPU_PRESENT 0U + #warning "__FPU_PRESENT not defined in device header file; using default!" + #endif + + #ifndef __MPU_PRESENT + #define __MPU_PRESENT 0U + #warning "__MPU_PRESENT not defined in device header file; using default!"
+ #endif + + #ifndef __VTOR_PRESENT + #define __VTOR_PRESENT 1U + #warning "__VTOR_PRESENT not defined in device header file; using default!" + #endif + + #ifndef __NVIC_PRIO_BITS + #define __NVIC_PRIO_BITS 3U + #warning "__NVIC_PRIO_BITS not defined in device header file; using default!" + #endif + + #ifndef __Vendor_SysTickConfig + #define __Vendor_SysTickConfig 0U + #warning "__Vendor_SysTickConfig not defined in device header file; using default!" + #endif +#endif + +/* IO definitions (access restrictions to peripheral registers) */ +/** + \defgroup CMSIS_glob_defs CMSIS Global Defines + + IO Type Qualifiers are used + \li to specify the access to peripheral variables. + \li for automatic generation of peripheral register debug information. + + All qualifiers expand to `volatile`; only the read-only forms add `const` (and __I does so in C only, not in C++). +*/ +#ifdef __cplusplus + #define __I volatile /*!< Defines 'read only' permissions (C++: volatile without const) */ +#else + #define __I volatile const /*!< Defines 'read only' permissions (C: volatile const) */ +#endif +#define __O volatile /*!< Defines 'write only' permissions */ +#define __IO volatile /*!< Defines 'read / write' permissions */ + +/* following defines should be used for structure members */ +#define __IM volatile const /*!< Defines 'read only' structure member permissions */ +#define __OM volatile /*!< Defines 'write only' structure member permissions */ +#define __IOM volatile /*!< Defines 'read / write' structure member permissions */ + +/*@} end of group Cortex_M4 */ + + + +/******************************************************************************* + * Register Abstraction + Core Register contain: + - Core Register + - Core NVIC Register + - Core SCB Register + - Core SysTick Register + - Core Debug Register + - Core MPU Register + - Core FPU Register + ******************************************************************************/ +/** + \defgroup CMSIS_core_register Defines and Type Definitions + \brief Type definitions and defines for Cortex-M processor based devices.
+*/ + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_CORE Status and Control Registers + \brief Core Register type definitions. + @{ + */ + +/** + \brief Union type to access the Application Program Status Register (APSR). + \details The bit-field widths in \c b add up to 32, so \c b overlays the word \c w exactly. + NOTE(review): the bit numbers in the member comments assume the compiler allocates bit-fields starting at bit 0 -- confirm for the toolchain in use. + */ +typedef union +{ + struct + { + uint32_t _reserved0:16; /*!< bit: 0..15 Reserved */ + uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */ + uint32_t _reserved1:7; /*!< bit: 20..26 Reserved */ + uint32_t Q:1; /*!< bit: 27 Saturation condition flag */ + uint32_t V:1; /*!< bit: 28 Overflow condition code flag */ + uint32_t C:1; /*!< bit: 29 Carry condition code flag */ + uint32_t Z:1; /*!< bit: 30 Zero condition code flag */ + uint32_t N:1; /*!< bit: 31 Negative condition code flag */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} APSR_Type; + +/* APSR Register Definitions: one _Pos (bit position) and one _Msk (mask shifted to that position) per named field, for use with the word member w */ +#define APSR_N_Pos 31U /*!< APSR: N Position */ +#define APSR_N_Msk (1UL << APSR_N_Pos) /*!< APSR: N Mask */ + +#define APSR_Z_Pos 30U /*!< APSR: Z Position */ +#define APSR_Z_Msk (1UL << APSR_Z_Pos) /*!< APSR: Z Mask */ + +#define APSR_C_Pos 29U /*!< APSR: C Position */ +#define APSR_C_Msk (1UL << APSR_C_Pos) /*!< APSR: C Mask */ + +#define APSR_V_Pos 28U /*!< APSR: V Position */ +#define APSR_V_Msk (1UL << APSR_V_Pos) /*!< APSR: V Mask */ + +#define APSR_Q_Pos 27U /*!< APSR: Q Position */ +#define APSR_Q_Msk (1UL << APSR_Q_Pos) /*!< APSR: Q Mask */ + +#define APSR_GE_Pos 16U /*!< APSR: GE Position */ +#define APSR_GE_Msk (0xFUL << APSR_GE_Pos) /*!< APSR: GE Mask */ + + +/** + \brief Union type to access the Interrupt Program Status Register (IPSR). + */ +typedef union +{ + struct + { + uint32_t ISR:9; /*!< bit: 0..
8 Exception number */ + uint32_t _reserved0:23; /*!< bit: 9..31 Reserved */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} IPSR_Type; + +/* IPSR Register Definitions (the shift in the mask is commented out because the field position is 0; 0x1FF covers the 9-bit ISR field) */ +#define IPSR_ISR_Pos 0U /*!< IPSR: ISR Position */ +#define IPSR_ISR_Msk (0x1FFUL /*<< IPSR_ISR_Pos*/) /*!< IPSR: ISR Mask */ + + +/** + \brief Union type to access the Special-Purpose Program Status Registers (xPSR). + \details The bit-field widths in \c b add up to 32, so \c b overlays the word \c w exactly. The ICI/IT state is split into two fields (ICI_IT_1 and ICI_IT_2) that are not adjacent in the word. + */ +typedef union +{ + struct + { + uint32_t ISR:9; /*!< bit: 0.. 8 Exception number */ + uint32_t _reserved0:1; /*!< bit: 9 Reserved */ + uint32_t ICI_IT_1:6; /*!< bit: 10..15 ICI/IT part 1 */ + uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */ + uint32_t _reserved1:4; /*!< bit: 20..23 Reserved */ + uint32_t T:1; /*!< bit: 24 Thumb bit */ + uint32_t ICI_IT_2:2; /*!< bit: 25..26 ICI/IT part 2 */ + uint32_t Q:1; /*!< bit: 27 Saturation condition flag */ + uint32_t V:1; /*!< bit: 28 Overflow condition code flag */ + uint32_t C:1; /*!< bit: 29 Carry condition code flag */ + uint32_t Z:1; /*!< bit: 30 Zero condition code flag */ + uint32_t N:1; /*!< bit: 31 Negative condition code flag */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} xPSR_Type; + +/* xPSR Register Definitions */ +#define xPSR_N_Pos 31U /*!< xPSR: N Position */ +#define xPSR_N_Msk (1UL << xPSR_N_Pos) /*!< xPSR: N Mask */ + +#define xPSR_Z_Pos 30U /*!< xPSR: Z Position */ +#define xPSR_Z_Msk (1UL << xPSR_Z_Pos) /*!< xPSR: Z Mask */ + +#define xPSR_C_Pos 29U /*!< xPSR: C Position */ +#define xPSR_C_Msk (1UL << xPSR_C_Pos) /*!< xPSR: C Mask */ + +#define xPSR_V_Pos 28U /*!< xPSR: V Position */ +#define xPSR_V_Msk (1UL << xPSR_V_Pos) /*!< xPSR: V Mask */ + +#define xPSR_Q_Pos 27U /*!< xPSR: Q Position */ +#define xPSR_Q_Msk (1UL << xPSR_Q_Pos) /*!< xPSR: Q Mask */ + +#define xPSR_ICI_IT_2_Pos 25U /*!< xPSR: ICI/IT part 2 Position */ +#define xPSR_ICI_IT_2_Msk (3UL << xPSR_ICI_IT_2_Pos) /*!< xPSR: ICI/IT part 2 Mask
*/ + +#define xPSR_T_Pos 24U /*!< xPSR: T Position */ +#define xPSR_T_Msk (1UL << xPSR_T_Pos) /*!< xPSR: T Mask */ + +#define xPSR_GE_Pos 16U /*!< xPSR: GE Position */ +#define xPSR_GE_Msk (0xFUL << xPSR_GE_Pos) /*!< xPSR: GE Mask */ + +#define xPSR_ICI_IT_1_Pos 10U /*!< xPSR: ICI/IT part 1 Position */ +#define xPSR_ICI_IT_1_Msk (0x3FUL << xPSR_ICI_IT_1_Pos) /*!< xPSR: ICI/IT part 1 Mask */ + +#define xPSR_ISR_Pos 0U /*!< xPSR: ISR Position */ +#define xPSR_ISR_Msk (0x1FFUL /*<< xPSR_ISR_Pos*/) /*!< xPSR: ISR Mask */ + + +/** + \brief Union type to access the Control Registers (CONTROL). + \details Three named one-bit fields (nPRIV, SPSEL, FPCA) plus a 29-bit reserved field fill the 32-bit word \c w. + */ +typedef union +{ + struct + { + uint32_t nPRIV:1; /*!< bit: 0 Execution privilege in Thread mode */ + uint32_t SPSEL:1; /*!< bit: 1 Stack to be used */ + uint32_t FPCA:1; /*!< bit: 2 FP extension active flag */ + uint32_t _reserved0:29; /*!< bit: 3..31 Reserved */ + } b; /*!< Structure used for bit access */ + uint32_t w; /*!< Type used for word access */ +} CONTROL_Type; + +/* CONTROL Register Definitions (the nPRIV mask omits the shift because its position is 0) */ +#define CONTROL_FPCA_Pos 2U /*!< CONTROL: FPCA Position */ +#define CONTROL_FPCA_Msk (1UL << CONTROL_FPCA_Pos) /*!< CONTROL: FPCA Mask */ + +#define CONTROL_SPSEL_Pos 1U /*!< CONTROL: SPSEL Position */ +#define CONTROL_SPSEL_Msk (1UL << CONTROL_SPSEL_Pos) /*!< CONTROL: SPSEL Mask */ + +#define CONTROL_nPRIV_Pos 0U /*!< CONTROL: nPRIV Position */ +#define CONTROL_nPRIV_Msk (1UL /*<< CONTROL_nPRIV_Pos*/) /*!< CONTROL: nPRIV Mask */ + +/*@} end of group CMSIS_CORE */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_NVIC Nested Vectored Interrupt Controller (NVIC) + \brief Type definitions for the NVIC Registers + @{ + */ + +/** + \brief Structure type to access the Nested Vectored Interrupt Controller (NVIC).
+ */ +typedef struct +{ + __IOM uint32_t ISER[8U]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */ + uint32_t RESERVED0[24U]; /*!< Offset: 0x020 Reserved, pads ICER to offset 0x080 */ + __IOM uint32_t ICER[8U]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */ + uint32_t RESERVED1[24U]; /*!< Offset: 0x0A0 Reserved, pads ISPR to offset 0x100 */ + __IOM uint32_t ISPR[8U]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */ + uint32_t RESERVED2[24U]; /*!< Offset: 0x120 Reserved, pads ICPR to offset 0x180 */ + __IOM uint32_t ICPR[8U]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */ + uint32_t RESERVED3[24U]; /*!< Offset: 0x1A0 Reserved, pads IABR to offset 0x200 */ + __IOM uint32_t IABR[8U]; /*!< Offset: 0x200 (R/W) Interrupt Active bit Register */ + uint32_t RESERVED4[56U]; /*!< Offset: 0x220 Reserved, pads IP to offset 0x300 */ + __IOM uint8_t IP[240U]; /*!< Offset: 0x300 (R/W) Interrupt Priority Register (8Bit wide) */ + uint32_t RESERVED5[644U]; /*!< Offset: 0x3F0 Reserved, pads STIR to offset 0xE00 */ + __OM uint32_t STIR; /*!< Offset: 0xE00 ( /W) Software Trigger Interrupt Register */ +} NVIC_Type; + +/* Software Triggered Interrupt Register Definitions */ +#define NVIC_STIR_INTID_Pos 0U /*!< STIR: INTID Position */ +#define NVIC_STIR_INTID_Msk (0x1FFUL /*<< NVIC_STIR_INTID_Pos*/) /*!< STIR: INTID Mask */ + +/*@} end of group CMSIS_NVIC */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_SCB System Control Block (SCB) + \brief Type definitions for the System Control Block Registers + @{ + */ + +/** + \brief Structure type to access the System Control Block (SCB). 
+ */ +typedef struct +{ + __IM uint32_t CPUID; /*!< Offset: 0x000 (R/ ) CPUID Base Register */ + __IOM uint32_t ICSR; /*!< Offset: 0x004 (R/W) Interrupt Control and State Register */ + __IOM uint32_t VTOR; /*!< Offset: 0x008 (R/W) Vector Table Offset Register */ + __IOM uint32_t AIRCR; /*!< Offset: 0x00C (R/W) Application Interrupt and Reset Control Register */ + __IOM uint32_t SCR; /*!< Offset: 0x010 (R/W) System Control Register */ + __IOM uint32_t CCR; /*!< Offset: 0x014 (R/W) Configuration Control Register */ + __IOM uint8_t SHP[12U]; /*!< Offset: 0x018 (R/W) System Handlers Priority Registers (4-7, 8-11, 12-15) */ + __IOM uint32_t SHCSR; /*!< Offset: 0x024 (R/W) System Handler Control and State Register */ + __IOM uint32_t CFSR; /*!< Offset: 0x028 (R/W) Configurable Fault Status Register */ + __IOM uint32_t HFSR; /*!< Offset: 0x02C (R/W) HardFault Status Register */ + __IOM uint32_t DFSR; /*!< Offset: 0x030 (R/W) Debug Fault Status Register */ + __IOM uint32_t MMFAR; /*!< Offset: 0x034 (R/W) MemManage Fault Address Register */ + __IOM uint32_t BFAR; /*!< Offset: 0x038 (R/W) BusFault Address Register */ + __IOM uint32_t AFSR; /*!< Offset: 0x03C (R/W) Auxiliary Fault Status Register */ + __IM uint32_t PFR[2U]; /*!< Offset: 0x040 (R/ ) Processor Feature Register */ + __IM uint32_t DFR; /*!< Offset: 0x048 (R/ ) Debug Feature Register */ + __IM uint32_t ADR; /*!< Offset: 0x04C (R/ ) Auxiliary Feature Register */ + __IM uint32_t MMFR[4U]; /*!< Offset: 0x050 (R/ ) Memory Model Feature Register */ + __IM uint32_t ISAR[5U]; /*!< Offset: 0x060 (R/ ) Instruction Set Attributes Register */ + uint32_t RESERVED0[5U]; /*!< Offset: 0x074 Reserved, pads CPACR to offset 0x088 */ + __IOM uint32_t CPACR; /*!< Offset: 0x088 (R/W) Coprocessor Access Control Register */ +} SCB_Type; + +/* SCB CPUID Register Definitions */ +#define SCB_CPUID_IMPLEMENTER_Pos 24U /*!< SCB CPUID: IMPLEMENTER Position */ +#define SCB_CPUID_IMPLEMENTER_Msk (0xFFUL << SCB_CPUID_IMPLEMENTER_Pos) /*!< SCB CPUID: IMPLEMENTER Mask */ + +#define SCB_CPUID_VARIANT_Pos 
20U /*!< SCB CPUID: VARIANT Position */ +#define SCB_CPUID_VARIANT_Msk (0xFUL << SCB_CPUID_VARIANT_Pos) /*!< SCB CPUID: VARIANT Mask */ + +#define SCB_CPUID_ARCHITECTURE_Pos 16U /*!< SCB CPUID: ARCHITECTURE Position */ +#define SCB_CPUID_ARCHITECTURE_Msk (0xFUL << SCB_CPUID_ARCHITECTURE_Pos) /*!< SCB CPUID: ARCHITECTURE Mask */ + +#define SCB_CPUID_PARTNO_Pos 4U /*!< SCB CPUID: PARTNO Position */ +#define SCB_CPUID_PARTNO_Msk (0xFFFUL << SCB_CPUID_PARTNO_Pos) /*!< SCB CPUID: PARTNO Mask */ + +#define SCB_CPUID_REVISION_Pos 0U /*!< SCB CPUID: REVISION Position */ +#define SCB_CPUID_REVISION_Msk (0xFUL /*<< SCB_CPUID_REVISION_Pos*/) /*!< SCB CPUID: REVISION Mask */ + +/* SCB Interrupt Control State Register Definitions */ +#define SCB_ICSR_NMIPENDSET_Pos 31U /*!< SCB ICSR: NMIPENDSET Position */ +#define SCB_ICSR_NMIPENDSET_Msk (1UL << SCB_ICSR_NMIPENDSET_Pos) /*!< SCB ICSR: NMIPENDSET Mask */ + +#define SCB_ICSR_PENDSVSET_Pos 28U /*!< SCB ICSR: PENDSVSET Position */ +#define SCB_ICSR_PENDSVSET_Msk (1UL << SCB_ICSR_PENDSVSET_Pos) /*!< SCB ICSR: PENDSVSET Mask */ + +#define SCB_ICSR_PENDSVCLR_Pos 27U /*!< SCB ICSR: PENDSVCLR Position */ +#define SCB_ICSR_PENDSVCLR_Msk (1UL << SCB_ICSR_PENDSVCLR_Pos) /*!< SCB ICSR: PENDSVCLR Mask */ + +#define SCB_ICSR_PENDSTSET_Pos 26U /*!< SCB ICSR: PENDSTSET Position */ +#define SCB_ICSR_PENDSTSET_Msk (1UL << SCB_ICSR_PENDSTSET_Pos) /*!< SCB ICSR: PENDSTSET Mask */ + +#define SCB_ICSR_PENDSTCLR_Pos 25U /*!< SCB ICSR: PENDSTCLR Position */ +#define SCB_ICSR_PENDSTCLR_Msk (1UL << SCB_ICSR_PENDSTCLR_Pos) /*!< SCB ICSR: PENDSTCLR Mask */ + +#define SCB_ICSR_ISRPREEMPT_Pos 23U /*!< SCB ICSR: ISRPREEMPT Position */ +#define SCB_ICSR_ISRPREEMPT_Msk (1UL << SCB_ICSR_ISRPREEMPT_Pos) /*!< SCB ICSR: ISRPREEMPT Mask */ + +#define SCB_ICSR_ISRPENDING_Pos 22U /*!< SCB ICSR: ISRPENDING Position */ +#define SCB_ICSR_ISRPENDING_Msk (1UL << SCB_ICSR_ISRPENDING_Pos) /*!< SCB ICSR: ISRPENDING Mask */ + +#define SCB_ICSR_VECTPENDING_Pos 12U /*!< SCB 
ICSR: VECTPENDING Position */ +#define SCB_ICSR_VECTPENDING_Msk (0x1FFUL << SCB_ICSR_VECTPENDING_Pos) /*!< SCB ICSR: VECTPENDING Mask */ + +#define SCB_ICSR_RETTOBASE_Pos 11U /*!< SCB ICSR: RETTOBASE Position */ +#define SCB_ICSR_RETTOBASE_Msk (1UL << SCB_ICSR_RETTOBASE_Pos) /*!< SCB ICSR: RETTOBASE Mask */ + +#define SCB_ICSR_VECTACTIVE_Pos 0U /*!< SCB ICSR: VECTACTIVE Position */ +#define SCB_ICSR_VECTACTIVE_Msk (0x1FFUL /*<< SCB_ICSR_VECTACTIVE_Pos*/) /*!< SCB ICSR: VECTACTIVE Mask */ + +/* SCB Vector Table Offset Register Definitions */ +#define SCB_VTOR_TBLOFF_Pos 7U /*!< SCB VTOR: TBLOFF Position */ +#define SCB_VTOR_TBLOFF_Msk (0x1FFFFFFUL << SCB_VTOR_TBLOFF_Pos) /*!< SCB VTOR: TBLOFF Mask */ + +/* SCB Application Interrupt and Reset Control Register Definitions */ +#define SCB_AIRCR_VECTKEY_Pos 16U /*!< SCB AIRCR: VECTKEY Position */ +#define SCB_AIRCR_VECTKEY_Msk (0xFFFFUL << SCB_AIRCR_VECTKEY_Pos) /*!< SCB AIRCR: VECTKEY Mask */ + +#define SCB_AIRCR_VECTKEYSTAT_Pos 16U /*!< SCB AIRCR: VECTKEYSTAT Position */ +#define SCB_AIRCR_VECTKEYSTAT_Msk (0xFFFFUL << SCB_AIRCR_VECTKEYSTAT_Pos) /*!< SCB AIRCR: VECTKEYSTAT Mask */ + +#define SCB_AIRCR_ENDIANESS_Pos 15U /*!< SCB AIRCR: ENDIANESS Position */ +#define SCB_AIRCR_ENDIANESS_Msk (1UL << SCB_AIRCR_ENDIANESS_Pos) /*!< SCB AIRCR: ENDIANESS Mask */ + +#define SCB_AIRCR_PRIGROUP_Pos 8U /*!< SCB AIRCR: PRIGROUP Position */ +#define SCB_AIRCR_PRIGROUP_Msk (7UL << SCB_AIRCR_PRIGROUP_Pos) /*!< SCB AIRCR: PRIGROUP Mask */ + +#define SCB_AIRCR_SYSRESETREQ_Pos 2U /*!< SCB AIRCR: SYSRESETREQ Position */ +#define SCB_AIRCR_SYSRESETREQ_Msk (1UL << SCB_AIRCR_SYSRESETREQ_Pos) /*!< SCB AIRCR: SYSRESETREQ Mask */ + +#define SCB_AIRCR_VECTCLRACTIVE_Pos 1U /*!< SCB AIRCR: VECTCLRACTIVE Position */ +#define SCB_AIRCR_VECTCLRACTIVE_Msk (1UL << SCB_AIRCR_VECTCLRACTIVE_Pos) /*!< SCB AIRCR: VECTCLRACTIVE Mask */ + +#define SCB_AIRCR_VECTRESET_Pos 0U /*!< SCB AIRCR: VECTRESET Position */ +#define SCB_AIRCR_VECTRESET_Msk (1UL /*<< 
SCB_AIRCR_VECTRESET_Pos*/) /*!< SCB AIRCR: VECTRESET Mask */ + +/* SCB System Control Register Definitions */ +#define SCB_SCR_SEVONPEND_Pos 4U /*!< SCB SCR: SEVONPEND Position */ +#define SCB_SCR_SEVONPEND_Msk (1UL << SCB_SCR_SEVONPEND_Pos) /*!< SCB SCR: SEVONPEND Mask */ + +#define SCB_SCR_SLEEPDEEP_Pos 2U /*!< SCB SCR: SLEEPDEEP Position */ +#define SCB_SCR_SLEEPDEEP_Msk (1UL << SCB_SCR_SLEEPDEEP_Pos) /*!< SCB SCR: SLEEPDEEP Mask */ + +#define SCB_SCR_SLEEPONEXIT_Pos 1U /*!< SCB SCR: SLEEPONEXIT Position */ +#define SCB_SCR_SLEEPONEXIT_Msk (1UL << SCB_SCR_SLEEPONEXIT_Pos) /*!< SCB SCR: SLEEPONEXIT Mask */ + +/* SCB Configuration Control Register Definitions */ +#define SCB_CCR_STKALIGN_Pos 9U /*!< SCB CCR: STKALIGN Position */ +#define SCB_CCR_STKALIGN_Msk (1UL << SCB_CCR_STKALIGN_Pos) /*!< SCB CCR: STKALIGN Mask */ + +#define SCB_CCR_BFHFNMIGN_Pos 8U /*!< SCB CCR: BFHFNMIGN Position */ +#define SCB_CCR_BFHFNMIGN_Msk (1UL << SCB_CCR_BFHFNMIGN_Pos) /*!< SCB CCR: BFHFNMIGN Mask */ + +#define SCB_CCR_DIV_0_TRP_Pos 4U /*!< SCB CCR: DIV_0_TRP Position */ +#define SCB_CCR_DIV_0_TRP_Msk (1UL << SCB_CCR_DIV_0_TRP_Pos) /*!< SCB CCR: DIV_0_TRP Mask */ + +#define SCB_CCR_UNALIGN_TRP_Pos 3U /*!< SCB CCR: UNALIGN_TRP Position */ +#define SCB_CCR_UNALIGN_TRP_Msk (1UL << SCB_CCR_UNALIGN_TRP_Pos) /*!< SCB CCR: UNALIGN_TRP Mask */ + +#define SCB_CCR_USERSETMPEND_Pos 1U /*!< SCB CCR: USERSETMPEND Position */ +#define SCB_CCR_USERSETMPEND_Msk (1UL << SCB_CCR_USERSETMPEND_Pos) /*!< SCB CCR: USERSETMPEND Mask */ + +#define SCB_CCR_NONBASETHRDENA_Pos 0U /*!< SCB CCR: NONBASETHRDENA Position */ +#define SCB_CCR_NONBASETHRDENA_Msk (1UL /*<< SCB_CCR_NONBASETHRDENA_Pos*/) /*!< SCB CCR: NONBASETHRDENA Mask */ + +/* SCB System Handler Control and State Register Definitions */ +#define SCB_SHCSR_USGFAULTENA_Pos 18U /*!< SCB SHCSR: USGFAULTENA Position */ +#define SCB_SHCSR_USGFAULTENA_Msk (1UL << SCB_SHCSR_USGFAULTENA_Pos) /*!< SCB SHCSR: USGFAULTENA Mask */ + +#define 
SCB_SHCSR_BUSFAULTENA_Pos 17U /*!< SCB SHCSR: BUSFAULTENA Position */ +#define SCB_SHCSR_BUSFAULTENA_Msk (1UL << SCB_SHCSR_BUSFAULTENA_Pos) /*!< SCB SHCSR: BUSFAULTENA Mask */ + +#define SCB_SHCSR_MEMFAULTENA_Pos 16U /*!< SCB SHCSR: MEMFAULTENA Position */ +#define SCB_SHCSR_MEMFAULTENA_Msk (1UL << SCB_SHCSR_MEMFAULTENA_Pos) /*!< SCB SHCSR: MEMFAULTENA Mask */ + +#define SCB_SHCSR_SVCALLPENDED_Pos 15U /*!< SCB SHCSR: SVCALLPENDED Position */ +#define SCB_SHCSR_SVCALLPENDED_Msk (1UL << SCB_SHCSR_SVCALLPENDED_Pos) /*!< SCB SHCSR: SVCALLPENDED Mask */ + +#define SCB_SHCSR_BUSFAULTPENDED_Pos 14U /*!< SCB SHCSR: BUSFAULTPENDED Position */ +#define SCB_SHCSR_BUSFAULTPENDED_Msk (1UL << SCB_SHCSR_BUSFAULTPENDED_Pos) /*!< SCB SHCSR: BUSFAULTPENDED Mask */ + +#define SCB_SHCSR_MEMFAULTPENDED_Pos 13U /*!< SCB SHCSR: MEMFAULTPENDED Position */ +#define SCB_SHCSR_MEMFAULTPENDED_Msk (1UL << SCB_SHCSR_MEMFAULTPENDED_Pos) /*!< SCB SHCSR: MEMFAULTPENDED Mask */ + +#define SCB_SHCSR_USGFAULTPENDED_Pos 12U /*!< SCB SHCSR: USGFAULTPENDED Position */ +#define SCB_SHCSR_USGFAULTPENDED_Msk (1UL << SCB_SHCSR_USGFAULTPENDED_Pos) /*!< SCB SHCSR: USGFAULTPENDED Mask */ + +#define SCB_SHCSR_SYSTICKACT_Pos 11U /*!< SCB SHCSR: SYSTICKACT Position */ +#define SCB_SHCSR_SYSTICKACT_Msk (1UL << SCB_SHCSR_SYSTICKACT_Pos) /*!< SCB SHCSR: SYSTICKACT Mask */ + +#define SCB_SHCSR_PENDSVACT_Pos 10U /*!< SCB SHCSR: PENDSVACT Position */ +#define SCB_SHCSR_PENDSVACT_Msk (1UL << SCB_SHCSR_PENDSVACT_Pos) /*!< SCB SHCSR: PENDSVACT Mask */ + +#define SCB_SHCSR_MONITORACT_Pos 8U /*!< SCB SHCSR: MONITORACT Position */ +#define SCB_SHCSR_MONITORACT_Msk (1UL << SCB_SHCSR_MONITORACT_Pos) /*!< SCB SHCSR: MONITORACT Mask */ + +#define SCB_SHCSR_SVCALLACT_Pos 7U /*!< SCB SHCSR: SVCALLACT Position */ +#define SCB_SHCSR_SVCALLACT_Msk (1UL << SCB_SHCSR_SVCALLACT_Pos) /*!< SCB SHCSR: SVCALLACT Mask */ + +#define SCB_SHCSR_USGFAULTACT_Pos 3U /*!< SCB SHCSR: USGFAULTACT Position */ +#define SCB_SHCSR_USGFAULTACT_Msk (1UL << 
SCB_SHCSR_USGFAULTACT_Pos) /*!< SCB SHCSR: USGFAULTACT Mask */ + +#define SCB_SHCSR_BUSFAULTACT_Pos 1U /*!< SCB SHCSR: BUSFAULTACT Position */ +#define SCB_SHCSR_BUSFAULTACT_Msk (1UL << SCB_SHCSR_BUSFAULTACT_Pos) /*!< SCB SHCSR: BUSFAULTACT Mask */ + +#define SCB_SHCSR_MEMFAULTACT_Pos 0U /*!< SCB SHCSR: MEMFAULTACT Position */ +#define SCB_SHCSR_MEMFAULTACT_Msk (1UL /*<< SCB_SHCSR_MEMFAULTACT_Pos*/) /*!< SCB SHCSR: MEMFAULTACT Mask */ + +/* SCB Configurable Fault Status Register Definitions */ +#define SCB_CFSR_USGFAULTSR_Pos 16U /*!< SCB CFSR: Usage Fault Status Register Position */ +#define SCB_CFSR_USGFAULTSR_Msk (0xFFFFUL << SCB_CFSR_USGFAULTSR_Pos) /*!< SCB CFSR: Usage Fault Status Register Mask */ + +#define SCB_CFSR_BUSFAULTSR_Pos 8U /*!< SCB CFSR: Bus Fault Status Register Position */ +#define SCB_CFSR_BUSFAULTSR_Msk (0xFFUL << SCB_CFSR_BUSFAULTSR_Pos) /*!< SCB CFSR: Bus Fault Status Register Mask */ + +#define SCB_CFSR_MEMFAULTSR_Pos 0U /*!< SCB CFSR: Memory Manage Fault Status Register Position */ +#define SCB_CFSR_MEMFAULTSR_Msk (0xFFUL /*<< SCB_CFSR_MEMFAULTSR_Pos*/) /*!< SCB CFSR: Memory Manage Fault Status Register Mask */ + +/* MemManage Fault Status Register (part of SCB Configurable Fault Status Register); bit positions are offsets from SCB_CFSR_MEMFAULTSR_Pos, mirroring the BFSR/UFSR groups */ +#define SCB_CFSR_MMARVALID_Pos (SCB_CFSR_MEMFAULTSR_Pos + 7U) /*!< SCB CFSR (MMFSR): MMARVALID Position */ +#define SCB_CFSR_MMARVALID_Msk (1UL << SCB_CFSR_MMARVALID_Pos) /*!< SCB CFSR (MMFSR): MMARVALID Mask */ + +#define SCB_CFSR_MLSPERR_Pos (SCB_CFSR_MEMFAULTSR_Pos + 5U) /*!< SCB CFSR (MMFSR): MLSPERR Position */ +#define SCB_CFSR_MLSPERR_Msk (1UL << SCB_CFSR_MLSPERR_Pos) /*!< SCB CFSR (MMFSR): MLSPERR Mask */ + +#define SCB_CFSR_MSTKERR_Pos (SCB_CFSR_MEMFAULTSR_Pos + 4U) /*!< SCB CFSR (MMFSR): MSTKERR Position */ +#define SCB_CFSR_MSTKERR_Msk (1UL << SCB_CFSR_MSTKERR_Pos) /*!< SCB CFSR (MMFSR): MSTKERR Mask */ + +#define SCB_CFSR_MUNSTKERR_Pos (SCB_CFSR_MEMFAULTSR_Pos + 3U) /*!< SCB CFSR (MMFSR): MUNSTKERR Position */ +#define 
SCB_CFSR_MUNSTKERR_Msk (1UL << SCB_CFSR_MUNSTKERR_Pos) /*!< SCB CFSR (MMFSR): MUNSTKERR Mask */ + +#define SCB_CFSR_DACCVIOL_Pos (SCB_CFSR_MEMFAULTSR_Pos + 1U) /*!< SCB CFSR (MMFSR): DACCVIOL Position */ +#define SCB_CFSR_DACCVIOL_Msk (1UL << SCB_CFSR_DACCVIOL_Pos) /*!< SCB CFSR (MMFSR): DACCVIOL Mask */ + +#define SCB_CFSR_IACCVIOL_Pos (SCB_CFSR_MEMFAULTSR_Pos + 0U) /*!< SCB CFSR (MMFSR): IACCVIOL Position */ +#define SCB_CFSR_IACCVIOL_Msk (1UL /*<< SCB_CFSR_IACCVIOL_Pos*/) /*!< SCB CFSR (MMFSR): IACCVIOL Mask */ + +/* BusFault Status Register (part of SCB Configurable Fault Status Register) */ +#define SCB_CFSR_BFARVALID_Pos (SCB_CFSR_BUSFAULTSR_Pos + 7U) /*!< SCB CFSR (BFSR): BFARVALID Position */ +#define SCB_CFSR_BFARVALID_Msk (1UL << SCB_CFSR_BFARVALID_Pos) /*!< SCB CFSR (BFSR): BFARVALID Mask */ + +#define SCB_CFSR_LSPERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 5U) /*!< SCB CFSR (BFSR): LSPERR Position */ +#define SCB_CFSR_LSPERR_Msk (1UL << SCB_CFSR_LSPERR_Pos) /*!< SCB CFSR (BFSR): LSPERR Mask */ + +#define SCB_CFSR_STKERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 4U) /*!< SCB CFSR (BFSR): STKERR Position */ +#define SCB_CFSR_STKERR_Msk (1UL << SCB_CFSR_STKERR_Pos) /*!< SCB CFSR (BFSR): STKERR Mask */ + +#define SCB_CFSR_UNSTKERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 3U) /*!< SCB CFSR (BFSR): UNSTKERR Position */ +#define SCB_CFSR_UNSTKERR_Msk (1UL << SCB_CFSR_UNSTKERR_Pos) /*!< SCB CFSR (BFSR): UNSTKERR Mask */ + +#define SCB_CFSR_IMPRECISERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 2U) /*!< SCB CFSR (BFSR): IMPRECISERR Position */ +#define SCB_CFSR_IMPRECISERR_Msk (1UL << SCB_CFSR_IMPRECISERR_Pos) /*!< SCB CFSR (BFSR): IMPRECISERR Mask */ + +#define SCB_CFSR_PRECISERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 1U) /*!< SCB CFSR (BFSR): PRECISERR Position */ +#define SCB_CFSR_PRECISERR_Msk (1UL << SCB_CFSR_PRECISERR_Pos) /*!< SCB CFSR (BFSR): PRECISERR Mask */ + +#define SCB_CFSR_IBUSERR_Pos (SCB_CFSR_BUSFAULTSR_Pos + 0U) /*!< SCB CFSR (BFSR): IBUSERR Position */ +#define SCB_CFSR_IBUSERR_Msk (1UL 
<< SCB_CFSR_IBUSERR_Pos) /*!< SCB CFSR (BFSR): IBUSERR Mask */ + +/* UsageFault Status Register (part of SCB Configurable Fault Status Register) */ +#define SCB_CFSR_DIVBYZERO_Pos (SCB_CFSR_USGFAULTSR_Pos + 9U) /*!< SCB CFSR (UFSR): DIVBYZERO Position */ +#define SCB_CFSR_DIVBYZERO_Msk (1UL << SCB_CFSR_DIVBYZERO_Pos) /*!< SCB CFSR (UFSR): DIVBYZERO Mask */ + +#define SCB_CFSR_UNALIGNED_Pos (SCB_CFSR_USGFAULTSR_Pos + 8U) /*!< SCB CFSR (UFSR): UNALIGNED Position */ +#define SCB_CFSR_UNALIGNED_Msk (1UL << SCB_CFSR_UNALIGNED_Pos) /*!< SCB CFSR (UFSR): UNALIGNED Mask */ + +#define SCB_CFSR_NOCP_Pos (SCB_CFSR_USGFAULTSR_Pos + 3U) /*!< SCB CFSR (UFSR): NOCP Position */ +#define SCB_CFSR_NOCP_Msk (1UL << SCB_CFSR_NOCP_Pos) /*!< SCB CFSR (UFSR): NOCP Mask */ + +#define SCB_CFSR_INVPC_Pos (SCB_CFSR_USGFAULTSR_Pos + 2U) /*!< SCB CFSR (UFSR): INVPC Position */ +#define SCB_CFSR_INVPC_Msk (1UL << SCB_CFSR_INVPC_Pos) /*!< SCB CFSR (UFSR): INVPC Mask */ + +#define SCB_CFSR_INVSTATE_Pos (SCB_CFSR_USGFAULTSR_Pos + 1U) /*!< SCB CFSR (UFSR): INVSTATE Position */ +#define SCB_CFSR_INVSTATE_Msk (1UL << SCB_CFSR_INVSTATE_Pos) /*!< SCB CFSR (UFSR): INVSTATE Mask */ + +#define SCB_CFSR_UNDEFINSTR_Pos (SCB_CFSR_USGFAULTSR_Pos + 0U) /*!< SCB CFSR (UFSR): UNDEFINSTR Position */ +#define SCB_CFSR_UNDEFINSTR_Msk (1UL << SCB_CFSR_UNDEFINSTR_Pos) /*!< SCB CFSR (UFSR): UNDEFINSTR Mask */ + +/* SCB Hard Fault Status Register Definitions */ +#define SCB_HFSR_DEBUGEVT_Pos 31U /*!< SCB HFSR: DEBUGEVT Position */ +#define SCB_HFSR_DEBUGEVT_Msk (1UL << SCB_HFSR_DEBUGEVT_Pos) /*!< SCB HFSR: DEBUGEVT Mask */ + +#define SCB_HFSR_FORCED_Pos 30U /*!< SCB HFSR: FORCED Position */ +#define SCB_HFSR_FORCED_Msk (1UL << SCB_HFSR_FORCED_Pos) /*!< SCB HFSR: FORCED Mask */ + +#define SCB_HFSR_VECTTBL_Pos 1U /*!< SCB HFSR: VECTTBL Position */ +#define SCB_HFSR_VECTTBL_Msk (1UL << SCB_HFSR_VECTTBL_Pos) /*!< SCB HFSR: VECTTBL Mask */ + +/* SCB Debug Fault Status Register Definitions */ +#define SCB_DFSR_EXTERNAL_Pos 
4U /*!< SCB DFSR: EXTERNAL Position */ +#define SCB_DFSR_EXTERNAL_Msk (1UL << SCB_DFSR_EXTERNAL_Pos) /*!< SCB DFSR: EXTERNAL Mask */ + +#define SCB_DFSR_VCATCH_Pos 3U /*!< SCB DFSR: VCATCH Position */ +#define SCB_DFSR_VCATCH_Msk (1UL << SCB_DFSR_VCATCH_Pos) /*!< SCB DFSR: VCATCH Mask */ + +#define SCB_DFSR_DWTTRAP_Pos 2U /*!< SCB DFSR: DWTTRAP Position */ +#define SCB_DFSR_DWTTRAP_Msk (1UL << SCB_DFSR_DWTTRAP_Pos) /*!< SCB DFSR: DWTTRAP Mask */ + +#define SCB_DFSR_BKPT_Pos 1U /*!< SCB DFSR: BKPT Position */ +#define SCB_DFSR_BKPT_Msk (1UL << SCB_DFSR_BKPT_Pos) /*!< SCB DFSR: BKPT Mask */ + +#define SCB_DFSR_HALTED_Pos 0U /*!< SCB DFSR: HALTED Position */ +#define SCB_DFSR_HALTED_Msk (1UL /*<< SCB_DFSR_HALTED_Pos*/) /*!< SCB DFSR: HALTED Mask */ + +/*@} end of group CMSIS_SCB */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_SCnSCB System Controls not in SCB (SCnSCB) + \brief Type definitions for the System Control and ID Register not in the SCB + @{ + */ + +/** + \brief Structure type to access the System Control and ID Register not in the SCB. 
+ */ +typedef struct +{ + uint32_t RESERVED0[1U]; /*!< Offset: 0x000 Reserved, pads ICTR to offset 0x004 */ + __IM uint32_t ICTR; /*!< Offset: 0x004 (R/ ) Interrupt Controller Type Register */ + __IOM uint32_t ACTLR; /*!< Offset: 0x008 (R/W) Auxiliary Control Register */ +} SCnSCB_Type; + +/* Interrupt Controller Type Register Definitions */ +#define SCnSCB_ICTR_INTLINESNUM_Pos 0U /*!< ICTR: INTLINESNUM Position */ +#define SCnSCB_ICTR_INTLINESNUM_Msk (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/) /*!< ICTR: INTLINESNUM Mask */ + +/* Auxiliary Control Register Definitions */ +#define SCnSCB_ACTLR_DISOOFP_Pos 9U /*!< ACTLR: DISOOFP Position */ +#define SCnSCB_ACTLR_DISOOFP_Msk (1UL << SCnSCB_ACTLR_DISOOFP_Pos) /*!< ACTLR: DISOOFP Mask */ + +#define SCnSCB_ACTLR_DISFPCA_Pos 8U /*!< ACTLR: DISFPCA Position */ +#define SCnSCB_ACTLR_DISFPCA_Msk (1UL << SCnSCB_ACTLR_DISFPCA_Pos) /*!< ACTLR: DISFPCA Mask */ + +#define SCnSCB_ACTLR_DISFOLD_Pos 2U /*!< ACTLR: DISFOLD Position */ +#define SCnSCB_ACTLR_DISFOLD_Msk (1UL << SCnSCB_ACTLR_DISFOLD_Pos) /*!< ACTLR: DISFOLD Mask */ + +#define SCnSCB_ACTLR_DISDEFWBUF_Pos 1U /*!< ACTLR: DISDEFWBUF Position */ +#define SCnSCB_ACTLR_DISDEFWBUF_Msk (1UL << SCnSCB_ACTLR_DISDEFWBUF_Pos) /*!< ACTLR: DISDEFWBUF Mask */ + +#define SCnSCB_ACTLR_DISMCYCINT_Pos 0U /*!< ACTLR: DISMCYCINT Position */ +#define SCnSCB_ACTLR_DISMCYCINT_Msk (1UL /*<< SCnSCB_ACTLR_DISMCYCINT_Pos*/) /*!< ACTLR: DISMCYCINT Mask */ + +/*@} end of group CMSIS_SCnSCB */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_SysTick System Tick Timer (SysTick) + \brief Type definitions for the System Timer Registers. + @{ + */ + +/** + \brief Structure type to access the System Timer (SysTick). 
+ */ +typedef struct +{ + __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) SysTick Control and Status Register */ + __IOM uint32_t LOAD; /*!< Offset: 0x004 (R/W) SysTick Reload Value Register */ + __IOM uint32_t VAL; /*!< Offset: 0x008 (R/W) SysTick Current Value Register */ + __IM uint32_t CALIB; /*!< Offset: 0x00C (R/ ) SysTick Calibration Register */ +} SysTick_Type; + +/* SysTick Control / Status Register Definitions */ +#define SysTick_CTRL_COUNTFLAG_Pos 16U /*!< SysTick CTRL: COUNTFLAG Position */ +#define SysTick_CTRL_COUNTFLAG_Msk (1UL << SysTick_CTRL_COUNTFLAG_Pos) /*!< SysTick CTRL: COUNTFLAG Mask */ + +#define SysTick_CTRL_CLKSOURCE_Pos 2U /*!< SysTick CTRL: CLKSOURCE Position */ +#define SysTick_CTRL_CLKSOURCE_Msk (1UL << SysTick_CTRL_CLKSOURCE_Pos) /*!< SysTick CTRL: CLKSOURCE Mask */ + +#define SysTick_CTRL_TICKINT_Pos 1U /*!< SysTick CTRL: TICKINT Position */ +#define SysTick_CTRL_TICKINT_Msk (1UL << SysTick_CTRL_TICKINT_Pos) /*!< SysTick CTRL: TICKINT Mask */ + +#define SysTick_CTRL_ENABLE_Pos 0U /*!< SysTick CTRL: ENABLE Position */ +#define SysTick_CTRL_ENABLE_Msk (1UL /*<< SysTick_CTRL_ENABLE_Pos*/) /*!< SysTick CTRL: ENABLE Mask */ + +/* SysTick Reload Register Definitions */ +#define SysTick_LOAD_RELOAD_Pos 0U /*!< SysTick LOAD: RELOAD Position */ +#define SysTick_LOAD_RELOAD_Msk (0xFFFFFFUL /*<< SysTick_LOAD_RELOAD_Pos*/) /*!< SysTick LOAD: RELOAD Mask */ + +/* SysTick Current Register Definitions */ +#define SysTick_VAL_CURRENT_Pos 0U /*!< SysTick VAL: CURRENT Position */ +#define SysTick_VAL_CURRENT_Msk (0xFFFFFFUL /*<< SysTick_VAL_CURRENT_Pos*/) /*!< SysTick VAL: CURRENT Mask */ + +/* SysTick Calibration Register Definitions */ +#define SysTick_CALIB_NOREF_Pos 31U /*!< SysTick CALIB: NOREF Position */ +#define SysTick_CALIB_NOREF_Msk (1UL << SysTick_CALIB_NOREF_Pos) /*!< SysTick CALIB: NOREF Mask */ + +#define SysTick_CALIB_SKEW_Pos 30U /*!< SysTick CALIB: SKEW Position */ +#define SysTick_CALIB_SKEW_Msk (1UL << SysTick_CALIB_SKEW_Pos) /*!< 
SysTick CALIB: SKEW Mask */ + +#define SysTick_CALIB_TENMS_Pos 0U /*!< SysTick CALIB: TENMS Position */ +#define SysTick_CALIB_TENMS_Msk (0xFFFFFFUL /*<< SysTick_CALIB_TENMS_Pos*/) /*!< SysTick CALIB: TENMS Mask */ + +/*@} end of group CMSIS_SysTick */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_ITM Instrumentation Trace Macrocell (ITM) + \brief Type definitions for the Instrumentation Trace Macrocell (ITM) + @{ + */ + +/** + \brief Structure type to access the Instrumentation Trace Macrocell Register (ITM). + */ +typedef struct +{ + __OM union + { + __OM uint8_t u8; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 8-bit */ + __OM uint16_t u16; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 16-bit */ + __OM uint32_t u32; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 32-bit */ + } PORT [32U]; /*!< Offset: 0x000 ( /W) ITM Stimulus Port Registers */ + uint32_t RESERVED0[864U]; + __IOM uint32_t TER; /*!< Offset: 0xE00 (R/W) ITM Trace Enable Register */ + uint32_t RESERVED1[15U]; + __IOM uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */ + uint32_t RESERVED2[15U]; + __IOM uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */ + uint32_t RESERVED3[32U]; + uint32_t RESERVED4[43U]; + __OM uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */ + __IM uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */ + uint32_t RESERVED5[6U]; + __IM uint32_t PID4; /*!< Offset: 0xFD0 (R/ ) ITM Peripheral Identification Register #4 */ + __IM uint32_t PID5; /*!< Offset: 0xFD4 (R/ ) ITM Peripheral Identification Register #5 */ + __IM uint32_t PID6; /*!< Offset: 0xFD8 (R/ ) ITM Peripheral Identification Register #6 */ + __IM uint32_t PID7; /*!< Offset: 0xFDC (R/ ) ITM Peripheral Identification Register #7 */ + __IM uint32_t PID0; /*!< Offset: 0xFE0 (R/ ) ITM Peripheral Identification Register #0 */ + __IM uint32_t PID1; /*!< Offset: 0xFE4 (R/ ) ITM Peripheral Identification Register #1 */ + __IM uint32_t PID2; /*!< Offset: 0xFE8 (R/ 
) ITM Peripheral Identification Register #2 */ + __IM uint32_t PID3; /*!< Offset: 0xFEC (R/ ) ITM Peripheral Identification Register #3 */ + __IM uint32_t CID0; /*!< Offset: 0xFF0 (R/ ) ITM Component Identification Register #0 */ + __IM uint32_t CID1; /*!< Offset: 0xFF4 (R/ ) ITM Component Identification Register #1 */ + __IM uint32_t CID2; /*!< Offset: 0xFF8 (R/ ) ITM Component Identification Register #2 */ + __IM uint32_t CID3; /*!< Offset: 0xFFC (R/ ) ITM Component Identification Register #3 */ +} ITM_Type; + +/* ITM Trace Privilege Register Definitions */ +#define ITM_TPR_PRIVMASK_Pos 0U /*!< ITM TPR: PRIVMASK Position */ +#define ITM_TPR_PRIVMASK_Msk (0xFFFFFFFFUL /*<< ITM_TPR_PRIVMASK_Pos*/) /*!< ITM TPR: PRIVMASK Mask */ + +/* ITM Trace Control Register Definitions */ +#define ITM_TCR_BUSY_Pos 23U /*!< ITM TCR: BUSY Position */ +#define ITM_TCR_BUSY_Msk (1UL << ITM_TCR_BUSY_Pos) /*!< ITM TCR: BUSY Mask */ + +#define ITM_TCR_TraceBusID_Pos 16U /*!< ITM TCR: ATBID Position */ +#define ITM_TCR_TraceBusID_Msk (0x7FUL << ITM_TCR_TraceBusID_Pos) /*!< ITM TCR: ATBID Mask */ + +#define ITM_TCR_GTSFREQ_Pos 10U /*!< ITM TCR: Global timestamp frequency Position */ +#define ITM_TCR_GTSFREQ_Msk (3UL << ITM_TCR_GTSFREQ_Pos) /*!< ITM TCR: Global timestamp frequency Mask */ + +#define ITM_TCR_TSPrescale_Pos 8U /*!< ITM TCR: TSPrescale Position */ +#define ITM_TCR_TSPrescale_Msk (3UL << ITM_TCR_TSPrescale_Pos) /*!< ITM TCR: TSPrescale Mask */ + +#define ITM_TCR_SWOENA_Pos 4U /*!< ITM TCR: SWOENA Position */ +#define ITM_TCR_SWOENA_Msk (1UL << ITM_TCR_SWOENA_Pos) /*!< ITM TCR: SWOENA Mask */ + +#define ITM_TCR_DWTENA_Pos 3U /*!< ITM TCR: DWTENA Position */ +#define ITM_TCR_DWTENA_Msk (1UL << ITM_TCR_DWTENA_Pos) /*!< ITM TCR: DWTENA Mask */ + +#define ITM_TCR_SYNCENA_Pos 2U /*!< ITM TCR: SYNCENA Position */ +#define ITM_TCR_SYNCENA_Msk (1UL << ITM_TCR_SYNCENA_Pos) /*!< ITM TCR: SYNCENA Mask */ + +#define ITM_TCR_TSENA_Pos 1U /*!< ITM TCR: TSENA Position */ +#define 
ITM_TCR_TSENA_Msk (1UL << ITM_TCR_TSENA_Pos) /*!< ITM TCR: TSENA Mask */ + +#define ITM_TCR_ITMENA_Pos 0U /*!< ITM TCR: ITM Enable bit Position */ +#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */ + +/* ITM Lock Status Register Definitions */ +#define ITM_LSR_ByteAcc_Pos 2U /*!< ITM LSR: ByteAcc Position */ +#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */ + +#define ITM_LSR_Access_Pos 1U /*!< ITM LSR: Access Position */ +#define ITM_LSR_Access_Msk (1UL << ITM_LSR_Access_Pos) /*!< ITM LSR: Access Mask */ + +#define ITM_LSR_Present_Pos 0U /*!< ITM LSR: Present Position */ +#define ITM_LSR_Present_Msk (1UL /*<< ITM_LSR_Present_Pos*/) /*!< ITM LSR: Present Mask */ + +/*@}*/ /* end of group CMSIS_ITM */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_DWT Data Watchpoint and Trace (DWT) + \brief Type definitions for the Data Watchpoint and Trace (DWT) + @{ + */ + +/** + \brief Structure type to access the Data Watchpoint and Trace Register (DWT). 
+ */ +typedef struct +{ + __IOM uint32_t CTRL; /*!< Offset: 0x000 (R/W) Control Register */ + __IOM uint32_t CYCCNT; /*!< Offset: 0x004 (R/W) Cycle Count Register */ + __IOM uint32_t CPICNT; /*!< Offset: 0x008 (R/W) CPI Count Register */ + __IOM uint32_t EXCCNT; /*!< Offset: 0x00C (R/W) Exception Overhead Count Register */ + __IOM uint32_t SLEEPCNT; /*!< Offset: 0x010 (R/W) Sleep Count Register */ + __IOM uint32_t LSUCNT; /*!< Offset: 0x014 (R/W) LSU Count Register */ + __IOM uint32_t FOLDCNT; /*!< Offset: 0x018 (R/W) Folded-instruction Count Register */ + __IM uint32_t PCSR; /*!< Offset: 0x01C (R/ ) Program Counter Sample Register */ + __IOM uint32_t COMP0; /*!< Offset: 0x020 (R/W) Comparator Register 0 */ + __IOM uint32_t MASK0; /*!< Offset: 0x024 (R/W) Mask Register 0 */ + __IOM uint32_t FUNCTION0; /*!< Offset: 0x028 (R/W) Function Register 0 */ + uint32_t RESERVED0[1U]; /*!< Offset: 0x02C Reserved, pads COMP1 to offset 0x030 */ + __IOM uint32_t COMP1; /*!< Offset: 0x030 (R/W) Comparator Register 1 */ + __IOM uint32_t MASK1; /*!< Offset: 0x034 (R/W) Mask Register 1 */ + __IOM uint32_t FUNCTION1; /*!< Offset: 0x038 (R/W) Function Register 1 */ + uint32_t RESERVED1[1U]; /*!< Offset: 0x03C Reserved, pads COMP2 to offset 0x040 */ + __IOM uint32_t COMP2; /*!< Offset: 0x040 (R/W) Comparator Register 2 */ + __IOM uint32_t MASK2; /*!< Offset: 0x044 (R/W) Mask Register 2 */ + __IOM uint32_t FUNCTION2; /*!< Offset: 0x048 (R/W) Function Register 2 */ + uint32_t RESERVED2[1U]; /*!< Offset: 0x04C Reserved, pads COMP3 to offset 0x050 */ + __IOM uint32_t COMP3; /*!< Offset: 0x050 (R/W) Comparator Register 3 */ + __IOM uint32_t MASK3; /*!< Offset: 0x054 (R/W) Mask Register 3 */ + __IOM uint32_t FUNCTION3; /*!< Offset: 0x058 (R/W) Function Register 3 */ +} DWT_Type; + +/* DWT Control Register Definitions */ +#define DWT_CTRL_NUMCOMP_Pos 28U /*!< DWT CTRL: NUMCOMP Position */ +#define DWT_CTRL_NUMCOMP_Msk (0xFUL << DWT_CTRL_NUMCOMP_Pos) /*!< DWT CTRL: NUMCOMP Mask */ + +#define DWT_CTRL_NOTRCPKT_Pos 27U /*!< DWT CTRL: NOTRCPKT Position */ +#define DWT_CTRL_NOTRCPKT_Msk (0x1UL << DWT_CTRL_NOTRCPKT_Pos) /*!< DWT CTRL: NOTRCPKT Mask */ + +#define 
DWT_CTRL_NOEXTTRIG_Pos 26U /*!< DWT CTRL: NOEXTTRIG Position */ +#define DWT_CTRL_NOEXTTRIG_Msk (0x1UL << DWT_CTRL_NOEXTTRIG_Pos) /*!< DWT CTRL: NOEXTTRIG Mask */ + +#define DWT_CTRL_NOCYCCNT_Pos 25U /*!< DWT CTRL: NOCYCCNT Position */ +#define DWT_CTRL_NOCYCCNT_Msk (0x1UL << DWT_CTRL_NOCYCCNT_Pos) /*!< DWT CTRL: NOCYCCNT Mask */ + +#define DWT_CTRL_NOPRFCNT_Pos 24U /*!< DWT CTRL: NOPRFCNT Position */ +#define DWT_CTRL_NOPRFCNT_Msk (0x1UL << DWT_CTRL_NOPRFCNT_Pos) /*!< DWT CTRL: NOPRFCNT Mask */ + +#define DWT_CTRL_CYCEVTENA_Pos 22U /*!< DWT CTRL: CYCEVTENA Position */ +#define DWT_CTRL_CYCEVTENA_Msk (0x1UL << DWT_CTRL_CYCEVTENA_Pos) /*!< DWT CTRL: CYCEVTENA Mask */ + +#define DWT_CTRL_FOLDEVTENA_Pos 21U /*!< DWT CTRL: FOLDEVTENA Position */ +#define DWT_CTRL_FOLDEVTENA_Msk (0x1UL << DWT_CTRL_FOLDEVTENA_Pos) /*!< DWT CTRL: FOLDEVTENA Mask */ + +#define DWT_CTRL_LSUEVTENA_Pos 20U /*!< DWT CTRL: LSUEVTENA Position */ +#define DWT_CTRL_LSUEVTENA_Msk (0x1UL << DWT_CTRL_LSUEVTENA_Pos) /*!< DWT CTRL: LSUEVTENA Mask */ + +#define DWT_CTRL_SLEEPEVTENA_Pos 19U /*!< DWT CTRL: SLEEPEVTENA Position */ +#define DWT_CTRL_SLEEPEVTENA_Msk (0x1UL << DWT_CTRL_SLEEPEVTENA_Pos) /*!< DWT CTRL: SLEEPEVTENA Mask */ + +#define DWT_CTRL_EXCEVTENA_Pos 18U /*!< DWT CTRL: EXCEVTENA Position */ +#define DWT_CTRL_EXCEVTENA_Msk (0x1UL << DWT_CTRL_EXCEVTENA_Pos) /*!< DWT CTRL: EXCEVTENA Mask */ + +#define DWT_CTRL_CPIEVTENA_Pos 17U /*!< DWT CTRL: CPIEVTENA Position */ +#define DWT_CTRL_CPIEVTENA_Msk (0x1UL << DWT_CTRL_CPIEVTENA_Pos) /*!< DWT CTRL: CPIEVTENA Mask */ + +#define DWT_CTRL_EXCTRCENA_Pos 16U /*!< DWT CTRL: EXCTRCENA Position */ +#define DWT_CTRL_EXCTRCENA_Msk (0x1UL << DWT_CTRL_EXCTRCENA_Pos) /*!< DWT CTRL: EXCTRCENA Mask */ + +#define DWT_CTRL_PCSAMPLENA_Pos 12U /*!< DWT CTRL: PCSAMPLENA Position */ +#define DWT_CTRL_PCSAMPLENA_Msk (0x1UL << DWT_CTRL_PCSAMPLENA_Pos) /*!< DWT CTRL: PCSAMPLENA Mask */ + +#define DWT_CTRL_SYNCTAP_Pos 10U /*!< DWT CTRL: SYNCTAP Position */ +#define 
DWT_CTRL_SYNCTAP_Msk (0x3UL << DWT_CTRL_SYNCTAP_Pos) /*!< DWT CTRL: SYNCTAP Mask */ + +#define DWT_CTRL_CYCTAP_Pos 9U /*!< DWT CTRL: CYCTAP Position */ +#define DWT_CTRL_CYCTAP_Msk (0x1UL << DWT_CTRL_CYCTAP_Pos) /*!< DWT CTRL: CYCTAP Mask */ + +#define DWT_CTRL_POSTINIT_Pos 5U /*!< DWT CTRL: POSTINIT Position */ +#define DWT_CTRL_POSTINIT_Msk (0xFUL << DWT_CTRL_POSTINIT_Pos) /*!< DWT CTRL: POSTINIT Mask */ + +#define DWT_CTRL_POSTPRESET_Pos 1U /*!< DWT CTRL: POSTPRESET Position */ +#define DWT_CTRL_POSTPRESET_Msk (0xFUL << DWT_CTRL_POSTPRESET_Pos) /*!< DWT CTRL: POSTPRESET Mask */ + +#define DWT_CTRL_CYCCNTENA_Pos 0U /*!< DWT CTRL: CYCCNTENA Position */ +#define DWT_CTRL_CYCCNTENA_Msk (0x1UL /*<< DWT_CTRL_CYCCNTENA_Pos*/) /*!< DWT CTRL: CYCCNTENA Mask */ + +/* DWT CPI Count Register Definitions */ +#define DWT_CPICNT_CPICNT_Pos 0U /*!< DWT CPICNT: CPICNT Position */ +#define DWT_CPICNT_CPICNT_Msk (0xFFUL /*<< DWT_CPICNT_CPICNT_Pos*/) /*!< DWT CPICNT: CPICNT Mask */ + +/* DWT Exception Overhead Count Register Definitions */ +#define DWT_EXCCNT_EXCCNT_Pos 0U /*!< DWT EXCCNT: EXCCNT Position */ +#define DWT_EXCCNT_EXCCNT_Msk (0xFFUL /*<< DWT_EXCCNT_EXCCNT_Pos*/) /*!< DWT EXCCNT: EXCCNT Mask */ + +/* DWT Sleep Count Register Definitions */ +#define DWT_SLEEPCNT_SLEEPCNT_Pos 0U /*!< DWT SLEEPCNT: SLEEPCNT Position */ +#define DWT_SLEEPCNT_SLEEPCNT_Msk (0xFFUL /*<< DWT_SLEEPCNT_SLEEPCNT_Pos*/) /*!< DWT SLEEPCNT: SLEEPCNT Mask */ + +/* DWT LSU Count Register Definitions */ +#define DWT_LSUCNT_LSUCNT_Pos 0U /*!< DWT LSUCNT: LSUCNT Position */ +#define DWT_LSUCNT_LSUCNT_Msk (0xFFUL /*<< DWT_LSUCNT_LSUCNT_Pos*/) /*!< DWT LSUCNT: LSUCNT Mask */ + +/* DWT Folded-instruction Count Register Definitions */ +#define DWT_FOLDCNT_FOLDCNT_Pos 0U /*!< DWT FOLDCNT: FOLDCNT Position */ +#define DWT_FOLDCNT_FOLDCNT_Msk (0xFFUL /*<< DWT_FOLDCNT_FOLDCNT_Pos*/) /*!< DWT FOLDCNT: FOLDCNT Mask */ + +/* DWT Comparator Mask Register Definitions */ +#define DWT_MASK_MASK_Pos 0U /*!< DWT MASK: 
MASK Position */ +#define DWT_MASK_MASK_Msk (0x1FUL /*<< DWT_MASK_MASK_Pos*/) /*!< DWT MASK: MASK Mask */ + +/* DWT Comparator Function Register Definitions */ +#define DWT_FUNCTION_MATCHED_Pos 24U /*!< DWT FUNCTION: MATCHED Position */ +#define DWT_FUNCTION_MATCHED_Msk (0x1UL << DWT_FUNCTION_MATCHED_Pos) /*!< DWT FUNCTION: MATCHED Mask */ + +#define DWT_FUNCTION_DATAVADDR1_Pos 16U /*!< DWT FUNCTION: DATAVADDR1 Position */ +#define DWT_FUNCTION_DATAVADDR1_Msk (0xFUL << DWT_FUNCTION_DATAVADDR1_Pos) /*!< DWT FUNCTION: DATAVADDR1 Mask */ + +#define DWT_FUNCTION_DATAVADDR0_Pos 12U /*!< DWT FUNCTION: DATAVADDR0 Position */ +#define DWT_FUNCTION_DATAVADDR0_Msk (0xFUL << DWT_FUNCTION_DATAVADDR0_Pos) /*!< DWT FUNCTION: DATAVADDR0 Mask */ + +#define DWT_FUNCTION_DATAVSIZE_Pos 10U /*!< DWT FUNCTION: DATAVSIZE Position */ +#define DWT_FUNCTION_DATAVSIZE_Msk (0x3UL << DWT_FUNCTION_DATAVSIZE_Pos) /*!< DWT FUNCTION: DATAVSIZE Mask */ + +#define DWT_FUNCTION_LNK1ENA_Pos 9U /*!< DWT FUNCTION: LNK1ENA Position */ +#define DWT_FUNCTION_LNK1ENA_Msk (0x1UL << DWT_FUNCTION_LNK1ENA_Pos) /*!< DWT FUNCTION: LNK1ENA Mask */ + +#define DWT_FUNCTION_DATAVMATCH_Pos 8U /*!< DWT FUNCTION: DATAVMATCH Position */ +#define DWT_FUNCTION_DATAVMATCH_Msk (0x1UL << DWT_FUNCTION_DATAVMATCH_Pos) /*!< DWT FUNCTION: DATAVMATCH Mask */ + +#define DWT_FUNCTION_CYCMATCH_Pos 7U /*!< DWT FUNCTION: CYCMATCH Position */ +#define DWT_FUNCTION_CYCMATCH_Msk (0x1UL << DWT_FUNCTION_CYCMATCH_Pos) /*!< DWT FUNCTION: CYCMATCH Mask */ + +#define DWT_FUNCTION_EMITRANGE_Pos 5U /*!< DWT FUNCTION: EMITRANGE Position */ +#define DWT_FUNCTION_EMITRANGE_Msk (0x1UL << DWT_FUNCTION_EMITRANGE_Pos) /*!< DWT FUNCTION: EMITRANGE Mask */ + +#define DWT_FUNCTION_FUNCTION_Pos 0U /*!< DWT FUNCTION: FUNCTION Position */ +#define DWT_FUNCTION_FUNCTION_Msk (0xFUL /*<< DWT_FUNCTION_FUNCTION_Pos*/) /*!< DWT FUNCTION: FUNCTION Mask */ + +/*@}*/ /* end of group CMSIS_DWT */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_TPI Trace Port 
Interface (TPI) + \brief Type definitions for the Trace Port Interface (TPI) + @{ + */ + +/** + \brief Structure type to access the Trace Port Interface Register (TPI). + */ +typedef struct +{ + __IM uint32_t SSPSR; /*!< Offset: 0x000 (R/ ) Supported Parallel Port Size Register */ + __IOM uint32_t CSPSR; /*!< Offset: 0x004 (R/W) Current Parallel Port Size Register */ + uint32_t RESERVED0[2U]; + __IOM uint32_t ACPR; /*!< Offset: 0x010 (R/W) Asynchronous Clock Prescaler Register */ + uint32_t RESERVED1[55U]; + __IOM uint32_t SPPR; /*!< Offset: 0x0F0 (R/W) Selected Pin Protocol Register */ + uint32_t RESERVED2[131U]; + __IM uint32_t FFSR; /*!< Offset: 0x300 (R/ ) Formatter and Flush Status Register */ + __IOM uint32_t FFCR; /*!< Offset: 0x304 (R/W) Formatter and Flush Control Register */ + __IM uint32_t FSCR; /*!< Offset: 0x308 (R/ ) Formatter Synchronization Counter Register */ + uint32_t RESERVED3[759U]; + __IM uint32_t TRIGGER; /*!< Offset: 0xEE8 (R/ ) TRIGGER Register */ + __IM uint32_t FIFO0; /*!< Offset: 0xEEC (R/ ) Integration ETM Data */ + __IM uint32_t ITATBCTR2; /*!< Offset: 0xEF0 (R/ ) ITATBCTR2 */ + uint32_t RESERVED4[1U]; + __IM uint32_t ITATBCTR0; /*!< Offset: 0xEF8 (R/ ) ITATBCTR0 */ + __IM uint32_t FIFO1; /*!< Offset: 0xEFC (R/ ) Integration ITM Data */ + __IOM uint32_t ITCTRL; /*!< Offset: 0xF00 (R/W) Integration Mode Control */ + uint32_t RESERVED5[39U]; + __IOM uint32_t CLAIMSET; /*!< Offset: 0xFA0 (R/W) Claim tag set */ + __IOM uint32_t CLAIMCLR; /*!< Offset: 0xFA4 (R/W) Claim tag clear */ + uint32_t RESERVED7[8U]; + __IM uint32_t DEVID; /*!< Offset: 0xFC8 (R/ ) TPIU_DEVID */ + __IM uint32_t DEVTYPE; /*!< Offset: 0xFCC (R/ ) TPIU_DEVTYPE */ +} TPI_Type; + +/* TPI Asynchronous Clock Prescaler Register Definitions */ +#define TPI_ACPR_PRESCALER_Pos 0U /*!< TPI ACPR: PRESCALER Position */ +#define TPI_ACPR_PRESCALER_Msk (0x1FFFUL /*<< TPI_ACPR_PRESCALER_Pos*/) /*!< TPI ACPR: PRESCALER Mask */ + +/* TPI Selected Pin Protocol Register Definitions */ 
+#define TPI_SPPR_TXMODE_Pos 0U /*!< TPI SPPR: TXMODE Position */ +#define TPI_SPPR_TXMODE_Msk (0x3UL /*<< TPI_SPPR_TXMODE_Pos*/) /*!< TPI SPPR: TXMODE Mask */ + +/* TPI Formatter and Flush Status Register Definitions */ +#define TPI_FFSR_FtNonStop_Pos 3U /*!< TPI FFSR: FtNonStop Position */ +#define TPI_FFSR_FtNonStop_Msk (0x1UL << TPI_FFSR_FtNonStop_Pos) /*!< TPI FFSR: FtNonStop Mask */ + +#define TPI_FFSR_TCPresent_Pos 2U /*!< TPI FFSR: TCPresent Position */ +#define TPI_FFSR_TCPresent_Msk (0x1UL << TPI_FFSR_TCPresent_Pos) /*!< TPI FFSR: TCPresent Mask */ + +#define TPI_FFSR_FtStopped_Pos 1U /*!< TPI FFSR: FtStopped Position */ +#define TPI_FFSR_FtStopped_Msk (0x1UL << TPI_FFSR_FtStopped_Pos) /*!< TPI FFSR: FtStopped Mask */ + +#define TPI_FFSR_FlInProg_Pos 0U /*!< TPI FFSR: FlInProg Position */ +#define TPI_FFSR_FlInProg_Msk (0x1UL /*<< TPI_FFSR_FlInProg_Pos*/) /*!< TPI FFSR: FlInProg Mask */ + +/* TPI Formatter and Flush Control Register Definitions */ +#define TPI_FFCR_TrigIn_Pos 8U /*!< TPI FFCR: TrigIn Position */ +#define TPI_FFCR_TrigIn_Msk (0x1UL << TPI_FFCR_TrigIn_Pos) /*!< TPI FFCR: TrigIn Mask */ + +#define TPI_FFCR_EnFCont_Pos 1U /*!< TPI FFCR: EnFCont Position */ +#define TPI_FFCR_EnFCont_Msk (0x1UL << TPI_FFCR_EnFCont_Pos) /*!< TPI FFCR: EnFCont Mask */ + +/* TPI TRIGGER Register Definitions */ +#define TPI_TRIGGER_TRIGGER_Pos 0U /*!< TPI TRIGGER: TRIGGER Position */ +#define TPI_TRIGGER_TRIGGER_Msk (0x1UL /*<< TPI_TRIGGER_TRIGGER_Pos*/) /*!< TPI TRIGGER: TRIGGER Mask */ + +/* TPI Integration ETM Data Register Definitions (FIFO0) */ +#define TPI_FIFO0_ITM_ATVALID_Pos 29U /*!< TPI FIFO0: ITM_ATVALID Position */ +#define TPI_FIFO0_ITM_ATVALID_Msk (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask */ + +#define TPI_FIFO0_ITM_bytecount_Pos 27U /*!< TPI FIFO0: ITM_bytecount Position */ +#define TPI_FIFO0_ITM_bytecount_Msk (0x3UL << TPI_FIFO0_ITM_bytecount_Pos) /*!< TPI FIFO0: ITM_bytecount Mask */ + +#define TPI_FIFO0_ETM_ATVALID_Pos 
26U /*!< TPI FIFO0: ETM_ATVALID Position */ +#define TPI_FIFO0_ETM_ATVALID_Msk (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask */ + +#define TPI_FIFO0_ETM_bytecount_Pos 24U /*!< TPI FIFO0: ETM_bytecount Position */ +#define TPI_FIFO0_ETM_bytecount_Msk (0x3UL << TPI_FIFO0_ETM_bytecount_Pos) /*!< TPI FIFO0: ETM_bytecount Mask */ + +#define TPI_FIFO0_ETM2_Pos 16U /*!< TPI FIFO0: ETM2 Position */ +#define TPI_FIFO0_ETM2_Msk (0xFFUL << TPI_FIFO0_ETM2_Pos) /*!< TPI FIFO0: ETM2 Mask */ + +#define TPI_FIFO0_ETM1_Pos 8U /*!< TPI FIFO0: ETM1 Position */ +#define TPI_FIFO0_ETM1_Msk (0xFFUL << TPI_FIFO0_ETM1_Pos) /*!< TPI FIFO0: ETM1 Mask */ + +#define TPI_FIFO0_ETM0_Pos 0U /*!< TPI FIFO0: ETM0 Position */ +#define TPI_FIFO0_ETM0_Msk (0xFFUL /*<< TPI_FIFO0_ETM0_Pos*/) /*!< TPI FIFO0: ETM0 Mask */ + +/* TPI ITATBCTR2 Register Definitions */ +#define TPI_ITATBCTR2_ATREADY2_Pos 0U /*!< TPI ITATBCTR2: ATREADY2 Position */ +#define TPI_ITATBCTR2_ATREADY2_Msk (0x1UL /*<< TPI_ITATBCTR2_ATREADY2_Pos*/) /*!< TPI ITATBCTR2: ATREADY2 Mask */ + +#define TPI_ITATBCTR2_ATREADY1_Pos 0U /*!< TPI ITATBCTR2: ATREADY1 Position */ +#define TPI_ITATBCTR2_ATREADY1_Msk (0x1UL /*<< TPI_ITATBCTR2_ATREADY1_Pos*/) /*!< TPI ITATBCTR2: ATREADY1 Mask */ + +/* TPI Integration ITM Data Register Definitions (FIFO1) */ +#define TPI_FIFO1_ITM_ATVALID_Pos 29U /*!< TPI FIFO1: ITM_ATVALID Position */ +#define TPI_FIFO1_ITM_ATVALID_Msk (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask */ + +#define TPI_FIFO1_ITM_bytecount_Pos 27U /*!< TPI FIFO1: ITM_bytecount Position */ +#define TPI_FIFO1_ITM_bytecount_Msk (0x3UL << TPI_FIFO1_ITM_bytecount_Pos) /*!< TPI FIFO1: ITM_bytecount Mask */ + +#define TPI_FIFO1_ETM_ATVALID_Pos 26U /*!< TPI FIFO1: ETM_ATVALID Position */ +#define TPI_FIFO1_ETM_ATVALID_Msk (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask */ + +#define TPI_FIFO1_ETM_bytecount_Pos 24U /*!< TPI FIFO1: ETM_bytecount Position */ +#define 
TPI_FIFO1_ETM_bytecount_Msk (0x3UL << TPI_FIFO1_ETM_bytecount_Pos) /*!< TPI FIFO1: ETM_bytecount Mask */ + +#define TPI_FIFO1_ITM2_Pos 16U /*!< TPI FIFO1: ITM2 Position */ +#define TPI_FIFO1_ITM2_Msk (0xFFUL << TPI_FIFO1_ITM2_Pos) /*!< TPI FIFO1: ITM2 Mask */ + +#define TPI_FIFO1_ITM1_Pos 8U /*!< TPI FIFO1: ITM1 Position */ +#define TPI_FIFO1_ITM1_Msk (0xFFUL << TPI_FIFO1_ITM1_Pos) /*!< TPI FIFO1: ITM1 Mask */ + +#define TPI_FIFO1_ITM0_Pos 0U /*!< TPI FIFO1: ITM0 Position */ +#define TPI_FIFO1_ITM0_Msk (0xFFUL /*<< TPI_FIFO1_ITM0_Pos*/) /*!< TPI FIFO1: ITM0 Mask */ + +/* TPI ITATBCTR0 Register Definitions */ +#define TPI_ITATBCTR0_ATREADY2_Pos 0U /*!< TPI ITATBCTR0: ATREADY2 Position */ +#define TPI_ITATBCTR0_ATREADY2_Msk (0x1UL /*<< TPI_ITATBCTR0_ATREADY2_Pos*/) /*!< TPI ITATBCTR0: ATREADY2 Mask */ + +#define TPI_ITATBCTR0_ATREADY1_Pos 0U /*!< TPI ITATBCTR0: ATREADY1 Position */ +#define TPI_ITATBCTR0_ATREADY1_Msk (0x1UL /*<< TPI_ITATBCTR0_ATREADY1_Pos*/) /*!< TPI ITATBCTR0: ATREADY1 Mask */ + +/* TPI Integration Mode Control Register Definitions */ +#define TPI_ITCTRL_Mode_Pos 0U /*!< TPI ITCTRL: Mode Position */ +#define TPI_ITCTRL_Mode_Msk (0x3UL /*<< TPI_ITCTRL_Mode_Pos*/) /*!< TPI ITCTRL: Mode Mask */ + +/* TPI DEVID Register Definitions */ +#define TPI_DEVID_NRZVALID_Pos 11U /*!< TPI DEVID: NRZVALID Position */ +#define TPI_DEVID_NRZVALID_Msk (0x1UL << TPI_DEVID_NRZVALID_Pos) /*!< TPI DEVID: NRZVALID Mask */ + +#define TPI_DEVID_MANCVALID_Pos 10U /*!< TPI DEVID: MANCVALID Position */ +#define TPI_DEVID_MANCVALID_Msk (0x1UL << TPI_DEVID_MANCVALID_Pos) /*!< TPI DEVID: MANCVALID Mask */ + +#define TPI_DEVID_PTINVALID_Pos 9U /*!< TPI DEVID: PTINVALID Position */ +#define TPI_DEVID_PTINVALID_Msk (0x1UL << TPI_DEVID_PTINVALID_Pos) /*!< TPI DEVID: PTINVALID Mask */ + +#define TPI_DEVID_MinBufSz_Pos 6U /*!< TPI DEVID: MinBufSz Position */ +#define TPI_DEVID_MinBufSz_Msk (0x7UL << TPI_DEVID_MinBufSz_Pos) /*!< TPI DEVID: MinBufSz Mask */ + +#define 
TPI_DEVID_AsynClkIn_Pos 5U /*!< TPI DEVID: AsynClkIn Position */ +#define TPI_DEVID_AsynClkIn_Msk (0x1UL << TPI_DEVID_AsynClkIn_Pos) /*!< TPI DEVID: AsynClkIn Mask */ + +#define TPI_DEVID_NrTraceInput_Pos 0U /*!< TPI DEVID: NrTraceInput Position */ +#define TPI_DEVID_NrTraceInput_Msk (0x1FUL /*<< TPI_DEVID_NrTraceInput_Pos*/) /*!< TPI DEVID: NrTraceInput Mask */ + +/* TPI DEVTYPE Register Definitions (SubType: bits [7:4], MajorType: bits [3:0]) */ +#define TPI_DEVTYPE_SubType_Pos 4U /*!< TPI DEVTYPE: SubType Position */ +#define TPI_DEVTYPE_SubType_Msk (0xFUL << TPI_DEVTYPE_SubType_Pos) /*!< TPI DEVTYPE: SubType Mask (must be shifted: the field starts at bit 4) */ + +#define TPI_DEVTYPE_MajorType_Pos 0U /*!< TPI DEVTYPE: MajorType Position */ +#define TPI_DEVTYPE_MajorType_Msk (0xFUL /*<< TPI_DEVTYPE_MajorType_Pos*/) /*!< TPI DEVTYPE: MajorType Mask */ + +/*@}*/ /* end of group CMSIS_TPI */ + + +#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U) +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_MPU Memory Protection Unit (MPU) + \brief Type definitions for the Memory Protection Unit (MPU) + @{ + */ + +/** + \brief Structure type to access the Memory Protection Unit (MPU). 
+ */ +typedef struct +{ + __IM uint32_t TYPE; /*!< Offset: 0x000 (R/ ) MPU Type Register */ + __IOM uint32_t CTRL; /*!< Offset: 0x004 (R/W) MPU Control Register */ + __IOM uint32_t RNR; /*!< Offset: 0x008 (R/W) MPU Region Number Register */ + __IOM uint32_t RBAR; /*!< Offset: 0x00C (R/W) MPU Region Base Address Register */ + __IOM uint32_t RASR; /*!< Offset: 0x010 (R/W) MPU Region Attribute and Size Register */ + __IOM uint32_t RBAR_A1; /*!< Offset: 0x014 (R/W) MPU Alias 1 Region Base Address Register */ + __IOM uint32_t RASR_A1; /*!< Offset: 0x018 (R/W) MPU Alias 1 Region Attribute and Size Register */ + __IOM uint32_t RBAR_A2; /*!< Offset: 0x01C (R/W) MPU Alias 2 Region Base Address Register */ + __IOM uint32_t RASR_A2; /*!< Offset: 0x020 (R/W) MPU Alias 2 Region Attribute and Size Register */ + __IOM uint32_t RBAR_A3; /*!< Offset: 0x024 (R/W) MPU Alias 3 Region Base Address Register */ + __IOM uint32_t RASR_A3; /*!< Offset: 0x028 (R/W) MPU Alias 3 Region Attribute and Size Register */ +} MPU_Type; + +#define MPU_TYPE_RALIASES 4U /*!< Equals the number of RBAR/RASR register pairs declared in MPU_Type (base pair plus aliases A1..A3) */ + +/* MPU Type Register Definitions */ +#define MPU_TYPE_IREGION_Pos 16U /*!< MPU TYPE: IREGION Position */ +#define MPU_TYPE_IREGION_Msk (0xFFUL << MPU_TYPE_IREGION_Pos) /*!< MPU TYPE: IREGION Mask */ + +#define MPU_TYPE_DREGION_Pos 8U /*!< MPU TYPE: DREGION Position */ +#define MPU_TYPE_DREGION_Msk (0xFFUL << MPU_TYPE_DREGION_Pos) /*!< MPU TYPE: DREGION Mask */ + +#define MPU_TYPE_SEPARATE_Pos 0U /*!< MPU TYPE: SEPARATE Position */ +#define MPU_TYPE_SEPARATE_Msk (1UL /*<< MPU_TYPE_SEPARATE_Pos*/) /*!< MPU TYPE: SEPARATE Mask */ + +/* MPU Control Register Definitions */ +#define MPU_CTRL_PRIVDEFENA_Pos 2U /*!< MPU CTRL: PRIVDEFENA Position */ +#define MPU_CTRL_PRIVDEFENA_Msk (1UL << MPU_CTRL_PRIVDEFENA_Pos) /*!< MPU CTRL: PRIVDEFENA Mask */ + +#define MPU_CTRL_HFNMIENA_Pos 1U /*!< MPU CTRL: HFNMIENA Position */ +#define MPU_CTRL_HFNMIENA_Msk (1UL << MPU_CTRL_HFNMIENA_Pos) /*!< MPU CTRL: HFNMIENA Mask */ + +#define MPU_CTRL_ENABLE_Pos 0U 
/*!< MPU CTRL: ENABLE Position */ +#define MPU_CTRL_ENABLE_Msk (1UL /*<< MPU_CTRL_ENABLE_Pos*/) /*!< MPU CTRL: ENABLE Mask */ + +/* MPU Region Number Register Definitions */ +#define MPU_RNR_REGION_Pos 0U /*!< MPU RNR: REGION Position */ +#define MPU_RNR_REGION_Msk (0xFFUL /*<< MPU_RNR_REGION_Pos*/) /*!< MPU RNR: REGION Mask */ + +/* MPU Region Base Address Register Definitions */ +#define MPU_RBAR_ADDR_Pos 5U /*!< MPU RBAR: ADDR Position */ +#define MPU_RBAR_ADDR_Msk (0x7FFFFFFUL << MPU_RBAR_ADDR_Pos) /*!< MPU RBAR: ADDR Mask */ + +#define MPU_RBAR_VALID_Pos 4U /*!< MPU RBAR: VALID Position */ +#define MPU_RBAR_VALID_Msk (1UL << MPU_RBAR_VALID_Pos) /*!< MPU RBAR: VALID Mask */ + +#define MPU_RBAR_REGION_Pos 0U /*!< MPU RBAR: REGION Position */ +#define MPU_RBAR_REGION_Msk (0xFUL /*<< MPU_RBAR_REGION_Pos*/) /*!< MPU RBAR: REGION Mask */ + +/* MPU Region Attribute and Size Register Definitions */ +#define MPU_RASR_ATTRS_Pos 16U /*!< MPU RASR: MPU Region Attribute field Position */ +#define MPU_RASR_ATTRS_Msk (0xFFFFUL << MPU_RASR_ATTRS_Pos) /*!< MPU RASR: MPU Region Attribute field Mask */ + +#define MPU_RASR_XN_Pos 28U /*!< MPU RASR: ATTRS.XN Position */ +#define MPU_RASR_XN_Msk (1UL << MPU_RASR_XN_Pos) /*!< MPU RASR: ATTRS.XN Mask */ + +#define MPU_RASR_AP_Pos 24U /*!< MPU RASR: ATTRS.AP Position */ +#define MPU_RASR_AP_Msk (0x7UL << MPU_RASR_AP_Pos) /*!< MPU RASR: ATTRS.AP Mask */ + +#define MPU_RASR_TEX_Pos 19U /*!< MPU RASR: ATTRS.TEX Position */ +#define MPU_RASR_TEX_Msk (0x7UL << MPU_RASR_TEX_Pos) /*!< MPU RASR: ATTRS.TEX Mask */ + +#define MPU_RASR_S_Pos 18U /*!< MPU RASR: ATTRS.S Position */ +#define MPU_RASR_S_Msk (1UL << MPU_RASR_S_Pos) /*!< MPU RASR: ATTRS.S Mask */ + +#define MPU_RASR_C_Pos 17U /*!< MPU RASR: ATTRS.C Position */ +#define MPU_RASR_C_Msk (1UL << MPU_RASR_C_Pos) /*!< MPU RASR: ATTRS.C Mask */ + +#define MPU_RASR_B_Pos 16U /*!< MPU RASR: ATTRS.B Position */ +#define MPU_RASR_B_Msk (1UL << MPU_RASR_B_Pos) /*!< MPU RASR: ATTRS.B Mask */ + 
+#define MPU_RASR_SRD_Pos 8U /*!< MPU RASR: Sub-Region Disable Position */ +#define MPU_RASR_SRD_Msk (0xFFUL << MPU_RASR_SRD_Pos) /*!< MPU RASR: Sub-Region Disable Mask */ + +#define MPU_RASR_SIZE_Pos 1U /*!< MPU RASR: Region Size Field Position */ +#define MPU_RASR_SIZE_Msk (0x1FUL << MPU_RASR_SIZE_Pos) /*!< MPU RASR: Region Size Field Mask */ + +#define MPU_RASR_ENABLE_Pos 0U /*!< MPU RASR: Region enable bit Position */ +#define MPU_RASR_ENABLE_Msk (1UL /*<< MPU_RASR_ENABLE_Pos*/) /*!< MPU RASR: Region enable bit Mask */ + +/*@} end of group CMSIS_MPU */ +#endif /* defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U) */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_FPU Floating Point Unit (FPU) + \brief Type definitions for the Floating Point Unit (FPU) + @{ + */ + +/** + \brief Structure type to access the Floating Point Unit (FPU). + */ +typedef struct +{ + uint32_t RESERVED0[1U]; /*!< Offset: 0x000 Reserved word; keeps FPCCR at offset 0x004 */ + __IOM uint32_t FPCCR; /*!< Offset: 0x004 (R/W) Floating-Point Context Control Register */ + __IOM uint32_t FPCAR; /*!< Offset: 0x008 (R/W) Floating-Point Context Address Register */ + __IOM uint32_t FPDSCR; /*!< Offset: 0x00C (R/W) Floating-Point Default Status Control Register */ + __IM uint32_t MVFR0; /*!< Offset: 0x010 (R/ ) Media and FP Feature Register 0 */ + __IM uint32_t MVFR1; /*!< Offset: 0x014 (R/ ) Media and FP Feature Register 1 */ + __IM uint32_t MVFR2; /*!< Offset: 0x018 (R/ ) Media and FP Feature Register 2 */ +} FPU_Type; + +/* Floating-Point Context Control Register Definitions */ +#define FPU_FPCCR_ASPEN_Pos 31U /*!< FPCCR: ASPEN bit Position */ +#define FPU_FPCCR_ASPEN_Msk (1UL << FPU_FPCCR_ASPEN_Pos) /*!< FPCCR: ASPEN bit Mask */ + +#define FPU_FPCCR_LSPEN_Pos 30U /*!< FPCCR: LSPEN bit Position */ +#define FPU_FPCCR_LSPEN_Msk (1UL << FPU_FPCCR_LSPEN_Pos) /*!< FPCCR: LSPEN bit Mask */ + +#define FPU_FPCCR_MONRDY_Pos 8U /*!< FPCCR: MONRDY bit Position */ +#define FPU_FPCCR_MONRDY_Msk (1UL << FPU_FPCCR_MONRDY_Pos) /*!< FPCCR: MONRDY bit Mask */ + +#define 
FPU_FPCCR_BFRDY_Pos 6U /*!< FPCCR: BFRDY Position */ +#define FPU_FPCCR_BFRDY_Msk (1UL << FPU_FPCCR_BFRDY_Pos) /*!< FPCCR: BFRDY bit Mask */ + +#define FPU_FPCCR_MMRDY_Pos 5U /*!< FPCCR: MMRDY Position */ +#define FPU_FPCCR_MMRDY_Msk (1UL << FPU_FPCCR_MMRDY_Pos) /*!< FPCCR: MMRDY bit Mask */ + +#define FPU_FPCCR_HFRDY_Pos 4U /*!< FPCCR: HFRDY Position */ +#define FPU_FPCCR_HFRDY_Msk (1UL << FPU_FPCCR_HFRDY_Pos) /*!< FPCCR: HFRDY bit Mask */ + +#define FPU_FPCCR_THREAD_Pos 3U /*!< FPCCR: processor mode bit Position */ +#define FPU_FPCCR_THREAD_Msk (1UL << FPU_FPCCR_THREAD_Pos) /*!< FPCCR: processor mode active bit Mask */ + +#define FPU_FPCCR_USER_Pos 1U /*!< FPCCR: privilege level bit Position */ +#define FPU_FPCCR_USER_Msk (1UL << FPU_FPCCR_USER_Pos) /*!< FPCCR: privilege level bit Mask */ + +#define FPU_FPCCR_LSPACT_Pos 0U /*!< FPCCR: Lazy state preservation active bit Position */ +#define FPU_FPCCR_LSPACT_Msk (1UL /*<< FPU_FPCCR_LSPACT_Pos*/) /*!< FPCCR: Lazy state preservation active bit Mask */ + +/* Floating-Point Context Address Register Definitions */ +#define FPU_FPCAR_ADDRESS_Pos 3U /*!< FPCAR: ADDRESS bit Position */ +#define FPU_FPCAR_ADDRESS_Msk (0x1FFFFFFFUL << FPU_FPCAR_ADDRESS_Pos) /*!< FPCAR: ADDRESS bit Mask */ + +/* Floating-Point Default Status Control Register Definitions */ +#define FPU_FPDSCR_AHP_Pos 26U /*!< FPDSCR: AHP bit Position */ +#define FPU_FPDSCR_AHP_Msk (1UL << FPU_FPDSCR_AHP_Pos) /*!< FPDSCR: AHP bit Mask */ + +#define FPU_FPDSCR_DN_Pos 25U /*!< FPDSCR: DN bit Position */ +#define FPU_FPDSCR_DN_Msk (1UL << FPU_FPDSCR_DN_Pos) /*!< FPDSCR: DN bit Mask */ + +#define FPU_FPDSCR_FZ_Pos 24U /*!< FPDSCR: FZ bit Position */ +#define FPU_FPDSCR_FZ_Msk (1UL << FPU_FPDSCR_FZ_Pos) /*!< FPDSCR: FZ bit Mask */ + +#define FPU_FPDSCR_RMode_Pos 22U /*!< FPDSCR: RMode bit Position */ +#define FPU_FPDSCR_RMode_Msk (3UL << FPU_FPDSCR_RMode_Pos) /*!< FPDSCR: RMode bit Mask */ + +/* Media and FP Feature Register 0 Definitions */ +#define 
FPU_MVFR0_FP_rounding_modes_Pos 28U /*!< MVFR0: FP rounding modes bits Position */ +#define FPU_MVFR0_FP_rounding_modes_Msk (0xFUL << FPU_MVFR0_FP_rounding_modes_Pos) /*!< MVFR0: FP rounding modes bits Mask */ + +#define FPU_MVFR0_Short_vectors_Pos 24U /*!< MVFR0: Short vectors bits Position */ +#define FPU_MVFR0_Short_vectors_Msk (0xFUL << FPU_MVFR0_Short_vectors_Pos) /*!< MVFR0: Short vectors bits Mask */ + +#define FPU_MVFR0_Square_root_Pos 20U /*!< MVFR0: Square root bits Position */ +#define FPU_MVFR0_Square_root_Msk (0xFUL << FPU_MVFR0_Square_root_Pos) /*!< MVFR0: Square root bits Mask */ + +#define FPU_MVFR0_Divide_Pos 16U /*!< MVFR0: Divide bits Position */ +#define FPU_MVFR0_Divide_Msk (0xFUL << FPU_MVFR0_Divide_Pos) /*!< MVFR0: Divide bits Mask */ + +#define FPU_MVFR0_FP_excep_trapping_Pos 12U /*!< MVFR0: FP exception trapping bits Position */ +#define FPU_MVFR0_FP_excep_trapping_Msk (0xFUL << FPU_MVFR0_FP_excep_trapping_Pos) /*!< MVFR0: FP exception trapping bits Mask */ + +#define FPU_MVFR0_Double_precision_Pos 8U /*!< MVFR0: Double-precision bits Position */ +#define FPU_MVFR0_Double_precision_Msk (0xFUL << FPU_MVFR0_Double_precision_Pos) /*!< MVFR0: Double-precision bits Mask */ + +#define FPU_MVFR0_Single_precision_Pos 4U /*!< MVFR0: Single-precision bits Position */ +#define FPU_MVFR0_Single_precision_Msk (0xFUL << FPU_MVFR0_Single_precision_Pos) /*!< MVFR0: Single-precision bits Mask */ + +#define FPU_MVFR0_A_SIMD_registers_Pos 0U /*!< MVFR0: A_SIMD registers bits Position */ +#define FPU_MVFR0_A_SIMD_registers_Msk (0xFUL /*<< FPU_MVFR0_A_SIMD_registers_Pos*/) /*!< MVFR0: A_SIMD registers bits Mask */ + +/* Media and FP Feature Register 1 Definitions */ +#define FPU_MVFR1_FP_fused_MAC_Pos 28U /*!< MVFR1: FP fused MAC bits Position */ +#define FPU_MVFR1_FP_fused_MAC_Msk (0xFUL << FPU_MVFR1_FP_fused_MAC_Pos) /*!< MVFR1: FP fused MAC bits Mask */ + +#define FPU_MVFR1_FP_HPFP_Pos 24U /*!< MVFR1: FP HPFP bits Position */ +#define FPU_MVFR1_FP_HPFP_Msk 
(0xFUL << FPU_MVFR1_FP_HPFP_Pos) /*!< MVFR1: FP HPFP bits Mask */ + +#define FPU_MVFR1_D_NaN_mode_Pos 4U /*!< MVFR1: D_NaN mode bits Position */ +#define FPU_MVFR1_D_NaN_mode_Msk (0xFUL << FPU_MVFR1_D_NaN_mode_Pos) /*!< MVFR1: D_NaN mode bits Mask */ + +#define FPU_MVFR1_FtZ_mode_Pos 0U /*!< MVFR1: FtZ mode bits Position */ +#define FPU_MVFR1_FtZ_mode_Msk (0xFUL /*<< FPU_MVFR1_FtZ_mode_Pos*/) /*!< MVFR1: FtZ mode bits Mask */ + +/* Media and FP Feature Register 2 Definitions */ + +#define FPU_MVFR2_VFP_Misc_Pos 4U /*!< MVFR2: VFP Misc bits Position */ +#define FPU_MVFR2_VFP_Misc_Msk (0xFUL << FPU_MVFR2_VFP_Misc_Pos) /*!< MVFR2: VFP Misc bits Mask */ + +/*@} end of group CMSIS_FPU */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_CoreDebug Core Debug Registers (CoreDebug) + \brief Type definitions for the Core Debug Registers + @{ + */ + +/** + \brief Structure type to access the Core Debug Register (CoreDebug). + */ +typedef struct +{ + __IOM uint32_t DHCSR; /*!< Offset: 0x000 (R/W) Debug Halting Control and Status Register */ + __OM uint32_t DCRSR; /*!< Offset: 0x004 ( /W) Debug Core Register Selector Register */ + __IOM uint32_t DCRDR; /*!< Offset: 0x008 (R/W) Debug Core Register Data Register */ + __IOM uint32_t DEMCR; /*!< Offset: 0x00C (R/W) Debug Exception and Monitor Control Register */ +} CoreDebug_Type; + +/* Debug Halting Control and Status Register Definitions */ +#define CoreDebug_DHCSR_DBGKEY_Pos 16U /*!< CoreDebug DHCSR: DBGKEY Position */ +#define CoreDebug_DHCSR_DBGKEY_Msk (0xFFFFUL << CoreDebug_DHCSR_DBGKEY_Pos) /*!< CoreDebug DHCSR: DBGKEY Mask */ + +#define CoreDebug_DHCSR_S_RESET_ST_Pos 25U /*!< CoreDebug DHCSR: S_RESET_ST Position */ +#define CoreDebug_DHCSR_S_RESET_ST_Msk (1UL << CoreDebug_DHCSR_S_RESET_ST_Pos) /*!< CoreDebug DHCSR: S_RESET_ST Mask */ + +#define CoreDebug_DHCSR_S_RETIRE_ST_Pos 24U /*!< CoreDebug DHCSR: S_RETIRE_ST Position */ +#define CoreDebug_DHCSR_S_RETIRE_ST_Msk (1UL << CoreDebug_DHCSR_S_RETIRE_ST_Pos) /*!< 
CoreDebug DHCSR: S_RETIRE_ST Mask */ + +#define CoreDebug_DHCSR_S_LOCKUP_Pos 19U /*!< CoreDebug DHCSR: S_LOCKUP Position */ +#define CoreDebug_DHCSR_S_LOCKUP_Msk (1UL << CoreDebug_DHCSR_S_LOCKUP_Pos) /*!< CoreDebug DHCSR: S_LOCKUP Mask */ + +#define CoreDebug_DHCSR_S_SLEEP_Pos 18U /*!< CoreDebug DHCSR: S_SLEEP Position */ +#define CoreDebug_DHCSR_S_SLEEP_Msk (1UL << CoreDebug_DHCSR_S_SLEEP_Pos) /*!< CoreDebug DHCSR: S_SLEEP Mask */ + +#define CoreDebug_DHCSR_S_HALT_Pos 17U /*!< CoreDebug DHCSR: S_HALT Position */ +#define CoreDebug_DHCSR_S_HALT_Msk (1UL << CoreDebug_DHCSR_S_HALT_Pos) /*!< CoreDebug DHCSR: S_HALT Mask */ + +#define CoreDebug_DHCSR_S_REGRDY_Pos 16U /*!< CoreDebug DHCSR: S_REGRDY Position */ +#define CoreDebug_DHCSR_S_REGRDY_Msk (1UL << CoreDebug_DHCSR_S_REGRDY_Pos) /*!< CoreDebug DHCSR: S_REGRDY Mask */ + +#define CoreDebug_DHCSR_C_SNAPSTALL_Pos 5U /*!< CoreDebug DHCSR: C_SNAPSTALL Position */ +#define CoreDebug_DHCSR_C_SNAPSTALL_Msk (1UL << CoreDebug_DHCSR_C_SNAPSTALL_Pos) /*!< CoreDebug DHCSR: C_SNAPSTALL Mask */ + +#define CoreDebug_DHCSR_C_MASKINTS_Pos 3U /*!< CoreDebug DHCSR: C_MASKINTS Position */ +#define CoreDebug_DHCSR_C_MASKINTS_Msk (1UL << CoreDebug_DHCSR_C_MASKINTS_Pos) /*!< CoreDebug DHCSR: C_MASKINTS Mask */ + +#define CoreDebug_DHCSR_C_STEP_Pos 2U /*!< CoreDebug DHCSR: C_STEP Position */ +#define CoreDebug_DHCSR_C_STEP_Msk (1UL << CoreDebug_DHCSR_C_STEP_Pos) /*!< CoreDebug DHCSR: C_STEP Mask */ + +#define CoreDebug_DHCSR_C_HALT_Pos 1U /*!< CoreDebug DHCSR: C_HALT Position */ +#define CoreDebug_DHCSR_C_HALT_Msk (1UL << CoreDebug_DHCSR_C_HALT_Pos) /*!< CoreDebug DHCSR: C_HALT Mask */ + +#define CoreDebug_DHCSR_C_DEBUGEN_Pos 0U /*!< CoreDebug DHCSR: C_DEBUGEN Position */ +#define CoreDebug_DHCSR_C_DEBUGEN_Msk (1UL /*<< CoreDebug_DHCSR_C_DEBUGEN_Pos*/) /*!< CoreDebug DHCSR: C_DEBUGEN Mask */ + +/* Debug Core Register Selector Register Definitions */ +#define CoreDebug_DCRSR_REGWnR_Pos 16U /*!< CoreDebug DCRSR: REGWnR Position */ +#define 
CoreDebug_DCRSR_REGWnR_Msk (1UL << CoreDebug_DCRSR_REGWnR_Pos) /*!< CoreDebug DCRSR: REGWnR Mask */ + +#define CoreDebug_DCRSR_REGSEL_Pos 0U /*!< CoreDebug DCRSR: REGSEL Position */ +#define CoreDebug_DCRSR_REGSEL_Msk (0x1FUL /*<< CoreDebug_DCRSR_REGSEL_Pos*/) /*!< CoreDebug DCRSR: REGSEL Mask */ + +/* Debug Exception and Monitor Control Register Definitions */ +#define CoreDebug_DEMCR_TRCENA_Pos 24U /*!< CoreDebug DEMCR: TRCENA Position */ +#define CoreDebug_DEMCR_TRCENA_Msk (1UL << CoreDebug_DEMCR_TRCENA_Pos) /*!< CoreDebug DEMCR: TRCENA Mask */ + +#define CoreDebug_DEMCR_MON_REQ_Pos 19U /*!< CoreDebug DEMCR: MON_REQ Position */ +#define CoreDebug_DEMCR_MON_REQ_Msk (1UL << CoreDebug_DEMCR_MON_REQ_Pos) /*!< CoreDebug DEMCR: MON_REQ Mask */ + +#define CoreDebug_DEMCR_MON_STEP_Pos 18U /*!< CoreDebug DEMCR: MON_STEP Position */ +#define CoreDebug_DEMCR_MON_STEP_Msk (1UL << CoreDebug_DEMCR_MON_STEP_Pos) /*!< CoreDebug DEMCR: MON_STEP Mask */ + +#define CoreDebug_DEMCR_MON_PEND_Pos 17U /*!< CoreDebug DEMCR: MON_PEND Position */ +#define CoreDebug_DEMCR_MON_PEND_Msk (1UL << CoreDebug_DEMCR_MON_PEND_Pos) /*!< CoreDebug DEMCR: MON_PEND Mask */ + +#define CoreDebug_DEMCR_MON_EN_Pos 16U /*!< CoreDebug DEMCR: MON_EN Position */ +#define CoreDebug_DEMCR_MON_EN_Msk (1UL << CoreDebug_DEMCR_MON_EN_Pos) /*!< CoreDebug DEMCR: MON_EN Mask */ + +#define CoreDebug_DEMCR_VC_HARDERR_Pos 10U /*!< CoreDebug DEMCR: VC_HARDERR Position */ +#define CoreDebug_DEMCR_VC_HARDERR_Msk (1UL << CoreDebug_DEMCR_VC_HARDERR_Pos) /*!< CoreDebug DEMCR: VC_HARDERR Mask */ + +#define CoreDebug_DEMCR_VC_INTERR_Pos 9U /*!< CoreDebug DEMCR: VC_INTERR Position */ +#define CoreDebug_DEMCR_VC_INTERR_Msk (1UL << CoreDebug_DEMCR_VC_INTERR_Pos) /*!< CoreDebug DEMCR: VC_INTERR Mask */ + +#define CoreDebug_DEMCR_VC_BUSERR_Pos 8U /*!< CoreDebug DEMCR: VC_BUSERR Position */ +#define CoreDebug_DEMCR_VC_BUSERR_Msk (1UL << CoreDebug_DEMCR_VC_BUSERR_Pos) /*!< CoreDebug DEMCR: VC_BUSERR Mask */ + +#define 
CoreDebug_DEMCR_VC_STATERR_Pos 7U /*!< CoreDebug DEMCR: VC_STATERR Position */ +#define CoreDebug_DEMCR_VC_STATERR_Msk (1UL << CoreDebug_DEMCR_VC_STATERR_Pos) /*!< CoreDebug DEMCR: VC_STATERR Mask */ + +#define CoreDebug_DEMCR_VC_CHKERR_Pos 6U /*!< CoreDebug DEMCR: VC_CHKERR Position */ +#define CoreDebug_DEMCR_VC_CHKERR_Msk (1UL << CoreDebug_DEMCR_VC_CHKERR_Pos) /*!< CoreDebug DEMCR: VC_CHKERR Mask */ + +#define CoreDebug_DEMCR_VC_NOCPERR_Pos 5U /*!< CoreDebug DEMCR: VC_NOCPERR Position */ +#define CoreDebug_DEMCR_VC_NOCPERR_Msk (1UL << CoreDebug_DEMCR_VC_NOCPERR_Pos) /*!< CoreDebug DEMCR: VC_NOCPERR Mask */ + +#define CoreDebug_DEMCR_VC_MMERR_Pos 4U /*!< CoreDebug DEMCR: VC_MMERR Position */ +#define CoreDebug_DEMCR_VC_MMERR_Msk (1UL << CoreDebug_DEMCR_VC_MMERR_Pos) /*!< CoreDebug DEMCR: VC_MMERR Mask */ + +#define CoreDebug_DEMCR_VC_CORERESET_Pos 0U /*!< CoreDebug DEMCR: VC_CORERESET Position */ +#define CoreDebug_DEMCR_VC_CORERESET_Msk (1UL /*<< CoreDebug_DEMCR_VC_CORERESET_Pos*/) /*!< CoreDebug DEMCR: VC_CORERESET Mask */ + +/*@} end of group CMSIS_CoreDebug */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_core_bitfield Core register bit field macros + \brief Macros for use with bit field definitions (xxx_Pos, xxx_Msk). + @{ + */ + +/** + \brief Shift a bit field value into position and mask it to the field's bit range. + \param[in] field Name of the register bit field (the prefix of the matching xxx_Pos / xxx_Msk macros). + \param[in] value Value of the bit field. This parameter is cast to uint32_t before shifting. + \return Shifted and masked value. +*/ +#define _VAL2FLD(field, value) (((uint32_t)(value) << field ## _Pos) & field ## _Msk) + +/** + \brief Mask and shift a register value to extract a bit field value. + \param[in] field Name of the register bit field. + \param[in] value Value of register. This parameter is interpreted as a uint32_t type. + \return Masked and shifted bit field value. 
+*/ +#define _FLD2VAL(field, value) (((uint32_t)(value) & field ## _Msk) >> field ## _Pos) + +/*@} end of group CMSIS_core_bitfield */ + + +/** + \ingroup CMSIS_core_register + \defgroup CMSIS_core_base Core Definitions + \brief Definitions for base addresses, unions, and structures. + @{ + */ + +/* Memory mapping of Core Hardware */ +#define SCS_BASE (0xE000E000UL) /*!< System Control Space Base Address */ +#define ITM_BASE (0xE0000000UL) /*!< ITM Base Address */ +#define DWT_BASE (0xE0001000UL) /*!< DWT Base Address */ +#define TPI_BASE (0xE0040000UL) /*!< TPI Base Address */ +#define CoreDebug_BASE (0xE000EDF0UL) /*!< Core Debug Base Address */ +#define SysTick_BASE (SCS_BASE + 0x0010UL) /*!< SysTick Base Address */ +#define NVIC_BASE (SCS_BASE + 0x0100UL) /*!< NVIC Base Address */ +#define SCB_BASE (SCS_BASE + 0x0D00UL) /*!< System Control Block Base Address */ + +#define SCnSCB ((SCnSCB_Type *) SCS_BASE ) /*!< System control Register not in SCB */ +#define SCB ((SCB_Type *) SCB_BASE ) /*!< SCB configuration struct */ +#define SysTick ((SysTick_Type *) SysTick_BASE ) /*!< SysTick configuration struct */ +#define NVIC ((NVIC_Type *) NVIC_BASE ) /*!< NVIC configuration struct */ +#define ITM ((ITM_Type *) ITM_BASE ) /*!< ITM configuration struct */ +#define DWT ((DWT_Type *) DWT_BASE ) /*!< DWT configuration struct */ +#define TPI ((TPI_Type *) TPI_BASE ) /*!< TPI configuration struct */ +#define CoreDebug ((CoreDebug_Type *) CoreDebug_BASE) /*!< Core Debug configuration struct */ + +#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U) + #define MPU_BASE (SCS_BASE + 0x0D90UL) /*!< Memory Protection Unit */ + #define MPU ((MPU_Type *) MPU_BASE ) /*!< Memory Protection Unit */ +#endif + +#define FPU_BASE (SCS_BASE + 0x0F30UL) /*!< Floating Point Unit */ +#define FPU ((FPU_Type *) FPU_BASE ) /*!< Floating Point Unit */ + +/*@} */ + + + +/******************************************************************************* + * Hardware Abstraction Layer + Core Function 
Interface contains: + - Core NVIC Functions + - Core SysTick Functions + - Core Debug Functions + - Core Register Access Functions + ******************************************************************************/ +/** + \defgroup CMSIS_Core_FunctionInterface Functions and Instructions Reference +*/ + + + +/* ########################## NVIC functions #################################### */ +/** + \ingroup CMSIS_Core_FunctionInterface + \defgroup CMSIS_Core_NVICFunctions NVIC Functions + \brief Functions that manage interrupts and exceptions via the NVIC. + @{ + */ + +#ifdef CMSIS_NVIC_VIRTUAL + #ifndef CMSIS_NVIC_VIRTUAL_HEADER_FILE + #define CMSIS_NVIC_VIRTUAL_HEADER_FILE "cmsis_nvic_virtual.h" + #endif + #include CMSIS_NVIC_VIRTUAL_HEADER_FILE +#else /* no virtual NVIC layer: alias each public NVIC_xxx name to its __NVIC_xxx function */ + #define NVIC_SetPriorityGrouping __NVIC_SetPriorityGrouping + #define NVIC_GetPriorityGrouping __NVIC_GetPriorityGrouping + #define NVIC_EnableIRQ __NVIC_EnableIRQ + #define NVIC_GetEnableIRQ __NVIC_GetEnableIRQ + #define NVIC_DisableIRQ __NVIC_DisableIRQ + #define NVIC_GetPendingIRQ __NVIC_GetPendingIRQ + #define NVIC_SetPendingIRQ __NVIC_SetPendingIRQ + #define NVIC_ClearPendingIRQ __NVIC_ClearPendingIRQ + #define NVIC_GetActive __NVIC_GetActive + #define NVIC_SetPriority __NVIC_SetPriority + #define NVIC_GetPriority __NVIC_GetPriority + #define NVIC_SystemReset __NVIC_SystemReset +#endif /* CMSIS_NVIC_VIRTUAL */ + +#ifdef CMSIS_VECTAB_VIRTUAL + #ifndef CMSIS_VECTAB_VIRTUAL_HEADER_FILE + #define CMSIS_VECTAB_VIRTUAL_HEADER_FILE "cmsis_vectab_virtual.h" + #endif + #include CMSIS_VECTAB_VIRTUAL_HEADER_FILE +#else /* no virtual vector-table layer: alias the public names to the __NVIC_xxx functions */ + #define NVIC_SetVector __NVIC_SetVector + #define NVIC_GetVector __NVIC_GetVector +#endif /* (CMSIS_VECTAB_VIRTUAL) */ + +#define NVIC_USER_IRQ_OFFSET 16 + + +/* The following EXC_RETURN values are saved to the LR on exception entry */ +#define EXC_RETURN_HANDLER (0xFFFFFFF1UL) /* return to Handler mode, uses MSP after return */ +#define EXC_RETURN_THREAD_MSP (0xFFFFFFF9UL) /* return to Thread mode, uses MSP 
after return */ +#define EXC_RETURN_THREAD_PSP (0xFFFFFFFDUL) /* return to Thread mode, uses PSP after return */ +#define EXC_RETURN_HANDLER_FPU (0xFFFFFFE1UL) /* return to Handler mode, uses MSP after return, restore floating-point state */ +#define EXC_RETURN_THREAD_MSP_FPU (0xFFFFFFE9UL) /* return to Thread mode, uses MSP after return, restore floating-point state */ +#define EXC_RETURN_THREAD_PSP_FPU (0xFFFFFFEDUL) /* return to Thread mode, uses PSP after return, restore floating-point state */ + + +/** + \brief Set Priority Grouping + \details Sets the priority grouping field using the required unlock sequence. + The parameter PriorityGroup is assigned to the field SCB->AIRCR [10:8] PRIGROUP field. + Only values from 0..7 are used. + In case of a conflict between priority grouping and available + priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set. + \param [in] PriorityGroup Priority grouping field. + */ +__STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup) +{ + uint32_t reg_value; + uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */ + + reg_value = SCB->AIRCR; /* read old register configuration */ + reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */ + reg_value = (reg_value | + ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) | + (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos) ); /* Insert write key and priority group */ + SCB->AIRCR = reg_value; +} + + +/** + \brief Get Priority Grouping + \details Reads the priority grouping field from the NVIC Interrupt Controller. + \return Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field). + */ +__STATIC_INLINE uint32_t __NVIC_GetPriorityGrouping(void) +{ + return ((uint32_t)((SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos)); +} + + +/** + \brief Enable Interrupt + \details Enables a device specific interrupt in the NVIC interrupt controller. 
+ \param [in] IRQn Device specific interrupt number. + \note IRQn must not be negative. + */ +__STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + __COMPILER_BARRIER(); + NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); + __COMPILER_BARRIER(); + } +} + + +/** + \brief Get Interrupt Enable status + \details Returns a device specific interrupt enable status from the NVIC interrupt controller. + \param [in] IRQn Device specific interrupt number. + \return 0 Interrupt is not enabled. + \return 1 Interrupt is enabled. + \note IRQn must not be negative. + */ +__STATIC_INLINE uint32_t __NVIC_GetEnableIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + return((uint32_t)(((NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL)); + } + else + { + return(0U); + } +} + + +/** + \brief Disable Interrupt + \details Disables a device specific interrupt in the NVIC interrupt controller. + \param [in] IRQn Device specific interrupt number. + \note IRQn must not be negative. + */ +__STATIC_INLINE void __NVIC_DisableIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + NVIC->ICER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); + __DSB(); + __ISB(); + } +} + + +/** + \brief Get Pending Interrupt + \details Reads the NVIC pending register and returns the pending bit for the specified device specific interrupt. + \param [in] IRQn Device specific interrupt number. + \return 0 Interrupt status is not pending. + \return 1 Interrupt status is pending. + \note IRQn must not be negative. + */ +__STATIC_INLINE uint32_t __NVIC_GetPendingIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + return((uint32_t)(((NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 
1UL : 0UL)); + } + else + { + return(0U); + } +} + + +/** + \brief Set Pending Interrupt + \details Sets the pending bit of a device specific interrupt in the NVIC pending register. + \param [in] IRQn Device specific interrupt number. + \note IRQn must not be negative. + */ +__STATIC_INLINE void __NVIC_SetPendingIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); + } +} + + +/** + \brief Clear Pending Interrupt + \details Clears the pending bit of a device specific interrupt in the NVIC pending register. + \param [in] IRQn Device specific interrupt number. + \note IRQn must not be negative. + */ +__STATIC_INLINE void __NVIC_ClearPendingIRQ(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + NVIC->ICPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); + } +} + + +/** + \brief Get Active Interrupt + \details Reads the active register in the NVIC and returns the active bit for the device specific interrupt. + \param [in] IRQn Device specific interrupt number. + \return 0 Interrupt status is not active. + \return 1 Interrupt status is active. + \note IRQn must not be negative. + */ +__STATIC_INLINE uint32_t __NVIC_GetActive(IRQn_Type IRQn) +{ + if ((int32_t)(IRQn) >= 0) + { + return((uint32_t)(((NVIC->IABR[(((uint32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL)); + } + else + { + return(0U); + } +} + + +/** + \brief Set Interrupt Priority + \details Sets the priority of a device specific interrupt or a processor exception. + The interrupt number can be positive to specify a device specific interrupt, + or negative to specify a processor exception. + \param [in] IRQn Interrupt number. + \param [in] priority Priority to set. + \note The priority cannot be set for every processor exception. 
+ */ +__STATIC_INLINE void __NVIC_SetPriority(IRQn_Type IRQn, uint32_t priority) +{ + if ((int32_t)(IRQn) >= 0) + { + NVIC->IP[((uint32_t)IRQn)] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL); + } + else + { + SCB->SHP[(((uint32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL); + } +} + + +/** + \brief Get Interrupt Priority + \details Reads the priority of a device specific interrupt or a processor exception. + The interrupt number can be positive to specify a device specific interrupt, + or negative to specify a processor exception. + \param [in] IRQn Interrupt number. + \return Interrupt Priority. + Value is aligned automatically to the implemented priority bits of the microcontroller. + */ +__STATIC_INLINE uint32_t __NVIC_GetPriority(IRQn_Type IRQn) +{ + + if ((int32_t)(IRQn) >= 0) + { + return(((uint32_t)NVIC->IP[((uint32_t)IRQn)] >> (8U - __NVIC_PRIO_BITS))); + } + else + { + return(((uint32_t)SCB->SHP[(((uint32_t)IRQn) & 0xFUL)-4UL] >> (8U - __NVIC_PRIO_BITS))); + } +} + + +/** + \brief Encode Priority + \details Encodes the priority for an interrupt with the given priority group, + preemptive priority value, and subpriority value. + In case of a conflict between priority grouping and available + priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set. + \param [in] PriorityGroup Used priority group. + \param [in] PreemptPriority Preemptive priority value (starting from 0). + \param [in] SubPriority Subpriority value (starting from 0). + \return Encoded priority. Value can be used in the function \ref NVIC_SetPriority(). 
+ */ +__STATIC_INLINE uint32_t NVIC_EncodePriority (uint32_t PriorityGroup, uint32_t PreemptPriority, uint32_t SubPriority) +{ + uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */ + uint32_t PreemptPriorityBits; + uint32_t SubPriorityBits; + + PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp); + SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS)); + + return ( + ((PreemptPriority & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL)) << SubPriorityBits) | + ((SubPriority & (uint32_t)((1UL << (SubPriorityBits )) - 1UL))) + ); +} + + +/** + \brief Decode Priority + \details Decodes an interrupt priority value with a given priority group to + preemptive priority value and subpriority value. + In case of a conflict between priority grouping and available + priority bits (__NVIC_PRIO_BITS) the smallest possible priority group is set. + \param [in] Priority Priority value, which can be retrieved with the function \ref NVIC_GetPriority(). + \param [in] PriorityGroup Used priority group. + \param [out] pPreemptPriority Preemptive priority value (starting from 0). + \param [out] pSubPriority Subpriority value (starting from 0). + */ +__STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGroup, uint32_t* const pPreemptPriority, uint32_t* const pSubPriority) +{ + uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */ + uint32_t PreemptPriorityBits; + uint32_t SubPriorityBits; + + PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp); + SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? 
(uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS)); + + *pPreemptPriority = (Priority >> SubPriorityBits) & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL); + *pSubPriority = (Priority ) & (uint32_t)((1UL << (SubPriorityBits )) - 1UL); +} + + +/** + \brief Set Interrupt Vector + \details Sets an interrupt vector in SRAM based interrupt vector table. + The interrupt number can be positive to specify a device specific interrupt, + or negative to specify a processor exception. + VTOR must have been relocated to SRAM before. + \param [in] IRQn Interrupt number + \param [in] vector Address of interrupt handler function + */ +__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector) +{ + uint32_t *vectors = (uint32_t *)SCB->VTOR; + vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector; + /* ARM Application Note 321 states that the M4 does not require the architectural barrier */ +} + + +/** + \brief Get Interrupt Vector + \details Reads an interrupt vector from interrupt vector table. + The interrupt number can be positive to specify a device specific interrupt, + or negative to specify a processor exception. + \param [in] IRQn Interrupt number. + \return Address of interrupt handler function + */ +__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn) +{ + uint32_t *vectors = (uint32_t *)SCB->VTOR; + return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET]; +} + + +/** + \brief System Reset + \details Initiates a system reset request to reset the MCU. 
+ */ +__NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void) +{ + __DSB(); /* Ensure all outstanding memory accesses included + buffered write are completed before reset */ + SCB->AIRCR = (uint32_t)((0x5FAUL << SCB_AIRCR_VECTKEY_Pos) | + (SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) | + SCB_AIRCR_SYSRESETREQ_Msk ); /* Keep priority group unchanged */ + __DSB(); /* Ensure completion of memory access */ + + for(;;) /* wait until reset */ + { + __NOP(); + } +} + +/*@} end of CMSIS_Core_NVICFunctions */ + + +/* ########################## MPU functions #################################### */ + +#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U) + +#include "mpu_armv7.h" + +#endif + + +/* ########################## FPU functions #################################### */ +/** + \ingroup CMSIS_Core_FunctionInterface + \defgroup CMSIS_Core_FpuFunctions FPU Functions + \brief Function that provides FPU type. + @{ + */ + +/** + \brief get FPU type + \details returns the FPU type + \returns + - \b 0: No FPU + - \b 1: Single precision FPU + - \b 2: Double + Single precision FPU + */ +__STATIC_INLINE uint32_t SCB_GetFPUType(void) +{ + uint32_t mvfr0; + + mvfr0 = FPU->MVFR0; + if ((mvfr0 & (FPU_MVFR0_Single_precision_Msk | FPU_MVFR0_Double_precision_Msk)) == 0x020U) + { + return 1U; /* Single precision FPU */ + } + else + { + return 0U; /* No FPU */ + } +} + + +/*@} end of CMSIS_Core_FpuFunctions */ + + + +/* ################################## SysTick function ############################################ */ +/** + \ingroup CMSIS_Core_FunctionInterface + \defgroup CMSIS_Core_SysTickFunctions SysTick Functions + \brief Functions that configure the System. + @{ + */ + +#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U) + +/** + \brief System Tick Configuration + \details Initializes the System Timer and its interrupt, and starts the System Tick Timer. + Counter is in free running mode to generate periodic interrupts. 
+ \param [in] ticks Number of ticks between two interrupts. + \return 0 Function succeeded. + \return 1 Function failed. + \note When the variable __Vendor_SysTickConfig is set to 1, then the + function SysTick_Config is not included. In this case, the file device.h + must contain a vendor-specific implementation of this function. + */ +__STATIC_INLINE uint32_t SysTick_Config(uint32_t ticks) +{ + if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk) + { + return (1UL); /* Reload value impossible */ + } + + SysTick->LOAD = (uint32_t)(ticks - 1UL); /* set reload register */ + NVIC_SetPriority (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for Systick Interrupt */ + SysTick->VAL = 0UL; /* Load the SysTick Counter Value */ + SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk | + SysTick_CTRL_TICKINT_Msk | + SysTick_CTRL_ENABLE_Msk; /* Enable SysTick IRQ and SysTick Timer */ + return (0UL); /* Function successful */ +} + +#endif + +/*@} end of CMSIS_Core_SysTickFunctions */ + + + +/* ##################################### Debug In/Output function ########################################### */ +/** + \ingroup CMSIS_Core_FunctionInterface + \defgroup CMSIS_core_DebugFunctions ITM Functions + \brief Functions that access the ITM debug interface. + @{ + */ + +extern volatile int32_t ITM_RxBuffer; /*!< External variable to receive characters. */ +#define ITM_RXBUFFER_EMPTY ((int32_t)0x5AA55AA5U) /*!< Value identifying \ref ITM_RxBuffer is ready for next character. */ + + +/** + \brief ITM Send Character + \details Transmits a character via the ITM channel 0, and + \li Just returns when no debugger is connected that has booked the output. + \li Is blocking when a debugger is connected, but the previous character sent has not been transmitted. + \param [in] ch Character to transmit. + \returns Character to transmit. 
+ */ +__STATIC_INLINE uint32_t ITM_SendChar (uint32_t ch) +{ + if (((ITM->TCR & ITM_TCR_ITMENA_Msk) != 0UL) && /* ITM enabled */ + ((ITM->TER & 1UL ) != 0UL) ) /* ITM Port #0 enabled */ + { + while (ITM->PORT[0U].u32 == 0UL) + { + __NOP(); + } + ITM->PORT[0U].u8 = (uint8_t)ch; + } + return (ch); +} + + +/** + \brief ITM Receive Character + \details Inputs a character via the external variable \ref ITM_RxBuffer. + \return Received character. + \return -1 No character pending. + */ +__STATIC_INLINE int32_t ITM_ReceiveChar (void) +{ + int32_t ch = -1; /* no character available */ + + if (ITM_RxBuffer != ITM_RXBUFFER_EMPTY) + { + ch = ITM_RxBuffer; + ITM_RxBuffer = ITM_RXBUFFER_EMPTY; /* ready for next character */ + } + + return (ch); +} + + +/** + \brief ITM Check Character + \details Checks whether a character is pending for reading in the variable \ref ITM_RxBuffer. + \return 0 No character available. + \return 1 Character available. + */ +__STATIC_INLINE int32_t ITM_CheckChar (void) +{ + + if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY) + { + return (0); /* no character available */ + } + else + { + return (1); /* character available */ + } +} + +/*@} end of CMSIS_core_DebugFunctions */ + + + + +#ifdef __cplusplus +} +#endif + +#endif /* __CORE_CM4_H_DEPENDANT */ + +#endif /* __CMSIS_GENERIC */ diff --git a/common/mps2/memory_zones.h b/common/mps2/memory_zones.h new file mode 100644 index 0000000..432d393 --- /dev/null +++ b/common/mps2/memory_zones.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2017-2019 ARM Limited + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file contains the information of memory zones for code and data on + * ARM Versatile Express Cortex-M Prototyping Systems (V2M-MPS2) TRM. + * It is used in startup code and linker scripts of supported compilers (ARM and + * GCC_ARM). + * + * WARNING: IAR does not include this file and re-defines these values in the + * MPS2.icf file. Please make sure that the two files share the same values. + * + * These memory zones are defined in section 4.2 of ARM V2M-MPS2 RTL and + * Fast Model Reference Guide. + */ + +#ifndef MEMORY_ZONES_H +#define MEMORY_ZONES_H + +/* + * Code memory zones + * Please note that MPS2 on Fast Models does not implement persistent flash memory. + * The FLASH memory can be simulated via 4MB ZBT_SRAM1 block + * only to keep the same name as in the CMSDK RTL and Fast Models Reference + * Guide. + */ +#define ZBT_SRAM1_START 0x00000000 +#define ZBT_SRAM1_SIZE 0x00400000 /* 4 MiB */ + +/* Data memory zones */ +#define ZBT_SRAM2_START 0x20000000 +#define ZBT_SRAM2_SIZE 0x00400000 /* 4 MiB */ + +#endif /* MEMORY_ZONES_H */ + diff --git a/common/mps2/mpu_armv7.h b/common/mps2/mpu_armv7.h new file mode 100644 index 0000000..791a8da --- /dev/null +++ b/common/mps2/mpu_armv7.h @@ -0,0 +1,275 @@ +/****************************************************************************** + * @file mpu_armv7.h + * @brief CMSIS MPU API for Armv7-M MPU + * @version V5.1.1 + * @date 10. February 2020 + ******************************************************************************/ +/* + * Copyright (c) 2017-2020 Arm Limited. 
All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined ( __ICCARM__ ) + #pragma system_include /* treat file as system include file for MISRA check */ +#elif defined (__clang__) + #pragma clang system_header /* treat file as system include file */ +#endif + +#ifndef ARM_MPU_ARMV7_H +#define ARM_MPU_ARMV7_H + +#define ARM_MPU_REGION_SIZE_32B ((uint8_t)0x04U) ///!< MPU Region Size 32 Bytes +#define ARM_MPU_REGION_SIZE_64B ((uint8_t)0x05U) ///!< MPU Region Size 64 Bytes +#define ARM_MPU_REGION_SIZE_128B ((uint8_t)0x06U) ///!< MPU Region Size 128 Bytes +#define ARM_MPU_REGION_SIZE_256B ((uint8_t)0x07U) ///!< MPU Region Size 256 Bytes +#define ARM_MPU_REGION_SIZE_512B ((uint8_t)0x08U) ///!< MPU Region Size 512 Bytes +#define ARM_MPU_REGION_SIZE_1KB ((uint8_t)0x09U) ///!< MPU Region Size 1 KByte +#define ARM_MPU_REGION_SIZE_2KB ((uint8_t)0x0AU) ///!< MPU Region Size 2 KBytes +#define ARM_MPU_REGION_SIZE_4KB ((uint8_t)0x0BU) ///!< MPU Region Size 4 KBytes +#define ARM_MPU_REGION_SIZE_8KB ((uint8_t)0x0CU) ///!< MPU Region Size 8 KBytes +#define ARM_MPU_REGION_SIZE_16KB ((uint8_t)0x0DU) ///!< MPU Region Size 16 KBytes +#define ARM_MPU_REGION_SIZE_32KB ((uint8_t)0x0EU) ///!< MPU Region Size 32 KBytes +#define ARM_MPU_REGION_SIZE_64KB ((uint8_t)0x0FU) ///!< MPU Region Size 64 KBytes +#define ARM_MPU_REGION_SIZE_128KB ((uint8_t)0x10U) ///!< MPU Region Size 128 KBytes +#define ARM_MPU_REGION_SIZE_256KB 
((uint8_t)0x11U) ///!< MPU Region Size 256 KBytes +#define ARM_MPU_REGION_SIZE_512KB ((uint8_t)0x12U) ///!< MPU Region Size 512 KBytes +#define ARM_MPU_REGION_SIZE_1MB ((uint8_t)0x13U) ///!< MPU Region Size 1 MByte +#define ARM_MPU_REGION_SIZE_2MB ((uint8_t)0x14U) ///!< MPU Region Size 2 MBytes +#define ARM_MPU_REGION_SIZE_4MB ((uint8_t)0x15U) ///!< MPU Region Size 4 MBytes +#define ARM_MPU_REGION_SIZE_8MB ((uint8_t)0x16U) ///!< MPU Region Size 8 MBytes +#define ARM_MPU_REGION_SIZE_16MB ((uint8_t)0x17U) ///!< MPU Region Size 16 MBytes +#define ARM_MPU_REGION_SIZE_32MB ((uint8_t)0x18U) ///!< MPU Region Size 32 MBytes +#define ARM_MPU_REGION_SIZE_64MB ((uint8_t)0x19U) ///!< MPU Region Size 64 MBytes +#define ARM_MPU_REGION_SIZE_128MB ((uint8_t)0x1AU) ///!< MPU Region Size 128 MBytes +#define ARM_MPU_REGION_SIZE_256MB ((uint8_t)0x1BU) ///!< MPU Region Size 256 MBytes +#define ARM_MPU_REGION_SIZE_512MB ((uint8_t)0x1CU) ///!< MPU Region Size 512 MBytes +#define ARM_MPU_REGION_SIZE_1GB ((uint8_t)0x1DU) ///!< MPU Region Size 1 GByte +#define ARM_MPU_REGION_SIZE_2GB ((uint8_t)0x1EU) ///!< MPU Region Size 2 GBytes +#define ARM_MPU_REGION_SIZE_4GB ((uint8_t)0x1FU) ///!< MPU Region Size 4 GBytes + +#define ARM_MPU_AP_NONE 0U ///!< MPU Access Permission no access +#define ARM_MPU_AP_PRIV 1U ///!< MPU Access Permission privileged access only +#define ARM_MPU_AP_URO 2U ///!< MPU Access Permission unprivileged access read-only +#define ARM_MPU_AP_FULL 3U ///!< MPU Access Permission full access +#define ARM_MPU_AP_PRO 5U ///!< MPU Access Permission privileged access read-only +#define ARM_MPU_AP_RO 6U ///!< MPU Access Permission read-only access + +/** MPU Region Base Address Register Value +* +* \param Region The region to be configured, number 0 to 15. +* \param BaseAddress The base address for the region. 
+*/ +#define ARM_MPU_RBAR(Region, BaseAddress) \ + (((BaseAddress) & MPU_RBAR_ADDR_Msk) | \ + ((Region) & MPU_RBAR_REGION_Msk) | \ + (MPU_RBAR_VALID_Msk)) + +/** +* MPU Memory Access Attributes +* +* \param TypeExtField Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral. +* \param IsShareable Region is shareable between multiple bus masters. +* \param IsCacheable Region is cacheable, i.e. its value may be kept in cache. +* \param IsBufferable Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy. +*/ +#define ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable) \ + ((((TypeExtField) << MPU_RASR_TEX_Pos) & MPU_RASR_TEX_Msk) | \ + (((IsShareable) << MPU_RASR_S_Pos) & MPU_RASR_S_Msk) | \ + (((IsCacheable) << MPU_RASR_C_Pos) & MPU_RASR_C_Msk) | \ + (((IsBufferable) << MPU_RASR_B_Pos) & MPU_RASR_B_Msk)) + +/** +* MPU Region Attribute and Size Register Value +* +* \param DisableExec Instruction access disable bit, 1= disable instruction fetches. +* \param AccessPermission Data access permissions, allows you to configure read/write access for User and Privileged mode. +* \param AccessAttributes Memory access attribution, see \ref ARM_MPU_ACCESS_. +* \param SubRegionDisable Sub-region disable field. +* \param Size Region size of the region to be configured, for example 4K, 8K. 
+*/ +#define ARM_MPU_RASR_EX(DisableExec, AccessPermission, AccessAttributes, SubRegionDisable, Size) \ + ((((DisableExec) << MPU_RASR_XN_Pos) & MPU_RASR_XN_Msk) | \ + (((AccessPermission) << MPU_RASR_AP_Pos) & MPU_RASR_AP_Msk) | \ + (((AccessAttributes) & (MPU_RASR_TEX_Msk | MPU_RASR_S_Msk | MPU_RASR_C_Msk | MPU_RASR_B_Msk))) | \ + (((SubRegionDisable) << MPU_RASR_SRD_Pos) & MPU_RASR_SRD_Msk) | \ + (((Size) << MPU_RASR_SIZE_Pos) & MPU_RASR_SIZE_Msk) | \ + (((MPU_RASR_ENABLE_Msk)))) + +/** +* MPU Region Attribute and Size Register Value +* +* \param DisableExec Instruction access disable bit, 1= disable instruction fetches. +* \param AccessPermission Data access permissions, allows you to configure read/write access for User and Privileged mode. +* \param TypeExtField Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral. +* \param IsShareable Region is shareable between multiple bus masters. +* \param IsCacheable Region is cacheable, i.e. its value may be kept in cache. +* \param IsBufferable Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy. +* \param SubRegionDisable Sub-region disable field. +* \param Size Region size of the region to be configured, for example 4K, 8K. +*/ +#define ARM_MPU_RASR(DisableExec, AccessPermission, TypeExtField, IsShareable, IsCacheable, IsBufferable, SubRegionDisable, Size) \ + ARM_MPU_RASR_EX(DisableExec, AccessPermission, ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable), SubRegionDisable, Size) + +/** +* MPU Memory Access Attribute for strongly ordered memory. +* - TEX: 000b +* - Shareable +* - Non-cacheable +* - Non-bufferable +*/ +#define ARM_MPU_ACCESS_ORDERED ARM_MPU_ACCESS_(0U, 1U, 0U, 0U) + +/** +* MPU Memory Access Attribute for device memory. 
+* - TEX: 000b (if shareable) or 010b (if non-shareable) +* - Shareable or non-shareable +* - Non-cacheable +* - Bufferable (if shareable) or non-bufferable (if non-shareable) +* +* \param IsShareable Configures the device memory as shareable or non-shareable. +*/ +#define ARM_MPU_ACCESS_DEVICE(IsShareable) ((IsShareable) ? ARM_MPU_ACCESS_(0U, 1U, 0U, 1U) : ARM_MPU_ACCESS_(2U, 0U, 0U, 0U)) + +/** +* MPU Memory Access Attribute for normal memory. +* - TEX: 1BBb (reflecting outer cacheability rules) +* - Shareable or non-shareable +* - Cacheable or non-cacheable (reflecting inner cacheability rules) +* - Bufferable or non-bufferable (reflecting inner cacheability rules) +* +* \param OuterCp Configures the outer cache policy. +* \param InnerCp Configures the inner cache policy. +* \param IsShareable Configures the memory as shareable or non-shareable. +*/ +#define ARM_MPU_ACCESS_NORMAL(OuterCp, InnerCp, IsShareable) ARM_MPU_ACCESS_((4U | (OuterCp)), IsShareable, ((InnerCp) >> 1U), ((InnerCp) & 1U)) + +/** +* MPU Memory Access Attribute non-cacheable policy. +*/ +#define ARM_MPU_CACHEP_NOCACHE 0U + +/** +* MPU Memory Access Attribute write-back, write and read allocate policy. +*/ +#define ARM_MPU_CACHEP_WB_WRA 1U + +/** +* MPU Memory Access Attribute write-through, no write allocate policy. +*/ +#define ARM_MPU_CACHEP_WT_NWA 2U + +/** +* MPU Memory Access Attribute write-back, no write allocate policy. +*/ +#define ARM_MPU_CACHEP_WB_NWA 3U + + +/** +* Struct for a single MPU Region +*/ +typedef struct { + uint32_t RBAR; //!< The region base address register value (RBAR) + uint32_t RASR; //!< The region attribute and size register value (RASR) \ref MPU_RASR +} ARM_MPU_Region_t; + +/** Enable the MPU. +* \param MPU_Control Default access permissions for unconfigured regions. 
+*/ +__STATIC_INLINE void ARM_MPU_Enable(uint32_t MPU_Control) +{ + __DMB(); + MPU->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk; +#ifdef SCB_SHCSR_MEMFAULTENA_Msk + SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk; +#endif + __DSB(); + __ISB(); +} + +/** Disable the MPU. +*/ +__STATIC_INLINE void ARM_MPU_Disable(void) +{ + __DMB(); +#ifdef SCB_SHCSR_MEMFAULTENA_Msk + SCB->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk; +#endif + MPU->CTRL &= ~MPU_CTRL_ENABLE_Msk; + __DSB(); + __ISB(); +} + +/** Clear and disable the given MPU region. +* \param rnr Region number to be cleared. +*/ +__STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr) +{ + MPU->RNR = rnr; + MPU->RASR = 0U; +} + +/** Configure an MPU region. +* \param rbar Value for RBAR register. +* \param rasr Value for RASR register. +*/ +__STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rbar, uint32_t rasr) +{ + MPU->RBAR = rbar; + MPU->RASR = rasr; +} + +/** Configure the given MPU region. +* \param rnr Region number to be configured. +* \param rbar Value for RBAR register. +* \param rasr Value for RASR register. +*/ +__STATIC_INLINE void ARM_MPU_SetRegionEx(uint32_t rnr, uint32_t rbar, uint32_t rasr) +{ + MPU->RNR = rnr; + MPU->RBAR = rbar; + MPU->RASR = rasr; +} + +/** Memcopy with strictly ordered memory access, e.g. for register targets. +* \param dst Destination data is copied to. +* \param src Source data is copied from. +* \param len Amount of data words to be copied. +*/ +__STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len) +{ + uint32_t i; + for (i = 0U; i < len; ++i) + { + dst[i] = src[i]; + } +} + +/** Load the given number of MPU regions from a table. +* \param table Pointer to the MPU configuration table. +* \param cnt Amount of regions to be configured. 
+*/ +__STATIC_INLINE void ARM_MPU_Load(ARM_MPU_Region_t const* table, uint32_t cnt) +{ + const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U; + while (cnt > MPU_TYPE_RALIASES) { + ARM_MPU_OrderedMemcpy(&(MPU->RBAR), &(table->RBAR), MPU_TYPE_RALIASES*rowWordSize); + table += MPU_TYPE_RALIASES; + cnt -= MPU_TYPE_RALIASES; + } + ARM_MPU_OrderedMemcpy(&(MPU->RBAR), &(table->RBAR), cnt*rowWordSize); +} + +#endif diff --git a/common/mps2/startup_MPS2.S b/common/mps2/startup_MPS2.S new file mode 100644 index 0000000..fd38c39 --- /dev/null +++ b/common/mps2/startup_MPS2.S @@ -0,0 +1,206 @@ +/* + * MPS2 CMSIS Library + */ +/* + * Copyright (c) 2009-2018 ARM Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * This file is derivative of CMSIS V5.00 startup_ARMCM3.S + */ + .syntax unified + .arch armv7-m + + .section .vector_table,"a",%progbits + .align 2 + .globl __isr_vector +__isr_vector: + .long __StackTop /* Top of Stack */ + .long Reset_Handler /* Reset Handler */ + .long NMI_Handler /* NMI Handler */ + .long HardFault_Handler /* Hard Fault Handler */ + .long MemManage_Handler /* MPU Fault Handler */ + .long BusFault_Handler /* Bus Fault Handler */ + .long UsageFault_Handler /* Usage Fault Handler */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long SVC_Handler /* SVCall Handler */ + .long DebugMon_Handler /* Debug Monitor Handler */ + .long 0 /* Reserved */ + .long PendSV_Handler /* PendSV Handler */ + .long SysTick_Handler /* SysTick Handler */ + + /* External Interrupts */ + .long UARTRX0_Handler /* UART 0 RX Handler */ + .long UARTTX0_Handler /* UART 0 TX Handler */ + .long UARTRX1_Handler /* UART 1 RX Handler */ + .long UARTTX1_Handler /* UART 1 TX Handler */ + .long UARTRX2_Handler /* UART 2 RX Handler */ + .long UARTTX2_Handler /* UART 2 TX Handler */ + .long PORT0_COMB_Handler /* GPIO Port 0 Combined Handler */ + .long PORT1_COMB_Handler /* GPIO Port 1 Combined Handler */ + .long TIMER0_Handler /* TIMER 0 handler */ + .long TIMER1_Handler /* TIMER 1 handler */ + .long DUALTIMER_HANDLER /* Dual timer handler */ + .long SPI_Handler /* SPI exceptions Handler */ + .long UARTOVF_Handler /* UART 0,1,2 Overflow Handler */ + .long ETHERNET_Handler /* Ethernet Overflow Handler */ + .long I2S_Handler /* I2S Handler */ + .long TSC_Handler /* Touch Screen handler */ + .long PORT2_COMB_Handler /* GPIO Port 2 Combined Handler */ + .long PORT3_COMB_Handler /* GPIO Port 3 Combined Handler */ + .long UARTRX3_Handler /* UART 3 RX Handler */ + .long UARTTX3_Handler /* UART 3 TX Handler */ + .long UARTRX4_Handler /* UART 4 RX Handler */ + .long UARTTX4_Handler /* UART 4 TX Handler */ + .long ADCSPI_Handler 
/* SHIELD ADC SPI exceptions Handler */ + .long SHIELDSPI_Handler /* SHIELD SPI exceptions Handler */ + .long PORT0_0_Handler /* GPIO Port 0 pin 0 Handler */ + .long PORT0_1_Handler /* GPIO Port 0 pin 1 Handler */ + .long PORT0_2_Handler /* GPIO Port 0 pin 2 Handler */ + .long PORT0_3_Handler /* GPIO Port 0 pin 3 Handler */ + .long PORT0_4_Handler /* GPIO Port 0 pin 4 Handler */ + .long PORT0_5_Handler /* GPIO Port 0 pin 5 Handler */ + .long PORT0_6_Handler /* GPIO Port 0 pin 6 Handler */ + .long PORT0_7_Handler /* GPIO Port 0 pin 7 Handler */ + + .size __isr_vector, . - __isr_vector + + .section .text.Reset_Handler + .thumb + .thumb_func + .align 2 + .globl Reset_Handler + .type Reset_Handler, %function +Reset_Handler: +/* + * Loop to copy data from read only memory to RAM. The ranges + * of copy from/to are specified by following symbols evaluated in + * linker script. + * _etext: End of code section, i.e., begin of data sections to copy from. + * __data_start__/__data_end__: RAM address range that data should be + * copied to. Both must be aligned to 4 bytes boundary. + */ +#if !defined(DATA_IN_FLASH) + ldr r1, =__etext + ldr r2, =__data_start__ + ldr r3, =__data_end__ + + subs r3, r2 + ble .Lflash_to_ram_loop_end + + movs r4, 0 +.Lflash_to_ram_loop: + ldr r0, [r1,r4] + str r0, [r2,r4] + adds r4, 4 + cmp r4, r3 + blt .Lflash_to_ram_loop +.Lflash_to_ram_loop_end: +#endif /* DATA_IN_FLASH */ + +/* Initialize .bss */ +init_bss: + ldr r1, =__bss_start__ + ldr r2, =__bss_end__ + ldr r3, =bss_size + + cmp r3, #0 + beq system_startup + + mov r4, #0 +zero: + strb r4, [r1], #1 + subs r3, r3, #1 + bne zero + +system_startup: + ldr r0, =SystemInit + blx r0 + ldr r0, =_start + bx r0 + .pool + .size Reset_Handler, . - Reset_Handler + + .text +/* + * Macro to define default handlers. Default handler + * will be weak symbol and just dead loops. 
They can be + * overwritten by other handlers + */ + .macro def_default_handler handler_name + .align 1 + .thumb_func + .weak \handler_name + .type \handler_name, %function +\handler_name : + b . + .size \handler_name, . - \handler_name + .endm + + def_default_handler NMI_Handler + def_default_handler HardFault_Handler + def_default_handler MemManage_Handler + def_default_handler BusFault_Handler + def_default_handler UsageFault_Handler + def_default_handler SVC_Handler + def_default_handler DebugMon_Handler + def_default_handler PendSV_Handler + def_default_handler SysTick_Handler + def_default_handler Default_Handler + + .macro def_irq_default_handler handler_name + .weak \handler_name + .set \handler_name, Default_Handler + .endm + + /* External interrupts */ + def_irq_default_handler UARTRX0_Handler /* 0: UART 0 RX Handler */ + def_irq_default_handler UARTTX0_Handler /* 1: UART 0 TX Handler */ + def_irq_default_handler UARTRX1_Handler /* 2: UART 1 RX Handler */ + def_irq_default_handler UARTTX1_Handler /* 3: UART 1 TX Handler */ + def_irq_default_handler UARTRX2_Handler /* 4: UART 2 RX Handler */ + def_irq_default_handler UARTTX2_Handler /* 5: UART 2 TX Handler */ + def_irq_default_handler PORT0_COMB_Handler /* 6: GPIO Port 0 Combined Handler */ + def_irq_default_handler PORT1_COMB_Handler /* 7: GPIO Port 1 Combined Handler */ + def_irq_default_handler TIMER0_Handler /* 8: TIMER 0 handler */ + def_irq_default_handler TIMER1_Handler /* 9: TIMER 1 handler */ + def_irq_default_handler DUALTIMER_HANDLER /* 10: Dual timer handler */ + def_irq_default_handler SPI_Handler /* 11: SPI exceptions Handler */ + def_irq_default_handler UARTOVF_Handler /* 12: UART 0,1,2 Overflow Handler */ + def_irq_default_handler ETHERNET_Handler /* 13: Ethernet Overflow Handler */ + def_irq_default_handler I2S_Handler /* 14: I2S Handler */ + def_irq_default_handler TSC_Handler /* 15: Touch Screen handler */ + def_irq_default_handler PORT2_COMB_Handler /* 16: GPIO Port 2 Combined Handler 
*/ + def_irq_default_handler PORT3_COMB_Handler /* 17: GPIO Port 3 Combined Handler */ + def_irq_default_handler UARTRX3_Handler /* 18: UART 3 RX Handler */ + def_irq_default_handler UARTTX3_Handler /* 19: UART 3 TX Handler */ + def_irq_default_handler UARTRX4_Handler /* 20: UART 4 RX Handler */ + def_irq_default_handler UARTTX4_Handler /* 21: UART 4 TX Handler */ + def_irq_default_handler ADCSPI_Handler /* 22: SHIELD ADC SPI exceptions Handler */ + def_irq_default_handler SHIELDSPI_Handler /* 23: SHIELD SPI exceptions Handler */ + def_irq_default_handler PORT0_0_Handler /* 24: GPIO Port 0 pin 0 Handler */ + def_irq_default_handler PORT0_1_Handler /* 25: GPIO Port 0 pin 1 Handler */ + def_irq_default_handler PORT0_2_Handler /* 26: GPIO Port 0 pin 2 Handler */ + def_irq_default_handler PORT0_3_Handler /* 27: GPIO Port 0 pin 3 Handler */ + def_irq_default_handler PORT0_4_Handler /* 28: GPIO Port 0 pin 4 Handler */ + def_irq_default_handler PORT0_5_Handler /* 29: GPIO Port 0 pin 5 Handler */ + def_irq_default_handler PORT0_6_Handler /* 30: GPIO Port 0 pin 6 Handler */ + def_irq_default_handler PORT0_7_Handler /* 31: GPIO Port 0 pin 7 Handler */ + + .end diff --git a/common/randombytes.c b/common/randombytes.c new file mode 100644 index 0000000..b27d4f0 --- /dev/null +++ b/common/randombytes.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +#include "randombytes.h" + +#if defined(STM32F7) + +#include + +// TODO Maybe we do not want to use the hardware RNG for all randomness, but +// instead only read a seed and then expand that using fips202. 
+ +int randombytes(uint8_t *obuf, size_t len) { + union { + unsigned char aschar[4]; + uint32_t asint; + } random; + + while (len > 4) { + random.asint = rng_get_random_blocking(); + *obuf++ = random.aschar[0]; + *obuf++ = random.aschar[1]; + *obuf++ = random.aschar[2]; + *obuf++ = random.aschar[3]; + len -= 4; + } + if (len > 0) { + for (random.asint = rng_get_random_blocking(); len > 0; --len) { + *obuf++ = random.aschar[len - 1]; + } + } + + return 0; +} + +#else /* NONRANDOM FALLBACK IMPLEMENTATION */ +#warning Using a non-random randombytes + +#include <string.h> + +static uint32_t seed[32] = {3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3, + 2, 3, 8, 4, 6, 2, 6, 4, 3, 3, 8, 3, 2, 7, 9, 5}; +static uint32_t in[12]; +static uint8_t out_buf[sizeof(uint32_t) * 16]; +static int32_t outleft = 0; + +#define ROTATE(x, b) (((x) << (b)) | ((x) >> (32 - (b)))) +#define MUSH(i, b) x = t[i] += (((x ^ seed[i]) + sum) ^ ROTATE(x, b)); + +static void surf(uint32_t out[8]) { + uint32_t t[12]; + uint32_t x; + uint32_t sum = 0; + int32_t r; + int32_t i; + int32_t loop; + + for (i = 0; i < 12; ++i) { + t[i] = in[i] ^ seed[12 + i]; + } + for (i = 0; i < 8; ++i) { + out[i] = seed[24 + i]; + } + x = t[11]; + for (loop = 0; loop < 2; ++loop) { + for (r = 0; r < 16; ++r) { + sum += 0x9e3779b9; + MUSH(0, 5) + MUSH(1, 7) + MUSH(2, 9) + MUSH(3, 13) + MUSH(4, 5) + MUSH(5, 7) + MUSH(6, 9) + MUSH(7, 13) + MUSH(8, 5) + MUSH(9, 7) + MUSH(10, 9) + MUSH(11, 13) + } + for (i = 0; i < 8; ++i) { + out[i] ^= t[i + 4]; + } + } +} + +void randombytes_regen(void); +void randombytes_regen(void) { + uint32_t out[8]; + if (!++in[0]) { + if (!++in[1]) { + if (!++in[2]) { + ++in[3]; + } + } + } + surf(out); + memcpy(out_buf, out, sizeof(out)); + if (!++in[0]) { + if (!++in[1]) { + if (!++in[2]) { + ++in[3]; + } + } + } + surf(out); + memcpy(out_buf + sizeof(out), out, sizeof(out)); + outleft = sizeof(out_buf); +} + +int randombytes(uint8_t *buf, size_t xlen) { + while (xlen > 0) { + if (!outleft) { +
randombytes_regen(); + } + *buf = out_buf[--outleft]; + ++buf; + --xlen; + } + return 0; +} + +#endif diff --git a/common/test.c b/common/test.c new file mode 100644 index 0000000..1a24c27 --- /dev/null +++ b/common/test.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +#include +#include +#include +#include + +#if defined(SRAM_TIMING_TEST) +#define TEST_BLOCK_SIZE 4096 + +/* Don't use opencm3 here, since not all platforms might use opencm3, but all + have these DWT registers */ +#define DWT_CTRL (*(volatile uint32_t*)(0xE0001000u + 0x00)) +#define DWT_CYCCNT (*(volatile uint32_t*)(0xE0001000u + 0x04)) +#define DWT_CTRL_CYCCNTENA (1 << 0) +#define SCS_DEMCR (*(volatile uint32_t*)(0xE000E000u + 0xDFC)) +#define SCS_DEMCR_TRCENA (1 << 24) +/* Need a really precise cycle counter. */ +static void cyccnt_enable() +{ + SCS_DEMCR |= SCS_DEMCR_TRCENA; + DWT_CYCCNT = 0; + DWT_CTRL |= DWT_CTRL_CYCCNTENA; +} +static inline void cyccnt_start() +{ + DWT_CYCCNT = 0; +} +static inline uint32_t cyccnt_get() +{ + return DWT_CYCCNT; +} + +__attribute__((noinline)) +static uint32_t test_load(volatile unsigned* ram_block) +{ + asm volatile("cpsid if"); + cyccnt_start(); +#define NL "\n\t" + asm volatile("_MEMLOOP%=:" NL + "ldr r12, [%0], #4" NL + "cmp %0, %1" NL + "bne _MEMLOOP%=" NL + :"+r" (ram_block): "r" (ram_block + (TEST_BLOCK_SIZE / sizeof(unsigned))): "r12", "cc"); + uint32_t result = cyccnt_get(); + asm volatile("cpsie if"); + return result; +} + +__attribute__((noinline)) +static uint32_t test_unalignedload(volatile void* ram_block) +{ + volatile unsigned char* ram_block8 = ram_block; + ram_block8 += 2; + asm volatile("cpsid if"); + cyccnt_start(); +#define NL "\n\t" + asm volatile("_MEMLOOP%=:" NL + "ldr r12, [%0], #4" NL + "cmp %0, %1" NL + "blt _MEMLOOP%=" NL + :"+r" (ram_block8): "r" (ram_block8 + TEST_BLOCK_SIZE): "r12", "cc"); + uint32_t result = cyccnt_get(); + asm volatile("cpsie if"); + return result; +} + +__attribute__((noinline)) +static 
uint32_t test_store(volatile unsigned* ram_block) +{ + cyccnt_start(); +#define NL "\n\t" + asm volatile("_MEMLOOP%=:" NL + "str r12, [%0], #4" NL + "cmp %0, %1" NL + "bne _MEMLOOP%=" NL + :"+r" (ram_block): "r" (ram_block + (TEST_BLOCK_SIZE / sizeof(unsigned))): "r12", "cc"); + return cyccnt_get(); +} + +__attribute__((noinline)) +static uint32_t test_loadstore(volatile unsigned* ram_block) +{ + cyccnt_start(); +#define NL "\n\t" + asm volatile("_MEMLOOP%=:" NL + "str r12, [%0]" NL + "add r12, r12, #1" NL + "ldr r12, [%0], #4" NL + "cmp %0, %1" NL + "bne _MEMLOOP%=" NL + :"+r" (ram_block): "r" (ram_block + (TEST_BLOCK_SIZE / sizeof(unsigned))): "r12", "cc"); + return cyccnt_get(); +} + +static void memory_timing_test(void) +{ + cyccnt_enable(); +#define RAMBLK(BLK) \ + static volatile unsigned ram ## BLK ## _block[TEST_BLOCK_SIZE / sizeof(unsigned) + 1] __attribute__((section(".ram" #BLK))) + +#define TEST(BLK) \ + test_load(ram ## BLK ## _block); \ + test_unalignedload(ram ## BLK ## _block); \ + test_store(ram ## BLK ## _block); \ + test_loadstore(ram ## BLK ## _block); \ + send_unsigned("ram" #BLK " load", test_load(ram ## BLK ## _block)); \ + send_unsigned("ram" #BLK " unalignedload", test_unalignedload(ram ## BLK ## _block)); \ + send_unsigned("ram" #BLK " store", test_store(ram ## BLK ## _block)); \ + send_unsigned("ram" #BLK " loadstore", test_loadstore(ram ## BLK ## _block)); + + static volatile unsigned ram1_block[TEST_BLOCK_SIZE / sizeof(unsigned) + 1]; + TEST(1); +#if defined(HAS_SRAM2) + RAMBLK(2); + TEST(2); +#endif +#if defined(HAS_SRAM3) + RAMBLK(3); + TEST(3); +#endif +#if defined(HAS_CCM) + static volatile unsigned ramccm_block[TEST_BLOCK_SIZE / sizeof(unsigned) + 1] __attribute__((section(".ccmram"))); + TEST(ccm); +#endif +} +#endif + +#ifndef CLOCK_TEST +#define CLOCK_TEST CLOCK_BENCHMARK +#endif + +void stacktest(size_t size) +{ + volatile uint32_t mem[size] __attribute__((unused)); + for (unsigned i = 0; i < size; ++i) { + mem[i] = 0; + } +} + 
+int main(void) +{ + hal_setup(CLOCK_TEST); + hal_send_str("Hello world"); + send_unsigned("Stack Size", hal_get_stack_size()); + unsigned rnd; + randombytes((unsigned char*) &rnd, sizeof(unsigned)); + send_unsigned("Random number", rnd); + size_t stack; + hal_spraystack(); + stacktest(100); + stack = hal_checkstack(); + send_unsigned("stackusage1", stack); + hal_spraystack(); + stacktest(200); + stack = hal_checkstack(); + send_unsigned("stackusage2", stack); +#if defined(SRAM_TIMING_TEST) + memory_timing_test(); +#endif + return 0; +} diff --git a/common/testfast.c b/common/testfast.c new file mode 120000 index 0000000..aeebb26 --- /dev/null +++ b/common/testfast.c @@ -0,0 +1 @@ +test.c \ No newline at end of file diff --git a/convert_benchmarks.py b/convert_benchmarks.py new file mode 100755 index 0000000..6d9667f --- /dev/null +++ b/convert_benchmarks.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +import sys +from mupq import mupq + +def usage(): + print("Usage: python3 convert_benchmarks.py csv|md") + sys.exit(1) + +if __name__ == "__main__": + if len(sys.argv) != 2: + usage() + if sys.argv[1] == "csv": + converter = mupq.CsvConverter() + elif sys.argv[1] == "md": + converter = mupq.MarkdownConverter() + else: + usage() + converter.convert() diff --git a/crypto_kem/ml-kem-1024/m4fspeed/api.h b/crypto_kem/ml-kem-1024/m4fspeed/api.h new file mode 100644 index 0000000..92ea9be --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/api.h @@ -0,0 +1,20 @@ +#ifndef API_H +#define API_H + +#include "params.h" + +#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES +#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES +#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES +#define CRYPTO_BYTES KYBER_SSBYTES + +#define CRYPTO_ALGNAME "Kyber1024" + +int crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); + +int crypto_kem_dec(unsigned char 
*ss, const unsigned char *ct, const unsigned char *sk); + + +#endif diff --git a/crypto_kem/ml-kem-1024/m4fspeed/cbd.c b/crypto_kem/ml-kem-1024/m4fspeed/cbd.c new file mode 120000 index 0000000..801f7f8 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/cbd.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/cbd.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/cbd.h b/crypto_kem/ml-kem-1024/m4fspeed/cbd.h new file mode 120000 index 0000000..4f9e3af --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/cbd.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/cbd.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S b/crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S new file mode 120000 index 0000000..bdef6f4 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/cmov_int16.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S b/crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S new file mode 120000 index 0000000..aa55564 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastaddsub.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S b/crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S new file mode 120000 index 0000000..4384e1d --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastbasemul.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S b/crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S new file mode 120000 index 0000000..ede60d7 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/fastntt.S b/crypto_kem/ml-kem-1024/m4fspeed/fastntt.S new file mode 120000 index 0000000..d34524f --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/fastntt.S @@ 
-0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/indcpa.c b/crypto_kem/ml-kem-1024/m4fspeed/indcpa.c new file mode 120000 index 0000000..25db6b1 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/indcpa.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/indcpa.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/indcpa.h b/crypto_kem/ml-kem-1024/m4fspeed/indcpa.h new file mode 120000 index 0000000..e6f3662 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/indcpa.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/indcpa.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/kem.c b/crypto_kem/ml-kem-1024/m4fspeed/kem.c new file mode 120000 index 0000000..489b6f9 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/kem.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/kem.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/macros.i b/crypto_kem/ml-kem-1024/m4fspeed/macros.i new file mode 120000 index 0000000..a7d8e74 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/macros.i @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/macros.i \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/matacc.c b/crypto_kem/ml-kem-1024/m4fspeed/matacc.c new file mode 120000 index 0000000..71d7234 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/matacc.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/matacc.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/matacc.h b/crypto_kem/ml-kem-1024/m4fspeed/matacc.h new file mode 120000 index 0000000..19b6772 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/matacc.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/matacc.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/matacc.i b/crypto_kem/ml-kem-1024/m4fspeed/matacc.i new file mode 120000 index 0000000..39b6e23 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/matacc.i @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/matacc.i \ 
No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S b/crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S new file mode 120000 index 0000000..3c7d05e --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/matacc_asm.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/ntt.c b/crypto_kem/ml-kem-1024/m4fspeed/ntt.c new file mode 120000 index 0000000..971c6b0 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/ntt.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/ntt.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/ntt.h b/crypto_kem/ml-kem-1024/m4fspeed/ntt.h new file mode 120000 index 0000000..11e111d --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/ntt.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/ntt.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/params.h b/crypto_kem/ml-kem-1024/m4fspeed/params.h new file mode 100644 index 0000000..a3153e7 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/params.h @@ -0,0 +1,31 @@ +#ifndef PARAMS_H +#define PARAMS_H + +#define KYBER_K 4 /* Change this for different security strengths */ + +/* Don't change parameters below this line */ + +#define KYBER_N 256 +#define KYBER_Q 3329 + +#define KYBER_ETA 2 + +#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ +#define KYBER_SSBYTES 32 /* size in bytes of shared key */ + +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) + +#define KYBER_POLYCOMPRESSEDBYTES 160 +#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352) + +#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) +#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) + +#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) +#define KYBER_SECRETKEYBYTES 
(KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES + +#endif diff --git a/crypto_kem/ml-kem-1024/m4fspeed/poly.c b/crypto_kem/ml-kem-1024/m4fspeed/poly.c new file mode 120000 index 0000000..b432b8a --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/poly.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/poly.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/poly.h b/crypto_kem/ml-kem-1024/m4fspeed/poly.h new file mode 120000 index 0000000..6003dc3 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/poly.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/poly.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S b/crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S new file mode 120000 index 0000000..c4bda05 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/poly_asm.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/polyvec.c b/crypto_kem/ml-kem-1024/m4fspeed/polyvec.c new file mode 120000 index 0000000..c3f7d0a --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/polyvec.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/polyvec.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/polyvec.h b/crypto_kem/ml-kem-1024/m4fspeed/polyvec.h new file mode 120000 index 0000000..47cf6c3 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/polyvec.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/polyvec.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/reduce.S b/crypto_kem/ml-kem-1024/m4fspeed/reduce.S new file mode 120000 index 0000000..2edf10c --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/reduce.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/reduce.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/symmetric-fips202.c b/crypto_kem/ml-kem-1024/m4fspeed/symmetric-fips202.c new file 
mode 120000 index 0000000..5adc9ae --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/symmetric-fips202.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/symmetric-fips202.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/symmetric.h b/crypto_kem/ml-kem-1024/m4fspeed/symmetric.h new file mode 120000 index 0000000..698a10d --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/symmetric.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/symmetric.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/verify.c b/crypto_kem/ml-kem-1024/m4fspeed/verify.c new file mode 120000 index 0000000..85d7f50 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/verify.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/verify.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fspeed/verify.h b/crypto_kem/ml-kem-1024/m4fspeed/verify.h new file mode 120000 index 0000000..e19a301 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fspeed/verify.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/verify.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/api.h b/crypto_kem/ml-kem-1024/m4fstack/api.h new file mode 120000 index 0000000..cf75db9 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/api.h @@ -0,0 +1 @@ +../m4fspeed/api.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/cbd.c b/crypto_kem/ml-kem-1024/m4fstack/cbd.c new file mode 120000 index 0000000..903fa59 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/cbd.c @@ -0,0 +1 @@ +../m4fspeed/cbd.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/cbd.h b/crypto_kem/ml-kem-1024/m4fstack/cbd.h new file mode 120000 index 0000000..d264c36 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/cbd.h @@ -0,0 +1 @@ +../m4fspeed/cbd.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S b/crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S new file mode 120000 index 0000000..bdef6f4 --- /dev/null +++ 
b/crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/cmov_int16.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S b/crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S new file mode 120000 index 0000000..d1317f7 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S @@ -0,0 +1 @@ +../m4fspeed/fastaddsub.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S b/crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S new file mode 120000 index 0000000..531385d --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/fastbasemul.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S b/crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S new file mode 120000 index 0000000..ede60d7 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/fastntt.S b/crypto_kem/ml-kem-1024/m4fstack/fastntt.S new file mode 120000 index 0000000..d34524f --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/fastntt.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/indcpa.c b/crypto_kem/ml-kem-1024/m4fstack/indcpa.c new file mode 120000 index 0000000..a4103b1 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/indcpa.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/indcpa.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/indcpa.h b/crypto_kem/ml-kem-1024/m4fstack/indcpa.h new file mode 120000 index 0000000..9e56c80 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/indcpa.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/indcpa.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/kem.c b/crypto_kem/ml-kem-1024/m4fstack/kem.c new file mode 120000 index 0000000..302153d --- /dev/null 
+++ b/crypto_kem/ml-kem-1024/m4fstack/kem.c @@ -0,0 +1 @@ +../m4fspeed/kem.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/macros.i b/crypto_kem/ml-kem-1024/m4fstack/macros.i new file mode 120000 index 0000000..6e83891 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/macros.i @@ -0,0 +1 @@ +../m4fspeed/macros.i \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/matacc.c b/crypto_kem/ml-kem-1024/m4fstack/matacc.c new file mode 120000 index 0000000..5558ec8 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/matacc.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/matacc.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/matacc.h b/crypto_kem/ml-kem-1024/m4fstack/matacc.h new file mode 120000 index 0000000..4eb7706 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/matacc.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/matacc.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/matacc.i b/crypto_kem/ml-kem-1024/m4fstack/matacc.i new file mode 120000 index 0000000..0d39b07 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/matacc.i @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/matacc.i \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S b/crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S new file mode 120000 index 0000000..0079bb5 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/matacc_asm.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/ntt.c b/crypto_kem/ml-kem-1024/m4fstack/ntt.c new file mode 120000 index 0000000..c9d6e8a --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/ntt.c @@ -0,0 +1 @@ +../m4fspeed/ntt.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/ntt.h b/crypto_kem/ml-kem-1024/m4fstack/ntt.h new file mode 120000 index 0000000..5fd83c0 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/ntt.h @@ -0,0 +1 @@ +../m4fspeed/ntt.h \ No newline 
at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/params.h b/crypto_kem/ml-kem-1024/m4fstack/params.h new file mode 120000 index 0000000..59dd7f1 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/params.h @@ -0,0 +1 @@ +../m4fspeed/params.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/poly.c b/crypto_kem/ml-kem-1024/m4fstack/poly.c new file mode 120000 index 0000000..df6f119 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/poly.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/poly.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/poly.h b/crypto_kem/ml-kem-1024/m4fstack/poly.h new file mode 120000 index 0000000..ad89400 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/poly.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/poly.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/poly_asm.S b/crypto_kem/ml-kem-1024/m4fstack/poly_asm.S new file mode 120000 index 0000000..167ee5e --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/poly_asm.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/poly_asm.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/polyvec.c b/crypto_kem/ml-kem-1024/m4fstack/polyvec.c new file mode 120000 index 0000000..f398d76 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/polyvec.c @@ -0,0 +1 @@ +../m4fspeed/polyvec.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/polyvec.h b/crypto_kem/ml-kem-1024/m4fstack/polyvec.h new file mode 120000 index 0000000..3113837 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/polyvec.h @@ -0,0 +1 @@ +../m4fspeed/polyvec.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/reduce.S b/crypto_kem/ml-kem-1024/m4fstack/reduce.S new file mode 120000 index 0000000..29ae453 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/reduce.S @@ -0,0 +1 @@ +../m4fspeed/reduce.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/symmetric-fips202.c 
b/crypto_kem/ml-kem-1024/m4fstack/symmetric-fips202.c new file mode 120000 index 0000000..5adc9ae --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/symmetric-fips202.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/symmetric-fips202.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/symmetric.h b/crypto_kem/ml-kem-1024/m4fstack/symmetric.h new file mode 120000 index 0000000..28c6fac --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/symmetric.h @@ -0,0 +1 @@ +../m4fspeed/symmetric.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/verify.c b/crypto_kem/ml-kem-1024/m4fstack/verify.c new file mode 120000 index 0000000..a7a9856 --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/verify.c @@ -0,0 +1 @@ +../m4fspeed/verify.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-1024/m4fstack/verify.h b/crypto_kem/ml-kem-1024/m4fstack/verify.h new file mode 120000 index 0000000..cb2da4b --- /dev/null +++ b/crypto_kem/ml-kem-1024/m4fstack/verify.h @@ -0,0 +1 @@ +../m4fspeed/verify.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/api.h b/crypto_kem/ml-kem-512/m4fspeed/api.h new file mode 100644 index 0000000..3b9244a --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/api.h @@ -0,0 +1,20 @@ +#ifndef API_H +#define API_H + +#include "params.h" + +#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES +#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES +#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES +#define CRYPTO_BYTES KYBER_SSBYTES + +#define CRYPTO_ALGNAME "Kyber512" + +int crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); + +int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); + + +#endif diff --git a/crypto_kem/ml-kem-512/m4fspeed/cbd.c b/crypto_kem/ml-kem-512/m4fspeed/cbd.c new file mode 100644 index 0000000..f8911fc --- /dev/null +++ 
b/crypto_kem/ml-kem-512/m4fspeed/cbd.c @@ -0,0 +1,112 @@ +#include "cbd.h" +#include "params.h" + +#include <stdint.h> + +/************************************************* +* Name: load32_littleendian +* +* Description: load bytes into a 32-bit integer +* in little-endian order +* +* Arguments: - const unsigned char *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x +**************************************************/ +static uint32_t load32_littleendian(const unsigned char *x) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +/************************************************* +* Name: load24_littleendian +* +* Description: load 3 bytes into a 32-bit integer +* in little-endian order +* This function is only needed for Kyber-512 +* +* Arguments: - const uint8_t *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x (most significant byte is zero) +**************************************************/ +static uint32_t load24_littleendian(const uint8_t x[3]) +{ + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + + + +/************************************************* +* Name: cbd_eta1 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter KYBER_ETA1 +* specialized for KYBER_ETA1=3 +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *buf: pointer to input byte array +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void cbd_eta1(poly *r, const unsigned char *buf, int add) { + unsigned int i,j; + uint32_t t,d; + int16_t a,b; + + for(i=0;i<KYBER_N/4;i++) { + t = load24_littleendian(buf+3*i); + d = t & 0x00249249; + d += (t>>1) & 0x00249249; + d += (t>>2) & 0x00249249; + + for(j=0;j<4;j++) { + a = (d >> (6*j+0)) & 0x7; + b = (d >> (6*j+3)) & 0x7; + if
(!add) + r->coeffs[4 * i + j] = 0; + r->coeffs[4 * i + j] = r->coeffs[4 * i + j] + (a - b); + } + } +} + +/************************************************* +* Name: cbd_eta2 +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter KYBER_ETA2 +* specialized for KYBER_ETA2=2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *buf: pointer to input byte array +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void cbd_eta2(poly *r, const unsigned char *buf, int add) { + uint32_t d, t; + int16_t a, b; + int i, j; + + for (i = 0; i < KYBER_N / 8; i++) { + t = load32_littleendian(buf + 4 * i); + d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) { + a = (d >> 4 * j) & 0x3; + b = (d >> (4 * j + 2)) & 0x3; + if (!add) + r->coeffs[8 * i + j] = 0; + r->coeffs[8 * i + j] = r->coeffs[8 * i + j] + (a - b); + } + } +} + + diff --git a/crypto_kem/ml-kem-512/m4fspeed/cbd.h b/crypto_kem/ml-kem-512/m4fspeed/cbd.h new file mode 100644 index 0000000..47f1d24 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/cbd.h @@ -0,0 +1,9 @@ +#ifndef CBD_H +#define CBD_H + +#include "poly.h" + +void cbd_eta1(poly *r, const unsigned char *buf, int add); +void cbd_eta2(poly *r, const unsigned char *buf, int add); + +#endif diff --git a/crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S b/crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S new file mode 120000 index 0000000..bdef6f4 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/cmov_int16.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S b/crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S new file mode 120000 index 0000000..aa55564 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S @@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastaddsub.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S b/crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S new file mode 120000 index 0000000..4384e1d --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastbasemul.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S b/crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S new file mode 120000 index 0000000..ede60d7 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/fastntt.S b/crypto_kem/ml-kem-512/m4fspeed/fastntt.S new file mode 120000 index 0000000..d34524f --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/fastntt.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/fastntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/indcpa.c b/crypto_kem/ml-kem-512/m4fspeed/indcpa.c new file mode 100644 index 0000000..99f5b3c --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/indcpa.c @@ -0,0 +1,246 @@ +#include "indcpa.h" +#include "ntt.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "symmetric.h" +#include "matacc.h" + +#include +#include + + +/************************************************* +* Name: indcpa_keypair_derand +* +* Description: Generates public and private key for the CPA-secure +* public-key encryption scheme underlying Kyber +* +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key +* (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* - const uint8_t *coins: pointer to input randomness +* (of length KYBER_SYMBYTES bytes) +**************************************************/ +void indcpa_keypair_derand(unsigned char *pk, + unsigned char *sk, + const unsigned char *coins){ + polyvec 
skpv, skpv_prime; + poly pkp; + unsigned char buf[2 * KYBER_SYMBYTES]; + unsigned char *publicseed = buf; + unsigned char *noiseseed = buf + KYBER_SYMBYTES; + int i; + unsigned char nonce = 0; + + memcpy(buf, coins, KYBER_SYMBYTES); + buf[KYBER_SYMBYTES] = KYBER_K; + hash_g(buf, buf, KYBER_SYMBYTES + 1); + + for (i = 0; i < KYBER_K; i++) + poly_getnoise_eta1(skpv.vec + i, noiseseed, nonce++); + + polyvec_ntt(&skpv); + + // i = 0 + matacc_cache32(&pkp, &skpv, &skpv_prime, 0, publicseed, 0); + poly_invntt(&pkp); + + poly_addnoise_eta1(&pkp, noiseseed, nonce++); + poly_ntt(&pkp); + + poly_tobytes(pk, &pkp); + for (i = 1; i < KYBER_K; i++) { + matacc_opt32(&pkp, &skpv, &skpv_prime, i, publicseed, 0); + poly_invntt(&pkp); + + poly_addnoise_eta1(&pkp, noiseseed, nonce++); + poly_ntt(&pkp); + + poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp); + } + + polyvec_tobytes(sk, &skpv); + memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key +} + +/************************************************* +* Name: indcpa_enc +* +* Description: Encryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
+* +* Arguments: - unsigned char *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) +* - const unsigned char *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) +* - const unsigned char *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) +* to deterministically generate all randomness +**************************************************/ +void indcpa_enc(unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins) { + polyvec sp, sp_prime; + poly bp; + poly *pkp = &bp; + poly *k = &bp; + poly *v = &sp.vec[0]; + const unsigned char *seed = pk+KYBER_POLYVECBYTES; + int i; + unsigned char nonce = 0; + + for (i = 0; i < KYBER_K; i++) + poly_getnoise_eta1(sp.vec + i, coins, nonce++); + + polyvec_ntt(&sp); + + // i = 0 + matacc_cache32(&bp, &sp, &sp_prime, 0, seed, 1); + poly_invntt(&bp); + poly_addnoise_eta2(&bp, coins, nonce++); + poly_reduce(&bp); + poly_packcompress(c, &bp, 0); + for (i = 1; i < KYBER_K; i++) { + matacc_opt32(&bp, &sp, &sp_prime, i, seed, 1); + poly_invntt(&bp); + + poly_addnoise_eta2(&bp, coins, nonce++); + poly_reduce(&bp); + + poly_packcompress(c, &bp, i); + } + + poly_frombytes(pkp, pk); + int32_t v_tmp[KYBER_N]; + + poly_basemul_opt_16_32(v_tmp, &sp.vec[0], pkp, &sp_prime.vec[0]); + for (i = 1; i < KYBER_K - 1; i++) { + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc_opt_32_32(v_tmp, &sp.vec[i], pkp, &sp_prime.vec[i]); + } + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc_opt_32_16(v, &sp.vec[i], pkp, &sp_prime.vec[i], v_tmp); + + poly_invntt(v); + + poly_addnoise_eta2(v, coins, nonce++); + + poly_frommsg(k, m); + poly_add(v, v, k); + poly_reduce(v); + + poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); +} + +/************************************************* +* Name: indcpa_enc_cmp +* +* Description: 
Re-encryption function. +* Compares the re-encrypted ciphertext with the original ciphertext byte by byte. +* The comparison is performed in a constant time manner. +* +* +* Arguments: - const unsigned char *c: pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes) +* - const unsigned char *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) +* - const unsigned char *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - const unsigned char *coins: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) +* to deterministically generate all randomness +* Returns: - boolean byte indicating that re-encrypted ciphertext is NOT equal to the original ciphertext +**************************************************/ +unsigned char indcpa_enc_cmp(const unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins) { + uint64_t rc = 0; + polyvec sp, sp_prime; + poly bp; + poly *pkp = &bp; + poly *k = &bp; + poly *v = &sp.vec[0]; + const unsigned char *seed = pk+KYBER_POLYVECBYTES; + int i; + unsigned char nonce = 0; + + for (i = 0; i < KYBER_K; i++) + poly_getnoise_eta1(sp.vec + i, coins, nonce++); + + polyvec_ntt(&sp); + // i = 0 + matacc_cache32(&bp, &sp, &sp_prime, 0, seed, 1); + poly_invntt(&bp); + poly_addnoise_eta2(&bp, coins, nonce++); + poly_reduce(&bp); + rc |= cmp_poly_packcompress(c, &bp, 0); + for (i = 1; i < KYBER_K; i++) { + matacc_opt32(&bp, &sp, &sp_prime, i, seed, 1); + poly_invntt(&bp); + + poly_addnoise_eta2(&bp, coins, nonce++); + poly_reduce(&bp); + + rc |= cmp_poly_packcompress(c, &bp, i); + } + + poly_frombytes(pkp, pk); + int32_t v_tmp[KYBER_N]; + + poly_basemul_opt_16_32(v_tmp, &sp.vec[0], pkp, &sp_prime.vec[0]); + for (i = 1; i < KYBER_K - 1; i++) { + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc_opt_32_32(v_tmp, &sp.vec[i], pkp, &sp_prime.vec[i]); + } + poly_frombytes(pkp, pk + 
i*KYBER_POLYBYTES); + poly_basemul_acc_opt_32_16(v, &sp.vec[i], pkp, &sp_prime.vec[i], v_tmp); + + poly_invntt(v); + + poly_addnoise_eta2(v, coins, nonce++); + poly_frommsg(k, m); + poly_add(v, v, k); + poly_reduce(v); + + rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); + + rc = ~rc + 1; + rc >>= 63; + return (unsigned char)rc; +} + +/************************************************* +* Name: indcpa_dec +* +* Description: Decryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. +* +* Arguments: - unsigned char *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) +* - const unsigned char *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) +* - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +**************************************************/ +void __attribute__ ((noinline)) indcpa_dec(unsigned char *m, + const unsigned char *c, + const unsigned char *sk) { + poly mp, bp; + poly *v = &bp; + int32_t r_tmp[KYBER_N]; + int i; + + poly_unpackdecompress(&mp, c, 0); + poly_ntt(&mp); + poly_frombytes_mul_16_32(r_tmp, &mp, sk); + for(i = 1; i < KYBER_K - 1; i++) { + poly_unpackdecompress(&bp, c, i); + poly_ntt(&bp); + poly_frombytes_mul_32_32(r_tmp, &bp, sk + i*KYBER_POLYBYTES); + } + poly_unpackdecompress(&bp, c, i); + poly_ntt(&bp); + poly_frombytes_mul_32_16(&mp, &bp, sk + i*KYBER_POLYBYTES, r_tmp); + + poly_invntt(&mp); + poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES); + poly_sub(&mp, v, &mp); + poly_reduce(&mp); + + poly_tomsg(m, &mp); +} diff --git a/crypto_kem/ml-kem-512/m4fspeed/indcpa.h b/crypto_kem/ml-kem-512/m4fspeed/indcpa.h new file mode 120000 index 0000000..e6f3662 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/indcpa.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/indcpa.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/kem.c b/crypto_kem/ml-kem-512/m4fspeed/kem.c new file mode 120000 index 0000000..489b6f9 --- 
/dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/kem.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/kem.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/macros.i b/crypto_kem/ml-kem-512/m4fspeed/macros.i new file mode 120000 index 0000000..a7d8e74 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/macros.i @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/macros.i \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/matacc.c b/crypto_kem/ml-kem-512/m4fspeed/matacc.c new file mode 120000 index 0000000..71d7234 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/matacc.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/matacc.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/matacc.h b/crypto_kem/ml-kem-512/m4fspeed/matacc.h new file mode 120000 index 0000000..19b6772 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/matacc.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/matacc.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/matacc.i b/crypto_kem/ml-kem-512/m4fspeed/matacc.i new file mode 120000 index 0000000..39b6e23 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/matacc.i @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/matacc.i \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S b/crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S new file mode 120000 index 0000000..3c7d05e --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/matacc_asm.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/ntt.c b/crypto_kem/ml-kem-512/m4fspeed/ntt.c new file mode 120000 index 0000000..971c6b0 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/ntt.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/ntt.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/ntt.h b/crypto_kem/ml-kem-512/m4fspeed/ntt.h new file mode 120000 index 0000000..11e111d --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/ntt.h @@ -0,0 +1 @@ 
+../../ml-kem-768/m4fspeed/ntt.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/params.h b/crypto_kem/ml-kem-512/m4fspeed/params.h new file mode 100644 index 0000000..be9ec45 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/params.h @@ -0,0 +1,32 @@ +#ifndef PARAMS_H +#define PARAMS_H + +#define KYBER_K 2 /* Change this for different security strengths */ + +/* Don't change parameters below this line */ + +#define KYBER_N 256 +#define KYBER_Q 3329 + +#define KYBER_ETA1 3 +#define KYBER_ETA2 2 + +#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ +#define KYBER_SSBYTES 32 /* size in bytes of shared key */ + +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) + +#define KYBER_POLYCOMPRESSEDBYTES 128 +#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) + +#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) +#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) + +#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES + +#endif diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly.c b/crypto_kem/ml-kem-512/m4fspeed/poly.c new file mode 100644 index 0000000..401b26b --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/poly.c @@ -0,0 +1,672 @@ +#include "poly.h" + +#include "cbd.h" +#include "ntt.h" +#include "params.h" +#include "symmetric.h" + +#include + + +/************************************************* +* Name: poly_compress +* +* Description: Serialization of a polynomial and subsequent compression of a polynomial; +* +* Arguments: - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES) +* - const 
poly *a: pointer to input polynomial to be serialized +*************************************************/ +void poly_compress(unsigned char *r, const poly *a) +{ + unsigned int i,j; + int16_t u; + uint32_t d0; + uint8_t t[8]; + +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */ + d0 = u << 4; + d0 += 1665; + d0 *= 80635; + d0 >>= 28; + t[j] = d0 & 0xf; + } + + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */ + d0 = u << 5; + d0 += 1664; + d0 *= 40318; + d0 >>= 27; + t[j] = d0 & 0x1f; + } + + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif +} + +/************************************************* +* Name: poly_decompress +* +* Description: De-serialization and subsequent decompression of a polynomial; +* approximate inverse of poly_compress +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +**************************************************/ +void poly_decompress(poly *r, const unsigned char *a) +{ + int i; +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4; + 
r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4; + a += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[i+0] = (((a[0] & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+7] = (((a[4] >> 3) * KYBER_Q) + 16) >> 5; + a += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}" +#endif +} + +/************************************************* +* Name: poly_packcompress +* +* Description: Serialization and subsequent compression of a polynomial of a polyvec, +* writes to a byte string representation of the whole polyvec. +* Used to compress a polyvec one poly at a time in a loop. 
+* +* Arguments: - unsigned char *r: pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial +* - int i: index of the polynomial to be serialized within the serialized polyvec +**************************************************/ +void poly_packcompress(unsigned char *r, poly *a, int i) { + int j, k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + + for(j=0;jcoeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + + r[352*i+11*j+ 0] = t[0] & 0xff; + r[352*i+11*j+ 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3); + r[352*i+11*j+ 2] = (t[1] >> 5) | ((t[2] & 0x03) << 6); + r[352*i+11*j+ 3] = (t[2] >> 2) & 0xff; + r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1); + r[352*i+11*j+ 5] = (t[3] >> 7) | ((t[4] & 0x0f) << 4); + r[352*i+11*j+ 6] = (t[4] >> 4) | ((t[5] & 0x01) << 7); + r[352*i+11*j+ 7] = (t[5] >> 1) & 0xff; + r[352*i+11*j+ 8] = (t[5] >> 9) | ((t[6] & 0x3f) << 2); + r[352*i+11*j+ 9] = (t[6] >> 6) | ((t[7] & 0x07) << 5); + r[352*i+11*j+10] = (t[7] >> 3); + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + + for (j = 0; j < KYBER_N / 4; j++) { + for(k=0;k<4;k++) { + t[k] = a->coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + /* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = d0 & 0x3ff; + } + r[320*i+5*j+0] = t[0] & 0xff; + r[320*i+5*j+1] = (t[0] >> 8) | ((t[1] & 0x3f) << 2); + r[320*i+5*j+2] = ((t[1] >> 6) | ((t[2] & 0x0f) << 4)) & 0xff; + r[320*i+5*j+3] = ((t[2] >> 4) | ((t[3] & 0x03) << 6)) & 0xff; + r[320*i+5*j+4] = (t[3] >> 2) & 0xff; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})" +#endif +} + 
+/************************************************* +* Name: poly_unpackdecompress +* +* Description: Deserialization and subsequent decompression of a polynomial of a polyvec, +* Used to decompress a polyvec one poly at a time in a loop. +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - int i: index of poly in polyvec to decompress +**************************************************/ +void poly_unpackdecompress(poly *r, const unsigned char *a, int i) { + int j; +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + for(j=0;jcoeffs[8*j+0] = (((a[352*i+11*j+ 0] | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11; + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + for(j=0;jcoeffs[4*j+0] = (((a[320*i+5*j+ 0] | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 
10; + r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif +} + + +/************************************************* +* Name: cmp_poly_compress +* +* Description: Compresses and serializes a polynomial and compares the result with an already serialized polynomial +* +* Arguments: - const unsigned char *r: pointer to serialized polynomial to compare with +* - poly *a: pointer to input polynomial to serialize and compare +* Returns: 0 if the serialized polynomials are equal, non-zero otherwise +**************************************************/ +int cmp_poly_compress(const unsigned char *r, poly *a) { + unsigned char rc = 0; + int16_t u; + uint32_t d0; + uint8_t t[8]; + int i, j, k = 0; + +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */ + d0 = u << 4; + d0 += 1665; + d0 *= 80635; + d0 >>= 28; + t[j] = d0 & 0xf; + } + rc |= r[k] ^ (t[0] | (t[1] << 4)); + rc |= r[k + 1] ^ (t[2] | (t[3] << 4)); + rc |= r[k + 2] ^ (t[4] | (t[5] << 4)); + rc |= r[k + 3] ^ (t[6] | (t[7] << 4)); + k += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */ + d0 = u << 5; + d0 += 1664; + d0 *= 40318; + d0 >>= 27; + t[j] = d0 & 0x1f; + } + + + rc |= r[k] ^ (t[0] | (t[1] << 5)); + rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4)); + rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3)); + k += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif + return rc; +} + 
+/************************************************* +* Name: cmp_poly_packcompress +* +* Description: Compresses and serializes one poly of a polyvec and compares the result with a serialized polyvec +* Should be called in a loop over all polys of a polyvec. +* +* Arguments: - const unsigned char *r: pointer to serialized polyvec to compare with +* - poly *a: pointer to input polynomial of polyvec to serialize and compare +* - int i: index of poly in polyvec to compare with +* Returns: 0 if the poly matches its part of the serialized polyvec, non-zero otherwise +**************************************************/ +int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) { + unsigned char rc = 0; + int j, k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for(j=0;jcoeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff); + rc |= r[352*i+11*j+ 1] ^ ((t[0] >> 8) | ((t[1] & 0x1f) << 3)); + rc |= r[352*i+11*j+ 2] ^ ((t[1] >> 5) | ((t[2] & 0x03) << 6)); + rc |= r[352*i+11*j+ 3] ^ ((t[2] >> 2) & 0xff); + rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1)); + rc |= r[352*i+11*j+ 5] ^ ((t[3] >> 7) | ((t[4] & 0x0f) << 4)); + rc |= r[352*i+11*j+ 6] ^ ((t[4] >> 4) | ((t[5] & 0x01) << 7)); + rc |= r[352*i+11*j+ 7] ^ ((t[5] >> 1) & 0xff); + rc |= r[352*i+11*j+ 8] ^ ((t[5] >> 9) | ((t[6] & 0x3f) << 2)); + rc |= r[352*i+11*j+ 9] ^ ((t[6] >> 6) | ((t[7] & 0x07) << 5)); + rc |= r[352*i+11*j+10] ^ ((t[7] >> 3)); + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + for (j = 0; j < KYBER_N / 4; j++) { + for(k=0;k<4;k++) { + t[k] = a->coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + /* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = 
d0 & 0x3ff; + } + + rc |= r[320*i+5*j+0] ^ (t[0] & 0xff); + rc |= r[320*i+5*j+1] ^ ((t[0] >> 8) | ((t[1] & 0x3f) << 2)); + rc |= r[320*i+5*j+2] ^ (((t[1] >> 6) | ((t[2] & 0x0f) << 4)) & 0xff); + rc |= r[320*i+5*j+3] ^ (((t[2] >> 4) | ((t[3] & 0x03) << 6)) & 0xff); + rc |= r[320*i+5*j+4] ^ ((t[3] >> 2) & 0xff); + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif + return rc; +} + +/************************************************* +* Name: poly_tobytes +* +* Description: Serialization of a polynomial +* +* Arguments: - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial (reduced in place by poly_reduce before serialization) +**************************************************/ +void poly_tobytes(unsigned char *r, poly *a) { + int i; + uint16_t t0, t1; + + poly_reduce(a); + + for (i = 0; i < KYBER_N / 2; i++) { + t0 = a->coeffs[2 * i]; + t1 = a->coeffs[2 * i + 1]; + r[3 * i] = t0 & 0xff; + r[3 * i + 1] = (t0 >> 8) | ((t1 & 0xf) << 4); + r[3 * i + 2] = (t1 >> 4) & 0xff; + } +} + +/************************************************* +* Name: poly_frombytes +* +* Description: De-serialization of a polynomial; +* inverse of poly_tobytes +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +void poly_frombytes(poly *r, const unsigned char *a) { + int i; + + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8; + r->coeffs[2 * i + 1] = a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4; + } +} + +/************************************************* +* Name: poly_frombytes_mul_16_32 +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* Using strategy of better accumulation. 
+* Arguments: - const poly *b: pointer to input polynomial +* - int32_t *r_tmp: array for accumulating unreduced results +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm_16_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void poly_frombytes_mul_16_32(int32_t *r_tmp, const poly *b, const unsigned char *a) { + frombytes_mul_asm_16_32(r_tmp, b->coeffs, a, zetas); +} + +/************************************************* +* Name: poly_frombytes_mul_32_32 +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* Using strategy of better accumulation. +* Arguments: - const poly *b: pointer to input polynomial +* - int32_t *r_tmp: array for accumulating unreduced results +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm_acc_32_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void poly_frombytes_mul_32_32(int32_t *r_tmp, const poly *b, const unsigned char *a) { + frombytes_mul_asm_acc_32_32(r_tmp, b->coeffs, a, zetas); +} + +/************************************************* +* Name: poly_frombytes_mul_32_16 +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* Using strategy of better accumulation. 
+* Arguments: - poly *r: pointer to output polynomial +* - const poly *b: pointer to input polynomial +* - const int32_t *r_tmp: array containing unreduced results +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm_acc_32_16(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64], const int32_t *r_tmp); +void poly_frombytes_mul_32_16(poly *r, const poly* b, const unsigned char *a, const int32_t *r_tmp) { + frombytes_mul_asm_acc_32_16(r->coeffs, b->coeffs, a, zetas, r_tmp); +} + +/************************************************* +* Name: poly_getnoise_eta1 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA1 +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* - unsigned char nonce: one-byte input nonce +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add) { + unsigned char buf[KYBER_ETA1 * KYBER_N / 4]; + + prf(buf, KYBER_ETA1 * KYBER_N / 4, seed, nonce); + cbd_eta1(r, buf, add); +} + +/************************************************* +* Name: poly_getnoise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* - unsigned char nonce: one-byte input nonce +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void 
poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add) { + unsigned char buf[KYBER_ETA2 * KYBER_N / 4]; + + prf(buf, KYBER_ETA2 * KYBER_N / 4, seed, nonce); + cbd_eta2(r, buf, add); +} + +/************************************************* +* Name: poly_basemul_opt_16_32 +* +* Description: Multiplication of two polynomials using asymmetric multiplication. +* Cached values are generated during matrix-vector product. +* Using strategy of better accumulation (initial step). +* Arguments: - const poly *a: pointer to input polynomial +* - const poly *b: pointer to input polynomial +* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - int32_t *r_tmp: array for accumulating unreduced results +**************************************************/ +extern void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *); +void poly_basemul_opt_16_32(int32_t *r_tmp, const poly *a, const poly *b, const poly *a_prime) { + basemul_asm_opt_16_32(r_tmp, a->coeffs, b->coeffs, a_prime->coeffs); +} + +/************************************************* +* Name: poly_basemul_acc_opt_32_32 +* +* Description: Multiplication of two polynomials using asymmetric multiplication. +* Cached values are generated during matrix-vector product. +* Using strategy of better accumulation. 
+* Arguments: - const poly *a: pointer to input polynomial +* - const poly *b: pointer to input polynomial +* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - int32_t *r_tmp: array for accumulating unreduced results +**************************************************/ +extern void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *); +void poly_basemul_acc_opt_32_32(int32_t *r, const poly *a, const poly *b, const poly *a_prime) { + basemul_asm_acc_opt_32_32(r, a->coeffs, b->coeffs, a_prime->coeffs); +} + +/************************************************* +* Name: poly_basemul_acc_opt_32_16 +* +* Description: Multiplication of two polynomials using asymmetric multiplication. +* Cached values are generated during matrix-vector product. +* Using strategy of better accumulation (final step). +* Arguments: - const poly *a: pointer to input polynomial +* - const poly *b: pointer to input polynomial +* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - poly *r: pointer to output polynomial +* - const int32_t *r_tmp: array containing unreduced results +**************************************************/ +extern void basemul_asm_acc_opt_32_16(int16_t *, const int16_t *, const int16_t *, const int16_t *, const int32_t *); +void poly_basemul_acc_opt_32_16(poly *r, const poly *a, const poly *b, const poly *a_prime, const int32_t * r_tmp) { + basemul_asm_acc_opt_32_16(r->coeffs, a->coeffs, b->coeffs, a_prime->coeffs, r_tmp); +} + +/************************************************* +* Name: poly_ntt +* +* Description: Computes negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in normal order, output in bitreversed order +* +* Arguments: - uint16_t *r: pointer to in/output polynomial +**************************************************/ +void poly_ntt(poly *r) { + ntt(r->coeffs); +} + +/************************************************* +* Name: poly_invntt +* +* 
Description: Computes inverse of negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in bitreversed order, output in normal order +* +* Arguments: - uint16_t *a: pointer to in/output polynomial +**************************************************/ +void poly_invntt(poly *r) { + invntt(r->coeffs); +} + +extern void asm_fromplant(int16_t *r); +/************************************************* +* Name: poly_fromplantt +* +* Description: Inplace conversion of all coefficients of a polynomial +* from Montgomery domain to normal domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_fromplant(poly *r) { + asm_fromplant(r->coeffs); +} + +extern void asm_barrett_reduce(int16_t *r); +/************************************************* +* Name: poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_reduce(poly *r) { + asm_barrett_reduce(r->coeffs); +} + +extern void pointwise_add(int16_t *, const int16_t *, const int16_t *); +/************************************************* +* Name: poly_add +* +* Description: Add two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_add(poly *r, const poly *a, const poly *b) { + pointwise_add(r->coeffs,a->coeffs,b->coeffs); +} + + +extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *); +/************************************************* +* Name: poly_sub +* +* Description: Subtract two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to 
first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_sub(poly *r, const poly *a, const poly *b) { + pointwise_sub(r->coeffs,a->coeffs,b->coeffs); +} + +void cmov_int16(int16_t *r, int16_t v, uint16_t b); + +/************************************************* +* Name: poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *msg: pointer to input message +**************************************************/ +void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) +{ + unsigned int i,j; + +#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) +#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" +#endif + + for(i=0;icoeffs[8*i+j] = 0; + cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1); + } + } +} + +/************************************************* +* Name: poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - unsigned char *msg: pointer to output message +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) { + uint32_t t; + int i, j; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = a->coeffs[8*i+j]; + t <<= 1; + t += 1665; + t *= 80635; + t >>= 28; + t &= 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: poly_zeroize +* +* Description: Zeros a polynomial +* +* Arguments: - poly *p: pointer to polynomial +**************************************************/ +void poly_zeroize(poly *p) { + int i; + for(i = 0; i < KYBER_N; i++) + p->coeffs[i] = 0; +} diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly.h b/crypto_kem/ml-kem-512/m4fspeed/poly.h new file mode 100644 index 0000000..4994d87 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/poly.h 
@@ -0,0 +1,56 @@ +#ifndef POLY_H +#define POLY_H + +#include "params.h" + +#include <stdint.h> + +#define poly_getnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 0) +#define poly_getnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 0) +#define poly_addnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 1) +#define poly_addnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 1) + +/* + * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial + * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1] + */ +typedef struct { + int16_t coeffs[KYBER_N]; +} poly; + +void poly_compress(unsigned char *r, const poly *a); +void poly_decompress(poly *r, const unsigned char *a); + +void poly_packcompress(unsigned char *r, poly *a, int i); +void poly_unpackdecompress(poly *r, const unsigned char *a, int i); + +int cmp_poly_compress(const unsigned char *r, poly *a); +int cmp_poly_packcompress(const unsigned char *r, poly *a, int i); + +void poly_tobytes(unsigned char *r, poly *a); +void poly_frombytes(poly *r, const unsigned char *a); +void poly_frombytes_mul_16_32(int32_t *r_tmp, const poly *b, const unsigned char *a); +void poly_frombytes_mul_32_32(int32_t *r_tmp, const poly *b, const unsigned char *a); +void poly_frombytes_mul_32_16(poly *r, const poly* b, const unsigned char *a, const int32_t *r_tmp); + +void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]); +void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a); + +void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add); +void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add); + +void poly_ntt(poly *r); +void poly_invntt(poly *r); +void poly_basemul_opt_16_32(int32_t *r, const poly *a, const poly *b, const poly *a_prime); +void poly_basemul_acc_opt_32_32(int32_t *r, const poly *a, const poly *b, const poly *a_prime); +void poly_basemul_acc_opt_32_16(poly *r, const poly *a, const poly *b, const poly
*a_prime, const int32_t * r_tmp); +void poly_fromplant(poly *r); + +void poly_reduce(poly *r); + +void poly_add(poly *r, const poly *a, const poly *b); +void poly_sub(poly *r, const poly *a, const poly *b); + +void poly_zeroize(poly *p); + +#endif diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly_asm.S b/crypto_kem/ml-kem-512/m4fspeed/poly_asm.S new file mode 120000 index 0000000..c4bda05 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/poly_asm.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/poly_asm.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/polyvec.c b/crypto_kem/ml-kem-512/m4fspeed/polyvec.c new file mode 120000 index 0000000..c3f7d0a --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/polyvec.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/polyvec.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/polyvec.h b/crypto_kem/ml-kem-512/m4fspeed/polyvec.h new file mode 120000 index 0000000..47cf6c3 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/polyvec.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/polyvec.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/reduce.S b/crypto_kem/ml-kem-512/m4fspeed/reduce.S new file mode 120000 index 0000000..2edf10c --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/reduce.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/reduce.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202. b/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202. new file mode 120000 index 0000000..e49ba06 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202. @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/symmetric-fips202.
\ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.c b/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.c new file mode 120000 index 0000000..5adc9ae --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/symmetric-fips202.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/symmetric.h b/crypto_kem/ml-kem-512/m4fspeed/symmetric.h new file mode 120000 index 0000000..698a10d --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/symmetric.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/symmetric.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/verify.c b/crypto_kem/ml-kem-512/m4fspeed/verify.c new file mode 120000 index 0000000..85d7f50 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/verify.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/verify.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fspeed/verify.h b/crypto_kem/ml-kem-512/m4fspeed/verify.h new file mode 120000 index 0000000..e19a301 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fspeed/verify.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/verify.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/api.h b/crypto_kem/ml-kem-512/m4fstack/api.h new file mode 120000 index 0000000..cf75db9 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/api.h @@ -0,0 +1 @@ +../m4fspeed/api.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/cbd.c b/crypto_kem/ml-kem-512/m4fstack/cbd.c new file mode 120000 index 0000000..903fa59 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/cbd.c @@ -0,0 +1 @@ +../m4fspeed/cbd.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/cbd.h b/crypto_kem/ml-kem-512/m4fstack/cbd.h new file mode 120000 index 0000000..d264c36 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/cbd.h @@ -0,0 +1 @@ +../m4fspeed/cbd.h \ No newline at end of file diff --git 
a/crypto_kem/ml-kem-512/m4fstack/cmov_int16.S b/crypto_kem/ml-kem-512/m4fstack/cmov_int16.S new file mode 120000 index 0000000..bdef6f4 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/cmov_int16.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fspeed/cmov_int16.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/fastaddsub.S b/crypto_kem/ml-kem-512/m4fstack/fastaddsub.S new file mode 120000 index 0000000..d1317f7 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/fastaddsub.S @@ -0,0 +1 @@ +../m4fspeed/fastaddsub.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/fastbasemul.S b/crypto_kem/ml-kem-512/m4fstack/fastbasemul.S new file mode 120000 index 0000000..531385d --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/fastbasemul.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/fastbasemul.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/fastinvntt.S b/crypto_kem/ml-kem-512/m4fstack/fastinvntt.S new file mode 120000 index 0000000..1ad2d31 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/fastinvntt.S @@ -0,0 +1 @@ +../m4fspeed/fastinvntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/fastntt.S b/crypto_kem/ml-kem-512/m4fstack/fastntt.S new file mode 120000 index 0000000..208c11d --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/fastntt.S @@ -0,0 +1 @@ +../m4fspeed/fastntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/indcpa.c b/crypto_kem/ml-kem-512/m4fstack/indcpa.c new file mode 100644 index 0000000..94d6a57 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/indcpa.c @@ -0,0 +1,211 @@ +#include "indcpa.h" +#include "ntt.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "symmetric.h" +#include "matacc.h" + +#include +#include + +/************************************************* +* Name: indcpa_keypair +* +* Description: Generates public and private key for the CPA-secure +* public-key encryption scheme underlying Kyber +* +* 
Arguments: - unsigned char *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +**************************************************/ +void indcpa_keypair_derand(unsigned char *pk, + unsigned char *sk, + const unsigned char *coins){ + polyvec skpv; + poly pkp; + unsigned char buf[2 * KYBER_SYMBYTES]; + unsigned char *publicseed = buf; + unsigned char *noiseseed = buf + KYBER_SYMBYTES; + int i; + unsigned char nonce = 0; + + memcpy(buf, coins, KYBER_SYMBYTES); + buf[KYBER_SYMBYTES] = KYBER_K; + hash_g(buf, buf, KYBER_SYMBYTES + 1); + + for (i = 0; i < KYBER_K; i++) + poly_getnoise_eta1(skpv.vec + i, noiseseed, nonce++); + + polyvec_ntt(&skpv); + + for (i = 0; i < KYBER_K; i++) { + matacc(&pkp, &skpv, i, publicseed, 0); + poly_invntt(&pkp); + + poly_addnoise_eta1(&pkp, noiseseed, nonce++); + poly_ntt(&pkp); + + poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp); + } + + polyvec_tobytes(sk, &skpv); + memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key +} + +/************************************************* +* Name: indcpa_enc +* +* Description: Encryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
+* +* Arguments: - unsigned char *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) +* - const unsigned char *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) +* - const unsigned char *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) +* to deterministically generate all randomness +**************************************************/ +void indcpa_enc(unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins) { + polyvec sp; + poly bp; + poly *pkp = &bp; + poly *k = &bp; + poly *v = &sp.vec[0]; + const unsigned char *seed = pk+KYBER_POLYVECBYTES; + int i; + unsigned char nonce = 0; + + for (i = 0; i < KYBER_K; i++) + poly_getnoise_eta1(sp.vec + i, coins, nonce++); + + polyvec_ntt(&sp); + + for (i = 0; i < KYBER_K; i++) { + matacc(&bp, &sp, i, seed, 1); + poly_invntt(&bp); + + poly_addnoise_eta2(&bp, coins, nonce++); + poly_reduce(&bp); + + poly_packcompress(c, &bp, i); + } + + poly_frombytes(pkp, pk); + poly_basemul(v, pkp, &sp.vec[0]); + for (i = 1; i < KYBER_K; i++) { + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc(v, pkp, &sp.vec[i]); + } + + poly_invntt(v); + + poly_addnoise_eta2(v, coins, nonce++); + + poly_frommsg(k, m); + poly_add(v, v, k); + poly_reduce(v); + + poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); +} + +/************************************************* +* Name: indcpa_enc_cmp +* +* Description: Re-encryption function. +* Compares the re-encypted ciphertext with the original ciphertext byte per byte. +* The comparison is performed in a constant time manner. 
+* +* +* Arguments: - unsigned char *ct: pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes) +* - const unsigned char *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) +* - const unsigned char *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) +* to deterministically generate all randomness +* Returns: - boolean byte indicating that re-encrypted ciphertext is NOT equal to the original ciphertext +**************************************************/ +unsigned char indcpa_enc_cmp(const unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins) { + uint64_t rc = 0; + polyvec sp; + poly bp; + poly *pkp = &bp; + poly *k = &bp; + poly *v = &sp.vec[0]; + const unsigned char *seed = pk+KYBER_POLYVECBYTES; + int i; + unsigned char nonce = 0; + + for (i = 0; i < KYBER_K; i++) + poly_getnoise_eta1(sp.vec + i, coins, nonce++); + + polyvec_ntt(&sp); + + for (i = 0; i < KYBER_K; i++) { + matacc(&bp, &sp, i, seed, 1); + poly_invntt(&bp); + + poly_addnoise_eta2(&bp, coins, nonce++); + poly_reduce(&bp); + + rc |= cmp_poly_packcompress(c, &bp, i); + } + + poly_frombytes(pkp, pk); + poly_basemul(v, pkp, &sp.vec[0]); + for (i = 1; i < KYBER_K; i++) { + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc(v, pkp, &sp.vec[i]); + } + + poly_invntt(v); + + poly_addnoise_eta2(v, coins, nonce++); + poly_frommsg(k, m); + poly_add(v, v, k); + poly_reduce(v); + + rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); + + rc = ~rc + 1; + rc >>= 63; + return (unsigned char)rc; +} + +/************************************************* +* Name: indcpa_dec +* +* Description: Decryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
+* +* Arguments: - unsigned char *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) +* - const unsigned char *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) +* - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +**************************************************/ +void __attribute__ ((noinline)) indcpa_dec(unsigned char *m, + const unsigned char *c, + const unsigned char *sk) { + poly mp, bp; + poly *v = &bp; + int i; + + poly_unpackdecompress(&mp, c, 0); + poly_ntt(&mp); + + poly_frombytes_mul(&mp, &mp, sk); + for(i = 1; i < KYBER_K; i++) { + poly_unpackdecompress(&bp, c, i); + poly_ntt(&bp); + poly_frombytes_mul_acc(&mp, &bp, sk + i*KYBER_POLYBYTES); + } + + poly_invntt(&mp); + poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES); + poly_sub(&mp, v, &mp); + poly_reduce(&mp); + + poly_tomsg(m, &mp); +} diff --git a/crypto_kem/ml-kem-512/m4fstack/indcpa.h b/crypto_kem/ml-kem-512/m4fstack/indcpa.h new file mode 120000 index 0000000..5893b12 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/indcpa.h @@ -0,0 +1 @@ +../m4fspeed/indcpa.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/kem.c b/crypto_kem/ml-kem-512/m4fstack/kem.c new file mode 120000 index 0000000..302153d --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/kem.c @@ -0,0 +1 @@ +../m4fspeed/kem.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/macros.i b/crypto_kem/ml-kem-512/m4fstack/macros.i new file mode 120000 index 0000000..6e83891 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/macros.i @@ -0,0 +1 @@ +../m4fspeed/macros.i \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/matacc.c b/crypto_kem/ml-kem-512/m4fstack/matacc.c new file mode 120000 index 0000000..5558ec8 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/matacc.c @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/matacc.c \ No newline at end of file diff --git 
a/crypto_kem/ml-kem-512/m4fstack/matacc.h b/crypto_kem/ml-kem-512/m4fstack/matacc.h new file mode 120000 index 0000000..4eb7706 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/matacc.h @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/matacc.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/matacc.i b/crypto_kem/ml-kem-512/m4fstack/matacc.i new file mode 120000 index 0000000..0d39b07 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/matacc.i @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/matacc.i \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/matacc_asm.S b/crypto_kem/ml-kem-512/m4fstack/matacc_asm.S new file mode 120000 index 0000000..0079bb5 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/matacc_asm.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/matacc_asm.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/ntt.c b/crypto_kem/ml-kem-512/m4fstack/ntt.c new file mode 120000 index 0000000..c9d6e8a --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/ntt.c @@ -0,0 +1 @@ +../m4fspeed/ntt.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/ntt.h b/crypto_kem/ml-kem-512/m4fstack/ntt.h new file mode 120000 index 0000000..5fd83c0 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/ntt.h @@ -0,0 +1 @@ +../m4fspeed/ntt.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/params.h b/crypto_kem/ml-kem-512/m4fstack/params.h new file mode 120000 index 0000000..59dd7f1 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/params.h @@ -0,0 +1 @@ +../m4fspeed/params.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/poly.c b/crypto_kem/ml-kem-512/m4fstack/poly.c new file mode 100644 index 0000000..443fdba --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/poly.c @@ -0,0 +1,637 @@ +#include "poly.h" + +#include "cbd.h" +#include "ntt.h" +#include "params.h" +#include "symmetric.h" + +#include + + +/************************************************* +* Name: 
poly_compress +* +* Description: Serialization of a polynomial and subsequent compression of a polynomial; +* +* Arguments: - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial to be serialized +*************************************************/ +void poly_compress(unsigned char *r, const poly *a) +{ + unsigned int i,j; + int16_t u; + uint32_t d0; + uint8_t t[8]; + +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */ + d0 = u << 4; + d0 += 1665; + d0 *= 80635; + d0 >>= 28; + t[j] = d0 & 0xf; + } + + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */ + d0 = u << 5; + d0 += 1664; + d0 *= 40318; + d0 >>= 27; + t[j] = d0 & 0x1f; + } + + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif +} + +/************************************************* +* Name: poly_decompress +* +* Description: De-serialization and subsequent decompression of a polynomial; +* approximate inverse of poly_compress +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +**************************************************/ +void poly_decompress(poly *r, const unsigned char *a) +{ + int i; +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4; + 
r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4; + a += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[i+0] = (((a[0] & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+7] = (((a[4] >> 3) * KYBER_Q) + 16) >> 5; + a += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}" +#endif +} + +/************************************************* +* Name: poly_packcompress +* +* Description: Serialization and subsequent compression of a polynomial of a polyvec, +* writes to a byte string representation of the whole polyvec. +* Used to compress a polyvec one poly at a time in a loop. 
+* +* Arguments: - unsigned char *r: pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial +* - int i: index of to be serialized polynomial in serialized polyec +**************************************************/ +void poly_packcompress(unsigned char *r, poly *a, int i) { + int j, k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + + for(j=0;jcoeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + + r[352*i+11*j+ 0] = t[0] & 0xff; + r[352*i+11*j+ 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3); + r[352*i+11*j+ 2] = (t[1] >> 5) | ((t[2] & 0x03) << 6); + r[352*i+11*j+ 3] = (t[2] >> 2) & 0xff; + r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1); + r[352*i+11*j+ 5] = (t[3] >> 7) | ((t[4] & 0x0f) << 4); + r[352*i+11*j+ 6] = (t[4] >> 4) | ((t[5] & 0x01) << 7); + r[352*i+11*j+ 7] = (t[5] >> 1) & 0xff; + r[352*i+11*j+ 8] = (t[5] >> 9) | ((t[6] & 0x3f) << 2); + r[352*i+11*j+ 9] = (t[6] >> 6) | ((t[7] & 0x07) << 5); + r[352*i+11*j+10] = (t[7] >> 3); + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + + for (j = 0; j < KYBER_N / 4; j++) { + for(k=0;k<4;k++) { + t[k] = a->coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + /* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = d0 & 0x3ff; + } + r[320*i+5*j+0] = t[0] & 0xff; + r[320*i+5*j+1] = (t[0] >> 8) | ((t[1] & 0x3f) << 2); + r[320*i+5*j+2] = ((t[1] >> 6) | ((t[2] & 0x0f) << 4)) & 0xff; + r[320*i+5*j+3] = ((t[2] >> 4) | ((t[3] & 0x03) << 6)) & 0xff; + r[320*i+5*j+4] = (t[3] >> 2) & 0xff; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})" +#endif +} + 
+/************************************************* +* Name: poly_unpackdecompress +* +* Description: Deserialization and subsequent decompression of a polynomial of a polyvec, +* Used to uncompress a polyvec one poly at a time in a loop. +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - int i: index of poly in polyvec to decompress +**************************************************/ +void poly_unpackdecompress(poly *r, const unsigned char *a, int i) { + int j; +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + for(j=0;j<KYBER_N/8;j++) { + r->coeffs[8*j+0] = (((a[352*i+11*j+ 0] | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11; + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + for(j=0;j<KYBER_N/4;j++) { + r->coeffs[4*j+0] = (((a[320*i+5*j+ 0] | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >>
10; + r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif +} + + +/************************************************* +* Name: cmp_poly_compress +* +* Description: Serializes and consequently compares polynomial to a serialized polynomial +* +* Arguments: - const unsigned char *r: pointer to serialized polynomial to compare with +* - poly *a: pointer to input polynomial to serialize and compare +* Returns: 0 if the serialized polynomials are equal, non-zero otherwise (OR of the byte-wise XOR differences) +**************************************************/ +int cmp_poly_compress(const unsigned char *r, poly *a) { + unsigned char rc = 0; + int16_t u; + uint32_t d0; + uint8_t t[8]; + int i, j, k = 0; + +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;i<KYBER_N/8;i++) { + for(j=0;j<8;j++) { + u = a->coeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */ + d0 = u << 4; + d0 += 1665; + d0 *= 80635; + d0 >>= 28; + t[j] = d0 & 0xf; + } + rc |= r[k] ^ (t[0] | (t[1] << 4)); + rc |= r[k + 1] ^ (t[2] | (t[3] << 4)); + rc |= r[k + 2] ^ (t[4] | (t[5] << 4)); + rc |= r[k + 3] ^ (t[6] | (t[7] << 4)); + k += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;i<KYBER_N/8;i++) { + for(j=0;j<8;j++) { + u = a->coeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */ + d0 = u << 5; + d0 += 1664; + d0 *= 40318; + d0 >>= 27; + t[j] = d0 & 0x1f; + } + + + rc |= r[k] ^ (t[0] | (t[1] << 5)); + rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4)); + rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3)); + k += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif + return rc; +} +
+/************************************************* +* Name: cmp_poly_packcompress +* +* Description: Serializes and consequently compares poly of polyvec to a serialized polyvec +* Should be called in a loop over all poly's of a polyvec. +* +* Arguments: - const unsigned char *r: pointer to serialized polyvec to compare with +* - poly *a: pointer to input polynomial of polyvec to serialize and compare +* - int i: index of poly in polyvec to compare with +* Returns: boolean indicating whether the polyvecs are equal +**************************************************/ +int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) { + unsigned char rc = 0; + int j, k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for(j=0;jcoeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff); + rc |= r[352*i+11*j+ 1] ^ ((t[0] >> 8) | ((t[1] & 0x1f) << 3)); + rc |= r[352*i+11*j+ 2] ^ ((t[1] >> 5) | ((t[2] & 0x03) << 6)); + rc |= r[352*i+11*j+ 3] ^ ((t[2] >> 2) & 0xff); + rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1)); + rc |= r[352*i+11*j+ 5] ^ ((t[3] >> 7) | ((t[4] & 0x0f) << 4)); + rc |= r[352*i+11*j+ 6] ^ ((t[4] >> 4) | ((t[5] & 0x01) << 7)); + rc |= r[352*i+11*j+ 7] ^ ((t[5] >> 1) & 0xff); + rc |= r[352*i+11*j+ 8] ^ ((t[5] >> 9) | ((t[6] & 0x3f) << 2)); + rc |= r[352*i+11*j+ 9] ^ ((t[6] >> 6) | ((t[7] & 0x07) << 5)); + rc |= r[352*i+11*j+10] ^ ((t[7] >> 3)); + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + for (j = 0; j < KYBER_N / 4; j++) { + for(k=0;k<4;k++) { + t[k] = a->coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + /* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = 
d0 & 0x3ff; + } + + rc |= r[320*i+5*j+0] ^ (t[0] & 0xff); + rc |= r[320*i+5*j+1] ^ ((t[0] >> 8) | ((t[1] & 0x3f) << 2)); + rc |= r[320*i+5*j+2] ^ (((t[1] >> 6) | ((t[2] & 0x0f) << 4)) & 0xff); + rc |= r[320*i+5*j+3] ^ (((t[2] >> 4) | ((t[3] & 0x03) << 6)) & 0xff); + rc |= r[320*i+5*j+4] ^ ((t[3] >> 2) & 0xff); + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif + return rc; +} + +/************************************************* +* Name: poly_tobytes +* +* Description: Serialization of a polynomial +* +* Arguments: - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial (reduced in place by poly_reduce before serialization) +**************************************************/ +void poly_tobytes(unsigned char *r, poly *a) { + int i; + uint16_t t0, t1; + + poly_reduce(a); + + for (i = 0; i < KYBER_N / 2; i++) { + t0 = a->coeffs[2 * i]; + t1 = a->coeffs[2 * i + 1]; + r[3 * i] = t0 & 0xff; + r[3 * i + 1] = (t0 >> 8) | ((t1 & 0xf) << 4); + r[3 * i + 2] = (t1 >> 4) & 0xff; + } +} + +/************************************************* +* Name: poly_frombytes +* +* Description: De-serialization of a polynomial; +* inverse of poly_tobytes +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +void poly_frombytes(poly *r, const unsigned char *a) { + int i; + + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8; + r->coeffs[2 * i + 1] = a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4; + } +} + +/************************************************* +* Name: poly_frombytes_mul +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *b: pointer to input polynomial +* - const unsigned char *a:
pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void poly_frombytes_mul(poly *r, const poly *b, const unsigned char *a) { + frombytes_mul_asm(r->coeffs, b->coeffs, a, zetas); +} + +/************************************************* +* Name: poly_frombytes_mul_acc +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* Accumulation in r. +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *b: pointer to input polynomial +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm_acc(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void poly_frombytes_mul_acc(poly *r, const poly *b, const unsigned char *a) { + frombytes_mul_asm_acc(r->coeffs, b->coeffs, a, zetas); +} + +/************************************************* +* Name: poly_noise_eta1 +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA1 +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* - unsigned char nonce: one-byte input nonce +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add) { + unsigned char buf[KYBER_ETA1 * KYBER_N / 4]; + + prf(buf, KYBER_ETA1 * KYBER_N / 4, seed, nonce); + cbd_eta1(r, buf, add); +} + +/************************************************* +* Name: poly_noise_eta2 +* +* Description: Sample a polynomial deterministically from a seed and
a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* - unsigned char nonce: one-byte input nonce +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add) { + unsigned char buf[KYBER_ETA2 * KYBER_N / 4]; + + prf(buf, KYBER_ETA2 * KYBER_N / 4, seed, nonce); + cbd_eta2(r, buf, add); +} + +/************************************************* +* Name: poly_ntt +* +* Description: Computes negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in normal order, output in bitreversed order +* +* Arguments: - poly *r: pointer to in/output polynomial +**************************************************/ +void poly_ntt(poly *r) { + ntt(r->coeffs); +} + +/************************************************* +* Name: poly_invntt +* +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in bitreversed order, output in normal order +* +* Arguments: - poly *r: pointer to in/output polynomial +**************************************************/ +void poly_invntt(poly *r) { + invntt(r->coeffs); +} + +extern void basemul_asm(int16_t *, const int16_t *, const int16_t *, const int32_t *); +/************************************************* +* Name: poly_basemul +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_basemul(poly *r, const poly *a, const poly *b) { +
basemul_asm(r->coeffs, a->coeffs, b->coeffs, zetas); +} + +extern void basemul_asm_acc(int16_t *, const int16_t *, const int16_t *, const int32_t *); +/************************************************* +* Name: poly_basemul_acc +* +* Description: Multiplication of two polynomials in NTT domain, accumulating +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_basemul_acc(poly *r, const poly *a, const poly *b) { + basemul_asm_acc(r->coeffs, a->coeffs, b->coeffs, zetas); +} + +extern void asm_fromplant(int16_t *r); +/************************************************* +* Name: poly_fromplant +* +* Description: Inplace conversion of all coefficients of a polynomial +* from Plantard domain to normal domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_fromplant(poly *r) { + asm_fromplant(r->coeffs); +} + +extern void asm_barrett_reduce(int16_t *r); +/************************************************* +* Name: poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* implemented in assembly as asm_barrett_reduce (see reduce.S) +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_reduce(poly *r) { + asm_barrett_reduce(r->coeffs); +} + +extern void pointwise_add(int16_t *, const int16_t *, const int16_t *); +/************************************************* +* Name: poly_add +* +* Description: Add two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_add(poly *r, const poly *a, const poly *b) { +
pointwise_add(r->coeffs,a->coeffs,b->coeffs); +} + + +extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *); +/************************************************* +* Name: poly_sub +* +* Description: Subtract two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_sub(poly *r, const poly *a, const poly *b) { + pointwise_sub(r->coeffs,a->coeffs,b->coeffs); +} + +void cmov_int16(int16_t *r, int16_t v, uint16_t b); + +/************************************************* +* Name: poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *msg: pointer to input message +**************************************************/ +void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) +{ + unsigned int i,j; + +#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) +#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" 
+#endif + + for(i=0;icoeffs[8*i+j] = 0; + cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1); + } + } +} + +/************************************************* +* Name: poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - unsigned char *msg: pointer to output message +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) { + uint32_t t; + int i, j; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = a->coeffs[8*i+j]; + t <<= 1; + t += 1665; + t *= 80635; + t >>= 28; + t &= 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: poly_zeroize +* +* Description: Zeros a polynomial +* +* Arguments: - poly *p: pointer to polynomial +**************************************************/ +void poly_zeroize(poly *p) { + int i; + for(i = 0; i < KYBER_N; i++) + p->coeffs[i] = 0; +} diff --git a/crypto_kem/ml-kem-512/m4fstack/poly.h b/crypto_kem/ml-kem-512/m4fstack/poly.h new file mode 100644 index 0000000..d62e966 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/poly.h @@ -0,0 +1,54 @@ +#ifndef POLY_H +#define POLY_H + +#include "params.h" + +#include + +#define poly_getnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 0) +#define poly_getnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 0) +#define poly_addnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 1) +#define poly_addnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 1) + +/* + * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial + * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ...
+ X^{n-1}*coeffs[n-1] + */ +typedef struct { + int16_t coeffs[KYBER_N]; +} poly; + +void poly_compress(unsigned char *r, const poly *a); +void poly_decompress(poly *r, const unsigned char *a); + +void poly_packcompress(unsigned char *r, poly *a, int i); +void poly_unpackdecompress(poly *r, const unsigned char *a, int i); + +int cmp_poly_compress(const unsigned char *r, poly *a); +int cmp_poly_packcompress(const unsigned char *r, poly *a, int i); + +void poly_tobytes(unsigned char *r, poly *a); +void poly_frombytes(poly *r, const unsigned char *a); +void poly_frombytes_mul(poly *r, const poly *b, const unsigned char *a); +void poly_frombytes_mul_acc(poly *r, const poly *b, const unsigned char *a); + +void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]); +void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a); + +void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add); +void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add); + +void poly_ntt(poly *r); +void poly_invntt(poly *r); +void poly_basemul(poly *r, const poly *a, const poly *b); +void poly_basemul_acc(poly *r, const poly *a, const poly *b); +void poly_fromplant(poly *r); + +void poly_reduce(poly *r); + +void poly_add(poly *r, const poly *a, const poly *b); +void poly_sub(poly *r, const poly *a, const poly *b); + +void poly_zeroize(poly *p); + +#endif diff --git a/crypto_kem/ml-kem-512/m4fstack/poly_asm.S b/crypto_kem/ml-kem-512/m4fstack/poly_asm.S new file mode 120000 index 0000000..167ee5e --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/poly_asm.S @@ -0,0 +1 @@ +../../ml-kem-768/m4fstack/poly_asm.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/polyvec.c b/crypto_kem/ml-kem-512/m4fstack/polyvec.c new file mode 120000 index 0000000..f398d76 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/polyvec.c @@ -0,0 +1 @@ +../m4fspeed/polyvec.c \ No newline at end of file diff --git 
a/crypto_kem/ml-kem-512/m4fstack/polyvec.h b/crypto_kem/ml-kem-512/m4fstack/polyvec.h new file mode 120000 index 0000000..3113837 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/polyvec.h @@ -0,0 +1 @@ +../m4fspeed/polyvec.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/reduce.S b/crypto_kem/ml-kem-512/m4fstack/reduce.S new file mode 120000 index 0000000..29ae453 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/reduce.S @@ -0,0 +1 @@ +../m4fspeed/reduce.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/symmetric-fips202.c b/crypto_kem/ml-kem-512/m4fstack/symmetric-fips202.c new file mode 120000 index 0000000..fa4ba9a --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/symmetric-fips202.c @@ -0,0 +1 @@ +../m4fspeed/symmetric-fips202.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/symmetric.h b/crypto_kem/ml-kem-512/m4fstack/symmetric.h new file mode 120000 index 0000000..28c6fac --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/symmetric.h @@ -0,0 +1 @@ +../m4fspeed/symmetric.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/verify.c b/crypto_kem/ml-kem-512/m4fstack/verify.c new file mode 120000 index 0000000..a7a9856 --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/verify.c @@ -0,0 +1 @@ +../m4fspeed/verify.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-512/m4fstack/verify.h b/crypto_kem/ml-kem-512/m4fstack/verify.h new file mode 120000 index 0000000..cb2da4b --- /dev/null +++ b/crypto_kem/ml-kem-512/m4fstack/verify.h @@ -0,0 +1 @@ +../m4fspeed/verify.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fspeed/api.h b/crypto_kem/ml-kem-768/m4fspeed/api.h new file mode 100644 index 0000000..bdb694f --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/api.h @@ -0,0 +1,20 @@ +#ifndef API_H +#define API_H + +#include "params.h" + +#define CRYPTO_SECRETKEYBYTES KYBER_SECRETKEYBYTES +#define CRYPTO_PUBLICKEYBYTES KYBER_PUBLICKEYBYTES 
+#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES +#define CRYPTO_BYTES KYBER_SSBYTES + +#define CRYPTO_ALGNAME "Kyber768" + +int crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); + +int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); + + +#endif diff --git a/crypto_kem/ml-kem-768/m4fspeed/cbd.c b/crypto_kem/ml-kem-768/m4fspeed/cbd.c new file mode 100644 index 0000000..c7b5dee --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/cbd.c @@ -0,0 +1,55 @@ +#include "cbd.h" +#include "params.h" + +#include + +/************************************************* +* Name: load32_littleendian +* +* Description: load bytes into a 32-bit integer +* in little-endian order +* +* Arguments: - const unsigned char *x: pointer to input byte array +* +* Returns 32-bit unsigned integer loaded from x +**************************************************/ +static uint32_t load32_littleendian(const unsigned char *x) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + r |= (uint32_t)x[3] << 24; + return r; +} + +/************************************************* +* Name: cbd +* +* Description: Given an array of uniformly random bytes, compute +* polynomial with coefficients distributed according to +* a centered binomial distribution with parameter KYBER_ETA +* specialized for KYBER_ETA=2 +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *buf: pointer to input byte array +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void cbd(poly *r, const unsigned char *buf, int add) { + uint32_t d, t; + int16_t a, b; + int i, j; + + for (i = 0; i < KYBER_N / 8; i++) { + t = load32_littleendian(buf + 4 * i); + d = t & 0x55555555; + d += (t >> 1) & 0x55555555; + + for (j = 0; j < 8; j++) { + a = (d >> 4 * j) & 0x3; + b = (d >> (4 * j + 2)) 
& 0x3; + if (!add) + r->coeffs[8 * i + j] = 0; + r->coeffs[8 * i + j] = r->coeffs[8 * i + j] + (a - b); + } + } +} diff --git a/crypto_kem/ml-kem-768/m4fspeed/cbd.h b/crypto_kem/ml-kem-768/m4fspeed/cbd.h new file mode 100644 index 0000000..4aa5a8a --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/cbd.h @@ -0,0 +1,8 @@ +#ifndef CBD_H +#define CBD_H + +#include "poly.h" + +void cbd(poly *r, const unsigned char *buf, int add); + +#endif diff --git a/crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S b/crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S new file mode 100644 index 0000000..4f7dcc6 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S @@ -0,0 +1,15 @@ +.syntax unified +.cpu cortex-m4 +.thumb + +// void cmov_int16(int16_t *r, int16_t v, uint16_t b) +.global cmov_int16 +.type cmov_int16, %function +.align 2 +cmov_int16: + cmp.w r2, #0 + ldrsh.w r3, [r0] + it ne + movne.w r3, r1 + strh.w r3, [r0] + bx lr \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S b/crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S new file mode 100644 index 0000000..0d4ae50 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S @@ -0,0 +1,60 @@ +.syntax unified +.cpu cortex-m4 +.thumb + +.align 2 +.global pointwise_sub +.type pointwise_sub, %function +pointwise_sub: + push {r4-r11, lr} + + movw r14, #25 + 1: + ldm r1!, {r3-r7} + ldm r2!, {r8-r12} + usub16 r3, r3, r8 + usub16 r4, r4, r9 + usub16 r5, r5, r10 + usub16 r6, r6, r11 + usub16 r7, r7, r12 + stm r0!, {r3-r7} + + subs.w r14, #1 + bne.w 1b + + ldm r1!, {r3-r5} + ldm r2!, {r8-r10} + usub16 r3, r3, r8 + usub16 r4, r4, r9 + usub16 r5, r5, r10 + stm r0!, {r3-r5} + pop {r4-r11, pc} + + +.align 2 +.global pointwise_add +.type pointwise_add, %function +pointwise_add: + push {r4-r11, lr} + + movw r14, #25 + 1: + ldm r1!, {r3-r7} + ldm r2!, {r8-r12} + uadd16 r3, r3, r8 + uadd16 r4, r4, r9 + uadd16 r5, r5, r10 + uadd16 r6, r6, r11 + uadd16 r7, r7, r12 + stm r0!, {r3-r7} + + subs.w r14, #1 + bne.w 1b + + 
ldm r1!, {r3-r5} + ldm r2!, {r8-r10} + uadd16 r3, r3, r8 + uadd16 r4, r4, r9 + uadd16 r5, r5, r10 + stm r0!, {r3-r5} + pop {r4-r11, pc} diff --git a/crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S b/crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S new file mode 100644 index 0000000..ffee442 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S @@ -0,0 +1,193 @@ +#include "macros.i" +.syntax unified +.cpu cortex-m4 +.thumb + +// void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *) +.global basemul_asm_opt_16_32 +.type basemul_asm_opt_16_32, %function +.align 2 +basemul_asm_opt_16_32: + push {r4-r11, lr} + + rptr_tmp .req r0 + aptr .req r1 + bptr .req r2 + aprimeptr .req r3 + poly0 .req r4 + poly1 .req r6 + poly2 .req r5 + poly3 .req r7 + q .req r8 + qa .req r9 + qinv .req r10 + tmp .req r11 + tmp2 .req r12 + loop .req r14 + + //movw qa, #26632 + //movt q, #3329 + ### qinv=0x6ba8f301 + //movw qinv, #62209 + //movt qinv, #27560 + + movw loop, #64 + 1: + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 + ldr poly2, [aptr], #4 + ldr poly3, [bptr], #4 + ldr.w tmp, [aprimeptr, #4] + ldr tmp2, [aprimeptr], #8 + + // (poly0_t * zeta) * poly1_t + poly0_b * poly1_b + smuad tmp2, tmp2, poly1 + str tmp2, [rptr_tmp], #4 + + // poly1_t * poly0_b + poly1_b * poly0_t + smuadx tmp2, poly0, poly1 + str tmp2, [rptr_tmp], #4 + + smuad tmp2, tmp, poly3 + str tmp2, [rptr_tmp], #4 + + smuadx tmp2, poly2, poly3 + str tmp2, [rptr_tmp], #4 + + subs.w loop, #1 + bne.w 1b + + pop {r4-r11, pc} + +// void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *) +.global basemul_asm_acc_opt_32_32 +.type basemul_asm_acc_opt_32_32, %function +.align 2 +basemul_asm_acc_opt_32_32: + push {r4-r11, lr} + + rptr_tmp .req r0 + aptr .req r1 + bptr .req r2 + aprimeptr .req r3 + poly0 .req r4 + poly1 .req r6 + res0 .req r5 + res1 .req r7 + q .req r8 + qa .req r9 + qinv .req r10 + tmp .req r11 + tmp2 .req r12 + loop .req r14 + + //movw qa, 
#26632 + //movt q, #3329 + ### qinv=0x6ba8f301 + //movw qinv, #62209 + //movt qinv, #27560 + + movw loop, #64 + 1: + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 + ldr.w res0, [rptr_tmp] + ldr tmp2, [aprimeptr], #4 + ldr.w res1, [rptr_tmp, #4] + + // (poly0_t * zeta) * poly1_t + poly0_b * poly0_t + res + smlad tmp2, tmp2, poly1, res0 + str tmp2, [rptr_tmp], #4 + + // poly1_t * poly0_b + poly1_b * poly0_t + res + smladx tmp, poly0, poly1, res1 + str tmp, [rptr_tmp], #4 + + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 + ldr.w res0, [rptr_tmp] + ldr tmp2, [aprimeptr], #4 + ldr.w res1, [rptr_tmp, #4] + + smlad tmp2, tmp2, poly1, res0 + str tmp2, [rptr_tmp], #4 + + smladx tmp, poly0, poly1, res1 + str tmp, [rptr_tmp], #4 + + subs.w loop, #1 + bne.w 1b + + pop {r4-r11, pc} + +.unreq rptr_tmp + + +// void basemul_asm_acc_opt_32_16(int16_t *, const int16_t *, const int16_t *, const int16_t *, const int32_t *) +.global basemul_asm_acc_opt_32_16 +.type basemul_asm_acc_opt_32_16, %function +.align 2 +basemul_asm_acc_opt_32_16: + push {r4-r11, lr} + + rptr .req r0 + aptr .req r1 + bptr .req r2 + aprimeptr .req r3 + poly0 .req r4 + poly1 .req r6 + res0 .req r5 + res1 .req r7 + q .req r8 + qa .req r9 + qinv .req r10 + //tmp .req r11 + tmp2 .req r12 + rptr_tmp .req r11 + loop .req r14 + + movw qa, #26632 + movt q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + + ldr rptr_tmp, [sp, #9*4] + movw loop, #64 + 1: + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 + ldr.w res0, [rptr_tmp], #4 + ldr tmp2, [aprimeptr], #4 + ldr.w res1, [rptr_tmp], #4 + + // (poly0_t * zeta) * poly1_t + poly0_b * poly0_b + res + smlad res0, tmp2, poly1, res0 + plant_red q, qa, qinv, res0 + + // poly1_t * poly0_b + poly1_b * poly0_t + res + smladx res1, poly0, poly1, res1 + plant_red q, qa, qinv, res1 + + pkhtb res0, res1, res0, asr#16 + str res0, [rptr], #4 + + ldr poly0, [aptr], #4 + ldr poly1, [bptr], #4 + ldr.w res0, [rptr_tmp], #4 + ldr tmp2, [aprimeptr], #4 + ldr.w res1, 
[rptr_tmp], #4 + + smlad res0, tmp2, poly1, res0 + plant_red q, qa, qinv, res0 + + smladx res1, poly0, poly1, res1 + plant_red q, qa, qinv, res1 + + pkhtb res0, res1, res0, asr#16 + str res0, [rptr], #4 + + subs.w loop, #1 + bne.w 1b + + pop {r4-r11, pc} \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S b/crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S new file mode 100644 index 0000000..606fe1f --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S @@ -0,0 +1,356 @@ +/****************************************************************************** +* Integrating the improved Plantard arithmetic into Kyber. +* +* Efficient Plantard arithmetic enables a faster Kyber implementation with the +* same stack usage. +* +* See the paper at https://eprint.iacr.org/2022/956.pdf for more details. +* +* @author Junhao Huang, BNU-HKBU United International College, Zhuhai, China +* jhhuang_nuaa@126.com +* +* @date September 2022 +******************************************************************************/ +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +.macro mul_twiddle_plant a, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a + smulwt \a, \twiddle, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a1 + smulwt \a1, \twiddle, \a1 + smlabt \tmp, \tmp, \q, \qa + smlabt \a1, \a1, \q, \qa + pkhtb \tmp, \a1, \tmp, asr#16 + usub16 \a1, \a0, \tmp + uadd16 \a0, \a0, \tmp +.endm + +.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa + doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa + doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa +.endm + +.macro fullplant a0, a1, a2, a3, a4, a5, a6, a7, tmp, q, qa, plantconst + movw \plantconst, #44984 + movt \plantconst, #19 + doubleplant \a0, \tmp, \q, \qa, \plantconst + doubleplant \a1, \tmp, \q, \qa, \plantconst + 
doubleplant \a2, \tmp, \q, \qa, \plantconst + doubleplant \a3, \tmp, \q, \qa, \plantconst + doubleplant \a4, \tmp, \q, \qa, \plantconst + doubleplant \a5, \tmp, \q, \qa, \plantconst + doubleplant \a6, \tmp, \q, \qa, \plantconst + doubleplant \a7, \tmp, \q, \qa, \plantconst +.endm + +.macro halfplant a0, a1, a2, a3, tmp, q, qa, plantconst + movw \plantconst, #44984 + movt \plantconst, #19 + doubleplant \a0, \tmp, \q, \qa, \plantconst + doubleplant \a1, \tmp, \q, \qa, \plantconst + doubleplant \a2, \tmp, \q, \qa, \plantconst + doubleplant \a3, \tmp, \q, \qa, \plantconst +.endm + + +// input: 0.5/1q +.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + // tmp, c1, tmp2, c3: 1q maximum + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + // c0,c5,c2,c7 1q maximum + + // layer 2 + sadd16.w \c6, \tmp, \tmp2 // c0, c2 + ssub16.w \tmp2, \tmp, \tmp2 + sadd16.w \c4, \c0, \c2 // c4, c6 + ssub16.w \c2, \c0, \c2 + // c6, tmp2, c4, c2: 2q maximum + + vmov.w \twiddle1, \xi2 + doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa + doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa + // c1, c3, c7, c5: 1.5q maximum; + + // tmp and c0 are free at this point + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + // c0, c4: 4q + // c6 are free at this point + vmov.w \twiddle1, \xi4 + doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa + // c1, c5: 2q maximum + + vmov.w \twiddle1, \xi5 + // this block is one doublebutterfly + smulwb \tmp, \twiddle1, \c2 // c2, c6 + smulwt \c2, \twiddle1, \c2 + smlabt \tmp, \tmp, \q, \qa + smlabt \c2, \c2, \q, \qa + pkhtb \tmp, \c2, \tmp, asr#16 + ssub16.w \c6, \tmp2, \tmp + sadd16.w \c2, \tmp2, \tmp + //c6, 
c2: 4.5q + vmov.w \twiddle1, \xi6 + doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa + //c3, c7: 2.5q maximum +.endm +.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa +.endm +# input coefficients < 0.5q +.global invntt_fast +.type invntt_fast, %function +.align 2 +invntt_fast: + push {r4-r11, r14} + vpush.w {s16-s23} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + 
poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + q .req r12 + // at the top of r12 + qa .req r0 + // qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + movt q, #3329 + + ### LAYER 7+6+5+4 + .equ distance, 16 + .equ offset, 32 + .equ strincr, 64 + + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s22} + + add.w tmp, poly, #8*strincr + vmov s8, tmp + 1: + vmov s23, poly + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #26632 + + // NTT on a1, a3, ..., a15 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // multiply coeffs by layer 4 twiddles for later use + // vmov twiddle1, s15 + vmov twiddle2, s16 + // mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + // 0.5q + // ---------- + + vmov poly, s23 + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, 
#7*distance/4 + + movw qa, #26632 + // NTT on a0, a2, ..., a14 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + // 1,3,5,7: <5q; 0,2,4,6:<1q + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle2, s1 // load a3 + uadd16 tmp, poly1, twiddle2 + usub16 poly1, poly1, twiddle2 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle2, s3 // load a7 + uadd16 tmp, poly3, twiddle2 + usub16 poly3, poly3, twiddle2 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle2, s5 // load a11 + uadd16 tmp, poly5, twiddle2 + usub16 poly5, poly5, twiddle2 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle2, s7 // load a15 + uadd16 tmp, poly7, twiddle2 + usub16 poly7, poly7, twiddle2 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + //1,3,5,7: < 5.5q + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle2, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle2, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle2, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle2, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle2, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle2, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle2, poly0, poly1 + str.w twiddle2, [poly, #offset] + str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) + //0,2,4,6: < 1.5q + vmov tmp, s8 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, 
distance*16 + .equ strincr, 4 + + // ITER 0 + vmov s6, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + vldm twiddle_ptr!, {s0-s5} + movw qa, #26632 + fullplant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7 tmp, q, qa, twiddle1 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + // ITER 1-15 + add.w tmp, poly, #strincr*3*(5) + vmov s14, tmp + 2: + vmov s6, poly + // polys upto 5.5q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #26632 + _3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov tmp, s14 + cmp.w poly, tmp + bne.w 2b + + vpop.w {s16-s23} + pop {r4-r11, pc} diff --git a/crypto_kem/ml-kem-768/m4fspeed/fastntt.S b/crypto_kem/ml-kem-768/m4fspeed/fastntt.S new file 
mode 100644 index 0000000..ddc1906 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/fastntt.S @@ -0,0 +1,265 @@ +/****************************************************************************** +* Integrating the improved Plantard arithmetic into Kyber. +* +* Efficient Plantard arithmetic enables a faster Kyber implementation with the +* same stack usage. +* +* See the paper at https://eprint.iacr.org/2022/956.pdf for more details. +* +* @author Junhao Huang, BNU-HKBU United International College, Zhuhai, China +* jhhuang_nuaa@126.com +* +* @date September 2022 +******************************************************************************/ +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +.macro mul_twiddle_plant a, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a + smulwt \a, \twiddle, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a1 + smulwt \a1, \twiddle, \a1 + smlabt \tmp, \tmp, \q, \qa + smlabt \a1, \a1, \q, \qa + pkhtb \tmp, \a1, \tmp, asr#16 + usub16 \a1, \a0, \tmp + uadd16 \a0, \a0, \tmp +.endm + +.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa + doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa + doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa +.endm + +.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, 
[\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp + // layer 3 + vmov \twiddle1, \xi0 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + vmov \twiddle1, \xi1 + vmov \twiddle2, \xi2 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + vmov \twiddle1, \xi3 + vmov \twiddle2, \xi4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + vmov \twiddle1, \xi5 + vmov \twiddle2, \xi6 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.global ntt_fast +.type ntt_fast, %function +.align 2 +ntt_fast: + push {r4-r11, r14} + vpush.w {s16-s24} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + ### qinv .req r11 ### q^-1 mod 2^2n; n=16 + q .req r12 + ### at the top of r12 + qa .req r0 + ### qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + // movw qa, #26632 + // Why movt? Because we initially placed qa at the bottom of the same register as q; + movt q, #3329 + + ### LAYER 7+6+5+4 + .equ distance, 256 + .equ offset, 32 + .equ strincr, 4 + // pre-load 15 twiddle factors to 15 FPU registers + // s0-s7 used to temporary store 16 16-bit polys. 
+ vldm twiddle_ptr!, {s8-s22} + + add tmp, poly, #strincr*8 + // s23: poly addr + // s24: tmp + vmov s24, tmp + 1: + // load a1, a3, ..., a15 + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #26632 + + // 8-NTT on a1, a3, ..., a15 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // s15, s16, s17, s18, s19, s20, s21, s22 left + // multiply coeffs by layer 8 twiddles for later use + vmov twiddle1, s15 + vmov twiddle2, s16 + mul_twiddle_plant poly0, twiddle1, tmp, q, qa + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + + vmov poly, s23 + + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #26632 + // 8-NTT on a0, a2, ..., a14 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle1, s1 
// load a3 + uadd16 tmp, poly1, twiddle1 + usub16 poly1, poly1, twiddle1 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle1, s3 // load a7 + uadd16 tmp, poly3, twiddle1 + usub16 poly3, poly3, twiddle1 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle1, s5 // load a11 + uadd16 tmp, poly5, twiddle1 + usub16 poly5, poly5, twiddle1 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle1, s7 // load a15 + uadd16 tmp, poly7, twiddle1 + usub16 poly7, poly7, twiddle1 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle1, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle1, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle1, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle1, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle1, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle1, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle1, poly0, poly1 + str.w twiddle1, [poly, #offset] + str.w tmp, [poly], #4 + + vmov tmp, s24 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance/16 + .equ strincr, 32 + + add.w tmp, poly, #strincr*16 + vmov s13, tmp + 2: + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #26632 + _3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s23 + store poly, poly4, poly5, 
poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #strincr + + vmov tmp, s13 + cmp.w poly, tmp + bne.w 2b + vpop.w {s16-s24} + pop {r4-r11, pc} diff --git a/crypto_kem/ml-kem-768/m4fspeed/indcpa.c b/crypto_kem/ml-kem-768/m4fspeed/indcpa.c new file mode 100644 index 0000000..1aceabe --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/indcpa.c @@ -0,0 +1,244 @@ +#include "indcpa.h" +#include "ntt.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "symmetric.h" +#include "matacc.h" + +#include +#include +/************************************************* +* Name: indcpa_keypair_derand +* +* Description: Generates public and private key for the CPA-secure +* public-key encryption scheme underlying Kyber +* +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key +* (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* - const uint8_t *coins: pointer to input randomness +* (of length KYBER_SYMBYTES bytes) +**************************************************/ +void indcpa_keypair_derand(unsigned char *pk, + unsigned char *sk, + const unsigned char *coins){ + polyvec skpv, skpv_prime; + poly pkp; + unsigned char buf[2 * KYBER_SYMBYTES]; + unsigned char *publicseed = buf; + unsigned char *noiseseed = buf + KYBER_SYMBYTES; + int i; + unsigned char nonce = 0; + + memcpy(buf, coins, KYBER_SYMBYTES); + buf[KYBER_SYMBYTES] = KYBER_K; + hash_g(buf, buf, KYBER_SYMBYTES + 1); + + for (i = 0; i < KYBER_K; i++) + poly_getnoise(skpv.vec + i, noiseseed, nonce++); + + polyvec_ntt(&skpv); + + // i = 0 + matacc_cache32(&pkp, &skpv, &skpv_prime, 0, publicseed, 0); + poly_invntt(&pkp); + + poly_addnoise(&pkp, noiseseed, nonce++); + poly_ntt(&pkp); + + poly_tobytes(pk, &pkp); + for (i = 1; i < KYBER_K; i++) { + 
matacc_opt32(&pkp, &skpv, &skpv_prime, i, publicseed, 0); + poly_invntt(&pkp); + + poly_addnoise(&pkp, noiseseed, nonce++); + poly_ntt(&pkp); + + poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp); + } + polyvec_tobytes(sk, &skpv); + memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key +} + +/************************************************* +* Name: indcpa_enc +* +* Description: Encryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. +* +* Arguments: - unsigned char *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) +* - const unsigned char *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) +* - const unsigned char *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) +* to deterministically generate all randomness +**************************************************/ +void indcpa_enc(unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins) { + polyvec sp, sp_prime; + poly bp; + poly *pkp = &bp; + poly *k = &bp; + poly *v = &sp.vec[0]; + const unsigned char *seed = pk+KYBER_POLYVECBYTES; + int i; + unsigned char nonce = 0; + + for (i = 0; i < KYBER_K; i++) + poly_getnoise(sp.vec + i, coins, nonce++); + + polyvec_ntt(&sp); + + // i = 0 + matacc_cache32(&bp, &sp, &sp_prime, 0, seed, 1); + poly_invntt(&bp); + poly_addnoise(&bp, coins, nonce++); + poly_reduce(&bp); + poly_packcompress(c, &bp, 0); + for (i = 1; i < KYBER_K; i++) { + matacc_opt32(&bp, &sp, &sp_prime, i, seed, 1); + poly_invntt(&bp); + + poly_addnoise(&bp, coins, nonce++); + poly_reduce(&bp); + + poly_packcompress(c, &bp, i); + } + + poly_frombytes(pkp, pk); + int32_t v_tmp[KYBER_N]; + + poly_basemul_opt_16_32(v_tmp, &sp.vec[0], pkp, &sp_prime.vec[0]); + for (i = 1; i < KYBER_K - 1; i++) { + poly_frombytes(pkp, pk + 
i*KYBER_POLYBYTES); + poly_basemul_acc_opt_32_32(v_tmp, &sp.vec[i], pkp, &sp_prime.vec[i]); + } + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc_opt_32_16(v, &sp.vec[i], pkp, &sp_prime.vec[i], v_tmp); + + poly_invntt(v); + + poly_addnoise(v, coins, nonce++); + + poly_frommsg(k, m); + poly_add(v, v, k); + poly_reduce(v); + + poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); +} + +/************************************************* +* Name: indcpa_enc_cmp +* +* Description: Re-encryption function. +* Compares the re-encypted ciphertext with the original ciphertext byte per byte. +* The comparison is performed in a constant time manner. +* +* +* Arguments: - unsigned char *ct: pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes) +* - const unsigned char *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) +* - const unsigned char *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) +* to deterministically generate all randomness +* Returns: - boolean byte indicating that re-encrypted ciphertext is NOT equal to the original ciphertext +**************************************************/ +unsigned char indcpa_enc_cmp(const unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins) { + uint64_t rc = 0; + polyvec sp, sp_prime; + poly bp; + poly *pkp = &bp; + poly *k = &bp; + poly *v = &sp.vec[0]; + const unsigned char *seed = pk+KYBER_POLYVECBYTES; + int i; + unsigned char nonce = 0; + + for (i = 0; i < KYBER_K; i++) + poly_getnoise(sp.vec + i, coins, nonce++); + + polyvec_ntt(&sp); + + // i = 0 + matacc_cache32(&bp, &sp, &sp_prime, 0, seed, 1); + poly_invntt(&bp); + poly_addnoise(&bp, coins, nonce++); + poly_reduce(&bp); + rc |= cmp_poly_packcompress(c, &bp, 0); + for (i = 1; i < KYBER_K; i++) { + matacc_opt32(&bp, 
&sp, &sp_prime, i, seed, 1); + poly_invntt(&bp); + + poly_addnoise(&bp, coins, nonce++); + poly_reduce(&bp); + + rc |= cmp_poly_packcompress(c, &bp, i); + } + + poly_frombytes(pkp, pk); + int32_t v_tmp[KYBER_N]; + + poly_basemul_opt_16_32(v_tmp, &sp.vec[0], pkp, &sp_prime.vec[0]); + for (i = 1; i < KYBER_K - 1; i++) { + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc_opt_32_32(v_tmp, &sp.vec[i], pkp, &sp_prime.vec[i]); + } + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc_opt_32_16(v, &sp.vec[i], pkp, &sp_prime.vec[i], v_tmp); + + poly_invntt(v); + + poly_addnoise(v, coins, nonce++); + poly_frommsg(k, m); + poly_add(v, v, k); + poly_reduce(v); + + rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); + + rc = ~rc + 1; + rc >>= 63; + return (unsigned char)rc; +} + +/************************************************* +* Name: indcpa_dec +* +* Description: Decryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. +* +* Arguments: - unsigned char *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) +* - const unsigned char *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) +* - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +**************************************************/ +void __attribute__ ((noinline)) indcpa_dec(unsigned char *m, + const unsigned char *c, + const unsigned char *sk) { + poly mp, bp; + poly *v = &bp; + int32_t r_tmp[KYBER_N]; + int i; + + poly_unpackdecompress(&mp, c, 0); + poly_ntt(&mp); + poly_frombytes_mul_16_32(r_tmp, &mp, sk); + for(i = 1; i < KYBER_K - 1; i++) { + poly_unpackdecompress(&bp, c, i); + poly_ntt(&bp); + poly_frombytes_mul_32_32(r_tmp, &bp, sk + i*KYBER_POLYBYTES); + } + poly_unpackdecompress(&bp, c, i); + poly_ntt(&bp); + poly_frombytes_mul_32_16(&mp, &bp, sk + i*KYBER_POLYBYTES, r_tmp); + + poly_invntt(&mp); + poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES); + poly_sub(&mp, v, 
&mp); + poly_reduce(&mp); + + poly_tomsg(m, &mp); +} diff --git a/crypto_kem/ml-kem-768/m4fspeed/indcpa.h b/crypto_kem/ml-kem-768/m4fspeed/indcpa.h new file mode 100644 index 0000000..6d5588a --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/indcpa.h @@ -0,0 +1,22 @@ +#ifndef INDCPA_H +#define INDCPA_H + +void indcpa_keypair_derand(unsigned char *pk, + unsigned char *sk, + const unsigned char *coins); + +void indcpa_enc(unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins); + +unsigned char indcpa_enc_cmp(const unsigned char *ct, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins); + +void indcpa_dec(unsigned char *m, + const unsigned char *c, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/ml-kem-768/m4fspeed/kem.c b/crypto_kem/ml-kem-768/m4fspeed/kem.c new file mode 100644 index 0000000..5cfa62b --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/kem.c @@ -0,0 +1,159 @@ +#include "api.h" +#include "indcpa.h" +#include "params.h" +#include "randombytes.h" +#include "symmetric.h" +#include "verify.h" + +#include + +#include + +#include + + +/************************************************* +* Name: crypto_kem_keypair_derand +* +* Description: Generates public and private key +* for CCA-secure Kyber key encapsulation mechanism +* +* Arguments: - uint8_t *pk: pointer to output public key +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key +* (an already allocated array of KYBER_SECRETKEYBYTES bytes) +* - uint8_t *coins: pointer to input randomness +* (an already allocated array filled with 2*KYBER_SYMBYTES random bytes) +** +* Returns 0 (success) +**************************************************/ +static int crypto_kem_keypair_derand(uint8_t *pk, + uint8_t *sk, + const uint8_t *coins) { + indcpa_keypair_derand(pk, sk, coins); + memcpy(sk + KYBER_INDCPA_SECRETKEYBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_h(sk + 
KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + /* Value z for pseudo-random output on reject */ + memcpy(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, coins + KYBER_SYMBYTES, KYBER_SYMBYTES); + return 0; +} + +/************************************************* +* Name: crypto_kem_keypair +* +* Description: Generates public and private key +* for CCA-secure Kyber key encapsulation mechanism +* +* Arguments: - unsigned char *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + uint8_t coins[2 * KYBER_SYMBYTES]; + randombytes(coins, 2 * KYBER_SYMBYTES); + crypto_kem_keypair_derand(pk, sk, coins); + return 0; +} + + +/************************************************* +* Name: crypto_kem_enc_derand +* +* Description: Generates cipher text and shared +* secret for given public key +* +* Arguments: - uint8_t *ct: pointer to output cipher text +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) +* - uint8_t *ss: pointer to output shared secret +* (an already allocated array of KYBER_SSBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) +* - const uint8_t *coins: pointer to input randomness +* (an already allocated array filled with KYBER_SYMBYTES random bytes) +** +* Returns 0 (success) +**************************************************/ +static int crypto_kem_enc_derand(uint8_t *ct, + uint8_t *ss, + const uint8_t *pk, + const uint8_t *coins) { + uint8_t buf[2 * KYBER_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; + + memcpy(buf, coins, KYBER_SYMBYTES); + + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf + 
KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); + + /* coins are in kr+KYBER_SYMBYTES */ + indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); + + memcpy(ss, kr, KYBER_SYMBYTES); + return 0; +} + +/************************************************* +* Name: crypto_kem_enc +* +* Description: Generates cipher text and shared +* secret for given public key +* +* Arguments: - uint8_t *ct: pointer to output cipher text +* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes) +* - uint8_t *ss: pointer to output shared secret +* (an already allocated array of KYBER_SSBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (an already allocated array of KYBER_PUBLICKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int crypto_kem_enc(uint8_t *ct, + uint8_t *ss, + const uint8_t *pk) { + uint8_t coins[KYBER_SYMBYTES]; + randombytes(coins, KYBER_SYMBYTES); + crypto_kem_enc_derand(ct, ss, pk, coins); + return 0; +} + +/************************************************* +* Name: crypto_kem_dec +* +* Description: Generates shared secret for given +* cipher text and private key +* +* Arguments: - unsigned char *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0. +* +* On failure, ss will contain a pseudo-random value. 
+**************************************************/ +int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) { + int fail; + uint8_t buf[2 * KYBER_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; + const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; + + indcpa_dec(buf, ct, sk); + + /* Multitarget countermeasure for coins + contributory KEM */ + memcpy(buf + KYBER_SYMBYTES, sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, KYBER_SYMBYTES); + hash_g(kr, buf, 2 * KYBER_SYMBYTES); + + /* coins are in kr+KYBER_SYMBYTES */ + fail = indcpa_enc_cmp(ct, buf, pk, kr + KYBER_SYMBYTES); + + /* Compute rejection key */ + rkprf(ss, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, ct); + + /* Copy true key to return buffer if fail is false */ + cmov(ss, kr, KYBER_SYMBYTES, (uint8_t) (1 - fail)); + + return 0; +} diff --git a/crypto_kem/ml-kem-768/m4fspeed/macros.i b/crypto_kem/ml-kem-768/m4fspeed/macros.i new file mode 100644 index 0000000..ebe5743 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/macros.i @@ -0,0 +1,60 @@ +/****************************************************************************** + * Integrating the improved Plantard arithmetic into Kyber. + * + * Efficient Plantard arithmetic enables a faster Kyber implementation with the + * same stack usage. + * + * See the paper at https://eprint.iacr.org/2022/956.pdf for more details. 
+ * + * @author Junhao Huang, BNU-HKBU United International College, Zhuhai, China + * jhhuang_nuaa@126.com + * + * @date September 2022 + ******************************************************************************/ +#ifndef MACROS_I +#define MACROS_I + +.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + ldr.w \a0, [\a, \mem0] + ldr.w \a1, [\a, \mem1] + ldr.w \a2, [\a, \mem2] + ldr.w \a3, [\a, \mem3] +.endm + +.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + str.w \a0, [\a, \mem0] + str.w \a1, [\a, \mem1] + str.w \a2, [\a, \mem2] + str.w \a3, [\a, \mem3] +.endm + +.macro doubleplant a, tmp, q, qa, plantconst + smulwb \tmp, \plantconst, \a + smulwt \a, \plantconst, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebarrett a, tmp, tmp2, q, barrettconst + smulbb \tmp, \a, \barrettconst + smultb \tmp2, \a, \barrettconst + asr \tmp, \tmp, #26 + asr \tmp2, \tmp2, #26 + smulbb \tmp, \tmp, \q + smulbb \tmp2, \tmp2, \q + pkhbt \tmp, \tmp, \tmp2, lsl#16 + usub16 \a, \a, \tmp +.endm + +// q locate in the top half of the register +.macro plant_red q, qa, qinv, tmp + mul \tmp, \tmp, \qinv + //tmp*qinv mod 2^2n/ 2^n; in high half + smlatt \tmp, \tmp, \q, \qa + // result in high half +.endm + + + +#endif /* MACROS_I */ diff --git a/crypto_kem/ml-kem-768/m4fspeed/matacc.c b/crypto_kem/ml-kem-768/m4fspeed/matacc.c new file mode 100644 index 0000000..736c5ae --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/matacc.c @@ -0,0 +1,121 @@ +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "symmetric.h" +#include "ntt.h" +#include "matacc.h" + + +/************************************************* +* Name: matacc_cache32 +* +* Description: Multiplies a row of A or A^T, generated on-the-fly, +* with a vector of polynomials and accumulates into the result. +* Using asymmetric multiplication and better accumulation. 
+* +* Arguments: - poly *r: pointer to output polynomial to accumulate in +* - const polyvec *b: pointer to input vector of polynomials to multiply with +* - polyvec *b_prime: pointer to output vector of polynomials to store b multiplied by zetas +* - unsigned char i: byte to indicate the index < KYBER_K of the row of A or A^T +* - const unsigned char *seed: pointer to the public seed used to generate A +* - int transposed: boolean indicatin whether A or A^T is generated +**************************************************/ +void matacc_cache32(poly* r, const polyvec *b, polyvec *b_prime, unsigned char i, const unsigned char *seed, int transposed) { + unsigned char buf[XOF_BLOCKBYTES+2]; + xof_state state; + int16_t c[4]; + int32_t r_tmp[KYBER_N]; // stores intermediate accumulated values to save reductions + int j = 0; + + // 16-32 + + if (transposed) + xof_absorb(&state, seed, i, j); + else + xof_absorb(&state, seed, j, i); + + xof_squeezeblocks(buf, 1, &state); + + matacc_asm_cache_16_32(r_tmp, b->vec[j].coeffs, c, buf, zetas, &state, b_prime->vec[j].coeffs); + + // 32-32 KYBER_K - 2 times + for(j=1;jvec[j].coeffs, c, buf, zetas, &state, b_prime->vec[j].coeffs); + } + + // 32-16 + + if (transposed) + xof_absorb(&state, seed, i, j); + else + xof_absorb(&state, seed, j, i); + + xof_squeezeblocks(buf, 1, &state); + + matacc_asm_cache_32_16(r->coeffs, b->vec[j].coeffs, c, buf, zetas, &state, b_prime->vec[j].coeffs, r_tmp); +} + +/************************************************* +* Name: matacc_opt32 +* +* Description: Multiplies a row of A or A^T, generated on-the-fly, +* with a vector of polynomials and accumulates into the result. +* Using asymmetric multiplication and better accumulation. 
+* +* Arguments: - poly *r: pointer to output polynomial to accumulate in +* - const polyvec *b: pointer to input vector of polynomials to multiply with +* - const polyvec *b_prime: pointer to input vector of polynomials to store b multiplied by zetas +* - unsigned char i: byte to indicate the index < KYBER_K of the row of A or A^T +* - const unsigned char *seed: pointer to the public seed used to generate A +* - int transposed: boolean indicatin whether A or A^T is generated +**************************************************/ +void matacc_opt32(poly* r, const polyvec *b, const polyvec *b_prime, unsigned char i, const unsigned char *seed, int transposed) { + unsigned char buf[XOF_BLOCKBYTES+2]; + xof_state state; + int16_t c[4]; + int32_t r_tmp[KYBER_N]; // stores intermediate accumulated values to save reductions + int j = 0; + + // 16-32 + + if (transposed) + xof_absorb(&state, seed, i, j); + else + xof_absorb(&state, seed, j, i); + + xof_squeezeblocks(buf, 1, &state); + + matacc_asm_opt_16_32(r_tmp, b->vec[j].coeffs, c, buf, &state, b_prime->vec[j].coeffs); + + // 32-32 KYBER_K - 2 times + for(j=1;jvec[j].coeffs, c, buf, &state, b_prime->vec[j].coeffs); + } + + // 32-16 + + if (transposed) + xof_absorb(&state, seed, i, j); + else + xof_absorb(&state, seed, j, i); + + xof_squeezeblocks(buf, 1, &state); + + matacc_asm_opt_32_16(r->coeffs, b->vec[j].coeffs, c, buf, &state, b_prime->vec[j].coeffs, r_tmp); +} diff --git a/crypto_kem/ml-kem-768/m4fspeed/matacc.h b/crypto_kem/ml-kem-768/m4fspeed/matacc.h new file mode 100644 index 0000000..39c0c79 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/matacc.h @@ -0,0 +1,63 @@ +#ifndef MATACC_H +#define MATACC_H +#include "poly.h" +#include "polyvec.h" +#include "symmetric.h" + +extern void matacc_asm_cache_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr); +static inline void _matacc_asm_cache_16_32(int32_t *r_tmp, 
const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t _zetas[64], xof_state *state, int16_t *aprimeptr) +{ + // floating point registers clobbered by assembly function + asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29"); + matacc_asm_cache_16_32(r_tmp, b, c, buf, _zetas, state, aprimeptr); +} +#define matacc_asm_cache_16_32 _matacc_asm_cache_16_32 + +extern void matacc_asm_cache_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr); +static inline void _matacc_asm_cache_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t _zetas[64], xof_state *state, int16_t *aprimeptr) +{ + // floating point registers clobbered by assembly function + asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29"); + matacc_asm_cache_32_32(r_tmp, b, c, buf, _zetas, state, aprimeptr); +} +#define matacc_asm_cache_32_32 _matacc_asm_cache_32_32 + +extern void matacc_asm_cache_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr, const int32_t *r_tmp); +static inline void _matacc_asm_cache_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t _zetas[64], xof_state *state, int16_t *aprimeptr, const int32_t *r_tmp) +{ + // floating point registers clobbered by assembly function + asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29"); + matacc_asm_cache_32_16(r, b, c, buf, _zetas, state, aprimeptr, r_tmp); +} +#define matacc_asm_cache_32_16 _matacc_asm_cache_32_16 + +extern void matacc_asm_opt_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr); +static inline void 
_matacc_asm_opt_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr) +{ + // floating point registers clobbered by assembly function + asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29"); + matacc_asm_opt_16_32(r_tmp, b, c, buf, state, aprimeptr); +} +#define matacc_asm_opt_16_32 _matacc_asm_opt_16_32 + +extern void matacc_asm_opt_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr); +static inline void _matacc_asm_opt_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr) +{ + // floating point registers clobbered by assembly function + asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29"); + matacc_asm_opt_32_32(r_tmp, b, c, buf, state, aprimeptr); +} +#define matacc_asm_opt_32_32 _matacc_asm_opt_32_32 + +extern void matacc_asm_opt_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr, const int32_t *r_tmp); +static inline void _matacc_asm_opt_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr, const int32_t *r_tmp) +{ + // floating point registers clobbered by assembly function + asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29"); + matacc_asm_opt_32_16(r, b, c, buf, state, aprimeptr, r_tmp); +} +#define matacc_asm_opt_32_16 _matacc_asm_opt_32_16 + +void matacc_opt32(poly* r, const polyvec *b, const polyvec *b_prime, unsigned char i, const unsigned char *seed, int transposed); +void matacc_cache32(poly* r, const polyvec *b, polyvec *b_prime, unsigned char i, const unsigned char *seed, int transposed); +#endif \ No newline at end of file diff --git 
a/crypto_kem/ml-kem-768/m4fspeed/matacc.i b/crypto_kem/ml-kem-768/m4fspeed/matacc.i new file mode 100644 index 0000000..d0da46a --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/matacc.i @@ -0,0 +1,301 @@ + +// q locates in the bottom half of the register +.macro plant_red_b q, qa, qinv, tmp + mul \tmp, \tmp, \qinv + //tmp*qinv mod 2^2n/ 2^n; in high half + smlatb \tmp, \tmp, \q, \qa + // result in high half +.endm + +// Checks if val0 is suitable and multiplies with values from bptr using func +.macro first_if func, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + // if (val0 < KYBER_Q) + cmp.w \val0, \q + bhs.w 2f + strh \val0, [\cptr], #2 + add \k, #1 + cmp.w \k, #4 + bne.w 2f + sub \cptr, #4*2 + vmov s18, \bufptr + vmov s19, \ctr + vmov s20, \val1 + \func \rptr, \bptr, \cptr, \zetaptr, \bufptr, \k, \val0, \val1, \q, \qa, \qinv, \tmp, \tmp2, \ctr + vmov \bufptr, s18 + vmov \ctr, s19 + vmov \val1, s20 + + add \ctr, #1 + + movw \k, #0 + 2: +.endm + +// Checks if val1 is suitable and multiplies with values from bptr using func +.macro second_if func, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr +// if (val1 < KYBER_Q && ctr < KYBER_N/4) + cmp.w \val1, \q + bhs.w 2f + cmp.w \ctr, #256/4 + bge.w 2f + strh \val1, [\cptr], #2 + add \k, #1 + cmp.w \k, #4 + bne.w 2f + sub \cptr, #4*2 + vmov s18, \bufptr + vmov s19, \ctr + \func \rptr, \bptr, \cptr, \zetaptr, \bufptr, \k, \val0, \val1, \q, \qa, \qinv, \tmp, \tmp2, \ctr + vmov \bufptr, s18 + vmov \ctr, s19 + + add \ctr, #1 + + movw \k, #0 + 2: +.endm + +.macro doublebasemul_asm_cache_16_32 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, tmp, tmp2, q, qa, qinv, res, aprimeptr, zeta + vmov \aprimeptr, s27 + ldr \poly0, [\aptr], #4 + ldr \poly1, [\bptr] + ldr \zeta, [\zetaptr], #4 + + smulwt \tmp, \zeta, \poly0 + smlabb \tmp, \tmp, \q, \qa + pkhbt \tmp, \poly0, \tmp + str \tmp, [\aprimeptr], #4 // store (poly0_t*zeta || poly0_b) for later re-use + smultt \tmp2, 
\tmp, \poly1 + smlabb \tmp2, \poly0, \poly1, \tmp2 + + smuadx \tmp, \poly0, \poly1 + + str.w \tmp, [\rptr_tmp, #4] + str \tmp2, [\rptr_tmp], #8 + + neg \zeta, \zeta + + ldr \poly0, [\aptr], #4 + ldr.w \poly1, [\bptr, #4] + + smulwt \tmp, \zeta, \poly0 + smlabb \tmp, \tmp, \q, \qa + pkhbt \tmp, \poly0, \tmp + str \tmp, [\aprimeptr], #4 // store (poly2_t*zeta || poly2_b) for later re-use + smultt \tmp2, \tmp, \poly1 + smlabb \tmp2, \poly0, \poly1, \tmp2 + + smuadx \tmp, \poly0, \poly1 + str.w \tmp, [\rptr_tmp, #4] + str \tmp2, [\rptr_tmp], #8 + vmov s27, \aprimeptr +.endm + +.macro doublebasemul_asm_acc_cache_32_32 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, tmp, tmp2, q, qa, qinv, res, aprimeptr, zeta + vmov \aprimeptr, s27 + ldr \poly0, [\aptr], #4 + ldr \poly1, [\bptr] + + ldr \res, [\rptr_tmp] + ldr \zeta, [\zetaptr], #4 + + smulwt \tmp, \zeta, \poly0 + smlabb \tmp, \tmp, \q, \qa + pkhbt \tmp, \poly0, \tmp + str \tmp, [\aprimeptr], #4 // store (poly0_t*zeta || poly0_b) for later re-use + smlatt \tmp, \tmp, \poly1, \res + smlabb \res, \poly0, \poly1, \tmp + str \res, [\rptr_tmp], #4 + + ldr.w \res, [\rptr_tmp] + smladx \res, \poly0, \poly1, \res + + str.w \res, [\rptr_tmp], #4 + + neg \zeta, \zeta + + ldr \poly0, [\aptr], #4 + ldr.w \poly1, [\bptr, #4] + ldr \res, [\rptr_tmp] + smulwt \tmp, \zeta, \poly0 + smlabb \tmp, \tmp, \q, \qa + pkhbt \tmp, \poly0, \tmp + str \tmp, [\aprimeptr], #4 // store (poly2_t*zeta || poly2_b) for later re-use + smlatt \tmp, \tmp, \poly1, \res + smlabb \res, \poly0, \poly1, \tmp + str.w \res, [\rptr_tmp], #4 + + ldr.w \res, [\rptr_tmp] + smladx \res, \poly0, \poly1, \res + + str \res, [\rptr_tmp], #4 + vmov s27, \aprimeptr +.endm + +.macro doublebasemul_asm_acc_cache_32_16 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, tmp, tmp2, q, qa, qinv, res, aprimeptr, zeta + vmov \aprimeptr, s27 + ldr \poly0, [\aptr], #4 + ldr \poly1, [\bptr] + + ldr \res, [\rptr_tmp], #4 + ldr \zeta, [\zetaptr], #4 + + smulwt \tmp, \zeta, \poly0 + smlabb \tmp, 
\tmp, \q, \qa + pkhbt \tmp, \poly0, \tmp + str \tmp, [\aprimeptr], #4 // store (poly0_t*zeta || poly0_b) for later re-use + smlatt \tmp, \tmp, \poly1, \res + smlabb \tmp2, \poly0, \poly1, \tmp + + plant_red_b \q, \qa, \qinv, \tmp2 + ldr.w \tmp, [\rptr_tmp], #4 + smladx \tmp, \poly0, \poly1, \tmp + + plant_red_b \q, \qa, \qinv, \tmp + + pkhtb \res, \tmp, \tmp2, asr#16 + vmov \tmp2, s28 + str \res, [\tmp2], #4 + + neg \zeta, \zeta + + ldr \poly0, [\aptr], #4 + ldr.w \poly1, [\bptr, #4] + + smulwt \tmp, \zeta, \poly0 + smlabb \tmp, \tmp, \q, \qa + pkhbt \tmp, \poly0, \tmp + ldr \res, [\rptr_tmp], #4 + str \tmp, [\aprimeptr], #4 // store (poly2_t*zeta || poly2_b) for later re-use + smlatt \tmp, \tmp, \poly1, \res + smlabb \tmp, \poly0, \poly1, \tmp + + plant_red_b \q, \qa, \qinv, \tmp + + ldr \res, [\rptr_tmp], #4 + smladx \res, \poly0, \poly1, \res + + plant_red_b \q, \qa, \qinv, \res + + pkhtb \res, \res, \tmp, asr#16 + + str \res, [\tmp2], #4 + vmov s28, \tmp2 + vmov s27, \aprimeptr +.endm + +.macro load_vals val0, val1, bufptr, tmp + ldrh \val0, [\bufptr], #2 + ldrb \val1, [\bufptr], #1 + ubfx \tmp, \val0, #12, #4 + orr \val1, \tmp, \val1, lsl #4 + ubfx \val0, \val0, #0, #12 + ubfx \val1, \val1, #0, #12 +.endm + +.macro doublebasemul_asm_opt_16_32 rptr_tmp, aptr, bptr, tmp3, poly0, poly1, poly2, poly3, q, qa, qinv, tmp, aprimeptr, tmp2 + vmov \aprimeptr, s27 + ldr \poly0, [\aptr], #4 + ldr \poly1, [\bptr] + ldr \poly2, [\aptr], #4 + ldr.w \poly3, [\bptr, #4] + + ldr.w \tmp2, [\aprimeptr], #4 // load cached value + + // (poly0_t * zeta) * poly1_t + poly0_b * poly1_b + smuad \tmp, \tmp2, \poly1 + + // poly1_t * poly0_b + poly1_b * poly0_t + smuadx \tmp3, \poly0, \poly1 + + str \tmp, [\rptr_tmp], #4 + str \tmp3, [\rptr_tmp], #4 + + ldr \tmp, [\aprimeptr], #4 // load cached value + + smuad \tmp2, \tmp, \poly3 + + smuadx \tmp3, \poly2, \poly3 + + str.w \tmp2, [\rptr_tmp], #4 + str.w \tmp3, [\rptr_tmp], #4 + vmov s27, \aprimeptr +.endm + +.macro 
doublebasemul_asm_acc_opt_32_32 rptr_tmp, aptr, bptr, tmp2, poly0, poly1, poly2, poly3, q, qa, qinv, res, aprimeptr, tmp + vmov \aprimeptr, s27 + ldr.w \poly0, [\aptr], #4 + ldr.w \poly1, [\bptr] + ldr.w \poly2, [\aptr], #4 + ldr.w \poly3, [\bptr, #4] + + ldr.w \res, [\rptr_tmp] + ldr.w \tmp, [\rptr_tmp, #4] + + ldr \tmp2, [\aprimeptr], #4 // load cached value + + // (poly0_t * zeta) * poly1_t + poly0_b * poly0_t + res + smlad \res, \tmp2, \poly1, \res + + // poly1_t * poly0_b + poly1_b * poly0_t + res + smladx \tmp, \poly0, \poly1, \tmp + + str.w \tmp, [\rptr_tmp, #4] + str.w \res, [\rptr_tmp], #8 + + ldr \tmp2, [\aprimeptr], #4 // load cached value + ldr \res, [\rptr_tmp] + ldr \tmp, [\rptr_tmp, #4] + + smlad \res, \tmp2, \poly3, \res + + smladx \tmp, \poly2, \poly3, \tmp + + str.w \tmp, [\rptr_tmp, #4] + str \res, [\rptr_tmp], #8 + + vmov s27, \aprimeptr +.endm + +.macro doublebasemul_asm_acc_opt_32_16 rptr_tmp, aptr, bptr, tmp2, poly0, poly1, poly2, poly3, q, qa, qinv, res, aprimeptr, tmp + vmov \aprimeptr, s27 + + ldr \poly0, [\aptr], #4 + ldr \poly1, [\bptr] + ldr \poly2, [\aptr], #4 + ldr.w \poly3, [\bptr, #4] + + ldr.w \tmp, [\rptr_tmp, #4] + ldr \res, [\rptr_tmp], #8 + + ldr \tmp2, [\aprimeptr], #4 // load cached value + + // (poly0_t * zeta) * poly1_t + poly0_b * poly0_t + res + smlad \res, \tmp2, \poly1, \res + plant_red_b \q, \qa, \qinv, \res + + // poly1_t * poly0_b + poly1_b * poly0_t + res + smladx \tmp, \poly0, \poly1, \tmp + plant_red_b \q, \qa, \qinv, \tmp + + pkhtb \res, \tmp, \res, asr#16 + vmov \poly0, s28 + str \res, [\poly0], #4 + + ldr \tmp2, [\aprimeptr], #4 // load cached value + ldr.w \tmp, [\rptr_tmp, #4] + ldr \res, [\rptr_tmp], #8 + + smlad \res, \tmp2, \poly3, \res + + plant_red_b \q, \qa, \qinv, \res + + smladx \tmp, \poly2, \poly3, \tmp + + plant_red_b \q, \qa, \qinv, \tmp + + pkhtb \res, \tmp, \res, asr#16 + str \res, [\poly0], #4 + vmov s28, \poly0 + vmov s27, \aprimeptr +.endm diff --git 
a/crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S b/crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S new file mode 100644 index 0000000..f77ae60 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S @@ -0,0 +1,377 @@ +#include "matacc.i" +.extern shake128_squeezeblocks + +.syntax unified +.cpu cortex-m4 +.thumb + +// shake128_squeezeblocks into buffer if all bytes have been used +.macro third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr +// if (pos + 3 > buflen && ctr < KYBER_N/4) + vmov \tmp, s17 + add \tmp, #168 // XOF_BLOCKBYTES=168 + add \tmp2, \bufptr, #3 + cmp.w \tmp2, \tmp // pos + 3 > buflen + ble.w 2f + cmp.w \ctr, #256/4 + bge.w 2f + vmov \bufptr, s17 + + vmov s16, r12 + vmov s18, \rptr + vmov s19, \bptr + vmov s20, \cptr + vmov s21, \ctr + + mov \rptr, \bufptr + movw \bptr, #1 + vmov \cptr, s26 // load state + + bl shake128_squeezeblocks + + vmov r12, s16 + vmov \rptr, s18 + vmov \bptr, s19 + vmov \cptr, s20 + vmov \ctr, s21 + vmov \bufptr, s17 + 2: +.endm + +// void matacc_asm_cache_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr) +.global matacc_asm_cache_16_32 +.type matacc_asm_cache_16_32, %function +.align 2 +matacc_asm_cache_16_32: + push {r0-r11, r14} + rptr .req r0 + bptr .req r1 + cptr .req r2 + bufptr .req r3 + zetaptr .req r4 + val0 .req r5 + val1 .req r6 + tmp .req r7 + tmp2 .req r8 + k .req r9 + q .req r10 + qa .req r11 + qinv .req r12 + ctr .req r14 + + movw qa, #26632 + movw q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + movw k, #0 + + ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack + ldr.w tmp, [sp, #14*4] // load state from stack + vmov s26, tmp + + ldr.w tmp, [sp, #15*4] // load aprimeptr from stack + vmov s27, tmp + + // outer while loop + movw ctr, #0 + vmov s17, bufptr // save bufptr to check later + 1: + + load_vals val0, val1, bufptr, tmp + + first_if doublebasemul_asm_cache_16_32, tmp, tmp2, 
val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + second_if doublebasemul_asm_cache_16_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr + + cmp ctr, #256/4 + blt.w 1b + + pop {r0-r11, pc} +.size matacc_asm_cache_16_32, . - matacc_asm_cache_16_32 + +// void matacc_asm_cache_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr) +.global matacc_asm_cache_32_32 +.type matacc_asm_cache_32_32, %function +.align 2 +matacc_asm_cache_32_32: + push {r0-r11, r14} + rptr .req r0 + bptr .req r1 + cptr .req r2 + bufptr .req r3 + zetaptr .req r4 + val0 .req r5 + val1 .req r6 + tmp .req r7 + tmp2 .req r8 + k .req r9 + q .req r10 + qa .req r11 + qinv .req r12 + ctr .req r14 + + movw qa, #26632 + movw q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + movw k, #0 + + ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack + ldr.w tmp, [sp, #14*4] // load state from stack + vmov s26, tmp + + ldr.w tmp, [sp, #15*4] // load aprimeptr from stack + vmov s27, tmp + + // outer while loop + movw ctr, #0 + vmov s17, bufptr // save bufptr to check later + 1: + + load_vals val0, val1, bufptr, tmp + + first_if doublebasemul_asm_acc_cache_32_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + second_if doublebasemul_asm_acc_cache_32_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr + + cmp ctr, #256/4 + blt.w 1b + + pop {r0-r11, pc} +.size matacc_asm_cache_32_32, . 
- matacc_asm_cache_32_32 + +// void matacc_asm_cache_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr, const int32_t *r_tmp) +.global matacc_asm_cache_32_16 +.type matacc_asm_cache_32_16, %function +.align 2 +matacc_asm_cache_32_16: + push {r0-r11, r14} + rptr .req r0 + bptr .req r1 + cptr .req r2 + bufptr .req r3 + zetaptr .req r4 + val0 .req r5 + val1 .req r6 + tmp .req r7 + tmp2 .req r8 + k .req r9 + q .req r10 + qa .req r11 + qinv .req r12 + ctr .req r14 + + movw qa, #26632 + movw q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + movw k, #0 + + ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack + + ldr.w tmp, [sp, #14*4] // load state from stack + vmov s26, tmp + + ldr.w tmp, [sp, #15*4] // load aprimeptr from stack + vmov s27, tmp + + vmov s28, rptr // store "real" destinaton in FP + vmov s29, rptr // backup + ldr.w rptr, [sp, #16*4] + + // outer while loop + movw ctr, #0 + vmov s17, bufptr // save bufptr to check later + 1: + load_vals val0, val1, bufptr, tmp + + first_if doublebasemul_asm_acc_cache_32_16, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + second_if doublebasemul_asm_acc_cache_32_16, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr + + cmp ctr, #256/4 + blt.w 1b + + vmov rptr, s29 + + pop {r0-r11, pc} +.size matacc_asm_cache_32_16, . 
- matacc_asm_cache_32_16 + +.unreq zetaptr + +// void matacc_asm_opt_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr) +.global matacc_asm_opt_16_32 +.type matacc_asm_opt_16_32, %function +.align 2 +matacc_asm_opt_16_32: + push {r0-r11, r14} + rptr .req r0 + bptr .req r1 + cptr .req r2 + bufptr .req r3 + tmp3 .req r4 + val0 .req r5 + val1 .req r6 + tmp .req r7 + tmp2 .req r8 + k .req r9 + q .req r10 + qa .req r11 + qinv .req r12 + ctr .req r14 + + movw qa, #26632 + movw q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + movw k, #0 + + ldr.w tmp, [sp, #13*4] // load state from stack + vmov s26, tmp + + ldr.w tmp, [sp, #14*4] // load aprimeptr from stack + vmov s27, tmp + + // outer while loop + movw ctr, #0 + vmov s17, bufptr // save bufptr to check later + 1: + + load_vals val0, val1, bufptr, tmp + + first_if doublebasemul_asm_opt_16_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr + + second_if doublebasemul_asm_opt_16_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr + + third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr + + cmp ctr, #256/4 + blt.w 1b + + pop {r0-r11, pc} +.size matacc_asm_opt_16_32, . 
- matacc_asm_opt_16_32 + +// void matacc_asm_opt_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr) +.global matacc_asm_opt_32_32 +.type matacc_asm_opt_32_32, %function +.align 2 +matacc_asm_opt_32_32: + push {r0-r11, r14} + rptr .req r0 + bptr .req r1 + cptr .req r2 + bufptr .req r3 + tmp3 .req r4 + val0 .req r5 + val1 .req r6 + tmp .req r7 + tmp2 .req r8 + k .req r9 + q .req r10 + qa .req r11 + qinv .req r12 + ctr .req r14 + + movw qa, #26632 + movw q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + movw k, #0 + + ldr.w tmp, [sp, #13*4] // load state from stack + vmov s26, tmp + + ldr.w tmp, [sp, #14*4] // load aprimeptr from stack + vmov s27, tmp + + // outer while loop + movw ctr, #0 + vmov s17, bufptr // save bufptr to check later + 1: + + load_vals val0, val1, bufptr, tmp + + first_if doublebasemul_asm_acc_opt_32_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr + + second_if doublebasemul_asm_acc_opt_32_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr + + third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr + + cmp ctr, #256/4 + blt.w 1b + + pop {r0-r11, pc} +.size matacc_asm_opt_32_32, . 
- matacc_asm_opt_32_32 + +.unreq tmp3 + + +// void matacc_asm_opt_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr, const int32_t *r_tmp) +.global matacc_asm_opt_32_16 +.type matacc_asm_opt_32_16, %function +.align 2 +matacc_asm_opt_32_16: + push {r0-r11, r14} + rptr .req r0 + bptr .req r1 + cptr .req r2 + bufptr .req r3 + tmp3 .req r4 + val0 .req r5 + val1 .req r6 + tmp .req r7 + tmp2 .req r8 + k .req r9 + q .req r10 + qa .req r11 + qinv .req r12 + ctr .req r14 + + movw qa, #26632 + movw q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + movw k, #0 + + ldr.w tmp, [sp, #13*4] // load state from stack + vmov s26, tmp + + ldr.w tmp, [sp, #14*4] // load aprimeptr from stack + vmov s27, tmp + + vmov s28, rptr // store "real" destinaton in FP + vmov s29, rptr // backup + ldr.w rptr, [sp, #15*4] + + // outer while loop + movw ctr, #0 + vmov s17, bufptr // save bufptr to check later + 1: + + load_vals val0, val1, bufptr, tmp + + first_if doublebasemul_asm_acc_opt_32_16, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr + + second_if doublebasemul_asm_acc_opt_32_16, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr + + third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr + + cmp ctr, #256/4 + blt.w 1b + + vmov rptr, s29 + + pop {r0-r11, pc} +.size matacc_asm_opt_32_16, . 
- matacc_asm_opt_32_16
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/ntt.c b/crypto_kem/ml-kem-768/m4fspeed/ntt.c
new file mode 100644
index 0000000..7fd1208
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/ntt.c
@@ -0,0 +1,106 @@
+#include "ntt.h"
+
+#include "params.h"
+
+#include <stdint.h>
+
+// for basemul not changed
+
+const int32_t zetas[64] = {21932846, 3562152210, 752167598, 3417653460, 2112004045, 932791035, 2951903026, 1419184148, 1817845876, 3434425636, 4233039261, 300609006, 975366560, 2781600929, 3889854731, 3935010590, 2197155094, 2130066389, 3598276897, 2308109491, 2382939200, 1228239371, 1884934581, 3466679822, 1211467195, 2977706375, 3144137970, 3080919767, 945692709, 3015121229, 345764865, 826997308, 2043625172, 2964804700, 2628071007, 4154339049, 483812778, 3288636719, 2696449880, 2122325384, 1371447954, 411563403, 3577634219, 976656727, 2708061387, 723783916, 3181552825, 3346694253, 3617629408, 1408862808, 519937465, 1323711759, 1474661346, 2773859924, 3580214553, 1143088323, 2221668274, 1563682897, 2417773720, 1327582262, 2722253228, 3786641338, 1141798155, 2779020594};
+
+const int32_t zetas_asm[128] = {
+    2230699446, 3328631909, 4243360600, 3408622288, 812805467, 2447447570, 1094061961, 1370157786, 2475831253, 249002310, 1028263423, 3594406395, 4205945745, 734105255, 2252632292, 381889553, 372858381, 427045412, 21932846, 3562152210, 752167598, 3417653460, 3157039644, 4196914574, 2265533966, 2112004045, 932791035, 2951903026, 1419184148, 1727534158, 1544330386, 2972545705, 1817845876, 3434425636, 4233039261, 300609006, 1904287092, 2937711185, 2651294021, 975366560, 2781600929, 3889854731, 3935010590, 3929849920, 838608815, 2550660963, 2197155094, 2130066389, 3598276897, 2308109491, 72249375, 3242190693, 815385801, 2382939200, 1228239371, 1884934581, 3466679822, 2889974991, 3696329620, 42575525, 1211467195, 2977706375, 3144137970, 3080919767, 1719793153, 1703020977, 2470670584, 945692709, 3015121229, 345764865, 826997308,
1839778722, 2991898216, 1851390229, 2043625172, 2964804700, 2628071007, 4154339049, 2701610550, 1041165097, 583155668, 483812778, 3288636719, 2696449880, 2122325384, 690239563, 1855260731, 3700200122, 1371447954, 411563403, 3577634219, 976656727, 3718262466, 1979116802, 3098982111, 2708061387, 723783916, 3181552825, 3346694253, 3087370604, 3415073125, 3376368103, 3617629408, 1408862808, 519937465, 1323711759, 3714391964, 1910737929, 836028480, 1474661346, 2773859924, 3580214553, 1143088323, 2546790461, 3191874164, 4012420634, 2221668274, 1563682897, 2417773720, 1327582262, 1059227441, 1583035408, 1174052340, 2722253228, 3786641338, 1141798155, 2779020594, 0}; + +const int32_t zetas_inv_CT_asm[256] = { + // LAYER 7+6+5+4 + 1290168, 1290168, 2064267851, 1290168, 51606697, 2064267851, 966335388, 1290168, 3200905336, 51606697, 3482161830, 2064267851, 1847519727, 966335388, 886345009, + // removed first "2285" + LAYER 3+2+1 - 1 - butterfly + 1290168, 2064267851, 1290168, 51606697, 2064267851, 966335388, + // LAYER 3+2+1 - 1 - twist + 2435836064, 290287667, 2944162022, 3021572066, 1802363867, 603798347, 3375077936, 2677097369, + // LAYER 3+2+1 - 2 - butterfly + 2042335005, 3235739856, 1748176836, 3120914957, 282546663, 2711931889, 1103093133, + // LAYER 3+2+1 - 2 - twist + 1659155285, 1785591691, 1941701947, 2704190884, 358666539, 793452955, 1461759672, 1673347127, + // LAYER 3+2+1 - 3 - butterfly + 3200905336, 2042335005, 3560862042, 3235739856, 580575333, 1748176836, 1207596693, + // LAYER 3+2+1 - 3 - twist + 3887274396, 2126195886, 872153167, 3443456808, 526388302, 299318839, 3875662889, 3382818940, + // LAYER 3+2+1 - 4 - butterfly + 3266703874, 2575174144, 1404992306, 1824296713, 4252391772, 2591946320, 598637677, + // LAYER 3+2+1 - 4 - twist + 1997179146, 2904166832, 2577754479, 202556283, 30964018, 3807284017, 1238560711, 1967505295, + // LAYER 3+2+1 - 5 - butterfly + 51606697, 3200905336, 1847519727, 2042335005, 89021552, 3560862042, 700560902, + // LAYER 3+2+1 - 
5 - twist + 1633351937, 2191994424, 909568022, 1780431021, 2022982494, 2497764099, 3609888404, 1126316146, + // LAYER 3+2+1 - 6 - butterfly + 89021552, 576704831, 3604727734, 1195985186, 594767175, 2315850495, 2439706566, + // LAYER 3+2+1 - 6 - twist + 3633111417, 2908037335, 3590535893, 357376372, 1887514916, 1410152976, 2486152593, 571544162, + // LAYER 3+2+1 - 7 - butterfly + 3482161830, 3266703874, 4045964987, 2575174144, 4222717922, 1404992306, 365117377, + // LAYER 3+2+1 - 7 - twist + 4003389463, 2444867236, 1221788534, 3305408896, 1626901100, 3367336931, 651534541, 1549491056, + // LAYER 3+2+1 - 8 - butterfly + 1819136044, 2390680205, 2567433139, 1643673276, 1322421592, 1357256112, 2750636911, + // LAYER 3+2+1 - 8 - twist + 993428903, 3680847611, 1082450454, 1205016358, 348345200, 956014049, 1048906102, 3880823559, + // LAYER 3+2+1 - 9 - butterfly + 2064267851, 51606697, 966335388, 3200905336, 3482161830, 1847519727, 886345009, + // LAYER 3+2+1 - 9 - twist + 3342823751, 4258842609, 568963827, 2849979801, 1283716570, 2330042337, 4104022520, 3007380225, + // LAYER 3+2+1 - 10 - butterfly + 3560862042, 580575333, 1207596693, 3458938817, 918599194, 2384229368, 879894172, + // LAYER 3+2+1 - 10 - twist + 2217797772, 503165289, 2812564947, 2946742357, 833448145, 1905577260, 3273154711, 3208646340, + // LAYER 3+2+1 - 11 - butterfly + 1847519727, 89021552, 700560902, 576704831, 1593356747, 3604727734, 2455188575, + // LAYER 3+2+1 - 11 - twist + 3162200314, 2808694444, 1933960943, 678628056, 49026362, 1375318456, 1961054458, 3473130659, + // LAYER 3+2+1 - 12 - butterfly + 4045964987, 4222717922, 365117377, 3479581496, 1744306334, 1052776604, 3456358482, + // LAYER 3+2+1 - 12 - twist + 438656919, 1681088131, 366407544, 2819015784, 1771399850, 1091481626, 2136517226, 709592074, + // LAYER 3+2+1 - 13 - butterfly + 966335388, 3482161830, 886345009, 3266703874, 1819136044, 4045964987, 2924809511, + // LAYER 3+2+1 - 13 - twist + 25803349, 3888564563, 1032133926, 923759864, 
2630651342, 2590656153, 2146838565, 547030981,
+    // LAYER 3+2+1 - 14 - butterfly
+    700560902, 1593356747, 2455188575, 3711811629, 2443577068, 3253802200, 1303069081,
+    // LAYER 3+2+1 - 14 - twist
+    254162980, 3513125848, 1576584571, 3086080437, 2933840683, 3184133160, 1389510297, 2811274779,
+    // LAYER 3+2+1 - 15 - butterfly
+    886345009, 1819136044, 2924809511, 2390680205, 1137927653, 2567433139, 3913077744,
+    // LAYER 3+2+1 - 15 - twist
+    2288756980, 459299597, 1355965945, 1192114684, 2699030215, 439947086, 587026170, 418014240,
+    // LAYER 3+2+1 - 16 - butterfly
+    2924809511, 1137927653, 3913077744, 2029433331, 3867921885, 98052723, 3922108916, 639923034,
+    // LAYER 3+2+1 - 16 - twist
+    2806114109, 4122084864, 575414664, 1674637294, 1541750051, 2560982302, 1540459884, 0};
+
+extern void ntt_fast(int16_t *, const int32_t *);
+/*************************************************
+* Name:        ntt
+*
+* Description: Inplace number-theoretic transform (NTT) in Rq
+*              input is in standard order, output is in bitreversed order
+*
+* Arguments:   - int16_t *poly: pointer to input/output vector of 256 elements of Zq
+**************************************************/
+void ntt(int16_t *poly) {
+    ntt_fast(poly, zetas_asm);
+}
+
+extern void invntt_fast(int16_t *, const int32_t *);
+/*************************************************
+* Name:        invntt
+*
+* Description: Inplace inverse number-theoretic transform in Rq
+*              input is in bitreversed order, output is in standard order
+*
+* Arguments:   - int16_t *poly: pointer to input/output vector of 256 elements of Zq
+**************************************************/
+void invntt(int16_t *poly) {
+    invntt_fast(poly, zetas_inv_CT_asm);
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/ntt.h b/crypto_kem/ml-kem-768/m4fspeed/ntt.h
new file mode 100644
index 0000000..a161be5
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/ntt.h
@@ -0,0 +1,11 @@
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+
+extern const int32_t zetas[64];
+
+void ntt(int16_t
*poly); +void invntt(int16_t *poly); + +#endif diff --git a/crypto_kem/ml-kem-768/m4fspeed/params.h b/crypto_kem/ml-kem-768/m4fspeed/params.h new file mode 100644 index 0000000..bd1dfe1 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/params.h @@ -0,0 +1,31 @@ +#ifndef PARAMS_H +#define PARAMS_H + +#define KYBER_K 3 + +/* Don't change parameters below this line */ + +#define KYBER_N 256 +#define KYBER_Q 3329 + +#define KYBER_ETA 2 + +#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ +#define KYBER_SSBYTES 32 /* size in bytes of shared key */ + +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) + +#define KYBER_POLYCOMPRESSEDBYTES 128 +#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) + +#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES +#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) +#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) + +#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES + +#endif diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly.c b/crypto_kem/ml-kem-768/m4fspeed/poly.c new file mode 100644 index 0000000..b52060f --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/poly.c @@ -0,0 +1,654 @@ +#include "poly.h" + +#include "cbd.h" +#include "ntt.h" +#include "params.h" +#include "symmetric.h" + +#include + + +/************************************************* +* Name: poly_compress +* +* Description: Serialization of a polynomial and subsequent compression of a polynomial; +* +* Arguments: - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial to be serialized 
+*************************************************/ +void poly_compress(unsigned char *r, const poly *a) +{ + unsigned int i,j; + int16_t u; + uint32_t d0; + uint8_t t[8]; + +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */ + d0 = u << 4; + d0 += 1665; + d0 *= 80635; + d0 >>= 28; + t[j] = d0 & 0xf; + } + + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */ + d0 = u << 5; + d0 += 1664; + d0 *= 40318; + d0 >>= 27; + t[j] = d0 & 0x1f; + } + + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif +} + +/************************************************* +* Name: poly_decompress +* +* Description: De-serialization and subsequent decompression of a polynomial; +* approximate inverse of poly_compress +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +**************************************************/ +void poly_decompress(poly *r, const unsigned char *a) +{ + int i; +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4; + 
r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4; + a += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[i+0] = (((a[0] & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+7] = (((a[4] >> 3) * KYBER_Q) + 16) >> 5; + a += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif +} + +/************************************************* +* Name: poly_packcompress +* +* Description: Serialization and subsequent compression of a polynomial of a polyvec, +* writes to a byte string representation of the whole polyvec. +* Used to compress a polyvec one poly at a time in a loop. 
+* +* Arguments: - unsigned char *r: pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial +* - int i: index of the polynomial to be serialized within the serialized polyvec +**************************************************/ +void poly_packcompress(unsigned char *r, poly *a, int i) { + int j, k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + + for(j=0;jcoeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + + r[352*i+11*j+ 0] = t[0] & 0xff; + r[352*i+11*j+ 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3); + r[352*i+11*j+ 2] = (t[1] >> 5) | ((t[2] & 0x03) << 6); + r[352*i+11*j+ 3] = (t[2] >> 2) & 0xff; + r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1); + r[352*i+11*j+ 5] = (t[3] >> 7) | ((t[4] & 0x0f) << 4); + r[352*i+11*j+ 6] = (t[4] >> 4) | ((t[5] & 0x01) << 7); + r[352*i+11*j+ 7] = (t[5] >> 1) & 0xff; + r[352*i+11*j+ 8] = (t[5] >> 9) | ((t[6] & 0x3f) << 2); + r[352*i+11*j+ 9] = (t[6] >> 6) | ((t[7] & 0x07) << 5); + r[352*i+11*j+10] = (t[7] >> 3); + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + + for (j = 0; j < KYBER_N / 4; j++) { + for(k=0;k<4;k++) { + t[k] = a->coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + /* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = d0 & 0x3ff; + } + r[320*i+5*j+0] = t[0] & 0xff; + r[320*i+5*j+1] = (t[0] >> 8) | ((t[1] & 0x3f) << 2); + r[320*i+5*j+2] = ((t[1] >> 6) | ((t[2] & 0x0f) << 4)) & 0xff; + r[320*i+5*j+3] = ((t[2] >> 4) | ((t[3] & 0x03) << 6)) & 0xff; + r[320*i+5*j+4] = (t[3] >> 2) & 0xff; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})" +#endif +} +
+/************************************************* +* Name: poly_unpackdecompress +* +* Description: Deserialization and subsequent decompression of a polynomial of a polyvec, +* Used to decompress a polyvec one poly at a time in a loop. +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - int i: index of poly in polyvec to decompress +**************************************************/ +void poly_unpackdecompress(poly *r, const unsigned char *a, int i) { + int j; +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + for(j=0;jcoeffs[8*j+0] = (((a[352*i+11*j+ 0] | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11; + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + for(j=0;jcoeffs[4*j+0] = (((a[320*i+5*j+ 0] | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >>
10; + r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif +} + + +/************************************************* +* Name: cmp_poly_compress +* +* Description: Serializes and consequently compares polynomial to a serialized polynomial +* +* Arguments: - const unsigned char *r: pointer to serialized polynomial to compare with +* - poly *a: pointer to input polynomial to serialize and compare +* Returns: boolean indicating whether the polynomials are equal +**************************************************/ +int cmp_poly_compress(const unsigned char *r, poly *a) { + unsigned char rc = 0; + int16_t u; + uint32_t d0; + uint8_t t[8]; + int i, j, k = 0; + +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */ + d0 = u << 4; + d0 += 1665; + d0 *= 80635; + d0 >>= 28; + t[j] = d0 & 0xf; + } + rc |= r[k] ^ (t[0] | (t[1] << 4)); + rc |= r[k + 1] ^ (t[2] | (t[3] << 4)); + rc |= r[k + 2] ^ (t[4] | (t[5] << 4)); + rc |= r[k + 3] ^ (t[6] | (t[7] << 4)); + k += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */ + d0 = u << 5; + d0 += 1664; + d0 *= 40318; + d0 >>= 27; + t[j] = d0 & 0x1f; + } + + + rc |= r[k] ^ (t[0] | (t[1] << 5)); + rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4)); + rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3)); + k += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif + return rc; +} + 
+/************************************************* +* Name: cmp_poly_packcompress +* +* Description: Serializes and consequently compares poly of polyvec to a serialized polyvec +* Should be called in a loop over all poly's of a polyvec. +* +* Arguments: - const unsigned char *r: pointer to serialized polyvec to compare with +* - poly *a: pointer to input polynomial of polyvec to serialize and compare +* - int i: index of poly in polyvec to compare with +* Returns: boolean indicating whether the polyvecs are equal +**************************************************/ +int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) { + unsigned char rc = 0; + int j, k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for(j=0;jcoeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff); + rc |= r[352*i+11*j+ 1] ^ ((t[0] >> 8) | ((t[1] & 0x1f) << 3)); + rc |= r[352*i+11*j+ 2] ^ ((t[1] >> 5) | ((t[2] & 0x03) << 6)); + rc |= r[352*i+11*j+ 3] ^ ((t[2] >> 2) & 0xff); + rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1)); + rc |= r[352*i+11*j+ 5] ^ ((t[3] >> 7) | ((t[4] & 0x0f) << 4)); + rc |= r[352*i+11*j+ 6] ^ ((t[4] >> 4) | ((t[5] & 0x01) << 7)); + rc |= r[352*i+11*j+ 7] ^ ((t[5] >> 1) & 0xff); + rc |= r[352*i+11*j+ 8] ^ ((t[5] >> 9) | ((t[6] & 0x3f) << 2)); + rc |= r[352*i+11*j+ 9] ^ ((t[6] >> 6) | ((t[7] & 0x07) << 5)); + rc |= r[352*i+11*j+10] ^ ((t[7] >> 3)); + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + for (j = 0; j < KYBER_N / 4; j++) { + for(k=0;k<4;k++) { + t[k] = a->coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + /* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = 
d0 & 0x3ff; + } + + rc |= r[320*i+5*j+0] ^ (t[0] & 0xff); + rc |= r[320*i+5*j+1] ^ ((t[0] >> 8) | ((t[1] & 0x3f) << 2)); + rc |= r[320*i+5*j+2] ^ (((t[1] >> 6) | ((t[2] & 0x0f) << 4)) & 0xff); + rc |= r[320*i+5*j+3] ^ (((t[2] >> 4) | ((t[3] & 0x03) << 6)) & 0xff); + rc |= r[320*i+5*j+4] ^ ((t[3] >> 2) & 0xff); + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif + return rc; +} + +/************************************************* +* Name: poly_tobytes +* +* Description: Serialization of a polynomial +* +* Arguments: - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_tobytes(unsigned char *r, poly *a) { + int i; + uint16_t t0, t1; + + poly_reduce(a); + + for (i = 0; i < KYBER_N / 2; i++) { + t0 = a->coeffs[2 * i]; + t1 = a->coeffs[2 * i + 1]; + r[3 * i] = t0 & 0xff; + r[3 * i + 1] = (t0 >> 8) | ((t1 & 0xf) << 4); + r[3 * i + 2] = (t1 >> 4) & 0xff; + } +} + +/************************************************* +* Name: poly_frombytes +* +* Description: De-serialization of a polynomial; +* inverse of poly_tobytes +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +void poly_frombytes(poly *r, const unsigned char *a) { + int i; + + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8; + r->coeffs[2 * i + 1] = a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4; + } +} + +/************************************************* +* Name: poly_frombytes_mul_16_32 +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* Using strategy of better accumulation. 
+* Arguments: - const poly *b: pointer to input polynomial +* - int32_t *r_tmp: array for accumulating unreduced results +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm_16_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void poly_frombytes_mul_16_32(int32_t *r_tmp, const poly *b, const unsigned char *a) { + frombytes_mul_asm_16_32(r_tmp, b->coeffs, a, zetas); +} + +/************************************************* +* Name: poly_frombytes_mul_32_32 +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* Using strategy of better accumulation. +* Arguments: - const poly *b: pointer to input polynomial +* - int32_t *r_tmp: array for accumulating unreduced results +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm_acc_32_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void poly_frombytes_mul_32_32(int32_t *r_tmp, const poly *b, const unsigned char *a) { + frombytes_mul_asm_acc_32_32(r_tmp, b->coeffs, a, zetas); +} + +/************************************************* +* Name: poly_frombytes_mul_32_16 +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* Using strategy of better accumulation. 
+* Arguments: - poly *r: pointer to output polynomial +* - const poly *b: pointer to input polynomial +* - const int32_t *r_tmp: array containing unreduced results +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm_acc_32_16(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64], const int32_t *r_tmp); +void poly_frombytes_mul_32_16(poly *r, const poly* b, const unsigned char *a, const int32_t *r_tmp) { + frombytes_mul_asm_acc_32_16(r->coeffs, b->coeffs, a, zetas, r_tmp); +} + +/************************************************* +* Name: poly_getnoise +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* - unsigned char nonce: one-byte input nonce +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add) { + unsigned char buf[KYBER_ETA * KYBER_N / 4]; + + prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); + cbd(r, buf, add); +} + +/************************************************* +* Name: poly_basemul_opt_16_32 +* +* Description: Multiplication of two polynomials using asymmetric multiplication. +* Cached values are generated during matrix-vector product. +* Using strategy of better accumulation (initial step). 
+* Arguments: - const poly *a: pointer to input polynomial +* - const poly *b: pointer to input polynomial +* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - int32_t *r_tmp: array for accumulating unreduced results +**************************************************/ +extern void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *); +void poly_basemul_opt_16_32(int32_t *r_tmp, const poly *a, const poly *b, const poly *a_prime) { + basemul_asm_opt_16_32(r_tmp, a->coeffs, b->coeffs, a_prime->coeffs); +} + +/************************************************* +* Name: poly_basemul_acc_opt_32_32 +* +* Description: Multiplication of two polynomials using asymmetric multiplication. +* Cached values are generated during matrix-vector product. +* Using strategy of better accumulation. +* Arguments: - const poly *a: pointer to input polynomial +* - const poly *b: pointer to input polynomial +* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - int32_t *r_tmp: array for accumulating unreduced results +**************************************************/ +extern void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *); +void poly_basemul_acc_opt_32_32(int32_t *r_tmp, const poly *a, const poly *b, const poly *a_prime) { + basemul_asm_acc_opt_32_32(r_tmp, a->coeffs, b->coeffs, a_prime->coeffs); +} + +/************************************************* +* Name: poly_basemul_acc_opt_32_16 +* +* Description: Multiplication of two polynomials using asymmetric multiplication. +* Cached values are generated during matrix-vector product. +* Using strategy of better accumulation (final step). 
+* Arguments: - const poly *a: pointer to input polynomial +* - const poly *b: pointer to input polynomial +* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - poly *r: pointer to output polynomial +* - int32_t *r_tmp: array for accumulating unreduced results +**************************************************/ +extern void basemul_asm_acc_opt_32_16(int16_t *, const int16_t *, const int16_t *, const int16_t *, const int32_t *); +void poly_basemul_acc_opt_32_16(poly *r, const poly *a, const poly *b, const poly *a_prime, const int32_t * r_tmp) { + basemul_asm_acc_opt_32_16(r->coeffs, a->coeffs, b->coeffs, a_prime->coeffs, r_tmp); +} + +/************************************************* +* Name: poly_ntt +* +* Description: Computes negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in normal order, output in bitreversed order +* +* Arguments: - uint16_t *r: pointer to in/output polynomial +**************************************************/ +void poly_ntt(poly *r) { + ntt(r->coeffs); +} + +/************************************************* +* Name: poly_invntt +* +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in bitreversed order, output in normal order +* +* Arguments: - uint16_t *a: pointer to in/output polynomial +**************************************************/ +void poly_invntt(poly *r) { + invntt(r->coeffs); +} + +extern void asm_fromplant(int16_t *r); +/************************************************* +* Name: poly_fromplant +* +* Description: Inplace conversion of all coefficients of a polynomial +* from Plantard domain to normal domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_fromplant(poly *r) { + asm_fromplant(r->coeffs); +} + +extern void asm_barrett_reduce(int16_t *r); +/************************************************* +* 
Name: poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_reduce(poly *r) { + asm_barrett_reduce(r->coeffs); +} + +extern void pointwise_add(int16_t *, const int16_t *, const int16_t *); +/************************************************* +* Name: poly_add +* +* Description: Add two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_add(poly *r, const poly *a, const poly *b) { + pointwise_add(r->coeffs,a->coeffs,b->coeffs); +} + + +extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *); +/************************************************* +* Name: poly_sub +* +* Description: Subtract two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_sub(poly *r, const poly *a, const poly *b) { + pointwise_sub(r->coeffs,a->coeffs,b->coeffs); +} + + +void cmov_int16(int16_t *r, int16_t v, uint16_t b); + +/************************************************* +* Name: poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *msg: pointer to input message +**************************************************/ +void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) +{ + unsigned int i,j; + +#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) +#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" 
+#endif + + for(i=0;icoeffs[8*i+j] = 0; + cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1); + } + } +} + +/************************************************* +* Name: poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - unsigned char *msg: pointer to output message +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) { + uint32_t t; + int i, j; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = a->coeffs[8*i+j]; + t <<= 1; + t += 1665; + t *= 80635; + t >>= 28; + t &= 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: poly_zeroize +* +* Description: Zeros a polynomial +* +* Arguments: - poly *p: pointer to polynomial +**************************************************/ +void poly_zeroize(poly *p) { + int i; + for(i = 0; i < KYBER_N; i++) + p->coeffs[i] = 0; +} diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly.h b/crypto_kem/ml-kem-768/m4fspeed/poly.h new file mode 100644 index 0000000..fc61dd5 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/poly.h @@ -0,0 +1,53 @@ +#ifndef POLY_H +#define POLY_H + +#include "params.h" + +#include + +#define poly_getnoise(p, seed, nonce) poly_noise(p, seed, nonce, 0) +#define poly_addnoise(p, seed, nonce) poly_noise(p, seed, nonce, 1) + +/* + * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial + * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ...
+ X^{n-1}*coeffs[n-1] + */ +typedef struct { + int16_t coeffs[KYBER_N]; +} poly; + +void poly_compress(unsigned char *r, const poly *a); +void poly_decompress(poly *r, const unsigned char *a); + +void poly_packcompress(unsigned char *r, poly *a, int i); +void poly_unpackdecompress(poly *r, const unsigned char *a, int i); + +int cmp_poly_compress(const unsigned char *r, poly *a); +int cmp_poly_packcompress(const unsigned char *r, poly *a, int i); + +void poly_tobytes(unsigned char *r, poly *a); +void poly_frombytes(poly *r, const unsigned char *a); +void poly_frombytes_mul_16_32(int32_t *r_tmp, const poly *b, const unsigned char *a); +void poly_frombytes_mul_32_32(int32_t *r_tmp, const poly *b, const unsigned char *a); +void poly_frombytes_mul_32_16(poly *r, const poly* b, const unsigned char *a, const int32_t *r_tmp); + +void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]); +void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a); + +void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add); + +void poly_ntt(poly *r); +void poly_invntt(poly *r); +void poly_basemul_opt_16_32(int32_t *r, const poly *a, const poly *b, const poly *a_prime); +void poly_basemul_acc_opt_32_32(int32_t *r_tmp, const poly *a, const poly *b, const poly *a_prime); +void poly_basemul_acc_opt_32_16(poly *r, const poly *a, const poly *b, const poly *a_prime, const int32_t * r_tmp); +void poly_fromplant(poly *r); + +void poly_reduce(poly *r); + +void poly_add(poly *r, const poly *a, const poly *b); +void poly_sub(poly *r, const poly *a, const poly *b); + +void poly_zeroize(poly *p); + +#endif \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly_asm.S b/crypto_kem/ml-kem-768/m4fspeed/poly_asm.S new file mode 100644 index 0000000..e58896a --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/poly_asm.S @@ -0,0 +1,246 @@ +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +.macro doublebasemul_frombytes_asm_16_32 rptr_tmp, 
bptr, zeta, poly0, poly2, poly1, poly3, tmp, q, qa, qinv + ldr \poly0, [\bptr], #4 + ldr \poly2, [\bptr], #4 + + smulwt \tmp, \zeta, \poly1 + smlabt \tmp, \tmp, \q, \qa + smultt \tmp, \poly0, \tmp + smlabb \tmp, \poly0, \poly1, \tmp + str \tmp, [\rptr_tmp], #4 + + smuadx \tmp, \poly0, \poly1 + str \tmp, [\rptr_tmp], #4 + + neg \zeta, \zeta + + smulwt \tmp, \zeta, \poly3 + smlabt \tmp, \tmp, \q, \qa + smultt \tmp, \poly2, \tmp + smlabb \tmp, \poly2, \poly3, \tmp + str \tmp, [\rptr_tmp], #4 + + smuadx \tmp, \poly2, \poly3 + str \tmp, [\rptr_tmp], #4 +.endm + +.macro doublebasemul_frombytes_asm_acc_32_32 rptr_tmp, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv + ldr \poly0, [\bptr], #4 + ldr \res0, [\rptr_tmp] + + smulwt \tmp, \zeta, \poly1 + smlabt \tmp, \tmp, \q, \qa + smlatt \tmp, \poly0, \tmp, \res0 + smlabb \tmp, \poly0, \poly1, \tmp + + str \tmp, [\rptr_tmp], #4 + + ldr \res0, [\rptr_tmp] + smladx \tmp, \poly0, \poly1, \res0 + str \tmp, [\rptr_tmp], #4 + + neg \zeta, \zeta + + ldr \poly0, [\bptr], #4 + ldr \res0, [\rptr_tmp] + + smulwt \tmp, \zeta, \poly3 + smlabt \tmp, \tmp, \q, \qa + smlatt \tmp, \poly0, \tmp, \res0 + smlabb \tmp, \poly0, \poly3, \tmp + + str \tmp, [\rptr_tmp], #4 + + ldr \res0, [\rptr_tmp] + smladx \tmp, \poly0, \poly3, \res0 + str \tmp, [\rptr_tmp], #4 +.endm + +.macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv + ldr \poly0, [\bptr], #4 + ldr \res0, [\rptr_tmp], #4 + + smulwt \tmp, \zeta, \poly1 + smlabt \tmp, \tmp, \q, \qa + smlatt \tmp, \poly0, \tmp, \res0 + smlabb \tmp, \poly0, \poly1, \tmp + plant_red \q, \qa, \qinv, \tmp + + ldr \res0, [\rptr_tmp], #4 + smladx \res0, \poly0, \poly1, \res0 + plant_red \q, \qa, \qinv, \res0 + + pkhtb \res0, \res0, \tmp, asr#16 + str \res0, [\rptr], #4 + + neg \zeta, \zeta + + ldr \poly0, [\bptr], #4 + ldr \res0, [\rptr_tmp], #4 + + smulwt \tmp, \zeta, \poly3 + smlabt \tmp, \tmp, \q, \qa + smlatt \tmp, \poly0, \tmp, \res0 + smlabb 
\tmp, \poly0, \poly3, \tmp + plant_red \q, \qa, \qinv, \tmp + + ldr \res0, [\rptr_tmp], #4 + smladx \res0, \poly0, \poly3, \res0 + plant_red \q, \qa, \qinv, \res0 + + pkhtb \res0, \res0, \tmp, asr#16 + str \res0, [\rptr], #4 +.endm + +// reduce 2 registers +.macro deserialize aptr, tmp, tmp2, tmp3, t0, t1 + ldrb.w \tmp, [\aptr, #2] + ldrh.w \tmp2, [\aptr, #3] + ldrb.w \tmp3, [\aptr, #5] + ldrh.w \t0, [\aptr], #6 + + ubfx.w \t1, \t0, #12, #4 + ubfx.w \t0, \t0, #0, #12 + orr \t1, \t1, \tmp, lsl #4 + orr \t0, \t0, \t1, lsl #16 + //tmp is free now + ubfx.w \t1, \tmp2, #12, #4 + ubfx.w \tmp, \tmp2, #0, #12 + orr \t1, \t1, \tmp3, lsl #4 + orr \t1, \tmp, \t1, lsl #16 +.endm + +// void frombytes_mul_asm_16_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]) +.global frombytes_mul_asm_16_32 +.type frombytes_mul_asm_16_32, %function +.align 2 +frombytes_mul_asm_16_32: + push {r4-r11, r14} + + rptr_tmp .req r0 + bptr .req r1 + aptr .req r2 + zetaptr .req r3 + t0 .req r4 + t1 .req r5 + tmp .req r6 + tmp2 .req r7 + tmp3 .req r8 + q .req r9 + qa .req r10 + qinv .req r11 + zeta .req r12 + ctr .req r14 + + movw qa, #26632 + movt q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + + add ctr, rptr_tmp, #64*4*4 + 1: + ldr.w zeta, [zetaptr], #4 + deserialize aptr, tmp, tmp2, tmp3, t0, t1 + + doublebasemul_frombytes_asm_16_32 rptr_tmp, bptr, zeta, tmp, tmp2, t0, t1, tmp3, q, qa, qinv + + cmp.w rptr_tmp, ctr + bne.w 1b + +pop {r4-r11, pc} +.size frombytes_mul_asm_16_32, . 
-frombytes_mul_asm_16_32 + +// void frombytes_mul_asm_acc_32_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]) +.global frombytes_mul_asm_acc_32_32 +.type frombytes_mul_asm_acc_32_32, %function +.align 2 +frombytes_mul_asm_acc_32_32: + push {r4-r11, r14} + + rptr_tmp .req r0 + bptr .req r1 + aptr .req r2 + zetaptr .req r3 + t0 .req r4 + t1 .req r5 + tmp .req r6 + tmp2 .req r7 + tmp3 .req r8 + q .req r9 + qa .req r10 + qinv .req r11 + zeta .req r12 + ctr .req r14 + + movw qa, #26632 + movt q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + + add ctr, rptr_tmp, #64*4*4 + 1: + ldr.w zeta, [zetaptr], #4 + deserialize aptr, tmp, tmp2, tmp3, t0, t1 + + doublebasemul_frombytes_asm_acc_32_32 rptr_tmp, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv + cmp.w rptr_tmp, ctr + bne.w 1b + +pop {r4-r11, pc} +.size frombytes_mul_asm_acc_32_32, . - frombytes_mul_asm_acc_32_32 + +.unreq rptr_tmp + +// void frombytes_mul_asm_acc_32_16(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64], const int32_t *r_tmp) +.global frombytes_mul_asm_acc_32_16 +.type frombytes_mul_asm_acc_32_16, %function +.align 2 +frombytes_mul_asm_acc_32_16: + push {r4-r11, r14} + + rptr .req r0 + bptr .req r1 + aptr .req r2 + zetaptr .req r3 + t0 .req r4 + t1 .req r5 + tmp .req r6 + tmp2 .req r7 + tmp3 .req r8 + q .req r9 + qa .req r10 + qinv .req r11 + zeta .req r12 + ctr .req r14 + rptr_tmp .req r3 + + movw qa, #26632 + movt q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + + ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack + vmov s1, tmp + + add ctr, tmp, #64*4*4 + 1: + ldr.w zeta, [zetaptr], #4 + deserialize aptr, tmp, tmp2, tmp3, t0, t1 + vmov s2, zetaptr + vmov rptr_tmp, s1 + doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv + vmov s1, rptr_tmp + cmp.w rptr_tmp, ctr + vmov zetaptr, s2 + bne.w 1b + +pop {r4-r11, pc} +.size 
frombytes_mul_asm_acc_32_16, . - frombytes_mul_asm_acc_32_16 \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fspeed/polyvec.c b/crypto_kem/ml-kem-768/m4fspeed/polyvec.c new file mode 100644 index 0000000..a405e91 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/polyvec.c @@ -0,0 +1,212 @@ +#include +#include "polyvec.h" +#include "poly.h" + +/************************************************* +* Name: polyvec_compress +* +* Description: Compress and serialize vector of polynomials +* +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - const polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_compress(unsigned char *r, const polyvec *a) +{ + unsigned int i,j,k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for(i=0;ivec[i].coeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + r[ 0] = (t[0] >> 0); + r[ 1] = (t[0] >> 8) | (t[1] << 3); + r[ 2] = (t[1] >> 5) | (t[2] << 6); + r[ 3] = (t[2] >> 2); + r[ 4] = (t[2] >> 10) | (t[3] << 1); + r[ 5] = (t[3] >> 7) | (t[4] << 4); + r[ 6] = (t[4] >> 4) | (t[5] << 7); + r[ 7] = (t[5] >> 1); + r[ 8] = (t[5] >> 9) | (t[6] << 2); + r[ 9] = (t[6] >> 6) | (t[7] << 5); + r[10] = (t[7] >> 3); + r += 11; + } + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + for(i=0;ivec[i].coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = d0 & 0x3ff; + } + + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; + } 
+ } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif +} + +/************************************************* +* Name: polyvec_decompress +* +* Description: De-serialize and decompress vector of polynomials; +* approximate inverse of polyvec_compress +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - unsigned char *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES) +**************************************************/ +void polyvec_decompress(polyvec *r, const unsigned char *a) +{ + int i,j; +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + for(i=0;ivec[i].coeffs[8*j+0] = (((a[11*j+ 0] | (((uint32_t)a[11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11; + r->vec[i].coeffs[8*j+1] = ((((a[11*j+ 1] >> 3) | (((uint32_t)a[11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11; + r->vec[i].coeffs[8*j+2] = ((((a[11*j+ 2] >> 6) | (((uint32_t)a[11*j+ 3] & 0xff) << 2) | (((uint32_t)a[11*j+ 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11; + r->vec[i].coeffs[8*j+3] = ((((a[11*j+ 4] >> 1) | (((uint32_t)a[11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11; + r->vec[i].coeffs[8*j+4] = ((((a[11*j+ 5] >> 4) | (((uint32_t)a[11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11; + r->vec[i].coeffs[8*j+5] = ((((a[11*j+ 6] >> 7) | (((uint32_t)a[11*j+ 7] & 0xff) << 1) | (((uint32_t)a[11*j+ 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11; + r->vec[i].coeffs[8*j+6] = ((((a[11*j+ 8] >> 2) | (((uint32_t)a[11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11; + r->vec[i].coeffs[8*j+7] = ((((a[11*j+ 9] >> 5) | (((uint32_t)a[11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11; + } + a += 352; + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + for(i=0;ivec[i].coeffs[4*j+0] = (((a[5*j+ 0] | (((uint32_t)a[5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10; + r->vec[i].coeffs[4*j+1] = ((((a[5*j+ 1] >> 2) | (((uint32_t)a[5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10; + r->vec[i].coeffs[4*j+2] = ((((a[5*j+ 2] >> 4) | 
(((uint32_t)a[5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10; + r->vec[i].coeffs[4*j+3] = ((((a[5*j+ 3] >> 6) | (((uint32_t)a[5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10; + } + a += 320; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif +} + +/************************************************* +* Name: polyvec_tobytes +* +* Description: Serialize vector of polynomials +* +* Arguments: - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES) +* - const polyvec *a: pointer to input vector of polynomials +**************************************************/ +void polyvec_tobytes(unsigned char *r, polyvec *a) +{ + int i; + for(i=0;ivec[i]); +} + +/************************************************* +* Name: polyvec_frombytes +* +* Description: De-serialize vector of polynomials; +* inverse of polyvec_tobytes +* +* Arguments: - unsigned char *r: pointer to output byte array +* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES) +**************************************************/ +void polyvec_frombytes(polyvec *r, const unsigned char *a) +{ + int i; + for(i=0;ivec[i], a+i*KYBER_POLYBYTES); +} + +/************************************************* +* Name: polyvec_ntt +* +* Description: Apply forward NTT to all elements of a vector of polynomials +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_ntt(polyvec *r) +{ + int i; + for(i=0;ivec[i]); +} + +/************************************************* +* Name: polyvec_invntt +* +* Description: Apply inverse NTT to all elements of a vector of polynomials +* +* Arguments: - polyvec *r: pointer to in/output vector of polynomials +**************************************************/ +void polyvec_invntt(polyvec *r) +{ + int i; + for(i=0;ivec[i]); +} + +/************************************************* +* Name: polyvec_reduce 
+* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void polyvec_reduce(polyvec *r) +{ + int i; + for(i=0;ivec[i]); +} + +/************************************************* +* Name: polyvec_add +* +* Description: Add vectors of polynomials +* +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const polyvec *a: pointer to first input vector of polynomials +* - const polyvec *b: pointer to second input vector of polynomials +**************************************************/ +void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) +{ + int i; + for(i=0;ivec[i], &a->vec[i], &b->vec[i]); +} diff --git a/crypto_kem/ml-kem-768/m4fspeed/polyvec.h b/crypto_kem/ml-kem-768/m4fspeed/polyvec.h new file mode 100644 index 0000000..0be7873 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/polyvec.h @@ -0,0 +1,24 @@ +#ifndef POLYVEC_H +#define POLYVEC_H + +#include "params.h" +#include "poly.h" + +typedef struct { + poly vec[KYBER_K]; +} polyvec; + +void polyvec_compress(unsigned char *r, const polyvec *a); +void polyvec_decompress(polyvec *r, const unsigned char *a); + +void polyvec_tobytes(unsigned char *r, polyvec *a); +void polyvec_frombytes(polyvec *r, const unsigned char *a); + +void polyvec_ntt(polyvec *r); +void polyvec_invntt(polyvec *r); + +void polyvec_reduce(polyvec *r); + +void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); + +#endif diff --git a/crypto_kem/ml-kem-768/m4fspeed/reduce.S b/crypto_kem/ml-kem-768/m4fspeed/reduce.S new file mode 100644 index 0000000..bfc53f6 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/reduce.S @@ -0,0 +1,140 @@ +/****************************************************************************** +* Integrating the improved Plantard arithmetic into Kyber. 
+* +* Efficient Plantard arithmetic enables a faster Kyber implementation with the +* same stack usage. +* +* See the paper at https://eprint.iacr.org/2022/956.pdf for more details. +* +* @author Junhao Huang, BNU-HKBU United International College, Zhuhai, China +* jhhuang_nuaa@126.com +* +* @date September 2022 +******************************************************************************/ + +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +.global asm_barrett_reduce +.type asm_barrett_reduce,%function +.align 2 +asm_barrett_reduce: + push {r4-r11, r14} + + poly .req r0 + poly0 .req r1 + poly1 .req r2 + poly2 .req r3 + poly3 .req r4 + poly4 .req r5 + poly5 .req r6 + poly6 .req r7 + poly7 .req r8 + loop .req r9 + barrettconst .req r10 + q .req r11 + tmp .req r12 + tmp2 .req r14 + + movw barrettconst, #20159 + movw q, #3329 + + movw loop, #16 + 1: + ldm poly, {poly0-poly7} + + doublebarrett poly0, tmp, tmp2, q, barrettconst + doublebarrett poly1, tmp, tmp2, q, barrettconst + doublebarrett poly2, tmp, tmp2, q, barrettconst + doublebarrett poly3, tmp, tmp2, q, barrettconst + doublebarrett poly4, tmp, tmp2, q, barrettconst + doublebarrett poly5, tmp, tmp2, q, barrettconst + doublebarrett poly6, tmp, tmp2, q, barrettconst + doublebarrett poly7, tmp, tmp2, q, barrettconst + + stm poly!, {poly0-poly7} + + subs.w loop, #1 + bne.w 1b + + .unreq poly + .unreq poly0 + .unreq poly1 + .unreq poly2 + .unreq poly3 + .unreq poly4 + .unreq poly5 + .unreq poly6 + .unreq poly7 + .unreq loop + .unreq barrettconst + .unreq q + .unreq tmp + .unreq tmp2 + + pop {r4-r11, pc} + +.global asm_fromplant +.type asm_fromplant,%function +.align 2 +asm_fromplant: + push {r4-r11, r14} + + poly .req r0 + poly0 .req r1 + poly1 .req r2 + poly2 .req r3 + poly3 .req r4 + poly4 .req r5 + poly5 .req r6 + poly6 .req r7 + poly7 .req r8 + loop .req r9 + plantconst .req r10 + q .req r11 + qa .req r12 + tmp .req r14 + + movw qa, #26632 + movt q, #3329 + + ### movt qinv, #3327 + ### 
plant_constant=(Plant_const^2%M)*(p^-1) % 2^32 + movw plantconst, #20396 + movt plantconst, #38900 + movw loop, #16 + 1: + ldm poly, {poly0-poly7} + + doubleplant poly0, tmp, q, qa, plantconst + doubleplant poly1, tmp, q, qa, plantconst + doubleplant poly2, tmp, q, qa, plantconst + doubleplant poly3, tmp, q, qa, plantconst + doubleplant poly4, tmp, q, qa, plantconst + doubleplant poly5, tmp, q, qa, plantconst + doubleplant poly6, tmp, q, qa, plantconst + doubleplant poly7, tmp, q, qa, plantconst + + stm poly!, {poly0-poly7} + + subs.w loop, #1 + bne.w 1b + + .unreq poly + .unreq poly0 + .unreq poly1 + .unreq poly2 + .unreq poly3 + .unreq poly4 + .unreq poly5 + .unreq poly6 + .unreq poly7 + .unreq loop + .unreq plantconst + .unreq q + .unreq qa + .unreq tmp + pop {r4-r11, pc} diff --git a/crypto_kem/ml-kem-768/m4fspeed/symmetric-fips202.c b/crypto_kem/ml-kem-768/m4fspeed/symmetric-fips202.c new file mode 100644 index 0000000..4ee0723 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/symmetric-fips202.c @@ -0,0 +1,71 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include +#include +#include + +/************************************************* +* Name: kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
+* +* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state +* - uint8_t i: additional byte of input +* - uint8_t j: additional byte of input +**************************************************/ +void kyber_shake128_absorb(xof_state *state, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y) { + uint8_t extseed[KYBER_SYMBYTES + 2]; + + memcpy(extseed, seed, KYBER_SYMBYTES); + extseed[KYBER_SYMBYTES + 0] = x; + extseed[KYBER_SYMBYTES + 1] = y; + + shake128_absorb(state, extseed, sizeof(extseed)); +} + +/************************************************* +* Name: kyber_shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) { + uint8_t extkey[KYBER_SYMBYTES + 1]; + + memcpy(extkey, key, KYBER_SYMBYTES); + extkey[KYBER_SYMBYTES] = nonce; + + shake256(out, outlen, extkey, sizeof(extkey)); +} + +/************************************************* +* Name: kyber_shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], 
const uint8_t input[KYBER_CIPHERTEXTBYTES]) { + shake256incctx s; + + shake256_inc_init(&s); + shake256_inc_absorb(&s, key, KYBER_SYMBYTES); + shake256_inc_absorb(&s, input, KYBER_CIPHERTEXTBYTES); + shake256_inc_finalize(&s); + shake256_inc_squeeze(out, KYBER_SSBYTES, &s); + shake256_inc_ctx_release(&s); +} \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fspeed/symmetric.h b/crypto_kem/ml-kem-768/m4fspeed/symmetric.h new file mode 100644 index 0000000..8441c83 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/symmetric.h @@ -0,0 +1,29 @@ +#ifndef SYMMETRIC_H +#define SYMMETRIC_H +#include "fips202.h" +#include "params.h" +#include +#include + +typedef shake128ctx xof_state; + +void kyber_shake128_absorb(xof_state *s, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y); + +void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce); + +void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]); + +#define XOF_BLOCKBYTES SHAKE128_RATE + +#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) +#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) +#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_ctx_release(STATE) shake128_ctx_release(STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT) + +#endif /* SYMMETRIC_H */ diff --git a/crypto_kem/ml-kem-768/m4fspeed/verify.c b/crypto_kem/ml-kem-768/m4fspeed/verify.c new file mode 100644 index 0000000..679ec89 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/verify.c @@ -0,0 +1,51 @@ +#include "verify.h" + +#include +#include + +/************************************************* +* Name: verify +* +* Description: Compare two 
arrays for equality in constant time. +* +* Arguments: const unsigned char *a: pointer to first byte array +* const unsigned char *b: pointer to second byte array +* size_t len: length of the byte arrays +* +* Returns 0 if the byte arrays are equal, 1 otherwise +**************************************************/ +unsigned char verify(const unsigned char *a, const unsigned char *b, size_t len) { + uint64_t r; + size_t i; + + r = 0; + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; + } + + r = (~r + 1); // Two's complement + r >>= 63; + return (unsigned char)r; +} + +/************************************************* +* Name: cmov +* +* Description: Copy len bytes from x to r if b is 1; +* don't modify x if b is 0. Requires b to be in {0,1}; +* assumes two's complement representation of negative integers. +* Runs in constant time. +* +* Arguments: unsigned char *r: pointer to output byte array +* const unsigned char *x: pointer to input byte array +* size_t len: Amount of bytes to be copied +* unsigned char b: Condition bit; has to be in {0,1} +**************************************************/ +void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { + size_t i; + + b = -b; + for (i = 0; i < len; i++) { + r[i] ^= b & (x[i] ^ r[i]); + } +} diff --git a/crypto_kem/ml-kem-768/m4fspeed/verify.h b/crypto_kem/ml-kem-768/m4fspeed/verify.h new file mode 100644 index 0000000..8777a14 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fspeed/verify.h @@ -0,0 +1,10 @@ +#ifndef VERIFY_H +#define VERIFY_H + +#include + +unsigned char verify(const unsigned char *a, const unsigned char *b, size_t len); + +void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); + +#endif diff --git a/crypto_kem/ml-kem-768/m4fstack/api.h b/crypto_kem/ml-kem-768/m4fstack/api.h new file mode 120000 index 0000000..cf75db9 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/api.h @@ -0,0 +1 @@ +../m4fspeed/api.h \ No newline at end of file diff --git 
a/crypto_kem/ml-kem-768/m4fstack/cbd.c b/crypto_kem/ml-kem-768/m4fstack/cbd.c new file mode 120000 index 0000000..903fa59 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/cbd.c @@ -0,0 +1 @@ +../m4fspeed/cbd.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/cbd.h b/crypto_kem/ml-kem-768/m4fstack/cbd.h new file mode 120000 index 0000000..d264c36 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/cbd.h @@ -0,0 +1 @@ +../m4fspeed/cbd.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/cmov_int16.S b/crypto_kem/ml-kem-768/m4fstack/cmov_int16.S new file mode 120000 index 0000000..9055f6a --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/cmov_int16.S @@ -0,0 +1 @@ +../m4fspeed/cmov_int16.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/fastaddsub.S b/crypto_kem/ml-kem-768/m4fstack/fastaddsub.S new file mode 120000 index 0000000..d1317f7 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/fastaddsub.S @@ -0,0 +1 @@ +../m4fspeed/fastaddsub.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/fastbasemul.S b/crypto_kem/ml-kem-768/m4fstack/fastbasemul.S new file mode 100644 index 0000000..c6e4e49 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/fastbasemul.S @@ -0,0 +1,207 @@ +/****************************************************************************** +* Integrating the improved Plantard arithmetic into Kyber. +* +* Efficient Plantard arithmetic enables a faster Kyber implementation with the +* same stack usage. +* +* See the paper at https://eprint.iacr.org/2022/956.pdf for more details. 
+* +* @author Junhao Huang, BNU-HKBU United International College, Zhuhai, China +* jhhuang_nuaa@126.com +* +* @date September 2022 +******************************************************************************/ +#include "macros.i" +.syntax unified +.cpu cortex-m4 +.thumb + +.global basemul_asm +.type basemul_asm, %function +.align 2 +basemul_asm: + push {r4-r11, lr} + + rptr .req r0 + aptr .req r1 + bptr .req r2 + zetaptr .req r3 + poly0 .req r4 + poly1 .req r6 + poly2 .req r5 + poly3 .req r7 + q .req r8 + qa .req r14 + qinv .req r9 + tmp .req r10 + tmp2 .req r11 + zeta .req r12 + loop .req r14 + + //movw qa, #26632 + movt q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + + movw loop, #64 + 1: + vmov.w s0,loop + movw qa, #26632 + + ldrd poly0, poly2, [aptr], #8 + ldrd poly1, poly3, [bptr], #8 + // ldr poly0, [aptr], #4 + // ldr poly1, [bptr], #4 + // ldr poly2, [aptr], #4 + // ldr poly3, [bptr], #4 + + ldr.w zeta, [zetaptr], #4 + + // basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]); + smulwt tmp, zeta, poly1 + smlabt tmp, tmp, q, qa + smultt tmp, poly0, tmp + smlabb tmp, poly0, poly1, tmp + plant_red q, qa, qinv, tmp + // r[0] in upper half of tmp + + smuadx tmp2, poly0, poly1 + plant_red q, qa, qinv, tmp2 + // r[1] in upper half of tmp2 + pkhtb tmp, tmp2, tmp, asr#16 + str tmp, [rptr], #4 + + neg zeta, zeta + + // basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); + smulwt tmp, zeta, poly3 + smlabt tmp, tmp, q, qa + smultt tmp, poly2, tmp + smlabb tmp, poly2, poly3, tmp + plant_red q, qa, qinv, tmp + // r[0] in upper half of tmp + + smuadx tmp2, poly2, poly3 + plant_red q, qa, qinv, tmp2 + // r[1] in upper half of tmp2 + pkhtb tmp, tmp2, tmp, asr#16 + str tmp, [rptr], #4 + + vmov.w loop,s0 + subs.w loop, #1 + bne.w 1b + + .unreq rptr + .unreq aptr + .unreq bptr + .unreq zetaptr + .unreq poly0 + .unreq poly1 + .unreq poly2 + .unreq poly3 + .unreq q + .unreq qa + 
.unreq qinv + .unreq tmp + .unreq tmp2 + .unreq zeta + .unreq loop + + pop {r4-r11, pc} +//-0.5p~0.5p +.global basemul_asm_acc +.type basemul_asm_acc, %function +.align 2 +basemul_asm_acc: + push {r4-r11, lr} + + rptr .req r0 + aptr .req r1 + bptr .req r2 + zetaptr .req r3 + poly0 .req r4 + poly1 .req r6 + poly2 .req r5 + poly3 .req r7 + q .req r8 + qa .req r14 + qinv .req r9 + tmp .req r10 + tmp2 .req r11 + zeta .req r12 + loop .req r14 + + + movt q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + + movw loop, #64 + 1: + vmov.w s0,loop + movw qa, #26632 + + ldrd poly0, poly2, [aptr], #8 + ldrd poly1, poly3, [bptr], #8 + + ldr.w zeta, [zetaptr], #4 + + //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]); + smulwt tmp, zeta, poly1 + smlabt tmp, tmp, q, qa + smultt tmp, poly0, tmp + smlabb tmp, poly0, poly1, tmp + plant_red q, qa, qinv, tmp + // r[0] in upper half of tmp + + smuadx tmp2, poly0, poly1 + plant_red q, qa, qinv, tmp2 + // r[1] in upper half of tmp2 + pkhtb tmp, tmp2, tmp, asr#16 + + ldr.w tmp2, [rptr] + uadd16 tmp, tmp, tmp2 + str.w tmp, [rptr], #4 + + neg zeta, zeta + + // basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); + smulwt tmp, zeta, poly3 + smlabt tmp, tmp, q, qa + smultt tmp, poly2, tmp + smlabb tmp, poly2, poly3, tmp + plant_red q, qa, qinv, tmp + // r[0] in upper half of tmp + + smuadx tmp2, poly2, poly3 + plant_red q, qa, qinv, tmp2 + // r[1] in upper half of tmp2 + pkhtb tmp, tmp2, tmp, asr#16 + + ldr.w tmp2, [rptr] + uadd16 tmp, tmp, tmp2 + str.w tmp, [rptr], #4 + + vmov.w loop, s0 + subs.w loop, #1 + bne.w 1b + + .unreq rptr + .unreq aptr + .unreq bptr + .unreq zetaptr + .unreq poly0 + .unreq poly1 + .unreq poly2 + .unreq poly3 + .unreq q + .unreq qa + .unreq qinv + .unreq tmp + .unreq tmp2 + .unreq zeta + .unreq loop + + pop {r4-r11, pc} diff --git a/crypto_kem/ml-kem-768/m4fstack/fastinvntt.S b/crypto_kem/ml-kem-768/m4fstack/fastinvntt.S 
new file mode 100644 index 0000000..0fe208d --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/fastinvntt.S @@ -0,0 +1,360 @@ +/****************************************************************************** +* Integrating the improved Plantard arithmetic into Kyber. +* +* Efficient Plantard arithmetic enables a faster Kyber implementation with the +* same stack usage. +* +* See the paper at https://eprint.iacr.org/2022/956.pdf for more details. +* +* @author Junhao Huang, BNU-HKBU United International College, Zhuhai, China +* jhhuang_nuaa@126.com +* +* @date September 2022 +******************************************************************************/ +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +.macro mul_twiddle_plant a, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a + smulwt \a, \twiddle, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a1 + smulwt \a1, \twiddle, \a1 + smlabt \tmp, \tmp, \q, \qa + smlabt \a1, \a1, \q, \qa + pkhtb \tmp, \a1, \tmp, asr#16 + usub16 \a1, \a0, \tmp + uadd16 \a0, \a0, \tmp +.endm + +.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa + doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa + doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa +.endm + +.macro fullplant a0, a1, a2, a3, a4, a5, a6, a7, tmp, q, qa, plantconst + movw \plantconst, #44984 + movt \plantconst, #19 + doubleplant \a0, \tmp, \q, \qa, \plantconst + doubleplant \a1, \tmp, \q, \qa, \plantconst + doubleplant \a2, \tmp, \q, \qa, \plantconst + doubleplant \a3, \tmp, \q, \qa, \plantconst + doubleplant \a4, \tmp, \q, \qa, \plantconst + doubleplant \a5, \tmp, \q, \qa, \plantconst + doubleplant \a6, \tmp, \q, \qa, \plantconst + doubleplant \a7, \tmp, \q, \qa, \plantconst +.endm + +.macro halfplant a0, a1, a2, a3, tmp, q, qa, plantconst + movw \plantconst, #44984 + movt \plantconst, #19 
+ doubleplant \a0, \tmp, \q, \qa, \plantconst + doubleplant \a1, \tmp, \q, \qa, \plantconst + doubleplant \a2, \tmp, \q, \qa, \plantconst + doubleplant \a3, \tmp, \q, \qa, \plantconst +.endm + +// twiddle2 is used as tmp2 +// c0, c2, c4, c6: output 6.5q maximum; c1 c3 c5 c7: output 4q maximum. +.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + // tmp, c1, tmp2, c3: 4q maximum + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + // c0,c5,c2,c7 4q maximum + + // layer 2 + sadd16.w \c6, \tmp, \tmp2 // c0, c2 + ssub16.w \tmp2, \tmp, \tmp2 + sadd16.w \c4, \c0, \c2 // c4, c6 + ssub16.w \c2, \c0, \c2 + // c6, tmp2, c4, c2: 6q maximum + + vmov.w \twiddle1, \xi2 + doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa + doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa + // c1, c3, c7, c5: 3.5q maximum; + + // tmp and c0 are free at this point + //reduction c6, tmp2, c4, c2: 0.5q + movw \twiddle1, #44984 + movt \twiddle1, #19 + doubleplant \c6, \tmp, \q, \qa, \twiddle1 + + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + // c0, c4: 6.5q + // c6 are free at this point + vmov.w \twiddle1, \xi4 + doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa + // c1, c5: 4q maximum + + vmov.w \twiddle1, \xi5 + // this block is one doublebutterfly + smulwb \tmp, \twiddle1, \c2 // c2, c6 + smulwt \c2, \twiddle1, \c2 + smlabt \tmp, \tmp, \q, \qa + smlabt \c2, \c2, \q, \qa + pkhtb \tmp, \c2, \tmp, asr#16 + ssub16.w \c6, \tmp2, \tmp + sadd16.w \c2, \tmp2, \tmp + //c6, c2: 6.5q + vmov.w \twiddle1, \xi6 + doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa + //c3, c7: 4q +.endm + +.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, 
c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa +.endm + +.global invntt_fast +.type invntt_fast, %function +.align 2 +invntt_fast: + push {r4-r11, r14} + vpush.w {s16-s23} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + q .req r12 + // at the top of r12 + qa .req r0 + // qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + movt q, #3329 + + ### LAYER 7+6+5+4 + .equ distance, 
16 + .equ offset, 32 + .equ strincr, 64 + + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s22} + + add.w tmp, poly, #8*strincr + vmov s8, tmp + 1: + vmov s23, poly + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #26632 + + // NTT on a1, a3, ..., a15 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // multiply coeffs by layer 4 twiddles for later use + vmov twiddle1, s15 + vmov twiddle2, s16 + mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + // 0.5q + + vmov poly, s23 + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #26632 + // NTT on a0, a2, ..., a14 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + 
+ // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle2, s1 // load a3 + uadd16 tmp, poly1, twiddle2 + usub16 poly1, poly1, twiddle2 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle2, s3 // load a7 + uadd16 tmp, poly3, twiddle2 + usub16 poly3, poly3, twiddle2 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle2, s5 // load a11 + uadd16 tmp, poly5, twiddle2 + usub16 poly5, poly5, twiddle2 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle2, s7 // load a15 + uadd16 tmp, poly7, twiddle2 + usub16 poly7, poly7, twiddle2 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + //1,3,5,7: upto 4.5q + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle2, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle2, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle2, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle2, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle2, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle2, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle2, poly0, poly1 + str.w twiddle2, [poly, #offset] + str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) + //0,2,4,6: upto 7q + vmov tmp, s8 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + + .equ distance, distance*16 + .equ strincr, 4 + + // ITER 0 + vmov s6, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + vldm twiddle_ptr!, {s0-s5} + 
movw qa, #26632 + fullplant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7 tmp, q, qa, twiddle1 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + // ITER 1-15 + add.w tmp, poly, #strincr*3*(5) + vmov s14, tmp + 2: + vmov s6, poly + // polys upto 7q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #26632 + _3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov tmp, s14 + cmp.w poly, tmp + bne.w 2b + vpop.w {s16-s23} + pop {r4-r11, pc} diff --git a/crypto_kem/ml-kem-768/m4fstack/fastntt.S b/crypto_kem/ml-kem-768/m4fstack/fastntt.S new file mode 120000 index 0000000..208c11d --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/fastntt.S @@ -0,0 +1 @@ +../m4fspeed/fastntt.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/indcpa.c b/crypto_kem/ml-kem-768/m4fstack/indcpa.c new file mode 100644 
index 0000000..3869797 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/indcpa.c @@ -0,0 +1,211 @@ +#include "indcpa.h" +#include "ntt.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "symmetric.h" +#include "matacc.h" + +#include +#include + +/************************************************* +* Name: indcpa_keypair +* +* Description: Generates public and private key for the CPA-secure +* public-key encryption scheme underlying Kyber +* +* Arguments: - unsigned char *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +**************************************************/ +void indcpa_keypair_derand(unsigned char *pk, + unsigned char *sk, + const unsigned char *coins){ + polyvec skpv; + poly pkp; + unsigned char buf[2 * KYBER_SYMBYTES]; + unsigned char *publicseed = buf; + unsigned char *noiseseed = buf + KYBER_SYMBYTES; + int i; + unsigned char nonce = 0; + + memcpy(buf, coins, KYBER_SYMBYTES); + buf[KYBER_SYMBYTES] = KYBER_K; + hash_g(buf, buf, KYBER_SYMBYTES + 1); + + for (i = 0; i < KYBER_K; i++) + poly_getnoise(skpv.vec + i, noiseseed, nonce++); + + polyvec_ntt(&skpv); + + for (i = 0; i < KYBER_K; i++) { + matacc(&pkp, &skpv, i, publicseed, 0); + + poly_invntt(&pkp); + + poly_addnoise(&pkp, noiseseed, nonce++); + poly_ntt(&pkp); + + poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp); + } + polyvec_tobytes(sk, &skpv); + memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key +} + +/************************************************* +* Name: indcpa_enc +* +* Description: Encryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
+* +* Arguments: - unsigned char *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) +* - const unsigned char *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) +* - const unsigned char *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) +* to deterministically generate all randomness +**************************************************/ +void indcpa_enc(unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins) { + polyvec sp; + poly bp; + poly *pkp = &bp; + poly *k = &bp; + poly *v = &sp.vec[0]; + const unsigned char *seed = pk+KYBER_POLYVECBYTES; + int i; + unsigned char nonce = 0; + + for (i = 0; i < KYBER_K; i++) + poly_getnoise(sp.vec + i, coins, nonce++); + + polyvec_ntt(&sp); + + for (i = 0; i < KYBER_K; i++) { + matacc(&bp, &sp, i, seed, 1); + poly_invntt(&bp); + + poly_addnoise(&bp, coins, nonce++); + poly_reduce(&bp); + + poly_packcompress(c, &bp, i); + } + + poly_frombytes(pkp, pk); + poly_basemul(v, pkp, &sp.vec[0]); + for (i = 1; i < KYBER_K; i++) { + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc(v, pkp, &sp.vec[i]); + } + + poly_invntt(v); + + poly_addnoise(v, coins, nonce++); + + poly_frommsg(k, m); + poly_add(v, v, k); + poly_reduce(v); + + poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); +} + +/************************************************* +* Name: indcpa_enc_cmp +* +* Description: Re-encryption function. +* Compares the re-encypted ciphertext with the original ciphertext byte per byte. +* The comparison is performed in a constant time manner. 
+* +* +* Arguments: - unsigned char *ct: pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes) +* - const unsigned char *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) +* - const unsigned char *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) +* to deterministically generate all randomness +* Returns: - boolean byte indicating that re-encrypted ciphertext is NOT equal to the original ciphertext +**************************************************/ +unsigned char indcpa_enc_cmp(const unsigned char *c, + const unsigned char *m, + const unsigned char *pk, + const unsigned char *coins) { + uint64_t rc = 0; + polyvec sp; + poly bp; + poly *pkp = &bp; + poly *k = &bp; + poly *v = &sp.vec[0]; + const unsigned char *seed = pk+KYBER_POLYVECBYTES; + int i; + unsigned char nonce = 0; + + for (i = 0; i < KYBER_K; i++) + poly_getnoise(sp.vec + i, coins, nonce++); + + polyvec_ntt(&sp); + + for (i = 0; i < KYBER_K; i++) { + matacc(&bp, &sp, i, seed, 1); + poly_invntt(&bp); + + poly_addnoise(&bp, coins, nonce++); + poly_reduce(&bp); + + rc |= cmp_poly_packcompress(c, &bp, i); + } + + poly_frombytes(pkp, pk); + poly_basemul(v, pkp, &sp.vec[0]); + for (i = 1; i < KYBER_K; i++) { + poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); + poly_basemul_acc(v, pkp, &sp.vec[i]); + } + + poly_invntt(v); + + poly_addnoise(v, coins, nonce++); + poly_frommsg(k, m); + poly_add(v, v, k); + poly_reduce(v); + + rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); + + rc = ~rc + 1; + rc >>= 63; + return (unsigned char)rc; +} + +/************************************************* +* Name: indcpa_dec +* +* Description: Decryption function of the CPA-secure +* public-key encryption scheme underlying Kyber. 
+* +* Arguments: - unsigned char *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) +* - const unsigned char *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) +* - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +**************************************************/ +void __attribute__ ((noinline)) indcpa_dec(unsigned char *m, + const unsigned char *c, + const unsigned char *sk) { + poly mp, bp; + poly *v = &bp; + int i; + + poly_unpackdecompress(&mp, c, 0); + poly_ntt(&mp); + + poly_frombytes_mul(&mp, &mp, sk); + for(i = 1; i < KYBER_K; i++) { + poly_unpackdecompress(&bp, c, i); + poly_ntt(&bp); + poly_frombytes_mul_acc(&mp, &bp, sk + i*KYBER_POLYBYTES); + } + + poly_invntt(&mp); + poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES); + poly_sub(&mp, v, &mp); + poly_reduce(&mp); + + poly_tomsg(m, &mp); +} diff --git a/crypto_kem/ml-kem-768/m4fstack/indcpa.h b/crypto_kem/ml-kem-768/m4fstack/indcpa.h new file mode 120000 index 0000000..5893b12 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/indcpa.h @@ -0,0 +1 @@ +../m4fspeed/indcpa.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/kem.c b/crypto_kem/ml-kem-768/m4fstack/kem.c new file mode 120000 index 0000000..302153d --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/kem.c @@ -0,0 +1 @@ +../m4fspeed/kem.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/macros.i b/crypto_kem/ml-kem-768/m4fstack/macros.i new file mode 120000 index 0000000..6e83891 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/macros.i @@ -0,0 +1 @@ +../m4fspeed/macros.i \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/matacc.c b/crypto_kem/ml-kem-768/m4fstack/matacc.c new file mode 100644 index 0000000..9aaec0f --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/matacc.c @@ -0,0 +1,43 @@ +#include "ntt.h" +#include "poly.h" +#include "polyvec.h" +#include "symmetric.h" +#include "matacc.h" + 
+/************************************************* +* Name: matacc +* +* Description: Multiplies a row of A or A^T, generated on-the-fly, +* with a vector of polynomials and accumulates into the result. +* +* Arguments: - poly *r: pointer to output polynomial to accumulate in +* - polyvec *b: pointer to input vector of polynomials to multiply with +* - unsigned char i: byte to indicate the index < KYBER_K of the row of A or A^T +* - const unsigned char *seed: pointer to the public seed used to generate A +* - int transposed: boolean indicatin whether A or A^T is generated +**************************************************/ +void matacc(poly* r, const polyvec *b, unsigned char i, const unsigned char *seed, int transposed) { + unsigned char buf[XOF_BLOCKBYTES+2]; + xof_state state; + int16_t c[4]; + int j = 0; + + if (transposed) + xof_absorb(&state, seed, i, j); + else + xof_absorb(&state, seed, j, i); + + xof_squeezeblocks(buf, 1, &state); + + matacc_asm(r->coeffs, b->vec[j].coeffs, c, buf, zetas, &state); + for(j=1;jcoeffs, b->vec[j].coeffs, c, buf, zetas, &state); + } +} \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/matacc.h b/crypto_kem/ml-kem-768/m4fstack/matacc.h new file mode 100644 index 0000000..92a3b38 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/matacc.h @@ -0,0 +1,26 @@ +#ifndef MATACC_H +#define MATACC_H +#include "poly.h" +#include "polyvec.h" +#include "symmetric.h" + +extern void matacc_asm(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state); +static inline void _matacc_asm(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES + 2], const int32_t _zetas[64], xof_state *state) +{ + // floating point registers clobbered by assembly function + asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26"); + matacc_asm(r, b, c, buf, _zetas, state); +} +#define matacc_asm _matacc_asm + +extern void 
matacc_asm_acc(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state); +static inline void _matacc_asm_acc(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES + 2], const int32_t _zetas[64], xof_state *state) +{ + // floating point registers clobbered by assembly function + asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26"); + matacc_asm_acc(r, b, c, buf, _zetas, state); +} +#define matacc_asm_acc _matacc_asm_acc + +void matacc(poly* r, const polyvec *b, unsigned char i, const unsigned char *seed, int transposed); +#endif \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/matacc.i b/crypto_kem/ml-kem-768/m4fstack/matacc.i new file mode 100644 index 0000000..237ee46 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/matacc.i @@ -0,0 +1,197 @@ +/****************************************************************************** + * Integrating the improved Plantard arithmetic into Kyber. + * + * Efficient Plantard arithmetic enables a faster Kyber implementation with the + * same stack usage. + * + * See the paper at https://eprint.iacr.org/2022/956.pdf for more details. 
+ * + * @author Junhao Huang, BNU-HKBU United International College, Zhuhai, China + * jhhuang_nuaa@126.com + * + * @date September 2022 + ******************************************************************************/ +// q locates in the bottom half of the register +.macro plant_red_b q, qa, qinv, tmp + mul \tmp, \tmp, \qinv + //tmp*qinv mod 2^2n/ 2^n; in high half + smlatb \tmp, \tmp, \q, \qa + // result in high half +.endm + +.macro load_vals val0, val1, bufptr, tmp + ldrh \val0, [\bufptr], #2 + ldrb \val1, [\bufptr], #1 + ubfx \tmp, \val0, #12, #4 + orr \val1, \tmp, \val1, lsl #4 + ubfx \val0, \val0, #0, #12 + ubfx \val1, \val1, #0, #12 +.endm + +// s17: bufptr; s26: state +// Checks if val0 is suitable and multiplies with values from bptr using func +.macro first_if func, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr +// if (val0 < KYBER_Q) + cmp.w \val0, \q + bhs.w 2f + strh \val0, [\cptr], #2 + add \k, #1 + cmp.w \k, #4 + bne.w 2f + sub \cptr, #4*2 + vmov s18, \bufptr + vmov s19, \ctr + vmov s20, \val1 + \func \rptr, \bptr, \cptr, \zetaptr, \bufptr, \k, \val0, \val1, \q, \qa, \qinv, \tmp, \tmp2, \ctr + vmov \bufptr, s18 + vmov \ctr, s19 + vmov \val1, s20 + + add \ctr, #1 + + movw \k, #0 + 2: +.endm + +// Checks if val1 is suitable and multiplies with values from bptr using func +.macro second_if func, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr +// if (val1 < KYBER_Q && ctr < KYBER_N/4) + cmp.w \val1, \q + bhs.w 2f + cmp.w \ctr, #256/4 + bge.w 2f + strh \val1, [\cptr], #2 + add \k, #1 + cmp.w \k, #4 + bne.w 2f + sub \cptr, #4*2 + vmov s18, \bufptr + vmov s19, \ctr + \func \rptr, \bptr, \cptr, \zetaptr, \bufptr, \k, \val0, \val1, \q, \qa, \qinv, \tmp, \tmp2, \ctr + vmov \bufptr, s18 + vmov \ctr, s19 + + add \ctr, #1 + + movw \k, #0 + 2: +.endm + +.macro third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr +// if (pos + 3 > buflen && ctr < KYBER_N/4) + vmov \tmp, s17 + add \tmp, #168 // 
XOF_BLOCKBYTES=168 + add \tmp2, \bufptr, #3 + cmp.w \tmp2, \tmp // pos + 3 > buflen + ble.w 2f + cmp.w \ctr, #256/4 + bge.w 2f + vmov \bufptr, s17 + + vmov s16, r12 + vmov s18, \rptr + vmov s19, \bptr + vmov s20, \cptr + vmov s21, \ctr + + mov \rptr, \bufptr //bufptr + movw \bptr, #1 + vmov \cptr, s26 // load state + #ifndef nohash + bl shake128_squeezeblocks + #endif + + vmov r12, s16 + vmov \rptr, s18 + vmov \bptr, s19 + vmov \cptr, s20 + vmov \ctr, s21 + vmov \bufptr, s17 + 2: +.endm + +.macro doublebasemul_asm rptr, aptr, bptr, zetaptr, poly0, poly1, poly2, poly3, q, qa, qinv, tmp, tmp2, zeta + ldr.w \poly0, [\aptr], #4 + ldr.w \poly1, [\bptr] + ldr.w \poly2, [\aptr], #4 + ldr.w \poly3, [\bptr, #4] + ldr.w \zeta, [\zetaptr], #4 + + //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]); + smulwt \tmp, \zeta, \poly1 + // b_1*zeta*qinv*plant_const; in low half + smlabb \tmp, \tmp, \q, \qa + // b_1*zeta + smultt \tmp, \poly0, \tmp + //a_1*b_1*zeta <2^32 + smlabb \tmp, \poly0, \poly1, \tmp + // a1*b1*zeta+a0*b0 + plant_red_b \q, \qa, \qinv, \tmp + // r[0] in upper half of tmp + smuadx \tmp2, \poly0, \poly1 + plant_red_b \q, \qa, \qinv, \tmp2 + // r[1] in upper half of tmp2 + pkhtb \tmp, \tmp2, \tmp, asr#16 + str \tmp, [\rptr], #4 + + neg \zeta, \zeta + + //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); + smulwt \tmp, \zeta, \poly3 + smlabb \tmp, \tmp, \q, \qa + smultt \tmp, \poly2, \tmp + smlabb \tmp, \poly2, \poly3, \tmp + plant_red_b \q, \qa, \qinv, \tmp + // r[0] in upper half of tmp + + smuadx \tmp2, \poly2, \poly3 + plant_red_b \q, \qa, \qinv, \tmp2 + // r[1] in upper half of tmp2 + pkhtb \tmp, \tmp2, \tmp, asr#16 + str \tmp, [\rptr], #4 +.endm +// res replace poly2 +.macro doublebasemul_asm_acc rptr, aptr, bptr, zetaptr, poly0, poly1, res, poly3, q, qa, qinv, tmp, tmp2, zeta + ldr.w \poly0, [\aptr], #4 + ldr.w \poly1, [\bptr] + ldr.w \poly3, [\bptr, #4] + ldr.w \res, [\rptr] + ldr.w 
\zeta, [\zetaptr], #4 + + //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]); + smulwt \tmp, \zeta, \poly1 + // b_1*zeta*qinv*plant_const; in low half + smlabb \tmp, \tmp, \q, \qa + // b_1*zeta + smultt \tmp, \poly0, \tmp + //a_1*b_1*zeta <2^32 + smlabb \tmp, \poly0, \poly1, \tmp + // a1*b1*zeta+a0*b0 + plant_red_b \q, \qa, \qinv, \tmp + // r[0] in upper half of tmp + smuadx \tmp2, \poly0, \poly1 + plant_red_b \q, \qa, \qinv, \tmp2 + // r[1] in upper half of tmp2 + pkhtb \tmp, \tmp2, \tmp, asr#16 + uadd16 \res, \res, \tmp + str \res, [\rptr], #4 + + neg \zeta, \zeta + + ldr.w \res, [\rptr] + ldr \poly0, [\aptr], #4 + //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); + smulwt \tmp, \zeta, \poly3 + smlabb \tmp, \tmp, \q, \qa + smultt \tmp, \poly0, \tmp + smlabb \tmp, \poly0, \poly3, \tmp + plant_red_b \q, \qa, \qinv, \tmp + // r[0] in upper half of tmp + + smuadx \tmp2, \poly0, \poly3 + plant_red_b \q, \qa, \qinv, \tmp2 + // r[1] in upper half of tmp2 + pkhtb \tmp, \tmp2, \tmp, asr#16 + uadd16 \res, \res, \tmp + str \res, [\rptr], #4 +.endm \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/matacc_asm.S b/crypto_kem/ml-kem-768/m4fstack/matacc_asm.S new file mode 100644 index 0000000..2a5a307 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/matacc_asm.S @@ -0,0 +1,118 @@ +#include "matacc.i" +.extern shake128_squeezeblocks + +.syntax unified +.cpu cortex-m4 +.thumb + +// void matacc_asm(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state) +.global matacc_asm +.type matacc_asm, %function +.align 2 +matacc_asm: + push {r0-r11, r14} + rptr .req r0 + bptr .req r1 + cptr .req r2 + bufptr .req r3 + zetaptr .req r4 + val0 .req r5 + val1 .req r6 + tmp .req r7 + tmp2 .req r8 + k .req r9 + q .req r10 + qa .req r11 + qinv .req r12 + ctr .req r14 + + ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack + ldr.w 
tmp, [sp, #14*4] // load state from stack + vmov s26, tmp + + movw qa, #26632 + movw q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + movw k, #0 + + // outer while loop + movw ctr, #0 + vmov s17, bufptr // save bufptr to check later + 1: + + ldrh val0, [bufptr], #2 + ldrb val1, [bufptr], #1 + ubfx tmp, val0, #12, #4 + orr val1, tmp, val1, lsl #4 + ubfx val0, val0, #0, #12 + ubfx val1, val1, #0, #12 + + first_if doublebasemul_asm, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + second_if doublebasemul_asm, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr + + cmp ctr, #256/4 + blt.w 1b + + pop {r0-r11, pc} +.size matacc_asm, . - matacc_asm + +// void matacc_asm(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state) +.global matacc_asm_acc +.type matacc_asm_acc, %function +.align 2 +matacc_asm_acc: + push {r0-r11, r14} + rptr .req r0 + bptr .req r1 + cptr .req r2 + bufptr .req r3 + zetaptr .req r4 + val0 .req r5 + val1 .req r6 + tmp .req r7 + tmp2 .req r8 + k .req r9 + q .req r10 + qa .req r11 + qinv .req r12 + ctr .req r14 + + ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack + ldr.w tmp, [sp, #14*4] // load state from stack + vmov s26, tmp + + movw qa, #26632 + movw q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + movw k, #0 + + // outer while loop + movw ctr, #0 + vmov s17, bufptr // save bufptr to check later + 1: + + ldrh val0, [bufptr], #2 + ldrb val1, [bufptr], #1 + ubfx tmp, val0, #12, #4 + orr val1, tmp, val1, lsl #4 + ubfx val0, val0, #0, #12 + ubfx val1, val1, #0, #12 + + first_if doublebasemul_asm_acc, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + second_if doublebasemul_asm_acc, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr + + third_if tmp, tmp2, rptr, 
bptr, cptr, bufptr, ctr + + cmp ctr, #256/4 + blt.w 1b + + pop {r0-r11, pc} +.size matacc_asm_acc, . - matacc_asm_acc \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/ntt.c b/crypto_kem/ml-kem-768/m4fstack/ntt.c new file mode 120000 index 0000000..c9d6e8a --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/ntt.c @@ -0,0 +1 @@ +../m4fspeed/ntt.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/ntt.h b/crypto_kem/ml-kem-768/m4fstack/ntt.h new file mode 120000 index 0000000..5fd83c0 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/ntt.h @@ -0,0 +1 @@ +../m4fspeed/ntt.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/params.h b/crypto_kem/ml-kem-768/m4fstack/params.h new file mode 120000 index 0000000..59dd7f1 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/params.h @@ -0,0 +1 @@ +../m4fspeed/params.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/poly.c b/crypto_kem/ml-kem-768/m4fstack/poly.c new file mode 100644 index 0000000..35475ad --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/poly.c @@ -0,0 +1,618 @@ +#include "poly.h" + +#include "cbd.h" +#include "ntt.h" +#include "params.h" +#include "symmetric.h" + +#include + + +/************************************************* +* Name: poly_compress +* +* Description: Serialization of a polynomial and subsequent compression of a polynomial; +* +* Arguments: - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial to be serialized +*************************************************/ +void poly_compress(unsigned char *r, const poly *a) +{ + unsigned int i,j; + int16_t u; + uint32_t d0; + uint8_t t[8]; + +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */ + d0 = u << 4; + d0 += 1665; + d0 *= 80635; + d0 >>= 28; + t[j] = d0 & 0xf; + } + + r[0] = 
t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */ + d0 = u << 5; + d0 += 1664; + d0 *= 40318; + d0 >>= 27; + t[j] = d0 & 0x1f; + } + + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif +} + +/************************************************* +* Name: poly_decompress +* +* Description: De-serialization and subsequent decompression of a polynomial; +* approximate inverse of poly_compress +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +**************************************************/ +void poly_decompress(poly *r, const unsigned char *a) +{ + int i; +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4; + r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4; + a += 4; + } +#elif (KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[i+0] = (((a[0] & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+4] = ((((a[2] >> 
4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5; + r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5; + r->coeffs[i+7] = (((a[4] >> 3) * KYBER_Q) + 16) >> 5; + a += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif +} + +/************************************************* +* Name: poly_packcompress +* +* Description: Serialization and subsequent compression of a polynomial of a polyvec, +* writes to a byte string representation of the whole polyvec. +* Used to compress a polyvec one poly at a time in a loop. +* +* Arguments: - unsigned char *r: pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const poly *a: pointer to input polynomial +* - int i: index of to be serialized polynomial in serialized polyec +**************************************************/ +void poly_packcompress(unsigned char *r, poly *a, int i) { + int j, k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + + for(j=0;jcoeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + + r[352*i+11*j+ 0] = t[0] & 0xff; + r[352*i+11*j+ 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3); + r[352*i+11*j+ 2] = (t[1] >> 5) | ((t[2] & 0x03) << 6); + r[352*i+11*j+ 3] = (t[2] >> 2) & 0xff; + r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1); + r[352*i+11*j+ 5] = (t[3] >> 7) | ((t[4] & 0x0f) << 4); + r[352*i+11*j+ 6] = (t[4] >> 4) | ((t[5] & 0x01) << 7); + r[352*i+11*j+ 7] = (t[5] >> 1) & 0xff; + r[352*i+11*j+ 8] = (t[5] >> 9) | ((t[6] & 0x3f) << 2); + r[352*i+11*j+ 9] = (t[6] >> 6) | ((t[7] & 0x07) << 5); + r[352*i+11*j+10] = (t[7] >> 3); + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + + for (j = 0; j < KYBER_N / 
4; j++) { + for(k=0;k<4;k++) { + t[k] = a->coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + /* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = d0 & 0x3ff; + } + r[320*i+5*j+0] = t[0] & 0xff; + r[320*i+5*j+1] = (t[0] >> 8) | ((t[1] & 0x3f) << 2); + r[320*i+5*j+2] = ((t[1] >> 6) | ((t[2] & 0x0f) << 4)) & 0xff; + r[320*i+5*j+3] = ((t[2] >> 4) | ((t[3] & 0x03) << 6)) & 0xff; + r[320*i+5*j+4] = (t[3] >> 2) & 0xff; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})" +#endif +} + +/************************************************* +* Name: poly_unpackdecompress +* +* Description: Deserialization and subsequent compression of a polynomial of a polyvec, +* Used to uncompress a polyvec one poly at a time in a loop. +* +* Arguments: - const poly *r: pointer to output polynomial +* - unsigned char *a: pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - int i: index of poly in polyvec to decompress +**************************************************/ +void poly_unpackdecompress(poly *r, const unsigned char *a, int i) { + int j; +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + for(j=0;jcoeffs[8*j+0] = (((a[352*i+11*j+ 0] | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 
7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11; + r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11; + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + for(j=0;jcoeffs[4*j+0] = (((a[320*i+5*j+ 0] | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10; + r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10; + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif +} + + +/************************************************* +* Name: cmp_poly_compress +* +* Description: Serializes and consequently compares polynomial to a serialized polynomial +* +* Arguments: - const unsigned char *r: pointer to serialized polynomial to compare with +* - poly *a: pointer to input polynomial to serialize and compare +* Returns: boolean indicating whether the polynomials are equal +**************************************************/ +int cmp_poly_compress(const unsigned char *r, poly *a) { + unsigned char rc = 0; + int16_t u; + uint32_t d0; + uint8_t t[8]; + int i, j, k = 0; + +#if (KYBER_POLYCOMPRESSEDBYTES == 128) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */ + d0 = u << 4; + d0 += 1665; + d0 *= 80635; + d0 >>= 28; + t[j] = d0 & 0xf; + } + rc |= r[k] ^ (t[0] | (t[1] << 4)); + rc |= r[k + 1] ^ (t[2] | (t[3] << 4)); + rc |= r[k + 2] ^ (t[4] | (t[5] << 4)); + rc |= r[k + 3] ^ (t[6] | (t[7] << 4)); + k += 4; + } +#elif 
(KYBER_POLYCOMPRESSEDBYTES == 160) + for(i=0;icoeffs[8*i+j]; + u += (u >> 15) & KYBER_Q; +/* t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */ + d0 = u << 5; + d0 += 1664; + d0 *= 40318; + d0 >>= 27; + t[j] = d0 & 0x1f; + } + + + rc |= r[k] ^ (t[0] | (t[1] << 5)); + rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); + rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4)); + rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); + rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3)); + k += 5; + } +#else +#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}" +#endif + return rc; +} + +/************************************************* +* Name: cmp_poly_packcompress +* +* Description: Serializes and consequently compares poly of polyvec to a serialized polyvec +* Should be called in a loop over all poly's of a polyvec. +* +* Arguments: - const unsigned char *r: pointer to serialized polyvec to compare with +* - poly *a: pointer to input polynomial of polyvec to serialize and compare +* - int i: index of poly in polyvec to compare with +* Returns: boolean indicating whether the polyvecs are equal +**************************************************/ +int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) { + unsigned char rc = 0; + int j, k; + uint64_t d0; + +#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352)) + uint16_t t[8]; + for(j=0;jcoeffs[8*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; +/* t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */ + d0 = t[k]; + d0 <<= 11; + d0 += 1664; + d0 *= 645084; + d0 >>= 31; + t[k] = d0 & 0x7ff; + } + + rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff); + rc |= r[352*i+11*j+ 1] ^ ((t[0] >> 8) | ((t[1] & 0x1f) << 3)); + rc |= r[352*i+11*j+ 2] ^ ((t[1] >> 5) | ((t[2] & 0x03) << 6)); + rc |= r[352*i+11*j+ 3] ^ ((t[2] >> 2) & 0xff); + rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1)); + rc |= r[352*i+11*j+ 5] ^ ((t[3] >> 7) | ((t[4] & 0x0f) << 4)); + rc |= r[352*i+11*j+ 6] ^ ((t[4] >> 4) | ((t[5] 
& 0x01) << 7)); + rc |= r[352*i+11*j+ 7] ^ ((t[5] >> 1) & 0xff); + rc |= r[352*i+11*j+ 8] ^ ((t[5] >> 9) | ((t[6] & 0x3f) << 2)); + rc |= r[352*i+11*j+ 9] ^ ((t[6] >> 6) | ((t[7] & 0x07) << 5)); + rc |= r[352*i+11*j+10] ^ ((t[7] >> 3)); + } +#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320)) + uint16_t t[4]; + for (j = 0; j < KYBER_N / 4; j++) { + for(k=0;k<4;k++) { + t[k] = a->coeffs[4*j+k]; + t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; + /* t[k] = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */ + d0 = t[k]; + d0 <<= 10; + d0 += 1665; + d0 *= 1290167; + d0 >>= 32; + t[k] = d0 & 0x3ff; + } + + rc |= r[320*i+5*j+0] ^ (t[0] & 0xff); + rc |= r[320*i+5*j+1] ^ ((t[0] >> 8) | ((t[1] & 0x3f) << 2)); + rc |= r[320*i+5*j+2] ^ (((t[1] >> 6) | ((t[2] & 0x0f) << 4)) & 0xff); + rc |= r[320*i+5*j+3] ^ (((t[2] >> 4) | ((t[3] & 0x03) << 6)) & 0xff); + rc |= r[320*i+5*j+4] ^ ((t[3] >> 2) & 0xff); + } +#else +#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}" +#endif + return rc; +} + +/************************************************* +* Name: poly_tobytes +* +* Description: Serialization of a polynomial +* +* Arguments: - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_tobytes(unsigned char *r, poly *a) { + int i; + uint16_t t0, t1; + + poly_reduce(a); + + for (i = 0; i < KYBER_N / 2; i++) { + t0 = a->coeffs[2 * i]; + t1 = a->coeffs[2 * i + 1]; + r[3 * i] = t0 & 0xff; + r[3 * i + 1] = (t0 >> 8) | ((t1 & 0xf) << 4); + r[3 * i + 2] = (t1 >> 4) & 0xff; + } +} + +/************************************************* +* Name: poly_frombytes +* +* Description: De-serialization of a polynomial; +* inverse of poly_tobytes +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) 
+**************************************************/ +void poly_frombytes(poly *r, const unsigned char *a) { + int i; + + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8; + r->coeffs[2 * i + 1] = a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4; + } +} + +/************************************************* +* Name: poly_frombytes_mul +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *b: pointer to input polynomial +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void poly_frombytes_mul(poly *r, const poly *b, const unsigned char *a) { + frombytes_mul_asm(r->coeffs, b->coeffs, a, zetas); +} + +/************************************************* +* Name: poly_frombytes_mul_acc +* +* Description: Multiplication of a polynomial with a de-serialization of another polynomial +* Accumulation in r. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *b: pointer to input polynomial +* - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +**************************************************/ +extern void frombytes_mul_asm_acc(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]); +void poly_frombytes_mul_acc(poly *r, const poly *b, const unsigned char *a) { + frombytes_mul_asm_acc(r->coeffs, b->coeffs, a, zetas); +} + +/************************************************* +* Name: poly_getnoise +* +* Description: Sample a polynomial deterministically from a seed and a nonce, +* with output polynomial close to centered binomial distribution +* with parameter KYBER_ETA +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* - unsigned char nonce: one-byte input nonce +* - int add: boolean to indicate to accumulate into r +**************************************************/ +void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add) { + unsigned char buf[KYBER_ETA * KYBER_N / 4]; + + prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); + cbd(r, buf, add); +} + +/************************************************* +* Name: poly_ntt +* +* Description: Computes negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in normal order, output in bitreversed order +* +* Arguments: - uint16_t *r: pointer to in/output polynomial +**************************************************/ +void poly_ntt(poly *r) { + ntt(r->coeffs); +} + +/************************************************* +* Name: poly_invntt +* +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of +* a polynomial in place; +* inputs assumed to be in bitreversed order, output in normal order +* +* Arguments: - uint16_t *a: pointer to in/output polynomial 
+**************************************************/ +void poly_invntt(poly *r) { + invntt(r->coeffs); +} + +extern void basemul_asm(int16_t *, const int16_t *, const int16_t *, const int32_t *); +/************************************************* +* Name: poly_basemul +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_basemul(poly *r, const poly *a, const poly *b) { + basemul_asm(r->coeffs, a->coeffs, b->coeffs, zetas); +} + +extern void basemul_asm_acc(int16_t *, const int16_t *, const int16_t *, const int32_t *); +/************************************************* +* Name: poly_basemul_acc +* +* Description: Multiplication of two polynomials in NTT domain, accumulating +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_basemul_acc(poly *r, const poly *a, const poly *b) { + basemul_asm_acc(r->coeffs, a->coeffs, b->coeffs, zetas); +} + +extern void asm_fromplant(int16_t *r); +/************************************************* +* Name: poly_fromplant +* +* Description: Inplace conversion of all coefficients of a polynomial +* from Plantard domain to normal domain (presumably -- inferred from the asm_fromplant name; this text previously said Montgomery; TODO confirm) +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void poly_fromplant(poly *r) { + asm_fromplant(r->coeffs); +} + +extern void asm_barrett_reduce(int16_t *r); +/************************************************* +* Name: poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to 
input/output polynomial +**************************************************/ +void poly_reduce(poly *r) { + asm_barrett_reduce(r->coeffs); +} + +extern void pointwise_add(int16_t *, const int16_t *, const int16_t *); +/************************************************* +* Name: poly_add +* +* Description: Add two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_add(poly *r, const poly *a, const poly *b) { + pointwise_add(r->coeffs,a->coeffs,b->coeffs); +} + + +extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *); +/************************************************* +* Name: poly_sub +* +* Description: Subtract two polynomials +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void poly_sub(poly *r, const poly *a, const poly *b) { + pointwise_sub(r->coeffs,a->coeffs,b->coeffs); +} + +void cmov_int16(int16_t *r, int16_t v, uint16_t b); + +/************************************************* +* Name: poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *msg: pointer to input message +**************************************************/ +void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) +{ + unsigned int i,j; + +#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8) +#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!" 
+#endif + + for(i=0;icoeffs[8*i+j] = 0; + cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1); + } + } +} + +/************************************************* +* Name: poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - unsigned char *msg: pointer to output message +* - const poly *a: pointer to input polynomial +**************************************************/ +void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) { + uint32_t t; + int i, j; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = a->coeffs[8*i+j]; + t <<= 1; + t += 1665; + t *= 80635; + t >>= 28; + t &= 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: poly_zeroize +* +* Description: Zeros a polynomial +* +* Arguments: - poly *p: pointer to polynomial +**************************************************/ +void poly_zeroize(poly *p) { + int i; + for(i = 0; i < KYBER_N; i++) + p->coeffs[i] = 0; +} diff --git a/crypto_kem/ml-kem-768/m4fstack/poly.h b/crypto_kem/ml-kem-768/m4fstack/poly.h new file mode 100644 index 0000000..635abe9 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/poly.h @@ -0,0 +1,51 @@ +#ifndef POLY_H +#define POLY_H + +#include "params.h" + +#include + +#define poly_getnoise(p, seed, nonce) poly_noise(p, seed, nonce, 0) +#define poly_addnoise(p, seed, nonce) poly_noise(p, seed, nonce, 1) + +/* + * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial + * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] + */ +typedef struct { + int16_t coeffs[KYBER_N]; +} poly; + +void poly_compress(unsigned char *r, const poly *a); +void poly_decompress(poly *r, const unsigned char *a); + +void poly_packcompress(unsigned char *r, poly *a, int i); +void poly_unpackdecompress(poly *r, const unsigned char *a, int i); + +int cmp_poly_compress(const unsigned char *r, poly *a); +int cmp_poly_packcompress(const unsigned char *r, poly *a, int i); + +void poly_tobytes(unsigned char *r, poly *a); +void poly_frombytes(poly *r, const unsigned char *a); +void poly_frombytes_mul(poly *r, const poly *b, const unsigned char *a); +void poly_frombytes_mul_acc(poly *r, const poly *b, const unsigned char *a); + +void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]); +void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a); + +void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add); + +void poly_ntt(poly *r); +void poly_invntt(poly *r); +void poly_basemul(poly *r, const poly *a, const poly *b); +void poly_basemul_acc(poly *r, const poly *a, const poly *b); +void poly_fromplant(poly *r); + +void poly_reduce(poly *r); + +void poly_add(poly *r, const poly *a, const poly *b); +void poly_sub(poly *r, const poly *a, const poly *b); + +void poly_zeroize(poly *p); + +#endif diff --git a/crypto_kem/ml-kem-768/m4fstack/poly_asm.S b/crypto_kem/ml-kem-768/m4fstack/poly_asm.S new file mode 100644 index 0000000..0b5aa6a --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/poly_asm.S @@ -0,0 +1,198 @@ +/****************************************************************************** +* Integrating the improved Plantard arithmetic into Kyber. +* +* Efficient Plantard arithmetic enables a faster Kyber implementation with the +* same stack usage. +* +* See the paper at https://eprint.iacr.org/2022/956.pdf for more details. 
+* +* @author Junhao Huang, BNU-HKBU United International College, Zhuhai, China +* jhhuang_nuaa@126.com +* +* @date September 2022 +******************************************************************************/ +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +.macro doublebasemul_frombytes_asm rptr, bptr, zeta, poly0, poly1, poly3, tmp, tmp2, q, qa, qinv + ldr.w \poly0, [\bptr], #4 + + smulwt \tmp, \zeta, \poly1 + smlabt \tmp, \tmp, \q, \qa + smultt \tmp, \poly0, \tmp + smlabb \tmp, \poly0, \poly1, \tmp + // a1*b1*zeta+a0*b0 + plant_red \q, \qa, \qinv, \tmp + // r[0] in upper half of tmp + + smuadx \tmp2, \poly0, \poly1 + plant_red \q, \qa, \qinv, \tmp2 + + // r[1] in upper half of tmp2; store through the \rptr macro argument (post-incremented by 4) + pkhtb \tmp, \tmp2, \tmp, asr#16 + str \tmp, [\rptr], #4 + + neg \zeta, \zeta + + ldr.w \poly0, [\bptr], #4 + //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); + smulwt \tmp, \zeta, \poly3 + smlabt \tmp, \tmp, \q, \qa + smultt \tmp, \poly0, \tmp + smlabb \tmp, \poly0, \poly3, \tmp + plant_red \q, \qa, \qinv, \tmp + // r[0] in upper half of tmp + + smuadx \tmp2, \poly0, \poly3 + plant_red \q, \qa, \qinv, \tmp2 + // r[1] in upper half of tmp2 + pkhtb \tmp, \tmp2, \tmp, asr#16 + str \tmp, [\rptr], #4 +.endm + +.macro doublebasemul_frombytes_asm_acc rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, tmp2, q, qa, qinv + + ldr \poly0, [\bptr], #4 + + ldr \res0, [\rptr] + smulwt \tmp, \zeta, \poly1 + // b_1*zeta*qinv*plant_const; in low half + smlabt \tmp, \tmp, \q, \qa + // b_1*zeta + smultt \tmp, \poly0, \tmp + //a_1*b_1*zeta <2^32 + smlabb \tmp, \poly0, \poly1, \tmp + // a1*b1*zeta+a0*b0 + plant_red \q, \qa, \qinv, \tmp + // r[0] in upper half of tmp + + smuadx \tmp2, \poly0, \poly1 + plant_red \q, \qa, \qinv, \tmp2 + + // r[1] in upper half of tmp2 + pkhtb \tmp, \tmp2, \tmp, asr#16 + uadd16 \res0, \res0, \tmp + str \res0, [\rptr], #4 + + neg \zeta, \zeta + + ldr \poly0, [\bptr], #4 + ldr \res0, [\rptr] + + smulwt \tmp, 
\zeta, \poly3 + smlabt \tmp, \tmp, \q, \qa + smultt \tmp, \poly0, \tmp + smlabb \tmp, \poly0, \poly3, \tmp + plant_red \q, \qa, \qinv, \tmp + // r[0] in upper half of tmp + + smuadx \tmp2, \poly0, \poly3 + plant_red \q, \qa, \qinv, \tmp2 + // r[1] in upper half of tmp2 + pkhtb \tmp, \tmp2, \tmp, asr#16 + uadd16 \res0, \res0, \tmp + str \res0, [\rptr], #4 +.endm + +// reduce 2 registers +.macro deserialize aptr, tmp, tmp2, tmp3, t0, t1 + ldrb.w \tmp, [\aptr, #2] + ldrh.w \tmp2, [\aptr, #3] + ldrb.w \tmp3, [\aptr, #5] + ldrh.w \t0, [\aptr], #6 + + ubfx.w \t1, \t0, #12, #4 + ubfx.w \t0, \t0, #0, #12 + orr \t1, \t1, \tmp, lsl #4 + orr \t0, \t0, \t1, lsl #16 + //tmp is free now + ubfx.w \t1, \tmp2, #12, #4 + ubfx.w \tmp, \tmp2, #0, #12 + orr \t1, \t1, \tmp3, lsl #4 + orr \t1, \tmp, \t1, lsl #16 +.endm + +// void frombytes_mul_asm(int16_t *r, const int16_t *b, const unsigned char *a, const int32_t zetas[64]) +.global frombytes_mul_asm +.type frombytes_mul_asm, %function +.align 2 +frombytes_mul_asm: + push {r4-r11, r14} + + rptr .req r0 + bptr .req r1 + aptr .req r2 + zetaptr .req r3 + t0 .req r4 + t1 .req r5 + tmp .req r6 + tmp2 .req r7 + tmp3 .req r8 + q .req r9 + qa .req r10 + qinv .req r11 + zeta .req r12 + ctr .req r14 + + movw qa, #26632 + movt q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + + add ctr, rptr, #64*4*2 + 1: + ldr.w zeta, [zetaptr], #4 + deserialize aptr, tmp, tmp2, tmp3, t0, t1 + + doublebasemul_frombytes_asm rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv + + cmp.w rptr, ctr + bne.w 1b + + pop {r4-r11, pc} +.size frombytes_mul_asm, . 
-frombytes_mul_asm + +// void frombytes_mul_asm_acc(int16_t *r, const int16_t *b, const unsigned char *a, const int32_t zetas[64]) +.global frombytes_mul_asm_acc +.type frombytes_mul_asm_acc, %function +.align 2 +frombytes_mul_asm_acc: + push {r4-r11, r14} + + rptr .req r0 + bptr .req r1 + aptr .req r2 + zetaptr .req r3 + t0 .req r4 + t1 .req r5 + tmp .req r6 + tmp2 .req r7 + tmp3 .req r8 + q .req r9 + qa .req r10 + qinv .req r11 + zeta .req r12 + ctr .req r14 + + movw qa, #26632 + movt q, #3329 + ### qinv=0x6ba8f301 + movw qinv, #62209 + movt qinv, #27560 + + add ctr, rptr, #64*4*2 + 1: + ldr.w zeta, [zetaptr], #4 + deserialize aptr, tmp, tmp2, tmp3, t0, t1 + vmov s0, ctr + doublebasemul_frombytes_asm_acc rptr, bptr, zeta, tmp3, t0, t1, ctr, tmp, tmp2, q, qa, qinv + vmov ctr, s0 + cmp.w rptr, ctr + bne.w 1b + + pop {r4-r11, pc} +.size frombytes_mul_asm_acc, . - frombytes_mul_asm_acc \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/polyvec.c b/crypto_kem/ml-kem-768/m4fstack/polyvec.c new file mode 120000 index 0000000..f398d76 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/polyvec.c @@ -0,0 +1 @@ +../m4fspeed/polyvec.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/polyvec.h b/crypto_kem/ml-kem-768/m4fstack/polyvec.h new file mode 120000 index 0000000..3113837 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/polyvec.h @@ -0,0 +1 @@ +../m4fspeed/polyvec.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/reduce.S b/crypto_kem/ml-kem-768/m4fstack/reduce.S new file mode 120000 index 0000000..29ae453 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/reduce.S @@ -0,0 +1 @@ +../m4fspeed/reduce.S \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/symmetric-fips202.c b/crypto_kem/ml-kem-768/m4fstack/symmetric-fips202.c new file mode 120000 index 0000000..fa4ba9a --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/symmetric-fips202.c @@ -0,0 +1 @@ 
+../m4fspeed/symmetric-fips202.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/symmetric.h b/crypto_kem/ml-kem-768/m4fstack/symmetric.h new file mode 120000 index 0000000..28c6fac --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/symmetric.h @@ -0,0 +1 @@ +../m4fspeed/symmetric.h \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/verify.c b/crypto_kem/ml-kem-768/m4fstack/verify.c new file mode 120000 index 0000000..a7a9856 --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/verify.c @@ -0,0 +1 @@ +../m4fspeed/verify.c \ No newline at end of file diff --git a/crypto_kem/ml-kem-768/m4fstack/verify.h b/crypto_kem/ml-kem-768/m4fstack/verify.h new file mode 120000 index 0000000..cb2da4b --- /dev/null +++ b/crypto_kem/ml-kem-768/m4fstack/verify.h @@ -0,0 +1 @@ +../m4fspeed/verify.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4f/api.h b/crypto_sign/dilithium2/m4f/api.h new file mode 100644 index 0000000..a289632 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/api.h @@ -0,0 +1,26 @@ +#ifndef API_H +#define API_H + +#include <stddef.h> +#include <stdint.h> +#include "params.h" + +int crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium2/m4f/basemul_257.S b/crypto_sign/dilithium2/m4f/basemul_257.S new file mode 100644 index 0000000..da647d8 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/basemul_257.S @@ -0,0 +1,91 @@ +#include "macros_fnt.i" + +.syntax unified +.cpu cortex-m4 + +.align 2 +.global __asm_point_mul_257_16 +.type __asm_point_mul_257_16, %function 
+__asm_point_mul_257_16: + push.w {r4-r11, lr} + + ldr.w r14, [sp, #36] + + .equ width, 4 + + add.w r12, r14, #64*width + _point_mul_16_loop: + + ldr.w r7, [r1, #2*width] + ldr.w r8, [r1, #3*width] + ldr.w r9, [r14, #1*width] + ldr.w r5, [r1, #1*width] + ldr.w r4, [r1], #4*width + ldr.w r6, [r14], #2*width + + smultb r10, r4, r6 + barrett_32 r10, r2, r3, r11 + pkhbt r4, r4, r10, lsl #16 + + neg.w r6, r6 + + smultb r10, r5, r6 + barrett_32 r10, r2, r3, r11 + pkhbt r5, r5, r10, lsl #16 + + str.w r5, [r0, #1*width] + str.w r4, [r0], #2*width + + smultb r10, r7, r9 + barrett_32 r10, r2, r3, r11 + pkhbt r7, r7, r10, lsl #16 + + neg.w r9, r9 + + smultb r10, r8, r9 + barrett_32 r10, r2, r3, r11 + pkhbt r8, r8, r10, lsl #16 + + str.w r8, [r0, #1*width] + str.w r7, [r0], #2*width + + cmp.w r14, r12 + bne.w _point_mul_16_loop + + pop.w {r4-r11, pc} + + +.align 2 +.global __asm_asymmetric_mul_257_16 +.type __asm_asymmetric_mul_257_16, %function +__asm_asymmetric_mul_257_16: + push.w {r4-r11, lr} + + .equ width, 4 + + add.w r12, r0, #256*width + _asymmetric_mul_16_loop: + + ldr.w r7, [r1, #width] + ldr.w r4, [r1], #2*width + ldr.w r8, [r2, #width] + ldr.w r5, [r2], #2*width + ldr.w r9, [r3, #width] + ldr.w r6, [r3], #2*width + + smuad r10, r4, r6 + smuadx r11, r4, r5 + + str.w r11, [r0, #width] + str.w r10, [r0], #2*width + + smuad r10, r7, r9 + smuadx r11, r7, r8 + + str.w r11, [r0, #width] + str.w r10, [r0], #2*width + + cmp.w r0, r12 + bne.w _asymmetric_mul_16_loop + + pop.w {r4-r11, pc} \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4f/config.h b/crypto_sign/dilithium2/m4f/config.h new file mode 100644 index 0000000..298a707 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/config.h @@ -0,0 +1,7 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#define DILITHIUM_MODE 2 +// #define SIGN_STACKSTRATEGY 2 + +#endif diff --git a/crypto_sign/dilithium2/m4f/fnt_257.S b/crypto_sign/dilithium2/m4f/fnt_257.S new file mode 100644 index 0000000..545883b --- /dev/null +++ 
b/crypto_sign/dilithium2/m4f/fnt_257.S @@ -0,0 +1,145 @@ +#include "macros_fnt.i" +.macro final_butterfly c0, c1f, twiddle, c0out, c1, qprime, q, tmp + vmov.w \c1, \c1f + vmov.w \tmp, \twiddle + + mla \c0out, \c1, \tmp, \c0 + mls \c1, \c1, \tmp, \c0 + + barrett_32 \c0out, \qprime, \q, \tmp + barrett_32 \c1, \qprime, \q, \tmp +.endm + + +.syntax unified +.cpu cortex-m4 + +.align 2 +.global __asm_fnt_257 +.type __asm_fnt_257, %function +__asm_fnt_257: + push.w {r4-r11, lr} + vpush.w {s16-s27} + + vmov.w s27, r1 + + .equ width, 4 + + add.w r12, r0, #32*width + _fnt_0_1_2: + + ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(32*0*width), #(32*1*width), #(32*2*width), #(32*3*width), #(32*4*width), #(32*5*width), #(32*6*width), #(32*7*width) + + FNT_CT_butterfly r4, r8, 4 + FNT_CT_butterfly r5, r9, 4 + FNT_CT_butterfly r6, r10, 4 + FNT_CT_butterfly r7, r11, 4 + + FNT_CT_butterfly r4, r6, 2 + FNT_CT_butterfly r5, r7, 2 + FNT_CT_butterfly r8, r10, 6 + FNT_CT_butterfly r9, r11, 6 + + FNT_CT_butterfly r4, r5, 1 + FNT_CT_butterfly r6, r7, 5 + FNT_CT_butterfly r8, r9, 3 + FNT_CT_butterfly r10, r11, 7 + + ldrstrvecjump str.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(32*1*width), #(32*2*width), #(32*3*width), #(32*4*width), #(32*5*width), #(32*6*width), #(32*7*width), #width + + cmp.w r0, r12 + bne.w _fnt_0_1_2 + + sub.w r0, r0, #32*width + + add.w r12, r0, #256*width + vmov.w s25, r12 + _fnt_3_4_5_6: + + vmov r1, s27 + vldm.w r1!, {s2-s16} + vmov s27, r1 + + .rept 2 + ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(4*0*width+2*width), #(4*1*width+2*width), #(4*2*width+2*width), #(4*3*width+2*width), #(4*4*width+2*width), #(4*5*width+2*width), #(4*6*width+2*width), #(4*7*width+2*width) + + _3_layer_CT_32_FNT r4, r5, r6, r7, r8, r9, r10, r11, s2, s3, s4, s5, s6, s7, s8, r14, r2, r3, r1, r12 + + vmov.w s17, s18, r4, r5 // a1, a3 + vmov.w s19, s20, r6, r7 // a5, a7 + vmov.w s21, s22, r8, r9 // a9, a11 + vmov.w s23, s24, r10, r11 // a13, a15 + + ldrstrvec ldr.w, r0, 
r4, r5, r6, r7, r8, r9, r10, r11, #(4*0*width), #(4*1*width), #(4*2*width), #(4*3*width), #(4*4*width), #(4*5*width), #(4*6*width), #(4*7*width) + + _3_layer_CT_32_FNT r4, r5, r6, r7, r8, r9, r10, r11, s2, s3, s4, s5, s6, s7, s8, r14, r2, r3, r1, r12 + + final_butterfly r5, s18, s10, r1, r12, r2, r3, r14 + str.w r12, [r0, #(4*1*width+2*width)] + str.w r1, [r0, #(4*1*width)] + + final_butterfly r6, s19, s11, r1, r12, r2, r3, r14 + str.w r12, [r0, #(4*2*width+2*width)] + str.w r1, [r0, #(4*2*width)] + + final_butterfly r7, s20, s12, r1, r12, r2, r3, r14 + str.w r12, [r0, #(4*3*width+2*width)] + str.w r1, [r0, #(4*3*width)] + + final_butterfly r8, s21, s13, r1, r12, r2, r3, r14 + str.w r12, [r0, #(4*4*width+2*width)] + str.w r1, [r0, #(4*4*width)] + + final_butterfly r9, s22, s14, r1, r12, r2, r3, r14 + str.w r12, [r0, #(4*5*width+2*width)] + str.w r1, [r0, #(4*5*width)] + + final_butterfly r10, s23, s15, r1, r12, r2, r3, r14 + str.w r12, [r0, #(4*6*width+2*width)] + str.w r1, [r0, #(4*6*width)] + + final_butterfly r11, s24, s16, r1, r12, r2, r3, r14 + str.w r12, [r0, #(4*7*width+2*width)] + str.w r1, [r0, #(4*7*width)] + + final_butterfly r4, s17, s9, r1, r12, r2, r3, r14 + str.w r12, [r0, #(4*0*width+2*width)] + str.w r1, [r0], #width + .endr + add.w r0, #((32-2)*width) + + vmov.w r12, s25 + cmp.w r0, r12 + bne.w _fnt_3_4_5_6 + + # switch to 16-bit representation + sub.w r0, r0, #256*width + mov.w r1, r0 + _fnt_to_16_bit: + ldr.w r3, [r0, #1*width] + ldr.w r4, [r0, #2*width] + ldr.w r5, [r0, #3*width] + ldr.w r6, [r0, #4*width] + ldr.w r7, [r0, #5*width] + ldr.w r8, [r0, #6*width] + ldr.w r9, [r0, #7*width] + ldr.w r2, [r0], #8*width + strh.w r3, [r1, #1*2] + strh.w r4, [r1, #2*2] + strh.w r5, [r1, #3*2] + strh.w r6, [r1, #4*2] + strh.w r7, [r1, #5*2] + strh.w r8, [r1, #6*2] + strh.w r9, [r1, #7*2] + strh.w r2, [r1], #8*2 + cmp.w r0, r12 + bne.w _fnt_to_16_bit + + vpop.w {s16-s27} + pop.w {r4-r11, pc} + + + + + + diff --git a/crypto_sign/dilithium2/m4f/ifnt_257.S 
b/crypto_sign/dilithium2/m4f/ifnt_257.S new file mode 100644 index 0000000..1c51165 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/ifnt_257.S @@ -0,0 +1,306 @@ +#include "macros_fnt.i" +.macro final_butterfly c0, c1, c1f, twiddle + vmov.w \c1, \c1f + add.w \c0, \c1 + sub.w \c1, \c0, \c1, lsl#1 + mul.w \c1, \twiddle +.endm + +.macro final_butterfly2 c0, c0out, c1, c1f, twiddle, twiddle2 + vmov.w \c1, \c1f + mla.w \c0out, \twiddle2, \c1, \c0 + mls.w \c1, \twiddle2, \c1, \c0 + mul.w \c1, \twiddle +.endm + +.syntax unified +.cpu cortex-m4 +.align 2 +.global __asm_ifnt_257 +.type __asm_ifnt_257, %function +__asm_ifnt_257: + push.w {r4-r11, lr} + vpush.w {s16-s24} + + .equ width, 4 + + add.w r12, r0, #256*width + vmov.w s1, r12 + _ifnt_7_6_5_4: + + vldm.w r1!, {s2-s16} + +// ================ + + ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(2*8*width), #(2*9*width), #(2*10*width), #(2*11*width), #(2*12*width), #(2*13*width), #(2*14*width), #(2*15*width) + + addSub4 r4, r5, r6, r7, r8, r9, r10, r11 + vmov.w r14, s6 + mul.w r5, r5, r14 + vmov.w r14, s8 + mul.w r9, r9, r14 + addSub2 r4, r6, r8, r10 + vmov.w r14, s7 + mla.w r12, r7, r14, r5 + mls.w r7, r7, r14, r5 + vmov.w r14, s9 + mla.w r5, r11, r14, r9 + mls.w r11, r11, r14, r9 + + // r4, r12, r6, r7, r8, r5, r10, r11 + + vmov.w r14, s12 + mul.w r6, r6, r14 + mul.w r7, r7, r14 + vmov.w r14, s13 + mul.w r10, r10, r14 + mul.w r11, r11, r14 + + barrett_32 r4, r2, r3, r14 + barrett_32 r12, r2, r3, r14 + barrett_32 r6, r2, r3, r14 + barrett_32 r7, r2, r3, r14 + barrett_32 r8, r2, r3, r14 + barrett_32 r5, r2, r3, r14 + barrett_32 r10, r2, r3, r14 + barrett_32 r11, r2, r3, r14 + + addSub4 r4, r8, r6, r10, r12, r5, r7, r11 + + vmov.w s17, s18, r4, r12 + vmov.w s19, s20, r6, r7 + vmov.w s21, s22, r8, r5 + vmov.w s23, s24, r10, r11 + + ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(2*0*width), #(2*1*width), #(2*2*width), #(2*3*width), #(2*4*width), #(2*5*width), #(2*6*width), #(2*7*width) + + addSub4 r4, r5, 
r6, r7, r8, r9, r10, r11 + vmov.w r14, s2 + mul.w r5, r5, r14 + vmov.w r14, s4 + mul.w r9, r9, r14 + addSub2 r4, r6, r8, r10 + vmov.w r14, s3 + mla.w r12, r7, r14, r5 + mls.w r7, r7, r14, r5 + vmov.w r14, s5 + mla.w r5, r11, r14, r9 + mls.w r11, r11, r14, r9 + + // r4, r12, r6, r7, r8, r5, r10, r11 + + vmov.w r14, s10 + mul.w r6, r6, r14 + mul.w r7, r7, r14 + vmov.w r14, s11 + mul.w r10, r10, r14 + mul.w r11, r11, r14 + + barrett_32 r4, r2, r3, r14 + barrett_32 r12, r2, r3, r14 + barrett_32 r6, r2, r3, r14 + barrett_32 r7, r2, r3, r14 + barrett_32 r8, r2, r3, r14 + barrett_32 r5, r2, r3, r14 + barrett_32 r10, r2, r3, r14 + barrett_32 r11, r2, r3, r14 + + addSub4 r4, r8, r6, r10, r12, r5, r7, r11 + vmov.w r14, s14 + mul.w r8, r8, r14 + mul.w r5, r5, r14 + mul.w r10, r10, r14 + mul.w r11, r11, r14 + vmov.w r14, s16 + final_butterfly r12, r9, s18, r14 + str.w r12, [r0, #(2*1*width)] + str.w r9, [r0, #(2*9*width)] + final_butterfly r6, r9, s19, r14 + str.w r6, [r0, #(2*2*width)] + str.w r9, [r0, #(2*10*width)] + final_butterfly r7, r9, s20, r14 + str.w r7, [r0, #(2*3*width)] + str.w r9, [r0, #(2*11*width)] + vmov.w r12, s15 + final_butterfly2 r8, r6, r9, s21, r14, r12 + str.w r6, [r0, #(2*4*width)] + str.w r9, [r0, #(2*12*width)] + final_butterfly2 r5, r6, r9, s22, r14, r12 + str.w r6, [r0, #(2*5*width)] + str.w r9, [r0, #(2*13*width)] + final_butterfly2 r10, r6, r9, s23, r14, r12 + str.w r6, [r0, #(2*6*width)] + str.w r9, [r0, #(2*14*width)] + final_butterfly2 r11, r6, r9, s24, r14, r12 + str.w r6, [r0, #(2*7*width)] + str.w r9, [r0, #(2*15*width)] + final_butterfly r4, r9, s17, r14 + str.w r9, [r0, #(2*8*width)] + str.w r4, [r0], #width + +// ================ + + ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(2*8*width), #(2*9*width), #(2*10*width), #(2*11*width), #(2*12*width), #(2*13*width), #(2*14*width), #(2*15*width) + + addSub4 r4, r5, r6, r7, r8, r9, r10, r11 + vmov.w r14, s6 + mul.w r5, r5, r14 + vmov.w r14, s8 + mul.w r9, r9, r14 + addSub2 r4, r6, 
r8, r10 + vmov.w r14, s7 + mla.w r12, r7, r14, r5 + mls.w r7, r7, r14, r5 + vmov.w r14, s9 + mla.w r5, r11, r14, r9 + mls.w r11, r11, r14, r9 + + // r4, r12, r6, r7, r8, r5, r10, r11 + + vmov.w r14, s12 + mul.w r6, r6, r14 + mul.w r7, r7, r14 + vmov.w r14, s13 + mul.w r10, r10, r14 + mul.w r11, r11, r14 + + barrett_32 r4, r2, r3, r14 + barrett_32 r12, r2, r3, r14 + barrett_32 r6, r2, r3, r14 + barrett_32 r7, r2, r3, r14 + barrett_32 r8, r2, r3, r14 + barrett_32 r5, r2, r3, r14 + barrett_32 r10, r2, r3, r14 + barrett_32 r11, r2, r3, r14 + + addSub4 r4, r8, r6, r10, r12, r5, r7, r11 + + vmov.w s17, s18, r4, r12 + vmov.w s19, s20, r6, r7 + vmov.w s21, s22, r8, r5 + vmov.w s23, s24, r10, r11 + + ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(2*0*width), #(2*1*width), #(2*2*width), #(2*3*width), #(2*4*width), #(2*5*width), #(2*6*width), #(2*7*width) + + addSub4 r4, r5, r6, r7, r8, r9, r10, r11 + vmov.w r14, s2 + mul.w r5, r5, r14 + vmov.w r14, s4 + mul.w r9, r9, r14 + addSub2 r4, r6, r8, r10 + vmov.w r14, s3 + mla.w r12, r7, r14, r5 + mls.w r7, r7, r14, r5 + vmov.w r14, s5 + mla.w r5, r11, r14, r9 + mls.w r11, r11, r14, r9 + + // r4, r12, r6, r7, r8, r5, r10, r11 + + vmov.w r14, s10 + mul.w r6, r6, r14 + mul.w r7, r7, r14 + vmov.w r14, s11 + mul.w r10, r10, r14 + mul.w r11, r11, r14 + + barrett_32 r4, r2, r3, r14 + barrett_32 r12, r2, r3, r14 + barrett_32 r6, r2, r3, r14 + barrett_32 r7, r2, r3, r14 + barrett_32 r8, r2, r3, r14 + barrett_32 r5, r2, r3, r14 + barrett_32 r10, r2, r3, r14 + barrett_32 r11, r2, r3, r14 + + addSub4 r4, r8, r6, r10, r12, r5, r7, r11 + vmov.w r14, s14 + mul.w r8, r8, r14 + mul.w r5, r5, r14 + mul.w r10, r10, r14 + mul.w r11, r11, r14 + vmov.w r14, s16 + + final_butterfly r12, r9, s18, r14 + str.w r12, [r0, #(2*1*width)] + str.w r9, [r0, #(2*9*width)] + final_butterfly r6, r9, s19, r14 + str.w r6, [r0, #(2*2*width)] + str.w r9, [r0, #(2*10*width)] + final_butterfly r7, r9, s20, r14 + str.w r7, [r0, #(2*3*width)] + str.w r9, [r0, 
#(2*11*width)] + vmov.w r12, s15 + final_butterfly2 r8, r6, r9, s21, r14, r12 + str.w r6, [r0, #(2*4*width)] + str.w r9, [r0, #(2*12*width)] + final_butterfly2 r5, r6, r9, s22, r14, r12 + str.w r6, [r0, #(2*5*width)] + str.w r9, [r0, #(2*13*width)] + final_butterfly2 r10, r6, r9, s23, r14, r12 + str.w r6, [r0, #(2*6*width)] + str.w r9, [r0, #(2*14*width)] + final_butterfly2 r11, r6, r9, s24, r14, r12 + str.w r6, [r0, #(2*7*width)] + str.w r9, [r0, #(2*15*width)] + final_butterfly r4, r9, s17, r14 + str.w r9, [r0, #(2*8*width)] + str.w r4, [r0], #31*width + +// ================ + + vmov.w r12, s1 + cmp.w r0, r12 + bne.w _ifnt_7_6_5_4 + + sub.w r0, r0, #256*width + + mov.w r14, #0 + + add.w r1, r0, #32*width + _ifnt_0_1_2: + +.rept 2 + + ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(32*0*width), #(32*1*width), #(32*2*width), #(32*3*width), #(32*4*width), #(32*5*width), #(32*6*width), #(32*7*width) + + addSub4 r4, r5, r6, r7, r8, r9, r10, r11 + + addSub2 r4, r6, r8, r10 + FNT_CT_ibutterfly r5, r7, 4 + FNT_CT_ibutterfly r9, r11, 4 + + addSub1 r4, r8 + barrett_32 r9, r2, r3, r12 + FNT_CT_ibutterfly r5, r9, 6 + FNT_CT_ibutterfly r6, r10, 4 + FNT_CT_ibutterfly r7, r11, 2 + + barrett_32 r6, r2, r3, r12 + barrett_32 r7, r2, r3, r12 + sub.w r4, r14, r4, lsl #1 + neg.w r5, r5 + lsl.w r6, r6, #7 + lsl.w r7, r7, #6 + lsl.w r8, r8, #5 + lsl.w r9, r9, #4 + lsl.w r10, r10, #3 + lsl.w r11, r11, #2 + + barrett_32 r4, r2, r3, r12 + barrett_32 r5, r2, r3, r12 + barrett_32 r6, r2, r3, r12 + barrett_32 r7, r2, r3, r12 + barrett_32 r8, r2, r3, r12 + barrett_32 r9, r2, r3, r12 + barrett_32 r10, r2, r3, r12 + barrett_32 r11, r2, r3, r12 + + ldrstrvecjump str.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(32*1*width), #(32*2*width), #(32*3*width), #(32*4*width), #(32*5*width), #(32*6*width), #(32*7*width), #width + +.endr + + cmp.w r0, r1 + bne.w _ifnt_0_1_2 + vpop.w {s16-s24} + pop.w {r4-r11, pc} diff --git a/crypto_sign/dilithium2/m4f/macros.i b/crypto_sign/dilithium2/m4f/macros.i 
new file mode 100644 index 0000000..25d98c2 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/macros.i @@ -0,0 +1,191 @@ +#ifndef MACROS_I +#define MACROS_I +// 3 +.macro montgomery_mul_32 a, b, Qprime, Q, tmp, tmp2 + smull \tmp, \a, \a, \b + mul \tmp2, \tmp, \Qprime + smlal \tmp, \a, \tmp2, \Q +.endm + +// 2 +.macro addSub1 c0, c1 + add.w \c0, \c1 + sub.w \c1, \c0, \c1, lsl #1 +.endm + +// 3 +.macro addSub2 c0, c1, c2, c3 + add \c0, \c1 + add \c2, \c3 + sub.w \c1, \c0, \c1, lsl #1 + sub.w \c3, \c2, \c3, lsl #1 +.endm + +// 6 +.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7 + add \c0, \c1 + add \c2, \c3 + add \c4, \c5 + add \c6, \c7 + sub.w \c1, \c0, \c1, lsl #1 + sub.w \c3, \c2, \c3, lsl #1 + sub.w \c5, \c4, \c5, lsl #1 + sub.w \c7, \c6, \c7, lsl #1 +.endm + +.macro _2_layer_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2 + montgomery_mul_32 \c2, \zeta0, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c2, \c1, \c3 + + montgomery_mul_32 \c1, \zeta1, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c1, \c2, \c3 +.endm + +.macro _2_layer_inv_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2 + montgomery_mul_32 \c1, \zeta0, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c1, \c2, \c3 + + montgomery_mul_32 \c2, \zeta1, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c2, \c1, \c3 +.endm + +.macro _3_layer_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 + vmov.w \twiddle, \xi0 + montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 + + vmov.w \twiddle, \xi1 + 
montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi2 + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7 + + vmov.w \twiddle, \xi3 + montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi4 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi5 + montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi6 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 +.endm + +.macro _3_layer_inv_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 + vmov.w \twiddle, \xi0 + montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 + + vmov.w \twiddle, \xi1 + montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi2 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7 + + vmov.w \twiddle, \xi3 + montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi4 + montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi5 + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 + vmov.w \twiddle, \xi6 + montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 +.endm + +/************************************************************ +* Name: _3_layer_inv_butterfly_light_fast_first +* +* 
Description: upper half of 3-layer inverse butterfly +* defined over X^8 - 1 +* +* Input: (c4, c1, c6, c3) = coefficients on the upper half; +* (xi0, xi1, xi2, xi3, xi4, xi5, xi6) = +* ( 1, 1, w_4, 1, w_8, w_4, w_8^3) in +* Montgomery domain +* +* Symbols: R = 2^32 +* +* Constants: Qprime = -MOD^{-1} mod^{+-} R, Q = MOD +* +* Output: +* c4 = c4 + c1 + (c6 + c3) +* c5 = (c4 - c1) w_8 + (c6 - c3) w_8^3 (xi4 and xi6; one Montgomery reduction for the sum) +* c6 = c4 + c1 - (c6 + c3) +* c7 = (c4 - c1) w_8^3 + (c6 - c3) w_8 (xi6 and xi4; one Montgomery reduction for the sum) +************************************************************/ +// 15 +.macro _3_layer_inv_butterfly_light_fast_first c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 // only xi4 and xi6 are read; c0 and c2 are scratch low words; twiddle, tmp, tmp2 are clobbered + addSub2 \c4, \c1, \c6, \c3 + addSub1 \c4, \c6 + + vmov.w \tmp, \xi4 + vmov.w \tmp2, \xi6 + + smull.w \c0, \c5, \c1, \tmp + smlal.w \c0, \c5, \c3, \tmp2 // accumulate both products before the single reduction below + mul.w \twiddle, \c0, \Qprime + smlal.w \c0, \c5, \twiddle, \Q + + smull.w \c2, \c7, \c1, \tmp2 + smlal.w \c2, \c7, \c3, \tmp + mul.w \twiddle, \c2, \Qprime + smlal.w \c2, \c7, \twiddle, \Q +.endm + +/************************************************************ +* Name: _3_layer_inv_butterfly_light_fast_second +* +* Description: lower half of 3-layer inverse butterfly +* defined over X^8 - 1, and the 2nd +* layer of butterflies +* +* Input: +* (c4, c5, c6, c7) = results of the upper half; +* (c0, c1, c2, c3) = coefficients on the lower half; +* (xi0, xi1, xi2, xi3, xi4, xi5, xi6) = +* ( 1, 1, w_4, 1, w_8, w_4, w_8^3) in +* Montgomery domain +* +* Symbols: R = 2^32 +* +* Constants: Qprime = -MOD^{-1} mod^{+-} R, Q = MOD +* +* Output: (normal order) +* c0 = c0 + c1 + (c2 + c3) + ( c4 + c5 + (c6 + c7) ) +* c1 = (c0 - c1) w3 + (c2 - c3) w4 + ( (c4 - c5) w5 + (c6 - c7) w6 ) +* c2 = ( c0 + c1 - (c2 + c3)) w1 + (( c4 + c5 - (c6 + c7) ) w2) +* c3 = ((c0 - c1) w3 - (c2 - c3) w4) w1 + (((c4 - c5) w5 - (c6 - c7) w6) w2) +* c4 = c0 + c1 - (c2 + c3) - ( c4 + c5 + (c6 + c7) ) w0 +* c5 = (c0 - c1) w3 + (c2 - c3) w4 - ( (c4 - c5) w5 + (c6 - c7) w6 ) w0 +*
c6 = ( c0 + c1 - (c2 + c3)) w1 - (( c4 + c5 - (c6 + c7) ) w2) w0 +* c7 = ((c0 - c1) w3 - (c2 - c3) w4) w1 - (((c4 - c5) w5 - (c6 - c7) w6) w2) w0 +************************************************************/ +// 19 +.macro _3_layer_inv_butterfly_light_fast_second c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 // expects c4-c7 as left by the *_first macro; only xi2 is read here + addSub2 \c0, \c1, \c2, \c3 + + vmov.w \twiddle, \xi2 + montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 + addSub2 \c0, \c2, \c1, \c3 + + montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 // twiddle register still holds xi2 + + addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 +.endm + +#endif /* MACROS_I */ diff --git a/crypto_sign/dilithium2/m4f/macros_fnt.i b/crypto_sign/dilithium2/m4f/macros_fnt.i new file mode 100644 index 0000000..25903e4 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/macros_fnt.i @@ -0,0 +1,158 @@ +// 2 +.macro ldrstr2 ldrstr, target, c0, c1, mem0, mem1 // ldrstr is the mnemonic emitted for each access (e.g. ldr or str) + \ldrstr \c0, [\target, \mem0] + \ldrstr \c1, [\target, \mem1] +.endm + +// 2 +.macro ldrstr2jump ldrstr, target, c0, c1, mem1, jump // c0 is accessed last, at [target] with post-increment of target by jump + \ldrstr \c1, [\target, \mem1] + \ldrstr \c0, [\target], \jump +.endm + +// 4 +.macro ldrstr4 ldrstr, target, c0, c1, c2, c3, mem0, mem1, mem2, mem3 + \ldrstr \c0, [\target, \mem0] + \ldrstr \c1, [\target, \mem1] + \ldrstr \c2, [\target, \mem2] + \ldrstr \c3, [\target, \mem3] +.endm + +// 4 +.macro ldrstr4jump ldrstr, target, c0, c1, c2, c3, mem1, mem2, mem3, jump // c0 is accessed last, at [target] with post-increment of target by jump + \ldrstr \c1, [\target, \mem1] + \ldrstr \c2, [\target, \mem2] + \ldrstr \c3, [\target, \mem3] + \ldrstr \c0, [\target], \jump +.endm + +// 8 +.macro ldrstrvec ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem0, mem1, mem2, mem3, mem4, mem5, mem6, mem7 + ldrstr4 \ldrstr, \target, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 + ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7 +.endm + +// 8 +.macro ldrstrvecjump ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem1, mem2, mem3, mem4, mem5, mem6, mem7, jump // no mem0 parameter: c0 uses the post-incremented access + ldrstr4 \ldrstr, \target, \c4,
\c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7 + ldrstr4jump \ldrstr, \target, \c0, \c1, \c2, \c3, \mem1, \mem2, \mem3, \jump +.endm + + + +.macro addSub1 c0, c1 // NOTE(review): addSub1/addSub2/addSub4 repeat the definitions in macros.i; including both files together would presumably hit a macro redefinition - confirm + add.w \c0, \c1 + sub.w \c1, \c0, \c1, lsl #1 +.endm + +.macro addSub2 c0, c1, c2, c3 + add \c0, \c1 + add \c2, \c3 + sub.w \c1, \c0, \c1, lsl #1 + sub.w \c3, \c2, \c3, lsl #1 +.endm + +.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7 + add \c0, \c1 + add \c2, \c3 + add \c4, \c5 + add \c6, \c7 + sub.w \c1, \c0, \c1, lsl #1 + sub.w \c3, \c2, \c3, lsl #1 + sub.w \c5, \c4, \c5, lsl #1 + sub.w \c7, \c6, \c7, lsl #1 +.endm + +// 2 +.macro barrett_32 a, Qbar, Q, tmp // a <- a - tmp*Q with tmp = rounded upper 32 bits of a*Qbar; clobbers tmp + smmulr.w \tmp, \a, \Qbar + mls.w \a, \tmp, \Q, \a +.endm + +.macro FNT_CT_butterfly c0, c1, logW // (c0, c1) <- (c0 + c1*2^logW, c0 - c1*2^logW) + add.w \c0, \c0, \c1, lsl #\logW + sub.w \c1, \c0, \c1, lsl #(\logW+1) +.endm + +.macro shift_subAdd c0, c1, shlv // (c0, c1) <- (c0 - c1*2^shlv, c0 + c1*2^shlv) + sub.w \c0, \c0, \c1, lsl #(\shlv) + add.w \c1, \c0, \c1, lsl #(\shlv+1) +.endm + +.macro FNT_CT_ibutterfly c0, c1, shlv + shift_subAdd \c0, \c1, \shlv +.endm + +// 46 +.macro _3_layer_CT_32_FNT c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 + vmov.w \twiddle, \xi0 + + // values on entry: c0, c1, c2, c3, c4, c5, c6, c7 (there is no c8 parameter) + // 0,4 + mla \tmp, \c4, \twiddle, \c0 + mls \c4, \c4, \twiddle, \c0 + + // 1,5 + mla \c0, \c5, \twiddle, \c1 + mls \c5, \c5, \twiddle, \c1 + + // 2,6 + mla \c1, \c6, \twiddle, \c2 + mls \c6, \c6, \twiddle, \c2 + + // 3,7 + mla \c2, \c7, \twiddle, \c3 + mls \c7, \c7, \twiddle, \c3 + + // tmp, c0, c1, c2, c4, c5, c6, c7 + + barrett_32 \tmp, \Qprime, \Q, \c3 // c3 is free after the 3,7 step and serves as scratch; Qprime is passed as the Qbar argument + barrett_32 \c0, \Qprime, \Q, \c3 + barrett_32 \c1, \Qprime, \Q, \c3 + barrett_32 \c2, \Qprime, \Q, \c3 + barrett_32 \c4, \Qprime, \Q, \c3 + barrett_32 \c5, \Qprime, \Q, \c3 + barrett_32 \c6, \Qprime, \Q, \c3 + barrett_32 \c7, \Qprime, \Q, \c3 + + vmov.w \twiddle, \xi1 + // 0,2 + mla \tmp2, \c1, \twiddle, \tmp + mls \c3, \c1, \twiddle, \tmp + + // 1,3 + mla \tmp, \c2, \twiddle, \c0 + mls \c0, \c2, \twiddle, \c0 + + vmov.w \twiddle, \xi2 + +
// 4,6 + mla \c2, \c6, \twiddle, \c4 + mls \c1, \c6, \twiddle, \c4 + + // 5,7 + mla \c6, \c7, \twiddle, \c5 + mls \c7, \c7, \twiddle, \c5 + + // tmp2, tmp, c3, c0 | c2, c6, c1, c7 + + // 4,5 + vmov.w \twiddle, \xi5 + mla \c4, \c6, \twiddle, \c2 + mls \c5, \c6, \twiddle, \c2 + + // 6,7 + vmov.w \twiddle, \xi6 + mla \c6, \c7, \twiddle, \c1 + mls \c7, \c7, \twiddle, \c1 + + // 2,3 + vmov.w \twiddle, \xi4 + mla \c2, \c0, \twiddle, \c3 + mls \c3, \c0, \twiddle, \c3 + + // 0,1 + vmov.w \twiddle, \xi3 + mla \c0, \tmp, \twiddle, \tmp2 + mls \c1, \tmp, \twiddle, \tmp2 +.endm \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4f/ntt.S b/crypto_sign/dilithium2/m4f/ntt.S new file mode 100644 index 0000000..bfd5f7a --- /dev/null +++ b/crypto_sign/dilithium2/m4f/ntt.S @@ -0,0 +1,402 @@ +// based on code by: Markus Krausz (18.03.18) +// date 23.07.21: Now licensed under CC0 with permission of the authors. + +.syntax unified +#include "macros.i" + +// This code uses long multiplies (SMULL/SMLAL in macros.i; the original note said UMULL) - which are constant time on the M4, but not on the M3 +// Make sure that this code is never used on an M3 +smlad r0,r0,r0,r0 // NOTE(review): sits outside any labelled function; presumably a build-time guard that fails to assemble for an M3 - confirm + +// ############################## +// ########## NTT ########## +// ############################## + +//void pqcrystals_dilithium_ntt(int32_t p[N]); +.global pqcrystals_dilithium_ntt +.type pqcrystals_dilithium_ntt,%function +.align 2 +pqcrystals_dilithium_ntt: + //bind aliases + ptr_p .req R0 + ptr_zeta .req R1 + zeta .req R1 // shares R1 with ptr_zeta + qinv .req R2 + q .req R3 + cntr .req R4 + pol4 .req R4 // shares R4 with cntr + pol0 .req R5 + pol1 .req R6 + pol2 .req R7 + pol3 .req R8 + temp_h .req R9 + temp_l .req R10 + zeta0 .req R11 + zeta1 .req R12 + zeta2 .req R14 + pol5 .req R11 // pol5-pol7 reuse the zeta0-zeta2 registers + pol6 .req R12 + pol7 .req R14 + + //preserve registers + push {R4-R11, R14} + + //load constants, ptr + ldr.w qinv, inv_ntt_asm_smull_qinv //-qinv_signed + ldr.w q, inv_ntt_asm_smull_q + + //stage 1 - 3 + .equ distance, 512 + .equ strincr, 4 + + ldr ptr_zeta, =#zetas_new332 + vldm ptr_zeta!, {s2-s8} + vmov s0, ptr_zeta // s0 keeps the running table pointer while R1 is reused as the twiddle register + + add.w
temp_l, ptr_p, #32*strincr // 32 iterations (16 loop passes, each unrolled twice by .rept 2) + vmov s9, temp_l // bound parked in s9 because temp_l is also passed to the macro as scratch + 1: + .rept 2 + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol5, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol7, [ptr_p, #7*distance/4] + + _3_layer_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p], #strincr + .endr + vmov temp_l, s9 + cmp.w ptr_p, temp_l + bne 1b + + sub ptr_p, #32*4 // rewind to the start of the polynomial + + // stage 4 - 6 + .equ distance, 64 + add.w temp_l, ptr_p, #8*112+8*4*4 // 8 iterations + vmov s9, temp_l + 1: + add.w temp_l, ptr_p, #4*strincr // 4 iterations + vmov s10, temp_l + vmov ptr_zeta, s0 + vldm ptr_zeta!, {s2-s8} // next seven twiddles for this block + vmov s0, ptr_zeta + 2: + .rept 2 + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol5, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol7, [ptr_p, #7*distance/4] + + _3_layer_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p], #4 + .endr + vmov temp_l, s10 + cmp.w ptr_p, temp_l + bne 2b + + add.w ptr_p, #112 // 16 bytes were stepped above; skip to the next 128-byte block + vmov temp_l, s9 + cmp.w ptr_p, temp_l + bne 1b + + sub ptr_p, #4*4*8+112*8 // rewind to the start of the polynomial + vmov ptr_zeta, s0 + //stage 7
and 8 + add cntr, ptr_p, #1024 // 64 iterations + 1: + ldr.w zeta1, [ptr_zeta, #4] //z128,..., z254 + ldr.w zeta2, [ptr_zeta, #8] //z129,..., z255 + ldr zeta0, [ptr_zeta], #12 //z64, ..., z127 + ldr.w pol0, [ptr_p] //1*4 + ldr.w pol1, [ptr_p, #4] + ldr.w pol2, [ptr_p, #8] + ldr.w pol3, [ptr_p, #12] + + _2_layer_CT_32 pol0, pol1, pol2, pol3, zeta0, zeta1, zeta2, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #4] + str.w pol2, [ptr_p, #8] + str.w pol3, [ptr_p, #12] + str pol0, [ptr_p], #16 + + cmp.w cntr, ptr_p + bne.w 1b + + //restore registers + pop {R4-R11, PC} // the R14 value pushed on entry is popped into PC, which returns to the caller + + //unbind aliases (NOTE(review): zeta and pol4-pol7 are bound above but have no .unreq here) + .unreq ptr_p + .unreq ptr_zeta + .unreq qinv + .unreq q + .unreq cntr + .unreq pol0 + .unreq pol1 + .unreq pol2 + .unreq pol3 + .unreq temp_h + .unreq temp_l + .unreq zeta0 + .unreq zeta1 + .unreq zeta2 + +.ltorg +// ############################## +// ########## NTT^-1 ########## +// ############################## + +//void pqcrystals_dilithium_invntt_tomont(int32_t p[N]); +.global pqcrystals_dilithium_invntt_tomont +.type pqcrystals_dilithium_invntt_tomont,%function +.align 2 +pqcrystals_dilithium_invntt_tomont: + //bind aliases + ptr_p .req R0 + ptr_zeta .req R1 + zeta .req R1 // shares R1 with ptr_zeta + qinv .req R2 + q .req R3 + cntr .req R4 + pol4 .req R4 // shares R4 with cntr + pol0 .req R5 + pol1 .req R6 + pol2 .req R7 + pol3 .req R8 + temp_h .req R9 + temp_l .req R10 + zeta0 .req R11 + zeta1 .req R12 + zeta2 .req R14 + pol5 .req R11 // pol5-pol7 reuse the zeta0-zeta2 registers + pol6 .req R12 + pol7 .req R14 + + //preserve registers + push {R4-R11, R14} + + //load constants, ptr + ldr.w qinv, inv_ntt_asm_smull_qinv //-qinv_signed + ldr.w q, inv_ntt_asm_smull_q + + //stage 1 - 3 + .equ distance, 16 + .equ strincr, 32 + + ldr ptr_zeta, =#zetas_new332inv + vldm ptr_zeta!, {s2-s8} + vmov s0, ptr_zeta // s0 keeps the running table pointer while R1 is reused as the twiddle register + + add.w temp_l, ptr_p, #32*strincr // 32 iterations + vmov s9, temp_l + 1: + ldr.w pol4, [ptr_p, #4*distance/4] // upper four coefficients go to pol4, pol1, pol6, pol3 + ldr.w pol1, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol3, [ptr_p, #7*distance/4] + _3_layer_inv_butterfly_light_fast_first pol0,
pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + ldr.w pol0, [ptr_p] // lower four coefficients are loaded only now: pol0-pol3 were scratch or upper-half inputs in the first macro + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p], #strincr + vmov temp_l, s9 + cmp.w ptr_p, temp_l + bne.w 1b + + sub ptr_p, #32*strincr // rewind to the start of the polynomial + + // stage 4 - 6 + .equ distance, 128 + .equ strincr, 256 + + // iteration 0 (uses the *_light_fast_* macros; iterations 1-7 below use _3_layer_inv_CT_32) + movw temp_l, #4 // NOTE(review): overwritten by the add.w on the next line before being read - appears to be dead + add.w temp_l, ptr_p, #4*256 // 4 iterations + vmov s10, temp_l + + vmov ptr_zeta, s0 + vldm ptr_zeta!, {s2-s8} + vmov s0, ptr_zeta + + 2: + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol1, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol3, [ptr_p, #7*distance/4] + _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p] + add.w ptr_p, #strincr + + vmov temp_l, s10 + cmp.w temp_l, ptr_p + bne.w 2b + + sub.w ptr_p, #4*256-4 // undo the four 256-byte strides and step 4 bytes to the next column + + // iteration 1-7 + add.w temp_l,
ptr_p, #7*4 // 7 iterations + vmov s9, temp_l + 1: + add.w temp_l, ptr_p, #4*strincr // 4 iterations + vmov s10, temp_l + + vmov ptr_zeta, s0 + vldm ptr_zeta!, {s2-s8} // next seven twiddles for this column + vmov s0, ptr_zeta + 2: + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #1*distance/4] + ldr.w pol2, [ptr_p, #2*distance/4] + ldr.w pol3, [ptr_p, #3*distance/4] + ldr.w pol4, [ptr_p, #4*distance/4] + ldr.w pol5, [ptr_p, #5*distance/4] + ldr.w pol6, [ptr_p, #6*distance/4] + ldr.w pol7, [ptr_p, #7*distance/4] + + _3_layer_inv_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #1*distance/4] + str.w pol2, [ptr_p, #2*distance/4] + str.w pol3, [ptr_p, #3*distance/4] + str.w pol4, [ptr_p, #4*distance/4] + str.w pol5, [ptr_p, #5*distance/4] + str.w pol6, [ptr_p, #6*distance/4] + str.w pol7, [ptr_p, #7*distance/4] + str.w pol0, [ptr_p] + add.w ptr_p, #strincr + + vmov temp_l, s10 + cmp.w ptr_p, temp_l + bne 2b + sub.w ptr_p, #4*strincr-4 // undo the four strides and step 4 bytes to the next column + + vmov temp_l, s9 + cmp.w temp_l, ptr_p + bne 1b + + sub ptr_p, #8*4 // rewind the eight 4-byte column steps to the start of the polynomial + vmov ptr_zeta, s0 + + //stage 7 and 8 + .equ strincr, 4 + + add.w cntr, ptr_p, #64*strincr // 64 iterations + vmov s9, cntr // bound parked in s9 because cntr (R4) is reused as a multiplier below + 1: + ldr.w zeta1, [ptr_zeta, #4] + ldr.w zeta2, [ptr_zeta, #8] + ldr zeta0, [ptr_zeta], #12 + ldr.w pol0, [ptr_p] + ldr.w pol1, [ptr_p, #256] + ldr.w pol2, [ptr_p, #512] + ldr.w pol3, [ptr_p, #768] + + _2_layer_inv_CT_32 pol0, pol1, pol2, pol3, zeta0, zeta1, zeta2, qinv, q, temp_h, temp_l + + ldr.w zeta1, [ptr_zeta, #4] // four more table words, one per coefficient (NOTE(review): presumably the final scaling factors - confirm) + ldr.w zeta2, [ptr_zeta, #8] + ldr.w zeta0, [ptr_zeta, #12] + ldr.w cntr, [ptr_zeta], #16 + montgomery_mul_32 pol0, cntr, qinv, q, temp_h, temp_l + montgomery_mul_32 pol1, zeta1, qinv, q, temp_h, temp_l + montgomery_mul_32 pol2, zeta2, qinv, q, temp_h, temp_l + montgomery_mul_32 pol3, zeta0, qinv, q, temp_h, temp_l + + str.w pol1, [ptr_p, #256] + str.w pol2, [ptr_p, #512] + str.w pol3, [ptr_p, #768] + str pol0, [ptr_p], #strincr + + vmov cntr, s9 // reload the loop bound that was overwritten above + cmp.w cntr, ptr_p + bne.w 1b + +
//restore registers + pop {R4-R11, PC} + + //unbind aliases + .unreq ptr_p + .unreq ptr_zeta + .unreq qinv + .unreq q + .unreq cntr + .unreq pol0 + .unreq pol1 + .unreq pol2 + .unreq pol3 + .unreq temp_h + .unreq temp_l + .unreq zeta0 + .unreq zeta1 + .unreq zeta2 + +.align 2 +inv_ntt_asm_smull_qinv: +.word 0xfc7fdfff +.align 2 +inv_ntt_asm_smull_q: +.word 8380417 + +.section .rodata + +.type zetas_new332, %object +.align 2 +zetas_new332: +.word 25847, -2608894, -518909, 237124, -777960, -876248, 466468, 1826347, 2725464, 1024112, 2706023, 95776, 3077325, 3530437, 2353451, -1079900, 3585928, -1661693, -3592148, -2537516, 3915439, -359251, -549488, -1119584, -3861115, -3043716, 3574422, -2867647, -2091905, 2619752, -2108549, 3539968, -300467, 2348700, -539299, 3119733, -2118186, -3859737, -1699267, -1643818, 3505694, -3821735, -2884855, -1399561, -3277672, 3507263, -2140649, -1600420, 3699596, 3111497, 1757237, -19422, 811944, 531354, 954230, 3881043, 2680103, 4010497, 280005, 3900724, -2556880, 2071892, -2797779, -3930395, 2091667, 3407706, -1528703, 2316500, 3817976, -3677745, -3342478, 2244091, -3041255, -2446433, -3562462, -1452451, 266997, 2434439, 3475950, -1235728, 3513181, 2176455, -3520352, -3759364, -1585221, -1197226, -3193378, -1257611, 900702, 1859098, 1939314, 909542, 819034, -4083598, 495491, -1613174, -1000202, -43260, -522500, -3190144, -655327, -3122442, -3157330, 2031748, 3207046, -3632928, -3556995, -525098, 126922, -768622, -3595838, 3412210, 342297, 286988, -983419, -2437823, 4108315, 2147896, 3437287, -3342277, 2715295, 1735879, 203044, -2967645, 2842341, 2691481, -3693493, -2590150, 1265009, -411027, 4055324, 1247620, -2477047, 2486353, 1595974, -671102, -3767016, 1250494, -1228525, 2635921, -3548272, -22981, -2994039, 1869119, -1308169, 1903435, -1050970, -381987, -1333058, 1237275, 1349076, -3318210, -1430225, 1852771, -451100, 1312455, -1430430, 3306115, -1962642, -3343383, -1279661, 1917081, 264944, -2546312, -1374803, 508951, 1500165, 
777191, 3097992, 2235880, 3406031, 44288, -542412, -2831860, -1100098, -1671176, -1846953, 904516, -2584293, -3724270, 3958618, 594136, -3776993, -3724342, -2013608, 2432395, -8578, 2454455, -164721, 1653064, 1957272, 3369112, -3249728, 185531, -1207385, 2389356, -3183426, 162844, -210977, 1616392, 3014001, 759969, 810149, 1652634, -1316856, -3694233, -1799107, 189548, -3038916, 3523897, -3553272, 3866901, 269760, 3159746, 2213111, -975884, -1851402, 1717735, 472078, -2409325, -426683, 1723600, -177440, -1803090, 1910376, 1315589, -1667432, -1104333, 1341330, -260646, -3833893, 1285669, -2939036, -2235985, -1584928, -420899, -2286327, -812732, 183443, -976891, -1439742, 1612842, -3545687, -3019102, -554416, 3919660, -3881060, -48306, -1362209, -3628969, 3937738, 1400424, 3839961, -846154, 1976782 +.size zetas_new332,.-zetas_new332 + +.type zetas_new332inv, %object +.align 2 +zetas_new332inv: +.word 4193792, 4193792, -25847, 4193792, 518909, -25847, 2608894, 4193792, 4193792, -25847, 4193792, 518909, -25847, 2608894, -466468, -2680103, -3111497, -280005, 19422, -4010497, -1757237, 518909, -466468, 876248, -2680103, 2884855, -3111497, -3119733, 777960, 2091905, 359251, 2108549, 1119584, -2619752, 549488, -25847, 518909, 2608894, -466468, 777960, 876248, -237124, 876248, 2884855, -3119733, 3277672, 3859737, 1399561, 2118186, 2608894, 777960, -237124, 2091905, -2353451, 359251, -1826347, -237124, -2353451, -1826347, -3585928, -1024112, 1079900, -2725464, 4193792, 4193792, -25847, 41978, 3024400, 3975713, -1225192, 2797779, -3839961, 3628969, -1711436, 3835778, 485110, -3954267, -280005, 2797779, -2071892, -2831100, -2698859, -908040, -2292170, 539299, 1430430, -1852771, -3658785, 3512212, 1859141, -1607594, -2680103, -280005, -4010497, 715005, 1483994, -1045894, -980943, -3699596, 1316856, -759969, -955715, 3677139, 3933849, 2719610, 2108549, 539299, -2348700, 1658328, -1403403, 1775852, -2460465, -3915439, -126922, 3632928, 1067023, 3847594, 4179270, 1652689, -466468, 
-2680103, -3111497, -2953811, -284642, 2507426, -324139, -3881043, -1341330, -1315589, 3990128, -2137097, -4109898, 4092021, 3277672, -3699596, 1600420, 1541634, 3493410, 3487504, 2497815, 2867647, 2477047, 411027, 1654972, 1326223, -2608226, -2752209, 2091905, 2108549, -2619752, 1836700, 2945615, -1908953, 729864, 3821735, -3958618, -904516, 2080615, 1555380, -3471815, -1978758, -3585928, -3915439, 2537516, -892788, -553664, -3095038, 658596, -3530437, 1585221, -2176455, 3355482, -1783485, 2780552, -3623330, 518909, -466468, 876248, -442683, 2523147, -2847660, -3683140, 2556880, 1439742, 812732, 774207, -3168108, 1877157, 3406477, 19422, -3881043, -954230, -214686, -1182619, 2453526, -2201920, 300467, 1308169, 22981, 3614022, 2136260, 1459487, -2233803, 2884855, 3277672, 1399561, 394072, -3933227, 4136064, 156486, 2140649, 3249728, -1653064, 1596950, 633578, 2722529, -554462, 1119584, 2867647, -3574422, 1004840, 191586, 3969463, 1161373, 3592148, 1000202, 4083598, 3189243, 3561667, -3650125, 3490511, 777960, 2091905, 359251, -1829156, -3707725, -661807, 1144558, -531354, 1851402, -3159746, 1543095, -2903948, 1505516, -1500460, 3859737, 3821735, -3505694, -2413330, 3908886, -1203856, 3570263, 3043716, -2715295, -2147896, 758741, 3917553, -2414897, -1613811, -2353451, -3585928, 1079900, 990020, -719638, 2718792, 2260310, 1643818, -3097992, -508951, -783456, -2089539, 2616547, 4060031, -1024112, -3530437, -3077325, -1821861, 1920615, 3988525, 2048419, -95776, 3041255, 3677745, -971504, 2190617, 2311312, -1170082, -25847, 518909, 2608894, 1261528, -2073537, -959585, 3948120, -2071892, 3881060, 3019102, -1342633, -1115066, 3589694, -1929116, -4010497, 2556880, -3900724, 3360006, 1758630, -2306989, -1841637, -2348700, -1349076, 381987, -1699982, 3189673, 3531558, -1210546, -3111497, 19422, -1757237, 2977353, 2612035, -2718155, -1544829, 1600420, 210977, -2389356, 2052582, -2737802, 2383976, -450259, -2619752, 300467, -3539968, 1698289, -4065084, -644023, -1114140, 
2537516, 3157330, 3190144, -993399, -2220524, 2920588, 252737, 876248, 2884855, -3119733, 1490985, -34731, -1212610, -3183745, -954230, 177440, 2409325, -3302554, -2390327, -2749545, 653128, 1399561, 2140649, -3507263, -3745105, -1942293, -3367121, 2734884, -3574422, 3693493, 2967645, 1393803, -2467905, 1786029, -1633410, 359251, 1119584, 549488, -2824548, -1325638, -2207625, -2601586, -3505694, 1100098, -44288, 3478676, -2457992, -1617107, 2551364, 1079900, 3592148, 1661693, 1593929, 318899, -3366475, 3118416, -3077325, -3475950, 1452451, 3772814, 1424805, -3391376, 632820, 2608894, 777960, -237124, 2062597, 4064335, 2197148, -1127864, -3900724, 1584928, -1285669, 2525341, -896437, -1915773, 1792087, -1757237, -531354, -811944, 938441, -674578, 2876837, 3959371, -3539968, 1228525, 671102, 1219592, -3853560, 2630979, -2134676, -3119733, 3859737, 2118186, -2432637, 2746655, 718593, -2353280, -3507263, 8578, 3724342, -34852, 1387945, 358956, 1604944, 549488, 3043716, 3861115, 1290746, 3208584, 2538711, -1442830, 1661693, -1939314, 1257611, -367371, -1308058, 264382, 2614173, -237124, -2353451, -1826347, 2050674, 592050, -138487, 2310528, -811944, 3553272, -189548, -2728561, -4168358, -79, 3844932, 2118186, 1643818, 1699267, 500408, 743398, 879633, -3105206, 3861115, 983419, -3412210, 712597, -23479, 3729381, -1010481, -1826347, -1024112, -2725464, -2361217, -1864453, 3850522, 2337144, 1699267, -264944, 3343383, 3842267, 4181974, -4032642, 3983585, -2725464, -95776, -2706023, 260345, 2526550, 2000777, 987079, -2706023, 1528703, 3930395, -3030761, -3082055, -2374824, 1836319 +.size zetas_new332inv,.-zetas_new332inv diff --git a/crypto_sign/dilithium2/m4f/ntt.h b/crypto_sign/dilithium2/m4f/ntt.h new file mode 100644 index 0000000..731132d --- /dev/null +++ b/crypto_sign/dilithium2/m4f/ntt.h @@ -0,0 +1,13 @@ +#ifndef NTT_H +#define NTT_H + +#include +#include "params.h" + +#define ntt DILITHIUM_NAMESPACE(ntt) +void ntt(int32_t a[N]); + +#define invntt_tomont 
DILITHIUM_NAMESPACE(invntt_tomont) +void invntt_tomont(int32_t a[N]); + +#endif diff --git a/crypto_sign/dilithium2/m4f/packing.c b/crypto_sign/dilithium2/m4f/packing.c new file mode 100644 index 0000000..eb9d9a3 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/packing.c @@ -0,0 +1,390 @@ +#include "params.h" +#include "packing.h" +#include "polyvec.h" +#include "poly.h" +#include + +/************************************************* +* Name: pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). +* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + pk[i] = rho[i]; + pk += SEEDBYTES; /* the K packed t1 polynomials follow rho */ + + for(i = 0; i < K; ++i) + polyt1_pack(pk + i*POLYT1_PACKEDBYTES, &t1->vec[i]); +} + +/************************************************* +* Name: unpack_pk +* +* Description: Unpack public key pk = (rho, t1). +* +* Arguments: - uint8_t rho[]: output byte array for rho +* - polyveck *t1: pointer to output vector t1 +* - const uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[CRYPTO_PUBLICKEYBYTES]) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + rho[i] = pk[i]; + pk += SEEDBYTES; /* the K packed t1 polynomials follow rho */ + + for(i = 0; i < K; ++i) + polyt1_unpack(&t1->vec[i], pk + i*POLYT1_PACKEDBYTES); +} + +/************************************************* +* Name: unpack_pk_t1 +* +* Description: Unpack one polynomial of t1 from public key pk = (rho, t1).
+* +* Arguments: - poly *t1: pointer to output polynomial (element idx of t1) +* - size_t idx: index of the t1 element to unpack +* - const unsigned char pk[]: byte array containing bit-packed pk +**************************************************/ +void unpack_pk_t1(poly *t1, size_t idx, const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) { + pk += SEEDBYTES; /* skip rho */ + polyt1_unpack(t1, pk + idx * POLYT1_PACKEDBYTES); +} + + +/************************************************* +* Name: pack_sk +* +* Description: Bit-pack secret key; bytes are written in the order (rho, key, tr, s1, s2, t0). +* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[TRBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + sk[i] = rho[i]; + sk += SEEDBYTES; + + for(i = 0; i < SEEDBYTES; ++i) + sk[i] = key[i]; + sk += SEEDBYTES; + + for(i = 0; i < TRBYTES; ++i) + sk[i] = tr[i]; + sk += TRBYTES; + + for(i = 0; i < L; ++i) + polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s1->vec[i]); + sk += L*POLYETA_PACKEDBYTES; + + for(i = 0; i < K; ++i) + polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s2->vec[i]); + sk += K*POLYETA_PACKEDBYTES; + + for(i = 0; i < K; ++i) + polyt0_pack(sk + i*POLYT0_PACKEDBYTES, &t0->vec[i]); +} + +/************************************************* +* Name: unpack_sk +* +* Description: Unpack secret key; bytes are read in the order (rho, key, tr, s1, s2, t0).
+* +* Arguments: - uint8_t rho[]: output byte array for rho +* - uint8_t tr[]: output byte array for tr +* - uint8_t key[]: output byte array for key +* - polyveck *t0: pointer to output vector t0 +* - smallpoly s1[]: output array of L small polynomials +* - smallpoly s2[]: output array of K small polynomials +* - const uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[TRBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + smallpoly s1[L], + smallpoly s2[K], + const uint8_t sk[CRYPTO_SECRETKEYBYTES]) +{ + unsigned int i; + + for(i = 0; i < SEEDBYTES; ++i) + rho[i] = sk[i]; + sk += SEEDBYTES; + + for(i = 0; i < SEEDBYTES; ++i) + key[i] = sk[i]; + sk += SEEDBYTES; + + for(i = 0; i < TRBYTES; ++i) + tr[i] = sk[i]; + sk += TRBYTES; + + for(i=0; i < L; ++i) + small_polyeta_unpack(&s1[i], sk + i*POLYETA_PACKEDBYTES); + sk += L*POLYETA_PACKEDBYTES; + + for(i=0; i < K; ++i) + small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES); + sk += K*POLYETA_PACKEDBYTES; + + for(i=0; i < K; ++i) + polyt0_unpack(&t0->vec[i], sk + i*POLYT0_PACKEDBYTES); +} + + +/************************************************* +* Name: pack_sig +* +* Description: Bit-pack signature sig = (c, z, h).
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to challenge hash of length CTILDEBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void pack_sig(uint8_t sig[CRYPTO_BYTES], + const uint8_t c[CTILDEBYTES], + const polyvecl *z, + const polyveck *h) +{ + unsigned int i, j, k; + + for(i=0; i < CTILDEBYTES; ++i) + sig[i] = c[i]; + sig += CTILDEBYTES; + + for(i = 0; i < L; ++i) + polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]); + sig += L*POLYZ_PACKEDBYTES; + + /* Encode h */ + for(i = 0; i < OMEGA + K; ++i) + sig[i] = 0; + + k = 0; + for(i = 0; i < K; ++i) { + for(j = 0; j < N; ++j) + if(h->vec[i].coeffs[j] != 0) + sig[k++] = j; /* NOTE(review): k is not checked against OMEGA here; presumably the caller bounds the hint count - confirm */ + + sig[OMEGA + i] = k; /* running total of hints after polynomial i */ + } +} + +void pack_sig_c(uint8_t sig[CRYPTO_BYTES], + const uint8_t c[CTILDEBYTES]) +{ /* writes only the challenge hash c, at the start of sig */ + unsigned int i; + + for(i=0; i < CTILDEBYTES; ++i) + sig[i] = c[i]; + sig += CTILDEBYTES; /* no effect: sig is a local pointer that is not used afterwards */ +} + +void pack_sig_z(uint8_t sig[CRYPTO_BYTES], + const polyvecl *z) +{ /* writes only z, placed after the CTILDEBYTES challenge bytes */ + unsigned int i; + sig += CTILDEBYTES; + for(i = 0; i < L; ++i) + polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]); +} + + +void pack_sig_h(unsigned char sig[CRYPTO_BYTES], + const poly *h_elem, + const unsigned int idx, + unsigned int *hints_written) +{ /* appends the hint indices of one polynomial (element idx of h); *hints_written carries the running count between calls */ + sig += CTILDEBYTES; + sig += L*POLYZ_PACKEDBYTES; + + // Encode h + for (unsigned int j = 0; j < N; j++) { + if (h_elem->coeffs[j] != 0) { + sig[*hints_written] = (uint8_t)j; /* NOTE(review): no check that *hints_written stays below OMEGA; presumably guaranteed by the caller - confirm */ + (*hints_written)++; + } + } + sig[OMEGA + idx] = (uint8_t)*hints_written; /* running total of hints after polynomial idx */ +} + +void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES], + unsigned int *hints_written) { /* zero-fills the unused hint index slots up to OMEGA */ + sig += CTILDEBYTES; + sig += L * POLYZ_PACKEDBYTES; + while (*hints_written < OMEGA) { + sig[*hints_written] = 0; + (*hints_written)++; + } +} + +/************************************************* +* Name: unpack_sig +* +* Description: Unpack signature sig = (c, z, h).
+* +* Arguments: - uint8_t *c: pointer to output challenge hash (CTILDEBYTES bytes) +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. +**************************************************/ +int unpack_sig(uint8_t c[CTILDEBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[CRYPTO_BYTES]) +{ + unsigned int i, j, k; + + for(i = 0; i < CTILDEBYTES; ++i) + c[i] = sig[i]; + sig += CTILDEBYTES; + + for(i = 0; i < L; ++i) + polyz_unpack(&z->vec[i], sig + i*POLYZ_PACKEDBYTES); + sig += L*POLYZ_PACKEDBYTES; + + /* Decode h */ + k = 0; + for(i = 0; i < K; ++i) { + for(j = 0; j < N; ++j) + h->vec[i].coeffs[j] = 0; + + if(sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) /* running hint totals must not decrease and must not exceed OMEGA */ + return 1; + + for(j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if(j > k && sig[j] <= sig[j-1]) return 1; + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for(j = k; j < OMEGA; ++j) + if(sig[j]) + return 1; + + return 0; +} + +/************************************************* +* Name: unpack_sig_c +* +* Description: Unpack only c from signature sig = (c, z, h). +* +* Arguments: - uint8_t c[]: output challenge hash of length CTILDEBYTES +* - const unsigned char sig[]: byte array containing +* bit-packed signature +* +* Always returns 0; no validation is performed here. +**************************************************/ +int unpack_sig_c(uint8_t c[CTILDEBYTES], const unsigned char sig[CRYPTO_BYTES]) { + for(size_t i = 0; i < CTILDEBYTES; ++i) + c[i] = sig[i]; + sig += CTILDEBYTES; /* no effect: sig is a local pointer that is not used afterwards */ + return 0; +} + +/************************************************* +* Name: unpack_sig_z +* +* Description: Unpack only z from signature sig = (c, z, h).
+*
+* Arguments:   - polyvecl *z: pointer to output vector z
+*              - const unsigned char sig[]: byte array containing
+*                bit-packed signature
+*
+* Always returns 0; no validity check is performed on z here.
+**************************************************/
+int unpack_sig_z(polyvecl *z, const unsigned char sig[CRYPTO_BYTES]) {
+    sig += CTILDEBYTES;
+    for (size_t i = 0; i < L; ++i) {
+        polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES);
+    }
+    return 0;
+}
+
+/*************************************************
+* Name:        unpack_sig_h
+*
+* Description: Unpack a single hint polynomial from signature
+*              sig = (c, z, h). The complete hint encoding is still
+*              validated on every call; only polynomial number idx is
+*              written to the output.
+*
+* Arguments:   - poly *h: pointer to output hint polynomial number idx
+*              - size_t idx: position of the wanted polynomial in the
+*                hint vector
+*              - const unsigned char sig[]: byte array containing
+*                bit-packed signature
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int unpack_sig_h(poly *h, size_t idx, const unsigned char sig[CRYPTO_BYTES]) {
+    sig += CTILDEBYTES;
+    sig += L * POLYZ_PACKEDBYTES;
+
+    /* Decode h */
+    size_t k = 0;
+    for (size_t i = 0; i < K; ++i) {
+        /* Clear the output once, when the requested polynomial is reached */
+        if (i == idx) {
+            for (size_t j = 0; j < N; ++j) {
+                h->coeffs[j] = 0;
+            }
+        }
+
+        /* Cumulative counts must be non-decreasing and at most OMEGA */
+        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) {
+            return 1;
+        }
+
+        for (size_t j = k; j < sig[OMEGA + i]; ++j) {
+            /* Coefficients are ordered for strong unforgeability */
+            if (j > k && sig[j] <= sig[j - 1]) {
+                return 1;
+            }
+            if (i == idx) {
+                h->coeffs[sig[j]] = 1;
+            }
+        }
+
+        k = sig[OMEGA + i];
+    }
+
+    /* Extra indices are zero for strong unforgeability */
+    for (size_t j = k; j < OMEGA; ++j) {
+        if (sig[j]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
diff --git a/crypto_sign/dilithium2/m4f/packing.h b/crypto_sign/dilithium2/m4f/packing.h
new file mode 100644
index 0000000..78ef2c2
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/packing.h
@@ -0,0 +1,68 @@
+#ifndef PACKING_H
+#define PACKING_H
+
+#include
+#include
+#include "params.h"
+#include "polyvec.h"
+#include "smallpoly.h"
+
+#define pack_pk DILITHIUM_NAMESPACE(pack_pk)
+void
pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);
+
+#define pack_sk DILITHIUM_NAMESPACE(pack_sk)
+void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const uint8_t tr[TRBYTES],
+             const uint8_t key[SEEDBYTES],
+             const polyveck *t0,
+             const polyvecl *s1,
+             const polyveck *s2);
+
+#define pack_sig DILITHIUM_NAMESPACE(pack_sig)
+void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h);
+
+#define unpack_pk DILITHIUM_NAMESPACE(unpack_pk)
+void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]);
+
+#define unpack_pk_t1 DILITHIUM_NAMESPACE(unpack_pk_t1)
+void unpack_pk_t1(poly *t1, size_t idx, const unsigned char pk[CRYPTO_PUBLICKEYBYTES]);
+
+#define unpack_sk DILITHIUM_NAMESPACE(unpack_sk)
+void unpack_sk(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               polyveck *t0,
+               smallpoly s1[L],
+               smallpoly s2[K],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
+
+#define unpack_sig DILITHIUM_NAMESPACE(unpack_sig)
+int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]);
+
+
+/* Piecewise unpacking: each call extracts one component of the signature. */
+#define unpack_sig_z DILITHIUM_NAMESPACE(unpack_sig_z)
+int unpack_sig_z(polyvecl *z, const unsigned char sig[CRYPTO_BYTES]);
+#define unpack_sig_h DILITHIUM_NAMESPACE(unpack_sig_h)
+int unpack_sig_h(poly *h, size_t idx, const unsigned char sig[CRYPTO_BYTES]);
+#define unpack_sig_c DILITHIUM_NAMESPACE(unpack_sig_c)
+int unpack_sig_c(uint8_t c[CTILDEBYTES], const unsigned char sig[CRYPTO_BYTES]);
+
+
+/* Piecewise packing: each call writes one component of the signature. */
+#define pack_sig_c DILITHIUM_NAMESPACE(pack_sig_c)
+void pack_sig_c(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES]);
+
+#define pack_sig_z DILITHIUM_NAMESPACE(pack_sig_z)
+void pack_sig_z(uint8_t sig[CRYPTO_BYTES], const polyvecl *z);
+
+#define pack_sig_h DILITHIUM_NAMESPACE(pack_sig_h)
+void pack_sig_h(unsigned char sig[CRYPTO_BYTES],
+                const poly *h_elem,
+                const unsigned int idx,
+                unsigned int *hints_written);
+
+#define pack_sig_h_zero DILITHIUM_NAMESPACE(pack_sig_h_zero)
+void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES],
+                     unsigned int *hints_written);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/params.h b/crypto_sign/dilithium2/m4f/params.h
new file mode 100644
index 0000000..507de46
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/params.h
@@ -0,0 +1,83 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#include "config.h"
+
+/* Prefix applied to every exported symbol of this implementation. */
+#define DILITHIUM_NAMESPACE(s) pqcrystals_dilithium_##s
+
+
+/* Parameters shared by all modes. */
+#define SEEDBYTES 32
+#define CRHBYTES 64
+#define TRBYTES 64
+#define RNDBYTES 32
+#define N 256
+#define Q 8380417
+#define D 13
+#define ROOT_OF_UNITY 1753
+
+/* Per-mode parameters. Only DILITHIUM_MODE values 2, 3 and 5 are handled;
+ * any other value leaves K, L, ETA, ... undefined. */
+#if DILITHIUM_MODE == 2
+#define K 4
+#define L 4
+#define ETA 2
+#define TAU 39
+#define BETA 78
+#define GAMMA1 (1 << 17)
+#define GAMMA2 ((Q-1)/88)
+#define OMEGA 80
+#define CTILDEBYTES 32
+
+#elif DILITHIUM_MODE == 3
+#define K 6
+#define L 5
+#define ETA 4
+#define TAU 49
+#define BETA 196
+#define GAMMA1 (1 << 19)
+#define GAMMA2 ((Q-1)/32)
+#define OMEGA 55
+#define CTILDEBYTES 48
+
+#elif DILITHIUM_MODE == 5
+#define K 8
+#define L 7
+#define ETA 2
+#define TAU 60
+#define BETA 120
+#define GAMMA1 (1 << 19)
+#define GAMMA2 ((Q-1)/32)
+#define OMEGA 75
+#define CTILDEBYTES 64
+
+#endif
+
+/* Sizes in bytes of the bit-packed polynomial encodings. */
+#define POLYT1_PACKEDBYTES  320
+#define POLYT0_PACKEDBYTES  416
+#define POLYVECH_PACKEDBYTES (OMEGA + K)
+
+#if GAMMA1 == (1 << 17)
+#define POLYZ_PACKEDBYTES   576
+#elif GAMMA1 == (1 << 19)
+#define POLYZ_PACKEDBYTES   640
+#endif
+
+#if GAMMA2 == (Q-1)/88
+#define POLYW1_PACKEDBYTES  192
+#elif GAMMA2 == (Q-1)/32
+#define POLYW1_PACKEDBYTES  128
+#endif
+
+#if ETA == 2
+#define POLYETA_PACKEDBYTES  96
+#elif ETA == 4
+#define POLYETA_PACKEDBYTES 128
+#endif
+
+/* Sizes in bytes of public key, secret key and signature. */
+#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
+#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \
+                               + TRBYTES \
+                               + L*POLYETA_PACKEDBYTES \
+                               + K*POLYETA_PACKEDBYTES \
+                               + K*POLYT0_PACKEDBYTES)
+#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4f/pointwise_mont.h b/crypto_sign/dilithium2/m4f/pointwise_mont.h
new file mode 100644
index 0000000..2647a11
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/pointwise_mont.h
@@ -0,0 +1,13 @@
+#ifndef POINTWISE_MONT_H
+#define POINTWISE_MONT_H
+
+#include
+#include "params.h"
+
+
+/* Assembly routines; the symbol names must match the labels in pointwise_mont.s. */
+#define asm_pointwise_montgomery DILITHIUM_NAMESPACE(asm_pointwise_montgomery)
+void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
+#define asm_pointwise_acc_montgomery DILITHIUM_NAMESPACE(asm_pointwise_acc_montgomery)
+void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/pointwise_mont.s b/crypto_sign/dilithium2/m4f/pointwise_mont.s
new file mode 100644
index 0000000..e21125d
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/pointwise_mont.s
@@ -0,0 +1,128 @@
+.syntax unified
+.thumb
+
+// \res receives the upper 32 bits of
+//   pa*pb + ((pa*pb mod 2^32) * qinv mod 2^32) * q.
+// \pa and \pb are clobbered.
+.macro montgomery_multiplication res, pa, pb, q, qinv
+    smull \pa, \res, \pa, \pb
+    mul \pb, \pa, \qinv
+    smlal \pa, \res, \pb, \q
+.endm
+
+
+// void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
+// Stores the Montgomery product of a[i] and b[i] into c[i] for 256 coefficients.
+.global pqcrystals_dilithium_asm_pointwise_montgomery
+.type pqcrystals_dilithium_asm_pointwise_montgomery,%function
+.align 2
+pqcrystals_dilithium_asm_pointwise_montgomery:
+    push.w {r4-r11, r14}
+    c_ptr .req r0
+    a_ptr .req r1
+    b_ptr .req r2
+    qinv  .req r3
+    q     .req r4
+    pa0   .req r5
+    pa1   .req r6
+    pa2   .req r7
+    pb0   .req r8
+    pb1   .req r9
+    pb2   .req r10
+    tmp0  .req r11
+    ctr   .req r12
+    res   .req r14
+
+    // qinv = 0xfc7fdfff; q = 0x007FE001 = 8380417
+    movw qinv, #:lower16:0xfc7fdfff
+    movt qinv, #:upper16:0xfc7fdfff
+    movw q, #0xE001
+    movt q, #0x7F
+
+
+    // 85x3 = 255 coefficients
+    movw ctr, #85
+    1:
+        ldr.w pa1, [a_ptr, #4]
+        ldr.w pa2, [a_ptr, #8]
+        ldr pa0, [a_ptr], #12
+        ldr.w pb1, [b_ptr, #4]
+        ldr.w pb2, [b_ptr, #8]
+        ldr pb0, [b_ptr], #12
+
+        montgomery_multiplication res, pa0, pb0, q, qinv
+        str res, [c_ptr], #4
+        montgomery_multiplication res, pa1, pb1, q, qinv
+        str res, [c_ptr], #4
+        montgomery_multiplication res, pa2, pb2, q, qinv
+        str res, [c_ptr], #4
+    subs ctr, #1
+    bne.w 1b
+
+    // final coefficient
+    ldr.w pa0, [a_ptr]
+    ldr.w pb0, [b_ptr]
+    montgomery_multiplication res, pa0, pb0, q, qinv
+    str.w res, [c_ptr]
+
+    pop.w {r4-r11, pc}
+.size pqcrystals_dilithium_asm_pointwise_montgomery, .-pqcrystals_dilithium_asm_pointwise_montgomery
+
+// void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
+// Adds the Montgomery product of a[i] and b[i] to c[i] for 256 coefficients;
+// the sum is stored without any further reduction.
+.global pqcrystals_dilithium_asm_pointwise_acc_montgomery
+.type pqcrystals_dilithium_asm_pointwise_acc_montgomery,%function
+.align 2
+pqcrystals_dilithium_asm_pointwise_acc_montgomery:
+    push.w {r4-r11, r14}
+    c_ptr .req r0
+    a_ptr .req r1
+    b_ptr .req r2
+    qinv  .req r3
+    q     .req r4
+    pa0   .req r5
+    pa1   .req r6
+    pa2   .req r7
+    pb0   .req r8
+    pb1   .req r9
+    pb2   .req r10
+    tmp0  .req r11
+    ctr   .req r12
+    res   .req r14
+
+    // qinv = 0xfc7fdfff; q = 0x007FE001 = 8380417
+    movw qinv, #:lower16:0xfc7fdfff
+    movt qinv, #:upper16:0xfc7fdfff
+    movw q, #0xE001
+    movt q, #0x7F
+
+
+    // 85x3 = 255 coefficients
+    movw ctr, #85
+    1:
+        ldr.w pa1, [a_ptr, #4]
+        ldr.w pa2, [a_ptr, #8]
+        ldr pa0, [a_ptr], #12
+        ldr.w pb1, [b_ptr, #4]
+        ldr.w pb2, [b_ptr, #8]
+        ldr pb0, [b_ptr], #12
+
+        // the three products end up in res, pa0 and pa1
+        montgomery_multiplication res, pa0, pb0, q, qinv
+        montgomery_multiplication pa0, pa1, pb1, q, qinv
+        montgomery_multiplication pa1, pa2, pb2, q, qinv
+
+        // pb0..pb2 are reused to hold the current accumulator values
+        ldr.w pb0, [c_ptr]
+        ldr.w pb1, [c_ptr, #4]
+        ldr.w pb2, [c_ptr, #8]
+        add.w res, res, pb0
+        str res, [c_ptr], #12
+        add.w pa0, pa0, pb1
+        str pa0, [c_ptr, #-8]
+        add.w pa1, pa1, pb2
+        str pa1, [c_ptr, #-4]
+    subs ctr, #1
+    bne.w 1b
+
+    // final coefficient
+    ldr.w pa0, [a_ptr]
+    ldr.w pb0, [b_ptr]
+    ldr.w pa1, [c_ptr]
+    montgomery_multiplication res, pa0, pb0, q, qinv
+    add.w res, res, pa1
+    str.w res, [c_ptr]
+
+    pop.w {r4-r11, pc}
+.size pqcrystals_dilithium_asm_pointwise_acc_montgomery, .-pqcrystals_dilithium_asm_pointwise_acc_montgomery
diff
--git a/crypto_sign/dilithium2/m4f/poly.c b/crypto_sign/dilithium2/m4f/poly.c
new file mode 100644
index 0000000..654f4f2
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/poly.c
@@ -0,0 +1,863 @@
+#include
+#include "params.h"
+#include "poly.h"
+#include "vector.h"
+#include "ntt.h"
+#include "pointwise_mont.h"
+#include "rounding.h"
+#include "symmetric.h"
+
+#include
+#include "hal.h"
+
+/* Optional cycle accounting: with DBENCH defined, DBENCH_START() records a
+ * start timestamp in a local named `time` and DBENCH_STOP(t) adds the
+ * elapsed cycles (minus timing_overhead) to t. Otherwise both expand to
+ * nothing. */
+#ifdef DBENCH
+#include "test/cpucycles.h"
+extern const uint64_t timing_overhead;
+extern uint64_t *tred, *tadd, *tmul, *tround, *tsample, *tpack;
+#define DBENCH_START() uint64_t time = cpucycles()
+#define DBENCH_STOP(t) t += cpucycles() - time - timing_overhead
+#else
+#define DBENCH_START()
+#define DBENCH_STOP(t)
+#endif
+
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Inplace reduction of all coefficients of polynomial to
+*              representative in [-6283009,6283007].
+*              Thin wrapper around asm_reduce32.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *a) {
+    asm_reduce32(a->coeffs);
+}
+
+/*************************************************
+* Name:        poly_caddq
+*
+* Description: For all coefficients of in/out polynomial add Q if
+*              coefficient is negative.
+*              Thin wrapper around asm_caddq.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_caddq(poly *a) {
+    asm_caddq(a->coeffs);
+}
+
+/*************************************************
+* Name:        poly_csubq
+*
+* Description: For all coefficients of input polynomial subtract Q if
+*              coefficient is bigger than Q; add Q if coefficient is negative.
+*
+*              NOTE(review): the body calls asm_caddq, the same routine
+*              as poly_caddq. Whether asm_caddq also performs the
+*              conditional subtraction described above cannot be told
+*              from this file -- confirm against its implementation.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_csubq(poly *a) {
+    asm_caddq(a->coeffs);
+}
+
+#if 0
+/*************************************************
+* Name:        poly_freeze
+*
+* Description: Inplace reduction of all coefficients of polynomial to
+*              standard representatives.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_freeze(poly *a) {
+    asm_freeze(a->coeffs);
+}
+#endif
+
+/*************************************************
+* Name:        poly_add
+*
+* Description: Coefficient-wise sum c = a + b. The sums are stored as
+*              they are; no modular reduction takes place.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first summand
+*              - const poly *b: pointer to second summand
+**************************************************/
+void poly_add(poly *c, const poly *a, const poly *b) {
+  DBENCH_START();
+
+  for(unsigned int k = 0; k < N; k++) {
+    const int32_t lhs = a->coeffs[k];
+    const int32_t rhs = b->coeffs[k];
+    c->coeffs[k] = lhs + rhs;
+  }
+
+  DBENCH_STOP(*tadd);
+}
+
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Coefficient-wise difference c = a - b. The differences
+*              are stored as they are; no modular reduction takes place.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to the minuend polynomial
+*              - const poly *b: pointer to the polynomial that is
+*                               subtracted from a
+**************************************************/
+void poly_sub(poly *c, const poly *a, const poly *b) {
+  DBENCH_START();
+
+  for(unsigned int k = 0; k < N; k++) {
+    const int32_t lhs = a->coeffs[k];
+    const int32_t rhs = b->coeffs[k];
+    c->coeffs[k] = lhs - rhs;
+  }
+
+  DBENCH_STOP(*tadd);
+}
+
+/*************************************************
+* Name:        poly_shiftl
+*
+* Description: Shift every coefficient left by D bits in place, i.e.
+*              multiply by 2^D without modular reduction. Inputs are
+*              expected to be below 2^{31-D} in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_shiftl(poly *a) {
+  DBENCH_START();
+
+  for(unsigned int k = 0; k < N; k++)
+    a->coeffs[k] = a->coeffs[k] << D;
+
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Inplace forward NTT. Coefficients can grow by
+*              8*Q in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_ntt(poly *a) {
+  DBENCH_START();
+
+  /* The transform itself is delegated to ntt() */
+  ntt(a->coeffs);
+
+  DBENCH_STOP(*tmul);
+}
+
+
+/*************************************************
+* Name:        poly_invntt_tomont
+*
+* Description: Inplace inverse NTT and multiplication by 2^{32}.
+*              Input coefficients need to be less than Q in absolute
+*              value and output coefficients are again bounded by Q.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_invntt_tomont(poly *a) {
+  DBENCH_START();
+
+  /* The transform itself is delegated to invntt_tomont() */
+  invntt_tomont(a->coeffs);
+
+  DBENCH_STOP(*tmul);
+}
+
+
+/*************************************************
+* Name:        poly_pointwise_montgomery
+*
+* Description: Pointwise multiplication of polynomials in NTT domain
+*              representation and multiplication of resulting polynomial
+*              by 2^{-32}.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+  DBENCH_START();
+
+  /* Implemented in assembly (pointwise_mont.s) */
+  asm_pointwise_montgomery(c->coeffs, a->coeffs, b->coeffs);
+
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_pointwise_acc_montgomery
+*
+* Description: Pointwise multiplication of polynomials in NTT domain
+*              representation, multiplication of resulting polynomial
+*              by 2^{-32} and accumulate.
+*
+* Arguments:   - poly *c: pointer to output (accumulating) polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b) {
+  int32_t *acc = c->coeffs;
+  const int32_t *lhs = a->coeffs;
+  const int32_t *rhs = b->coeffs;
+  DBENCH_START();
+
+  asm_pointwise_acc_montgomery(acc, lhs, rhs);
+
+  DBENCH_STOP(*tmul);
+}
+
+
+/*************************************************
+* Name:        poly_power2round
+*
+* Description: Split every coefficient c of the input into a high part c1
+*              and a low part c0 with c mod Q = c1*2^D + c0 and
+*              -2^{D-1} < c0 <= 2^{D-1}. Inputs are expected to be
+*              standard representatives.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients c0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_power2round(poly *a1, poly *a0, const poly *a) {
+  DBENCH_START();
+
+  for(unsigned int k = 0; k < N; k++) {
+    int32_t *low = &a0->coeffs[k];
+    a1->coeffs[k] = power2round(low, a->coeffs[k]);
+  }
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_decompose
+*
+* Description: For all coefficients c of the input polynomial,
+*              compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0
+*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
+*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients c0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_decompose(poly *a1, poly *a0, const poly *a) {
+  DBENCH_START();
+
+  for(unsigned int k = 0; k < N; k++) {
+    int32_t *low = &a0->coeffs[k];
+    a1->coeffs[k] = decompose(low, a->coeffs[k]);
+  }
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_make_hint
+*
+* Description: Build the hint polynomial by applying make_hint() to each
+*              pair of low/high coefficients, and count how many hint
+*              coefficients were set.
+*
+* Arguments:   - poly *h: pointer to output hint polynomial
+*              - const poly *a0: pointer to low part of input polynomial
+*              - const poly *a1: pointer to high part of input polynomial
+*
+* Returns the sum of all hint coefficients (the number of 1 bits).
+**************************************************/
+unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1) {
+  unsigned int ones = 0;
+  DBENCH_START();
+
+  for(unsigned int k = 0; k < N; k++) {
+    h->coeffs[k] = make_hint(a0->coeffs[k], a1->coeffs[k]);
+    ones += h->coeffs[k];
+  }
+
+  DBENCH_STOP(*tround);
+  return ones;
+}
+
+/*************************************************
+* Name:        poly_use_hint
+*
+* Description: Use hint polynomial to correct the high bits of a polynomial.
+*
+* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
+*              - const poly *a: pointer to input polynomial
+*              - const poly *h: pointer to input hint polynomial
+**************************************************/
+void poly_use_hint(poly *b, const poly *a, const poly *h) {
+  unsigned int i;
+  DBENCH_START();
+
+  for(i = 0; i < N; ++i)
+    b->coeffs[i] = use_hint(a->coeffs[i], h->coeffs[i]);
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_chknorm
+*
+* Description: Check infinity norm of polynomial against given bound.
+*              Assumes input coefficients were reduced by reduce32().
+*
+* Arguments:   - const poly *a: pointer to polynomial
+*              - int32_t B: norm bound
+*
+* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise.
+**************************************************/
+int poly_chknorm(const poly *a, int32_t B) {
+  unsigned int i;
+  int32_t t;
+  DBENCH_START();
+
+  /* Bounds above (Q-1)/8 are rejected outright */
+  if(B > (Q-1)/8)
+    return 1;
+
+  /* It is ok to leak which coefficient violates the bound since
+     the probability for each coefficient is independent of secret
+     data but we must not leak the sign of the centralized representative. */
+  for(i = 0; i < N; ++i) {
+    /* Absolute value, computed without a branch on the sign:
+       t is the sign mask, and 2*coeff is subtracted only when the
+       mask is set */
+    t = a->coeffs[i] >> 31;
+    t = a->coeffs[i] - (t & 2*a->coeffs[i]);
+
+    if(t >= B) {
+      DBENCH_STOP(*tsample);
+      return 1;
+    }
+  }
+
+  DBENCH_STOP(*tsample);
+  return 0;
+}
+
+/*************************************************
+* Name:        poly_uniform
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [0,Q-1] by performing rejection sampling on the
+*              output stream of SHAKE256(seed|nonce).
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES)
+void poly_uniform(poly *a,
+                  const uint8_t seed[SEEDBYTES],
+                  uint16_t nonce)
+{
+  unsigned int i, ctr, off;
+  unsigned int buflen = POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES;
+  /* Two spare bytes leave room for the leftover bytes carried over below */
+  uint8_t buf[POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES + 2];
+  stream128_state state;
+
+  stream128_init(&state, seed, nonce);
+  stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
+
+  ctr = asm_rej_uniform(a->coeffs, N, buf, buflen);
+
+  /* Not enough accepted coefficients yet: squeeze one more block at a time */
+  while(ctr < N) {
+    /* Move the buflen % 3 trailing bytes to the front of the buffer so
+       they are processed together with the next block */
+    off = buflen % 3;
+    for(i = 0; i < off; ++i)
+      buf[i] = buf[buflen - off + i];
+
+    stream128_squeezeblocks(buf + off, 1, &state);
+    buflen = STREAM128_BLOCKBYTES + off;
+    ctr += asm_rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen);
+  }
+}
+
+/*************************************************
+* Name:        rej_eta
+*
+* Description: Sample uniformly random coefficients in [-ETA, ETA] by
+*              performing rejection sampling on array of random bytes.
+*
+* Arguments:   - int32_t *a: pointer to output array (allocated)
+*              - unsigned int len: number of coefficients to be sampled
+*              - const uint8_t *buf: array of random bytes
+*              - unsigned int buflen: length of array of random bytes
+*
+* Returns number of sampled coefficients. Can be smaller than len if not enough
+* random bytes were given.
+**************************************************/
+static unsigned int rej_eta(int32_t *a,
+                            unsigned int len,
+                            const uint8_t *buf,
+                            unsigned int buflen)
+{
+  unsigned int ctr, pos;
+  uint32_t t0, t1;
+  DBENCH_START();
+
+  ctr = pos = 0;
+  while(ctr < len && pos < buflen) {
+    /* Each byte yields two 4-bit candidates */
+    t0 = buf[pos] & 0x0F;
+    t1 = buf[pos++] >> 4;
+
+#if ETA == 2
+    /* Accept values below 15 and reduce them mod 5:
+       (205*t >> 10) equals t/5 for t < 15 */
+    if(t0 < 15) {
+      t0 = t0 - (205*t0 >> 10)*5;
+      a[ctr++] = 2 - t0;
+    }
+    if(t1 < 15 && ctr < len) {
+      t1 = t1 - (205*t1 >> 10)*5;
+      a[ctr++] = 2 - t1;
+    }
+#elif ETA == 4
+    /* Accept values below 9 */
+    if(t0 < 9)
+      a[ctr++] = 4 - t0;
+    if(t1 < 9 && ctr < len)
+      a[ctr++] = 4 - t1;
+#endif
+  }
+
+  DBENCH_STOP(*tsample);
+  return ctr;
+}
+
+/*************************************************
+* Name:        poly_uniform_eta
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [-ETA,ETA] by performing rejection sampling on the
+*              output stream from SHAKE256(seed|nonce).
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#if ETA == 2
+#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+#elif ETA == 4
+#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+#endif
+void poly_uniform_eta(poly *a,
+                      const uint8_t seed[CRHBYTES],
+                      uint16_t nonce) {
+  unsigned int ctr;
+  unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES;
+  uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES];
+  stream256_state state;
+
+  stream256_init(&state, seed, nonce);
+  stream256_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
+
+  ctr = rej_eta(a->coeffs, N, buf, buflen);
+
+  /* Squeeze one further block at a time until all N coefficients are set */
+  while(ctr < N) {
+    stream256_squeezeblocks(buf, 1, &state);
+    ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM256_BLOCKBYTES);
+  }
+}
+
+/*************************************************
+* Name:        poly_uniform_gamma1
+*
+* Description:
Sample polynomial with uniformly random coefficients
+*              in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream
+*              of SHAKE256(seed|nonce).
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 16-bit nonce
+**************************************************/
+#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+void poly_uniform_gamma1(poly *a,
+                         const uint8_t seed[CRHBYTES],
+                         uint16_t nonce)
+{
+  uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS*STREAM256_BLOCKBYTES];
+  stream256_state state;
+
+  stream256_init(&state, seed, nonce);
+  stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state);
+  /* No rejection step: the stream is unpacked directly as a packed z polynomial */
+  polyz_unpack(a, buf);
+}
+
+/*************************************************
+* Name:        poly_challenge
+*
+* Description: Implementation of H. Samples polynomial with TAU nonzero
+*              coefficients in {-1,1} using the output stream of
+*              SHAKE256(seed).
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const uint8_t seed[]: byte array containing seed of length SEEDBYTES
+**************************************************/
+void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) {
+  unsigned int i, b, pos;
+  uint64_t signs;
+  uint8_t buf[SHAKE256_RATE];
+  shake256incctx state;
+
+  /* Exactly SEEDBYTES bytes of seed are absorbed */
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, seed, SEEDBYTES);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeezeblocks(buf, 1, &state);
+
+  /* The first 8 output bytes, read little-endian, supply the sign bits */
+  signs = 0;
+  for(i = 0; i < 8; ++i)
+    signs |= (uint64_t)buf[i] << 8*i;
+  pos = 8;
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = 0;
+  for(i = N-TAU; i < N; ++i) {
+    /* Draw bytes until one is at most i, squeezing more output as needed */
+    do {
+      if(pos >= SHAKE256_RATE) {
+        shake256_inc_squeezeblocks(buf, 1, &state);
+        pos = 0;
+      }
+
+      b = buf[pos++];
+    } while(b > i);
+
+    /* Move the value at position b to position i, then put +1 or -1 at b */
+    c->coeffs[i] = c->coeffs[b];
+    c->coeffs[b] = 1 - 2*(signs & 1);
+    signs >>= 1;
+  }
+}
+
+/*************************************************
+* Name:        polyeta_pack
+*
+* Description: Bit-pack polynomial
with coefficients in [-ETA,ETA].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYETA_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyeta_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  uint8_t t[8];
+  DBENCH_START();
+
+#if ETA == 2
+  /* 8 coefficients, stored as ETA - coeff in 3 bits each, fill 3 bytes */
+  for(i = 0; i < N/8; ++i) {
+    t[0] = ETA - a->coeffs[8*i+0];
+    t[1] = ETA - a->coeffs[8*i+1];
+    t[2] = ETA - a->coeffs[8*i+2];
+    t[3] = ETA - a->coeffs[8*i+3];
+    t[4] = ETA - a->coeffs[8*i+4];
+    t[5] = ETA - a->coeffs[8*i+5];
+    t[6] = ETA - a->coeffs[8*i+6];
+    t[7] = ETA - a->coeffs[8*i+7];
+
+    r[3*i+0]  = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6);
+    r[3*i+1]  = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
+    r[3*i+2]  = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
+  }
+#elif ETA == 4
+  /* 2 coefficients, stored as ETA - coeff in 4 bits each, fill 1 byte */
+  for(i = 0; i < N/2; ++i) {
+    t[0] = ETA - a->coeffs[2*i+0];
+    t[1] = ETA - a->coeffs[2*i+1];
+    r[i] = t[0] | (t[1] << 4);
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+
+/*************************************************
+* Name:        polyt1_pack
+*
+* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits.
+*              Input coefficients are assumed to be standard representatives.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYT1_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyt1_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  /* 4 coefficients of 10 bits each fill 5 bytes */
+  for(i = 0; i < N/4; ++i) {
+    r[5*i+0] = (a->coeffs[4*i+0] >> 0);
+    r[5*i+1] = (a->coeffs[4*i+0] >> 8) | (a->coeffs[4*i+1] << 2);
+    r[5*i+2] = (a->coeffs[4*i+1] >> 6) | (a->coeffs[4*i+2] << 4);
+    r[5*i+3] = (a->coeffs[4*i+2] >> 4) | (a->coeffs[4*i+3] << 6);
+    r[5*i+4] = (a->coeffs[4*i+3] >> 2);
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt1_unpack
+*
+* Description: Unpack polynomial t1 with 10-bit coefficients.
+*              Output coefficients are standard representatives.
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyt1_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  /* 5 bytes hold 4 coefficients of 10 bits each */
+  for(i = 0; i < N/4; ++i) {
+    r->coeffs[4*i+0] = ((a[5*i+0] >> 0) | ((uint32_t)a[5*i+1] << 8)) & 0x3FF;
+    r->coeffs[4*i+1] = ((a[5*i+1] >> 2) | ((uint32_t)a[5*i+2] << 6)) & 0x3FF;
+    r->coeffs[4*i+2] = ((a[5*i+2] >> 4) | ((uint32_t)a[5*i+3] << 4)) & 0x3FF;
+    r->coeffs[4*i+3] = ((a[5*i+3] >> 6) | ((uint32_t)a[5*i+4] << 2)) & 0x3FF;
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt0_pack
+*
+* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYT0_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyt0_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  uint32_t t[8];
+  DBENCH_START();
+
+  /* 8 coefficients, stored as 2^{D-1} - coeff in 13 bits each, fill 13 bytes */
+  for(i = 0; i < N/8; ++i) {
+    t[0] = (1 << (D-1)) - a->coeffs[8*i+0];
+    t[1] = (1 << (D-1)) - a->coeffs[8*i+1];
+    t[2] = (1 << (D-1)) - a->coeffs[8*i+2];
+    t[3] = (1 << (D-1)) - a->coeffs[8*i+3];
+    t[4] = (1 << (D-1)) - a->coeffs[8*i+4];
+    t[5] = (1 << (D-1)) - a->coeffs[8*i+5];
+    t[6] = (1 << (D-1)) - a->coeffs[8*i+6];
+    t[7] = (1 << (D-1)) - a->coeffs[8*i+7];
+
+    r[13*i+ 0]  =  t[0];
+    r[13*i+ 1]  =  t[0] >>  8;
+    r[13*i+ 1] |=  t[1] <<  5;
+    r[13*i+ 2]  =  t[1] >>  3;
+    r[13*i+ 3]  =  t[1] >> 11;
+    r[13*i+ 3] |=  t[2] <<  2;
+    r[13*i+ 4]  =  t[2] >>  6;
+    r[13*i+ 4] |=  t[3] <<  7;
+    r[13*i+ 5]  =  t[3] >>  1;
+    r[13*i+ 6]  =  t[3] >>  9;
+    r[13*i+ 6] |=  t[4] <<  4;
+    r[13*i+ 7]  =  t[4] >>  4;
+    r[13*i+ 8]  =  t[4] >> 12;
+    r[13*i+ 8] |=  t[5] <<  1;
+    r[13*i+ 9]  =  t[5] >>  7;
+    r[13*i+ 9] |=  t[6] <<  6;
+    r[13*i+10]  =  t[6] >>  2;
+    r[13*i+11]  =  t[6] >> 10;
+    r[13*i+11] |=  t[7] <<  3;
+    r[13*i+12]  =  t[7] >>  5;
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt0_unpack
+*
+* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyt0_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  /* 13 bytes hold 8 values of 13 bits each */
+  for(i = 0; i < N/8; ++i) {
+    r->coeffs[8*i+0]  = a[13*i+0];
+    r->coeffs[8*i+0] |= (uint32_t)a[13*i+1] << 8;
+    r->coeffs[8*i+0] &= 0x1FFF;
+
+    r->coeffs[8*i+1]  = a[13*i+1] >> 5;
+    r->coeffs[8*i+1] |= (uint32_t)a[13*i+2] << 3;
+    r->coeffs[8*i+1] |= (uint32_t)a[13*i+3] << 11;
+    r->coeffs[8*i+1] &= 0x1FFF;
+
+    r->coeffs[8*i+2]  = a[13*i+3] >> 2;
+    r->coeffs[8*i+2] |= (uint32_t)a[13*i+4] << 6;
+    r->coeffs[8*i+2] &= 0x1FFF;
+
+    r->coeffs[8*i+3]  = a[13*i+4] >> 7;
+    r->coeffs[8*i+3] |= (uint32_t)a[13*i+5] << 1;
+    r->coeffs[8*i+3] |= (uint32_t)a[13*i+6] << 9;
+    r->coeffs[8*i+3] &= 0x1FFF;
+
+    r->coeffs[8*i+4]  = a[13*i+6] >> 4;
+    r->coeffs[8*i+4] |= (uint32_t)a[13*i+7] << 4;
+    r->coeffs[8*i+4] |= (uint32_t)a[13*i+8] << 12;
+    r->coeffs[8*i+4] &= 0x1FFF;
+
+    r->coeffs[8*i+5]  = a[13*i+8] >> 1;
+    r->coeffs[8*i+5] |= (uint32_t)a[13*i+9] << 7;
+    r->coeffs[8*i+5] &= 0x1FFF;
+
+    r->coeffs[8*i+6]  = a[13*i+9] >> 6;
+    r->coeffs[8*i+6] |= (uint32_t)a[13*i+10] << 2;
+    r->coeffs[8*i+6] |= (uint32_t)a[13*i+11] << 10;
+    r->coeffs[8*i+6] &= 0x1FFF;
+
+    r->coeffs[8*i+7]  = a[13*i+11] >> 3;
+    r->coeffs[8*i+7] |= (uint32_t)a[13*i+12] << 5;
+    r->coeffs[8*i+7] &= 0x1FFF;
+
+    /* Undo the 2^{D-1} - coeff mapping applied by polyt0_pack */
+    r->coeffs[8*i+0] = (1 << (D-1)) - r->coeffs[8*i+0];
+    r->coeffs[8*i+1] = (1 << (D-1)) - r->coeffs[8*i+1];
+    r->coeffs[8*i+2] = (1 << (D-1)) - r->coeffs[8*i+2];
+    r->coeffs[8*i+3] = (1 << (D-1)) - r->coeffs[8*i+3];
+    r->coeffs[8*i+4] = (1 << (D-1)) - r->coeffs[8*i+4];
+    r->coeffs[8*i+5] = (1 << (D-1)) - r->coeffs[8*i+5];
+    r->coeffs[8*i+6] = (1 << (D-1)) - r->coeffs[8*i+6];
+    r->coeffs[8*i+7] = (1 << (D-1)) - r->coeffs[8*i+7];
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyz_pack
+*
+* Description: Bit-pack polynomial with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYZ_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyz_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  uint32_t t[4];
+  DBENCH_START();
+
+#if GAMMA1 == (1 << 17)
+  /* 4 coefficients, stored as GAMMA1 - coeff in 18 bits each, fill 9 bytes */
+  for(i = 0; i < N/4; ++i) {
+    t[0] = GAMMA1 - a->coeffs[4*i+0];
+    t[1] = GAMMA1 - a->coeffs[4*i+1];
+    t[2] = GAMMA1 - a->coeffs[4*i+2];
+    t[3] = GAMMA1 - a->coeffs[4*i+3];
+
+    r[9*i+0]  = t[0];
+    r[9*i+1]  = t[0] >> 8;
+    r[9*i+2]  = t[0] >> 16;
+    r[9*i+2] |= t[1] << 2;
+    r[9*i+3]  = t[1] >> 6;
+    r[9*i+4]  = t[1] >> 14;
+    r[9*i+4] |= t[2] << 4;
+    r[9*i+5]  = t[2] >> 4;
+    r[9*i+6]  = t[2] >> 12;
+    r[9*i+6] |= t[3] << 6;
+    r[9*i+7]  = t[3] >> 2;
+    r[9*i+8]  = t[3] >> 10;
+  }
+#elif GAMMA1 == (1 << 19)
+  /* 2 coefficients, stored as GAMMA1 - coeff in 20 bits each, fill 5 bytes */
+  for(i = 0; i < N/2; ++i) {
+    t[0] = GAMMA1 - a->coeffs[2*i+0];
+    t[1] = GAMMA1 - a->coeffs[2*i+1];
+
+    r[5*i+0]  = t[0];
+    r[5*i+1]  = t[0] >> 8;
+    r[5*i+2]  = t[0] >> 16;
+    r[5*i+2] |= t[1] << 4;
+    r[5*i+3]  = t[1] >> 4;
+    r[5*i+4]  = t[1] >> 12;
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyz_unpack
+*
+* Description: Unpack polynomial z with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1].
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void polyz_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + +#if GAMMA1 == (1 << 17) + for(i = 0; i < N/4; ++i) { + r->coeffs[4*i+0] = a[9*i+0]; + r->coeffs[4*i+0] |= (uint32_t)a[9*i+1] << 8; + r->coeffs[4*i+0] |= (uint32_t)a[9*i+2] << 16; + r->coeffs[4*i+0] &= 0x3FFFF; + + r->coeffs[4*i+1] = a[9*i+2] >> 2; + r->coeffs[4*i+1] |= (uint32_t)a[9*i+3] << 6; + r->coeffs[4*i+1] |= (uint32_t)a[9*i+4] << 14; + r->coeffs[4*i+1] &= 0x3FFFF; + + r->coeffs[4*i+2] = a[9*i+4] >> 4; + r->coeffs[4*i+2] |= (uint32_t)a[9*i+5] << 4; + r->coeffs[4*i+2] |= (uint32_t)a[9*i+6] << 12; + r->coeffs[4*i+2] &= 0x3FFFF; + + r->coeffs[4*i+3] = a[9*i+6] >> 6; + r->coeffs[4*i+3] |= (uint32_t)a[9*i+7] << 2; + r->coeffs[4*i+3] |= (uint32_t)a[9*i+8] << 10; + r->coeffs[4*i+3] &= 0x3FFFF; + + r->coeffs[4*i+0] = GAMMA1 - r->coeffs[4*i+0]; + r->coeffs[4*i+1] = GAMMA1 - r->coeffs[4*i+1]; + r->coeffs[4*i+2] = GAMMA1 - r->coeffs[4*i+2]; + r->coeffs[4*i+3] = GAMMA1 - r->coeffs[4*i+3]; + } +#elif GAMMA1 == (1 << 19) + for(i = 0; i < N/2; ++i) { + r->coeffs[2*i+0] = a[5*i+0]; + r->coeffs[2*i+0] |= (uint32_t)a[5*i+1] << 8; + r->coeffs[2*i+0] |= (uint32_t)a[5*i+2] << 16; + r->coeffs[2*i+0] &= 0xFFFFF; + + r->coeffs[2*i+1] = a[5*i+2] >> 4; + r->coeffs[2*i+1] |= (uint32_t)a[5*i+3] << 4; + r->coeffs[2*i+1] |= (uint32_t)a[5*i+4] << 12; + r->coeffs[2*i+1] &= 0xFFFFF; /* keep the 20 packed bits of the odd coefficient */ + + r->coeffs[2*i+0] = GAMMA1 - r->coeffs[2*i+0]; + r->coeffs[2*i+1] = GAMMA1 - r->coeffs[2*i+1]; + } +#endif + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: polyw1_pack +* +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. +* Input coefficients are assumed to be standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void polyw1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + +#if GAMMA2 == (Q-1)/88 + for(i = 0; i < N/4; ++i) { + r[3*i+0] = a->coeffs[4*i+0]; + r[3*i+0] |= a->coeffs[4*i+1] << 6; + r[3*i+1] = a->coeffs[4*i+1] >> 2; + r[3*i+1] |= a->coeffs[4*i+2] << 4; + r[3*i+2] = a->coeffs[4*i+2] >> 4; + r[3*i+2] |= a->coeffs[4*i+3] << 2; + } +#elif GAMMA2 == (Q-1)/32 + for(i = 0; i < N/2; ++i) + r[i] = a->coeffs[2*i+0] | (a->coeffs[2*i+1] << 4); +#endif + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium2/m4f/poly.h b/crypto_sign/dilithium2/m4f/poly.h new file mode 100644 index 0000000..af9e7a5 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/poly.h @@ -0,0 +1,84 @@ +#ifndef POLY_H +#define POLY_H + +#include +#include "params.h" + +typedef struct { + int32_t coeffs[N]; +} poly; + +#define poly_reduce DILITHIUM_NAMESPACE(poly_reduce) +void poly_reduce(poly *a); +#define poly_caddq DILITHIUM_NAMESPACE(poly_caddq) +void poly_caddq(poly *a); +#define poly_csubq DILITHIUM_NAMESPACE(poly_csubq) +void poly_csubq(poly *a); +#define poly_freeze DILITHIUM_NAMESPACE(poly_freeze) +void poly_freeze(poly *a); + +#define poly_add DILITHIUM_NAMESPACE(poly_add) +void poly_add(poly *c, const poly *a, const poly *b); +#define poly_sub DILITHIUM_NAMESPACE(poly_sub) +void poly_sub(poly *c, const poly *a, const poly *b); +#define poly_shiftl DILITHIUM_NAMESPACE(poly_shiftl) +void poly_shiftl(poly *a); + +#define poly_ntt DILITHIUM_NAMESPACE(poly_ntt) +void poly_ntt(poly *a); + +#define poly_invntt_tomont DILITHIUM_NAMESPACE(poly_invntt_tomont) +void poly_invntt_tomont(poly *a); +#define poly_pointwise_montgomery DILITHIUM_NAMESPACE(poly_pointwise_montgomery) +void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); +#define 
poly_pointwise_acc_montgomery DILITHIUM_NAMESPACE(poly_pointwise_acc_montgomery) +void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b); + +#define poly_power2round DILITHIUM_NAMESPACE(poly_power2round) +void poly_power2round(poly *a1, poly *a0, const poly *a); +#define poly_decompose DILITHIUM_NAMESPACE(poly_decompose) +void poly_decompose(poly *a1, poly *a0, const poly *a); +#define poly_make_hint DILITHIUM_NAMESPACE(poly_make_hint) +unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1); +#define poly_use_hint DILITHIUM_NAMESPACE(poly_use_hint) +void poly_use_hint(poly *b, const poly *a, const poly *h); + +#define poly_chknorm DILITHIUM_NAMESPACE(poly_chknorm) +int poly_chknorm(const poly *a, int32_t B); +#define poly_uniform DILITHIUM_NAMESPACE(poly_uniform) +void poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +#define poly_uniform_eta DILITHIUM_NAMESPACE(poly_uniform_eta) +void poly_uniform_eta(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +#define poly_uniform_gamma1 DILITHIUM_NAMESPACE(poly_uniform_gamma1) +void poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +#define poly_challenge DILITHIUM_NAMESPACE(poly_challenge) +void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +#define polyeta_pack DILITHIUM_NAMESPACE(polyeta_pack) +void polyeta_pack(uint8_t *r, const poly *a); + +#define polyt1_pack DILITHIUM_NAMESPACE(polyt1_pack) +void polyt1_pack(uint8_t *r, const poly *a); +#define polyt1_unpack DILITHIUM_NAMESPACE(polyt1_unpack) +void polyt1_unpack(poly *r, const uint8_t *a); + +#define polyt0_pack DILITHIUM_NAMESPACE(polyt0_pack) +void polyt0_pack(uint8_t *r, const poly *a); +#define polyt0_unpack DILITHIUM_NAMESPACE(polyt0_unpack) +void polyt0_unpack(poly *r, const uint8_t *a); + +#define polyz_pack DILITHIUM_NAMESPACE(polyz_pack) +void polyz_pack(uint8_t *r, const poly *a); +#define polyz_unpack DILITHIUM_NAMESPACE(polyz_unpack) +void 
polyz_unpack(poly *r, const uint8_t *a); + +#define polyw1_pack DILITHIUM_NAMESPACE(polyw1_pack) +void polyw1_pack(uint8_t *r, const poly *a); + +#endif diff --git a/crypto_sign/dilithium2/m4f/polyvec.c b/crypto_sign/dilithium2/m4f/polyvec.c new file mode 100644 index 0000000..e20749c --- /dev/null +++ b/crypto_sign/dilithium2/m4f/polyvec.c @@ -0,0 +1,429 @@ +#include +#include "params.h" +#include "polyvec.h" +#include "poly.h" + +#include +#include "hal.h" + +/************************************************* +* Name: expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i). +* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + + for(i = 0; i < K; ++i) + for(j = 0; j < L; ++j) + poly_uniform(&mat[i].vec[j], rho, (i << 8) + j); +} + +void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_uniform_eta(&v->vec[i], seed, nonce++); +} + +void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_uniform_gamma1(&v->vec[i], seed, L*nonce + i); +} + +void polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_reduce(&v->vec[i]); +} + 
+#if 0 +/************************************************* +* Name: polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length L +* to standard representatives. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_freeze(&v->vec[i]); +} +#endif + +/************************************************* +* Name: polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_ntt(&v->vec[i]); +} + +void polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_invntt_tomont(&v->vec[i]); +} + +void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for(i = 0; i < L; ++i) + poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); +} + + + +/************************************************* +* Name: polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v) +{ + unsigned int i; + + poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); + for(i = 1; i < L; ++i) { + poly_pointwise_acc_montgomery(w, &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by polyvecl_reduce(). +* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
+**************************************************/ +int polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for(i = 0; i < L; ++i) + if(poly_chknorm(&v->vec[i], bound)) + return 1; + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_uniform_eta(&v->vec[i], seed, nonce++); +} + +/************************************************* +* Name: polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007]. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_reduce(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_reduce(&v->vec[i]); +} + +/************************************************* +* Name: polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_caddq(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_caddq(&v->vec[i]); +} + +#if 0 +/************************************************* +* Name: polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_freeze(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_freeze(&v->vec[i]); +} +#endif + +/************************************************* +* Name: polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_shiftl(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_shiftl(&v->vec[i]); +} + +/************************************************* +* Name: polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_ntt(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_ntt(&v->vec[i]); +} + + + +/************************************************* +* Name: polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_invntt_tomont(&v->vec[i]); +} + + +void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); +} + + +/************************************************* +* Name: polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by polyveck_reduce(). +* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
+**************************************************/ +int polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for(i = 0; i < K; ++i) + if(poly_chknorm(&v->vec[i], bound)) + return 1; + + return 0; +} + +/************************************************* +* Name: polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except if a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); +} + +/************************************************* +* Name: polyveck_make_hint +* +* Description: Compute hint vector. 
+* +* Arguments: - polyveck *h: pointer to output vector +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits. +**************************************************/ +unsigned int polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) +{ + unsigned int i, s = 0; + + for(i = 0; i < K; ++i) + s += poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + + return s; +} + +/************************************************* +* Name: polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. +* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for(i = 0; i < K; ++i) + poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); +} + +void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for(i = 0; i < K; ++i) + polyw1_pack(&r[i*POLYW1_PACKEDBYTES], &w1->vec[i]); +} diff --git a/crypto_sign/dilithium2/m4f/polyvec.h b/crypto_sign/dilithium2/m4f/polyvec.h new file mode 100644 index 0000000..d92cd75 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/polyvec.h @@ -0,0 +1,99 @@ +#ifndef POLYVEC_H +#define POLYVEC_H + +#include +#include "params.h" +#include "poly.h" + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +#define polyvecl_uniform_eta DILITHIUM_NAMESPACE(polyvecl_uniform_eta) +void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#define polyvecl_uniform_gamma1 DILITHIUM_NAMESPACE(polyvecl_uniform_gamma1) +void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#define 
polyvecl_reduce DILITHIUM_NAMESPACE(polyvecl_reduce) +void polyvecl_reduce(polyvecl *v); + +#define polyvecl_freeze DILITHIUM_NAMESPACE(polyvecl_freeze) +void polyvecl_freeze(polyvecl *v); + +#define polyvecl_add DILITHIUM_NAMESPACE(polyvecl_add) +void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +#define polyvecl_ntt DILITHIUM_NAMESPACE(polyvecl_ntt) +void polyvecl_ntt(polyvecl *v); +#define polyvecl_invntt_tomont DILITHIUM_NAMESPACE(polyvecl_invntt_tomont) +void polyvecl_invntt_tomont(polyvecl *v); +#define polyvecl_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyvecl_pointwise_poly_montgomery) +void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +#define polyvecl_pointwise_acc_montgomery \ + DILITHIUM_NAMESPACE(polyvecl_pointwise_acc_montgomery) +void polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); + + +#define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm) +int polyvecl_chknorm(const polyvecl *v, int32_t B); + + + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +#define polyveck_uniform_eta DILITHIUM_NAMESPACE(polyveck_uniform_eta) +void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#define polyveck_reduce DILITHIUM_NAMESPACE(polyveck_reduce) +void polyveck_reduce(polyveck *v); +#define polyveck_caddq DILITHIUM_NAMESPACE(polyveck_caddq) +void polyveck_caddq(polyveck *v); +#define polyveck_freeze DILITHIUM_NAMESPACE(polyveck_freeze) +void polyveck_freeze(polyveck *v); + +#define polyveck_add DILITHIUM_NAMESPACE(polyveck_add) +void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +#define polyveck_sub DILITHIUM_NAMESPACE(polyveck_sub) +void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +#define polyveck_shiftl DILITHIUM_NAMESPACE(polyveck_shiftl) +void polyveck_shiftl(polyveck *v); + +#define polyveck_ntt DILITHIUM_NAMESPACE(polyveck_ntt) +void 
polyveck_ntt(polyveck *v); +#define polyveck_invntt_tomont DILITHIUM_NAMESPACE(polyveck_invntt_tomont) +void polyveck_invntt_tomont(polyveck *v); +#define polyveck_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyveck_pointwise_poly_montgomery) +void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + + +#define polyveck_chknorm DILITHIUM_NAMESPACE(polyveck_chknorm) +int polyveck_chknorm(const polyveck *v, int32_t B); + +#define polyveck_power2round DILITHIUM_NAMESPACE(polyveck_power2round) +void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +#define polyveck_decompose DILITHIUM_NAMESPACE(polyveck_decompose) +void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +#define polyveck_make_hint DILITHIUM_NAMESPACE(polyveck_make_hint) +unsigned int polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1); +#define polyveck_use_hint DILITHIUM_NAMESPACE(polyveck_use_hint) +void polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h); + +#define polyveck_pack_w1 DILITHIUM_NAMESPACE(polyveck_pack_w1) +void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1); + +#define polyvec_matrix_expand DILITHIUM_NAMESPACE(polyvec_matrix_expand) +void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +#define polyvec_matrix_pointwise_montgomery DILITHIUM_NAMESPACE(polyvec_matrix_pointwise_montgomery) +void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium2/m4f/reduce.h b/crypto_sign/dilithium2/m4f/reduce.h new file mode 100644 index 0000000..02df550 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/reduce.h @@ -0,0 +1,29 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include +#include "params.h" + +#define MONT -4186625 // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 + +#define montgomery_reduce DILITHIUM_NAMESPACE(montgomery_reduce) 
+/************************************************* +* Name: montgomery_reduce +* +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. +* +* Arguments: - int64_t: finite field element a +* +* Returns r. +**************************************************/ +static inline int32_t montgomery_reduce(int64_t a) { + int32_t t; + + t = (int64_t)(int32_t)a*QINV; + t = (a - (int64_t)t*Q) >> 32; + return t; +} + +#endif diff --git a/crypto_sign/dilithium2/m4f/rounding.c b/crypto_sign/dilithium2/m4f/rounding.c new file mode 100644 index 0000000..889f0a2 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/rounding.c @@ -0,0 +1,102 @@ +#include +#include "params.h" +#include "rounding.h" + +/************************************************* +* Name: power2round +* +* Description: For finite field element a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be standard representative. +* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. +**************************************************/ +int32_t power2round(int32_t *a0, int32_t a) { + int32_t a1; + + a1 = (a + (1 << (D-1)) - 1) >> D; + *a0 = a - (a1 << D); + return a1; +} + +/************************************************* +* Name: decompose +* +* Description: For finite field element a, compute high and low bits a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard +* representative. +* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. 
+**************************************************/ +int32_t decompose(int32_t *a0, int32_t a) { + int32_t a1; + + a1 = (a + 127) >> 7; +#if GAMMA2 == (Q-1)/32 + a1 = (a1*1025 + (1 << 21)) >> 22; + a1 &= 15; +#elif GAMMA2 == (Q-1)/88 + a1 = (a1*11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; +#endif + + *a0 = a - a1*2*GAMMA2; + *a0 -= (((Q-1)/2 - *a0) >> 31) & Q; + return a1; +} + +/************************************************* +* Name: make_hint +* +* Description: Compute hint bit indicating whether the low bits of the +* input element overflow into the high bits. +* +* Arguments: - int32_t a0: low bits of input element +* - int32_t a1: high bits of input element +* +* Returns 1 if overflow. +**************************************************/ +unsigned int make_hint(int32_t a0, int32_t a1) { + if(a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) + return 1; + + return 0; +} + +/************************************************* +* Name: use_hint +* +* Description: Correct high bits according to hint. +* +* Arguments: - int32_t a: input element +* - unsigned int hint: hint bit +* +* Returns corrected high bits. +**************************************************/ +int32_t use_hint(int32_t a, unsigned int hint) { + int32_t a0, a1; + + a1 = decompose(&a0, a); + if(hint == 0) + return a1; + +#if GAMMA2 == (Q-1)/32 + if(a0 > 0) + return (a1 + 1) & 15; + else + return (a1 - 1) & 15; +#elif GAMMA2 == (Q-1)/88 + if(a0 > 0) + return (a1 == 43) ? 0 : a1 + 1; + else + return (a1 == 0) ? 
43 : a1 - 1; +#endif +} diff --git a/crypto_sign/dilithium2/m4f/rounding.h b/crypto_sign/dilithium2/m4f/rounding.h new file mode 100644 index 0000000..b72e8e8 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/rounding.h @@ -0,0 +1,19 @@ +#ifndef ROUNDING_H +#define ROUNDING_H + +#include +#include "params.h" + +#define power2round DILITHIUM_NAMESPACE(power2round) +int32_t power2round(int32_t *a0, int32_t a); + +#define decompose DILITHIUM_NAMESPACE(decompose) +int32_t decompose(int32_t *a0, int32_t a); + +#define make_hint DILITHIUM_NAMESPACE(make_hint) +unsigned int make_hint(int32_t a0, int32_t a1); + +#define use_hint DILITHIUM_NAMESPACE(use_hint) +int32_t use_hint(int32_t a, unsigned int hint); + +#endif diff --git a/crypto_sign/dilithium2/m4f/sign.c b/crypto_sign/dilithium2/m4f/sign.c new file mode 100644 index 0000000..d1c5222 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/sign.c @@ -0,0 +1,391 @@ +#include +#include "params.h" +#include "sign.h" +#include "packing.h" +#include "polyvec.h" +#include "poly.h" +#include "randombytes.h" +#include "symmetric.h" +#include "smallpoly.h" + +/************************************************* +* Name: crypto_sign_keypair +* +* Description: Generates public and private key. 
+* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + uint8_t seedbuf[2*SEEDBYTES + CRHBYTES]; + uint8_t tr[TRBYTES]; + const uint8_t *rho, *rhoprime, *key; + polyvecl mat[K]; + polyvecl s1, s1hat; + polyveck s2, t1, t0; + + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 2*SEEDBYTES + CRHBYTES, seedbuf, SEEDBYTES); + rho = seedbuf; + rhoprime = rho + SEEDBYTES; + key = rhoprime + CRHBYTES; + + /* Expand matrix */ + polyvec_matrix_expand(mat, rho); + + /* Sample short vectors s1 and s2 */ + polyvecl_uniform_eta(&s1, rhoprime, 0); + polyveck_uniform_eta(&s2, rhoprime, L); + + /* Matrix-vector multiplication */ + s1hat = s1; + polyvecl_ntt(&s1hat); + polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); + polyveck_reduce(&t1); + polyveck_invntt_tomont(&t1); + + /* Add error vector s2 */ + polyveck_add(&t1, &t1, &s2); + + /* Extract t1 and write public key */ + polyveck_caddq(&t1); + polyveck_power2round(&t1, &t0, &t1); + pack_pk(pk, rho, &t1); + + /* Compute H(rho, t1) and write secret key */ + shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + pack_sk(sk, rho, tr, key, &t0, &s1, &s2); + + return 0; +} + + +/************************************************* +* Name: crypto_sign_signature +* +* Description: Computes signature. 
+* +* Arguments: - uint8_t *sig: pointer to output signature (of length CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int crypto_sign_signature(uint8_t *sig, + size_t *siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) +{ + uint8_t seedbuf[2 * SEEDBYTES + TRBYTES + RNDBYTES + 2 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd; + uint16_t nonce = 0; + unsigned int n; + polyvecl mat[K], y, z; + polyveck t0, w1, w0; + poly cp; + shake256incctx state; + + smallpoly s1_prime[L]; + smallpoly s2_prime[K]; + smallpoly cp_small; + smallhalfpoly cp_small_prime; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + TRBYTES; + rnd = key + SEEDBYTES; + mu = rnd + RNDBYTES; + rhoprime = mu + CRHBYTES; + unpack_sk(rho, tr, key, &t0, s1_prime, s2_prime, sk); + + /* Compute mu = CRH(tr, msg) */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, TRBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + + for (n = 0; n < RNDBYTES; n++) { + rnd[n] = 0; + } + shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES); + + /* Expand matrix and transform vectors */ + polyvec_matrix_expand(mat, rho); + polyvecl_small_ntt(s1_prime); + polyveck_small_ntt(s2_prime); + + polyveck_ntt(&t0); + +rej: + /* Sample intermediate vector y */ + polyvecl_uniform_gamma1(&y, rhoprime, nonce++); + + /* Matrix-vector multiplication */ + z = y; + polyvecl_ntt(&z); + polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + polyveck_reduce(&w1); + polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + polyveck_caddq(&w1); + polyveck_decompose(&w1, &w0, &w1); + polyveck_pack_w1(sig, &w1); + + shake256_inc_init(&state); + 
shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, CTILDEBYTES, &state); + poly_challenge(&cp, sig); + + poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp); + poly_ntt(&cp); + + /* Compute z, reject if it reveals secret */ + polyvecl_small_basemul_invntt(&z, &cp_small, &cp_small_prime, s1_prime); + + polyvecl_add(&z, &z, &y); + polyvecl_reduce(&z); + if(polyvecl_chknorm(&z, GAMMA1 - BETA)) + goto rej; + + + /* Write signature */ + pack_sig_z(sig, &z); + unsigned int hint_n = 0; + unsigned int hints_written = 0; + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + for(unsigned int i = 0; i < K; ++i) { + poly *tmp = &z.vec[0]; + poly_small_basemul_invntt(tmp, &cp_small, &cp_small_prime, &s2_prime[i]); + + poly_sub(&w0.vec[i], &w0.vec[i], tmp); + poly_reduce(&w0.vec[i]); + if(poly_chknorm(&w0.vec[i], GAMMA2 - BETA)) + goto rej; + + /* Compute hints for w1 */ + poly_pointwise_montgomery(tmp, &cp, &t0.vec[i]); + + poly_invntt_tomont(tmp); + poly_reduce(tmp); + + if(poly_chknorm(tmp, GAMMA2)) + goto rej; + poly_add(&w0.vec[i], &w0.vec[i], tmp); + hint_n += poly_make_hint(tmp, &w0.vec[i], &w1.vec[i]); + if (hint_n > OMEGA) { + goto rej; + } + pack_sig_h(sig, tmp, i, &hints_written); + } + pack_sig_h_zero(sig, &hints_written); + *siglen = CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: crypto_sign +* +* Description: Compute signed message. 
+* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int crypto_sign(uint8_t *sm, + size_t *smlen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) +{ + size_t i; + + for(i = 0; i < mlen; ++i) + sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} +/************************************************* + * Name: expand_mat_elem + * + * Description: Implementation of ExpandA. Generates matrix A with uniformly + * random coefficients a_{i,j} by performing rejection + * sampling on the output stream of SHAKE128(rho|i|j). + * + * Arguments: - poly mat_elem: output matrix element + * - const unsigned char rho[]: byte array containing seed rho + * - k_idx: matrix row index + * - l_idx: matrix col index + **************************************************/ +static void expand_mat_elem(poly *mat_elem, const unsigned char rho[SEEDBYTES], size_t k_idx, size_t l_idx) +{ + poly_uniform(mat_elem, rho, (uint16_t)((k_idx << 8) + l_idx)); +} + +/************************************************* + * Name: crypto_sign_verify + * + * Description: Verifies signature. 
+ * + * Arguments: - const uint8_t *sig: pointer to input signature + * - size_t siglen: length of signature + * - const uint8_t *m: pointer to message + * - size_t mlen: length of message + * - const uint8_t *pk: pointer to bit-packed public key + * + * Returns 0 if signature could be verified correctly and -1 otherwise + **************************************************/ +int crypto_sign_verify(const uint8_t *sig, + size_t siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *pk) +{ + unsigned int i; + const uint8_t *rho = pk; + uint8_t mu[CRHBYTES]; + uint8_t c[CTILDEBYTES]; + uint8_t c2[CTILDEBYTES]; + poly cp; + polyvecl z; + shake256incctx state; + + poly tmp_elem, w1_elem; + + if (siglen != CRYPTO_BYTES) + return -1; + + if (unpack_sig_z(&z, sig) != 0) { + return -1; + } + if (polyvecl_chknorm(&z, GAMMA1 - BETA)) + return -1; + + /* Compute CRH(h(rho, t1), msg) */ + shake256(mu, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + + // Hash [mu || w1'] to get c. + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + if (unpack_sig_c(c, sig) != 0) { + return -1; + } + poly_challenge(&cp, c); + poly_ntt(&cp); + polyvecl_ntt(&z); + + + for (size_t k_idx = 0; k_idx < K; k_idx++) + { + // Sample the current element from A. + expand_mat_elem(&tmp_elem, rho, k_idx, 0); + poly_pointwise_montgomery(&w1_elem, &tmp_elem, &z.vec[0]); + + for (size_t l_idx = 1; l_idx < L; l_idx++) + { + // Sample the element from A. 
+ expand_mat_elem(&tmp_elem, rho, k_idx, l_idx); + poly_pointwise_acc_montgomery(&w1_elem, &tmp_elem, &z.vec[l_idx]); + } + + // Subtract c*(t1_{k_idx} * 2^d) + unpack_pk_t1(&tmp_elem, k_idx, pk); + poly_shiftl(&tmp_elem); + poly_ntt(&tmp_elem); + poly_pointwise_montgomery(&tmp_elem, &cp, &tmp_elem); + poly_sub(&w1_elem, &w1_elem, &tmp_elem); + poly_reduce(&w1_elem); + poly_invntt_tomont(&w1_elem); + + // Reconstruct w1 + poly_csubq(&w1_elem); + if (unpack_sig_h(&tmp_elem, k_idx, sig) != 0) { + return -1; + } + poly_use_hint(&w1_elem, &w1_elem, &tmp_elem); + uint8_t w1_packed[POLYW1_PACKEDBYTES]; + polyw1_pack(w1_packed, &w1_elem); + shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES); + } + + + /* Call random oracle and verify challenge */ + shake256_inc_finalize(&state); + shake256_inc_squeeze(c2, CTILDEBYTES, &state); + for (i = 0; i < CTILDEBYTES; ++i) + if (c[i] != c2[i]) + return -1; + + return 0; +} + +/************************************************* +* Name: crypto_sign_open +* +* Description: Verify signed message. 
+* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int crypto_sign_open(uint8_t *m, + size_t *mlen, + const uint8_t *sm, + size_t smlen, + const uint8_t *pk) +{ + size_t i; + + if(smlen < CRYPTO_BYTES) + goto badsig; + + *mlen = smlen - CRYPTO_BYTES; + if(crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk)) + goto badsig; + else { + /* All good, copy msg, return 0 */ + for(i = 0; i < *mlen; ++i) + m[i] = sm[CRYPTO_BYTES + i]; + return 0; + } + +badsig: + /* Signature verification failed */ + *mlen = -1; + for(i = 0; i < smlen; ++i) + m[i] = 0; + + return -1; +} diff --git a/crypto_sign/dilithium2/m4f/sign.h b/crypto_sign/dilithium2/m4f/sign.h new file mode 100644 index 0000000..42240b3 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/sign.h @@ -0,0 +1,37 @@ +#ifndef SIGN_H +#define SIGN_H + +#include +#include +#include "params.h" +#include "api.h" +#include "polyvec.h" +#include "poly.h" + +#define challenge DILITHIUM_NAMESPACE(challenge) +void challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +// #define crypto_sign_keypair DILITHIUM_NAMESPACE(crypto_sign_keypair) +// int crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +// #define crypto_sign_signature DILITHIUM_NAMESPACE(signature) +// int crypto_sign_signature(uint8_t *sig, size_t *siglen, +// const uint8_t *m, size_t mlen, +// const uint8_t *sk); + +// #define crypto_sign DILITHIUM_NAMESPACE(crypto_sign) +// int crypto_sign(uint8_t *sm, size_t *smlen, +// const uint8_t *m, size_t mlen, +// const uint8_t *sk); + +// #define crypto_sign_verify DILITHIUM_NAMESPACE(verify) +// int 
crypto_sign_verify(const uint8_t *sig, size_t siglen, +// const uint8_t *m, size_t mlen, +// const uint8_t *pk); + +// #define crypto_sign_open DILITHIUM_NAMESPACE(crypto_sign_open) +// int crypto_sign_open(uint8_t *m, size_t *mlen, +// const uint8_t *sm, size_t smlen, +// const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium2/m4f/smallntt.h b/crypto_sign/dilithium2/m4f/smallntt.h new file mode 100644 index 0000000..f39a9a9 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/smallntt.h @@ -0,0 +1,31 @@ +#ifndef SMALLNTT_H +#define SMALLNTT_H + +#include +#include "params.h" + + +#define SMALL_Q 257 +#define SMALL_Q_PRIME (16711935) // -q^-1 mod 2**32 + +static const int32_t twiddles_ntt_257_streamlined[] = {-60, -35, -46, -42, 99, 89, -118, 27, -82, 108, -71, 54, 93, -41, 115, 68, 117, 73, -84, -59, -79, 21, -78, 37, -55, -109, 101, 74, -110, 39, 17, -70, -92, -50, -29, 57, -116, 83, 43, 75, -85, -91, 86, -107, 87, 15, -23, -111, -100, -58, 114, 25, -97, -10, 126, -40, 63, -20, -5, -80, -120, 44, -67, -72, -124, -31, 18, -106, 103, 90, -102, 45, -51, -77, 53, -121, -81, -11, 113, 9, -62, 36, -65, -12, -3, -48, 127, -24, -6, -96, 34, 88, 123, -49, -13, 61, -52, 112, -7, -66, -28, -33, -14, 125, -56, 30, 95, -22, -98, -26, 122, -104, -38, -94, 105, -119, -76, 69, -47, 19}; +static const int32_t twiddles_intt_257_streamlined[] = { -19, 47, -69, 76, 119, -105, 94, 38, 104, -122, 26, 98, 22, -95, -30, 56, -125, 14, 33, 28, 66, 7, -112, 52, -61, 13, 49, -123, -88, -34, 96, 6, 24, -127, 48, 3, 12, 65, -36, 62, -9, -113, 11, 81, 121, -53, 77, 51, -45, 102, -90, -103, 106, -18, 31, 124, 72, 67, -44, 120, 80, 5, 20, -63, 40, -126, 10, 97, -25, -114, 58, 100, 111, 23, -15, -87, 107, -86, 91, 85, -75, -43, -83, 116, -57, 29, 50, 92, 70, -17, -39, 110, -74, -101, 109, 55, -37, 78, -21, 79, 59, 84, -73, -117, -68, -115, 41, -93, -54, 71, -108, 82, -27, 118, -89, -99, 42, 46, 35, 60}; +static const int32_t twiddles_basemul_257[] = {27, -82, 108, -71, 54, 93, -41, 115, -78, 
37, -55, -109, 101, 74, -110, 39, 83, 43, 75, -85, -91, 86, -107, 87, -97, -10, 126, -40, 63, -20, -5, -80, -106, 103, 90, -102, 45, -51, -77, 53, -65, -12, -3, -48, 127, -24, -6, -96, 112, -7, -66, -28, -33, -14, 125, -56, -38, -94, 105, -119, -76, 69, -47, 19}; + + +// inputs in [-2, 2]; outputs in [-128, +128] +void __asm_fnt_257(int32_t *p, const int32_t twiddles[112], int32_t qprime, int32_t q); + +void __asm_point_mul_257_16(int16_t p_prime[128], const int32_t p[256], int32_t qprime, int32_t q, const int32_t twiddles[64]); +void __asm_asymmetric_mul_257_16(int32_t c[256], const int32_t a[256], const int32_t b[256], const int16_t b_prime[128]); + +// inputs in [-32768, 32768] outputs in [-128, +128] +void __asm_ifnt_257(int32_t *p, const int32_t twiddles[112], int32_t qprime, int32_t q); + +#define small_ntt(a) __asm_fnt_257(a, twiddles_ntt_257_streamlined, SMALL_Q_PRIME, SMALL_Q) +#define small_invntt_tomont(a) __asm_ifnt_257(a, twiddles_intt_257_streamlined, SMALL_Q_PRIME, SMALL_Q) + +#define small_point_mul(b_prime, b) __asm_point_mul_257_16(b_prime, b, SMALL_Q_PRIME, SMALL_Q, twiddles_basemul_257); +#define small_asymmetric_mul(c, a, b, b_prime) __asm_asymmetric_mul_257_16(c, a, b, b_prime); + +#endif diff --git a/crypto_sign/dilithium2/m4f/smallpoly.c b/crypto_sign/dilithium2/m4f/smallpoly.c new file mode 100644 index 0000000..9e1f6c8 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/smallpoly.c @@ -0,0 +1,84 @@ +#include "smallpoly.h" +#include "smallntt.h" + +void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) { + for (int i = 0; i < N; i++) + { + out->coeffs[i] = in->coeffs[i]; + } + small_ntt(out->coeffs); + small_point_mul(out2->coeffs, out->coeffs); +} + + +void polyvecl_small_ntt(smallpoly v[L]) { + unsigned int i; + + for(i = 0; i < L; ++i) + small_ntt(v[i].coeffs); +} + + +void polyveck_small_ntt(smallpoly v[K]) { + unsigned int i; + + for(i = 0; i < K; ++i) + small_ntt(v[i].coeffs); +} + + + +void 
poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b){ + // re-use the buffer + smallpoly *tmp = (smallpoly *)r; + small_asymmetric_mul(tmp->coeffs, b->coeffs, a->coeffs, aprime->coeffs); + small_invntt_tomont(tmp->coeffs); + + #ifdef SMALL_POLY_16_BIT + int j; + // buffer is the same, so we need to be careful + for(j=N-1;j>=0;j--){ + r->coeffs[j] = tmp->coeffs[j]; + } + #endif +} + +void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]){ + unsigned int i; + for(i=0;i<L;i++){ + poly_small_basemul_invntt(&r->vec[i], a, aprime, &b[i]); + } +} + +void small_polyeta_unpack(smallpoly *r, const uint8_t *a) { + unsigned int i; + +#if ETA == 2 + for(i = 0; i < N/8; ++i) { + r->coeffs[8*i+0] = (a[3*i+0] >> 0) & 7; + r->coeffs[8*i+1] = (a[3*i+0] >> 3) & 7; + r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7; + r->coeffs[8*i+3] = (a[3*i+1] >> 1) & 7; + r->coeffs[8*i+4] = (a[3*i+1] >> 4) & 7; + r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7; + r->coeffs[8*i+6] = (a[3*i+2] >> 2) & 7; + r->coeffs[8*i+7] = (a[3*i+2] >> 5) & 7; + + r->coeffs[8*i+0] = ETA - r->coeffs[8*i+0]; + r->coeffs[8*i+1] = ETA - r->coeffs[8*i+1]; + r->coeffs[8*i+2] = ETA - r->coeffs[8*i+2]; + r->coeffs[8*i+3] = ETA - r->coeffs[8*i+3]; + r->coeffs[8*i+4] = ETA - r->coeffs[8*i+4]; + r->coeffs[8*i+5] = ETA - r->coeffs[8*i+5]; + r->coeffs[8*i+6] = ETA - r->coeffs[8*i+6]; + r->coeffs[8*i+7] = ETA - r->coeffs[8*i+7]; + } +#elif ETA == 4 + for(i = 0; i < N/2; ++i) { + r->coeffs[2*i+0] = a[i] & 0x0F; + r->coeffs[2*i+1] = a[i] >> 4; + r->coeffs[2*i+0] = ETA - r->coeffs[2*i+0]; + r->coeffs[2*i+1] = ETA - r->coeffs[2*i+1]; + } +#endif +} diff --git a/crypto_sign/dilithium2/m4f/smallpoly.h b/crypto_sign/dilithium2/m4f/smallpoly.h new file mode 100644 index 0000000..caa2626 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/smallpoly.h @@ -0,0 +1,39 @@ +#ifndef SMALLPOLY_H +#define SMALLPOLY_H +#include "params.h" +#include "poly.h" 
+#include "polyvec.h" + + + +#if DILITHIUM_MODE == 3 // use q=769 +#define SMALL_POLY_16_BIT +typedef struct { + int16_t coeffs[N]; +} smallpoly; + +typedef smallpoly smallhalfpoly; + +#else // use q=257 +#define SMALL_POLY_32_BIT +typedef struct { + int32_t coeffs[N]; +} smallpoly; + +typedef struct { + int16_t coeffs[N]; +} smallhalfpoly; +#endif + + +void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in); +void polyvecl_small_ntt(smallpoly v[L]); +void polyveck_small_ntt(smallpoly v[K]); + + +void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]); +void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b); + +void small_polyeta_unpack(smallpoly *r, const uint8_t *a); + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4f/symmetric-shake.c b/crypto_sign/dilithium2/m4f/symmetric-shake.c new file mode 100644 index 0000000..963f649 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/symmetric-shake.c @@ -0,0 +1,28 @@ +#include +#include "params.h" +#include "symmetric.h" +#include "fips202.h" + +void dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) +{ + uint8_t t[2]; + t[0] = nonce; + t[1] = nonce >> 8; + + shake128_inc_init(state); + shake128_inc_absorb(state, seed, SEEDBYTES); + shake128_inc_absorb(state, t, 2); + shake128_inc_finalize(state); +} + +void dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) +{ + uint8_t t[2]; + t[0] = nonce; + t[1] = nonce >> 8; + + shake256_inc_init(state); + shake256_inc_absorb(state, seed, CRHBYTES); + shake256_inc_absorb(state, t, 2); + shake256_inc_finalize(state); +} diff --git a/crypto_sign/dilithium2/m4f/symmetric.h b/crypto_sign/dilithium2/m4f/symmetric.h new file mode 100644 index 0000000..4703737 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/symmetric.h @@ -0,0 +1,65 @@ 
+#ifndef SYMMETRIC_H +#define SYMMETRIC_H + +#include +#include "params.h" + +#ifdef DILITHIUM_USE_AES + +#include "aes256ctr.h" +#include "fips202.h" + +typedef aes256ctr_ctx stream128_state; +typedef aes256ctr_ctx stream256_state; + +#define dilithium_aes256ctr_init DILITHIUM_NAMESPACE(dilithium_aes256ctr_init) +void dilithium_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES +#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define stream128_init(STATE, SEED, NONCE) \ + dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_init(STATE, SEED, NONCE) \ + dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) + +#else + +#include "fips202.h" +typedef shake128incctx stream128_state; +typedef shake256incctx stream256_state; + +#define shake256_inc_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_inc_squeeze(OUT, OUTBLOCKS*SHAKE256_RATE, STATE) + +#define dilithium_shake128_stream_init DILITHIUM_NAMESPACE(dilithium_shake128_stream_init) +void dilithium_shake128_stream_init(stream128_state *state, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); + +#define dilithium_shake256_stream_init DILITHIUM_NAMESPACE(dilithium_shake256_stream_init) +void dilithium_shake256_stream_init(stream256_state *state, + const uint8_t seed[CRHBYTES], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES SHAKE128_RATE +#define STREAM256_BLOCKBYTES SHAKE256_RATE + +#define stream128_init(STATE, SEED, NONCE) \ + dilithium_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_inc_squeeze(OUT, OUTBLOCKS*SHAKE128_RATE, STATE) +#define stream256_init(STATE, SEED, NONCE) \ + dilithium_shake256_stream_init(STATE, SEED, NONCE) +#define 
stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_inc_squeeze(OUT, OUTBLOCKS*SHAKE256_RATE, STATE) + +#endif + +#endif diff --git a/crypto_sign/dilithium2/m4f/vector.h b/crypto_sign/dilithium2/m4f/vector.h new file mode 100644 index 0000000..183ddc8 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/vector.h @@ -0,0 +1,22 @@ +#ifndef VECTOR_H +#define VECTOR_H + +#include +#include "params.h" + +#define asm_reduce32 DILITHIUM_NAMESPACE(asm_reduce32) +void asm_reduce32(int32_t a[N]); +#define small_asm_reduce32_central DILITHIUM_NAMESPACE(small_asm_reduce32_central) +void small_asm_reduce32_central(int32_t a[N]); +#define asm_caddq DILITHIUM_NAMESPACE(asm_caddq) +void asm_caddq(int32_t a[N]); +#define asm_csubq DILITHIUM_NAMESPACE(asm_csubq) +void asm_csubq(int32_t a[N]); +#define asm_freeze DILITHIUM_NAMESPACE(asm_freeze) +void asm_freeze(int32_t a[N]); +#define asm_rej_uniform DILITHIUM_NAMESPACE(asm_rej_uniform) +unsigned int asm_rej_uniform(int32_t *a, + unsigned int len, + const unsigned char *buf, + unsigned int buflen); +#endif diff --git a/crypto_sign/dilithium2/m4f/vector.s b/crypto_sign/dilithium2/m4f/vector.s new file mode 100644 index 0000000..a393c91 --- /dev/null +++ b/crypto_sign/dilithium2/m4f/vector.s @@ -0,0 +1,263 @@ +.syntax unified +.thumb +.macro redq a, tmp, q + add \tmp, \a, #4194304 + asrs \tmp, \tmp, #23 + mls \a, \tmp, \q, \a +.endm + +// void asm_reduce32(int32_t a[N]); +.global pqcrystals_dilithium_asm_reduce32 +.type pqcrystals_dilithium_asm_reduce32, %function +.align 2 +pqcrystals_dilithium_asm_reduce32: + push {r4-r10} + + movw r12,#:lower16:8380417 + movt r12,#:upper16:8380417 + movw r10, #32 + 1: + ldr.w r1, [r0] + ldr.w r2, [r0, #1*4] + ldr.w r3, [r0, #2*4] + ldr.w r4, [r0, #3*4] + ldr.w r5, [r0, #4*4] + ldr.w r6, [r0, #5*4] + ldr.w r7, [r0, #6*4] + ldr.w r8, [r0, #7*4] + + redq r1, r9, r12 + redq r2, r9, r12 + redq r3, r9, r12 + redq r4, r9, r12 + redq r5, r9, r12 + redq r6, r9, r12 + redq r7, r9, r12 + redq r8, r9, r12 + + 
str.w r2, [r0, #1*4] + str.w r3, [r0, #2*4] + str.w r4, [r0, #3*4] + str.w r5, [r0, #4*4] + str.w r6, [r0, #5*4] + str.w r7, [r0, #6*4] + str.w r8, [r0, #7*4] + str r1, [r0], #8*4 + subs r10, #1 + bne.w 1b + + pop {r4-r10} + bx lr +.size pqcrystals_dilithium_asm_reduce32, .-pqcrystals_dilithium_asm_reduce32 + +.macro barrett_32 a, Qbar, Q, tmp + smmulr.w \tmp, \a, \Qbar + mls.w \a, \tmp, \Q, \a +.endm + +// INPUT: target (signed), Q (signed) +// OUTPUT: target adjusted to be between -Q/2 and Q/2 +.macro central_reduce target, Q + cmp \target, \Q, lsr #1 + it hi + subhi \target, \Q + cmn \target, \Q, lsr #1 + it lt + addlt \target, \Q +.endm + +// void small_asm_reduce32_central(int32_t a[N]); +.global pqcrystals_dilithium_small_asm_reduce32_central +.type pqcrystals_dilithium_small_asm_reduce32_central, %function +.align 2 +pqcrystals_dilithium_small_asm_reduce32_central: + push {r4-r12, lr} + + + movw r9, #:lower16:5585133 + movt r9, #:upper16:5585133 + mov.w r10,#769 + + movw r12, #32 + 1: + ldr.w r1, [r0] + ldr.w r2, [r0, #1*4] + ldr.w r3, [r0, #2*4] + ldr.w r4, [r0, #3*4] + ldr.w r5, [r0, #4*4] + ldr.w r6, [r0, #5*4] + ldr.w r7, [r0, #6*4] + ldr.w r8, [r0, #7*4] + + barrett_32 r1, r9, r10, r11 + barrett_32 r2, r9, r10, r11 + barrett_32 r3, r9, r10, r11 + barrett_32 r4, r9, r10, r11 + barrett_32 r5, r9, r10, r11 + barrett_32 r6, r9, r10, r11 + barrett_32 r7, r9, r10, r11 + barrett_32 r8, r9, r10, r11 + + + str.w r2, [r0, #1*4] + str.w r3, [r0, #2*4] + str.w r4, [r0, #3*4] + str.w r5, [r0, #4*4] + str.w r6, [r0, #5*4] + str.w r7, [r0, #6*4] + str.w r8, [r0, #7*4] + str r1, [r0], #8*4 + subs r12, #1 + bne.w 1b + + pop {r4-r12, pc} + +.size pqcrystals_dilithium_small_asm_reduce32_central, .-pqcrystals_dilithium_small_asm_reduce32_central + +.macro caddq a, tmp, q + and \tmp, \q, \a, asr #31 + add \a, \a, \tmp +.endm + +.macro freezeq a, tmp, q + redq \a, \tmp, \q + caddq \a, \tmp, \q +.endm + +// void asm_caddq(int32_t a[N]); +.global 
pqcrystals_dilithium_asm_caddq +.type pqcrystals_dilithium_asm_caddq, %function +.align 2 +pqcrystals_dilithium_asm_caddq: + push {r4-r10} + + movw r12,#:lower16:8380417 + movt r12,#:upper16:8380417 + + movw r10, #32 + 1: + ldr.w r1, [r0] + ldr.w r2, [r0, #1*4] + ldr.w r3, [r0, #2*4] + ldr.w r4, [r0, #3*4] + ldr.w r5, [r0, #4*4] + ldr.w r6, [r0, #5*4] + ldr.w r7, [r0, #6*4] + ldr.w r8, [r0, #7*4] + + caddq r1, r9, r12 + caddq r2, r9, r12 + caddq r3, r9, r12 + caddq r4, r9, r12 + caddq r5, r9, r12 + caddq r6, r9, r12 + caddq r7, r9, r12 + caddq r8, r9, r12 + + str.w r2, [r0, #1*4] + str.w r3, [r0, #2*4] + str.w r4, [r0, #3*4] + str.w r5, [r0, #4*4] + str.w r6, [r0, #5*4] + str.w r7, [r0, #6*4] + str.w r8, [r0, #7*4] + str r1, [r0], #8*4 + subs r10, #1 + bne.w 1b + + pop {r4-r10} + bx lr +.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq + +.macro csubq a, tmp, q + cmp.n \a, \q + it ge + subge.w \a, \a, \q + cmp \a, #0 + it mi + addmi.w \a, \a, \q +.endm + +// void asm_csubq(int32_t a[N]); +.global pqcrystals_dilithium_asm_csubq +.type pqcrystals_dilithium_asm_csubq, %function +.align 2 +pqcrystals_dilithium_asm_csubq: + push {r4-r10} + + movw r12,#:lower16:8380417 + movt r12,#:upper16:8380417 + + movw r10, #32 + 1: + ldr.w r1, [r0] + ldr.w r2, [r0, #1*4] + ldr.w r3, [r0, #2*4] + ldr.w r4, [r0, #3*4] + ldr.w r5, [r0, #4*4] + ldr.w r6, [r0, #5*4] + ldr.w r7, [r0, #6*4] + ldr.w r8, [r0, #7*4] + + csubq r1, r9, r12 + csubq r2, r9, r12 + csubq r3, r9, r12 + csubq r4, r9, r12 + csubq r5, r9, r12 + csubq r6, r9, r12 + csubq r7, r9, r12 + csubq r8, r9, r12 + + str.w r2, [r0, #1*4] + str.w r3, [r0, #2*4] + str.w r4, [r0, #3*4] + str.w r5, [r0, #4*4] + str.w r6, [r0, #5*4] + str.w r7, [r0, #6*4] + str.w r8, [r0, #7*4] + str r1, [r0], #8*4 + subs r10, #1 + bne.w 1b + + pop {r4-r10} + bx lr +.size pqcrystals_dilithium_asm_csubq, .-pqcrystals_dilithium_asm_csubq + +// asm_rej_uniform(int32_t *a,unsigned int len,const unsigned char *buf, unsigned int buflen); 
+.global pqcrystals_dilithium_asm_rej_uniform +.type pqcrystals_dilithium_asm_rej_uniform, %function +.align 2 +pqcrystals_dilithium_asm_rej_uniform: + push.w {r4-r6} + push.w {r1} + // Store Q-1 in r12. + movw r12,#:lower16:8380416 + movt r12,#:upper16:8380416 + + add.w r6, r0, r1, lsl #2 + add.w r3, r2, r3 + sub.w r3, r3, #2 + +1: + // If there are less than 3 bytes available, return. + cmp.w r3, r2 + ble.w end + + ldr r5, [r2], #3 + ubfx r5, r5, #0, #23 + + cmp.n r5, r12 + it le + strle r5, [r0], #4 + + cmp.n r0, r6 + bne.n 1b + +end: + pop.w {r5} + + sub.w r0, r6, r0 + sub.w r0, r5, r0, lsr #2 + pop.w {r4-r6} + bx lr +.size pqcrystals_dilithium_asm_rej_uniform, .-pqcrystals_dilithium_asm_rej_uniform diff --git a/crypto_sign/dilithium2/m4fstack/api.h b/crypto_sign/dilithium2/m4fstack/api.h new file mode 120000 index 0000000..d29362d --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/api.h @@ -0,0 +1 @@ +../m4f/api.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/config.h b/crypto_sign/dilithium2/m4fstack/config.h new file mode 120000 index 0000000..f3892d9 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/config.h @@ -0,0 +1 @@ +../m4f/config.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/macros.i b/crypto_sign/dilithium2/m4fstack/macros.i new file mode 120000 index 0000000..d615b85 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/macros.i @@ -0,0 +1 @@ +../m4f/macros.i \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/macros_smallntt.i b/crypto_sign/dilithium2/m4fstack/macros_smallntt.i new file mode 100644 index 0000000..7c9a387 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/macros_smallntt.i @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * NTT and inverse NTT code from: + * Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. + * IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24. + * DOI:https://doi.org/10.46586/tches.v2024.i2.1-24. + * https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S +*/ + +#ifndef MACROS_SMALLNTT_I +#define MACROS_SMALLNTT_I + +// general macros +.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + ldr.w \a0, [\a, \mem0] + ldr.w \a1, [\a, \mem1] + ldr.w \a2, [\a, \mem2] + ldr.w \a3, [\a, \mem3] +.endm + +.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + str.w \a0, [\a, \mem0] + str.w \a1, [\a, \mem1] + str.w \a2, [\a, \mem2] + str.w \a3, [\a, \mem3] +.endm + +.macro doubleplant a, tmp, q, qa, plantconst + smulwb \tmp, \plantconst, \a + smulwt \a, \plantconst, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebarrett a, tmp, tmp2, q, barrettconst + smulbb \tmp, \a, \barrettconst + smultb \tmp2, \a, \barrettconst + asr \tmp, \tmp, #26 + asr \tmp2, \tmp2, #26 + smulbb \tmp, \tmp, \q + smulbb \tmp2, \tmp2, \q + pkhbt \tmp, \tmp, \tmp2, lsl#16 + usub16 \a, \a, \tmp +.endm + +// q locate in the top half of the register +.macro plant_red q, qa, qinv, tmp + mul \tmp, \tmp, \qinv + //tmp*qinv mod 2^2n/ 2^n; in high half + smlatt \tmp, \tmp, \q, \qa + // result in high half +.endm + +.macro mul_twiddle_plant a, twiddle, tmp, q, qa + 
smulwb \tmp, \twiddle, \a + smulwt \a, \twiddle, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa + smulwb \tmp, \twiddle, \a1 + smulwt \a1, \twiddle, \a1 + smlabt \tmp, \tmp, \q, \qa + smlabt \a1, \a1, \q, \qa + pkhtb \tmp, \a1, \tmp, asr#16 + usub16 \a1, \a0, \tmp + uadd16 \a0, \a0, \tmp +.endm + +.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa + doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa + doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa +.endm + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/ntt.S b/crypto_sign/dilithium2/m4fstack/ntt.S new file mode 120000 index 0000000..40cd5d4 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/ntt.S @@ -0,0 +1 @@ +../m4f/ntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/ntt.h b/crypto_sign/dilithium2/m4fstack/ntt.h new file mode 120000 index 0000000..8e99cae --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/ntt.h @@ -0,0 +1 @@ +../m4f/ntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/packing.c b/crypto_sign/dilithium2/m4fstack/packing.c new file mode 120000 index 0000000..1052fe2 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/packing.c @@ -0,0 +1 @@ +../m4f/packing.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/packing.h b/crypto_sign/dilithium2/m4fstack/packing.h new file mode 120000 index 0000000..643cc32 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/packing.h @@ -0,0 +1 @@ +../m4f/packing.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/params.h b/crypto_sign/dilithium2/m4fstack/params.h new file mode 120000 index 0000000..1f91a36 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/params.h @@ -0,0 +1 @@ +../m4f/params.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/pointwise_mont.h 
b/crypto_sign/dilithium2/m4fstack/pointwise_mont.h new file mode 120000 index 0000000..3255885 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/pointwise_mont.h @@ -0,0 +1 @@ +../m4f/pointwise_mont.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/pointwise_mont.s b/crypto_sign/dilithium2/m4fstack/pointwise_mont.s new file mode 120000 index 0000000..3597ffd --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/pointwise_mont.s @@ -0,0 +1 @@ +../m4f/pointwise_mont.s \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/poly.c b/crypto_sign/dilithium2/m4fstack/poly.c new file mode 120000 index 0000000..2544e75 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/poly.c @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/poly.h b/crypto_sign/dilithium2/m4fstack/poly.h new file mode 120000 index 0000000..7ef70e5 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/poly.h @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/polyvec.c b/crypto_sign/dilithium2/m4fstack/polyvec.c new file mode 120000 index 0000000..569a9a1 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/polyvec.c @@ -0,0 +1 @@ +../m4f/polyvec.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/polyvec.h b/crypto_sign/dilithium2/m4fstack/polyvec.h new file mode 120000 index 0000000..d02c99c --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/polyvec.h @@ -0,0 +1 @@ +../m4f/polyvec.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/reduce.h b/crypto_sign/dilithium2/m4fstack/reduce.h new file mode 100644 index 0000000..5990918 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/reduce.h @@ -0,0 +1,79 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include +#include "params.h" + +#define MONT -4186625 // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 + +#define montgomery_reduce 
DILITHIUM_NAMESPACE(montgomery_reduce) +/************************************************* +* Name: montgomery_reduce +* +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. +* +* Arguments: - int64_t: finite field element a +* +* Returns r. +**************************************************/ +static inline int32_t montgomery_reduce(int64_t a) { + int32_t t; + + t = (int64_t)(int32_t)a*QINV; + t = (a - (int64_t)t*Q) >> 32; + return t; +} + +/************************************************* +* Name: reduce32 +* +* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, +* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +static int32_t reduce32(int32_t a) { + int32_t t; + + t = (a + (1 << 22)) >> 23; + t = a - t*Q; + return t; +} + +/************************************************* +* Name: caddq +* +* Description: Add Q if input coefficient is negative. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +static int32_t caddq(int32_t a) { + a += (a >> 31) & Q; + return a; +} + +/************************************************* +* Name: freeze +* +* Description: For finite field element a, compute standard +* representative r = a mod^+ Q. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. 
+**************************************************/ +static int32_t freeze(int32_t a) { + a = reduce32(a); + a = caddq(a); + return a; +} + + + +#endif diff --git a/crypto_sign/dilithium2/m4fstack/rounding.c b/crypto_sign/dilithium2/m4fstack/rounding.c new file mode 120000 index 0000000..ec78068 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/rounding.c @@ -0,0 +1 @@ +../m4f/rounding.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/rounding.h b/crypto_sign/dilithium2/m4fstack/rounding.h new file mode 120000 index 0000000..e64114b --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/rounding.h @@ -0,0 +1 @@ +../m4f/rounding.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/sign.c b/crypto_sign/dilithium2/m4fstack/sign.c new file mode 100644 index 0000000..bf0b939 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/sign.c @@ -0,0 +1,484 @@ +#include +#include "params.h" +#include "sign.h" +#include "packing.h" +#include "polyvec.h" +#include "poly.h" +#include "randombytes.h" +#include "symmetric.h" +#include "smallpoly.h" +#include "stack.h" + +#include "smallntt.h" + +/************************************************* +* Name: crypto_sign_keypair +* +* Description: Generates public and private key. 
+* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i, j; + uint8_t seedbuf[2*SEEDBYTES + CRHBYTES]; + const uint8_t *rho, *rhoprime, *key; + + poly tA, tB; + + union { + uint8_t tr[TRBYTES]; + shake256incctx s256; + poly tC; + } data; + + shake256incctx *s256 = &data.s256; + uint8_t *tr = data.tr; + poly *tC = &data.tC; + + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256_inc_init(s256); + shake256_inc_absorb(s256, seedbuf, SEEDBYTES); + shake256_inc_finalize(s256); + shake256_inc_squeeze(seedbuf, 2*SEEDBYTES + CRHBYTES, s256); + + rho = seedbuf; + rhoprime = rho + SEEDBYTES; + key = rhoprime + CRHBYTES; + + pack_sk_rho(sk, rho); + pack_sk_key(sk, key); + pack_pk_rho(pk, rho); + + /* Matrix-vector multiplication */ + for (i = 0; i < K; i++) + { + /* Expand part of s1 */ + poly_uniform_eta(tC, rhoprime, 0); + if (i == 0) + { + pack_sk_s1(sk, tC, 0); + } + poly_ntt(tC); + /* expand part of the matrix */ + poly_uniform(&tB, rho, (i << 8) + 0); + /* partial matrix-vector multiplication */ + poly_pointwise_montgomery(&tA, &tB, tC); + for(j = 1; j < L; j++) + { + /* Expand part of s1 */ + poly_uniform_eta(tC, rhoprime, j); + if (i == 0) + { + pack_sk_s1(sk, tC, j); + } + poly_ntt(tC); + poly_uniform(&tB, rho, (i << 8) + j); + poly_pointwise_acc_montgomery(&tA, &tB, tC); + } + + poly_reduce(&tA); + poly_invntt_tomont(&tA); + + /* Add error vector s2 */ + /* Sample short vector s2 */ + poly_uniform_eta(&tB, rhoprime, L + i); + pack_sk_s2(sk, &tB, i); + poly_add(&tA, &tA, &tB); + + /* Compute t{0,1} */ + poly_caddq(&tA); + poly_power2round(tC, &tB, &tA); + pack_sk_t0(sk, &tB, i); + pack_pk_t1(pk, tC, i); + + } + + /* Compute 
H(rho, t1) and write secret key */ + shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); + pack_sk_tr(sk, tr); + + return 0; +} + + +/************************************************* +* Name: crypto_sign_signature +* +* Description: Computes signature. +* +* Arguments: - uint8_t *sig: pointer to output signature (of length CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int crypto_sign_signature(uint8_t *sig, + size_t *siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) +{ + uint8_t buf[2 * CRHBYTES]; + uint8_t *mu, *rhoprime, *rnd; + const uint8_t *rho, *tr, *key; + uint16_t nonce = 0; + unsigned int n; + uint8_t wcomp[K][768]; + uint8_t ccomp[68]; + + union { + shake128incctx s128; + shake256incctx s256; + } state; + + union { + poly full; + struct { + smallpoly stmp0; + smallpoly stmp1; + } small; + } polybuffer; + + poly *tmp0 = &polybuffer.full; + smallpoly *stmp0 = &polybuffer.small.stmp0; + smallpoly *scp = &polybuffer.small.stmp1; + + rho = sk; + tr = sk + SEEDBYTES*2; + key = sk + SEEDBYTES; + + mu = buf; + rnd = mu + CRHBYTES; + rhoprime = mu + CRHBYTES; + unpack_sk_stack((uint8_t*)rho, (uint8_t*)tr, (uint8_t*)key, sk); + + /* Compute mu = CRH(tr, msg) */ + shake256_inc_init(&state.s256); + shake256_inc_absorb(&state.s256, tr, TRBYTES); + shake256_inc_absorb(&state.s256, m, mlen); + shake256_inc_finalize(&state.s256); + shake256_inc_squeeze(mu, CRHBYTES, &state.s256); + + // Note: RNDBYTES < CRHBYTES, so buffer has proper size + for (n = 0; n < RNDBYTES; n++) { + rnd[n] = 0; + } + + shake256_inc_init(&state.s256); + shake256_inc_absorb(&state.s256, key, SEEDBYTES); + shake256_inc_absorb(&state.s256, rnd, RNDBYTES); + shake256_inc_absorb(&state.s256, mu, CRHBYTES); + shake256_inc_finalize(&state.s256); + // 
rnd can be overwritten here + shake256_inc_squeeze(rhoprime, CRHBYTES, &state.s256); + +rej: + for (size_t k_idx = 0; k_idx < K; k_idx++) { + for(size_t i=0;i<768;i++){ + wcomp[k_idx][i] = 0; + } + } + + for (size_t l_idx = 0; l_idx < L; l_idx++) { + /* Sample intermediate vector y */ + poly_uniform_gamma1_stack(tmp0, rhoprime, L*nonce + l_idx, &state.s256); + poly_ntt(tmp0); + + /* Matrix-vector multiplication */ + for (size_t k_idx = 0; k_idx < K; k_idx++) { + // sampling of y and packing into wcomp inlined into the basemul + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], tmp0, rho, (k_idx << 8) + l_idx, &state.s128); + } + } + nonce++; + for (size_t k_idx = 0; k_idx < K; k_idx++) { + polyw_unpack(tmp0, wcomp[k_idx]); + poly_invntt_tomont(tmp0); + poly_caddq(tmp0); + + polyw_pack(wcomp[k_idx], tmp0); + poly_highbits(tmp0, tmp0); + polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], tmp0); + } + + shake256_inc_init(&state.s256); + shake256_inc_absorb(&state.s256, mu, CRHBYTES); + shake256_inc_absorb(&state.s256, sig, K*POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state.s256); + shake256_inc_squeeze(sig, CTILDEBYTES, &state.s256); + poly_challenge(tmp0, sig); + + poly_challenge_compress(ccomp, tmp0); + + /* Compute z, reject if it reveals secret */ + for(size_t l_idx=0;l_idx < L; l_idx++){ + if(l_idx != 0){ + poly_challenge_decompress(tmp0, ccomp); + } + poly_small_ntt_copy(scp, tmp0); + unpack_sk_s1(stmp0, sk, l_idx); + small_ntt(stmp0->coeffs); + poly_small_basemul_invntt(tmp0, scp, stmp0); + + poly_uniform_gamma1_add_stack(tmp0, tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256); + + poly_reduce(tmp0); + + if(poly_chknorm(tmp0, GAMMA1 - BETA)) + goto rej; + + polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, tmp0); + } + + + /* Write signature */ + unsigned int hint_n = 0; + unsigned int hints_written = 0; + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + + for(unsigned int k_idx = 
0; k_idx < K; ++k_idx) { + poly_challenge_decompress(tmp0, ccomp); + poly_small_ntt_copy(scp, tmp0); + + unpack_sk_s2(stmp0, sk, k_idx); + small_ntt(stmp0->coeffs); + poly_small_basemul_invntt(tmp0, scp, stmp0); + + polyw_sub(tmp0, wcomp[k_idx], tmp0); + poly_reduce(tmp0); + + polyw_pack(wcomp[k_idx], tmp0); + + poly_lowbits(tmp0, tmp0); + poly_reduce(tmp0); + if(poly_chknorm(tmp0, GAMMA2 - BETA)){ + goto rej; + } + + poly_schoolbook(tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES + + L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES); + + /* Compute hints for w1 */ + + if(poly_chknorm(tmp0, GAMMA2)) { + goto rej; + } + + hint_n += poly_make_hint_stack(tmp0, tmp0, wcomp[k_idx]); + + if (hint_n > OMEGA) { + goto rej; + } + pack_sig_h(sig, tmp0, k_idx, &hints_written); + } + pack_sig_h_zero(sig, &hints_written); + *siglen = CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int crypto_sign(uint8_t *sm, + size_t *smlen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) +{ + size_t i; + + /* Copy the message back-to-front so it is not clobbered when sm and m overlap (sm may equal m) */ for(i = 0; i < mlen; ++i) + sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} + +/************************************************* +* Name: crypto_sign_verify +* +* Description: Verifies signature.
+* +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ +int crypto_sign_verify(const uint8_t *sig, + size_t siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *pk) +{ + unsigned int i; + + poly p; + + union { + uint8_t w1_packed[POLYW1_PACKEDBYTES]; + uint8_t wcomp[768]; + } w1_packed_comp; + uint8_t *w1_packed = w1_packed_comp.w1_packed; + uint8_t *wcomp = w1_packed_comp.wcomp; + + union { + uint8_t ccomp[68]; + uint8_t mu[CRHBYTES]; + } ccomp_mu; + uint8_t *ccomp = ccomp_mu.ccomp; + uint8_t *mu = ccomp_mu.mu; + + shake256incctx s256; + + union { + uint8_t hint_ones[OMEGA]; + shake128incctx s128; + uint8_t c2[CTILDEBYTES]; + } shake_hint; + + uint8_t *hint_ones = shake_hint.hint_ones; + shake128incctx *s128 = &shake_hint.s128; + uint8_t *c2 = shake_hint.c2; + + if(siglen != CRYPTO_BYTES) + return -1; + + /* Compute CRH(h(rho, t1), msg) */ + shake256_inc_init(&s256); + shake256_inc_absorb(&s256, pk, CRYPTO_PUBLICKEYBYTES); + shake256_inc_finalize(&s256); + shake256_inc_squeeze(mu, CRHBYTES, &s256); + + shake256_inc_init(&s256); + shake256_inc_absorb(&s256, mu, TRBYTES); + shake256_inc_absorb(&s256, m, mlen); + shake256_inc_finalize(&s256); + shake256_inc_squeeze(mu, CRHBYTES, &s256); + + shake256_inc_init(&s256); + shake256_inc_absorb(&s256, mu, CRHBYTES); + + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + poly_challenge_stack(&p, sig); + /* mu has already been absorbed into s256 above, so the union storage it shares with ccomp can be overwritten here */ poly_challenge_compress(ccomp, &p); + + for (size_t k_idx = 0; k_idx < K; k_idx++) { + for(size_t widx=0;widx<768;widx++){ + wcomp[widx] = 0; + } + + polyz_unpack(&p, sig + CTILDEBYTES); + if(poly_chknorm(&p, GAMMA1 - BETA)) + return -1; + poly_ntt(&p); + + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp,
&p, pk, (k_idx << 8) + 0, s128); + + for (size_t l_idx = 1; l_idx < L; l_idx++) { + polyz_unpack(&p, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES); + if(poly_chknorm(&p, GAMMA1 - BETA)) + return -1; + poly_ntt(&p); + poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, pk, (k_idx << 8) + l_idx, s128); + } + polyw_unpack(&p, wcomp); + poly_reduce(&p); + poly_invntt_tomont(&p); + polyw_pack(wcomp, &p); + + poly_schoolbook_t1(&p, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES); + + polyw_sub(&p, wcomp, &p); + poly_reduce(&p); + + /* Reconstruct w1 */ + poly_caddq(&p); + + if (unpack_sig_h_indices(hint_ones, &i, k_idx, sig) != 0) + { + return -1; + } + poly_use_hint_stack(&p, &p, hint_ones, i); + + polyw1_pack(w1_packed, &p); + + shake256_inc_absorb(&s256, w1_packed, POLYW1_PACKEDBYTES); + } + /* Call random oracle and verify challenge */ + shake256_inc_finalize(&s256); + shake256_inc_squeeze(c2, CTILDEBYTES, &s256); + for(i = 0; i < CTILDEBYTES; ++i) + if(sig[i] != c2[i]) + return -1; + + return 0; +} + +/************************************************* +* Name: crypto_sign_open +* +* Description: Verify signed message.
+* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int crypto_sign_open(uint8_t *m, + size_t *mlen, + const uint8_t *sm, + size_t smlen, + const uint8_t *pk) +{ + size_t i; + + if(smlen < CRYPTO_BYTES) + goto badsig; + + *mlen = smlen - CRYPTO_BYTES; + if(crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk)) + goto badsig; + else { + /* All good, copy msg, return 0 */ + for(i = 0; i < *mlen; ++i) + m[i] = sm[CRYPTO_BYTES + i]; + return 0; + } + +badsig: + /* Signature verification failed */ + *mlen = -1; /* mlen points to a size_t, so -1 is stored as SIZE_MAX */ + /* Clear all smlen bytes of the output buffer on failure */ for(i = 0; i < smlen; ++i) + m[i] = 0; + + return -1; +} diff --git a/crypto_sign/dilithium2/m4fstack/sign.h b/crypto_sign/dilithium2/m4fstack/sign.h new file mode 120000 index 0000000..551f979 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/sign.h @@ -0,0 +1 @@ +../m4f/sign.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/smallntt.h b/crypto_sign/dilithium2/m4fstack/smallntt.h new file mode 100644 index 0000000..244fad2 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallntt.h @@ -0,0 +1,47 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SMALLNTT_H +#define SMALLNTT_H + +#include +#include "params.h" + +#define SMALL_Q 769 + +static const int32_t zetas_769[64] = { + 3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838}; + +static const int32_t zetas_asm_769[128] = { + 346278248, 223405321, 966228013, 759578091, -150798592, 318352582, -1736976371, 1697880440, -2105595150, -804259156, 1675539907, -1016494210, 1401868389, -2005062756, 240160720, 474736307, -1200803600, -1435379187, -1156122536, 1334846793, 999738811, 1854264164, -631120032, -787503756, -1580592646, 1681125040, 1150537403, -1474475119, -1223144132, 1809583100, -100532394, -1938041160, 726067293, 2066499219, -1022079344, 1055590142, 525002504, 273671518, -212235055, -39095931, 1871019563, -1563837247, 1826338499, 139628326, 27925665, 1731391238, 513832238, 1792827701, -921546949, -1301335995, 67021596, 1117026605, 536172770, 1161707669, -988568545, -776333490, -888036151, 1290165729, -497076839, -753992958, 1586177779, -441225509, -977398279, -469151174, -1614103444, 1591762912, -94947261, 971813146, 122872927, 217820188, 619949766, -1709050706, 1010909077, -1748146637, -541757903, 770748357, -195479656, 765163224, 1413038655, 
1781657435, -1206388733, -664630830, 1742561504, -815429422, 982983412, 357448514, 44681064, -1524741316, -1485645385, -1915700627, 703726761, 681386229, 686971362, 1787242568, -860110486, -184309390, 1457719719, 1217558999, -1820753366, -502661972, -1921285760, 1139367137, 1195218467, 1089100940, 564098435, 614364633, -1100271206, 457980908, -1669954774, -659045697, 2088839751, -592024101, 1949211426, 1368357591, 698141628, 335107981, -1725806105, 374203913, -312767449, 2083254618, -1061175275, -2139105948, 519417371, 1513571050, -647875431, 413299844, -145213459, 0}; + +// INTT with CT butterfly +static const int32_t zetas_inv_asm_769[256] = { + 5585134, 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 5585134, 1736976371, -966228013, 150798592, -346278248, -318352582, -223405321, -759578091, + // removed first "2285" + LAYER 3+2+1 - 1 - butterfly + 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, 
-1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, -536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0}; + +// Q1=769 +void small_ntt_asm_769(int16_t a[N], const int32_t * zetas); +void small_invntt_asm_769(int16_t a[N], const int32_t * zetas); +void small_basemul_asm_769(int16_t *c, const int16_t *a, const int16_t *b, const int32_t *zetas); + +// 
small NTT for computing cs0 and cs1 +#define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769) +#define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769) +#define small_basemul(r,a,b) small_basemul_asm_769(r, a, b, zetas_769) + +#endif diff --git a/crypto_sign/dilithium2/m4fstack/smallntt_769.S b/crypto_sign/dilithium2/m4fstack/smallntt_769.S new file mode 100644 index 0000000..1c3c9a8 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallntt_769.S @@ -0,0 +1,691 @@ +/* + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * NTT and inverse NTT code from: + * Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. + * IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24. + * DOI:https://doi.org/10.46586/tches.v2024.i2.1-24. 
+ * https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S +*/ + +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +#include "macros_smallntt.i" +// ####### +// ####### +// # NTT # +// ####### +// ####### + +.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp + // layer 3 + vmov \twiddle1, \xi0 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + vmov \twiddle1, \xi1 + vmov \twiddle2, \xi2 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + vmov \twiddle1, \xi3 + vmov \twiddle2, \xi4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + vmov \twiddle1, \xi5 + vmov \twiddle2, \xi6 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, 
\twiddle2, \tmp, \q, \qa +.endm + +.global small_ntt_asm_769 +.type small_ntt_asm_769, %function +.align 2 +small_ntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s24} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + ### qinv .req r11 ### q^-1 mod 2^2n; n=16 + q .req r12 + ### at the top of r12 + qa .req r0 + ### qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + // movw qa, #24608 + // Why movt? Because we initially placed qa at the bottom of the same register as q; + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 256 + .equ offset, 32 + .equ strincr, 4 + // pre-load 15 twiddle factors to 15 FPU registers + // s0-s7 used to temporary store 16 16-bit polys. + vldm twiddle_ptr!, {s8-s22} + + add tmp, poly, #strincr*8 + // s23: poly addr + // s24: tmp + vmov s24, tmp + 1: + // load a1, a3, ..., a15 + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // 8-NTT on a1, a3, ..., a15 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // s15, s16, s17, s18, s19, s20, s21, s22 left + // multiply coeffs by layer 8 twiddles for later use + vmov twiddle1, s15 + vmov twiddle2, s16 + mul_twiddle_plant poly0, twiddle1, tmp, q, qa + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, 
s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + + vmov poly, s23 + + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // 8-NTT on a0, a2, ..., a14 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle1, s1 // load a3 + uadd16 tmp, poly1, twiddle1 + usub16 poly1, poly1, twiddle1 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle1, s3 // load a7 + uadd16 tmp, poly3, twiddle1 + usub16 poly3, poly3, twiddle1 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle1, s5 // load a11 + uadd16 tmp, poly5, twiddle1 + usub16 poly5, poly5, twiddle1 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle1, s7 // load a15 + uadd16 tmp, poly7, twiddle1 + usub16 poly7, poly7, twiddle1 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle1, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle1, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle1, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle1, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle1, 
poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle1, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle1, poly0, poly1 + str.w twiddle1, [poly, #offset] + str.w tmp, [poly], #4 + + vmov tmp, s24 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance/16 + .equ strincr, 32 + + add.w tmp, poly, #strincr*16 + vmov s13, tmp + 2: + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s23 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #strincr + + vmov tmp, s13 + cmp.w poly, tmp + bne.w 2b + vpop.w {s16-s24} + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle1 +.unreq twiddle2 +.unreq q +.unreq qa +.unreq tmp + + +// ######## +// ######## +// # INTT # +// ######## +// ######## + +// input: 0.5/1q +.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + // tmp, c1, tmp2, c3: 1q maximum + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + // c0,c5,c2,c7 1q maximum + + // layer 2 + sadd16.w \c6, \tmp, \tmp2 // c0, c2 + ssub16.w \tmp2, \tmp, \tmp2 
+ sadd16.w \c4, \c0, \c2 // c4, c6 + ssub16.w \c2, \c0, \c2 + // c6, tmp2, c4, c2: 2q maximum + + vmov.w \twiddle1, \xi2 + doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa + doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa + // c1, c3, c7, c5: 1.5q maximum; + + // tmp and c0 are free at this point + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + // c0, c4: 4q + // c6 are free at this point + vmov.w \twiddle1, \xi4 + doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa + // c1, c5: 2q maximum + + vmov.w \twiddle1, \xi5 + // this block is one doublebutterfly + smulwb \tmp, \twiddle1, \c2 // c2, c6 + smulwt \c2, \twiddle1, \c2 + smlabt \tmp, \tmp, \q, \qa + smlabt \c2, \c2, \q, \qa + pkhtb \tmp, \c2, \tmp, asr#16 + ssub16.w \c6, \tmp2, \tmp + sadd16.w \c2, \tmp2, \tmp + //c6, c2: 4.5q + vmov.w \twiddle1, \xi6 + doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa + //c3, c7: 2.5q maximum +.endm +.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant 
\c0, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa +.endm +# input coefficients < 0.5q +.global small_invntt_asm_769 +.type small_invntt_asm_769, %function +.align 2 +small_invntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s23} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + q .req r12 + // at the top of r12 + qa .req r0 + // qa=2^a q;a=3; at the bottom of r12 + tmp .req r14 + + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 16 + .equ offset, 32 + .equ strincr, 64 + + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s22} + + add.w tmp, poly, #8*strincr + vmov s8, tmp + 1: + vmov s23, poly + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // NTT on a1, a3, ..., a15 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // multiply coeffs by layer 4 twiddles for later use + // vmov twiddle1, s15 + vmov twiddle2, s16 + // mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + 
+ vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + // 0.5q + // ---------- + + vmov poly, s23 + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // NTT on a0, a2, ..., a14 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + // 1,3,5,7: <5q; 0,2,4,6:<1q + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle2, s1 // load a3 + uadd16 tmp, poly1, twiddle2 + usub16 poly1, poly1, twiddle2 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle2, s3 // load a7 + uadd16 tmp, poly3, twiddle2 + usub16 poly3, poly3, twiddle2 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle2, s5 // load a11 + uadd16 tmp, poly5, twiddle2 + usub16 poly5, poly5, twiddle2 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle2, s7 // load a15 + uadd16 tmp, poly7, twiddle2 + usub16 poly7, poly7, twiddle2 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + //1,3,5,7: < 5.5q + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov 
poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle2, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle2, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle2, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle2, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle2, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle2, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle2, poly0, poly1 + str.w twiddle2, [poly, #offset] + str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) + //0,2,4,6: < 1.5q + vmov tmp, s8 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance*16 + .equ strincr, 4 + + // ITER 0 + vmov s6, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + vldm twiddle_ptr!, {s0-s5} + movw qa, #24608 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + // ITER 1-15 + add.w tmp, poly, #strincr*3*(5) + vmov s14, tmp + 2: + vmov s6, poly + // polys upto 5.5q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, 
#7*distance/4 + + movw qa, #24608 + _3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov tmp, s14 + cmp.w poly, tmp + bne.w 2b + + vpop.w {s16-s23} + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle1 +.unreq twiddle2 +.unreq q +.unreq qa +.unreq tmp + +// BASEMUL + +/* +* Basemul code (adapted to q=769) from: +* Huang, J. et al. 2022. Improved Plantard Arithmetic for Lattice-based Cryptography. +* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2022, 4 (Aug. 2022), 614–636. +* DOI:https://doi.org/10.46586/tches.v2022.i4.614-636. 
+* https://github.com/UIC-ESLAS/ImprovedPlantardArithmetic/blob/f3482cfd09dda8f1f55b95e13616147e3b6dd008/crypto_kem/kyber768/m4fstack/fastbasemul.S +*/ + +.global small_basemul_asm_769 +.type small_basemul_asm_769, %function +.align 2 +small_basemul_asm_769: + push {r4-r11, lr} + + rptr .req r0 + aptr .req r1 + bptr .req r2 + zetaptr .req r3 + poly0 .req r4 + poly1 .req r6 + poly2 .req r5 + poly3 .req r7 + q .req r8 + qa .req r14 + qinv .req r9 + tmp .req r10 + tmp2 .req r11 + zeta .req r12 + loop .req r14 + + movt q, #769 + movw qinv, #64769 + movt qinv, #58632 + + movw loop, #64 + 1: + vmov.w s0,loop + movw qa, #24608 + + ldrd poly0, poly2, [aptr], #8 + ldrd poly1, poly3, [bptr], #8 + // ldr poly0, [aptr], #4 + // ldr poly1, [bptr], #4 + // ldr poly2, [aptr], #4 + // ldr poly3, [bptr], #4 + + ldr.w zeta, [zetaptr], #4 + + // basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]); + smulwt tmp, zeta, poly1 + smlabt tmp, tmp, q, qa + smultt tmp, poly0, tmp + smlabb tmp, poly0, poly1, tmp + plant_red q, qa, qinv, tmp + // r[0] in upper half of tmp + + smuadx tmp2, poly0, poly1 + plant_red q, qa, qinv, tmp2 + // r[1] in upper half of tmp2 + pkhtb tmp, tmp2, tmp, asr#16 + str tmp, [rptr], #4 + + neg zeta, zeta + + // basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]); + smulwt tmp, zeta, poly3 + smlabt tmp, tmp, q, qa + smultt tmp, poly2, tmp + smlabb tmp, poly2, poly3, tmp + plant_red q, qa, qinv, tmp + // r[0] in upper half of tmp + + smuadx tmp2, poly2, poly3 + plant_red q, qa, qinv, tmp2 + // r[1] in upper half of tmp2 + pkhtb tmp, tmp2, tmp, asr#16 + str tmp, [rptr], #4 + + vmov.w loop,s0 + subs.w loop, #1 + bne.w 1b + + .unreq rptr + .unreq aptr + .unreq bptr + .unreq zetaptr + .unreq poly0 + .unreq poly1 + .unreq poly2 + .unreq poly3 + .unreq q + .unreq qa + .unreq qinv + .unreq tmp + .unreq tmp2 + .unreq zeta + .unreq loop + + pop {r4-r11, pc} +//-0.5p~0.5p \ No newline at end of file diff 
--git a/crypto_sign/dilithium2/m4fstack/smallpoly.c b/crypto_sign/dilithium2/m4fstack/smallpoly.c new file mode 100644 index 0000000..433d98a --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallpoly.c @@ -0,0 +1,83 @@ +#include "smallpoly.h" +#include "smallntt.h" + +void poly_small_ntt_copy(smallpoly *out, poly *in) { + for (int i = N - 1; i >= 0; i--) + { + out->coeffs[i] = in->coeffs[i]; + } + small_ntt(out->coeffs); +} + + +void polyvecl_small_ntt(smallpoly v[L]) { + unsigned int i; + + for(i = 0; i < L; ++i) + small_ntt(v[i].coeffs); +} + + +void polyveck_small_ntt(smallpoly v[K]) { + unsigned int i; + + for(i = 0; i < K; ++i) + small_ntt(v[i].coeffs); +} + + + +void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b){ + // re-use the buffer + smallpoly *tmp = (smallpoly *)r; + small_basemul(tmp->coeffs, a->coeffs, b->coeffs); + small_invntt_tomont(tmp->coeffs); + + #ifdef SMALL_POLY_16_BIT + int j; + // buffer is the same, so we neeed to be careful + for(j=N-1;j>=0;j--){ + r->coeffs[j] = tmp->coeffs[j]; + } + #endif +} + +void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]){ + unsigned int i; + for(i=0;ivec[i], a, &b[i]); + } +} + +void small_polyeta_unpack(smallpoly *r, const uint8_t *a) { + unsigned int i; + +#if ETA == 2 + for(i = 0; i < N/8; ++i) { + r->coeffs[8*i+0] = (a[3*i+0] >> 0) & 7; + r->coeffs[8*i+1] = (a[3*i+0] >> 3) & 7; + r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7; + r->coeffs[8*i+3] = (a[3*i+1] >> 1) & 7; + r->coeffs[8*i+4] = (a[3*i+1] >> 4) & 7; + r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7; + r->coeffs[8*i+6] = (a[3*i+2] >> 2) & 7; + r->coeffs[8*i+7] = (a[3*i+2] >> 5) & 7; + + r->coeffs[8*i+0] = ETA - r->coeffs[8*i+0]; + r->coeffs[8*i+1] = ETA - r->coeffs[8*i+1]; + r->coeffs[8*i+2] = ETA - r->coeffs[8*i+2]; + r->coeffs[8*i+3] = ETA - r->coeffs[8*i+3]; + r->coeffs[8*i+4] = ETA - r->coeffs[8*i+4]; + r->coeffs[8*i+5] = ETA - r->coeffs[8*i+5]; + 
r->coeffs[8*i+6] = ETA - r->coeffs[8*i+6]; + r->coeffs[8*i+7] = ETA - r->coeffs[8*i+7]; + } +#elif ETA == 4 + for(i = 0; i < N/2; ++i) { + r->coeffs[2*i+0] = a[i] & 0x0F; + r->coeffs[2*i+1] = a[i] >> 4; + r->coeffs[2*i+0] = ETA - r->coeffs[2*i+0]; + r->coeffs[2*i+1] = ETA - r->coeffs[2*i+1]; + } +#endif +} diff --git a/crypto_sign/dilithium2/m4fstack/smallpoly.h b/crypto_sign/dilithium2/m4fstack/smallpoly.h new file mode 100644 index 0000000..1aac98f --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/smallpoly.h @@ -0,0 +1,27 @@ +#ifndef SMALLPOLY_H +#define SMALLPOLY_H +#include "params.h" +#include "poly.h" +#include "polyvec.h" + + + +#define SMALL_POLY_16_BIT +typedef struct { + int16_t coeffs[N]; +} smallpoly; + +typedef smallpoly smallhalfpoly; + +void poly_small_ntt_copy(smallpoly*, poly*); + +void polyvecl_small_ntt(smallpoly v[L]); +void polyveck_small_ntt(smallpoly v[K]); + + +void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]); +void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b); + +void small_polyeta_unpack(smallpoly *r, const uint8_t *a); + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/stack.c b/crypto_sign/dilithium2/m4fstack/stack.c new file mode 100644 index 0000000..b45f702 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/stack.c @@ -0,0 +1,715 @@ +#include "stack.h" +#include "fips202.h" +#include "symmetric.h" +#include "vector.h" +#include "reduce.h" +#include "rounding.h" + +void poly_challenge_compress(uint8_t c[68], const poly *cp){ + unsigned int i, pos; + uint64_t signs; + uint64_t mask; + /* Encode c */ + for(i=0;i<68;i++) c[i] = 0; + signs = 0; + mask = 1; + pos = 0; + for(i = 0; i < N; ++i){ + if(cp->coeffs[i] != 0){ + c[pos++] = i; + if(cp->coeffs[i] == -1){ + signs |= mask; + } + mask <<= 1; + } + } + + for (i = 0; i < 8; ++i) { + c[60+i] = (unsigned char) (signs >> 8 * i); + } +} + +void poly_challenge_decompress(poly *cp, const 
uint8_t c[68]){ + unsigned int i; + unsigned pos; + uint64_t signs = 0; + for(i = 0; i < N; i++) cp->coeffs[i] = 0; + for(i = 0; i < 8; i++) { + signs |= ((uint64_t)c[60+i]) << (8*i); + } + + for(i = 0; i < TAU; i++){ + pos = c[i]; + if(signs & 1){ + cp->coeffs[pos] = -1; + } else { + cp->coeffs[pos] = 1; + } + signs >>= 1; + } +} + + +// TODO: buffer at most 8 coeffs at once +static inline int32_t polyt0_unpack_idx(const uint8_t *t0, unsigned idx){ + int32_t coeff; + // 8 coefficients are packed in 13 bytes + t0 += 13*(idx >> 3); + + if(idx % 8 == 0){ + coeff = t0[0]; + coeff |= (uint32_t)t0[1] << 8; + } else if(idx % 8 == 1){ + coeff = t0[1] >> 5; + coeff |= (uint32_t)t0[2] << 3; + coeff |= (uint32_t)t0[3] << 11; + } else if(idx % 8 == 2){ + coeff = t0[3] >> 2; + coeff |= (uint32_t)t0[4] << 6; + } else if(idx % 8 == 3){ + coeff = t0[4] >> 7; + coeff |= (uint32_t)t0[5] << 1; + coeff |= (uint32_t)t0[6] << 9; + } else if(idx % 8 == 4){ + coeff = t0[6] >> 4; + coeff |= (uint32_t)t0[7] << 4; + coeff |= (uint32_t)t0[8] << 12; + } else if(idx % 8 == 5){ + coeff = t0[8] >> 1; + coeff |= (uint32_t)t0[9] << 7; + } else if(idx % 8 == 6){ + coeff = t0[9] >> 6; + coeff |= (uint32_t)t0[10] << 2; + coeff |= (uint32_t)t0[11] << 10; + } else if(idx % 8 == 7){ + coeff = t0[11] >> 3; + coeff |= (uint32_t)t0[12] << 5; + } + coeff &= 0x1FFF; + return (1 << (D-1)) - coeff; +} + +static inline int32_t polyt1_unpack_idx(const uint8_t *t1, unsigned idx){ + int32_t coeff; + // 4 coefficients are packed in 5 bytes + t1 += 5*(idx >> 2); + + if(idx % 4 == 0){ + coeff = (t1[0] >> 0); + coeff |= ((uint32_t)t1[1] << 8); + } else if(idx % 4 == 1){ + coeff = (t1[1] >> 2); + coeff |= ((uint32_t)t1[2] << 6); + } else if(idx % 4 == 2){ + coeff = (t1[2] >> 4); + coeff |= ((uint32_t)t1[3] << 4); + } else if(idx % 4 == 3){ + coeff = (t1[3] >> 6); + coeff |= ((uint32_t)t1[4] << 2); + } + coeff &= 0x3FF; + return coeff; +} + +void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){ + 
unsigned i,j,idx;  // idx: challenge entry, i: position stored in that entry, j: coefficient index into t0
+  uint64_t signs = 0;
+  for(i = 0; i < N; i++) c->coeffs[i] = 0;
+  for(i = 0; i < 8; i++) {  // bytes 60..67 of ccomp hold the sign bits, least significant byte first
+    signs |= ((uint64_t)ccomp[60+i]) << (8*i);
+  }
+
+  for(idx = 0; idx < TAU; idx++){
+    i = ccomp[idx];
+    if(!(signs & 1)){  // sign bit clear: this challenge coefficient is +1
+      for(j = 0; i+j < N; j++){
+        c->coeffs[i+j] += polyt0_unpack_idx(t0, j);
+      }
+      for(j = N-i; j<N; j++){  // i+j >= N: the term wraps to index i+j-N with the opposite sign
+        c->coeffs[i+j-N] -= polyt0_unpack_idx(t0, j);
+      }
+    } else {  // sign bit set: this challenge coefficient is -1, so both signs are swapped
+      for(j = 0; i+j < N; j++){
+        c->coeffs[i+j] -= polyt0_unpack_idx(t0, j);
+      }
+      for(j = N-i; j<N; j++){
+        c->coeffs[i+j-N] += polyt0_unpack_idx(t0, j);
+      }
+    }
+
+    signs >>= 1;
+  }
+}
+
+void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1){  // same walk as poly_schoolbook, on t1 coefficients shifted left by D
+  unsigned i,j,idx;
+  uint64_t signs = 0;
+  for(i = 0; i < N; i++) c->coeffs[i] = 0;
+  for(i = 0; i < 8; i++) {
+    signs |= ((uint64_t)ccomp[60+i]) << (8*i);
+  }
+
+  for(idx = 0; idx < TAU; idx++){
+    i = ccomp[idx];
+    if(!(signs & 1)){  // sign bit clear: this challenge coefficient is +1
+      for(j = 0; i+j < N; j++){
+        c->coeffs[i+j] += (polyt1_unpack_idx(t1, j) << D);
+      }
+      for(j = N-i; j<N; j++){  // i+j >= N: the term wraps to index i+j-N with the opposite sign
+        c->coeffs[i+j-N] -= (polyt1_unpack_idx(t1, j) << D);
+      }
+    } else {  // sign bit set: this challenge coefficient is -1, so both signs are swapped
+      for(j = 0; i+j < N; j++){
+        c->coeffs[i+j] -= (polyt1_unpack_idx(t1, j) << D);
+      }
+      for(j = N-i; j<N; j++){
+        c->coeffs[i+j-N] += (polyt1_unpack_idx(t1, j) << D);
+      }
+    }
+
+    signs >>= 1;
+  }
+}
+
+
+void polyw_pack(uint8_t buf[3*256], poly *w){  // modifies w (poly_reduce, poly_caddq), then stores the low 3 bytes of each coefficient
+  poly_reduce(w);
+  poly_caddq(w);
+  unsigned int i;
+  for(i = 0; i < N; i++){
+    buf[i*3 + 0] = w->coeffs[i];  // least significant byte first
+    buf[i*3 + 1] = w->coeffs[i] >> 8;
+    buf[i*3 + 2] = w->coeffs[i] >> 16;
+  }
+}
+
+void polyw_unpack(poly *w, const uint8_t buf[3*256]) {  // rebuilds each coefficient from its 3 stored bytes
+  unsigned int i;
+  for(i = 0; i < N; i++){
+    w->coeffs[i] =  buf[i*3 + 0];
+    w->coeffs[i] |= (int32_t)buf[i*3 + 1] << 8;
+    w->coeffs[i] |= (int32_t)buf[i*3 + 2] << 16;
+  }
+}
+
+
+static void polyw_add_idx(uint8_t buf[3*256], int32_t a, size_t i){  // adds a to the i-th packed coefficient and stores freeze() of the sum back
+  int32_t coeff;
+  coeff =  buf[i*3 + 0];
+  coeff |= (int32_t)buf[i*3 + 1] << 8;
+  coeff |= (int32_t)buf[i*3 + 2] << 16;
+
+  coeff += a;
+
+  coeff = freeze(coeff);
+
+  buf[i*3 + 0] = coeff;
+  buf[i*3 + 1] = coeff >> 8;
+  buf[i*3 + 2]
= coeff >> 16; +} + +void polyw_add(uint8_t buf[3*256], poly *p){ + unsigned int i; + for(i = 0; i < N; i++){ + polyw_add_idx(buf, p->coeffs[i], i); + } +} +void polyw_sub(poly* c, uint8_t buf[3*256], poly *a){ + int32_t coeff; + + + for(size_t i=0;icoeffs[i] = coeff - a->coeffs[i]; + } +} + +static int32_t highbits(int32_t a){ + int32_t a1; + + a1 = (a + 127) >> 7; +#if GAMMA2 == (Q-1)/32 + a1 = (a1*1025 + (1 << 21)) >> 22; + a1 &= 15; +#elif GAMMA2 == (Q-1)/88 + a1 = (a1*11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; +#endif + + return a1; +} + +void poly_highbits(poly *a1, const poly *a) { + unsigned int i; + + for(i = 0; i < N; ++i) + a1->coeffs[i] = highbits(a->coeffs[i]); +} + +static int32_t lowbits(int32_t a){ + int32_t a1; + int32_t a0; + + a1 = (a + 127) >> 7; +#if GAMMA2 == (Q-1)/32 + a1 = (a1*1025 + (1 << 21)) >> 22; + a1 &= 15; +#elif GAMMA2 == (Q-1)/88 + a1 = (a1*11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; +#endif + + a0 = a - a1*2*GAMMA2; + a0 -= (((Q-1)/2 - a0) >> 31) & Q; + return a0; +} + +void poly_lowbits(poly *a0, const poly *a){ + unsigned int i; + + for(i = 0; i < N; ++i) + a0->coeffs[i] = lowbits(a->coeffs[i]); +} + +void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx) { + small_polyeta_unpack(a, sk + 2*SEEDBYTES + TRBYTES + idx*POLYETA_PACKEDBYTES); +} +void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) { + small_polyeta_unpack(a, sk + 2*SEEDBYTES + TRBYTES + L*POLYETA_PACKEDBYTES + idx*POLYETA_PACKEDBYTES); +} + + +// TODO: in the end increase this buffer size as far as possible +#define POLY_UNIFORM_BUFFERSIZE 3 +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){ + int32_t t; + uint8_t buf[POLY_UNIFORM_BUFFERSIZE*3]; + { + size_t ctr = 0; + stream128_init(state, seed, nonce); + + do { + shake128_inc_squeeze(buf, sizeof buf, state); + + for(size_t pos=0; pos < sizeof buf && ctr < N; pos += 
3){ + t = buf[pos]; + t |= (uint32_t)buf[pos+1] << 8; + t |= (uint32_t)buf[pos+2] << 16; + t &= 0x7FFFFF; + + if(t < Q) { + t = montgomery_reduce((int64_t)t * b->coeffs[ctr]); + polyw_add_idx(wcomp, t, ctr); + ctr++; + } + } + } while(ctr < N); + + } +} + +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE 1 +#if GAMMA1 == (1 << 17) +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE*4) +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES (POLY_UNIFORM_GAMMA1_BUFFERSIZE*9) +#elif GAMMA1 == (1 << 19) +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE*2) +#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES (POLY_UNIFORM_GAMMA1_BUFFERSIZE*5) +#endif + +static void polyz_unpack_inplace(int32_t *r){ + uint8_t *a = (uint8_t *)r; + + unsigned int i,j; + #if GAMMA1 == (1 << 17) + for(j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j) { + i = POLY_UNIFORM_GAMMA1_BUFFERSIZE-1-j; + int32_t t0; + + + r[4*i+3] = a[9*i+6] >> 6; + r[4*i+3] |= (uint32_t)a[9*i+7] << 2; + r[4*i+3] |= (uint32_t)a[9*i+8] << 10; + r[4*i+3] &= 0x3FFFF; + + r[4*i+2] = a[9*i+4] >> 4; + r[4*i+2] |= (uint32_t)a[9*i+5] << 4; + r[4*i+2] |= (uint32_t)a[9*i+6] << 12; + r[4*i+2] &= 0x3FFFF; + + + r[4*i+1] = (uint32_t)a[9*i+4] << 14; + r[4*i+1] |= a[9*i+2] >> 2; + r[4*i+1] |= (uint32_t)a[9*i+3] << 6; + r[4*i+1] &= 0x3FFFF; + + t0 = a[9*i+0]; + t0 |= (uint32_t)a[9*i+1] << 8; + t0 |= (uint32_t)a[9*i+2] << 16; + t0 &= 0x3FFFF; + + r[4*i+0] = GAMMA1 - t0; + r[4*i+1] = GAMMA1 - r[4*i+1]; + r[4*i+2] = GAMMA1 - r[4*i+2]; + r[4*i+3] = GAMMA1 - r[4*i+3]; + + } +#elif GAMMA1 == (1 << 19) + for(j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j) { + i = POLY_UNIFORM_GAMMA1_BUFFERSIZE-1-j; + int32_t tmp0, tmp1; + + tmp0 = a[5*i+2] >> 4; + tmp0 |= (uint32_t)a[5*i+3] << 4; + tmp0 |= (uint32_t)a[5*i+4] << 12; + tmp0 &= 0xFFFFF; + + tmp1 = a[5*i+0]; + tmp1 |= (uint32_t)a[5*i+1] << 8; + tmp1 |= (uint32_t)a[5*i+2] << 16; + tmp1 &= 0xFFFFF; + + r[2*i+0] = GAMMA1 - tmp1; + r[2*i+1] = GAMMA1 - tmp0; + } +#endif 
+}
+
+void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){  // fills a from the stream set up by stream256_init(state, seed, nonce), one small buffer at a time
+  int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
+
+  stream256_init(state, seed, nonce);
+  for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){
+    shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);  // packed bytes land at the start of buf
+    polyz_unpack_inplace(buf);  // expands them to int32_t coefficients inside the same buffer
+
+    for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){
+      a->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j] = buf[j];
+    }
+  }
+}
+
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){  // like poly_uniform_gamma1_stack, but stores sampled coefficient + b's coefficient into a
+  int32_t buf[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
+
+  stream256_init(state, seed, nonce);
+  for(size_t i = 0; i < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; i++){
+    shake256_inc_squeeze((uint8_t *)buf, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);
+    polyz_unpack_inplace(buf);
+
+    for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++){
+      a->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j] = buf[j] + b->coeffs[i*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j];
+    }
+  }
+}
+
+
+static inline int32_t make_hint_stack(int32_t z, int32_t r){  // 1 if adding z to r changes highbits(r), else 0
+  int32_t r1, v1;
+
+  r1 = highbits(r);
+  v1 = highbits(r+z);
+
+  if(r1 != v1) return 1;
+  return 0;
+}
+
+size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){  // writes one hint bit per coefficient into a and returns how many are 1
+  int32_t coeff;
+  size_t hints_n = 0;
+  for(size_t i=0;i<N;i++){  // NOTE(review): loop head, unpack of w and the "+ t" line were restored after markup stripping - confirm against upstream
+    coeff =  w[i*3 + 0];
+    coeff |= (int32_t)w[i*3 + 1] << 8;
+    coeff |= (int32_t)w[i*3 + 2] << 16;
+
+    coeff = coeff + t->coeffs[i];
+
+    a->coeffs[i] = make_hint_stack(-t->coeffs[i], coeff);
+    if(a->coeffs[i] == 1){
+      hints_n++;
+    }
+  }
+  return hints_n;
+}
+
+void unpack_sk_stack(uint8_t rho[SEEDBYTES],
+                     uint8_t tr[TRBYTES],
+                     uint8_t key[SEEDBYTES],
+                     const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{  // copies rho, key and tr out of sk; note sk stores them in the order rho, key, tr
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    key[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < TRBYTES; ++i)
+    tr[i] = sk[i];
+  sk += TRBYTES;
+}
+
+/*************************************************
+* Name:        unpack_sig_h_indices
+*
+* Description: Unpack only the hint positions of row idx of h from signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t h_i[], unsigned int *number_of_hints: output hint positions of row idx and their count
+*              - unsigned int idx: row of h to unpack
+*              - const unsigned char sig[]: byte array containing bit-packed signature
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]) {
+    sig += L * POLYZ_PACKEDBYTES;
+    sig += CTILDEBYTES; /* sig now points at the packed hint section */
+    /* Decode h */
+    unsigned int k = 0;
+    unsigned int hidx = 0;
+
+    if (idx > 0)
+    {
+        k = sig[OMEGA + (idx - 1)]; /* end offset of the previous row is the start of this row */
+    }
+
+    if (sig[OMEGA + idx] < k || sig[OMEGA + idx] > OMEGA) { /* end offset must not decrease and must not exceed OMEGA */
+        return 1;
+    }
+
+    for (unsigned int j = k; j < sig[OMEGA + idx]; ++j) {
+        /* Coefficients are ordered for strong unforgeability */
+        if (j > k && sig[j] <= sig[j - 1]) {
+            return 1;
+        }
+        h_i[hidx++] = sig[j];
+    }
+
+    *number_of_hints = hidx;
+
+    /* TODO: extract this check, redundant here */
+    k = sig[OMEGA + (K - 1)]; /* end offset of the last row */
+    /* Extra indices are zero for strong unforgeability */
+    for (unsigned int j = k; j < OMEGA; ++j) {
+        if (sig[j]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/*************************************************
+* Name:        poly_use_hint_stack
+*
+* Description: Use hint polynomial to correct the high bits of a polynomial.
+*
+* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
+*              - const poly *a: pointer to input polynomial
+*              - uint8_t h_i[], unsigned int number_of_hints: coefficient positions whose hint bit is set, and how many entries of h_i are used
+**************************************************/
+void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints) {
+    unsigned int i;
+    unsigned int in_list;
+
+    for(i = 0; i < N; ++i)
+    {
+        in_list = 0; /* becomes 1 when position i appears among the first number_of_hints entries of h_i */
+        for (size_t hidx = 0; hidx < number_of_hints; hidx++)
+        {
+            if (i == h_i[hidx])
+            {
+                in_list = 1;
+                break;
+            }
+        }
+        if (in_list)
+        {
+            b->coeffs[i] = use_hint(a->coeffs[i], 1);
+        }
+        else
+        {
+            b->coeffs[i] = use_hint(a->coeffs[i], 0);
+        }
+
+    }
+}
+
+/*************************************************
+* Name:        pack_pk_rho
+*
+* Description: Bit-pack only rho in public key pk = (rho, t1).
+*
+* Arguments:   - unsigned char pk[]: output byte array
+*              - const unsigned char rho[]: byte array containing rho
+**************************************************/
+void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]) {
+    for (unsigned int i = 0; i < SEEDBYTES; ++i) {
+        pk[i] = rho[i];
+    }
+}
+
+/*************************************************
+* Name:        pack_pk_t1
+*
+* Description: Bit-pack only the t1 elem at idx in public key pk = (rho, t1).
+*
+* Arguments:   - unsigned char pk[]: output byte array
+*              - const poly *t1: pointer to the element of t1 to pack
+*              - const unsigned int idx: index to the elem to pack
+**************************************************/
+void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                const poly *t1,
+                const unsigned int idx) {
+    pk += SEEDBYTES; /* skip rho */
+    polyt1_pack(pk + idx * POLYT1_PACKEDBYTES, t1);
+}
+
+/*************************************************
+* Name:        pack_sk_s1
+*
+* Description: Bit-pack only some element of s1 in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *s1_elem: pointer to vector element idx in s1
+*              - const unsigned int idx: index to the element of s1 that should be packed
+**************************************************/
+void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s1_elem,
+                const unsigned int idx) {
+    sk += 2 * SEEDBYTES + TRBYTES; /* skip rho, key and tr */
+    polyeta_pack(sk + idx * POLYETA_PACKEDBYTES, s1_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_s2
+*
+* Description: Bit-pack only some element of s2 in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *s2_elem: pointer to vector element idx in s2
+*              - const unsigned int idx: index to the element of s2 that should be packed
+**************************************************/
+void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s2_elem,
+                const unsigned int idx) {
+    sk += 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES; /* skip rho, key, tr and the L packed entries of s1 */
+    polyeta_pack(sk + idx * POLYETA_PACKEDBYTES, s2_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_t0
+*
+* Description: Bit-pack only some element of t0 in secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *t0_elem: pointer to vector element idx in t0
+*              - const unsigned int idx: index to the element of t0 that should be packed
+**************************************************/
+void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *t0_elem,
+                const unsigned int idx) {
+    sk += 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES + K * POLYETA_PACKEDBYTES; /* skip rho, key, tr, s1 (L entries) and s2 (K entries) */
+    polyt0_pack(sk + idx * POLYT0_PACKEDBYTES, t0_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_rho
+*
+* Description: Bit-pack only rho in secret key sk = (rho, key, tr, s1, s2, t0).
+* +* Arguments: - unsigned char sk[]: output byte array +* - const unsigned char rho[]: byte array containing rho +**************************************************/ +void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char rho[SEEDBYTES]) { + for (unsigned int i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } +} + +/************************************************* +* Name: pack_sk_key +* +* Description: Bit-pack only key in secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - unsigned char sk[]: output byte array +* - const unsigned char key[]: byte array containing key +**************************************************/ +void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char key[SEEDBYTES]) { + sk += SEEDBYTES; + for (unsigned int i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } +} + +/************************************************* +* Name: pack_sk_tr +* +* Description: Bit-pack only tr in secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - unsigned char sk[]: output byte array +* - const unsigned char tr[]: byte array containing tr +**************************************************/ +void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char tr[TRBYTES]) { + sk += 2*SEEDBYTES; + for (unsigned int i = 0; i < TRBYTES; ++i) { + sk[i] = tr[i]; + } +} + +/************************************************* +* Name: challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). Stack optimized. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t mu[]: byte array containing seed of length SEEDBYTES +**************************************************/ +#define CHALLENGE_STACK_BUF_SIZE 8 +void poly_challenge_stack(poly *c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t buf[CHALLENGE_STACK_BUF_SIZE]; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf, CHALLENGE_STACK_BUF_SIZE, &state); + signs = 0; + for(i = 0; i < 8; ++i) + { + signs |= (uint64_t)buf[i] << 8*i; + } + pos = 8; + + for(i = 0; i < N; ++i) + c->coeffs[i] = 0; + for(i = N-TAU; i < N; ++i) { + do { + if(pos >= CHALLENGE_STACK_BUF_SIZE) { + shake256_inc_squeeze(buf, CHALLENGE_STACK_BUF_SIZE, &state); + pos = 0; + } + + b = buf[pos++]; + } while(b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2*(signs & 1); + signs >>= 1; + } +} \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/stack.h b/crypto_sign/dilithium2/m4fstack/stack.h new file mode 100644 index 0000000..06c8c57 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/stack.h @@ -0,0 +1,69 @@ +#ifndef STACK_H +#define STACK_H + +#include "poly.h" +#include "smallpoly.h" +#include +#include +#include "fips202.h" + +void poly_challenge_compress(uint8_t c[68], const poly *cp); +void poly_challenge_decompress(poly *cp, const uint8_t c[68]); + + +void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0); +void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1); +void polyw_pack(uint8_t buf[3*256], poly *w); +void polyw_unpack(poly *w, const uint8_t buf[3*256]); + +void polyw_add(uint8_t buf[3*256], poly *p); +void polyw_sub(poly* c, uint8_t buf[3*256], poly *a); + +void poly_highbits(poly *a1, const poly *a); +void poly_lowbits(poly *a0, const poly *a); + +void unpack_sk_s1(smallpoly *a, const uint8_t *sk, 
size_t idx); +void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx); + +void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state); +void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); +void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state); +void poly_challenge_stack(poly *c, const uint8_t seed[SEEDBYTES]); + +size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]); +int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]); +void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints); + +void unpack_sk_stack(uint8_t rho[SEEDBYTES], + uint8_t tr[TRBYTES], + uint8_t key[SEEDBYTES], + const uint8_t sk[CRYPTO_SECRETKEYBYTES]); + +void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES], + const unsigned char rho[SEEDBYTES]); + +void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES], + const poly *t1, + const unsigned int idx); + +void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *s1_elem, + const unsigned int idx); + +void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *s2_elem, + const unsigned int idx); + +void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const poly *t0_elem, + const unsigned int idx); + +void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char rho[SEEDBYTES]); + +void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char key[SEEDBYTES]); + +void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES], + const unsigned char tr[TRBYTES]); +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/symmetric-shake.c b/crypto_sign/dilithium2/m4fstack/symmetric-shake.c new file mode 120000 index 
0000000..b95855b --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/symmetric-shake.c @@ -0,0 +1 @@ +../m4f/symmetric-shake.c \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/symmetric.h b/crypto_sign/dilithium2/m4fstack/symmetric.h new file mode 120000 index 0000000..e89ae95 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/symmetric.h @@ -0,0 +1 @@ +../m4f/symmetric.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/vector.h b/crypto_sign/dilithium2/m4fstack/vector.h new file mode 120000 index 0000000..0793594 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/vector.h @@ -0,0 +1 @@ +../m4f/vector.h \ No newline at end of file diff --git a/crypto_sign/dilithium2/m4fstack/vector.s b/crypto_sign/dilithium2/m4fstack/vector.s new file mode 120000 index 0000000..1a49605 --- /dev/null +++ b/crypto_sign/dilithium2/m4fstack/vector.s @@ -0,0 +1 @@ +../m4f/vector.s \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/api.h b/crypto_sign/dilithium3/m4f/api.h new file mode 120000 index 0000000..9d1668d --- /dev/null +++ b/crypto_sign/dilithium3/m4f/api.h @@ -0,0 +1 @@ +../../dilithium2/m4f/api.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/config.h b/crypto_sign/dilithium3/m4f/config.h new file mode 100644 index 0000000..5572407 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/config.h @@ -0,0 +1,7 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#define DILITHIUM_MODE 3 +// #define SIGN_STACKSTRATEGY 2 + +#endif diff --git a/crypto_sign/dilithium3/m4f/macros.i b/crypto_sign/dilithium3/m4f/macros.i new file mode 120000 index 0000000..e3f2469 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/macros.i @@ -0,0 +1 @@ +../../dilithium2/m4f/macros.i \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/macros_smallntt.i b/crypto_sign/dilithium3/m4f/macros_smallntt.i new file mode 100644 index 0000000..61b6324 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/macros_smallntt.i @@ -0,0 +1,98 @@ 
+/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MACROS_SMALLNTT_I +#define MACROS_SMALLNTT_I + +// general macros: load/store four 32-bit words at offsets mem0..mem3 from base register a +.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + ldr.w \a0, [\a, \mem0] + ldr.w \a1, [\a, \mem1] + ldr.w \a2, [\a, \mem2] + ldr.w \a3, [\a, \mem3] +.endm + +.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 + str.w \a0, [\a, \mem0] + str.w \a1, [\a, \mem1] + str.w \a2, [\a, \mem2] + str.w \a3, [\a, \mem3] +.endm + +.macro doubleplant a, tmp, q, qa, plantconst // multiplies both packed halfwords of a by plantconst and reduces them in place + smulwb \tmp, \plantconst, \a // tmp = (plantconst * bottom halfword of a) >> 16 + smulwt \a, \plantconst, \a // a = (plantconst * top halfword of a) >> 16 + smlabt \tmp, \tmp, \q, \qa // tmp = bottom(tmp) * top(q) + qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 // repack: top halfword from a, bottom halfword from top of tmp +.endm + +.macro doublebarrett a, tmp, tmp2, q, barrettconst // per halfword: a -= ((a * barrettconst) >> 26) * q; reads the BOTTOM halfword of q + smulbb \tmp, \a, \barrettconst + smultb \tmp2, \a, \barrettconst + asr \tmp, \tmp, #26 + asr \tmp2, \tmp2, #26 + smulbb \tmp, \tmp, \q + smulbb \tmp2, \tmp2, \q + pkhbt \tmp, \tmp, \tmp2, lsl#16 + usub16 \a, \a, \tmp +.endm + +// q is located in the top half of its register; tmp is both the input and the output +.macro plant_red q, qa, qinv, tmp + mul \tmp, \tmp, \qinv + //tmp*qinv mod 2^2n/ 2^n; in high half + smlatt \tmp, \tmp, \q, \qa + // result in high half +.endm + +.macro mul_twiddle_plant a, twiddle, tmp, q, qa // same instruction sequence as doubleplant, with a 32-bit twiddle as the multiplier + smulwb \tmp, \twiddle, \a + smulwt \a, \twiddle, \a + smlabt \tmp, \tmp, \q, \qa + smlabt \a, \a, \q, \qa + pkhtb \a, \a, \tmp, asr#16 +.endm + +.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa // per halfword: t = reduced(twiddle * a1); a1 = a0 - t; a0 = a0 + t + smulwb \tmp,
\twiddle, \a1 + smulwt \a1, \twiddle, \a1 + smlabt \tmp, \tmp, \q, \qa + smlabt \a1, \a1, \q, \qa + pkhtb \tmp, \a1, \tmp, asr#16 // tmp = both reduced products packed as two halfwords + usub16 \a1, \a0, \tmp + uadd16 \a0, \a0, \tmp +.endm + +.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa // two butterflies back to back; tmp is shared + doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa + doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa +.endm + +//For 3329 (plantconst below is loaded for that modulus). NOTE(review): SMALL_Q in this directory is 769 and smallntt_769.S does not invoke fullplant - confirm it is unused here +.macro fullplant a0, a1, a2, a3, a4, a5, a6, a7, tmp, q, qa, plantconst + movw \plantconst, #44984 + movt \plantconst, #19 + doubleplant \a0, \tmp, \q, \qa, \plantconst + doubleplant \a1, \tmp, \q, \qa, \plantconst + doubleplant \a2, \tmp, \q, \qa, \plantconst + doubleplant \a3, \tmp, \q, \qa, \plantconst + doubleplant \a4, \tmp, \q, \qa, \plantconst + doubleplant \a5, \tmp, \q, \qa, \plantconst + doubleplant \a6, \tmp, \q, \qa, \plantconst + doubleplant \a7, \tmp, \q, \qa, \plantconst +.endm + +#endif \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/ntt.S b/crypto_sign/dilithium3/m4f/ntt.S new file mode 120000 index 0000000..6fbceff --- /dev/null +++ b/crypto_sign/dilithium3/m4f/ntt.S @@ -0,0 +1 @@ +../../dilithium2/m4f/ntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/ntt.h b/crypto_sign/dilithium3/m4f/ntt.h new file mode 120000 index 0000000..43729fe --- /dev/null +++ b/crypto_sign/dilithium3/m4f/ntt.h @@ -0,0 +1 @@ +../../dilithium2/m4f/ntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/packing.c b/crypto_sign/dilithium3/m4f/packing.c new file mode 120000 index 0000000..b41782c --- /dev/null +++ b/crypto_sign/dilithium3/m4f/packing.c @@ -0,0 +1 @@ +../../dilithium2/m4f/packing.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/packing.h b/crypto_sign/dilithium3/m4f/packing.h new file mode 120000 index 0000000..ba1a6b3 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/packing.h @@ -0,0 +1 @@ +../../dilithium2/m4f/packing.h \ No newline at end of file diff --git
a/crypto_sign/dilithium3/m4f/params.h b/crypto_sign/dilithium3/m4f/params.h new file mode 120000 index 0000000..a6a4d8b --- /dev/null +++ b/crypto_sign/dilithium3/m4f/params.h @@ -0,0 +1 @@ +../../dilithium2/m4f/params.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/pointwise_mont.h b/crypto_sign/dilithium3/m4f/pointwise_mont.h new file mode 120000 index 0000000..0a6f8b9 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/pointwise_mont.h @@ -0,0 +1 @@ +../../dilithium2/m4f/pointwise_mont.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/pointwise_mont.s b/crypto_sign/dilithium3/m4f/pointwise_mont.s new file mode 120000 index 0000000..c4ddb96 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/pointwise_mont.s @@ -0,0 +1 @@ +../../dilithium2/m4f/pointwise_mont.s \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/poly.c b/crypto_sign/dilithium3/m4f/poly.c new file mode 120000 index 0000000..2544e75 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/poly.c @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/poly.h b/crypto_sign/dilithium3/m4f/poly.h new file mode 120000 index 0000000..7ef70e5 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/poly.h @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/polyvec.c b/crypto_sign/dilithium3/m4f/polyvec.c new file mode 120000 index 0000000..a8edd0d --- /dev/null +++ b/crypto_sign/dilithium3/m4f/polyvec.c @@ -0,0 +1 @@ +../../dilithium2/m4f/polyvec.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/polyvec.h b/crypto_sign/dilithium3/m4f/polyvec.h new file mode 120000 index 0000000..cabd6a9 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/polyvec.h @@ -0,0 +1 @@ +../../dilithium2/m4f/polyvec.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/reduce.h b/crypto_sign/dilithium3/m4f/reduce.h new file mode 120000 index 0000000..6c13df5 --- 
/dev/null +++ b/crypto_sign/dilithium3/m4f/reduce.h @@ -0,0 +1 @@ +../../dilithium2/m4f/reduce.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/rounding.c b/crypto_sign/dilithium3/m4f/rounding.c new file mode 120000 index 0000000..80b8dce --- /dev/null +++ b/crypto_sign/dilithium3/m4f/rounding.c @@ -0,0 +1 @@ +../../dilithium2/m4f/rounding.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/rounding.h b/crypto_sign/dilithium3/m4f/rounding.h new file mode 120000 index 0000000..74c40c5 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/rounding.h @@ -0,0 +1 @@ +../../dilithium2/m4f/rounding.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/sign.c b/crypto_sign/dilithium3/m4f/sign.c new file mode 120000 index 0000000..b7ccdf0 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/sign.c @@ -0,0 +1 @@ +../../dilithium2/m4f/sign.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/sign.h b/crypto_sign/dilithium3/m4f/sign.h new file mode 120000 index 0000000..b7f1e89 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/sign.h @@ -0,0 +1 @@ +../../dilithium2/m4f/sign.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/smallntt.h b/crypto_sign/dilithium3/m4f/smallntt.h new file mode 100644 index 0000000..2927ff4 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/smallntt.h @@ -0,0 +1,48 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SMALLNTT_H +#define SMALLNTT_H + +#include <stdint.h> +#include "params.h" + +#define SMALL_Q 769 + +static const int32_t zetas_769[64] = { // NOTE(review): several entries exceed INT32_MAX (2147483647) and only fit int32_t through implementation-defined conversion - confirm the toolchain wraps as intended + 3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838}; + +static const int32_t zetas_asm_769[128] = { // table handed to small_ntt_asm_769 by the small_ntt() wrapper macro + 346278248, 223405321, 966228013, 759578091, -150798592, 318352582, -1736976371, 1697880440, -2105595150, -804259156, 1675539907, -1016494210, 1401868389, -2005062756, 240160720, 474736307, -1200803600, -1435379187, -1156122536, 1334846793, 999738811, 1854264164, -631120032, -787503756, -1580592646, 1681125040, 1150537403, -1474475119, -1223144132, 1809583100, -100532394, -1938041160, 726067293, 2066499219, -1022079344, 1055590142, 525002504, 273671518, -212235055, -39095931, 1871019563, -1563837247, 1826338499, 139628326, 27925665, 1731391238, 513832238, 1792827701, -921546949, -1301335995, 67021596, 1117026605, 536172770, 1161707669, -988568545, -776333490, -888036151, 1290165729, -497076839, -753992958, 1586177779, -441225509, -977398279, -469151174, -1614103444, 1591762912, -94947261, 971813146, 122872927, 217820188, 619949766, -1709050706, 1010909077, -1748146637, -541757903, 770748357, -195479656, 765163224, 1413038655,
1781657435, -1206388733, -664630830, 1742561504, -815429422, 982983412, 357448514, 44681064, -1524741316, -1485645385, -1915700627, 703726761, 681386229, 686971362, 1787242568, -860110486, -184309390, 1457719719, 1217558999, -1820753366, -502661972, -1921285760, 1139367137, 1195218467, 1089100940, 564098435, 614364633, -1100271206, 457980908, -1669954774, -659045697, 2088839751, -592024101, 1949211426, 1368357591, 698141628, 335107981, -1725806105, 374203913, -312767449, 2083254618, -1061175275, -2139105948, 519417371, 1513571050, -647875431, 413299844, -145213459, 0}; + +// INTT with CT butterfly +static const int32_t zetas_inv_asm_769[256] = { + 5585134, 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 5585134, 1736976371, -966228013, 150798592, -346278248, -318352582, -223405321, -759578091, + // removed first "2285" + LAYER 3+2+1 - 1 - butterfly + 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, 
-1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, -536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0}; + +// Q1=769 +void small_ntt_asm_769(int16_t a[N], const int32_t *zetas); +void small_invntt_asm_769(int16_t a[N], const int32_t *zetas); +void small_pointmul_asm_769(int16_t out[N], const int16_t in[N], const int32_t *zetas); +void 
small_asymmetric_mul_asm_769(int16_t c[N], const int16_t a[N], const int16_t b[N], const int16_t b_prime[N]); + +// small NTT for computing cs0 and cs1; uses 769 as the modulus by default. Each wrapper expands to a bare call expression (no trailing semicolon), so it can be used like an ordinary function call. +#define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769) +#define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769) +#define small_point_mul(out, in) small_pointmul_asm_769(out, in, zetas_769) +#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm_769(c, a, b, b_prime) + +#endif diff --git a/crypto_sign/dilithium3/m4f/smallntt_769.S b/crypto_sign/dilithium3/m4f/smallntt_769.S new file mode 100644 index 0000000..97c60f0 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/smallntt_769.S @@ -0,0 +1,681 @@ +/** + * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com) + * + * Licensed under the Apache License, Version 2.0(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include "macros.i" + +.syntax unified +.cpu cortex-m4 +.thumb + +#include "macros_smallntt.i" +// ####### +// ####### +// # NTT # +// ####### +// ####### + +.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp + // layer 3 + vmov \twiddle1, \xi0 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + vmov \twiddle1, \xi1 + vmov \twiddle2, \xi2 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa + + // layer 1 + vmov \twiddle1, \xi3 + vmov \twiddle2, \xi4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + vmov \twiddle1, \xi5 + vmov \twiddle2, \xi6 + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.global small_ntt_asm_769 +.type small_ntt_asm_769, %function +.align 2 +small_ntt_asm_769: + push {r4-r11, r14} + vpush.w 
{s16-s24} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + ### qinv .req r11 ### q^-1 mod 2^2n; n=16 + q .req r12 + ### q = 769 sits in the top halfword of r12 (set by the movt below) + qa .req r0 + ### qa = 2^a * q = 24608 with a = 5; it shares r0 with poly, so poly is parked in s23 whenever qa is live + tmp .req r14 + + // movw qa, #24608 + // Why movt? Because we initially placed qa at the bottom of the same register as q (qa now has its own alias, r0); + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 256 + .equ offset, 32 + .equ strincr, 4 + // pre-load 15 twiddle factors to 15 FPU registers + // s0-s7 are used to temporarily hold 8 packed words (16 16-bit coefficients). + vldm twiddle_ptr!, {s8-s22} + + add tmp, poly, #strincr*8 // loop 1 end address: 8 iterations, poly advances 4 bytes each + // s23: poly addr + // s24: tmp + vmov s24, tmp + 1: + // load a1, a3, ..., a15 + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // 8-NTT on a1, a3, ..., a15 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // s15, s16, s17, s18, s19, s20, s21, s22 left + // multiply coeffs by layer 8 twiddles for later use + vmov twiddle1, s15 + vmov twiddle2, s16 + mul_twiddle_plant poly0, twiddle1, tmp, q, qa + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2,
poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + + vmov poly, s23 + + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // 8-NTT on a0, a2, ..., a14 + _3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle1, s1 // load a3 + uadd16 tmp, poly1, twiddle1 + usub16 poly1, poly1, twiddle1 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle1, s3 // load a7 + uadd16 tmp, poly3, twiddle1 + usub16 poly3, poly3, twiddle1 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle1, s5 // load a11 + uadd16 tmp, poly5, twiddle1 + usub16 poly5, poly5, twiddle1 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle1, s7 // load a15 + uadd16 tmp, poly7, twiddle1 + usub16 poly7, poly7, twiddle1 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle1, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle1, [poly, #2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle1, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle1, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle1, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle1, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 
twiddle1, poly0, poly1 + str.w twiddle1, [poly, #offset] + str.w tmp, [poly], #4 + + vmov tmp, s24 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance/16 + .equ strincr, 32 + + add.w tmp, poly, #strincr*16 + vmov s13, tmp + 2: + vmov s23, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s23 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #strincr + + vmov tmp, s13 + cmp.w poly, tmp + bne.w 2b + vpop.w {s16-s24} + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle1 +.unreq twiddle2 +.unreq q +.unreq qa +.unreq tmp + + +// ######## +// ######## +// # INTT # +// ######## +// ######## + +// input: 0.5/1q +.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp + + // layer 1 + sadd16.w \tmp, \c0, \c1 // c0, c1 + ssub16.w \c1, \c0, \c1 + sadd16.w \tmp2, \c2, \c3 // c2, c3 + ssub16.w \c3, \c2, \c3 + // tmp, c1, tmp2, c3: 1q maximum + sadd16.w \c0, \c4, \c5 // c4, c5 + ssub16.w \c5, \c4, \c5 + sadd16.w \c2, \c6, \c7 // c6, c7 + ssub16.w \c7, \c6, \c7 + // c4, c6 are free at this point + // c0,c5,c2,c7 1q maximum + + // layer 2 + sadd16.w \c6, \tmp, \tmp2 // c0, c2 + ssub16.w \tmp2, \tmp, \tmp2 + sadd16.w \c4, \c0, \c2 // c4, c6 + ssub16.w \c2, \c0, \c2 + // c6, tmp2, c4, c2: 2q maximum + + vmov.w \twiddle1, \xi2 + doublebutterfly_plant \c1, \c3, 
\twiddle1, \tmp, \q, \qa + doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa + // c1, c3, c7, c5: 1.5q maximum; + + // tmp and c0 are free at this point + // layer 3 + sadd16.w \c0, \c6, \c4 // c0, c4 + ssub16.w \c4, \c6, \c4 + // c0, c4: 4q + // c6 are free at this point + vmov.w \twiddle1, \xi4 + doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa + // c1, c5: 2q maximum + + vmov.w \twiddle1, \xi5 + // this block is one doublebutterfly + smulwb \tmp, \twiddle1, \c2 // c2, c6 + smulwt \c2, \twiddle1, \c2 + smlabt \tmp, \tmp, \q, \qa + smlabt \c2, \c2, \q, \qa + pkhtb \tmp, \c2, \tmp, asr#16 + ssub16.w \c6, \tmp2, \tmp + sadd16.w \c2, \tmp2, \tmp + //c6, c2: 4.5q + vmov.w \twiddle1, \xi6 + doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa + //c3, c7: 2.5q maximum +.endm +.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + // layer 3 + ldr.w \twiddle1, [\twiddle_ptr], #4 + two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa + two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa + + // layer 2 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa + + two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa + + // layer 1 + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa + + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa +.endm + +.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c2, 
\twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa + ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 + mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa + mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa +.endm +# input coefficients < 0.5q +.global small_invntt_asm_769 +.type small_invntt_asm_769, %function +.align 2 +small_invntt_asm_769: + push {r4-r11, r14} + vpush.w {s16-s23} + poly .req r0 + twiddle_ptr .req r1 + poly0 .req r2 + poly1 .req r3 + poly2 .req r4 + poly3 .req r5 + poly4 .req r6 + poly5 .req r7 + poly6 .req r8 + poly7 .req r9 + twiddle1 .req r10 + twiddle2 .req r11 + q .req r12 + // q = 769 sits in the top halfword of r12 (set by the movt below) + qa .req r0 + // qa = 2^a * q = 24608 with a = 5; it shares r0 with poly, so poly is parked in an FPU register whenever qa is live + tmp .req r14 + + movt q, #769 + + ### LAYER 7+6+5+4 + .equ distance, 16 + .equ offset, 32 + .equ strincr, 64 + + // pre-load twiddle factors to FPU registers + vldm twiddle_ptr!, {s8-s22} + + add.w tmp, poly, #8*strincr // loop 1 end address: 8 iterations of 64 bytes + vmov s8, tmp // s8 now holds the loop end address instead of its preloaded word + 1: + vmov s23, poly + // load a1, a3, ..., a15 + load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset + load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset + + movw qa, #24608 + + // NTT on a1, a3, ..., a15 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + + // multiply coeffs by layer 4 twiddles for later use + // vmov twiddle1, s15 + vmov twiddle2, s16 + // mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only + mul_twiddle_plant poly1, twiddle2, tmp, q, qa + + vmov twiddle1, s17 + vmov twiddle2, s18 + mul_twiddle_plant poly2, twiddle1, tmp, q, qa + mul_twiddle_plant poly3, twiddle2, tmp, q, qa + + vmov
twiddle1, s19 + vmov twiddle2, s20 + mul_twiddle_plant poly4, twiddle1, tmp, q, qa + mul_twiddle_plant poly5, twiddle2, tmp, q, qa + + vmov twiddle1, s21 + vmov twiddle2, s22 + mul_twiddle_plant poly6, twiddle1, tmp, q, qa + mul_twiddle_plant poly7, twiddle2, tmp, q, qa + + vmov s0, poly0 // a1 + vmov s1, poly1 // a3 + vmov s2, poly2 // a5 + vmov s3, poly3 // a7 + vmov s4, poly4 // a9 + vmov s5, poly5 // a11 + vmov s6, poly6 // a13 + vmov s7, poly7 // a15 + // 0.5q + // ---------- + + vmov poly, s23 + // load a0, a2, ..., a14 + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + // NTT on a0, a2, ..., a14 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp + // 1,3,5,7: <5q; 0,2,4,6:<1q + // layer 4 - 1 + // addsub: (a2, a6, a10, a14), (a3, a7, a11, a15) + vmov poly, s23 + vmov twiddle2, s1 // load a3 + uadd16 tmp, poly1, twiddle2 + usub16 poly1, poly1, twiddle2 + str.w tmp, [poly, #1*distance/4] + str.w poly1, [poly, #1*distance/4+offset] + + vmov twiddle2, s3 // load a7 + uadd16 tmp, poly3, twiddle2 + usub16 poly3, poly3, twiddle2 + str.w tmp, [poly, #3*distance/4] + str.w poly3, [poly, #3*distance/4+offset] + + vmov twiddle2, s5 // load a11 + uadd16 tmp, poly5, twiddle2 + usub16 poly5, poly5, twiddle2 + str.w tmp, [poly, #5*distance/4] + str.w poly5, [poly, #5*distance/4+offset] + + vmov twiddle2, s7 // load a15 + uadd16 tmp, poly7, twiddle2 + usub16 poly7, poly7, twiddle2 + str.w tmp, [poly, #7*distance/4] + str.w poly7, [poly, #7*distance/4+offset] + //1,3,5,7: < 5.5q + + // layer 4 - 2 + // addsub: (a0, a4, a8, a12), (a1, a5, a9, a13) + vmov poly3, s2 // load a5 + uadd16 tmp, poly2, poly3 + usub16 twiddle2, poly2, poly3 + str.w tmp, [poly, #2*distance/4] + str.w twiddle2, [poly, 
#2*distance/4+offset] + + vmov poly5, s4 // load a9 + uadd16 tmp, poly4, poly5 + usub16 twiddle2, poly4, poly5 + str.w tmp, [poly, #4*distance/4] + str.w twiddle2, [poly, #4*distance/4+offset] + + vmov poly7, s6 // load a13 + uadd16 tmp, poly6, poly7 + usub16 twiddle2, poly6, poly7 + str.w tmp, [poly, #6*distance/4] + str.w twiddle2, [poly, #6*distance/4+offset] + + vmov poly1, s0 // load a1 + uadd16 tmp, poly0, poly1 + usub16 twiddle2, poly0, poly1 + str.w twiddle2, [poly, #offset] + str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each) + //0,2,4,6: < 1.5q + vmov tmp, s8 + cmp.w poly, tmp + bne.w 1b + + sub.w poly, #8*strincr + + ### LAYER 3+2+1 + + .equ distance, distance*16 + .equ strincr, 4 + + // ITER 0 + vmov s6, poly + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + vldm twiddle_ptr!, {s0-s5} + movw qa, #24608 + // twiddle2 is used as tmp2 + _3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + // ITER 1-15 + add.w tmp, poly, #strincr*3*(5) + vmov s14, tmp + 2: + vmov s6, poly + // polys upto 5.5q + load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4 + load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + + movw qa, #24608 + _3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, 
twiddle_ptr, q, qa, tmp + + // twisting + _3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp + + vmov poly, s6 + store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4 + str.w poly1, [poly, #distance/4] + str.w poly2, [poly, #2*distance/4] + str.w poly3, [poly, #3*distance/4] + str.w poly0, [poly], #4 + + vmov tmp, s14 + cmp.w poly, tmp + bne.w 2b + + vpop.w {s16-s23} + pop {r4-r11, pc} + +.unreq poly +.unreq twiddle_ptr +.unreq poly0 +.unreq poly1 +.unreq poly2 +.unreq poly3 +.unreq poly4 +.unreq poly5 +.unreq poly6 +.unreq poly7 +.unreq twiddle1 +.unreq twiddle2 +.unreq q +.unreq qa +.unreq tmp + + +################################### +#### small point-multiplication#### +#### r0: out; r1: in; r2: zetas#### +################################### +.align 2 +.global small_pointmul_asm_769 +.type small_pointmul_asm_769, %function +small_pointmul_asm_769: + push.w {r4-r11, lr} + + movw r14, #24608 // qa + movt r12, #769 // q + .equ width, 4 + + + add.w r3, r2, #64*width // r3 = loop bound: 64 zeta words past r2 + _point_mul_16_loop: + + ldr.w r7, [r1, #2*width] + ldr.w r8, [r1, #3*width] + ldr.w r9, [r2, #1*width] + ldr.w r5, [r1, #1*width] + ldr.w r4, [r1], #4*width // four input words are consumed per iteration + ldr.w r6, [r2], #2*width // two zetas are consumed per iteration + + smulwt r10, r6, r4 // r10 = (zeta * top halfword of r4) >> 16 + smlabt r10, r10, r12, r14 // r10 = bottom(r10) * top(r12) + r14 + pkhbt r4, r4, r10 // keep the bottom halfword of r4, take the top halfword from r10 + + neg.w r6, r6 // the second word of the pair uses the negated zeta + + smulwt r10, r6, r5 + smlabt r10, r10, r12, r14 + pkhbt r5, r5, r10 + + str.w r5, [r0, #1*width] + str.w r4, [r0], #2*width + + smulwt r10, r9, r7 + smlabt r10, r10, r12, r14 + pkhbt r7, r7, r10 + + neg.w r9, r9 + + smulwt r10, r9, r8 + smlabt r10, r10, r12, r14 + pkhbt r8, r8, r10 + + str.w r8, [r0, #1*width] + str.w r7, [r0], #2*width + + cmp.w r2, r3 + bne.w _point_mul_16_loop + + pop.w {r4-r11, pc} + + +#### r0: out; r1: a; r2: b; r3: bprime + .align 2 +.global small_asymmetric_mul_asm_769 +.type small_asymmetric_mul_asm_769, %function +small_asymmetric_mul_asm_769: + push.w {r4-r11, lr} + + movw r14,
#24608 // qa + movt r12, #769 // q + movw r11, #64769 + movt r11, #58632 // qinv + .equ width, 4 + add.w r10, r0, #256*2 // r10 = loop bound: 512 bytes past out + _asymmetric_mul_16_loop: + ldr.w r7, [r1, #width] + ldr.w r4, [r1], #2*width + ldr.w r8, [r2, #width] + ldr.w r5, [r2], #2*width + ldr.w r9, [r3, #width] + ldr.w r6, [r3], #2*width + + smuad r6, r4, r6 // bottom(a)*bottom(bprime) + top(a)*top(bprime) + plant_red r12, r14, r11, r6 + smuadx r5, r4, r5 // bottom(a)*top(b) + top(a)*bottom(b) + plant_red r12, r14, r11, r5 + + pkhtb r5, r5, r6, asr#16 // output word: top halfword from r5, bottom halfword from the top of r6 + str.w r5, [r0], #width + + smuad r6, r7, r9 + plant_red r12, r14, r11, r6 + smuadx r8, r7, r8 + plant_red r12, r14, r11, r8 + + pkhtb r8, r8, r6, asr#16 + str.w r8, [r0], #width + + cmp.w r0, r10 + bne.w _asymmetric_mul_16_loop + + pop.w {r4-r11, pc} \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/smallpoly.c b/crypto_sign/dilithium3/m4f/smallpoly.c new file mode 120000 index 0000000..b59f668 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/smallpoly.c @@ -0,0 +1 @@ +../../dilithium2/m4f/smallpoly.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/smallpoly.h b/crypto_sign/dilithium3/m4f/smallpoly.h new file mode 120000 index 0000000..9d46a7a --- /dev/null +++ b/crypto_sign/dilithium3/m4f/smallpoly.h @@ -0,0 +1 @@ +../../dilithium2/m4f/smallpoly.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/symmetric-shake.c b/crypto_sign/dilithium3/m4f/symmetric-shake.c new file mode 120000 index 0000000..6ad8054 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/symmetric-shake.c @@ -0,0 +1 @@ +../../dilithium2/m4f/symmetric-shake.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/symmetric.h b/crypto_sign/dilithium3/m4f/symmetric.h new file mode 120000 index 0000000..90ad5c0 --- /dev/null +++ b/crypto_sign/dilithium3/m4f/symmetric.h @@ -0,0 +1 @@ +../../dilithium2/m4f/symmetric.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/vector.h b/crypto_sign/dilithium3/m4f/vector.h new file mode 120000 index 0000000..6e2280f --- /dev/null +++
b/crypto_sign/dilithium3/m4f/vector.h @@ -0,0 +1 @@ +../../dilithium2/m4f/vector.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4f/vector.s b/crypto_sign/dilithium3/m4f/vector.s new file mode 120000 index 0000000..2d2b4dc --- /dev/null +++ b/crypto_sign/dilithium3/m4f/vector.s @@ -0,0 +1 @@ +../../dilithium2/m4f/vector.s \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/api.h b/crypto_sign/dilithium3/m4fstack/api.h new file mode 120000 index 0000000..9d1668d --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/api.h @@ -0,0 +1 @@ +../../dilithium2/m4f/api.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/config.h b/crypto_sign/dilithium3/m4fstack/config.h new file mode 120000 index 0000000..f3892d9 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/config.h @@ -0,0 +1 @@ +../m4f/config.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/macros.i b/crypto_sign/dilithium3/m4fstack/macros.i new file mode 120000 index 0000000..e3f2469 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/macros.i @@ -0,0 +1 @@ +../../dilithium2/m4f/macros.i \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/macros_smallntt.i b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i new file mode 120000 index 0000000..37838a2 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i @@ -0,0 +1 @@ +../../dilithium2/m4fstack/macros_smallntt.i \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/ntt.S b/crypto_sign/dilithium3/m4fstack/ntt.S new file mode 120000 index 0000000..6fbceff --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/ntt.S @@ -0,0 +1 @@ +../../dilithium2/m4f/ntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/ntt.h b/crypto_sign/dilithium3/m4fstack/ntt.h new file mode 120000 index 0000000..43729fe --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/ntt.h @@ -0,0 +1 @@ +../../dilithium2/m4f/ntt.h \ No newline at 
end of file diff --git a/crypto_sign/dilithium3/m4fstack/packing.c b/crypto_sign/dilithium3/m4fstack/packing.c new file mode 120000 index 0000000..b41782c --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/packing.c @@ -0,0 +1 @@ +../../dilithium2/m4f/packing.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/packing.h b/crypto_sign/dilithium3/m4fstack/packing.h new file mode 120000 index 0000000..ba1a6b3 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/packing.h @@ -0,0 +1 @@ +../../dilithium2/m4f/packing.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/params.h b/crypto_sign/dilithium3/m4fstack/params.h new file mode 120000 index 0000000..a6a4d8b --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/params.h @@ -0,0 +1 @@ +../../dilithium2/m4f/params.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/pointwise_mont.h b/crypto_sign/dilithium3/m4fstack/pointwise_mont.h new file mode 120000 index 0000000..0a6f8b9 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/pointwise_mont.h @@ -0,0 +1 @@ +../../dilithium2/m4f/pointwise_mont.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/pointwise_mont.s b/crypto_sign/dilithium3/m4fstack/pointwise_mont.s new file mode 120000 index 0000000..c4ddb96 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/pointwise_mont.s @@ -0,0 +1 @@ +../../dilithium2/m4f/pointwise_mont.s \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/poly.c b/crypto_sign/dilithium3/m4fstack/poly.c new file mode 120000 index 0000000..2544e75 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/poly.c @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/poly.h b/crypto_sign/dilithium3/m4fstack/poly.h new file mode 120000 index 0000000..7ef70e5 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/poly.h @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.h \ No newline at end of file diff 
--git a/crypto_sign/dilithium3/m4fstack/polyvec.c b/crypto_sign/dilithium3/m4fstack/polyvec.c new file mode 120000 index 0000000..a8edd0d --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/polyvec.c @@ -0,0 +1 @@ +../../dilithium2/m4f/polyvec.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/polyvec.h b/crypto_sign/dilithium3/m4fstack/polyvec.h new file mode 120000 index 0000000..cabd6a9 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/polyvec.h @@ -0,0 +1 @@ +../../dilithium2/m4f/polyvec.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/reduce.h b/crypto_sign/dilithium3/m4fstack/reduce.h new file mode 120000 index 0000000..f1e2b38 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/reduce.h @@ -0,0 +1 @@ +../../dilithium2/m4fstack/reduce.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/rounding.c b/crypto_sign/dilithium3/m4fstack/rounding.c new file mode 120000 index 0000000..80b8dce --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/rounding.c @@ -0,0 +1 @@ +../../dilithium2/m4f/rounding.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/rounding.h b/crypto_sign/dilithium3/m4fstack/rounding.h new file mode 120000 index 0000000..74c40c5 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/rounding.h @@ -0,0 +1 @@ +../../dilithium2/m4f/rounding.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c new file mode 120000 index 0000000..39f6ec4 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/sign.c @@ -0,0 +1 @@ +../../dilithium2/m4fstack/sign.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/sign.h b/crypto_sign/dilithium3/m4fstack/sign.h new file mode 120000 index 0000000..b7f1e89 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/sign.h @@ -0,0 +1 @@ +../../dilithium2/m4f/sign.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h 
b/crypto_sign/dilithium3/m4fstack/smallntt.h new file mode 120000 index 0000000..60f2d18 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/smallntt.h @@ -0,0 +1 @@ +../../dilithium2/m4fstack/smallntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/smallntt_769.S b/crypto_sign/dilithium3/m4fstack/smallntt_769.S new file mode 120000 index 0000000..4ae2f9b --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/smallntt_769.S @@ -0,0 +1 @@ +../../dilithium2/m4fstack/smallntt_769.S \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.c b/crypto_sign/dilithium3/m4fstack/smallpoly.c new file mode 120000 index 0000000..9c35056 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/smallpoly.c @@ -0,0 +1 @@ +../../dilithium2/m4fstack/smallpoly.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.h b/crypto_sign/dilithium3/m4fstack/smallpoly.h new file mode 120000 index 0000000..45701a4 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/smallpoly.h @@ -0,0 +1 @@ +../../dilithium2/m4fstack/smallpoly.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c new file mode 120000 index 0000000..d25ed6f --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/stack.c @@ -0,0 +1 @@ +../../dilithium2/m4fstack/stack.c \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h new file mode 120000 index 0000000..beab8ca --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/stack.h @@ -0,0 +1 @@ +../../dilithium2/m4fstack/stack.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/symmetric-shake.c b/crypto_sign/dilithium3/m4fstack/symmetric-shake.c new file mode 120000 index 0000000..6ad8054 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/symmetric-shake.c @@ -0,0 +1 @@ +../../dilithium2/m4f/symmetric-shake.c \ No newline at end of file diff --git 
a/crypto_sign/dilithium3/m4fstack/symmetric.h b/crypto_sign/dilithium3/m4fstack/symmetric.h new file mode 120000 index 0000000..90ad5c0 --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/symmetric.h @@ -0,0 +1 @@ +../../dilithium2/m4f/symmetric.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/vector.h b/crypto_sign/dilithium3/m4fstack/vector.h new file mode 120000 index 0000000..6e2280f --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/vector.h @@ -0,0 +1 @@ +../../dilithium2/m4f/vector.h \ No newline at end of file diff --git a/crypto_sign/dilithium3/m4fstack/vector.s b/crypto_sign/dilithium3/m4fstack/vector.s new file mode 120000 index 0000000..2d2b4dc --- /dev/null +++ b/crypto_sign/dilithium3/m4fstack/vector.s @@ -0,0 +1 @@ +../../dilithium2/m4f/vector.s \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/api.h b/crypto_sign/dilithium5/m4f/api.h new file mode 120000 index 0000000..9d1668d --- /dev/null +++ b/crypto_sign/dilithium5/m4f/api.h @@ -0,0 +1 @@ +../../dilithium2/m4f/api.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/basemul_257.S b/crypto_sign/dilithium5/m4f/basemul_257.S new file mode 120000 index 0000000..800dbb5 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/basemul_257.S @@ -0,0 +1 @@ +../../dilithium2/m4f/basemul_257.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/config.h b/crypto_sign/dilithium5/m4f/config.h new file mode 100644 index 0000000..c7aeafd --- /dev/null +++ b/crypto_sign/dilithium5/m4f/config.h @@ -0,0 +1,7 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#define DILITHIUM_MODE 5 +// #define SIGN_STACKSTRATEGY 2 + +#endif diff --git a/crypto_sign/dilithium5/m4f/fnt_257.S b/crypto_sign/dilithium5/m4f/fnt_257.S new file mode 120000 index 0000000..2a616c6 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/fnt_257.S @@ -0,0 +1 @@ +../../dilithium2/m4f/fnt_257.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/ifnt_257.S 
b/crypto_sign/dilithium5/m4f/ifnt_257.S new file mode 120000 index 0000000..65c99ba --- /dev/null +++ b/crypto_sign/dilithium5/m4f/ifnt_257.S @@ -0,0 +1 @@ +../../dilithium2/m4f/ifnt_257.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/macros.i b/crypto_sign/dilithium5/m4f/macros.i new file mode 120000 index 0000000..e3f2469 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/macros.i @@ -0,0 +1 @@ +../../dilithium2/m4f/macros.i \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/macros_fnt.i b/crypto_sign/dilithium5/m4f/macros_fnt.i new file mode 120000 index 0000000..1abff09 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/macros_fnt.i @@ -0,0 +1 @@ +../../dilithium2/m4f/macros_fnt.i \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/ntt.S b/crypto_sign/dilithium5/m4f/ntt.S new file mode 120000 index 0000000..6fbceff --- /dev/null +++ b/crypto_sign/dilithium5/m4f/ntt.S @@ -0,0 +1 @@ +../../dilithium2/m4f/ntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/ntt.h b/crypto_sign/dilithium5/m4f/ntt.h new file mode 120000 index 0000000..43729fe --- /dev/null +++ b/crypto_sign/dilithium5/m4f/ntt.h @@ -0,0 +1 @@ +../../dilithium2/m4f/ntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/packing.c b/crypto_sign/dilithium5/m4f/packing.c new file mode 120000 index 0000000..b41782c --- /dev/null +++ b/crypto_sign/dilithium5/m4f/packing.c @@ -0,0 +1 @@ +../../dilithium2/m4f/packing.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/packing.h b/crypto_sign/dilithium5/m4f/packing.h new file mode 120000 index 0000000..ba1a6b3 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/packing.h @@ -0,0 +1 @@ +../../dilithium2/m4f/packing.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/params.h b/crypto_sign/dilithium5/m4f/params.h new file mode 120000 index 0000000..a6a4d8b --- /dev/null +++ b/crypto_sign/dilithium5/m4f/params.h @@ -0,0 +1 @@ 
+../../dilithium2/m4f/params.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/pointwise_mont.h b/crypto_sign/dilithium5/m4f/pointwise_mont.h new file mode 120000 index 0000000..0a6f8b9 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/pointwise_mont.h @@ -0,0 +1 @@ +../../dilithium2/m4f/pointwise_mont.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/pointwise_mont.s b/crypto_sign/dilithium5/m4f/pointwise_mont.s new file mode 120000 index 0000000..c4ddb96 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/pointwise_mont.s @@ -0,0 +1 @@ +../../dilithium2/m4f/pointwise_mont.s \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/poly.c b/crypto_sign/dilithium5/m4f/poly.c new file mode 120000 index 0000000..2544e75 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/poly.c @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/poly.h b/crypto_sign/dilithium5/m4f/poly.h new file mode 120000 index 0000000..7ef70e5 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/poly.h @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/polyvec.c b/crypto_sign/dilithium5/m4f/polyvec.c new file mode 120000 index 0000000..a8edd0d --- /dev/null +++ b/crypto_sign/dilithium5/m4f/polyvec.c @@ -0,0 +1 @@ +../../dilithium2/m4f/polyvec.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/polyvec.h b/crypto_sign/dilithium5/m4f/polyvec.h new file mode 120000 index 0000000..cabd6a9 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/polyvec.h @@ -0,0 +1 @@ +../../dilithium2/m4f/polyvec.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/reduce.h b/crypto_sign/dilithium5/m4f/reduce.h new file mode 120000 index 0000000..6c13df5 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/reduce.h @@ -0,0 +1 @@ +../../dilithium2/m4f/reduce.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/rounding.c 
b/crypto_sign/dilithium5/m4f/rounding.c new file mode 120000 index 0000000..80b8dce --- /dev/null +++ b/crypto_sign/dilithium5/m4f/rounding.c @@ -0,0 +1 @@ +../../dilithium2/m4f/rounding.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/rounding.h b/crypto_sign/dilithium5/m4f/rounding.h new file mode 120000 index 0000000..74c40c5 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/rounding.h @@ -0,0 +1 @@ +../../dilithium2/m4f/rounding.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/sign.c b/crypto_sign/dilithium5/m4f/sign.c new file mode 120000 index 0000000..b7ccdf0 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/sign.c @@ -0,0 +1 @@ +../../dilithium2/m4f/sign.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/sign.h b/crypto_sign/dilithium5/m4f/sign.h new file mode 120000 index 0000000..b7f1e89 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/sign.h @@ -0,0 +1 @@ +../../dilithium2/m4f/sign.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/smallntt.h b/crypto_sign/dilithium5/m4f/smallntt.h new file mode 120000 index 0000000..9b2baf4 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/smallntt.h @@ -0,0 +1 @@ +../../dilithium2/m4f/smallntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/smallpoly.c b/crypto_sign/dilithium5/m4f/smallpoly.c new file mode 120000 index 0000000..b59f668 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/smallpoly.c @@ -0,0 +1 @@ +../../dilithium2/m4f/smallpoly.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/smallpoly.h b/crypto_sign/dilithium5/m4f/smallpoly.h new file mode 120000 index 0000000..9d46a7a --- /dev/null +++ b/crypto_sign/dilithium5/m4f/smallpoly.h @@ -0,0 +1 @@ +../../dilithium2/m4f/smallpoly.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/symmetric-shake.c b/crypto_sign/dilithium5/m4f/symmetric-shake.c new file mode 120000 index 0000000..6ad8054 --- /dev/null +++ 
b/crypto_sign/dilithium5/m4f/symmetric-shake.c @@ -0,0 +1 @@ +../../dilithium2/m4f/symmetric-shake.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/symmetric.h b/crypto_sign/dilithium5/m4f/symmetric.h new file mode 120000 index 0000000..90ad5c0 --- /dev/null +++ b/crypto_sign/dilithium5/m4f/symmetric.h @@ -0,0 +1 @@ +../../dilithium2/m4f/symmetric.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/vector.h b/crypto_sign/dilithium5/m4f/vector.h new file mode 120000 index 0000000..6e2280f --- /dev/null +++ b/crypto_sign/dilithium5/m4f/vector.h @@ -0,0 +1 @@ +../../dilithium2/m4f/vector.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4f/vector.s b/crypto_sign/dilithium5/m4f/vector.s new file mode 120000 index 0000000..2d2b4dc --- /dev/null +++ b/crypto_sign/dilithium5/m4f/vector.s @@ -0,0 +1 @@ +../../dilithium2/m4f/vector.s \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/api.h b/crypto_sign/dilithium5/m4fstack/api.h new file mode 120000 index 0000000..9d1668d --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/api.h @@ -0,0 +1 @@ +../../dilithium2/m4f/api.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/config.h b/crypto_sign/dilithium5/m4fstack/config.h new file mode 120000 index 0000000..f3892d9 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/config.h @@ -0,0 +1 @@ +../m4f/config.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/macros.i b/crypto_sign/dilithium5/m4fstack/macros.i new file mode 120000 index 0000000..e3f2469 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/macros.i @@ -0,0 +1 @@ +../../dilithium2/m4f/macros.i \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/macros_smallntt.i b/crypto_sign/dilithium5/m4fstack/macros_smallntt.i new file mode 120000 index 0000000..37838a2 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/macros_smallntt.i @@ -0,0 +1 @@ 
+../../dilithium2/m4fstack/macros_smallntt.i \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/ntt.S b/crypto_sign/dilithium5/m4fstack/ntt.S new file mode 120000 index 0000000..6fbceff --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/ntt.S @@ -0,0 +1 @@ +../../dilithium2/m4f/ntt.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/ntt.h b/crypto_sign/dilithium5/m4fstack/ntt.h new file mode 120000 index 0000000..43729fe --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/ntt.h @@ -0,0 +1 @@ +../../dilithium2/m4f/ntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/packing.c b/crypto_sign/dilithium5/m4fstack/packing.c new file mode 120000 index 0000000..b41782c --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/packing.c @@ -0,0 +1 @@ +../../dilithium2/m4f/packing.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/packing.h b/crypto_sign/dilithium5/m4fstack/packing.h new file mode 120000 index 0000000..ba1a6b3 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/packing.h @@ -0,0 +1 @@ +../../dilithium2/m4f/packing.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/params.h b/crypto_sign/dilithium5/m4fstack/params.h new file mode 120000 index 0000000..a6a4d8b --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/params.h @@ -0,0 +1 @@ +../../dilithium2/m4f/params.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/pointwise_mont.h b/crypto_sign/dilithium5/m4fstack/pointwise_mont.h new file mode 120000 index 0000000..0a6f8b9 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/pointwise_mont.h @@ -0,0 +1 @@ +../../dilithium2/m4f/pointwise_mont.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/pointwise_mont.s b/crypto_sign/dilithium5/m4fstack/pointwise_mont.s new file mode 120000 index 0000000..c4ddb96 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/pointwise_mont.s @@ -0,0 +1 @@ 
+../../dilithium2/m4f/pointwise_mont.s \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/poly.c b/crypto_sign/dilithium5/m4fstack/poly.c new file mode 120000 index 0000000..2544e75 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/poly.c @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/poly.h b/crypto_sign/dilithium5/m4fstack/poly.h new file mode 120000 index 0000000..7ef70e5 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/poly.h @@ -0,0 +1 @@ +../../dilithium2/m4f/poly.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/polyvec.c b/crypto_sign/dilithium5/m4fstack/polyvec.c new file mode 120000 index 0000000..a8edd0d --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/polyvec.c @@ -0,0 +1 @@ +../../dilithium2/m4f/polyvec.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/polyvec.h b/crypto_sign/dilithium5/m4fstack/polyvec.h new file mode 120000 index 0000000..cabd6a9 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/polyvec.h @@ -0,0 +1 @@ +../../dilithium2/m4f/polyvec.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/reduce.h b/crypto_sign/dilithium5/m4fstack/reduce.h new file mode 120000 index 0000000..f1e2b38 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/reduce.h @@ -0,0 +1 @@ +../../dilithium2/m4fstack/reduce.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/rounding.c b/crypto_sign/dilithium5/m4fstack/rounding.c new file mode 120000 index 0000000..80b8dce --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/rounding.c @@ -0,0 +1 @@ +../../dilithium2/m4f/rounding.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/rounding.h b/crypto_sign/dilithium5/m4fstack/rounding.h new file mode 120000 index 0000000..74c40c5 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/rounding.h @@ -0,0 +1 @@ +../../dilithium2/m4f/rounding.h \ No newline at end of 
file diff --git a/crypto_sign/dilithium5/m4fstack/sign.c b/crypto_sign/dilithium5/m4fstack/sign.c new file mode 120000 index 0000000..39f6ec4 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/sign.c @@ -0,0 +1 @@ +../../dilithium2/m4fstack/sign.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/sign.h b/crypto_sign/dilithium5/m4fstack/sign.h new file mode 120000 index 0000000..b7f1e89 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/sign.h @@ -0,0 +1 @@ +../../dilithium2/m4f/sign.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallntt.h b/crypto_sign/dilithium5/m4fstack/smallntt.h new file mode 120000 index 0000000..60f2d18 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallntt.h @@ -0,0 +1 @@ +../../dilithium2/m4fstack/smallntt.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallntt_769.S b/crypto_sign/dilithium5/m4fstack/smallntt_769.S new file mode 120000 index 0000000..4ae2f9b --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallntt_769.S @@ -0,0 +1 @@ +../../dilithium2/m4fstack/smallntt_769.S \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallpoly.c b/crypto_sign/dilithium5/m4fstack/smallpoly.c new file mode 120000 index 0000000..9c35056 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallpoly.c @@ -0,0 +1 @@ +../../dilithium2/m4fstack/smallpoly.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/smallpoly.h b/crypto_sign/dilithium5/m4fstack/smallpoly.h new file mode 120000 index 0000000..45701a4 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/smallpoly.h @@ -0,0 +1 @@ +../../dilithium2/m4fstack/smallpoly.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/stack.c b/crypto_sign/dilithium5/m4fstack/stack.c new file mode 120000 index 0000000..d25ed6f --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/stack.c @@ -0,0 +1 @@ +../../dilithium2/m4fstack/stack.c \ No newline at end of file 
diff --git a/crypto_sign/dilithium5/m4fstack/stack.h b/crypto_sign/dilithium5/m4fstack/stack.h new file mode 120000 index 0000000..beab8ca --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/stack.h @@ -0,0 +1 @@ +../../dilithium2/m4fstack/stack.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/symmetric-shake.c b/crypto_sign/dilithium5/m4fstack/symmetric-shake.c new file mode 120000 index 0000000..6ad8054 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/symmetric-shake.c @@ -0,0 +1 @@ +../../dilithium2/m4f/symmetric-shake.c \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/symmetric.h b/crypto_sign/dilithium5/m4fstack/symmetric.h new file mode 120000 index 0000000..90ad5c0 --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/symmetric.h @@ -0,0 +1 @@ +../../dilithium2/m4f/symmetric.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/vector.h b/crypto_sign/dilithium5/m4fstack/vector.h new file mode 120000 index 0000000..6e2280f --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/vector.h @@ -0,0 +1 @@ +../../dilithium2/m4f/vector.h \ No newline at end of file diff --git a/crypto_sign/dilithium5/m4fstack/vector.s b/crypto_sign/dilithium5/m4fstack/vector.s new file mode 120000 index 0000000..2d2b4dc --- /dev/null +++ b/crypto_sign/dilithium5/m4fstack/vector.s @@ -0,0 +1 @@ +../../dilithium2/m4f/vector.s \ No newline at end of file diff --git a/hostside/host_unidirectional.py b/hostside/host_unidirectional.py new file mode 100755 index 0000000..dc51da8 --- /dev/null +++ b/hostside/host_unidirectional.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +import serial +import sys +import platform + +if platform.system() == "Darwin": + dev = serial.Serial("/dev/tty.usbserial-0001", 38400) +else: + dev = serial.Serial("/dev/ttyUSB0", 38400) + +print("> Returned data:", file=sys.stderr) + +while True: + x = dev.read() + sys.stdout.buffer.write(x) + sys.stdout.flush() diff --git a/interface.py b/interface.py 
new file mode 100644 index 0000000..b767272 --- /dev/null +++ b/interface.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +import argparse + +from mupq import mupq +from mupq import platforms + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="PQM4 Specific Settings") + parser.add_argument( + "-p", + "--platform", + help="The PQM4 platform", + choices=["nucleo-f767zi"], + default="nucleo-f767zi", + ) + parser.add_argument( + "-o", + "--opt", + help="Optimization flags", + choices=["speed", "size", "debug"], + default="speed", + ) + parser.add_argument( + "-l", "--lto", help="Enable LTO flags", default=False, action="store_true" + ) + parser.add_argument( + "--no-aio", + help="Disable all-in-one compilation", + default=False, + action="store_true", + ) + parser.add_argument( + "-u", "--uart", default="/dev/ttyUSB0", help="Path to UART output" + ) + parser.add_argument( + "-i", + "--iterations", + type=int, + default=1, + help="Number of iterations for benchmarks", + ) + return parser.parse_known_args() + + +def get_platform(args): + platform = None + bin_type = "bin" + if args.platform == "nucleo-f767zi": + bin_type = "hex" + platform = platforms.OpenOCD("board/st_nucleo_f7.cfg", args.uart) + else: + raise NotImplementedError("Unsupported Platform") + settings = M7Settings( + args.platform, args.opt, args.lto, not args.no_aio, args.iterations, bin_type + ) + return platform, settings + + +class M7Settings(mupq.PlatformSettings): + #: Specify folders to include + scheme_folders = [ # mupq.PlatformSettings.scheme_folders + [ + ("pqm4", "crypto_kem", ""), + ("pqm4", "crypto_sign", ""), + ("mupq", "mupq/crypto_kem", ""), + ("mupq", "mupq/crypto_sign", ""), + ("pqclean", "mupq/pqclean/crypto_kem", "PQCLEAN"), + ("pqclean", "mupq/pqclean/crypto_sign", "PQCLEAN"), + ] + + platform_memory = { + "nucleo-f767zi": 384 * 1024, + } + + def __init__( + self, + platform, + opt="speed", + lto=False, + aio=False, + iterations=1, + 
binary_type="bin", + ): + """Initialize with a specific platform""" + import skiplist + + self.skip_list = [] + for impl in skiplist.skip_list: + if impl["estmemory"] > self.platform_memory[platform]: + impl = impl.copy() + del impl["estmemory"] + self.skip_list.append(impl) + self.skip_list.append({"implementation": "vec"}) + self.binary_type = binary_type + optflags = {"speed": [], "size": ["OPT_SIZE=1"], "debug": ["DEBUG=1"]} + if opt not in optflags: + raise ValueError(f"Optimization flag should be in {list(optflags.keys())}") + super(M7Settings, self).__init__() + self.makeflags = [f"PLATFORM={platform}"] + self.makeflags += [f"MUPQ_ITERATIONS={iterations}"] + self.makeflags += optflags[opt] + self.iterations = iterations + if lto: + self.makeflags += ["LTO=1"] + else: + self.makeflags += ["LTO="] + if aio: + self.makeflags += ["AIO=1"] + else: + self.makeflags += ["AIO="] diff --git a/ldscripts/devices.data b/ldscripts/devices.data new file mode 100644 index 0000000..b051f2f --- /dev/null +++ b/ldscripts/devices.data @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +stm32f407vg stm32f4 ROM=1024K RAM=128K +stm32f4 END ROM_OFF=0x08000000 RAM_OFF=0x20000000 CPU=cortex-m4 FPU=hard-fpv4-sp-d16 +stm32f767zi stm32f7 ROM=2048K RAM=384K +stm32f7 END ROM_OFF=0x08000000 RAM_OFF=0x20010000 CPU=cortex-m7 FPU=hard-fpv5-sp-d16 diff --git a/libopencm3 b/libopencm3 new file mode 160000 index 0000000..201f5bc --- /dev/null +++ b/libopencm3 @@ -0,0 +1 @@ +Subproject commit 201f5bcfb3fa70ee34818152463e7139f24db377 diff --git a/mk/config.mk b/mk/config.mk new file mode 100644 index 0000000..85fc914 --- /dev/null +++ b/mk/config.mk @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +CPPFLAGS += \ + -DPQM4 diff --git a/mk/crypto.mk b/mk/crypto.mk new file mode 100644 index 0000000..7aae6b1 --- /dev/null +++ b/mk/crypto.mk @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +SYMCRYPTO_SRC = \ + mupq/common/fips202.c \ + mupq/common/sp800-185.c \ + 
mupq/common/nistseedexpander.c \ + common/keccakf1600.S \ + common/aes.c \ + common/aes-encrypt.S \ + common/aes-keyschedule.S \ + common/aes-publicinputs.c \ + common/aes-publicinputs.S \ + mupq/common/sha2.c \ + common/crypto_hashblocks_sha512_inner32.s \ + common/crypto_hashblocks_sha512.c + +obj/libsymcrypto.a: $(call objs,$(SYMCRYPTO_SRC)) + +obj/libsymcrypto-hashprof.a: CPPFLAGS+=-DPROFILE_HASHING +obj/libsymcrypto-hashprof.a: $(call hashprofobjs,$(SYMCRYPTO_SRC)) + +ifeq ($(AIO),1) +LDLIBS += +LIBDEPS += $(SYMCRYPTO_SRC) +CPPFLAGS+=$(if $(PROFILE_HASHING),-DPROFILE_HASHING) +else +LDLIBS += -lsymcrypto$(if $(PROFILE_HASHING),-hashprof) +LIBDEPS += obj/libsymcrypto$(if $(PROFILE_HASHING),-hashprof).a +endif + diff --git a/mk/nucleo-f767zi.mk b/mk/nucleo-f767zi.mk new file mode 100644 index 0000000..b5860eb --- /dev/null +++ b/mk/nucleo-f767zi.mk @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +DEVICE=stm32f767zi +OPENCM3_TARGET=lib/stm32/f7 + +EXCLUDED_SCHEMES = \ + mupq/pqclean/crypto_kem/mceliece% \ + mupq/pqclean/crypto_kem/hqc% \ + mupq/pqclean/crypto_sign/falcon% \ + mupq/pqclean/crypto_sign/sphincs% \ + mupq/crypto_kem/bike% \ + mupq/crypto_sign/aimer% \ + mupq/crypto_sign/ascon% \ + mupq/crypto_sign/biscuit% \ + mupq/crypto_sign/cross% \ + mupq/crypto_sign/falcon% \ + mupq/crypto_sign/haetae% \ + mupq/crypto_sign/hawk% \ + mupq/crypto_sign/mayo% \ + mupq/crypto_sign/meds% \ + mupq/crypto_sign/mirith% \ + mupq/crypto_sign/mqom% \ + mupq/crypto_sign/ov-Ip% \ + mupq/crypto_sign/perk% \ + mupq/crypto_sign/snova% \ + mupq/crypto_sign/sphincs% \ + mupq/crypto_sign/tuov% + +include mk/opencm3.mk + +elf/boardtest.elf: CPPFLAGS+=-DSRAM_TIMING_TEST -DHAS_SRAM2 -DHAS_CCM +elf/boardtest-fast.elf: CPPFLAGS+=-DSRAM_TIMING_TEST -DHAS_SRAM2 -DHAS_CCM diff --git a/mk/opencm3.mk b/mk/opencm3.mk new file mode 100644 index 0000000..6e07c71 --- /dev/null +++ b/mk/opencm3.mk @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +LIBHAL_SRC 
:= \ + common/hal-opencm3.c \ + common/randombytes.c + +obj/libpqm4hal.a: $(call objs,$(LIBHAL_SRC)) +obj/libpqm4hal-nornd.a: $(call objs,$(filter-out common/randombytes.c,$(LIBHAL_SRC))) + +ifeq ($(AIO),1) +LDLIBS += +LIBDEPS += $(if $(NO_RANDOMBYTES),$(filter-out common/randombytes.c,$(LIBHAL_SRC)),$(LIBHAL_SRC)) +else +LDLIBS += -lpqm4hal$(if $(NO_RANDOMBYTES),-nornd) +LIBDEPS += obj/libpqm4hal$(if $(NO_RANDOMBYTES),-nornd).a +endif + +LDLIBS += -lc -lgcc + +export OPENCM3_DIR := $(CURDIR)/libopencm3 + +_git_submodule_update_opencm3 := $(shell git submodule update --init --recursive $(OPENCM3_DIR)) + +ifeq ($(DEVICE),) +$(warning no DEVICE specified for linker script generator) +endif + +DEVICES_DATA ?= $(OPENCM3_DIR)/ld/devices.data + +genlink_family :=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) FAMILY) +genlink_subfamily :=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) SUBFAMILY) +genlink_cpu :=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) CPU) +genlink_fpu :=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) FPU) +genlink_cppflags :=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) CPPFLAGS) + +ifeq ($(genlink_family),) +$(warning $(DEVICE) not found in $(DEVICES_DATA)) +endif + +CPPFLAGS += $(genlink_cppflags) + +ARCH_FLAGS :=-mcpu=$(genlink_cpu) +ifeq ($(genlink_cpu),$(filter $(genlink_cpu),cortex-m0 cortex-m0plus cortex-m3 cortex-m4 cortex-m7)) +ARCH_FLAGS +=-mthumb +endif + +ifeq ($(genlink_fpu),soft) +ARCH_FLAGS += -msoft-float +else ifeq ($(genlink_fpu),hard-fpv4-sp-d16) +ARCH_FLAGS += -mfloat-abi=hard -mfpu=fpv4-sp-d16 +else ifeq ($(genlink_fpu),hard-fpv5-sp-d16) +ARCH_FLAGS += -mfloat-abi=hard -mfpu=fpv5-sp-d16 +else +$(warning No match for the FPU flags) +endif + +LIBNAME = opencm3_$(genlink_family) + +LDLIBS += -l$(LIBNAME) +LIBDEPS += $(OPENCM3_DIR)/lib/lib$(LIBNAME).a + +LDFLAGS += -L$(OPENCM3_DIR)/lib +CPPFLAGS += -I$(OPENCM3_DIR)/include + 
+$(OPENCM3_DIR)/lib/lib$(LIBNAME).a: + $(MAKE) -C $(OPENCM3_DIR) $(OPENCM3_TARGET) + +obj/common/hal-opencm3.c.o: $(OPENCM3_DIR)/lib/lib$(LIBNAME).a + +ifeq ($(wildcard ldscripts/$(PLATFORM).ld),) +LDSCRIPT = obj/generated.$(DEVICE).ld +$(LDSCRIPT): $(OPENCM3_DIR)/ld/linker.ld.S $(OPENCM3_DIR)/ld/devices.data $(CONFIG) + @printf " GENLNK $(DEVICE)\n" + $(Q)mkdir -p $(@D) + $(Q)$(CPP) $(ARCH_FLAGS) $(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) DEFS) -P -E $< -o $@ +else +LDSCRIPT = ldscripts/$(PLATFORM).ld +endif + +CROSS_PREFIX ?= arm-none-eabi +CC := $(CROSS_PREFIX)-gcc +CPP := $(CROSS_PREFIX)-cpp +AR := $(CROSS_PREFIX)-gcc-ar +LD := $(CC) +OBJCOPY := $(CROSS_PREFIX)-objcopy +SIZE := $(CROSS_PREFIX)-size + +CFLAGS += \ + $(ARCH_FLAGS) \ + +LDFLAGS += \ + --specs=nosys.specs \ + -Wl,--wrap=_sbrk \ + -Wl,--wrap=_close \ + -Wl,--wrap=_isatty \ + -Wl,--wrap=_kill \ + -Wl,--wrap=_lseek \ + -Wl,--wrap=_read \ + -Wl,--wrap=_write \ + -Wl,--wrap=_fstat \ + -Wl,--wrap=_getpid \ + -nostartfiles \ + -ffreestanding \ + -T$(LDSCRIPT) \ + $(ARCH_FLAGS) + +.PHONY: libclean + +libclean: + make -C $(OPENCM3_DIR) clean + +LINKDEPS += $(LDSCRIPT) $(LIBDEPS) diff --git a/mk/tests.mk b/mk/tests.mk new file mode 100644 index 0000000..8b6aab8 --- /dev/null +++ b/mk/tests.mk @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +ifeq ($(AIO),1) +elf/boardtest.elf: common/test.c $(LINKDEPS) $(CONFIG) + $(compiletest) + +elf/boardtest-fast.elf: common/testfast.c $(LINKDEPS) $(CONFIG) + $(compiletest) + +elf/boardtest-fast.elf: CPPFLAGS += -DCLOCK_TEST=CLOCK_FAST + +elf/aestest.elf: common/aestest.c $(LINKDEPS) $(CONFIG) + $(compiletest) + +elf/keccaktest.elf: common/keccaktest.c $(LINKDEPS) $(CONFIG) + $(compiletest) +else +elf/boardtest.elf: $(call objs,common/test.c) $(LINKDEPS) $(CONFIG) + +elf/boardtest-fast.elf: $(call objs,common/testfast.c) $(LINKDEPS) $(CONFIG) + +$(call objs,common/testfast.c): CPPFLAGS += -DCLOCK_TEST=CLOCK_FAST + 
+elf/aestest.elf: $(call objs,common/aestest.c) $(LINKDEPS) $(CONFIG) + +elf/keccaktest.elf: $(call objs,common/keccaktest.c) $(LINKDEPS) $(CONFIG) +endif + +tests: elf/boardtest.elf elf/aestest.elf elf/keccaktest.elf +tests-bin: bin/boardtest.bin bin/aestest.bin bin/keccaktest.bin diff --git a/mupq b/mupq new file mode 160000 index 0000000..8e62b94 --- /dev/null +++ b/mupq @@ -0,0 +1 @@ +Subproject commit 8e62b94bfb8125fc81ac2774f8aa8b44120bc619 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4a96b58 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +pyserial==3.4 +tqdm diff --git a/skiplist.py b/skiplist.py new file mode 100644 index 0000000..397c78f --- /dev/null +++ b/skiplist.py @@ -0,0 +1,250 @@ +skip_list = [ + {'scheme': 'aimer192s', 'implementation': 'opt_mem', 'estmemory': 70656}, + {'scheme': 'aimer192s', 'implementation': 'ref', 'estmemory': 2036736}, + {'scheme': 'aimer192f', 'implementation': 'opt_mem', 'estmemory': 46080}, + {'scheme': 'aimer192f', 'implementation': 'ref', 'estmemory': 287744}, + {'scheme': 'aimer128s', 'implementation': 'opt_mem', 'estmemory': 39936}, + {'scheme': 'aimer128s', 'implementation': 'ref', 'estmemory': 924672}, + {'scheme': 'aimer256f', 'implementation': 'opt_mem', 'estmemory': 105472}, + {'scheme': 'aimer256f', 'implementation': 'ref', 'estmemory': 600064}, + {'scheme': 'aimer256s', 'implementation': 'opt_mem', 'estmemory': 135168}, + {'scheme': 'aimer256s', 'implementation': 'ref', 'estmemory': 4148224}, + {'scheme': 'aimer128f', 'implementation': 'opt_mem', 'estmemory': 22528}, + {'scheme': 'aimer128f', 'implementation': 'ref', 'estmemory': 131072}, + {'scheme': 'ascon-sign-128f-robust', 'implementation': 'ref', 'estmemory': 21504}, + {'scheme': 'ascon-sign-128f-simple', 'implementation': 'ref', 'estmemory': 21504}, + {'scheme': 'ascon-sign-128s-robust', 'implementation': 'ref', 'estmemory': 12288}, + {'scheme': 'ascon-sign-128s-simple', 'implementation': 'ref', 'estmemory': 
12288}, + {'scheme': 'ascon-sign-192f-robust', 'implementation': 'ref', 'estmemory': 43008}, + {'scheme': 'ascon-sign-192f-simple', 'implementation': 'ref', 'estmemory': 41984}, + {'scheme': 'ascon-sign-192s-robust', 'implementation': 'ref', 'estmemory': 23552}, + {'scheme': 'ascon-sign-192s-simple', 'implementation': 'ref', 'estmemory': 22528}, + {'scheme': 'bikel1', 'implementation': 'm4f', 'estmemory': 103424}, + {'scheme': 'bikel1', 'implementation': 'opt', 'estmemory': 90112}, + {'scheme': 'bikel3', 'implementation': 'm4f', 'estmemory': 194560}, + {'scheme': 'bikel3', 'implementation': 'opt', 'estmemory': 175104}, + {'scheme': 'biscuit128f', 'implementation': 'ref', 'estmemory': 145408}, + {'scheme': 'biscuit128s', 'implementation': 'ref', 'estmemory': 1099776}, + {'scheme': 'biscuit192f', 'implementation': 'ref', 'estmemory': 282624}, + {'scheme': 'biscuit192s', 'implementation': 'ref', 'estmemory': 2257920}, + {'scheme': 'biscuit256f', 'implementation': 'ref', 'estmemory': 505856}, + {'scheme': 'biscuit256s', 'implementation': 'ref', 'estmemory': 4004864}, + {'scheme': 'cross-sha2-r-sdp-1-fast', 'implementation': 'ref', 'estmemory': 234496}, + {'scheme': 'cross-sha2-r-sdp-1-small', 'implementation': 'ref', 'estmemory': 721920}, + {'scheme': 'cross-sha2-r-sdp-3-fast', 'implementation': 'ref', 'estmemory': 365568}, + {'scheme': 'cross-sha2-r-sdp-3-small', 'implementation': 'ref', 'estmemory': 1295360}, + {'scheme': 'cross-sha2-r-sdp-5-fast', 'implementation': 'ref', 'estmemory': 914432}, + {'scheme': 'cross-sha2-r-sdp-5-small', 'implementation': 'ref', 'estmemory': 1748992}, + {'scheme': 'cross-sha2-r-sdpg-1-fast', 'implementation': 'ref', 'estmemory': 143360}, + {'scheme': 'cross-sha2-r-sdpg-1-small', 'implementation': 'ref', 'estmemory': 477184}, + {'scheme': 'cross-sha2-r-sdpg-3-fast', 'implementation': 'ref', 'estmemory': 230400}, + {'scheme': 'cross-sha2-r-sdpg-3-small', 'implementation': 'ref', 'estmemory': 776192}, + {'scheme': 
'cross-sha2-r-sdpg-5-fast', 'implementation': 'ref', 'estmemory': 440320}, + {'scheme': 'cross-sha2-r-sdpg-5-small', 'implementation': 'ref', 'estmemory': 1063936}, + {'scheme': 'cross-sha3-r-sdp-1-fast', 'implementation': 'ref', 'estmemory': 234496}, + {'scheme': 'cross-sha3-r-sdp-1-small', 'implementation': 'ref', 'estmemory': 721920}, + {'scheme': 'cross-sha3-r-sdp-3-fast', 'implementation': 'ref', 'estmemory': 365568}, + {'scheme': 'cross-sha3-r-sdp-3-small', 'implementation': 'ref', 'estmemory': 1295360}, + {'scheme': 'cross-sha3-r-sdp-5-fast', 'implementation': 'ref', 'estmemory': 914432}, + {'scheme': 'cross-sha3-r-sdp-5-small', 'implementation': 'ref', 'estmemory': 1748992}, + {'scheme': 'cross-sha3-r-sdpg-1-fast', 'implementation': 'ref', 'estmemory': 143360}, + {'scheme': 'cross-sha3-r-sdpg-1-small', 'implementation': 'ref', 'estmemory': 477184}, + {'scheme': 'cross-sha3-r-sdpg-3-fast', 'implementation': 'ref', 'estmemory': 230400}, + {'scheme': 'cross-sha3-r-sdpg-3-small', 'implementation': 'ref', 'estmemory': 776192}, + {'scheme': 'cross-sha3-r-sdpg-5-fast', 'implementation': 'ref', 'estmemory': 440320}, + {'scheme': 'cross-sha3-r-sdpg-5-small', 'implementation': 'ref', 'estmemory': 1063936}, + {'scheme': 'dilithium2', 'implementation': 'clean', 'estmemory': 59392}, + {'scheme': 'dilithium2', 'implementation': 'm4f', 'estmemory': 57344}, + {'scheme': 'dilithium3', 'implementation': 'clean', 'estmemory': 90112}, + {'scheme': 'dilithium3', 'implementation': 'm4f', 'estmemory': 79872}, + {'scheme': 'dilithium5', 'implementation': 'clean', 'estmemory': 136192}, + {'scheme': 'dilithium5', 'implementation': 'm4f', 'estmemory': 129024}, + {'scheme': 'falcon-1024', 'implementation': 'clean', 'estmemory': 91136}, + {'scheme': 'falcon-1024', 'implementation': 'm4-ct', 'estmemory': 89088}, + {'scheme': 'falcon-1024', 'implementation': 'opt-ct', 'estmemory': 89088}, + {'scheme': 'falcon-1024', 'implementation': 'opt-leaktime', 'estmemory': 90112}, + {'scheme': 
'falcon-1024-tree', 'implementation': 'opt-ct', 'estmemory': 185344}, + {'scheme': 'falcon-1024-tree', 'implementation': 'opt-leaktime', 'estmemory': 186368}, + {'scheme': 'falcon-512', 'implementation': 'clean', 'estmemory': 48128}, + {'scheme': 'falcon-512', 'implementation': 'm4-ct', 'estmemory': 46080}, + {'scheme': 'falcon-512', 'implementation': 'opt-ct', 'estmemory': 46080}, + {'scheme': 'falcon-512', 'implementation': 'opt-leaktime', 'estmemory': 47104}, + {'scheme': 'falcon-512-tree', 'implementation': 'm4-ct', 'estmemory': 90112}, + {'scheme': 'falcon-512-tree', 'implementation': 'opt-ct', 'estmemory': 90112}, + {'scheme': 'falcon-512-tree', 'implementation': 'opt-leaktime', 'estmemory': 91136}, + {'scheme': 'haetae2', 'implementation': 'm4f', 'estmemory': 60416}, + {'scheme': 'haetae2', 'implementation': 'ref', 'estmemory': 59392}, + {'scheme': 'haetae3', 'implementation': 'm4f', 'estmemory': 90112}, + {'scheme': 'haetae3', 'implementation': 'ref', 'estmemory': 87040}, + {'scheme': 'haetae5', 'implementation': 'm4f', 'estmemory': 112640}, + {'scheme': 'haetae5', 'implementation': 'ref', 'estmemory': 109568}, + {'scheme': 'hawk1024', 'implementation': 'ref', 'estmemory': 32768}, + {'scheme': 'hawk256', 'implementation': 'ref', 'estmemory': 10240}, + {'scheme': 'hawk512', 'implementation': 'ref', 'estmemory': 17408}, + {'scheme': 'hqc-128', 'implementation': 'clean', 'estmemory': 66560}, + {'scheme': 'hqc-192', 'implementation': 'clean', 'estmemory': 130048}, + {'scheme': 'hqc-256', 'implementation': 'clean', 'estmemory': 205824}, + {'scheme': 'ml-kem-1024', 'implementation': 'clean', 'estmemory': 27648}, + {'scheme': 'ml-kem-1024', 'implementation': 'm7fspeed', 'estmemory': 16384}, + {'scheme': 'ml-kem-1024', 'implementation': 'm7fstack', 'estmemory': 12288}, + {'scheme': 'ml-kem-512', 'implementation': 'clean', 'estmemory': 14336}, + {'scheme': 'ml-kem-512', 'implementation': 'm7fspeed', 'estmemory': 10240}, + {'scheme': 'ml-kem-512', 'implementation': 
'm7fstack', 'estmemory': 7168}, + {'scheme': 'ml-kem-768', 'implementation': 'clean', 'estmemory': 20480}, + {'scheme': 'ml-kem-768', 'implementation': 'm7fspeed', 'estmemory': 13312}, + {'scheme': 'ml-kem-768', 'implementation': 'm7fstack', 'estmemory': 10240}, + {'scheme': 'mayo1', 'implementation': 'm4f', 'estmemory': 446464}, + {'scheme': 'mayo1', 'implementation': 'ref', 'estmemory': 404480}, + {'scheme': 'mayo2', 'implementation': 'm4f', 'estmemory': 287744}, + {'scheme': 'mayo2', 'implementation': 'ref', 'estmemory': 279552}, + {'scheme': 'mayo3', 'implementation': 'm4f', 'estmemory': 477184}, + {'scheme': 'mayo3', 'implementation': 'ref', 'estmemory': 1144832}, + {'scheme': 'mceliece348864', 'implementation': 'clean', 'estmemory': 693248}, + {'scheme': 'mceliece348864f', 'implementation': 'clean', 'estmemory': 693248}, + {'scheme': 'mceliece460896', 'implementation': 'clean', 'estmemory': 1425408}, + {'scheme': 'mceliece460896f', 'implementation': 'clean', 'estmemory': 1426432}, + {'scheme': 'mceliece6688128', 'implementation': 'clean', 'estmemory': 2627584}, + {'scheme': 'mceliece6688128f', 'implementation': 'clean', 'estmemory': 2628608}, + {'scheme': 'mceliece6960119', 'implementation': 'clean', 'estmemory': 2585600}, + {'scheme': 'mceliece6960119f', 'implementation': 'clean', 'estmemory': 2586624}, + {'scheme': 'mceliece8192128', 'implementation': 'clean', 'estmemory': 3259392}, + {'scheme': 'mceliece8192128f', 'implementation': 'clean', 'estmemory': 3260416}, + {'scheme': 'meds13220', 'implementation': 'ref', 'estmemory': 209920}, + {'scheme': 'meds134180', 'implementation': 'ref', 'estmemory': 1152000}, + {'scheme': 'meds167717', 'implementation': 'ref', 'estmemory': 927744}, + {'scheme': 'meds41711', 'implementation': 'ref', 'estmemory': 1387520}, + {'scheme': 'meds55604', 'implementation': 'ref', 'estmemory': 509952}, + {'scheme': 'meds9923', 'implementation': 'ref', 'estmemory': 1019904}, + {'scheme': 'mirith_IIIa_fast', 'implementation': 'ref', 
'estmemory': 287744}, + {'scheme': 'mirith_IIIa_short', 'implementation': 'ref', 'estmemory': 2197504}, + {'scheme': 'mirith_IIIb_fast', 'implementation': 'ref', 'estmemory': 320512}, + {'scheme': 'mirith_IIIb_short', 'implementation': 'ref', 'estmemory': 2386944}, + {'scheme': 'mirith_Ia_fast', 'implementation': 'ref', 'estmemory': 134144}, + {'scheme': 'mirith_Ia_short', 'implementation': 'ref', 'estmemory': 1019904}, + {'scheme': 'mirith_Ib_fast', 'implementation': 'ref', 'estmemory': 163840}, + {'scheme': 'mirith_Ib_short', 'implementation': 'ref', 'estmemory': 1195008}, + {'scheme': 'mirith_Va_fast', 'implementation': 'ref', 'estmemory': 519168}, + {'scheme': 'mirith_Va_short', 'implementation': 'ref', 'estmemory': 3816448}, + {'scheme': 'mirith_Vb_fast', 'implementation': 'ref', 'estmemory': 572416}, + {'scheme': 'mirith_Vb_short', 'implementation': 'ref', 'estmemory': 4117504}, + {'scheme': 'mirith_hypercube_IIIa_fast', 'implementation': 'ref', 'estmemory': 188416}, + {'scheme': 'mirith_hypercube_IIIa_short', 'implementation': 'ref', 'estmemory': 502784}, + {'scheme': 'mirith_hypercube_IIIa_shorter', 'implementation': 'ref', 'estmemory': 3894272}, + {'scheme': 'mirith_hypercube_IIIb_fast', 'implementation': 'ref', 'estmemory': 211968}, + {'scheme': 'mirith_hypercube_IIIb_short', 'implementation': 'ref', 'estmemory': 526336}, + {'scheme': 'mirith_hypercube_IIIb_shorter', 'implementation': 'ref', 'estmemory': 3916800}, + {'scheme': 'mirith_hypercube_Ia_fast', 'implementation': 'opt', 'estmemory': 88064}, + {'scheme': 'mirith_hypercube_Ia_fast', 'implementation': 'ref', 'estmemory': 89088}, + {'scheme': 'mirith_hypercube_Ia_short', 'implementation': 'ref', 'estmemory': 227328}, + {'scheme': 'mirith_hypercube_Ia_shorter', 'implementation': 'ref', 'estmemory': 1779712}, + {'scheme': 'mirith_hypercube_Ib_fast', 'implementation': 'opt', 'estmemory': 109568}, + {'scheme': 'mirith_hypercube_Ib_fast', 'implementation': 'ref', 'estmemory': 109568}, + {'scheme': 
'mirith_hypercube_Ib_short', 'implementation': 'ref', 'estmemory': 247808}, + {'scheme': 'mirith_hypercube_Ib_shorter', 'implementation': 'ref', 'estmemory': 1800192}, + {'scheme': 'mirith_hypercube_Va_fast', 'implementation': 'ref', 'estmemory': 344064}, + {'scheme': 'mirith_hypercube_Va_short', 'implementation': 'ref', 'estmemory': 878592}, + {'scheme': 'mirith_hypercube_Va_shorter', 'implementation': 'ref', 'estmemory': 4217856}, + {'scheme': 'mirith_hypercube_Vb_fast', 'implementation': 'ref', 'estmemory': 382976}, + {'scheme': 'mirith_hypercube_Vb_short', 'implementation': 'ref', 'estmemory': 916480}, + {'scheme': 'mirith_hypercube_Vb_shorter', 'implementation': 'ref', 'estmemory': 4218880}, + {'scheme': 'mqom_cat1_gf251_fast', 'implementation': 'ref', 'estmemory': 411648}, + {'scheme': 'mqom_cat1_gf251_short', 'implementation': 'ref', 'estmemory': 675840}, + {'scheme': 'mqom_cat1_gf31_fast', 'implementation': 'ref', 'estmemory': 624640}, + {'scheme': 'mqom_cat1_gf31_short', 'implementation': 'ref', 'estmemory': 878592}, + {'scheme': 'mqom_cat3_gf251_fast', 'implementation': 'ref', 'estmemory': 1307648}, + {'scheme': 'mqom_cat3_gf251_short', 'implementation': 'ref', 'estmemory': 1903616}, + {'scheme': 'mqom_cat3_gf31_fast', 'implementation': 'ref', 'estmemory': 2171904}, + {'scheme': 'mqom_cat3_gf31_short', 'implementation': 'ref', 'estmemory': 2688000}, + {'scheme': 'mqom_cat5_gf251_fast', 'implementation': 'ref', 'estmemory': 3260416}, + {'scheme': 'mqom_cat5_gf251_short', 'implementation': 'ref', 'estmemory': 4146176}, + {'scheme': 'ov-Ip', 'implementation': 'm4f', 'estmemory': 534528}, + {'scheme': 'ov-Ip', 'implementation': 'ref', 'estmemory': 534528}, + {'scheme': 'ov-Ip-pkc', 'implementation': 'm4fspeed', 'estmemory': 565248}, + {'scheme': 'ov-Ip-pkc', 'implementation': 'm4fstack', 'estmemory': 425984}, + {'scheme': 'ov-Ip-pkc', 'implementation': 'ref', 'estmemory': 568320}, + {'scheme': 'ov-Ip-pkc-skc', 'implementation': 'm4fspeed', 'estmemory': 
425984}, + {'scheme': 'ov-Ip-pkc-skc', 'implementation': 'm4fstack', 'estmemory': 425984}, + {'scheme': 'ov-Ip-pkc-skc', 'implementation': 'ref', 'estmemory': 330752}, + {'scheme': 'perk-128-fast-3', 'implementation': 'm4', 'estmemory': 33792}, + {'scheme': 'perk-128-fast-3', 'implementation': 'ref', 'estmemory': 323584}, + {'scheme': 'perk-128-fast-5', 'implementation': 'm4', 'estmemory': 34816}, + {'scheme': 'perk-128-fast-5', 'implementation': 'ref', 'estmemory': 315392}, + {'scheme': 'perk-128-short-3', 'implementation': 'm4', 'estmemory': 37888}, + {'scheme': 'perk-128-short-3', 'implementation': 'ref', 'estmemory': 1570816}, + {'scheme': 'perk-128-short-5', 'implementation': 'm4', 'estmemory': 37888}, + {'scheme': 'perk-128-short-5', 'implementation': 'ref', 'estmemory': 1472512}, + {'scheme': 'perk-192-fast-3', 'implementation': 'm4', 'estmemory': 68608}, + {'scheme': 'perk-192-fast-3', 'implementation': 'ref', 'estmemory': 707584}, + {'scheme': 'perk-192-fast-5', 'implementation': 'm4', 'estmemory': 68608}, + {'scheme': 'perk-192-fast-5', 'implementation': 'ref', 'estmemory': 681984}, + {'scheme': 'perk-192-short-3', 'implementation': 'm4', 'estmemory': 69632}, + {'scheme': 'perk-192-short-3', 'implementation': 'ref', 'estmemory': 3487744}, + {'scheme': 'perk-192-short-5', 'implementation': 'm4', 'estmemory': 69632}, + {'scheme': 'perk-192-short-5', 'implementation': 'ref', 'estmemory': 3240960}, + {'scheme': 'perk-256-fast-3', 'implementation': 'm4', 'estmemory': 115712}, + {'scheme': 'perk-256-fast-3', 'implementation': 'ref', 'estmemory': 1226752}, + {'scheme': 'perk-256-fast-5', 'implementation': 'm4', 'estmemory': 114688}, + {'scheme': 'perk-256-fast-5', 'implementation': 'ref', 'estmemory': 1175552}, + {'scheme': 'perk-256-short-3', 'implementation': 'm4', 'estmemory': 111616}, + {'scheme': 'perk-256-short-3', 'implementation': 'ref', 'estmemory': 4222976}, + {'scheme': 'perk-256-short-5', 'implementation': 'm4', 'estmemory': 109568}, + {'scheme': 
'perk-256-short-5', 'implementation': 'ref', 'estmemory': 4221952}, + {'scheme': 'snova-24-5-16-4-esk', 'implementation': 'ref', 'estmemory': 205824}, + {'scheme': 'snova-24-5-16-4-ssk', 'implementation': 'ref', 'estmemory': 172032}, + {'scheme': 'snova-25-8-16-3-esk', 'implementation': 'ref', 'estmemory': 232448}, + {'scheme': 'snova-25-8-16-3-ssk', 'implementation': 'ref', 'estmemory': 194560}, + {'scheme': 'snova-28-17-16-2-esk', 'implementation': 'ref', 'estmemory': 380928}, + {'scheme': 'snova-28-17-16-2-ssk', 'implementation': 'ref', 'estmemory': 320512}, + {'scheme': 'snova-37-8-16-4-esk', 'implementation': 'ref', 'estmemory': 775168}, + {'scheme': 'snova-37-8-16-4-ssk', 'implementation': 'ref', 'estmemory': 646144}, + {'scheme': 'snova-43-25-16-2-esk', 'implementation': 'ref', 'estmemory': 1274880}, + {'scheme': 'snova-43-25-16-2-ssk', 'implementation': 'ref', 'estmemory': 1072128}, + {'scheme': 'snova-49-11-16-3-esk', 'implementation': 'ref', 'estmemory': 1055744}, + {'scheme': 'snova-49-11-16-3-ssk', 'implementation': 'ref', 'estmemory': 880640}, + {'scheme': 'snova-60-10-16-4-esk', 'implementation': 'ref', 'estmemory': 2342912}, + {'scheme': 'snova-60-10-16-4-ssk', 'implementation': 'ref', 'estmemory': 1953792}, + {'scheme': 'snova-61-33-16-2-esk', 'implementation': 'ref', 'estmemory': 3232768}, + {'scheme': 'snova-61-33-16-2-ssk', 'implementation': 'ref', 'estmemory': 2717696}, + {'scheme': 'snova-66-15-16-3-esk', 'implementation': 'ref', 'estmemory': 2617344}, + {'scheme': 'snova-66-15-16-3-ssk', 'implementation': 'ref', 'estmemory': 2185216}, + {'scheme': 'sphincs-a-sha2-128f', 'implementation': 'ref', 'estmemory': 301056}, + {'scheme': 'sphincs-a-sha2-128s', 'implementation': 'ref', 'estmemory': 595968}, + {'scheme': 'sphincs-a-sha2-192f', 'implementation': 'ref', 'estmemory': 542720}, + {'scheme': 'sphincs-a-sha2-192s', 'implementation': 'ref', 'estmemory': 1307648}, + {'scheme': 'sphincs-a-sha2-256f', 'implementation': 'ref', 'estmemory': 1124352}, 
+ {'scheme': 'sphincs-a-sha2-256s', 'implementation': 'ref', 'estmemory': 2291712}, + {'scheme': 'sphincs-a-shake-128f', 'implementation': 'ref', 'estmemory': 301056}, + {'scheme': 'sphincs-a-shake-128s', 'implementation': 'ref', 'estmemory': 595968}, + {'scheme': 'sphincs-a-shake-192f', 'implementation': 'ref', 'estmemory': 541696}, + {'scheme': 'sphincs-a-shake-192s', 'implementation': 'ref', 'estmemory': 1306624}, + {'scheme': 'sphincs-a-shake-256f', 'implementation': 'ref', 'estmemory': 1124352}, + {'scheme': 'sphincs-a-shake-256s', 'implementation': 'ref', 'estmemory': 2291712}, + {'scheme': 'sphincs-sha2-128f-simple', 'implementation': 'clean', 'estmemory': 21504}, + {'scheme': 'sphincs-sha2-128s-simple', 'implementation': 'clean', 'estmemory': 12288}, + {'scheme': 'sphincs-sha2-192f-simple', 'implementation': 'clean', 'estmemory': 43008}, + {'scheme': 'sphincs-sha2-192s-simple', 'implementation': 'clean', 'estmemory': 23552}, + {'scheme': 'sphincs-sha2-256f-simple', 'implementation': 'clean', 'estmemory': 59392}, + {'scheme': 'sphincs-sha2-256s-simple', 'implementation': 'clean', 'estmemory': 39936}, + {'scheme': 'sphincs-shake-128f-simple', 'implementation': 'clean', 'estmemory': 21504}, + {'scheme': 'sphincs-shake-128s-simple', 'implementation': 'clean', 'estmemory': 12288}, + {'scheme': 'sphincs-shake-192f-simple', 'implementation': 'clean', 'estmemory': 41984}, + {'scheme': 'sphincs-shake-192s-simple', 'implementation': 'clean', 'estmemory': 22528}, + {'scheme': 'sphincs-shake-256f-simple', 'implementation': 'clean', 'estmemory': 59392}, + {'scheme': 'sphincs-shake-256s-simple', 'implementation': 'clean', 'estmemory': 38912}, + {'scheme': 'tuov_iii', 'implementation': 'ref', 'estmemory': 3281920}, + {'scheme': 'tuov_iii_pkc', 'implementation': 'ref', 'estmemory': 3468288}, + {'scheme': 'tuov_iii_pkc_skc', 'implementation': 'ref', 'estmemory': 3790848}, + {'scheme': 'tuov_ip', 'implementation': 'ref', 'estmemory': 3790848}, + {'scheme': 'tuov_ip_pkc', 
'implementation': 'ref', 'estmemory': 799744}, + {'scheme': 'tuov_ip_pkc_skc', 'implementation': 'ref', 'estmemory': 865280}, + {'scheme': 'tuov_is', 'implementation': 'ref', 'estmemory': 1111040}, + {'scheme': 'tuov_is_pkc', 'implementation': 'ref', 'estmemory': 1176576}, + {'scheme': 'tuov_is_pkc_skc', 'implementation': 'ref', 'estmemory': 1275904}, + {'scheme': 'tuov_v_pkc', 'implementation': 'ref', 'estmemory': 7083008}, + {'scheme': 'tuov_v_pkc_skc', 'implementation': 'ref', 'estmemory': 4639744}, + {'scheme': 'dilithium2', 'implementation': 'm7fstack', 'estmemory': 12288}, + {'scheme': 'dilithium5', 'implementation': 'm7fstack', 'estmemory': 21504}, + {'scheme': 'dilithium3', 'implementation': 'm7fstack', 'estmemory': 17408}, + {'scheme': 'falcon-padded-1024', 'implementation': 'clean', 'estmemory': 91136}, + {'scheme': 'falcon-padded-512', 'implementation': 'clean', 'estmemory': 48128}, +] diff --git a/slothy b/slothy new file mode 160000 index 0000000..1fd3fdf --- /dev/null +++ b/slothy @@ -0,0 +1 @@ +Subproject commit 1fd3fdf881a269c198c1af7e7ac98240250f2113 diff --git a/test.py b/test.py new file mode 100755 index 0000000..969282b --- /dev/null +++ b/test.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +from mupq import mupq +from interface import parse_arguments, get_platform + +import sys + +if __name__ == "__main__": + args, rest = parse_arguments() + platform, settings = get_platform(args) + with platform: + test = mupq.SimpleTest(settings, platform) + if test.test_all(rest): + sys.exit(1) diff --git a/testvectors.py b/testvectors.py new file mode 100755 index 0000000..a08c151 --- /dev/null +++ b/testvectors.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 or CC0-1.0 +from mupq import mupq +from interface import parse_arguments, get_platform +import sys + +if __name__ == "__main__": + args, rest = parse_arguments() + platform, settings = get_platform(args) + with platform: 
+ test = mupq.TestVectors(settings, platform) + if test.test_all(rest): + sys.exit(1)