From 44365aaec2f0e35793ebc137cc06891be09bd2e4 Mon Sep 17 00:00:00 2001
From: "Thing-han, Lim" <15379156+potsrevennil@users.noreply.github.com>
Date: Thu, 17 Oct 2024 19:21:54 +0800
Subject: [PATCH] init pqm7

Signed-off-by: Thing-han, Lim <15379156+potsrevennil@users.noreply.github.com>
---
 .github/dependabot.yml                        |    6 +
 .github/pull_request_template.md              |    7 +
 .github/workflows/nucleo-f767zi.yml           |   21 +
 .gitignore                                    |   16 +
 .gitmodules                                   |    9 +
 Makefile                                      |   23 +
 README.md                                     |  411 +
 benchmarks.py                                 |   33 +
 build_everything.py                           |   16 +
 common/aes-encrypt.S                          |  613 ++
 common/aes-keyschedule.S                      |  851 +++
 common/aes-publicinputs.S                     | 1327 ++++
 common/aes-publicinputs.c                     |  259 +
 common/aes-publicinputs.h                     |   62 +
 common/aes.c                                  |  232 +
 common/aes.h                                  |   49 +
 common/aestest.c                              |  185 +
 common/crypto_hashblocks_sha512.c             |  101 +
 common/crypto_hashblocks_sha512_inner32.s     | 6593 +++++++++++++++++
 common/hal-mps2.c                             |  279 +
 common/hal-opencm3.c                          |  245 +
 common/keccakf1600.S                          | 1134 +++
 common/keccaktest.c                           |   85 +
 common/mps2/CMSDK_CM4.h                       | 1289 ++++
 common/mps2/LICENSE.txt                       |  201 +
 common/mps2/MPS2.ld                           |  208 +
 common/mps2/cmsis_armclang.h                  | 1467 ++++
 common/mps2/cmsis_compiler.h                  |  283 +
 common/mps2/cmsis_gcc.h                       | 2177 ++++++
 common/mps2/cmsis_nvic.h                      |   47 +
 common/mps2/cmsis_version.h                   |   39 +
 common/mps2/core_cm4.h                        | 2129 ++++++
 common/mps2/memory_zones.h                    |   49 +
 common/mps2/mpu_armv7.h                       |  275 +
 common/mps2/startup_MPS2.S                    |  206 +
 common/randombytes.c                          |  121 +
 common/test.c                                 |  161 +
 common/testfast.c                             |    1 +
 convert_benchmarks.py                         |   19 +
 crypto_kem/ml-kem-1024/m4fspeed/api.h         |   20 +
 crypto_kem/ml-kem-1024/m4fspeed/cbd.c         |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/cbd.h         |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S  |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S  |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S  |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/fastntt.S     |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/indcpa.c      |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/indcpa.h      |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/kem.c         |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/macros.i      |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/matacc.c      |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/matacc.h      |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/matacc.i      |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S  |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/ntt.c         |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/ntt.h         |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/params.h      |   31 +
 crypto_kem/ml-kem-1024/m4fspeed/poly.c        |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/poly.h        |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S    |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/polyvec.c     |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/polyvec.h     |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/reduce.S      |    1 +
 .../ml-kem-1024/m4fspeed/symmetric-fips202.c  |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/symmetric.h   |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/verify.c      |    1 +
 crypto_kem/ml-kem-1024/m4fspeed/verify.h      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/api.h         |    1 +
 crypto_kem/ml-kem-1024/m4fstack/cbd.c         |    1 +
 crypto_kem/ml-kem-1024/m4fstack/cbd.h         |    1 +
 crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S  |    1 +
 crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S  |    1 +
 crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S |    1 +
 crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S  |    1 +
 crypto_kem/ml-kem-1024/m4fstack/fastntt.S     |    1 +
 crypto_kem/ml-kem-1024/m4fstack/indcpa.c      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/indcpa.h      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/kem.c         |    1 +
 crypto_kem/ml-kem-1024/m4fstack/macros.i      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/matacc.c      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/matacc.h      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/matacc.i      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S  |    1 +
 crypto_kem/ml-kem-1024/m4fstack/ntt.c         |    1 +
 crypto_kem/ml-kem-1024/m4fstack/ntt.h         |    1 +
 crypto_kem/ml-kem-1024/m4fstack/params.h      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/poly.c        |    1 +
 crypto_kem/ml-kem-1024/m4fstack/poly.h        |    1 +
 crypto_kem/ml-kem-1024/m4fstack/poly_asm.S    |    1 +
 crypto_kem/ml-kem-1024/m4fstack/polyvec.c     |    1 +
 crypto_kem/ml-kem-1024/m4fstack/polyvec.h     |    1 +
 crypto_kem/ml-kem-1024/m4fstack/reduce.S      |    1 +
 .../ml-kem-1024/m4fstack/symmetric-fips202.c  |    1 +
 crypto_kem/ml-kem-1024/m4fstack/symmetric.h   |    1 +
 crypto_kem/ml-kem-1024/m4fstack/verify.c      |    1 +
 crypto_kem/ml-kem-1024/m4fstack/verify.h      |    1 +
 crypto_kem/ml-kem-512/m4fspeed/api.h          |   20 +
 crypto_kem/ml-kem-512/m4fspeed/cbd.c          |  112 +
 crypto_kem/ml-kem-512/m4fspeed/cbd.h          |    9 +
 crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S   |    1 +
 crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S   |    1 +
 crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S  |    1 +
 crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S   |    1 +
 crypto_kem/ml-kem-512/m4fspeed/fastntt.S      |    1 +
 crypto_kem/ml-kem-512/m4fspeed/indcpa.c       |  246 +
 crypto_kem/ml-kem-512/m4fspeed/indcpa.h       |    1 +
 crypto_kem/ml-kem-512/m4fspeed/kem.c          |    1 +
 crypto_kem/ml-kem-512/m4fspeed/macros.i       |    1 +
 crypto_kem/ml-kem-512/m4fspeed/matacc.c       |    1 +
 crypto_kem/ml-kem-512/m4fspeed/matacc.h       |    1 +
 crypto_kem/ml-kem-512/m4fspeed/matacc.i       |    1 +
 crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S   |    1 +
 crypto_kem/ml-kem-512/m4fspeed/ntt.c          |    1 +
 crypto_kem/ml-kem-512/m4fspeed/ntt.h          |    1 +
 crypto_kem/ml-kem-512/m4fspeed/params.h       |   32 +
 crypto_kem/ml-kem-512/m4fspeed/poly.c         |  672 ++
 crypto_kem/ml-kem-512/m4fspeed/poly.h         |   56 +
 crypto_kem/ml-kem-512/m4fspeed/poly_asm.S     |    1 +
 crypto_kem/ml-kem-512/m4fspeed/polyvec.c      |    1 +
 crypto_kem/ml-kem-512/m4fspeed/polyvec.h      |    1 +
 crypto_kem/ml-kem-512/m4fspeed/reduce.S       |    1 +
 .../ml-kem-512/m4fspeed/symmetric-fips202.    |    1 +
 .../ml-kem-512/m4fspeed/symmetric-fips202.c   |    1 +
 crypto_kem/ml-kem-512/m4fspeed/symmetric.h    |    1 +
 crypto_kem/ml-kem-512/m4fspeed/verify.c       |    1 +
 crypto_kem/ml-kem-512/m4fspeed/verify.h       |    1 +
 crypto_kem/ml-kem-512/m4fstack/api.h          |    1 +
 crypto_kem/ml-kem-512/m4fstack/cbd.c          |    1 +
 crypto_kem/ml-kem-512/m4fstack/cbd.h          |    1 +
 crypto_kem/ml-kem-512/m4fstack/cmov_int16.S   |    1 +
 crypto_kem/ml-kem-512/m4fstack/fastaddsub.S   |    1 +
 crypto_kem/ml-kem-512/m4fstack/fastbasemul.S  |    1 +
 crypto_kem/ml-kem-512/m4fstack/fastinvntt.S   |    1 +
 crypto_kem/ml-kem-512/m4fstack/fastntt.S      |    1 +
 crypto_kem/ml-kem-512/m4fstack/indcpa.c       |  211 +
 crypto_kem/ml-kem-512/m4fstack/indcpa.h       |    1 +
 crypto_kem/ml-kem-512/m4fstack/kem.c          |    1 +
 crypto_kem/ml-kem-512/m4fstack/macros.i       |    1 +
 crypto_kem/ml-kem-512/m4fstack/matacc.c       |    1 +
 crypto_kem/ml-kem-512/m4fstack/matacc.h       |    1 +
 crypto_kem/ml-kem-512/m4fstack/matacc.i       |    1 +
 crypto_kem/ml-kem-512/m4fstack/matacc_asm.S   |    1 +
 crypto_kem/ml-kem-512/m4fstack/ntt.c          |    1 +
 crypto_kem/ml-kem-512/m4fstack/ntt.h          |    1 +
 crypto_kem/ml-kem-512/m4fstack/params.h       |    1 +
 crypto_kem/ml-kem-512/m4fstack/poly.c         |  637 ++
 crypto_kem/ml-kem-512/m4fstack/poly.h         |   54 +
 crypto_kem/ml-kem-512/m4fstack/poly_asm.S     |    1 +
 crypto_kem/ml-kem-512/m4fstack/polyvec.c      |    1 +
 crypto_kem/ml-kem-512/m4fstack/polyvec.h      |    1 +
 crypto_kem/ml-kem-512/m4fstack/reduce.S       |    1 +
 .../ml-kem-512/m4fstack/symmetric-fips202.c   |    1 +
 crypto_kem/ml-kem-512/m4fstack/symmetric.h    |    1 +
 crypto_kem/ml-kem-512/m4fstack/verify.c       |    1 +
 crypto_kem/ml-kem-512/m4fstack/verify.h       |    1 +
 crypto_kem/ml-kem-768/m4fspeed/api.h          |   20 +
 crypto_kem/ml-kem-768/m4fspeed/cbd.c          |   55 +
 crypto_kem/ml-kem-768/m4fspeed/cbd.h          |    8 +
 crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S   |   15 +
 crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S   |   60 +
 crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S  |  193 +
 crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S   |  356 +
 crypto_kem/ml-kem-768/m4fspeed/fastntt.S      |  265 +
 crypto_kem/ml-kem-768/m4fspeed/indcpa.c       |  244 +
 crypto_kem/ml-kem-768/m4fspeed/indcpa.h       |   22 +
 crypto_kem/ml-kem-768/m4fspeed/kem.c          |  159 +
 crypto_kem/ml-kem-768/m4fspeed/macros.i       |   60 +
 crypto_kem/ml-kem-768/m4fspeed/matacc.c       |  121 +
 crypto_kem/ml-kem-768/m4fspeed/matacc.h       |   63 +
 crypto_kem/ml-kem-768/m4fspeed/matacc.i       |  301 +
 crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S   |  377 +
 crypto_kem/ml-kem-768/m4fspeed/ntt.c          |  106 +
 crypto_kem/ml-kem-768/m4fspeed/ntt.h          |   11 +
 crypto_kem/ml-kem-768/m4fspeed/params.h       |   31 +
 crypto_kem/ml-kem-768/m4fspeed/poly.c         |  654 ++
 crypto_kem/ml-kem-768/m4fspeed/poly.h         |   53 +
 crypto_kem/ml-kem-768/m4fspeed/poly_asm.S     |  246 +
 crypto_kem/ml-kem-768/m4fspeed/polyvec.c      |  212 +
 crypto_kem/ml-kem-768/m4fspeed/polyvec.h      |   24 +
 crypto_kem/ml-kem-768/m4fspeed/reduce.S       |  140 +
 .../ml-kem-768/m4fspeed/symmetric-fips202.c   |   71 +
 crypto_kem/ml-kem-768/m4fspeed/symmetric.h    |   29 +
 crypto_kem/ml-kem-768/m4fspeed/verify.c       |   51 +
 crypto_kem/ml-kem-768/m4fspeed/verify.h       |   10 +
 crypto_kem/ml-kem-768/m4fstack/api.h          |    1 +
 crypto_kem/ml-kem-768/m4fstack/cbd.c          |    1 +
 crypto_kem/ml-kem-768/m4fstack/cbd.h          |    1 +
 crypto_kem/ml-kem-768/m4fstack/cmov_int16.S   |    1 +
 crypto_kem/ml-kem-768/m4fstack/fastaddsub.S   |    1 +
 crypto_kem/ml-kem-768/m4fstack/fastbasemul.S  |  207 +
 crypto_kem/ml-kem-768/m4fstack/fastinvntt.S   |  360 +
 crypto_kem/ml-kem-768/m4fstack/fastntt.S      |    1 +
 crypto_kem/ml-kem-768/m4fstack/indcpa.c       |  211 +
 crypto_kem/ml-kem-768/m4fstack/indcpa.h       |    1 +
 crypto_kem/ml-kem-768/m4fstack/kem.c          |    1 +
 crypto_kem/ml-kem-768/m4fstack/macros.i       |    1 +
 crypto_kem/ml-kem-768/m4fstack/matacc.c       |   43 +
 crypto_kem/ml-kem-768/m4fstack/matacc.h       |   26 +
 crypto_kem/ml-kem-768/m4fstack/matacc.i       |  197 +
 crypto_kem/ml-kem-768/m4fstack/matacc_asm.S   |  118 +
 crypto_kem/ml-kem-768/m4fstack/ntt.c          |    1 +
 crypto_kem/ml-kem-768/m4fstack/ntt.h          |    1 +
 crypto_kem/ml-kem-768/m4fstack/params.h       |    1 +
 crypto_kem/ml-kem-768/m4fstack/poly.c         |  618 ++
 crypto_kem/ml-kem-768/m4fstack/poly.h         |   51 +
 crypto_kem/ml-kem-768/m4fstack/poly_asm.S     |  198 +
 crypto_kem/ml-kem-768/m4fstack/polyvec.c      |    1 +
 crypto_kem/ml-kem-768/m4fstack/polyvec.h      |    1 +
 crypto_kem/ml-kem-768/m4fstack/reduce.S       |    1 +
 .../ml-kem-768/m4fstack/symmetric-fips202.c   |    1 +
 crypto_kem/ml-kem-768/m4fstack/symmetric.h    |    1 +
 crypto_kem/ml-kem-768/m4fstack/verify.c       |    1 +
 crypto_kem/ml-kem-768/m4fstack/verify.h       |    1 +
 crypto_sign/dilithium2/m4f/api.h              |   26 +
 crypto_sign/dilithium2/m4f/basemul_257.S      |   91 +
 crypto_sign/dilithium2/m4f/config.h           |    7 +
 crypto_sign/dilithium2/m4f/fnt_257.S          |  145 +
 crypto_sign/dilithium2/m4f/ifnt_257.S         |  306 +
 crypto_sign/dilithium2/m4f/macros.i           |  191 +
 crypto_sign/dilithium2/m4f/macros_fnt.i       |  158 +
 crypto_sign/dilithium2/m4f/ntt.S              |  402 +
 crypto_sign/dilithium2/m4f/ntt.h              |   13 +
 crypto_sign/dilithium2/m4f/packing.c          |  390 +
 crypto_sign/dilithium2/m4f/packing.h          |   68 +
 crypto_sign/dilithium2/m4f/params.h           |   83 +
 crypto_sign/dilithium2/m4f/pointwise_mont.h   |   13 +
 crypto_sign/dilithium2/m4f/pointwise_mont.s   |  128 +
 crypto_sign/dilithium2/m4f/poly.c             |  863 +++
 crypto_sign/dilithium2/m4f/poly.h             |   84 +
 crypto_sign/dilithium2/m4f/polyvec.c          |  429 ++
 crypto_sign/dilithium2/m4f/polyvec.h          |   99 +
 crypto_sign/dilithium2/m4f/reduce.h           |   29 +
 crypto_sign/dilithium2/m4f/rounding.c         |  102 +
 crypto_sign/dilithium2/m4f/rounding.h         |   19 +
 crypto_sign/dilithium2/m4f/sign.c             |  391 +
 crypto_sign/dilithium2/m4f/sign.h             |   37 +
 crypto_sign/dilithium2/m4f/smallntt.h         |   31 +
 crypto_sign/dilithium2/m4f/smallpoly.c        |   84 +
 crypto_sign/dilithium2/m4f/smallpoly.h        |   39 +
 crypto_sign/dilithium2/m4f/symmetric-shake.c  |   28 +
 crypto_sign/dilithium2/m4f/symmetric.h        |   65 +
 crypto_sign/dilithium2/m4f/vector.h           |   22 +
 crypto_sign/dilithium2/m4f/vector.s           |  263 +
 crypto_sign/dilithium2/m4fstack/api.h         |    1 +
 crypto_sign/dilithium2/m4fstack/config.h      |    1 +
 crypto_sign/dilithium2/m4fstack/macros.i      |    1 +
 .../dilithium2/m4fstack/macros_smallntt.i     |   91 +
 crypto_sign/dilithium2/m4fstack/ntt.S         |    1 +
 crypto_sign/dilithium2/m4fstack/ntt.h         |    1 +
 crypto_sign/dilithium2/m4fstack/packing.c     |    1 +
 crypto_sign/dilithium2/m4fstack/packing.h     |    1 +
 crypto_sign/dilithium2/m4fstack/params.h      |    1 +
 .../dilithium2/m4fstack/pointwise_mont.h      |    1 +
 .../dilithium2/m4fstack/pointwise_mont.s      |    1 +
 crypto_sign/dilithium2/m4fstack/poly.c        |    1 +
 crypto_sign/dilithium2/m4fstack/poly.h        |    1 +
 crypto_sign/dilithium2/m4fstack/polyvec.c     |    1 +
 crypto_sign/dilithium2/m4fstack/polyvec.h     |    1 +
 crypto_sign/dilithium2/m4fstack/reduce.h      |   79 +
 crypto_sign/dilithium2/m4fstack/rounding.c    |    1 +
 crypto_sign/dilithium2/m4fstack/rounding.h    |    1 +
 crypto_sign/dilithium2/m4fstack/sign.c        |  484 ++
 crypto_sign/dilithium2/m4fstack/sign.h        |    1 +
 crypto_sign/dilithium2/m4fstack/smallntt.h    |   47 +
 .../dilithium2/m4fstack/smallntt_769.S        |  691 ++
 crypto_sign/dilithium2/m4fstack/smallpoly.c   |   83 +
 crypto_sign/dilithium2/m4fstack/smallpoly.h   |   27 +
 crypto_sign/dilithium2/m4fstack/stack.c       |  715 ++
 crypto_sign/dilithium2/m4fstack/stack.h       |   69 +
 .../dilithium2/m4fstack/symmetric-shake.c     |    1 +
 crypto_sign/dilithium2/m4fstack/symmetric.h   |    1 +
 crypto_sign/dilithium2/m4fstack/vector.h      |    1 +
 crypto_sign/dilithium2/m4fstack/vector.s      |    1 +
 crypto_sign/dilithium3/m4f/api.h              |    1 +
 crypto_sign/dilithium3/m4f/config.h           |    7 +
 crypto_sign/dilithium3/m4f/macros.i           |    1 +
 crypto_sign/dilithium3/m4f/macros_smallntt.i  |   98 +
 crypto_sign/dilithium3/m4f/ntt.S              |    1 +
 crypto_sign/dilithium3/m4f/ntt.h              |    1 +
 crypto_sign/dilithium3/m4f/packing.c          |    1 +
 crypto_sign/dilithium3/m4f/packing.h          |    1 +
 crypto_sign/dilithium3/m4f/params.h           |    1 +
 crypto_sign/dilithium3/m4f/pointwise_mont.h   |    1 +
 crypto_sign/dilithium3/m4f/pointwise_mont.s   |    1 +
 crypto_sign/dilithium3/m4f/poly.c             |    1 +
 crypto_sign/dilithium3/m4f/poly.h             |    1 +
 crypto_sign/dilithium3/m4f/polyvec.c          |    1 +
 crypto_sign/dilithium3/m4f/polyvec.h          |    1 +
 crypto_sign/dilithium3/m4f/reduce.h           |    1 +
 crypto_sign/dilithium3/m4f/rounding.c         |    1 +
 crypto_sign/dilithium3/m4f/rounding.h         |    1 +
 crypto_sign/dilithium3/m4f/sign.c             |    1 +
 crypto_sign/dilithium3/m4f/sign.h             |    1 +
 crypto_sign/dilithium3/m4f/smallntt.h         |   48 +
 crypto_sign/dilithium3/m4f/smallntt_769.S     |  681 ++
 crypto_sign/dilithium3/m4f/smallpoly.c        |    1 +
 crypto_sign/dilithium3/m4f/smallpoly.h        |    1 +
 crypto_sign/dilithium3/m4f/symmetric-shake.c  |    1 +
 crypto_sign/dilithium3/m4f/symmetric.h        |    1 +
 crypto_sign/dilithium3/m4f/vector.h           |    1 +
 crypto_sign/dilithium3/m4f/vector.s           |    1 +
 crypto_sign/dilithium3/m4fstack/api.h         |    1 +
 crypto_sign/dilithium3/m4fstack/config.h      |    1 +
 crypto_sign/dilithium3/m4fstack/macros.i      |    1 +
 .../dilithium3/m4fstack/macros_smallntt.i     |    1 +
 crypto_sign/dilithium3/m4fstack/ntt.S         |    1 +
 crypto_sign/dilithium3/m4fstack/ntt.h         |    1 +
 crypto_sign/dilithium3/m4fstack/packing.c     |    1 +
 crypto_sign/dilithium3/m4fstack/packing.h     |    1 +
 crypto_sign/dilithium3/m4fstack/params.h      |    1 +
 .../dilithium3/m4fstack/pointwise_mont.h      |    1 +
 .../dilithium3/m4fstack/pointwise_mont.s      |    1 +
 crypto_sign/dilithium3/m4fstack/poly.c        |    1 +
 crypto_sign/dilithium3/m4fstack/poly.h        |    1 +
 crypto_sign/dilithium3/m4fstack/polyvec.c     |    1 +
 crypto_sign/dilithium3/m4fstack/polyvec.h     |    1 +
 crypto_sign/dilithium3/m4fstack/reduce.h      |    1 +
 crypto_sign/dilithium3/m4fstack/rounding.c    |    1 +
 crypto_sign/dilithium3/m4fstack/rounding.h    |    1 +
 crypto_sign/dilithium3/m4fstack/sign.c        |    1 +
 crypto_sign/dilithium3/m4fstack/sign.h        |    1 +
 crypto_sign/dilithium3/m4fstack/smallntt.h    |    1 +
 .../dilithium3/m4fstack/smallntt_769.S        |    1 +
 crypto_sign/dilithium3/m4fstack/smallpoly.c   |    1 +
 crypto_sign/dilithium3/m4fstack/smallpoly.h   |    1 +
 crypto_sign/dilithium3/m4fstack/stack.c       |    1 +
 crypto_sign/dilithium3/m4fstack/stack.h       |    1 +
 .../dilithium3/m4fstack/symmetric-shake.c     |    1 +
 crypto_sign/dilithium3/m4fstack/symmetric.h   |    1 +
 crypto_sign/dilithium3/m4fstack/vector.h      |    1 +
 crypto_sign/dilithium3/m4fstack/vector.s      |    1 +
 crypto_sign/dilithium5/m4f/api.h              |    1 +
 crypto_sign/dilithium5/m4f/basemul_257.S      |    1 +
 crypto_sign/dilithium5/m4f/config.h           |    7 +
 crypto_sign/dilithium5/m4f/fnt_257.S          |    1 +
 crypto_sign/dilithium5/m4f/ifnt_257.S         |    1 +
 crypto_sign/dilithium5/m4f/macros.i           |    1 +
 crypto_sign/dilithium5/m4f/macros_fnt.i       |    1 +
 crypto_sign/dilithium5/m4f/ntt.S              |    1 +
 crypto_sign/dilithium5/m4f/ntt.h              |    1 +
 crypto_sign/dilithium5/m4f/packing.c          |    1 +
 crypto_sign/dilithium5/m4f/packing.h          |    1 +
 crypto_sign/dilithium5/m4f/params.h           |    1 +
 crypto_sign/dilithium5/m4f/pointwise_mont.h   |    1 +
 crypto_sign/dilithium5/m4f/pointwise_mont.s   |    1 +
 crypto_sign/dilithium5/m4f/poly.c             |    1 +
 crypto_sign/dilithium5/m4f/poly.h             |    1 +
 crypto_sign/dilithium5/m4f/polyvec.c          |    1 +
 crypto_sign/dilithium5/m4f/polyvec.h          |    1 +
 crypto_sign/dilithium5/m4f/reduce.h           |    1 +
 crypto_sign/dilithium5/m4f/rounding.c         |    1 +
 crypto_sign/dilithium5/m4f/rounding.h         |    1 +
 crypto_sign/dilithium5/m4f/sign.c             |    1 +
 crypto_sign/dilithium5/m4f/sign.h             |    1 +
 crypto_sign/dilithium5/m4f/smallntt.h         |    1 +
 crypto_sign/dilithium5/m4f/smallpoly.c        |    1 +
 crypto_sign/dilithium5/m4f/smallpoly.h        |    1 +
 crypto_sign/dilithium5/m4f/symmetric-shake.c  |    1 +
 crypto_sign/dilithium5/m4f/symmetric.h        |    1 +
 crypto_sign/dilithium5/m4f/vector.h           |    1 +
 crypto_sign/dilithium5/m4f/vector.s           |    1 +
 crypto_sign/dilithium5/m4fstack/api.h         |    1 +
 crypto_sign/dilithium5/m4fstack/config.h      |    1 +
 crypto_sign/dilithium5/m4fstack/macros.i      |    1 +
 .../dilithium5/m4fstack/macros_smallntt.i     |    1 +
 crypto_sign/dilithium5/m4fstack/ntt.S         |    1 +
 crypto_sign/dilithium5/m4fstack/ntt.h         |    1 +
 crypto_sign/dilithium5/m4fstack/packing.c     |    1 +
 crypto_sign/dilithium5/m4fstack/packing.h     |    1 +
 crypto_sign/dilithium5/m4fstack/params.h      |    1 +
 .../dilithium5/m4fstack/pointwise_mont.h      |    1 +
 .../dilithium5/m4fstack/pointwise_mont.s      |    1 +
 crypto_sign/dilithium5/m4fstack/poly.c        |    1 +
 crypto_sign/dilithium5/m4fstack/poly.h        |    1 +
 crypto_sign/dilithium5/m4fstack/polyvec.c     |    1 +
 crypto_sign/dilithium5/m4fstack/polyvec.h     |    1 +
 crypto_sign/dilithium5/m4fstack/reduce.h      |    1 +
 crypto_sign/dilithium5/m4fstack/rounding.c    |    1 +
 crypto_sign/dilithium5/m4fstack/rounding.h    |    1 +
 crypto_sign/dilithium5/m4fstack/sign.c        |    1 +
 crypto_sign/dilithium5/m4fstack/sign.h        |    1 +
 crypto_sign/dilithium5/m4fstack/smallntt.h    |    1 +
 .../dilithium5/m4fstack/smallntt_769.S        |    1 +
 crypto_sign/dilithium5/m4fstack/smallpoly.c   |    1 +
 crypto_sign/dilithium5/m4fstack/smallpoly.h   |    1 +
 crypto_sign/dilithium5/m4fstack/stack.c       |    1 +
 crypto_sign/dilithium5/m4fstack/stack.h       |    1 +
 .../dilithium5/m4fstack/symmetric-shake.c     |    1 +
 crypto_sign/dilithium5/m4fstack/symmetric.h   |    1 +
 crypto_sign/dilithium5/m4fstack/vector.h      |    1 +
 crypto_sign/dilithium5/m4fstack/vector.s      |    1 +
 hostside/host_unidirectional.py               |   16 +
 interface.py                                  |  110 +
 ldscripts/devices.data                        |    5 +
 libopencm3                                    |    1 +
 mk/config.mk                                  |    3 +
 mk/crypto.mk                                  |   29 +
 mk/nucleo-f767zi.mk                           |   31 +
 mk/opencm3.mk                                 |  111 +
 mk/tests.mk                                   |   29 +
 mupq                                          |    1 +
 requirements.txt                              |    2 +
 skiplist.py                                   |  250 +
 slothy                                        |    1 +
 test.py                                       |   14 +
 testvectors.py                                |   13 +
 407 files changed, 37922 insertions(+)
 create mode 100644 .github/dependabot.yml
 create mode 100644 .github/pull_request_template.md
 create mode 100644 .github/workflows/nucleo-f767zi.yml
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100755 benchmarks.py
 create mode 100755 build_everything.py
 create mode 100644 common/aes-encrypt.S
 create mode 100644 common/aes-keyschedule.S
 create mode 100644 common/aes-publicinputs.S
 create mode 100644 common/aes-publicinputs.c
 create mode 100644 common/aes-publicinputs.h
 create mode 100644 common/aes.c
 create mode 100644 common/aes.h
 create mode 100644 common/aestest.c
 create mode 100644 common/crypto_hashblocks_sha512.c
 create mode 100644 common/crypto_hashblocks_sha512_inner32.s
 create mode 100644 common/hal-mps2.c
 create mode 100644 common/hal-opencm3.c
 create mode 100644 common/keccakf1600.S
 create mode 100644 common/keccaktest.c
 create mode 100644 common/mps2/CMSDK_CM4.h
 create mode 100644 common/mps2/LICENSE.txt
 create mode 100644 common/mps2/MPS2.ld
 create mode 100644 common/mps2/cmsis_armclang.h
 create mode 100644 common/mps2/cmsis_compiler.h
 create mode 100644 common/mps2/cmsis_gcc.h
 create mode 100644 common/mps2/cmsis_nvic.h
 create mode 100644 common/mps2/cmsis_version.h
 create mode 100644 common/mps2/core_cm4.h
 create mode 100644 common/mps2/memory_zones.h
 create mode 100644 common/mps2/mpu_armv7.h
 create mode 100644 common/mps2/startup_MPS2.S
 create mode 100644 common/randombytes.c
 create mode 100644 common/test.c
 create mode 120000 common/testfast.c
 create mode 100755 convert_benchmarks.py
 create mode 100644 crypto_kem/ml-kem-1024/m4fspeed/api.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/cbd.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/cbd.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/fastntt.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/indcpa.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/indcpa.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/kem.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/macros.i
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/matacc.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/matacc.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/matacc.i
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/ntt.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/ntt.h
 create mode 100644 crypto_kem/ml-kem-1024/m4fspeed/params.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/poly.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/poly.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/polyvec.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/polyvec.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/reduce.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/symmetric-fips202.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/symmetric.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/verify.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fspeed/verify.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/api.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/cbd.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/cbd.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/fastntt.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/indcpa.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/indcpa.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/kem.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/macros.i
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/matacc.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/matacc.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/matacc.i
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/ntt.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/ntt.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/params.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/poly.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/poly.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/poly_asm.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/polyvec.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/polyvec.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/reduce.S
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/symmetric-fips202.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/symmetric.h
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/verify.c
 create mode 120000 crypto_kem/ml-kem-1024/m4fstack/verify.h
 create mode 100644 crypto_kem/ml-kem-512/m4fspeed/api.h
 create mode 100644 crypto_kem/ml-kem-512/m4fspeed/cbd.c
 create mode 100644 crypto_kem/ml-kem-512/m4fspeed/cbd.h
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/fastntt.S
 create mode 100644 crypto_kem/ml-kem-512/m4fspeed/indcpa.c
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/indcpa.h
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/kem.c
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/macros.i
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/matacc.c
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/matacc.h
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/matacc.i
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/ntt.c
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/ntt.h
 create mode 100644 crypto_kem/ml-kem-512/m4fspeed/params.h
 create mode 100644 crypto_kem/ml-kem-512/m4fspeed/poly.c
 create mode 100644 crypto_kem/ml-kem-512/m4fspeed/poly.h
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/poly_asm.S
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/polyvec.c
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/polyvec.h
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/reduce.S
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.c
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/symmetric.h
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/verify.c
 create mode 120000 crypto_kem/ml-kem-512/m4fspeed/verify.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/api.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/cbd.c
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/cbd.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/cmov_int16.S
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/fastaddsub.S
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/fastbasemul.S
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/fastinvntt.S
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/fastntt.S
 create mode 100644 crypto_kem/ml-kem-512/m4fstack/indcpa.c
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/indcpa.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/kem.c
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/macros.i
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/matacc.c
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/matacc.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/matacc.i
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/matacc_asm.S
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/ntt.c
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/ntt.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/params.h
 create mode 100644 crypto_kem/ml-kem-512/m4fstack/poly.c
 create mode 100644 crypto_kem/ml-kem-512/m4fstack/poly.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/poly_asm.S
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/polyvec.c
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/polyvec.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/reduce.S
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/symmetric-fips202.c
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/symmetric.h
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/verify.c
 create mode 120000 crypto_kem/ml-kem-512/m4fstack/verify.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/api.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/cbd.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/cbd.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/fastntt.S
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/indcpa.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/indcpa.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/kem.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/macros.i
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/matacc.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/matacc.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/matacc.i
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/ntt.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/ntt.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/params.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/poly.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/poly.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/poly_asm.S
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/polyvec.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/polyvec.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/reduce.S
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/symmetric-fips202.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/symmetric.h
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/verify.c
 create mode 100644 crypto_kem/ml-kem-768/m4fspeed/verify.h
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/api.h
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/cbd.c
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/cbd.h
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/cmov_int16.S
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/fastaddsub.S
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/fastbasemul.S
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/fastinvntt.S
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/fastntt.S
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/indcpa.c
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/indcpa.h
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/kem.c
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/macros.i
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/matacc.c
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/matacc.h
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/matacc.i
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/matacc_asm.S
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/ntt.c
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/ntt.h
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/params.h
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/poly.c
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/poly.h
 create mode 100644 crypto_kem/ml-kem-768/m4fstack/poly_asm.S
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/polyvec.c
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/polyvec.h
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/reduce.S
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/symmetric-fips202.c
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/symmetric.h
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/verify.c
 create mode 120000 crypto_kem/ml-kem-768/m4fstack/verify.h
 create mode 100644 crypto_sign/dilithium2/m4f/api.h
 create mode 100644 crypto_sign/dilithium2/m4f/basemul_257.S
 create mode 100644 crypto_sign/dilithium2/m4f/config.h
 create mode 100644 crypto_sign/dilithium2/m4f/fnt_257.S
 create mode 100644 crypto_sign/dilithium2/m4f/ifnt_257.S
 create mode 100644 crypto_sign/dilithium2/m4f/macros.i
 create mode 100644 crypto_sign/dilithium2/m4f/macros_fnt.i
 create mode 100644 crypto_sign/dilithium2/m4f/ntt.S
 create mode 100644 crypto_sign/dilithium2/m4f/ntt.h
 create mode 100644 crypto_sign/dilithium2/m4f/packing.c
 create mode 100644 crypto_sign/dilithium2/m4f/packing.h
 create mode 100644 crypto_sign/dilithium2/m4f/params.h
 create mode 100644 crypto_sign/dilithium2/m4f/pointwise_mont.h
 create mode 100644 crypto_sign/dilithium2/m4f/pointwise_mont.s
 create mode 100644 crypto_sign/dilithium2/m4f/poly.c
 create mode 100644 crypto_sign/dilithium2/m4f/poly.h
 create mode 100644 crypto_sign/dilithium2/m4f/polyvec.c
 create mode 100644 crypto_sign/dilithium2/m4f/polyvec.h
 create mode 100644 crypto_sign/dilithium2/m4f/reduce.h
 create mode 100644 crypto_sign/dilithium2/m4f/rounding.c
 create mode 100644 crypto_sign/dilithium2/m4f/rounding.h
 create mode 100644 crypto_sign/dilithium2/m4f/sign.c
 create mode 100644 crypto_sign/dilithium2/m4f/sign.h
 create mode 100644 crypto_sign/dilithium2/m4f/smallntt.h
 create mode 100644 crypto_sign/dilithium2/m4f/smallpoly.c
 create mode 100644 crypto_sign/dilithium2/m4f/smallpoly.h
 create mode 100644 crypto_sign/dilithium2/m4f/symmetric-shake.c
 create mode 100644 crypto_sign/dilithium2/m4f/symmetric.h
 create mode 100644 crypto_sign/dilithium2/m4f/vector.h
 create mode 100644 crypto_sign/dilithium2/m4f/vector.s
 create mode 120000 crypto_sign/dilithium2/m4fstack/api.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/config.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/macros.i
 create mode 100644 crypto_sign/dilithium2/m4fstack/macros_smallntt.i
 create mode 120000 crypto_sign/dilithium2/m4fstack/ntt.S
 create mode 120000 crypto_sign/dilithium2/m4fstack/ntt.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/packing.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/packing.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/params.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/pointwise_mont.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/pointwise_mont.s
 create mode 120000 crypto_sign/dilithium2/m4fstack/poly.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/poly.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/polyvec.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/polyvec.h
 create mode 100644 crypto_sign/dilithium2/m4fstack/reduce.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/rounding.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/rounding.h
 create mode 100644 crypto_sign/dilithium2/m4fstack/sign.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/sign.h
 create mode 100644 crypto_sign/dilithium2/m4fstack/smallntt.h
 create mode 100644 crypto_sign/dilithium2/m4fstack/smallntt_769.S
 create mode 100644 crypto_sign/dilithium2/m4fstack/smallpoly.c
 create mode 100644 crypto_sign/dilithium2/m4fstack/smallpoly.h
 create mode 100644 crypto_sign/dilithium2/m4fstack/stack.c
 create mode 100644 crypto_sign/dilithium2/m4fstack/stack.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/symmetric-shake.c
 create mode 120000 crypto_sign/dilithium2/m4fstack/symmetric.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/vector.h
 create mode 120000 crypto_sign/dilithium2/m4fstack/vector.s
 create mode 120000 crypto_sign/dilithium3/m4f/api.h
 create mode 100644 crypto_sign/dilithium3/m4f/config.h
 create mode 120000 crypto_sign/dilithium3/m4f/macros.i
 create mode 100644 crypto_sign/dilithium3/m4f/macros_smallntt.i
 create mode 120000 crypto_sign/dilithium3/m4f/ntt.S
 create mode 120000 crypto_sign/dilithium3/m4f/ntt.h
 create mode 120000 crypto_sign/dilithium3/m4f/packing.c
 create mode 120000 crypto_sign/dilithium3/m4f/packing.h
 create mode 120000 crypto_sign/dilithium3/m4f/params.h
 create mode 120000 crypto_sign/dilithium3/m4f/pointwise_mont.h
 create mode 120000 crypto_sign/dilithium3/m4f/pointwise_mont.s
 create mode 120000 crypto_sign/dilithium3/m4f/poly.c
 create mode 120000 crypto_sign/dilithium3/m4f/poly.h
 create mode 120000 crypto_sign/dilithium3/m4f/polyvec.c
 create mode 120000 crypto_sign/dilithium3/m4f/polyvec.h
 create mode 120000 crypto_sign/dilithium3/m4f/reduce.h
 create mode 120000 crypto_sign/dilithium3/m4f/rounding.c
 create mode 120000 crypto_sign/dilithium3/m4f/rounding.h
 create mode 120000 crypto_sign/dilithium3/m4f/sign.c
 create mode 120000 crypto_sign/dilithium3/m4f/sign.h
 create mode 100644 crypto_sign/dilithium3/m4f/smallntt.h
 create mode 100644 crypto_sign/dilithium3/m4f/smallntt_769.S
 create mode 120000 crypto_sign/dilithium3/m4f/smallpoly.c
 create mode 120000 crypto_sign/dilithium3/m4f/smallpoly.h
 create mode 120000 crypto_sign/dilithium3/m4f/symmetric-shake.c
 create mode 120000 crypto_sign/dilithium3/m4f/symmetric.h
 create mode 120000 crypto_sign/dilithium3/m4f/vector.h
 create mode 120000 crypto_sign/dilithium3/m4f/vector.s
 create mode 120000 crypto_sign/dilithium3/m4fstack/api.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/config.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/macros.i
 create mode 120000 crypto_sign/dilithium3/m4fstack/macros_smallntt.i
 create mode 120000 crypto_sign/dilithium3/m4fstack/ntt.S
 create mode 120000 crypto_sign/dilithium3/m4fstack/ntt.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/packing.c
 create mode 120000 crypto_sign/dilithium3/m4fstack/packing.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/params.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/pointwise_mont.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/pointwise_mont.s
 create mode 120000 crypto_sign/dilithium3/m4fstack/poly.c
 create mode 120000 crypto_sign/dilithium3/m4fstack/poly.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/polyvec.c
 create mode 120000 crypto_sign/dilithium3/m4fstack/polyvec.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/reduce.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/rounding.c
 create mode 120000 crypto_sign/dilithium3/m4fstack/rounding.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/sign.c
 create mode 120000 crypto_sign/dilithium3/m4fstack/sign.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/smallntt.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/smallntt_769.S
 create mode 120000 crypto_sign/dilithium3/m4fstack/smallpoly.c
 create mode 120000 crypto_sign/dilithium3/m4fstack/smallpoly.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/stack.c
 create mode 120000 crypto_sign/dilithium3/m4fstack/stack.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/symmetric-shake.c
 create mode 120000 crypto_sign/dilithium3/m4fstack/symmetric.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/vector.h
 create mode 120000 crypto_sign/dilithium3/m4fstack/vector.s
 create mode 120000 crypto_sign/dilithium5/m4f/api.h
 create mode 120000 crypto_sign/dilithium5/m4f/basemul_257.S
 create mode 100644 crypto_sign/dilithium5/m4f/config.h
 create mode 120000 crypto_sign/dilithium5/m4f/fnt_257.S
 create mode 120000 crypto_sign/dilithium5/m4f/ifnt_257.S
 create mode 120000 crypto_sign/dilithium5/m4f/macros.i
 create mode 120000 crypto_sign/dilithium5/m4f/macros_fnt.i
 create mode 120000 crypto_sign/dilithium5/m4f/ntt.S
 create mode 120000 crypto_sign/dilithium5/m4f/ntt.h
 create mode 120000 crypto_sign/dilithium5/m4f/packing.c
 create mode 120000 crypto_sign/dilithium5/m4f/packing.h
 create mode 120000 crypto_sign/dilithium5/m4f/params.h
 create mode 120000 crypto_sign/dilithium5/m4f/pointwise_mont.h
 create mode 120000 crypto_sign/dilithium5/m4f/pointwise_mont.s
 create mode 120000 crypto_sign/dilithium5/m4f/poly.c
 create mode 120000 crypto_sign/dilithium5/m4f/poly.h
 create mode 120000 crypto_sign/dilithium5/m4f/polyvec.c
 create mode 120000 crypto_sign/dilithium5/m4f/polyvec.h
 create mode 120000 crypto_sign/dilithium5/m4f/reduce.h
 create mode 120000 crypto_sign/dilithium5/m4f/rounding.c
 create mode 120000 crypto_sign/dilithium5/m4f/rounding.h
 create mode 120000 crypto_sign/dilithium5/m4f/sign.c
 create mode 120000 crypto_sign/dilithium5/m4f/sign.h
 create mode 120000 crypto_sign/dilithium5/m4f/smallntt.h
 create mode 120000 crypto_sign/dilithium5/m4f/smallpoly.c
 create mode 120000 crypto_sign/dilithium5/m4f/smallpoly.h
 create mode 120000 crypto_sign/dilithium5/m4f/symmetric-shake.c
 create mode 120000 crypto_sign/dilithium5/m4f/symmetric.h
 create mode 120000 crypto_sign/dilithium5/m4f/vector.h
 create mode 120000 crypto_sign/dilithium5/m4f/vector.s
 create mode 120000 crypto_sign/dilithium5/m4fstack/api.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/config.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/macros.i
 create mode 120000 crypto_sign/dilithium5/m4fstack/macros_smallntt.i
 create mode 120000 crypto_sign/dilithium5/m4fstack/ntt.S
 create mode 120000 crypto_sign/dilithium5/m4fstack/ntt.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/packing.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/packing.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/params.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/pointwise_mont.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/pointwise_mont.s
 create mode 120000 crypto_sign/dilithium5/m4fstack/poly.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/poly.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/polyvec.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/polyvec.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/reduce.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/rounding.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/rounding.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/sign.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/sign.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallntt_769.S
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallpoly.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/smallpoly.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/stack.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/stack.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/symmetric-shake.c
 create mode 120000 crypto_sign/dilithium5/m4fstack/symmetric.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/vector.h
 create mode 120000 crypto_sign/dilithium5/m4fstack/vector.s
 create mode 100755 hostside/host_unidirectional.py
 create mode 100644 interface.py
 create mode 100644 ldscripts/devices.data
 create mode 160000 libopencm3
 create mode 100644 mk/config.mk
 create mode 100644 mk/crypto.mk
 create mode 100644 mk/nucleo-f767zi.mk
 create mode 100644 mk/opencm3.mk
 create mode 100644 mk/tests.mk
 create mode 160000 mupq
 create mode 100644 requirements.txt
 create mode 100644 skiplist.py
 create mode 160000 slothy
 create mode 100755 test.py
 create mode 100755 testvectors.py

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..4c35da0
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "gitsubmodule"
+    directory: '/'
+    schedule:
+      interval: "monthly"
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000..216e642
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,7 @@
+- [ ] PR changes testvectors
+- [ ] Tests pass in qemu
+- [ ] Testvectors pass in qemu
+- [ ] Tests pass on Nucleo-L4R5ZI 
+- [ ] Testvectors pass on Nucleo-L4R5ZI 
+- [ ] Updated Benchmarks
+- [ ] Updated Skiplist entries
diff --git a/.github/workflows/nucleo-f767zi.yml b/.github/workflows/nucleo-f767zi.yml
new file mode 100644
index 0000000..9e78ef4
--- /dev/null
+++ b/.github/workflows/nucleo-f767zi.yml
@@ -0,0 +1,21 @@
+name: stm32f4discovery build
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches: [ "master" ]
+jobs:
+  build-all:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Install Toolchain
+        uses: carlosperate/arm-none-eabi-gcc-action@v1.9.1
+        with:
+          release: 13.3.Rel1
+      - name: Build All (nucleo-f767zi)
+        run: make PLATFORM=nucleo-f767zi -j2
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4740396
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+*.o
+*.bin
+*.elf
+*.a
+*.d
+*.log*
+venv/
+testvectors/
+benchmarks/
+__pycache__/
+bin/
+obj/
+elf/
+bin-host/
+compile_commands.json
+.vscode
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..cd5c612
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,9 @@
+[submodule "libopencm3"]
+	path = libopencm3
+	url = https://github.com/libopencm3/libopencm3.git
+[submodule "mupq"]
+	path = mupq
+	url = https://github.com/mupq/mupq.git
+[submodule "slothy"]
+	path = slothy
+	url = https://github.com/slothy-optimizer/slothy
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..67dd7e6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+.PHONY: all
+all: tests tests-bin
+
+include mupq/mk/config.mk
+include mk/config.mk
+include mk/crypto.mk
+include mupq/mk/host-crypto.mk
+include mupq/mk/rules.mk
+include mupq/mk/schemes.mk
+include mk/tests.mk
+
+.PHONY: clean libclean
+
+clean:
+	rm -rf elf/
+	rm -rf bin/
+	rm -rf bin-host/
+	rm -rf obj/
+	rm -rf testvectors/
+	rm -rf benchmarks/
+
+.SECONDARY:
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2c0ec88
--- /dev/null
+++ b/README.md
@@ -0,0 +1,411 @@
+# pqm4
+Collection of post-quantum cryptographic algorithms for the ARM Cortex-M4
+
+## Introduction
+The **pqm4** library, benchmarking and testing framework started as a result of the 
+[PQCRYPTO](https://pqcrypto.eu.org) project funded by the European Commission in the H2020 program. 
+It currently contains implementations of post-quantum key-encapsulation mechanisms
+and post-quantum signature schemes targeting the ARM Cortex-M4 family of microcontrollers.
+The design goals of the library are to offer
+* automated functional testing on a widely available development board;
+* automated generation of test vectors and comparison against output
+  of a reference implementation running host-side (i.e., on the computer the
+  development board is connected to);
+* automated benchmarking for speed, stack usage, and code-size;
+* automated profiling of cycles spent in symmetric primitives (SHA-2, SHA-3, AES);
+* integration of clean implementations from [PQClean](https://github.com/PQClean/PQClean); and
+* easy integration of new schemes and implementations into the framework.
+
+## Previous NIST PQC
+
+The master branch of **pqm4** contains schemes that are either [selected for standardization by NIST](https://csrc.nist.gov/Projects/post-quantum-cryptography/selected-algorithms-2022),
+part of the [4th round of the NIST PQC standardization process](https://csrc.nist.gov/Projects/post-quantum-cryptography/round-4-submissions),
+or part of the [first round of additional signatures of the NIST PQC standardization process](https://csrc.nist.gov/projects/pqc-dig-sig/round-1-additional-signatures).
+
+Implementations for previous NIST PQC rounds are available here:
+- Round 3: https://github.com/mupq/pqm4/releases/tag/Round3
+- Round 2: https://github.com/mupq/pqm4/releases/tag/Round2
+- Round 1: https://github.com/mupq/pqm4/releases/tag/Round1
+
+## Changes in Round 2
+For the second round of the NIST PQC process, **pqm4** was extended (see [#78](https://github.com/mupq/pqm4/pull/78)) with the following features:
+- common code was moved to [mupq](https://github.com/mupq/mupq) for reuse in [pqriscv](https://github.com/mupq/pqriscv),
+- much simpler build process,
+- automated profiling of cycles spent in symmetric primitives (SHA-2, SHA-3, AES),
+- reporting of code-size,
+- integration of clean implementations from [PQClean](https://github.com/PQClean/PQClean).
+
+## Changes in Round 3
+For the third round of the NIST PQC process, **pqm4** was extended with the following features:
+- overhaul of the build process to support multiple target boards, and
+- use of the QEMU simulator to measure stack usage of larger schemes.
+
+## Changes in Round 4 / Round 1 of Additional signatures
+For the fourth round of the NIST PQC process **pqm4** was extended with the following features:
+- Switch to the Nucleo-L4R5ZI board as the default board for measurements, and
+- an overhaul of the console output.
+
+## Schemes included in pqm4
+
+For most of the schemes there are multiple implementations. 
+The naming scheme for these implementations is as follows:
+* `clean`: clean reference implementation from [PQClean](https://github.com/PQClean/PQClean),
+* `ref`: the reference implementation submitted to NIST (will be replaced by `clean` in the long term),
+* `opt`: an optimized implementation in plain C (e.g., the optimized implementation submitted to NIST),
+* `m4`: an implementation with Cortex-M4 specific optimizations (typically in assembly).
+* `m4f`: an implementation with Cortex-M4F specific optimizations (typically assembly using floating-point registers).
+
+## Setup/Installation
+The testing and benchmarking framework of **pqm4** targets several development
+boards, all featuring an ARM Cortex-M4 chip:
+
+* `nucleo-l4r5zi` (default): The [NUCLEO-L4R5ZI board](https://www.st.com/en/evaluation-tools/nucleo-l4r5zi.html)
+  featuring 2MB of Flash and 640KB of RAM. This board does not require a
+  separate USB serial interface converter.
+* `stm32f4discovery`: The [STM32F4 Discovery board](https://www.st.com/en/evaluation-tools/stm32f4discovery.html)
+  featuring 1MB of Flash, and 192KB of RAM. Connecting the
+  development board to the host computer requires a mini-USB cable and a USB-TTL
+  converter together with a 2-pin dupont / jumper cable.
+* `nucleo-l476rg`: The [NUCLEO-L476RG board](https://www.st.com/en/evaluation-tools/nucleo-l476rg.html)
+  featuring 1MB of Flash and 128KB of RAM. This board does not require a
+  separate USB serial interface converter.
+* `cw308t-stm32f3`: The ChipWhisperer [CW308-STM32F3 target board](https://rtfm.newae.com/Targets/UFO%20Targets/CW308T-STM32F/)
+  (in the F3 configuration) featuring 256KB of Flash and 40KB of RAM.
+* `mps2-an386`: The ARM MPS2(+) FPGA prototyping board when used with the
+  ARM-Cortex M4 bitstream (see [ARM AN386](https://developer.arm.com/documentation/dai0386/c))
+  featuring two 4MB RAM blocks, one used in lieu of Flash one as RAM. This board
+  can also be simulated with the QEMU 5.2 simulator (the cycle counts are,
+  however, meaningless in this case).
+
+### Installing the ARM toolchain
+The **pqm4** build system assumes that you have the [arm-none-eabi toolchain](https://developer.arm.com/downloads/-/arm-gnu-toolchain-downloads)
+installed. All benchmarks are performed using this toolchain.
+On most Linux systems, the correct toolchain gets installed when you install the `arm-none-eabi-gcc` (or `gcc-arm-none-eabi`) package.  
+On some Linux distributions, you will also have to explicitly install `libnewlib-arm-none-eabi` .
+
+### Installing stlink
+To flash binaries onto most development boards, **pqm4** is using [stlink](https://github.com/texane/stlink). 
+Depending on your operating system, stlink may be available in your package manager -- if not, please
+refer to the stlink Github page for instructions on how to [compile it from source](https://github.com/texane/stlink/blob/master/doc/compiling.md) 
+(in that case, be careful to use libusb-1.0.0-dev, not libusb-0.1).
+
+### Installing OpenOCD
+For the `nucleo-l4r5zi` board [OpenOCD](http://openocd.org) (tested with version 0.12) is used for flashing binaries.
+Depending on your operating system, OpenOCD may be available in your package manager -- if not, please
+refer to the OpenOCD README for instructions on how to [compile it from source](http://openocd.org/doc-release/README).
+
+### Python3
+The benchmarking scripts used in **pqm4** require Python >= 3.8.
+
+### Installing pyserial
+The host-side Python code for most platforms requires the [pyserial](https://github.com/pyserial/pyserial) module.
+Your package repository might offer `python3-serial` (Debian, Ubuntu) or `python-pyserial` (Arch) or `python3-pyserial` (Fedora, openSUSE) or `pyserial` (Slack, CentOS, Gentoo) or `py3-pyserial` (Alpine) directly.
+Alternatively, this can be easily installed from PyPI by calling `pip3 install -r requirements.txt`.
+If you do not have `pip3` installed yet, you can typically find it as `python3-pip` (Debian, Ubuntu) or `python-pip` (Arch) using your package manager.
+
+### Installing ChipWhisperer
+The host-side Python code for the `cw308t-stm32f3` board requires the [chipwhisperer](https://chipwhisperer.readthedocs.io/en/latest/installing.html#install-repo-pypi) module.
+If you don't target this board, you can skip the installation.
+
+### Installing QEMU >=5.2
+The `mps2-an386` platform is simulated with the [QEMU](https://www.qemu.org/)
+ARM system emulator. You'll need at least the version 5.2, which is fairly
+recent at the time of writing and may not be available on your favourite Linux
+distro. If you don't target this platform, you can skip the installation.
+
+### Connecting the STM32F4 Discovery board to the host
+Connect the board to your host machine using the mini-USB port. 
+This provides it with power, and allows you to flash binaries onto the board. 
+It should show up in `lsusb` as `STMicroelectronics ST-LINK/V2`.
+
+If you are using a UART-USB connector that has a PL2303 chip on board (which appears to be the most common), 
+the driver should be loaded in your kernel by default. If it is not, it is typically called `pl2303`. 
+On macOS, you will still need to [install it](http://www.prolific.com.tw/US/ShowProduct.aspx?p_id=229&pcid=41) (and reboot). 
+When you plug in the device, it should show up as `Prolific Technology, Inc. PL2303 Serial Port` when you type `lsusb`.
+
+Using dupont / jumper cables, connect the `TX`/`TXD` pin of the USB connector to the `PA3` pin on the board, and connect `RX`/`RXD` to `PA2`. 
+Depending on your setup, you may also want to connect the `GND` pins.
+
+### Downloading pqm4 and libopencm3
+Finally, obtain the **pqm4** library and the submodules:
+```
+git clone --recursive https://github.com/mupq/pqm4.git
+```
+
+Now you may pick your platform and compile the code (adapt the `PLATFORM`
+variable to your chosen platform and the number of threads in `-j4` to your PC accordingly):
+```
+make -j4 PLATFORM=stm32f4discovery
+```
+
+## API documentation
+The **pqm4** library uses the NIST/SUPERCOP/[PQClean
+API](https://github.com/PQClean/PQClean). It is mandated for all included
+schemes.
+
+KEMs need to define `CRYPTO_SECRETKEYBYTES`, `CRYPTO_PUBLICKEYBYTES`, `CRYPTO_BYTES`, and `CRYPTO_CIPHERTEXTBYTES` and implement 
+```c
+int crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
+int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);
+int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
+```
+
+Signature schemes need to define `CRYPTO_SECRETKEYBYTES`, `CRYPTO_PUBLICKEYBYTES`, and `CRYPTO_BYTES` and implement
+```c
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+int crypto_sign(unsigned char *sm, size_t *smlen, 
+                const unsigned char *msg, size_t len,
+                const unsigned char *sk);
+int crypto_sign_open(unsigned char *m, size_t *mlen,
+                     const unsigned char *sm, size_t smlen,
+                     const unsigned char *pk);
+```
+
+
+## Running tests and benchmarks
+The build system compiles six binaries for each implementation which can be used to test and benchmark the schemes. For example, for the reference implementation of [ML-KEM-768](https://pq-crystals.org/kyber/) the following binaries are assembled: 
+ - `bin/crypto_kem_ml-kem-768_m4_test.bin` tests if the scheme works as expected. For KEMs this tests if Alice and Bob derive the same shared key and for signature schemes it tests if a generated signature can be verified correctly. Several failure cases are also checked, see [mupq/crypto_kem/test.c](https://github.com/mupq/mupq/blob/master/crypto_kem/test.c) and [mupq/crypto_sign/test.c](https://github.com/mupq/mupq/blob/master/crypto_sign/test.c) for details.
+ - `bin/crypto_kem_ml-kem-768_m4_speed.bin` measures the runtime of `crypto_kem_keypair`, `crypto_kem_enc`, and `crypto_kem_dec` for KEMs and `crypto_sign_keypair`, `crypto_sign`, and `crypto_sign_open` for signatures. See [mupq/crypto_kem/speed.c](https://github.com/mupq/mupq/blob/master/crypto_kem/speed.c) and [mupq/crypto_sign/speed.c](https://github.com/mupq/mupq/blob/master/crypto_sign/speed.c).
+ - `bin/crypto_kem_ml-kem-768_m4_hashing.bin` measures the cycles spent in SHA-2, SHA-3, and AES of `crypto_kem_keypair`, `crypto_kem_enc`, and `crypto_kem_dec` for KEMs and `crypto_sign_keypair`, `crypto_sign`, and `crypto_sign_open` for signatures. See [mupq/crypto_kem/hashing.c](https://github.com/mupq/mupq/blob/master/crypto_kem/hashing.c) and [mupq/crypto_sign/hashing.c](https://github.com/mupq/mupq/blob/master/crypto_sign/hashing.c).
+ - `bin/crypto_kem_ml-kem-768_m4_stack.bin` measures the stack consumption of each of the procedures involved. The memory allocated outside of the procedures (e.g., public keys, private keys, ciphertexts, signatures) is not included. See [mupq/crypto_kem/stack.c](https://github.com/mupq/mupq/blob/master/crypto_kem/stack.c) and [mupq/crypto_sign/stack.c](https://github.com/mupq/mupq/blob/master/crypto_sign/stack.c).
+ - `bin/crypto_kem_ml-kem-768_m4_testvectors.bin` uses a deterministic random number generator to generate testvectors for the implementation. These can be used to cross-check different implementations of the same scheme. See [mupq/crypto_kem/testvectors.c](https://github.com/mupq/mupq/blob/master/crypto_kem/testvectors.c) and [mupq/crypto_sign/testvectors.c](https://github.com/mupq/mupq/blob/master/crypto_sign/testvectors.c).
+- `bin-host/crypto_kem_ml-kem-768_m4_testvectors` uses the same deterministic random number generator to create the testvectors on your host. See [mupq/crypto_kem/testvectors-host.c](https://github.com/mupq/mupq/blob/master/crypto_kem/testvectors-host.c) and [mupq/crypto_sign/testvectors-host.c](https://github.com/mupq/mupq/blob/master/crypto_sign/testvectors-host.c).
+- An `elf` file for each binary is generated in the `elf/` folder if desired.
+
+The `elf` files or binaries can be flashed to your board using an appropriate
+tool. For example, the `stm32f4discovery` platform uses `st-flash`, e.g., `st-flash write bin/crypto_kem_ml-kem-768_m4_test.bin 0x8000000`. To receive the output, run `python3 hostside/host_unidirectional.py`. 
+
+If you target the `mps2-an386` platform, you can also run the `elf` file using
+the QEMU ARM emulator:
+```
+qemu-system-arm -M mps2-an386 -nographic -semihosting -kernel elf/crypto_kem_ml-kem-512_m4_test.elf
+```
+The emulator should exit automatically when the test / benchmark completes. If
+you run into an error, you can exit QEMU pressing CTRL+A and then X.
+
+The **pqm4** framework automates testing and benchmarking for all schemes using Python3 scripts: 
+- `python3 test.py`: flashes all test binaries to the boards and checks that no errors occur. 
+- `python3 testvectors.py`: flashes all testvector binaries to the boards and writes the testvectors to `testvectors/`. Additionally, it executes the reference implementations on your host machine. Afterwards, it checks the testvectors of different implementations of the same scheme for consistency. 
+- `python3 benchmarks.py`: flashes the stack and speed binaries and writes the results to `benchmarks/stack/` and `benchmarks/speed/`. You may want to execute this several times for certain schemes for which the execution time varies significantly.
+
+The scripts take a number of command line arguments, which you'll need to adapt:
+- `--platform <platformname>` or `-p <platformname>`: Sets the target platform (default `stm32f4discovery`).
+- `--opt {speed,size,debug}` or `-o {speed,size,debug}`: Sets optimization flags for compilation (default `speed`).
+- `--lto` or `-l`: Use link-time optimization during compilation.
+- `--no-aio`: Disable all-in-one compilation of schemes (i.e., compile the sources into separate modules instead of passing all sources to the linking step).
+
+If you change any of these values, you'll need to run `make clean` (the build
+system will remind you).
+
+In case you don't want to include all schemes, pass a list of schemes you want to include to any of the scripts, e.g., `python3 test.py ml-kem-768 sphincs-shake256-128f-simple`. 
+In case you want to exclude certain schemes, pass `--exclude`, e.g., `python3 test.py --exclude saber`.
+
+The benchmark results (in `benchmarks/`) created by 
+`python3 benchmarks.py` can be automatically converted to a markdown table using `python3 convert_benchmarks.py md` or to csv using `python3 convert_benchmarks.py csv`.
+
+## Benchmarks
+The current benchmark results can be found in [benchmarks.csv](benchmarks.csv) or [benchmarks.md](benchmarks.md).
+
+All cycle counts were obtained at 24MHz to avoid wait cycles due to the speed of the memory controller.
+For most schemes we report minimum, maximum, and average cycle counts of 100 executions.
+For some particularly slow schemes we reduce the number of executions; the number of
+executions is reported in parentheses.
+
+The numbers were obtained with `arm-none-eabi-gcc (Arm GNU Toolchain 11.3.Rel1) 11.3.1 20220712` from [Arm](https://developer.arm.com/downloads/-/arm-gnu-toolchain-downloads).
+
+The code-size measurements only include the code that is provided by the scheme implementation, i.e., exclude common code like hashing or C standard library functions.
+The measurements are performed with `arm-none-eabi-size`.
+The size contributions to the `.text`, `.data`, and `.bss` sections are also listed separately.
+
+
+## Adding new schemes and implementations
+The **pqm4** build system is designed to make it very easy to add new schemes
+and implementations, if these implementations follow the NIST/SUPERCOP/PQClean API.
+
+In case you want to contribute a reference implementation, please open a pull request to [PQClean](https://github.com/PQClean/PQClean).
+In case you want to contribute an optimized C implementation, please open a pull request to [mupq](https://github.com/mupq/mupq).
+In case you want to add an implementation optimized for the Cortex-M4, please open a pull request here.
+
+In the following we consider the example of adding an M4-optimized implementation
+of [NewHope-512-CPA-KEM](https://newhopecrypto.org) to **pqm4**:
+
+1. Create a subdirectory for the new scheme under `crypto_kem/`; in the following we assume that this subdirectory is called `newhope512cpa`.
+1. Create a subdirectory `m4` under `crypto_kem/newhope512cpa/`.
+1. Copy all files of the implementation into this new subdirectory `crypto_kem/newhope512cpa/m4/`,
+   except for the file implementing the `randombytes` function (typically `PQCgenKAT_kem.c`).
+
+The procedure for adding a signature scheme is the same, except that it starts with creating a
+new subdirectory under `crypto_sign/`.
+
+### Using optimized FIPS202 (Keccak, SHA3, SHAKE)
+   Many schemes submitted to NIST use SHA-3, SHAKE or cSHAKE for hashing. 
+   This is why **pqm4** comes with highly optimized Keccak code that is accessible
+   from all KEM and signature implementations. 
+   Functions from the FIPS202 standard are defined in `mupq/common/fips202.h` as follows:
+
+   ```c
+  void shake128_absorb(shake128ctx *state, const uint8_t *input, size_t inlen);
+  void shake128_squeezeblocks(uint8_t *output, size_t nblocks, shake128ctx *state);
+  void shake128(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen);
+
+  void shake128_inc_init(shake128incctx *state);
+  void shake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen);
+  void shake128_inc_finalize(shake128incctx *state);
+  void shake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state);
+
+  void shake256_absorb(shake256ctx *state, const uint8_t *input, size_t inlen);
+  void shake256_squeezeblocks(uint8_t *output, size_t nblocks, shake256ctx *state);
+  void shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen);
+
+  void shake256_inc_init(shake256incctx *state);
+  void shake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen);
+  void shake256_inc_finalize(shake256incctx *state);
+  void shake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state);
+
+  void sha3_256_inc_init(sha3_256incctx *state);
+  void sha3_256_inc_absorb(sha3_256incctx *state, const uint8_t *input, size_t inlen);
+  void sha3_256_inc_finalize(uint8_t *output, sha3_256incctx *state);
+
+  void sha3_256(uint8_t *output, const uint8_t *input, size_t inlen);
+
+  void sha3_512_inc_init(sha3_512incctx *state);
+  void sha3_512_inc_absorb(sha3_512incctx *state, const uint8_t *input, size_t inlen);
+  void sha3_512_inc_finalize(uint8_t *output, sha3_512incctx *state);
+
+  void sha3_512(uint8_t *output, const uint8_t *input, size_t inlen);
+   ```
+
+  Functions from the related publication SP 800-185 (cSHAKE) are defined in `mupq/common/sp800-185.h`:
+
+  ```c
+  void cshake128_inc_init(shake128incctx *state, const uint8_t *name, size_t namelen, const uint8_t *cstm, size_t cstmlen);
+  void cshake128_inc_absorb(shake128incctx *state, const uint8_t *input, size_t inlen);
+  void cshake128_inc_finalize(shake128incctx *state);
+  void cshake128_inc_squeeze(uint8_t *output, size_t outlen, shake128incctx *state);
+
+  void cshake128(uint8_t *output, size_t outlen, const uint8_t *name, size_t namelen, const uint8_t *cstm, size_t cstmlen, const uint8_t *input, size_t inlen);
+
+  void cshake256_inc_init(shake256incctx *state, const uint8_t *name, size_t namelen, const uint8_t *cstm, size_t cstmlen);
+  void cshake256_inc_absorb(shake256incctx *state, const uint8_t *input, size_t inlen);
+  void cshake256_inc_finalize(shake256incctx *state);
+  void cshake256_inc_squeeze(uint8_t *output, size_t outlen, shake256incctx *state);
+
+  void cshake256(uint8_t *output, size_t outlen, const uint8_t *name, size_t namelen, const uint8_t* cstm, size_t cstmlen, const uint8_t *input, size_t inlen);
+  ```
+
+   Implementations that want to make use of these optimized routines simply include 
+   `fips202.h` (or `sp800-185.h`). The API for `sha3_256` and `sha3_512` follows the 
+   [SUPERCOP hash API](https://bench.cr.yp.to/call-hash.html).
+   The API for `shake128` and `shake256` is very similar, except that it supports variable-length output.
+   The SHAKE functions are also accessible via the absorb-squeezeblocks functions, which offer incremental
+   output generation (but not incremental input handling).
+   The variants with `_inc_` offer both incremental input handling and output generation.
+
+## Using optimized SHA-2
+
+  Some schemes submitted to NIST use SHA-224, SHA-256, SHA-384, or SHA-512 for hashing.
+  We've experimented with assembly-optimized SHA-512, but found that the speed-up
+  achievable with this compared to the C implementation from
+  [SUPERCOP](https://bench.cr.yp.to/) is negligible
+  when compiled using `arm-none-eabi-gcc-8.3.0`.
+  For older compiler versions (e.g. `5.4.1`) hand-optimized assembly implementations
+  were significantly faster.
+  We've therefore decided to only include a C version of the SHA-2 variants.
+  The available functions are:
+   ```c
+  void sha224_inc_init(sha224ctx *state);
+  void sha224_inc_blocks(sha224ctx *state, const uint8_t *in, size_t inblocks);
+  void sha224_inc_finalize(uint8_t *out, sha224ctx *state, const uint8_t *in, size_t inlen);
+  void sha224(uint8_t *out, const uint8_t *in, size_t inlen);
+
+  void sha256_inc_init(sha256ctx *state);
+  void sha256_inc_blocks(sha256ctx *state, const uint8_t *in, size_t inblocks);
+  void sha256_inc_finalize(uint8_t *out, sha256ctx *state, const uint8_t *in, size_t inlen);
+  void sha256(uint8_t *out, const uint8_t *in, size_t inlen);
+
+  void sha384_inc_init(sha384ctx *state);
+  void sha384_inc_blocks(sha384ctx *state, const uint8_t *in, size_t inblocks);
+  void sha384_inc_finalize(uint8_t *out, sha384ctx *state, const uint8_t *in, size_t inlen);
+  void sha384(uint8_t *out, const uint8_t *in, size_t inlen);
+
+  void sha512_inc_init(sha512ctx *state);
+  void sha512_inc_blocks(sha512ctx *state, const uint8_t *in, size_t inblocks);
+  void sha512_inc_finalize(uint8_t *out, sha512ctx *state, const uint8_t *in, size_t inlen);
+  void sha512(uint8_t *out, const uint8_t *in, size_t inlen);
+  ```
+  Implementations can use these by including `sha2.h`.
+
+## Using optimized AES
+
+  Some schemes submitted to NIST make use of AES as a subroutine.
+  We included assembly-optimized implementations of AES-128 and AES-256 in ECB mode and in CTR mode.
+
+  Up until January 2021, pqm4 relied on the [t-table implementation](https://github.com/Ko-/aes-armcortexm) by Schwabe and Stoffelen published at [SAC2016](https://eprint.iacr.org/2016/714.pdf).
+  On Cortex-M4 platforms with a data cache, this implementation may be vulnerable to cache attacks.
+  Hence, pqm4 is now using the [bitsliced implementation](https://github.com/aadomn/aes) by Adomnicai and Peyrin published in [TCHES2021/1](https://eprint.iacr.org/2020/1123.pdf).
+
+  The functions that can be used are stated in `common/aes.h` as follows:
+  ```c
+  void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key);
+  void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key);
+  void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx);
+  void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx);
+
+  void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key);
+  void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key);
+  void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx);
+  void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx);
+  ```
+  Implementations can use these by including `aes.h`.
+
+  Some post-quantum schemes use AES with only public inputs (e.g., Kyber and FrodoKEM) and, consequently, do not need a constant-time AES implementation.
+  As those schemes would be unfairly penalized by switching to a slower constant-time implementation, we additionally provide the t-table implementation.
+  The functions that can be used are stated in `common/aes-publicinputs.h` as follows:
+ ```c
+  void aes128_ecb_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key);
+  void aes128_ctr_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key);
+  void aes128_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx_publicinputs *ctx);
+  void aes128_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx_publicinputs *ctx);
+
+  void aes192_ecb_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key);
+  void aes192_ctr_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key);
+  void aes192_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx_publicinputs *ctx);
+  void aes192_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx_publicinputs *ctx);
+
+  void aes256_ecb_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key);
+  void aes256_ctr_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key);
+  void aes256_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx_publicinputs *ctx);
+  void aes256_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx_publicinputs *ctx);
+ ```
+
+## Bibliography
+
+When referring to this framework in academic literature, please consider using the following BibTeX excerpt:
+
+```
+@misc{PQM4,
+  title = {{PQM4}: Post-quantum crypto library for the {ARM} {Cortex-M4}},
+  author = {Matthias J. Kannwischer and Richard Petri and Joost Rijneveld and Peter Schwabe and Ko Stoffelen},
+  note = {\url{https://github.com/mupq/pqm4}}
+}
+```
+
+**Please note** however, that pqm4 does not author the implementations that
+are included in pqm4.  Most of the implementations that are included in the
+collection originate from original research projects.  Moreover, many
+implementations have been swapped out over the years.  When comparing or
+improving implementations, please consider not only pqm4, but also cite
+the publication corresponding to the implementation.
+
+Sometimes it might not be entirely clear which paper to cite.  Feel free to
+open an issue so that we can help you find it.
+
+## License
+Different parts of **pqm4** have different licenses. 
+Each subdirectory containing implementations contains a LICENSE or COPYING file stating 
+under what license that specific implementation is released. 
+The files in common contain licensing information at the top of the file (and 
+are currently either public domain or MIT). 
+
+All other code in this repository is dual-licensed under [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) and under the conditions of [CC0](https://creativecommons.org/publicdomain/zero/1.0/).
+
diff --git a/benchmarks.py b/benchmarks.py
new file mode 100755
index 0000000..f1b9118
--- /dev/null
+++ b/benchmarks.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+from mupq import mupq
+from interface import parse_arguments, get_platform
+import sys
+
+if __name__ == "__main__":
+    args, rest = parse_arguments()
+    platform, settings = get_platform(args)
+    with platform:
+        schemes = [s for s in rest if s not in ['--nostack',
+                                                '--nospeed',
+                                                '--nohashing',
+                                                '--nosize']]
+        if "--nostack" not in rest:
+            test = mupq.StackBenchmark(settings, platform)
+            if test.test_all(schemes):
+                sys.exit(1)
+
+        if "--nospeed" not in rest:
+            test = mupq.SpeedBenchmark(settings, platform)
+            if test.test_all(schemes):
+                sys.exit(1)
+
+        if "--nohashing" not in rest:
+            test = mupq.HashingBenchmark(settings, platform)
+            if test.test_all(schemes):
+                sys.exit(1)
+
+        if "--nosize" not in rest:
+            test = mupq.SizeBenchmark(settings, platform)
+            if test.test_all(schemes):
+                sys.exit(1)
diff --git a/build_everything.py b/build_everything.py
new file mode 100755
index 0000000..2194789
--- /dev/null
+++ b/build_everything.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+"""
+Builds all of the binaries without flashing them.
+"""
+import sys
+
+from interface import parse_arguments, get_platform
+from mupq import mupq
+
+
+if __name__ == "__main__":
+    args, rest = parse_arguments()
+    platform, settings = get_platform(args)
+    with platform:
+        mupq.BuildAll(settings).test_all(rest)
diff --git a/common/aes-encrypt.S b/common/aes-encrypt.S
new file mode 100644
index 0000000..2f19ff7
--- /dev/null
+++ b/common/aes-encrypt.S
@@ -0,0 +1,613 @@
+/******************************************************************************
+* Assembly fixsliced implementation of AES-128 and AES-256 (encryption only).
+*
+* Fully-fixsliced implementation runs faster than the semi-fixsliced variant
+* at the cost of a larger code size.
+*
+* See the paper at https://eprint.iacr.org/2020/1123.pdf for more details.
+*
+* @author   Alexandre Adomnicai, Nanyang Technological University, Singapore
+*           alexandre.adomnicai@ntu.edu.sg
+*
+* @date     October 2020
+******************************************************************************/
+
+.syntax unified
+.thumb
+
+/******************************************************************************
+* Macro to compute the SWAPMOVE technique: swap the bits in 'in1' masked by 'm'
+* with the bits in 'in0' masked by 'm << n'; results go to 'out0', 'out1' ('tmp' is scratch).
+******************************************************************************/
+.macro swpmv out0, out1, in0, in1, m, n, tmp
+    eor     \tmp, \in1, \in0, lsr \n    // tmp  <- in1 ^ (in0 >> n)
+    and     \tmp, \m                    // tmp  <- tmp & m (differing bits at the positions to swap)
+    eor     \out1, \in1, \tmp           // out1 <- in1 ^ tmp
+    eor     \out0, \in0, \tmp, lsl \n   // out0 <- in0 ^ (tmp << n)
+.endm
+
+/******************************************************************************
+* Rotate all bytes in 'in' by 'n0' bits to the right and put the results in
+* 'out'. 'm' refers to the appropriate bitmask, 'n1' = 8-'n0' and 'tmp' is scratch.
+******************************************************************************/
+.macro byteror  out, in, m, n0, n1, tmp
+    and     \out, \m, \in, lsr \n0      // out <- (in >> n0) & m
+    bic     \tmp, \in, \m, ror \n1      // tmp <- in & ~(m >>> n1)
+    orr     \out, \out, \tmp, lsl \n1   // out <- out | (tmp << n1)
+.endm
+
+/******************************************************************************
+* Compute the MixColumns for rounds i s.t. i%4 == 0 or 2 (S0-S7 in, S'0-S'7 out in r4-r11).
+* The two versions differ only in the mask 'm' and the shifts 'n0'/'n1' passed
+* to 'byteror'; 'n2' and 'n3' are accepted but not referenced in the body.
+******************************************************************************/
+.macro mc_0_2   m, n0, n1, n2, n3
+    byteror r14, r1, \m, \n0, \n1, r9   // r14 <- BYTE_ROR_n0(S0)
+    eor     r4, r1, r14, ror #8         // r4 <- S0 ^ (BYTE_ROR_n0(S0) >>> 8)
+    movw    r1, #0x0f0f
+    movt    r1, #0x0f0f                 // r1 <- 0x0f0f0f0f (for BYTE_ROR)
+    byteror r5, r11, \m, \n0, \n1, r9   // r5 <- BYTE_ROR_n0(S7)
+    eor     r10, r11, r5, ror #8        // r10<- S7 ^ (BYTE_ROR_n0(S7) >>> 8)
+    byteror r11, r10, r1, 4, 4, r9      // r11<- BYTE_ROR_4(r10)
+    eor     r11, r4, r11, ror #16       // r11<- r4 ^ (BYTE_ROR_4(r10) >>> 16)
+    eor     r11, r11, r5, ror #8        // r11<- r11 ^ (BYTE_ROR_n0(S7) >>> 8) = S'7
+    byteror r5, r2, \m, \n0, \n1, r9    // r5 <- BYTE_ROR_n0(S6)
+    eor     r2, r2, r5, ror #8          // r2 <- S6 ^ (BYTE_ROR_n0(S6) >>> 8)
+    eor     r10, r10, r5, ror #8        // r10<- r10 ^ (BYTE_ROR_n0(S6) >>> 8)
+    byteror r5, r2, r1, 4, 4, r9        // r5 <- BYTE_ROR_4(r2)
+    eor     r10, r10, r5, ror #16       // r10<- r10 ^ (r5 >>> 16)
+    eor     r10, r10, r4                // r10<- S'6
+    byteror r5, r0, \m, \n0, \n1, r9    // r5 <- BYTE_ROR_n0(S5)
+    eor     r0, r0, r5, ror #8          // r0 <- S5 ^ (BYTE_ROR_n0(S5) >>> 8)
+    eor     r9, r2, r5, ror #8          // r9 <- r2 ^ (BYTE_ROR_n0(S5) >>> 8)
+    byteror r5, r0, r1, 4, 4, r2        // r5 <- BYTE_ROR_4(r0)
+    eor     r9, r9, r5, ror #16         // r9 <- r9 ^ (r5 >>> 16) = S'5
+    byteror r5, r8, \m, \n0, \n1, r2    // r5 <- BYTE_ROR_n0(S4)
+    eor     r2, r8, r5, ror #8          // r2 <- S4 ^ (BYTE_ROR_n0(S4) >>> 8)
+    eor     r8, r0, r5, ror #8          // r8 <- r0 ^ (BYTE_ROR_n0(S4) >>> 8)
+    byteror r5, r2, r1, 4, 4, r0        // r5 <- BYTE_ROR_4(r2)
+    eor     r8, r8, r5, ror #16         // r8 <- r8 ^ (r5 >>> 16)
+    eor     r8, r8, r4                  // r8 <- S'4
+    byteror r5, r7, \m, \n0, \n1, r0    // r5 <- BYTE_ROR_n0(S3)
+    eor     r0, r7, r5, ror #8          // r0 <- S3 ^ (BYTE_ROR_n0(S3) >>> 8)
+    eor     r7, r2, r5, ror #8          // r7 <- r2 ^ (BYTE_ROR_n0(S3) >>> 8)
+    byteror r5, r0, r1, 4, 4, r2        // r5 <- BYTE_ROR_4(r0)
+    eor     r7, r7, r5, ror #16         // r7 <- r7 ^ (r5 >>> 16)
+    eor     r7, r7, r4                  // r7 <- S'3
+    byteror r5, r6, \m, \n0, \n1, r2    // r5 <- BYTE_ROR_n0(S2)
+    eor     r2, r6, r5, ror #8          // r2 <- S2 ^ (BYTE_ROR_n0(S2) >>> 8)
+    eor     r6, r0, r5, ror #8          // r6 <- r0 ^ (BYTE_ROR_n0(S2) >>> 8)
+    byteror r5, r2, r1, 4, 4, r0        // r5 <- BYTE_ROR_4(r2)
+    eor     r6, r6, r5, ror #16         // r6 <- r6 ^ (r5 >>> 16) = S'2
+    byteror r5, r3, \m, \n0, \n1, r0    // r5 <- BYTE_ROR_n0(S1)
+    eor     r0, r3, r5, ror #8          // r0 <- S1 ^ (BYTE_ROR_n0(S1) >>> 8)
+    eor     r3, r2, r5, ror #8          // r3 <- r2 ^ (BYTE_ROR_n0(S1) >>> 8)
+    byteror r5, r0, r1, 4, 4, r2        // r5 <- BYTE_ROR_4(r0)
+    eor     r5, r3, r5, ror #16         // r5 <- r3 ^ (r5 >>> 16) = S'1
+    eor     r14, r0, r14, ror #8        // r14<- r0 ^ (BYTE_ROR_n0(S0) >>> 8)
+    byteror r0, r4, r1, 4, 4, r2        // r0 <- BYTE_ROR_4(r4)
+    eor     r4, r14, r0, ror #16        // r4 <- r14 ^ (r0 >>> 16) = S'0
+.endm
+
+/******************************************************************************
+* Packs two 128-bit input blocks stored in r4-r7 and r8-r11, respectively, into
+* the 256-bit internal state (clobbers r0-r3, r12); the bits are packed as follows:
+* r4 = b_24 b_56 b_88 b_120 || ... || b_0 b_32 b_64 b_96
+* r5 = b_25 b_57 b_89 b_121 || ... || b_1 b_33 b_65 b_97
+* r6 = b_26 b_58 b_90 b_122 || ... || b_2 b_34 b_66 b_98
+* r7 = b_27 b_59 b_91 b_123 || ... || b_3 b_35 b_67 b_99
+* r8 = b_28 b_60 b_92 b_124 || ... || b_4 b_36 b_68 b_100
+* r9 = b_29 b_61 b_93 b_125 || ... || b_5 b_37 b_69 b_101
+* r10 = b_30 b_62 b_94 b_126 || ... || b_6 b_38 b_70 b_102
+* r11 = b_31 b_63 b_95 b_127 || ... || b_7 b_39 b_71 b_103
+******************************************************************************/
+.align 2
+packing:
+    movw    r3, #0x0f0f
+    movt    r3, #0x0f0f             // r3 <- 0x0f0f0f0f (mask for SWAPMOVE)
+    eor     r2, r3, r3, lsl #2      // r2 <- 0x33333333 (mask for SWAPMOVE)
+    eor     r1, r2, r2, lsl #1      // r1 <- 0x55555555 (mask for SWAPMOVE)
+    swpmv   r8, r4, r8, r4, r1, #1, r12     // first layer: mask 0x55555555, shift 1
+    swpmv   r9, r5, r9, r5, r1, #1, r12
+    swpmv   r10, r6, r10, r6, r1, #1, r12
+    swpmv   r11, r7, r11, r7, r1, #1, r12
+    swpmv   r0, r4, r5, r4, r2, #2, r12     // second layer: mask 0x33333333, shift 2; out0 goes to r0
+    swpmv   r9, r5, r9, r8, r2, #2, r12
+    swpmv   r7, r8, r7, r6, r2, #2, r12
+    swpmv   r11, r2, r11, r10, r2, #2, r12  // out1 overwrites the mask register r2 (last use of 0x33333333)
+    swpmv   r8, r4, r8, r4, r3, #4, r12     // third layer: mask 0x0f0f0f0f, shift 4
+    swpmv   r10, r6, r7, r0, r3, #4, r12
+    swpmv   r11, r7, r11, r9, r3, #4, r12
+    swpmv   r9, r5, r2, r5, r3, #4, r12     // r2 here is the data word written by the second layer
+    bx      lr
+
+/******************************************************************************
+* Unpacks the 256-bit internal state (r4-r11) into two 128-bit blocks; clobbers r1-r3, r12.
+******************************************************************************/
+.align 2
+unpacking:
+    movw    r3, #0x0f0f
+    movt    r3, #0x0f0f                 // r3 <- 0x0f0f0f0f (mask for SWAPMOVE)
+    swpmv   r2, r5, r9, r5, r3, #4, r12     // first layer: mask 0x0f0f0f0f, shift 4; out0 goes to r2
+    swpmv   r11, r9, r11, r7, r3, #4, r12
+    swpmv   r7, r1, r10, r6, r3, #4, r12    // out1 goes to r1 (consumed by the second layer)
+    swpmv   r8, r4, r8, r4, r3, #4, r12
+    eor     r3, r3, r3, lsl #2          // r3 <- 0x33333333 (mask for SWAPMOVE)
+    swpmv   r11, r10,r11, r2, r3, #2, r12   // second layer: mask 0x33333333, shift 2
+    swpmv   r7, r6, r7, r8, r3, #2, r12
+    swpmv   r9, r8, r9, r5, r3, #2, r12
+    swpmv   r5, r4, r1, r4, r3, #2, r12
+    eor     r1, r3, r3, lsl #1          // r1 <- 0x55555555 (mask for SWAPMOVE)
+    swpmv   r8, r4, r8, r4, r1, #1, r12     // third layer: mask 0x55555555, shift 1
+    swpmv   r9, r5,r9, r5, r1, #1, r12
+    swpmv   r10, r6, r10, r6, r1, #1, r12
+    swpmv   r11, r7, r11, r7, r1, #1, r12
+    bx      lr
+
+/******************************************************************************
+* Subroutine that computes the AddRoundKey and the S-box on the state in r4-r11.
+* Credits to https://github.com/Ko-/aes-armcortexm for the S-box implementation
+******************************************************************************/
+.align 2
+ark_sbox:
+    // add round key: [sp, #48] holds the pointer to the current round key
+    ldr.w   r1, [sp, #48]           // r1 <- round key pointer
+    ldmia   r1!, {r0,r2,r3,r12}     // load 4 round key words and advance the pointer
+    eor     r4, r0
+    eor     r5, r2
+    eor     r6, r3
+    eor     r7, r12
+    ldmia   r1!, {r0,r2,r3,r12}     // load the next 4 round key words and advance the pointer
+    eor     r8, r0
+    eor     r9, r2
+    eor     r10, r3
+    eor     r11, r12
+    str.w   r1, [sp, #48]           // save the advanced round key pointer
+    str     r14, [sp, #52]          // save link register (r14 is used as scratch below)
+    // sbox: credits to https://github.com/Ko-/aes-armcortexm ([sp, #0..#44] are used as spill slots)
+    eor     r1, r7, r9              //Exec y14 = U3 ^ U5; into r1
+    eor     r3, r4, r10             //Exec y13 = U0 ^ U6; into r3
+    eor     r2, r3, r1              //Exec y12 = y13 ^ y14; into r2
+    eor     r0, r8, r2              //Exec t1 = U4 ^ y12; into r0
+    eor     r14, r0, r9             //Exec y15 = t1 ^ U5; into r14
+    and     r12, r2, r14            //Exec t2 = y12 & y15; into r12
+    eor     r8, r14, r11            //Exec y6 = y15 ^ U7; into r8
+    eor     r0, r0, r5              //Exec y20 = t1 ^ U1; into r0
+    str.w   r2, [sp, #44]           //Store r2/y12 on stack
+    eor     r2, r4, r7              //Exec y9 = U0 ^ U3; into r2
+    str     r0, [sp, #40]           //Store r0/y20 on stack
+    eor     r0, r0, r2              //Exec y11 = y20 ^ y9; into r0
+    str     r2, [sp, #36]           //Store r2/y9 on stack
+    and     r2, r2, r0              //Exec t12 = y9 & y11; into r2
+    str     r8, [sp, #32]           //Store r8/y6 on stack
+    eor     r8, r11, r0             //Exec y7 = U7 ^ y11; into r8
+    eor     r9, r4, r9              //Exec y8 = U0 ^ U5; into r9
+    eor     r6, r5, r6              //Exec t0 = U1 ^ U2; into r6
+    eor     r5, r14, r6             //Exec y10 = y15 ^ t0; into r5
+    str     r14, [sp, #28]          //Store r14/y15 on stack
+    eor     r14, r5, r0             //Exec y17 = y10 ^ y11; into r14
+    str.w   r1, [sp, #24]           //Store r1/y14 on stack
+    and     r1, r1, r14             //Exec t13 = y14 & y17; into r1
+    eor     r1, r1, r2              //Exec t14 = t13 ^ t12; into r1
+    str     r14, [sp, #20]          //Store r14/y17 on stack
+    eor     r14, r5, r9             //Exec y19 = y10 ^ y8; into r14
+    str.w   r5, [sp, #16]           //Store r5/y10 on stack
+    and     r5, r9, r5              //Exec t15 = y8 & y10; into r5
+    eor     r2, r5, r2              //Exec t16 = t15 ^ t12; into r2
+    eor     r5, r6, r0              //Exec y16 = t0 ^ y11; into r5
+    str.w   r0, [sp, #12]           //Store r0/y11 on stack
+    eor     r0, r3, r5              //Exec y21 = y13 ^ y16; into r0
+    str     r3, [sp, #8]            //Store r3/y13 on stack
+    and     r3, r3, r5              //Exec t7 = y13 & y16; into r3
+    str     r5, [sp, #4]            //Store r5/y16 on stack
+    str     r11, [sp, #0]           //Store r11/U7 on stack
+    eor     r5, r4, r5              //Exec y18 = U0 ^ y16; into r5
+    eor     r6, r6, r11             //Exec y1 = t0 ^ U7; into r6
+    eor     r7, r6, r7              //Exec y4 = y1 ^ U3; into r7
+    and     r11, r7, r11            //Exec t5 = y4 & U7; into r11
+    eor     r11, r11, r12           //Exec t6 = t5 ^ t2; into r11
+    eor     r11, r11, r2            //Exec t18 = t6 ^ t16; into r11
+    eor     r14, r11, r14           //Exec t22 = t18 ^ y19; into r14
+    eor     r4, r6, r4              //Exec y2 = y1 ^ U0; into r4
+    and     r11, r4, r8             //Exec t10 = y2 & y7; into r11
+    eor     r11, r11, r3            //Exec t11 = t10 ^ t7; into r11
+    eor     r2, r11, r2             //Exec t20 = t11 ^ t16; into r2
+    eor     r2, r2, r5              //Exec t24 = t20 ^ y18; into r2
+    eor     r10, r6, r10            //Exec y5 = y1 ^ U6; into r10
+    and     r11, r10, r6            //Exec t8 = y5 & y1; into r11
+    eor     r3, r11, r3             //Exec t9 = t8 ^ t7; into r3
+    eor     r3, r3, r1              //Exec t19 = t9 ^ t14; into r3
+    eor     r3, r3, r0              //Exec t23 = t19 ^ y21; into r3
+    eor     r0, r10, r9             //Exec y3 = y5 ^ y8; into r0
+    ldr     r11, [sp, #32]          //Load y6 into r11
+    and     r5, r0, r11             //Exec t3 = y3 & y6; into r5
+    eor     r12, r5, r12            //Exec t4 = t3 ^ t2; into r12
+    ldr     r5, [sp, #40]           //Load y20 into r5
+    str     r7, [sp, #32]           //Store r7/y4 on stack
+    eor     r12, r12, r5            //Exec t17 = t4 ^ y20; into r12
+    eor     r1, r12, r1             //Exec t21 = t17 ^ t14; into r1
+    and     r12, r1, r3             //Exec t26 = t21 & t23; into r12
+    eor     r5, r2, r12             //Exec t27 = t24 ^ t26; into r5
+    eor     r12, r14, r12           //Exec t31 = t22 ^ t26; into r12
+    eor     r1, r1, r14             //Exec t25 = t21 ^ t22; into r1
+    and     r7, r1, r5              //Exec t28 = t25 & t27; into r7
+    eor     r14, r7, r14            //Exec t29 = t28 ^ t22; into r14
+    and     r4, r14, r4             //Exec z14 = t29 & y2; into r4
+    and     r8, r14, r8             //Exec z5 = t29 & y7; into r8
+    eor     r7, r3, r2              //Exec t30 = t23 ^ t24; into r7
+    and     r12, r12, r7            //Exec t32 = t31 & t30; into r12
+    eor     r12, r12, r2            //Exec t33 = t32 ^ t24; into r12
+    eor     r7, r5, r12             //Exec t35 = t27 ^ t33; into r7
+    and     r2, r2, r7              //Exec t36 = t24 & t35; into r2
+    eor     r5, r5, r2              //Exec t38 = t27 ^ t36; into r5
+    and     r5, r14, r5             //Exec t39 = t29 & t38; into r5
+    eor     r1, r1, r5              //Exec t40 = t25 ^ t39; into r1
+    eor     r5, r14, r1             //Exec t43 = t29 ^ t40; into r5
+    ldr.w   r7, [sp, #4]            //Load y16 into r7
+    and     r7, r5, r7              //Exec z3 = t43 & y16; into r7
+    eor     r8, r7, r8              //Exec tc12 = z3 ^ z5; into r8
+    str     r8, [sp, #40]           //Store r8/tc12 on stack
+    ldr     r8, [sp, #8]            //Load y13 into r8
+    and     r8, r5, r8              //Exec z12 = t43 & y13; into r8
+    and     r10, r1, r10            //Exec z13 = t40 & y5; into r10
+    and     r6, r1, r6              //Exec z4 = t40 & y1; into r6
+    eor     r6, r7, r6              //Exec tc6 = z3 ^ z4; into r6
+    eor     r3, r3, r12             //Exec t34 = t23 ^ t33; into r3
+    eor     r3, r2, r3              //Exec t37 = t36 ^ t34; into r3
+    eor     r1, r1, r3              //Exec t41 = t40 ^ t37; into r1
+    ldr.w   r5, [sp, #16]           //Load y10 into r5
+    and     r2, r1, r5              //Exec z8 = t41 & y10; into r2
+    and     r9, r1, r9              //Exec z17 = t41 & y8; into r9
+    str     r9, [sp, #16]           //Store r9/z17 on stack
+    eor     r5, r12, r3             //Exec t44 = t33 ^ t37; into r5
+    ldr     r9, [sp, #28]           //Load y15 into r9
+    ldr.w   r7, [sp, #44]           //Load y12 into r7
+    and     r9, r5, r9              //Exec z0 = t44 & y15; into r9
+    and     r7, r5, r7              //Exec z9 = t44 & y12; into r7
+    and     r0, r3, r0              //Exec z10 = t37 & y3; into r0
+    and     r3, r3, r11             //Exec z1 = t37 & y6; into r3
+    eor     r3, r3, r9              //Exec tc5 = z1 ^ z0; into r3
+    eor     r3, r6, r3              //Exec tc11 = tc6 ^ tc5; into r3
+    ldr     r11, [sp, #32]          //Load y4 into r11
+    ldr.w   r5, [sp, #20]           //Load y17 into r5
+    and     r11, r12, r11           //Exec z11 = t33 & y4; into r11
+    eor     r14, r14, r12           //Exec t42 = t29 ^ t33; into r14
+    eor     r1, r14, r1             //Exec t45 = t42 ^ t41; into r1
+    and     r5, r1, r5              //Exec z7 = t45 & y17; into r5
+    eor     r6, r5, r6              //Exec tc8 = z7 ^ tc6; into r6
+    ldr     r5, [sp, #24]           //Load y14 into r5
+    str     r4, [sp, #32]           //Store r4/z14 on stack
+    and     r1, r1, r5              //Exec z16 = t45 & y14; into r1
+    ldr     r5, [sp, #12]           //Load y11 into r5
+    ldr     r4, [sp, #36]           //Load y9 into r4
+    and     r5, r14, r5             //Exec z6 = t42 & y11; into r5
+    eor     r5, r5, r6              //Exec tc16 = z6 ^ tc8; into r5
+    and     r4, r14, r4             //Exec z15 = t42 & y9; into r4
+    eor     r14, r4, r5             //Exec tc20 = z15 ^ tc16; into r14
+    eor     r4, r4, r1              //Exec tc1 = z15 ^ z16; into r4
+    eor     r1, r0, r4              //Exec tc2 = z10 ^ tc1; into r1
+    eor     r0, r1, r11             //Exec tc21 = tc2 ^ z11; into r0
+    eor     r7, r7, r1              //Exec tc3 = z9 ^ tc2; into r7
+    eor     r1, r7, r5              //Exec S0 = tc3 ^ tc16; into r1
+    eor     r7, r7, r3              //Exec S3 = tc3 ^ tc11; into r7
+    eor     r3, r7, r5              //Exec S1 = S3 ^ tc16 ^ 1; into r3 (the '^ 1' is not executed here; presumably handled elsewhere, TODO confirm)
+    eor     r11, r10, r4            //Exec tc13 = z13 ^ tc1; into r11
+    ldr.w   r4, [sp, #0]            //Load U7 into r4
+    and     r12, r12, r4            //Exec z2 = t33 & U7; into r12
+    eor     r9, r9, r12             //Exec tc4 = z0 ^ z2; into r9
+    eor     r12, r8, r9             //Exec tc7 = z12 ^ tc4; into r12
+    eor     r2, r2, r12             //Exec tc9 = z8 ^ tc7; into r2
+    eor     r2, r6, r2              //Exec tc10 = tc8 ^ tc9; into r2
+    ldr.w   r4, [sp, #32]           //Load z14 into r4
+    eor     r12, r4, r2             //Exec tc17 = z14 ^ tc10; into r12
+    eor     r0, r0, r12             //Exec S5 = tc21 ^ tc17; into r0
+    eor     r6, r12, r14            //Exec tc26 = tc17 ^ tc20; into r6
+    ldr.w   r4, [sp, #16]           //Load z17 into r4
+    ldr     r12, [sp, #40]          //Load tc12 into r12
+    eor     r6, r6, r4              //Exec S2 = tc26 ^ z17 ^ 1; into r6 (the '^ 1' is not executed here)
+    eor     r12, r9, r12            //Exec tc14 = tc4 ^ tc12; into r12
+    eor     r14, r11, r12           //Exec tc18 = tc13 ^ tc14; into r14
+    eor     r2, r2, r14             //Exec S6 = tc10 ^ tc18 ^ 1; into r2 (the '^ 1' is not executed here)
+    eor     r11, r8, r14            //Exec S7 = z12 ^ tc18 ^ 1; into r11 (the '^ 1' is not executed here)
+    ldr     r14, [sp, #52]          // restore link register
+    eor     r8, r12, r7             //Exec S4 = tc14 ^ S3; into r8
+    bx      lr
+    // Output registers: [('r0', 'S5'), ('r1', 'S0'), ('r2', 'S6'), ('r3', 'S1'),
+    // ('r6', 'S2'),('r7', 'S3'), ('r8', 'S4'), ('r11', 'S7')]
+
+/******************************************************************************
+* Computation of the MixColumns transformation in the fixsliced representation.
+* For fully-fixsliced implementations, it is used for rounds i s.t. (i%4) == 0.
+* For semi-fixsliced implementations, it is used for rounds i s.t. (i%2) == 0.
+******************************************************************************/
+.align 2
+mixcolumns_0:
+    str     r14, [sp, #52]          // save lr at [sp, #52] (reloaded before return)
+    movw    r12, #0x0303            // r12<- 0x00000303
+    movt    r12, #0x0303            // r12<- 0x03030303 (mask argument of mc_0_2)
+    mc_0_2  r12, 6, 2, 26, 18
+    ldr     r14, [sp, #52]          // restore link register
+    bx      lr
+
+/******************************************************************************
+* Computation of the MixColumns transformation in the fixsliced representation.
+* For fully-fixsliced implementations only, for round i s.t. (i%4) == 1.
+******************************************************************************/
+.align 2
+mixcolumns_1:
+    str     r14, [sp, #52]          // save lr: r14 is reused as the mask below
+    movw    r14, #0x0f0f
+    movt    r14, #0x0f0f            // r14<- 0x0f0f0f0f (mask for BYTE_ROR_4)
+    and     r5, r14, r1, lsr #4     // r5 <- (S0 >> 4) & 0x0f0f0f0f
+    and     r9, r14, r1             // r9 <- S0 & 0x0f0f0f0f
+    orr     r5, r5, r9, lsl #4      // r5 <- BYTE_ROR_4(S0)
+    eor     r4, r1, r5, ror #8      // r4 <- S0 ^ (BYTE_ROR_4(S0) >>> 8)
+    mov.w   r1, r5, ror #8          // r1 <- (BYTE_ROR_4(S0) >>> 8)
+    and     r5, r14, r11, lsr #4    // r5 <- (S7 >> 4) & 0x0f0f0f0f
+    and     r9, r14, r11            // r9 <- S7 & 0x0f0f0f0f
+    orr     r5, r5, r9, lsl #4      // r5 <- BYTE_ROR_4(S7)
+    eor     r12, r11, r5, ror #8    // r12<- S7 ^ (BYTE_ROR_4(S7) >>> 8)
+    eor     r10, r4, r12            // r10<- r4 ^ r12
+    eor     r11, r10                // r11<- S7 ^ r4 ^ r12
+    eor     r11, r11, r12, ror #16  // r11<- r11 ^ (r12 >>> 16)
+    and     r5, r14, r2, lsr #4     // r5 <- (S6 >> 4) & 0x0f0f0f0f
+    and     r9, r14, r2             // r9 <- S6 & 0x0f0f0f0f
+    orr     r5, r5, r9, lsl #4      // r5 <- BYTE_ROR_4(S6)
+    eor     r10, r10, r5, ror #8    // r10<- r10 ^ (BYTE_ROR_4(S6) >>> 8)
+    eor     r12, r2, r5, ror #8     // r12<- S6 ^ (BYTE_ROR_4(S6) >>> 8)
+    eor     r10, r10, r12, ror #16  // r10<- r10 ^ (r12 >>> 16)
+    and     r5, r14, r0, lsr #4     // r5 <- (S5 >> 4) & 0x0f0f0f0f
+    and     r9, r14, r0             // r9 <- S5 & 0x0f0f0f0f
+    orr     r5, r5, r9, lsl #4      // r5 <- BYTE_ROR_4(S5)
+    eor     r9, r12, r5, ror #8     // r9 <- r12 ^ (BYTE_ROR_4(S5) >>> 8)
+    eor     r12, r0, r5, ror #8     // r12<- S5 ^ (BYTE_ROR_4(S5) >>> 8)
+    eor     r9, r9, r12, ror #16    // r9 <- r9 ^ (r12 >>> 16)
+    eor     r0, r4, r12             // r0 <- r12 ^ S0 ^ (BYTE_ROR_4(S0) >>> 8)
+    and     r5, r14, r8, lsr #4     // r5 <- (S4 >> 4) & 0x0f0f0f0f
+    and     r2, r14, r8             // r2 <- S4 & 0x0f0f0f0f
+    orr     r2, r5, r2, lsl #4      // r2 <- BYTE_ROR_4(S4)
+    eor     r0, r0, r2, ror #8      // r0 <- r0 ^ (BYTE_ROR_4(S4) >>> 8)
+    eor     r2, r8, r2, ror #8      // r2 <- S4 ^ (BYTE_ROR_4(S4) >>> 8)
+    eor     r8, r0, r2, ror #16     // r8 <- r0 ^ (r2 >>> 16)
+    eor     r2, r4                  // r2 <- r2 ^ S0 ^ (BYTE_ROR_4(S0) >>> 8)
+    and     r5, r14, r7, lsr #4     // r5 <- (S3 >> 4) & 0x0f0f0f0f
+    and     r0, r14, r7             // r0 <- S3 & 0x0f0f0f0f
+    orr     r0, r5, r0, lsl #4      // r0 <- BYTE_ROR_4(S3)
+    eor     r2, r2, r0, ror #8      // r2 <- r2 ^ (BYTE_ROR_4(S3) >>> 8)
+    eor     r0, r7, r0, ror #8      // r0 <- S3 ^ (BYTE_ROR_4(S3) >>> 8)
+    eor     r7, r2, r0, ror #16     // r7 <- r2 ^ (r0 >>> 16)
+    and     r5, r14, r6, lsr #4     // r5 <- (S2 >> 4) & 0x0f0f0f0f
+    and     r2, r14, r6             // r2 <- S2 & 0x0f0f0f0f
+    orr     r2, r5, r2, lsl #4      // r2 <- BYTE_ROR_4(S2)
+    eor     r0, r0, r2, ror #8      // r0 <- r0 ^ (BYTE_ROR_4(S2) >>> 8)
+    eor     r2, r6, r2, ror #8      // r2 <- S2 ^ (BYTE_ROR_4(S2) >>> 8)
+    eor     r6, r0, r2, ror #16     // r6 <- r0 ^ (r2 >>> 16)
+    and     r5, r14, r3, lsr #4     // r5 <- (S1 >> 4) & 0x0f0f0f0f
+    and     r0, r14, r3             // r0 <- S1 & 0x0f0f0f0f
+    orr     r0, r5, r0, lsl #4      // r0 <- BYTE_ROR_4(S1)
+    ldr     r14, [sp, #52]          // restore link register (mask no longer needed)
+    eor     r2, r2, r0, ror #8      // r2 <- r2 ^ (BYTE_ROR_4(S1) >>> 8)
+    eor     r0, r3, r0, ror #8      // r0 <- S1 ^ (BYTE_ROR_4(S1) >>> 8)
+    eor     r5, r2, r0, ror #16     // r5 <- r2 ^ (r0 >>> 16)
+    eor     r1, r0, r1              // r1 <- r0 ^ (BYTE_ROR_4(S0) >>> 8)
+    eor     r4, r1, r4, ror #16     // r4 <- r1 ^ (r4 >>> 16)
+    bx      lr
+
+/******************************************************************************
+* Computation of the MixColumns transformation in the fixsliced representation.
+* For fully-fixsliced implementations only, for rounds i s.t. (i%4) == 2.
+******************************************************************************/
+.align 2
+mixcolumns_2:
+    str     r14, [sp, #52]          // save lr at [sp, #52] (reloaded before return)
+    movw    r12, #0x3f3f            // r12<- 0x00003f3f
+    movt    r12, #0x3f3f            // r12<- 0x3f3f3f3f (mask argument of mc_0_2)
+    mc_0_2  r12, 2, 6, 30, 22
+    ldr     r14, [sp, #52]          // restore link register
+    bx      lr
+
+/******************************************************************************
+* Computation of the MixColumns transformation in the fixsliced representation.
+* For fully-fixsliced implementations, it is used for rounds i s.t. (i%4) == 3.
+* For semi-fixsliced implementations, it is used for rounds i s.t. (i%2) == 1.
+* Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm.
+******************************************************************************/
+.align 2
+mixcolumns_3:
+    eor     r12, r11, r11, ror #8   // r12<- S7 ^ (S7 >>> 8)
+    eor     r4, r1, r1, ror #8      // r4 <- S0 ^ (S0 >>> 8)
+    eor     r11, r4, r11, ror #8    // r11<- S0 ^ (S0 >>> 8) ^ (S7 >>> 8)
+    eor     r11, r11, r12, ror #16  // r11<- r11 ^ (S7 >>> 16) ^ (S7 >>> 24)
+    eor     r10, r12, r2, ror #8    // r10<- S7 ^ (S7 >>> 8) ^ (S6 >>> 8)
+    eor     r12, r2, r2, ror #8     // r12<- S6 ^ (S6 >>> 8)
+    eor     r10, r10, r12, ror #16  // r10<- r10 ^ (S6 >>> 16) ^ (S6 >>> 24)
+    eor     r10, r4                 // r10<- r10 ^ S0 ^ (S0 >>> 8)
+    eor     r9, r12, r0, ror #8     // r9 <- S6 ^ (S6 >>> 8) ^ (S5 >>> 8)
+    eor     r12, r0, r0, ror #8     // r12<- S5 ^ (S5 >>> 8)
+    eor     r9, r9, r12, ror #16    // r9 <- r9 ^ (S5 >>> 16) ^ (S5 >>> 24)
+    eor     r2, r8, r8, ror #8      // r2 <- S4 ^ (S4 >>> 8)
+    eor     r8, r12, r8, ror #8     // r8 <- S5 ^ (S5 >>> 8) ^ (S4 >>> 8)
+    eor     r8, r4                  // r8 <- r8 ^ S0 ^ (S0 >>> 8)
+    eor     r8, r8, r2, ror #16     // r8 <- r8 ^ (S4 >>> 16) ^ (S4 >>> 24)
+    eor     r12, r7, r7, ror #8     // r12<- S3 ^ (S3 >>> 8)
+    eor     r7, r2, r7, ror #8      // r7 <- S4 ^ (S4 >>> 8) ^ (S3 >>> 8)
+    eor     r7, r4                  // r7 <- r7 ^ S0 ^ (S0 >>> 8)
+    eor     r7, r7, r12, ror #16    // r7 <- r7 ^ (S3 >>> 16) ^ (S3 >>> 24)
+    eor     r2, r6, r6, ror #8      // r2 <- S2 ^ (S2 >>> 8)
+    eor     r6, r12, r6, ror #8     // r6 <- S3 ^ (S3 >>> 8) ^ (S2 >>> 8)
+    eor     r6, r6, r2, ror #16     // r6 <- r6 ^ (S2 >>> 16) ^ (S2 >>> 24)
+    eor     r12, r3, r3, ror #8     // r12<- S1 ^ (S1 >>> 8)
+    eor     r5, r2, r3, ror #8      // r5 <- S2 ^ (S2 >>> 8) ^ (S1 >>> 8)
+    eor     r5, r5, r12, ror #16    // r5 <- r5 ^ (S1 >>> 16) ^ (S1 >>> 24)
+    eor     r4, r12, r4, ror #16    // r4 <- S1 ^ (S1 >>> 8) ^ (r4 >>> 16)
+    eor     r4, r4, r1, ror #8      // r4 <- r4 ^ (S0 >>> 8)
+    bx      lr                      // results in r4-r11; lr was never clobbered
+
+/******************************************************************************
+* Applies the ShiftRows transformation twice (i.e. SR^2) on the internal state.
+******************************************************************************/
+.align 2
+double_shiftrows:
+    movw    r10, #0x0f00            // r10<- 0x00000f00
+    movt    r10, #0x0f00            // r10<- 0x0f000f00 (SWAPMOVE mask; r12 is scratch)
+    swpmv   r0, r0, r0, r0, r10, #4, r12
+    swpmv   r1, r1, r1, r1, r10, #4, r12
+    swpmv   r2, r2, r2, r2, r10, #4, r12
+    swpmv   r3, r3, r3, r3, r10, #4, r12
+    swpmv   r6, r6, r6, r6, r10, #4, r12
+    swpmv   r7, r7, r7, r7, r10, #4, r12
+    swpmv   r8, r8, r8, r8, r10, #4, r12
+    swpmv   r11, r11, r11, r11, r10, #4, r12
+    bx      lr                      // state stays in r0-r3, r6-r8 and r11
+
+/******************************************************************************
+* Fully-fixsliced implementation of AES-128.
+*
+* Two blocks are encrypted in parallel, without any operating mode.
+*
+* Note that the 5th argument ('rkey') is passed on the stack by the caller; it
+* is read at [sp, #112], i.e. above the 56 saved bytes and the 56 tmp bytes.
+******************************************************************************/
+@ void aes128_encrypt_ffs(u8* ctext, u8* ctext_bis, const u8* ptext,
+@                   const u8* ptext_bis, const u32* rkey);
+.global aes128_encrypt_ffs
+.type   aes128_encrypt_ffs,%function
+.align 2
+aes128_encrypt_ffs:
+    push    {r0-r12,r14}            // save context (14 registers = 56 bytes)
+    sub.w   sp, #56                 // allow space on the stack for tmp var
+    ldr.w   r4, [r2]                // load the 1st 128-bit blocks in r4-r7
+    ldr     r5, [r2, #4]
+    ldr     r6, [r2, #8]
+    ldr     r7, [r2, #12]
+    ldr.w   r8, [r3]                // load the 2nd 128-bit blocks in r8-r11
+    ldr     r9, [r3, #4]
+    ldr     r10,[r3, #8]
+    ldr     r11,[r3, #12]
+    ldr.w   r1, [sp, #112]          // load 'rkey' argument from the stack
+    str.w   r1, [sp, #48]           // store it there for 'add_round_key'
+    bl      packing                 // pack the 2 input blocks
+    bl      ark_sbox                // ark + sbox (round 0)
+    bl      mixcolumns_0            // mixcolumns (round 0)
+    bl      ark_sbox                // ark + sbox (round 1)
+    bl      mixcolumns_1            // mixcolumns (round 1)
+    bl      ark_sbox                // ark + sbox (round 2)
+    bl      mixcolumns_2            // mixcolumns (round 2)
+    bl      ark_sbox                // ark + sbox (round 3)
+    bl      mixcolumns_3            // mixcolumns (round 3)
+    bl      ark_sbox                // ark + sbox (round 4)
+    bl      mixcolumns_0            // mixcolumns (round 4)
+    bl      ark_sbox                // ark + sbox (round 5)
+    bl      mixcolumns_1            // mixcolumns (round 5)
+    bl      ark_sbox                // ark + sbox (round 6)
+    bl      mixcolumns_2            // mixcolumns (round 6)
+    bl      ark_sbox                // ark + sbox (round 7)
+    bl      mixcolumns_3            // mixcolumns (round 7)
+    bl      ark_sbox                // ark + sbox (round 8)
+    bl      mixcolumns_0            // mixcolumns (round 8)
+    bl      ark_sbox                // ark + sbox (round 9, no mixcolumns after it)
+    bl      double_shiftrows        // to resynchronize with the classical rep
+    ldr     r14, [sp, #48]          // r14<- rkey pointer kept at [sp, #48] ----
+    ldmia   r14!, {r4,r5,r10,r12}   // load round key words 0-3
+    eor     r4, r1                  // r4 <- rk0 ^ S0
+    eor     r5, r3                  // r5 <- rk1 ^ S1
+    eor     r6, r10                 // r6 <- S2 ^ rk2
+    eor     r7, r12                 // r7 <- S3 ^ rk3   (last add_round_key)
+    ldmia   r14!, {r1,r3,r10,r12}   // load round key words 4-7
+    eor     r8, r1                  // r8 <- S4 ^ rk4
+    eor     r9, r0, r3              // r9 <- S5 ^ rk5
+    eor     r10, r2                 // r10<- rk6 ^ S6
+    eor     r11, r12                // r11<- S7 ^ rk7 -------------------------
+    bl      unpacking               // unpack the internal state
+    ldrd    r0, r1, [sp, #56]       // restore the addr to store the ciphertext
+    add.w   sp, #64                 // drop the 56 tmp bytes and the saved r0, r1
+    str.w   r4, [r0]                // store the 1st ciphertext block
+    str     r5, [r0, #4]
+    str     r6, [r0, #8]
+    str     r7, [r0, #12]
+    str.w   r8, [r1]               // store the 2nd ciphertext block
+    str     r9, [r1, #4]
+    str     r10,[r1, #8]
+    str     r11,[r1, #12]
+    pop     {r2-r12, r14}           // restore context
+    bx      lr
+
+/******************************************************************************
+* Fully-fixsliced implementation of AES-256.
+*
+* Two blocks are encrypted in parallel, without any operating mode.
+*
+* Note that the 5th argument ('rkey') is passed on the stack by the caller; it
+* is read at [sp, #112], i.e. above the 56 saved bytes and the 56 tmp bytes.
+******************************************************************************/
+@ void aes256_encrypt_ffs(u8* ctext, u8* ctext_bis, const u8* ptext,
+@                   const u8* ptext_bis, const u32* rkey);
+.global aes256_encrypt_ffs
+.type   aes256_encrypt_ffs,%function
+.align 2
+aes256_encrypt_ffs:
+    push    {r0-r12,r14}            // save context (14 registers = 56 bytes)
+    sub.w   sp, #56                 // allow space on the stack for tmp var
+    ldr.w   r4, [r2]                // load the 1st 128-bit blocks in r4-r7
+    ldr     r5, [r2, #4]
+    ldr     r6, [r2, #8]
+    ldr     r7, [r2, #12]
+    ldr.w   r8, [r3]                // load the 2nd 128-bit blocks in r8-r11
+    ldr     r9, [r3, #4]
+    ldr     r10,[r3, #8]
+    ldr     r11,[r3, #12]
+    ldr.w   r1, [sp, #112]          // load 'rkey' argument from the stack
+    str.w   r1, [sp, #48]           // store it there for 'add_round_key'
+    bl      packing                 // pack the 2 input blocks
+    bl      ark_sbox                // ark + sbox (round 0)
+    bl      mixcolumns_0            // mixcolumns (round 0)
+    bl      ark_sbox                // ark + sbox (round 1)
+    bl      mixcolumns_1            // mixcolumns (round 1)
+    bl      ark_sbox                // ark + sbox (round 2)
+    bl      mixcolumns_2            // mixcolumns (round 2)
+    bl      ark_sbox                // ark + sbox (round 3)
+    bl      mixcolumns_3            // mixcolumns (round 3)
+    bl      ark_sbox                // ark + sbox (round 4)
+    bl      mixcolumns_0            // mixcolumns (round 4)
+    bl      ark_sbox                // ark + sbox (round 5)
+    bl      mixcolumns_1            // mixcolumns (round 5)
+    bl      ark_sbox                // ark + sbox (round 6)
+    bl      mixcolumns_2            // mixcolumns (round 6)
+    bl      ark_sbox                // ark + sbox (round 7)
+    bl      mixcolumns_3            // mixcolumns (round 7)
+    bl      ark_sbox                // ark + sbox (round 8)
+    bl      mixcolumns_0            // mixcolumns (round 8)
+    bl      ark_sbox                // ark + sbox (round 9)
+    bl      mixcolumns_1            // mixcolumns (round 9)
+    bl      ark_sbox                // ark + sbox (round 10)
+    bl      mixcolumns_2            // mixcolumns (round 10)
+    bl      ark_sbox                // ark + sbox (round 11)
+    bl      mixcolumns_3            // mixcolumns (round 11)
+    bl      ark_sbox                // ark + sbox (round 12)
+    bl      mixcolumns_0            // mixcolumns (round 12)
+    bl      ark_sbox                // ark + sbox (round 13, no mixcolumns after it)
+    bl      double_shiftrows        // to resynchronize with the classical rep
+    ldr     r14, [sp, #48]          // r14<- rkey pointer kept at [sp, #48] ----
+    ldmia   r14!, {r4,r5,r10,r12}   // load round key words 0-3
+    eor     r4, r1                  // r4 <- rk0 ^ S0
+    eor     r5, r3                  // r5 <- rk1 ^ S1
+    eor     r6, r10                 // r6 <- S2 ^ rk2
+    eor     r7, r12                 // r7 <- S3 ^ rk3   (last add_round_key)
+    ldmia   r14!, {r1,r3,r10,r12}   // load round key words 4-7
+    eor     r8, r1                  // r8 <- S4 ^ rk4
+    eor     r9, r0, r3              // r9 <- S5 ^ rk5
+    eor     r10, r2                 // r10<- rk6 ^ S6
+    eor     r11, r12                // r11<- S7 ^ rk7 -------------------------
+    bl      unpacking               // unpack the internal state
+    ldrd    r0, r1, [sp, #56]       // restore the addr to store the ciphertext
+    add.w   sp, #64                 // drop the 56 tmp bytes and the saved r0, r1
+    str.w   r4, [r0]                // store the 1st ciphertext block
+    str     r5, [r0, #4]
+    str     r6, [r0, #8]
+    str     r7, [r0, #12]
+    str.w   r8, [r1]                // store the 2nd ciphertext block
+    str     r9, [r1, #4]
+    str     r10,[r1, #8]
+    str     r11,[r1, #12]
+    pop     {r2-r12, r14}           // restore context
+    bx      lr
\ No newline at end of file
diff --git a/common/aes-keyschedule.S b/common/aes-keyschedule.S
new file mode 100644
index 0000000..246bc5f
--- /dev/null
+++ b/common/aes-keyschedule.S
@@ -0,0 +1,851 @@
+/******************************************************************************
+* ARM assembly implementations of the AES-128 and AES-256 key schedule to
+* match fixslicing.
+* Note that those implementations are fully bitsliced and do not rely on any
+* Look-Up Table (LUT).
+*
+* See the paper at https://eprint.iacr.org/2020/1123.pdf for more details.
+*
+* @author   Alexandre Adomnicai, Nanyang Technological University, Singapore
+*           alexandre.adomnicai@ntu.edu.sg
+*
+* @date     October 2020
+******************************************************************************/
+
+.syntax unified
+.thumb
+
+/******************************************************************************
+* Macro to compute the SWAPMOVE technique: swap the bits in 'in1' masked by 'm'
+* with the bits in 'in0' masked by 'm << n'; results go to 'out0' and 'out1'.
+******************************************************************************/
+.macro swpmv out0, out1, in0, in1, m, n, tmp
+    eor     \tmp, \in1, \in0, lsr \n    // tmp  <- in1 ^ (in0 >> n)
+    and     \tmp, \m                    // tmp  <- tmp & m (bits that differ)
+    eor     \out1, \in1, \tmp           // out1 <- in1 ^ tmp
+    eor     \out0, \in0, \tmp, lsl \n   // out0 <- in0 ^ (tmp << n)
+.endm
+
+/******************************************************************************
+* Packing routine (SWAPMOVE passes with shifts 1, 2 and 4 over r4-r11). It is
+* the same as in the encryption file, so merging the two would save code size.
+******************************************************************************/
+.align 2
+packing:
+    movw    r3, #0x0f0f
+    movt    r3, #0x0f0f             // r3 <- 0x0f0f0f0f (mask for SWAPMOVE)
+    eor     r2, r3, r3, lsl #2      // r2 <- 0x33333333 (mask, later reused as an output)
+    eor     r1, r2, r2, lsl #1      // r1 <- 0x55555555 (mask for SWAPMOVE)
+    swpmv   r8, r4, r8, r4, r1, #1, r12
+    swpmv   r9, r5, r9, r5, r1, #1, r12
+    swpmv   r10, r6, r10, r6, r1, #1, r12
+    swpmv   r11, r7, r11, r7, r1, #1, r12
+    swpmv   r0, r4, r5, r4, r2, #2, r12
+    swpmv   r9, r5, r9, r8, r2, #2, r12
+    swpmv   r7, r8, r7, r6, r2, #2, r12
+    swpmv   r11, r2, r11, r10, r2, #2, r12
+    swpmv   r8, r4, r8, r4, r3, #4, r12
+    swpmv   r10, r6, r7, r0, r3, #4, r12
+    swpmv   r11, r7, r11, r9, r3, #4, r12
+    swpmv   r9, r5, r2, r5, r3, #4, r12
+    bx      lr                      // result in r4-r11; r0-r3 and r12 are clobbered
+
+/******************************************************************************
+* S-box on the state U0-U7 = r4-r11; uses [sp, #0..#52] as scratch. Same code
+* as in the encryption function, so merging the 2 files would save code size.
+* Credits to https://github.com/Ko-/aes-armcortexm.
+******************************************************************************/
+.align 2
+sbox:
+    str     r14, [sp, #52]          // store link register (r14 is used as a temporary)
+    eor     r1, r7, r9              //Exec y14 = U3 ^ U5; into r1
+    eor     r3, r4, r10             //Exec y13 = U0 ^ U6; into r3
+    eor     r2, r3, r1              //Exec y12 = y13 ^ y14; into r2
+    eor     r0, r8, r2              //Exec t1 = U4 ^ y12; into r0
+    eor     r14, r0, r9             //Exec y15 = t1 ^ U5; into r14
+    and     r12, r2, r14            //Exec t2 = y12 & y15; into r12
+    eor     r8, r14, r11            //Exec y6 = y15 ^ U7; into r8
+    eor     r0, r0, r5              //Exec y20 = t1 ^ U1; into r0
+    str.w   r2, [sp, #44]           //Store r2/y12 on stack
+    eor     r2, r4, r7              //Exec y9 = U0 ^ U3; into r2
+    str     r0, [sp, #40]           //Store r0/y20 on stack
+    eor     r0, r0, r2              //Exec y11 = y20 ^ y9; into r0
+    str     r2, [sp, #36]           //Store r2/y9 on stack
+    and     r2, r2, r0              //Exec t12 = y9 & y11; into r2
+    str     r8, [sp, #32]           //Store r8/y6 on stack
+    eor     r8, r11, r0             //Exec y7 = U7 ^ y11; into r8
+    eor     r9, r4, r9              //Exec y8 = U0 ^ U5; into r9
+    eor     r6, r5, r6              //Exec t0 = U1 ^ U2; into r6
+    eor     r5, r14, r6             //Exec y10 = y15 ^ t0; into r5
+    str     r14, [sp, #28]          //Store r14/y15 on stack
+    eor     r14, r5, r0             //Exec y17 = y10 ^ y11; into r14
+    str.w   r1, [sp, #24]           //Store r1/y14 on stack
+    and     r1, r1, r14             //Exec t13 = y14 & y17; into r1
+    eor     r1, r1, r2              //Exec t14 = t13 ^ t12; into r1
+    str     r14, [sp, #20]          //Store r14/y17 on stack
+    eor     r14, r5, r9             //Exec y19 = y10 ^ y8; into r14
+    str.w   r5, [sp, #16]           //Store r5/y10 on stack
+    and     r5, r9, r5              //Exec t15 = y8 & y10; into r5
+    eor     r2, r5, r2              //Exec t16 = t15 ^ t12; into r2
+    eor     r5, r6, r0              //Exec y16 = t0 ^ y11; into r5
+    str.w   r0, [sp, #12]           //Store r0/y11 on stack
+    eor     r0, r3, r5              //Exec y21 = y13 ^ y16; into r0
+    str     r3, [sp, #8]            //Store r3/y13 on stack
+    and     r3, r3, r5              //Exec t7 = y13 & y16; into r3
+    str     r5, [sp, #4]            //Store r5/y16 on stack
+    str     r11, [sp, #0]           //Store r11/U7 on stack
+    eor     r5, r4, r5              //Exec y18 = U0 ^ y16; into r5
+    eor     r6, r6, r11             //Exec y1 = t0 ^ U7; into r6
+    eor     r7, r6, r7              //Exec y4 = y1 ^ U3; into r7
+    and     r11, r7, r11            //Exec t5 = y4 & U7; into r11
+    eor     r11, r11, r12           //Exec t6 = t5 ^ t2; into r11
+    eor     r11, r11, r2            //Exec t18 = t6 ^ t16; into r11
+    eor     r14, r11, r14           //Exec t22 = t18 ^ y19; into r14
+    eor     r4, r6, r4              //Exec y2 = y1 ^ U0; into r4
+    and     r11, r4, r8             //Exec t10 = y2 & y7; into r11
+    eor     r11, r11, r3            //Exec t11 = t10 ^ t7; into r11
+    eor     r2, r11, r2             //Exec t20 = t11 ^ t16; into r2
+    eor     r2, r2, r5              //Exec t24 = t20 ^ y18; into r2
+    eor     r10, r6, r10            //Exec y5 = y1 ^ U6; into r10
+    and     r11, r10, r6            //Exec t8 = y5 & y1; into r11
+    eor     r3, r11, r3             //Exec t9 = t8 ^ t7; into r3
+    eor     r3, r3, r1              //Exec t19 = t9 ^ t14; into r3
+    eor     r3, r3, r0              //Exec t23 = t19 ^ y21; into r3
+    eor     r0, r10, r9             //Exec y3 = y5 ^ y8; into r0
+    ldr     r11, [sp, #32]          //Load y6 into r11
+    and     r5, r0, r11             //Exec t3 = y3 & y6; into r5
+    eor     r12, r5, r12            //Exec t4 = t3 ^ t2; into r12
+    ldr     r5, [sp, #40]           //Load y20 into r5
+    str     r7, [sp, #32]           //Store r7/y4 on stack
+    eor     r12, r12, r5            //Exec t17 = t4 ^ y20; into r12
+    eor     r1, r12, r1             //Exec t21 = t17 ^ t14; into r1
+    and     r12, r1, r3             //Exec t26 = t21 & t23; into r12
+    eor     r5, r2, r12             //Exec t27 = t24 ^ t26; into r5
+    eor     r12, r14, r12           //Exec t31 = t22 ^ t26; into r12
+    eor     r1, r1, r14             //Exec t25 = t21 ^ t22; into r1
+    and     r7, r1, r5              //Exec t28 = t25 & t27; into r7
+    eor     r14, r7, r14            //Exec t29 = t28 ^ t22; into r14
+    and     r4, r14, r4             //Exec z14 = t29 & y2; into r4
+    and     r8, r14, r8             //Exec z5 = t29 & y7; into r8
+    eor     r7, r3, r2              //Exec t30 = t23 ^ t24; into r7
+    and     r12, r12, r7            //Exec t32 = t31 & t30; into r12
+    eor     r12, r12, r2            //Exec t33 = t32 ^ t24; into r12
+    eor     r7, r5, r12             //Exec t35 = t27 ^ t33; into r7
+    and     r2, r2, r7              //Exec t36 = t24 & t35; into r2
+    eor     r5, r5, r2              //Exec t38 = t27 ^ t36; into r5
+    and     r5, r14, r5             //Exec t39 = t29 & t38; into r5
+    eor     r1, r1, r5              //Exec t40 = t25 ^ t39; into r1
+    eor     r5, r14, r1             //Exec t43 = t29 ^ t40; into r5
+    ldr.w   r7, [sp, #4]            //Load y16 into r7
+    and     r7, r5, r7              //Exec z3 = t43 & y16; into r7
+    eor     r8, r7, r8              //Exec tc12 = z3 ^ z5; into r8
+    str     r8, [sp, #40]           //Store r8/tc12 on stack
+    ldr     r8, [sp, #8]            //Load y13 into r8
+    and     r8, r5, r8              //Exec z12 = t43 & y13; into r8
+    and     r10, r1, r10            //Exec z13 = t40 & y5; into r10
+    and     r6, r1, r6              //Exec z4 = t40 & y1; into r6
+    eor     r6, r7, r6              //Exec tc6 = z3 ^ z4; into r6
+    eor     r3, r3, r12             //Exec t34 = t23 ^ t33; into r3
+    eor     r3, r2, r3              //Exec t37 = t36 ^ t34; into r3
+    eor     r1, r1, r3              //Exec t41 = t40 ^ t37; into r1
+    ldr.w   r5, [sp, #16]           //Load y10 into r5
+    and     r2, r1, r5              //Exec z8 = t41 & y10; into r2
+    and     r9, r1, r9              //Exec z17 = t41 & y8; into r9
+    str     r9, [sp, #16]           //Store r9/z17 on stack
+    eor     r5, r12, r3             //Exec t44 = t33 ^ t37; into r5
+    ldr     r9, [sp, #28]           //Load y15 into r9
+    ldr.w   r7, [sp, #44]           //Load y12 into r7
+    and     r9, r5, r9              //Exec z0 = t44 & y15; into r9
+    and     r7, r5, r7              //Exec z9 = t44 & y12; into r7
+    and     r0, r3, r0              //Exec z10 = t37 & y3; into r0
+    and     r3, r3, r11             //Exec z1 = t37 & y6; into r3
+    eor     r3, r3, r9              //Exec tc5 = z1 ^ z0; into r3
+    eor     r3, r6, r3              //Exec tc11 = tc6 ^ tc5; into r3
+    ldr     r11, [sp, #32]          //Load y4 into r11
+    ldr.w   r5, [sp, #20]           //Load y17 into r5
+    and     r11, r12, r11           //Exec z11 = t33 & y4; into r11
+    eor     r14, r14, r12           //Exec t42 = t29 ^ t33; into r14
+    eor     r1, r14, r1             //Exec t45 = t42 ^ t41; into r1
+    and     r5, r1, r5              //Exec z7 = t45 & y17; into r5
+    eor     r6, r5, r6              //Exec tc8 = z7 ^ tc6; into r6
+    ldr     r5, [sp, #24]           //Load y14 into r5
+    str     r4, [sp, #32]           //Store r4/z14 on stack
+    and     r1, r1, r5              //Exec z16 = t45 & y14; into r1
+    ldr     r5, [sp, #12]           //Load y11 into r5
+    ldr     r4, [sp, #36]           //Load y9 into r4
+    and     r5, r14, r5             //Exec z6 = t42 & y11; into r5
+    eor     r5, r5, r6              //Exec tc16 = z6 ^ tc8; into r5
+    and     r4, r14, r4             //Exec z15 = t42 & y9; into r4
+    eor     r14, r4, r5             //Exec tc20 = z15 ^ tc16; into r14
+    eor     r4, r4, r1              //Exec tc1 = z15 ^ z16; into r4
+    eor     r1, r0, r4              //Exec tc2 = z10 ^ tc1; into r1
+    eor     r0, r1, r11             //Exec tc21 = tc2 ^ z11; into r0
+    eor     r7, r7, r1              //Exec tc3 = z9 ^ tc2; into r7
+    eor     r1, r7, r5              //Exec S0 = tc3 ^ tc16; into r1
+    eor     r7, r7, r3              //Exec S3 = tc3 ^ tc11; into r7
+    eor     r3, r7, r5              //Exec S1 = S3 ^ tc16 ^ 1; into r3 ('^ 1' not applied here)
+    eor     r11, r10, r4            //Exec tc13 = z13 ^ tc1; into r11
+    ldr.w   r4, [sp, #0]            //Load U7 into r4
+    and     r12, r12, r4            //Exec z2 = t33 & U7; into r12
+    eor     r9, r9, r12             //Exec tc4 = z0 ^ z2; into r9
+    eor     r12, r8, r9             //Exec tc7 = z12 ^ tc4; into r12
+    eor     r2, r2, r12             //Exec tc9 = z8 ^ tc7; into r2
+    eor     r2, r6, r2              //Exec tc10 = tc8 ^ tc9; into r2
+    ldr.w   r4, [sp, #32]           //Load z14 into r4
+    eor     r12, r4, r2             //Exec tc17 = z14 ^ tc10; into r12
+    eor     r0, r0, r12             //Exec S5 = tc21 ^ tc17; into r0
+    eor     r6, r12, r14            //Exec tc26 = tc17 ^ tc20; into r6
+    ldr.w   r4, [sp, #16]           //Load z17 into r4
+    ldr     r12, [sp, #40]          //Load tc12 into r12
+    eor     r6, r6, r4              //Exec S2 = tc26 ^ z17 ^ 1; into r6 ('^ 1' not applied here)
+    eor     r12, r9, r12            //Exec tc14 = tc4 ^ tc12; into r12
+    eor     r14, r11, r12           //Exec tc18 = tc13 ^ tc14; into r14
+    eor     r2, r2, r14             //Exec S6 = tc10 ^ tc18 ^ 1; into r2 ('^ 1' not applied here)
+    eor     r11, r8, r14            //Exec S7 = z12 ^ tc18 ^ 1; into r11 ('^ 1' not applied here)
+    ldr     r14, [sp, #52]          // restore link register
+    eor     r8, r12, r7             //Exec S4 = tc14 ^ S3; into r8
+    bx      lr
+    // [('r0', 'S5'), ('r1', 'S0'), ('r2', 'S6'), ('r3', 'S1'),
+    // ('r6', 'S2'),('r7', 'S3'), ('r8', 'S4'), ('r11', 'S7')]
+
+/******************************************************************************
+* Subroutine that XORs the columns after the S-box during the AES-128 key
+* schedule round function; 'ror #2' on the S-box output performs RotWord.
+* Reads the previous round key at the 'rkeys' address saved in [sp, #56], stores
+* the new one 32 bytes further (also kept in r4-r11) and advances that address.
+******************************************************************************/
+.align 2
+aes128_xorcolumns_rotword:
+    ldr     r12, [sp, #56]          // restore 'rkeys' address
+    ldr.w   r5, [r12, #28]          // load rkey word of rkey from prev round
+    movw    r4, #0xc0c0
+    movt    r4, #0xc0c0             // r4 <- 0xc0c0c0c0
+    eor     r11, r5, r11, ror #2    // r11<- r5 ^ (r11 >>> 2)
+    bic     r11, r4, r11            // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #2      // r9 <- r9 & 0x30303030
+    orr     r11, r11, r9            // r11<- r11 | r9
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #4      // r9 <- r9 & 0x0c0c0c0c
+    orr     r11, r11, r9            // r11<- r11 | r9
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #6      // r9 <- r9 & 0x03030303
+    orr     r11, r11, r9            // r11<- r11 | r9
+    mvn     r9, r5                  // NOT omitted in sbox
+    ldr.w   r5, [r12, #24]          // load rkey word of rkey from prev round
+    str     r9, [r12, #28]          // store new rkey word after NOT
+    str     r11, [r12, #60]         // store new rkey word in 'rkeys'
+    eor     r10, r5, r2, ror #2     // r10<- r5 ^ (r2 >>> 2)
+    bic     r10, r4, r10            // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #2      // r9 <- r9 & 0x30303030
+    orr     r10, r10, r9            // r10<- r10 | r9
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #4      // r9 <- r9 & 0x0c0c0c0c
+    orr     r10, r10, r9            // r10<- r10 | r9
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #6      // r9 <- r9 & 0x03030303
+    orr     r10, r10, r9            // r10<- r10 | r9
+    mvn     r9, r5                  // NOT omitted in sbox
+    ldr.w   r2, [r12, #20]          // load rkey word of rkey from prev round
+    str     r9, [r12, #24]          // store new rkey word after NOT
+    str     r10, [r12, #56]         // store new rkey word in 'rkeys'
+    eor     r9, r2, r0, ror #2      // r9 <- r2 ^ (r0 >>> 2)
+    and     r9, r4, r9              // r9 <- r9 & 0xc0c0c0c0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    ldr.w   r2, [r12, #16]          // load rkey word of rkey from prev round
+    str.w   r9, [r12, #52]          // store new rkey word in 'rkeys'
+    eor     r8, r2, r8, ror #2      // r8 <- r2 ^ (r8 >>> 2)
+    and     r8, r4, r8              // r8 <- r8 & 0xc0c0c0c0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    ldr.w   r2, [r12, #12]          // load rkey word of rkey from prev round
+    str.w   r8, [r12, #48]          // store new rkey word in 'rkeys'
+    eor     r7, r2, r7, ror #2      // r7 <- r2 ^ (r7 >>> 2)
+    and     r7, r4, r7              // r7 <- r7 & 0xc0c0c0c0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    ldr.w   r2, [r12, #8]           // load rkey word of rkey from prev round
+    str.w   r7, [r12, #44]          // store new rkey word in 'rkeys'
+    eor     r6, r2, r6, ror #2      // r6 <- r2 ^ (r6 >>> 2)
+    bic     r6, r4, r6              // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    mvn     r0, r2                  // NOT omitted in sbox
+    ldr.w   r2, [r12, #4]           // load rkey word of rkey from prev round
+    str.w   r0, [r12, #8]           // store new rkey word after NOT
+    str.w   r6, [r12, #40]          // store new rkey word in 'rkeys'
+    eor     r5, r2, r3, ror #2      // r5 <- r2 ^ (r3 >>> 2)
+    bic     r5, r4, r5              // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    mvn     r0, r2                  // NOT omitted in sbox
+    ldr.w   r2, [r12], #32          // load prev round rkey word, then r12 <- r12 + 32
+    str.w   r0, [r12, #-28]         // store new rkey word after NOT
+    str.w   r5, [r12, #4]           // store new rkey word in 'rkeys'
+    eor     r3, r2, r1, ror #2      // r3 <- r2 ^ (r1 >>> 2)
+    and     r3, r4, r3              // r3 <- r3 & 0xc0c0c0c0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r3, r3, r0              // r3 <- r3 | r0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r3, r3, r0              // r3 <- r3 | r0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r4, r3, r0              // r4 <- r3 | r0
+    str.w   r4, [r12]               // store new rkey word in 'rkeys' (1st word)
+    str.w   r12, [sp, #56]          // store the new rkeys address on the stack
+    bx      lr
+
+/******************************************************************************
+* Subroutine that XORs the columns after the S-box during the AES-256 key
+* schedule round function, with RotWord applied through 'ror #2'.
+* Differs from 'aes128_xorcolumns_rotword' only by where the result is stored:
+* 64 bytes (two round keys) past the round key that is read, instead of 32.
+******************************************************************************/
+.align 2
+aes256_xorcolumns_rotword:
+    ldr     r12, [sp, #56]          // restore 'rkeys' address
+    ldr.w   r5, [r12, #28]          // load rkey word of rkey from prev round
+    movw    r4, #0xc0c0
+    movt    r4, #0xc0c0             // r4 <- 0xc0c0c0c0
+    eor     r11, r5, r11, ror #2    // r11<- r5 ^ (r11 >>> 2)
+    bic     r11, r4, r11            // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #2      // r9 <- r9 & 0x30303030
+    orr     r11, r11, r9            // r11<- r11 | r9
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #4      // r9 <- r9 & 0x0c0c0c0c
+    orr     r11, r11, r9            // r11<- r11 | r9
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #6      // r9 <- r9 & 0x03030303
+    orr     r11, r11, r9            // r11<- r11 | r9
+    mvn     r9, r5                  // NOT omitted in sbox
+    ldr.w   r5, [r12, #24]          // load rkey word of rkey from prev round
+    str     r9, [r12, #28]          // store new rkey word after NOT
+    str     r11, [r12, #92]         // store new rkey word in 'rkeys'
+    eor     r10, r5, r2, ror #2     // r10<- r5 ^ (r2 >>> 2)
+    bic     r10, r4, r10            // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #2      // r9 <- r9 & 0x30303030
+    orr     r10, r10, r9            // r10<- r10 | r9
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #4      // r9 <- r9 & 0x0c0c0c0c
+    orr     r10, r10, r9            // r10<- r10 | r9
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #6      // r9 <- r9 & 0x03030303
+    orr     r10, r10, r9            // r10<- r10 | r9
+    mvn     r9, r5                  // NOT omitted in sbox
+    ldr.w   r2, [r12, #20]          // load rkey word of rkey from prev round
+    str     r9, [r12, #24]          // store new rkey word after NOT
+    str     r10, [r12, #88]         // store new rkey word in 'rkeys'
+    eor     r9, r2, r0, ror #2      // r9 <- r2 ^ (r0 >>> 2)
+    and     r9, r4, r9              // r9 <- r9 & 0xc0c0c0c0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    ldr.w   r2, [r12, #16]          // load rkey word of rkey from prev round
+    str.w   r9, [r12, #84]          // store new rkey word in 'rkeys'
+    eor     r8, r2, r8, ror #2      // r8 <- r2 ^ (r8 >>> 2)
+    and     r8, r4, r8              // r8 <- r8 & 0xc0c0c0c0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    ldr.w   r2, [r12, #12]          // load rkey word of rkey from prev round
+    str.w   r8, [r12, #80]          // store new rkey word in 'rkeys'
+    eor     r7, r2, r7, ror #2      // r7 <- r2 ^ (r7 >>> 2)
+    and     r7, r4, r7              // r7 <- r7 & 0xc0c0c0c0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    ldr.w   r2, [r12, #8]           // load rkey word of rkey from prev round
+    str.w   r7, [r12, #76]          // store new rkey word in 'rkeys'
+    eor     r6, r2, r6, ror #2      // r6 <- r2 ^ (r6 >>> 2)
+    bic     r6, r4, r6              // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    mvn     r0, r2                  // NOT omitted in sbox
+    ldr.w   r2, [r12, #4]           // load rkey word of rkey from prev round
+    str.w   r0, [r12, #8]           // store new rkey word after NOT
+    str.w   r6, [r12, #72]          // store new rkey word in 'rkeys'
+    eor     r5, r2, r3, ror #2      // r5 <- r2 ^ (r3 >>> 2)
+    bic     r5, r4, r5              // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    mvn     r0, r2                  // NOT omitted in sbox
+    ldr.w   r2, [r12], #32          // load prev round rkey word, then r12 <- r12 + 32
+    str.w   r0, [r12, #-28]         // store new rkey word after NOT
+    str.w   r5, [r12, #36]          // store new rkey word in 'rkeys'
+    eor     r3, r2, r1, ror #2      // r3 <- r2 ^ (r1 >>> 2)
+    and     r3, r4, r3              // r3 <- r3 & 0xc0c0c0c0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r3, r3, r0              // r3 <- r3 | r0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r3, r3, r0              // r3 <- r3 | r0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r4, r3, r0              // r4 <- r3 | r0
+    str.w   r4, [r12, #32]          // store new rkey word in 'rkeys' (1st word)
+    str.w   r12, [sp, #56]          // store the new rkeys address on the stack
+    bx      lr
+
+/******************************************************************************
+* Subroutine that XORs the columns after the S-box during the AES-256 key
+* schedule round function, for the rounds that do not apply RotWord.
+* It differs from 'aes256_xorcolumns_rotword' by the omission of the rotword
+* operation (i.e. 'ror #26' instead of 'ror #2').
+******************************************************************************/
+.align 2
+aes256_xorcolumns:
+    ldr     r12, [sp, #56]          // restore 'rkeys' address
+    ldr.w   r5, [r12, #28]          // load rkey word of rkey from prev round
+    movw    r4, #0xc0c0
+    movt    r4, #0xc0c0             // r4 <- 0xc0c0c0c0
+    eor     r11, r5, r11, ror #26   // r11<- r5 ^ (r11 >>> 26)
+    bic     r11, r4, r11            // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #2      // r9 <- r9 & 0x30303030
+    orr     r11, r11, r9            // r11<- r11 | r9
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #4      // r9 <- r9 & 0x0c0c0c0c
+    orr     r11, r11, r9            // r11<- r11 | r9
+    eor     r9, r5, r11, ror #2     // r9 <- r5 ^ (r11 >>> 2)
+    and     r9, r9, r4, ror #6      // r9 <- r9 & 0x03030303
+    orr     r11, r11, r9            // r11<- r11 | r9
+    mvn     r9, r5                  // NOT omitted in sbox
+    ldr.w   r5, [r12, #24]          // load rkey word of rkey from prev round
+    str     r9, [r12, #28]          // store new rkey word after NOT
+    str     r11, [r12, #92]         // store new rkey word in 'rkeys'
+    eor     r10, r5, r2, ror #26    // r10<- r5 ^ (r2 >>> 26)
+    bic     r10, r4, r10            // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #2      // r9 <- r9 & 0x30303030
+    orr     r10, r10, r9            // r10<- r10 | r9
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #4      // r9 <- r9 & 0x0c0c0c0c
+    orr     r10, r10, r9            // r10<- r10 | r9
+    eor     r9, r5, r10, ror #2     // r9 <- r5 ^ (r10 >>> 2)
+    and     r9, r9, r4, ror #6      // r9 <- r9 & 0x03030303
+    orr     r10, r10, r9            // r10<- r10 | r9
+    mvn     r9, r5                  // NOT omitted in sbox
+    ldr.w   r2, [r12, #20]          // load rkey word of rkey from prev round
+    str     r9, [r12, #24]          // store new rkey word after NOT
+    str     r10, [r12, #88]         // store new rkey word in 'rkeys'
+    eor     r9, r2, r0, ror #26     // r9 <- r2 ^ (r0 >>> 26)
+    and     r9, r4, r9              // r9 <- r9 & 0xc0c0c0c0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    eor     r0, r2, r9, ror #2      // r0 <- r2 ^ (r9 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r9, r9, r0              // r9 <- r9 | r0
+    ldr.w   r2, [r12, #16]          // load rkey word of rkey from prev round
+    str.w   r9, [r12, #84]          // store new rkey word in 'rkeys'
+    eor     r8, r2, r8, ror #26     // r8 <- r2 ^ (r8 >>> 26)
+    and     r8, r4, r8              // r8 <- r8 & 0xc0c0c0c0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    eor     r0, r2, r8, ror #2      // r0 <- r2 ^ (r8 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r8, r8, r0              // r8 <- r8 | r0
+    ldr.w   r2, [r12, #12]          // load rkey word of rkey from prev round
+    str.w   r8, [r12, #80]          // store new rkey word in 'rkeys'
+    eor     r7, r2, r7, ror #26     // r7 <- r2 ^ (r7 >>> 26)
+    and     r7, r4, r7              // r7 <- r7 & 0xc0c0c0c0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    eor     r0, r2, r7, ror #2      // r0 <- r2 ^ (r7 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r7, r7, r0              // r7 <- r7 | r0
+    ldr.w   r2, [r12, #8]           // load rkey word of rkey from prev round
+    str.w   r7, [r12, #76]          // store new rkey word in 'rkeys'
+    eor     r6, r2, r6, ror #26     // r6 <- r2 ^ (r6 >>> 26)
+    bic     r6, r4, r6              // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    eor     r0, r2, r6, ror #2      // r0 <- r2 ^ (r6 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r6, r6, r0              // r6 <- r6 | r0
+    mvn     r0, r2                  // NOT omitted in sbox
+    ldr.w   r2, [r12, #4]           // load rkey word of rkey from prev round
+    str.w   r0, [r12, #8]           // store new rkey word after NOT
+    str.w   r6, [r12, #72]          // store new rkey word in 'rkeys'
+    eor     r5, r2, r3, ror #26     // r5 <- r2 ^ (r3 >>> 26)
+    bic     r5, r4, r5              // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox)
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    eor     r0, r2, r5, ror #2      // r0 <- r2 ^ (r5 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r5, r5, r0              // r5 <- r5 | r0
+    mvn     r0, r2                  // NOT omitted in sbox
+    ldr.w   r2, [r12], #32          // load prev round rkey word, then r12 <- r12 + 32
+    str.w   r0, [r12, #-28]         // store new rkey word after NOT
+    str.w   r5, [r12, #36]          // store new rkey word in 'rkeys'
+    eor     r3, r2, r1, ror #26     // r3 <- r2 ^ (r1 >>> 26)
+    and     r3, r4, r3              // r3 <- r3 & 0xc0c0c0c0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #2      // r0 <- r0 & 0x30303030
+    orr     r3, r3, r0              // r3 <- r3 | r0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #4      // r0 <- r0 & 0x0c0c0c0c
+    orr     r3, r3, r0              // r3 <- r3 | r0
+    eor     r0, r2, r3, ror #2      // r0 <- r2 ^ (r3 >>> 2)
+    and     r0, r0, r4, ror #6      // r0 <- r0 & 0x03030303
+    orr     r4, r3, r0              // r4 <- r3 | r0
+    str.w   r4, [r12, #32]          // store new rkey word in 'rkeys' (1st word)
+    str.w   r12, [sp, #56]          // store the new rkeys address on the stack
+    bx      lr
+
+/******************************************************************************
+* Applies ShiftRows^(-1) on a round key to match the fixsliced representation.
+******************************************************************************/
+.align 2
+inv_shiftrows_1:
+    ldr.w   r2, [r12, #-32]!        // r12 <- r12 - 32, r2 <- 1st word of that rkey
+    str     r14, [sp, #52]          // save lr: r14 is reused as a mask below
+    movw    r1, #8                  // loop counter: 8 words are processed
+    movw    r14, #0x0300
+    movt    r14, #0x0c0f            // r14<- 0x0c0f0300 for ShiftRows^[-1]
+loop_inv_sr_1:
+    movw    r3, #0x3300             // reloaded each pass: r3 also holds the result
+    movt    r3, #0x3300             // r3 <- 0x33003300 for ShiftRows^[-1]
+    swpmv   r2, r2, r2, r2, r14, 4, r0 // macro; presumably a masked swap-move (confirm)
+    eor     r0, r2, r2, lsr #2      // r0 <- r2 ^ (r2 >> 2)
+    and     r0, r3                  // r0 <- r0 & 0x33003300
+    eor     r2, r2, r0              // r2 <- r2 ^ r0
+    eor     r3, r2, r0, lsl #2      // r3 <- r2 ^ (r0 << 2), the updated word
+    ldr.w   r2, [r12, #4]!          // r12 <- r12 + 4, r2 <- next word
+    str.w   r3, [r12, #-4]          // write the updated word back in place
+    subs    r1, #1                  // r1 <- r1 - 1, sets flags for the branch
+    bne     loop_inv_sr_1           // loop until 8 words done (r12 back at entry value)
+    ldr     r14, [sp, #52]          // restore link register
+    bx      lr
+
+/******************************************************************************
+* Applies ShiftRows^(-2) on a round key to match the fixsliced representation.
+* Only needed for the fully-fixsliced (ffs) representation.
+******************************************************************************/
+.align 2
+inv_shiftrows_2:
+    ldr.w   r2, [r12, #-32]!        // r12 <- r12 - 32, r2 <- 1st word of that rkey
+    str     r14, [sp, #52]          // save lr: r14 is reused as a mask below
+    movw    r1, #8                  // loop counter: 8 words are processed
+    movw    r14, #0x0f00
+    movt    r14, #0x0f00            // r14<- 0x0f000f00 for ShiftRows^[-2]
+loop_inv_sr_2:
+    eor     r0, r2, r2, lsr #4      // r0 <- r2 ^ (r2 >> 4)
+    and     r0, r14                 // r0 <- r0 & 0x0f000f00
+    eor     r2, r2, r0              // r2 <- r2 ^ r0
+    eor     r3, r2, r0, lsl #4      // r3 <- r2 ^ (r0 << 4), the updated word
+    ldr.w   r2, [r12, #4]!          // r12 <- r12 + 4, r2 <- next word
+    str.w   r3, [r12, #-4]          // write the updated word back in place
+    subs    r1, #1                  // r1 <- r1 - 1, sets flags for the branch
+    bne     loop_inv_sr_2           // loop until 8 words done (r12 back at entry value)
+    ldr     r14, [sp, #52]          // restore link register
+    bx      lr
+
+/******************************************************************************
+* Applies ShiftRows^(-3) on a round key to match the fixsliced representation.
+* Only needed for the fully-fixsliced (ffs) representation.
+******************************************************************************/
+.align 2
+inv_shiftrows_3:
+    ldr.w   r2, [r12, #-32]!        // r12 <- r12 - 32, r2 <- 1st word of that rkey
+    str     r14, [sp, #52]          // save lr: r14 is reused as a mask below
+    movw    r1, #8                  // loop counter: 8 words are processed
+    movw    r14, #0x0c00
+    movt    r14, #0x030f            // r14<- 0x030f0c00 for ShiftRows^[-3]
+loop_inv_sr_3:
+    movw    r3, #0x3300             // reloaded each pass: r3 also holds the result
+    movt    r3, #0x3300             // r3 <- 0x33003300 for ShiftRows^[-3]
+    swpmv   r2, r2, r2, r2, r14, 4, r0 // macro; presumably a masked swap-move (confirm)
+    eor     r0, r2, r2, lsr #2      // r0 <- r2 ^ (r2 >> 2)
+    and     r0, r3                  // r0 <- r0 & 0x33003300
+    eor     r2, r2, r0              // r2 <- r2 ^ r0
+    eor     r3, r2, r0, lsl #2      // r3 <- r2 ^ (r0 << 2), the updated word
+    ldr.w   r2, [r12, #4]!          // r12 <- r12 + 4, r2 <- next word
+    str.w   r3, [r12, #-4]          // write the updated word back in place
+    subs    r1, #1                  // r1 <- r1 - 1, sets flags for the branch
+    bne     loop_inv_sr_3           // loop until 8 words done (r12 back at entry value)
+    ldr     r14, [sp, #52]          // restore link register
+    bx      lr
+
+/******************************************************************************
+* Fully bitsliced AES-128 key schedule to match the fully-fixsliced (ffs)
+* representation. Only one key is accepted: it is loaded twice (r4-r7 and
+* r8-r11) before packing, so the packed state holds the same key twice.
+******************************************************************************/
+@ void aes128_keyschedule_ffs(u32* rkeys, const u8* key);
+.global aes128_keyschedule_ffs
+.type   aes128_keyschedule_ffs,%function
+.align 2
+aes128_keyschedule_ffs:
+    push    {r0-r12,r14}            // save context ('rkeys' is reloaded from it)
+    sub.w   sp, #56                 // allow space on the stack for tmp var
+    ldr.w   r4, [r1]                // load the 128-bit key in r4-r7
+    ldr     r5, [r1, #4]
+    ldr     r6, [r1, #8]
+    ldr     r7, [r1, #12]
+    ldr.w   r8, [r1]                // load the 128-bit key in r8-r11
+    ldr     r9, [r1, #4]
+    ldr     r10,[r1, #8]
+    ldr     r11,[r1, #12]
+    bl      packing                 // pack the master key
+    ldr.w   r0, [sp, #56]           // restore 'rkeys' address
+    stm     r0, {r4-r11}            // store the packed master key in 'rkeys'
+    bl      sbox                    // apply the sbox to the master key
+    eor     r11, r11, #0x00000300   // add the 1st rconst
+    bl      aes128_xorcolumns_rotword
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r2, r2, #0x00000300     // add the 2nd rconst
+    bl      aes128_xorcolumns_rotword
+    bl      inv_shiftrows_1         // fix up the rkey at r12-32 (the one just consumed)
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r0, r0, #0x00000300     // add the 3rd rconst
+    bl      aes128_xorcolumns_rotword
+    bl      inv_shiftrows_2         // fix up the rkey at r12-32 (the one just consumed)
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r8, r8, #0x00000300     // add the 4th rconst
+    bl      aes128_xorcolumns_rotword
+    bl      inv_shiftrows_3         // fix up the rkey at r12-32 (the one just consumed)
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r7, r7, #0x00000300     // add the 5th rconst
+    bl      aes128_xorcolumns_rotword
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r6, r6, #0x00000300     // add the 6th rconst
+    bl      aes128_xorcolumns_rotword
+    bl      inv_shiftrows_1         // fix up the rkey at r12-32 (the one just consumed)
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r3, r3, #0x00000300     // add the 7th rconst
+    bl      aes128_xorcolumns_rotword
+    bl      inv_shiftrows_2         // fix up the rkey at r12-32 (the one just consumed)
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r1, r1, #0x00000300     // add the 8th rconst
+    bl      aes128_xorcolumns_rotword
+    bl      inv_shiftrows_3         // fix up the rkey at r12-32 (the one just consumed)
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r11, r11, #0x00000300   // add the 9th rconst
+    eor     r2, r2, #0x00000300     // add the 9th rconst
+    eor     r8, r8, #0x00000300     // add the 9th rconst
+    eor     r7, r7, #0x00000300     // add the 9th rconst
+    bl      aes128_xorcolumns_rotword
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r2, r2, #0x00000300     // add the 10th rconst
+    eor     r0, r0, #0x00000300     // add the 10th rconst
+    eor     r7, r7, #0x00000300     // add the 10th rconst
+    eor     r6, r6, #0x00000300     // add the 10th rconst
+    bl      aes128_xorcolumns_rotword
+    bl      inv_shiftrows_1         // fix up the rkey at r12-32 (the one just consumed)
+    mvn     r5, r5                  // add the NOT for the last rkey
+    mvn     r6, r6                  // add the NOT for the last rkey
+    mvn     r10, r10                // add the NOT for the last rkey
+    mvn     r11, r11                // add the NOT for the last rkey
+    strd    r5, r6, [r12, #4]       // words 1,2 of the last rkey (r12 = rkeys + 320)
+    strd    r10, r11, [r12, #24]    // words 6,7 of the last rkey
+    ldrd    r0, r1, [r12, #-316]    // words 1,2 of the first rkey (rkeys + 4)
+    ldrd    r2, r3, [r12, #-296]    // words 6,7 of the first rkey (rkeys + 24)
+    mvn     r0, r0                  // remove the NOT for the key whitening
+    mvn     r1, r1                  // remove the NOT for the key whitening
+    mvn     r2, r2                  // remove the NOT for the key whitening
+    mvn     r3, r3                  // remove the NOT for the key whitening
+    strd    r0, r1, [r12, #-316]
+    strd    r2, r3, [r12, #-296]
+    add.w   sp, #56                 // restore stack
+    pop     {r0-r12, r14}           // restore context
+    bx      lr
+
+/******************************************************************************
+* Fully bitsliced AES-256 key schedule to match the fully-fixsliced (ffs)
+* representation. Only one key is accepted: each 128-bit half is loaded twice
+* (r4-r7 and r8-r11) before packing, so each packed half holds it twice.
+******************************************************************************/
+@ void aes256_keyschedule_ffs(u32* rkeys, const u8* key);
+.global aes256_keyschedule_ffs
+.type   aes256_keyschedule_ffs,%function
+.align 2
+aes256_keyschedule_ffs:
+    push    {r0-r12,r14}            // save context ('rkeys'/'key' reloaded from it)
+    sub.w   sp, #56                 // allow space on the stack for tmp var
+    ldr.w   r4, [r1]                // load the 128 first key bits in r4-r7
+    ldr     r5, [r1, #4]
+    ldr     r6, [r1, #8]
+    ldr     r7, [r1, #12]
+    ldr.w   r8, [r1]                // load the 128 first key bits in r8-r11
+    ldr     r9, [r1, #4]
+    ldr     r10,[r1, #8]
+    ldr     r11,[r1, #12]
+    bl      packing                 // pack the master key
+    ldrd    r0,r1, [sp, #56]        // restore 'rkeys' and 'key' addresses
+    stm     r0, {r4-r11}            // store the packed master key in 'rkeys'
+    add.w   r1, #16                 // points to the 128 last bits of the key
+    ldr.w   r4, [r1]                // load the 128 last key bits in r4-r7
+    ldr     r5, [r1, #4]
+    ldr     r6, [r1, #8]
+    ldr     r7, [r1, #12]
+    ldr.w   r8, [r1]                // load the 128 last key bits in r8-r11
+    ldr     r9, [r1, #4]
+    ldr     r10,[r1, #8]
+    ldr     r11,[r1, #12]
+    bl      packing                 // pack the master key
+    ldr.w   r0, [sp, #56]           // restore 'rkeys' address
+    add.w   r0, #32                 // r0 <- rkeys + 32, slot of the 2nd packed half
+    stm     r0, {r4-r11}            // store the packed master key in 'rkeys'
+    bl      sbox                    // apply the sbox to the master key
+    eor     r11, r11, #0x00000300   // add the 1st rconst
+    bl      aes256_xorcolumns_rotword
+    bl      sbox                    // apply the sbox to the latest round key
+    bl      aes256_xorcolumns
+    bl      inv_shiftrows_1         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r2, r2, #0x00000300     // add the 2nd rconst
+    bl      aes256_xorcolumns_rotword
+    bl      inv_shiftrows_2         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    bl      aes256_xorcolumns
+    bl      inv_shiftrows_3         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r0, r0, #0x00000300     // add the 3rd rconst
+    bl      aes256_xorcolumns_rotword
+    bl      sbox                    // apply the sbox to the latest round key
+    bl      aes256_xorcolumns
+    bl      inv_shiftrows_1         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r8, r8, #0x00000300     // add the 4th rconst
+    bl      aes256_xorcolumns_rotword
+    bl      inv_shiftrows_2         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    bl      aes256_xorcolumns
+    bl      inv_shiftrows_3         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r7, r7, #0x00000300     // add the 5th rconst
+    bl      aes256_xorcolumns_rotword
+    bl      sbox                    // apply the sbox to the latest round key
+    bl      aes256_xorcolumns
+    bl      inv_shiftrows_1         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r6, r6, #0x00000300     // add the 6th rconst
+    bl      aes256_xorcolumns_rotword
+    bl      inv_shiftrows_2         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    bl      aes256_xorcolumns
+    bl      inv_shiftrows_3         // fix up the rkey at r12-32
+    bl      sbox                    // apply the sbox to the latest round key
+    eor     r3, r3, #0x00000300     // add the 7th rconst
+    bl      aes256_xorcolumns_rotword
+    add     r12, #32                // r12 <- address of the last rkey (rkeys + 448)
+    bl      inv_shiftrows_1         // fix up the penultimate rkey at r12-32
+    mvn     r5, r5                  // add the NOT for the last rkey
+    mvn     r6, r6                  // add the NOT for the last rkey
+    mvn     r10, r10                // add the NOT for the last rkey
+    mvn     r11, r11                // add the NOT for the last rkey
+    ldrd    r0, r1, [r12, #-28]     // words 1,2 of the penultimate rkey
+    ldrd    r2, r3, [r12, #-8]      // words 6,7 of the penultimate rkey
+    strd    r5, r6, [r12, #4]       // words 1,2 of the last rkey
+    strd    r10, r11, [r12, #24]    // words 6,7 of the last rkey
+    mvn     r0, r0                  // add the NOT for the penultimate rkey
+    mvn     r1, r1                  // add the NOT for the penultimate rkey
+    mvn     r2, r2                  // add the NOT for the penultimate rkey
+    mvn     r3, r3                  // add the NOT for the penultimate rkey
+    ldrd    r5, r6, [r12, #-444]    // words 1,2 of the first rkey (rkeys + 4)
+    ldrd    r10, r11, [r12, #-424]  // words 6,7 of the first rkey (rkeys + 24)
+    strd    r0, r1, [r12, #-28]
+    strd    r2, r3, [r12, #-8]
+    mvn     r5, r5                  // remove the NOT for the key whitening
+    mvn     r6, r6                  // remove the NOT for the key whitening
+    mvn     r10, r10                // remove the NOT for the key whitening
+    mvn     r11, r11                // remove the NOT for the key whitening
+    strd    r5, r6, [r12, #-444]
+    strd    r10, r11, [r12, #-424]
+    add.w   sp, #56                 // restore stack
+    pop     {r0-r12, r14}           // restore context
+    bx      lr
\ No newline at end of file
diff --git a/common/aes-publicinputs.S b/common/aes-publicinputs.S
new file mode 100644
index 0000000..9205d29
--- /dev/null
+++ b/common/aes-publicinputs.S
@@ -0,0 +1,1327 @@
+.syntax unified
+.thumb
+
+.section .data.aestable
+.global AES_Te0
+.type AES_Te0,%object
+.align 2 @ 4-byte alignment, the encryption rounds fetch whole words from this table
+AES_Te0: @ 256 words; entry x = S(x)<<24 | 2*S(x)<<16 | 3*S(x)<<8 | S(x) with products in GF(2^8), so byte 0 of each entry is the plain S-box value
+.word 0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b
+.word 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5
+.word 0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b
+.word 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76
+.word 0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d
+.word 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0
+.word 0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf
+.word 0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0
+.word 0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26
+.word 0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc
+.word 0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1
+.word 0x71e29371, 0xd8ab73d8, 0x31625331, 0x152a3f15
+.word 0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3
+.word 0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a
+.word 0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2
+.word 0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75
+.word 0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a
+.word 0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0
+.word 0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3
+.word 0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784
+.word 0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced
+.word 0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b
+.word 0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39
+.word 0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf
+.word 0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb
+.word 0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485
+.word 0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f
+.word 0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8
+.word 0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f
+.word 0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5
+.word 0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321
+.word 0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2
+.word 0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec
+.word 0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917
+.word 0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d
+.word 0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573
+.word 0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc
+.word 0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388
+.word 0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14
+.word 0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db
+.word 0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a
+.word 0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c
+.word 0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662
+.word 0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79
+.word 0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d
+.word 0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9
+.word 0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea
+.word 0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808
+.word 0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e
+.word 0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6
+.word 0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f
+.word 0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a
+.word 0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66
+.word 0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e
+.word 0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9
+.word 0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e
+.word 0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311
+.word 0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794
+.word 0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9
+.word 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf
+.word 0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d
+.word 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868
+.word 0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f
+.word 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16
+.size AES_Te0,.-AES_Te0
+
+.section .text.aes128
+@ void aes128_keyexp_publicinputs_asm(const uint8_t *key,
+@       uint8_t *rk) {
+.global aes128_keyexp_publicinputs_asm
+.type   aes128_keyexp_publicinputs_asm,%function
+.align 2
+aes128_keyexp_publicinputs_asm:
+    //in: r0 = key (16 bytes), r1 = rk; stores rk[4]..rk[43] (160 bytes) from r1 on, the key words rk[0]..rk[3] are not stored
+    //function prologue, preserve registers
+    push {r4-r11}
+
+    //load key
+    //pointer may be non-aligned, so avoid using ldm/stm
+    ldr r4, [r0, #0]
+    ldr r5, [r0, #4]
+    ldr r6, [r0, #8]
+    ldr r7, [r0, #12]
+
+    //load table address once
+    ldr r3, =AES_Te0
+
+    //round 1: rk[4] = rk[0] ^ rcon ^ SubWord(RotWord(rk[3])), then rk[i] = rk[i-4] ^ rk[i-1]
+    uxtb r8, r7, ror #8 //r8 = byte 1 of the previous word
+    uxtb r9, r7, ror #16 //r9 = byte 2
+    uxtb r10, r7, ror #24 //r10 = byte 3
+    uxtb r11, r7 //r11 = byte 0
+
+    ldrb r8, [r3, r8, lsl #2] //byte 0 of a table word is the S-box output
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000001 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[4]
+    eor r5, r4 //rk[5]
+    eor r6, r5 //rk[6]
+    eor r7, r6 //rk[7]
+
+    //write to memory
+    str r4, [r1, #0]
+    str r5, [r1, #4]
+    str r6, [r1, #8]
+    str r7, [r1, #12]
+
+    //round 2
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000002 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[8]
+    eor r5, r4 //rk[9]
+    eor r6, r5 //rk[10]
+    eor r7, r6 //rk[11]
+
+    //write to memory
+    str r4, [r1, #16]
+    str r5, [r1, #20]
+    str r6, [r1, #24]
+    str r7, [r1, #28]
+
+    //round 3
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000004 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[12]
+    eor r5, r4 //rk[13]
+    eor r6, r5 //rk[14]
+    eor r7, r6 //rk[15]
+
+    //write to memory
+    str r4, [r1, #32]
+    str r5, [r1, #36]
+    str r6, [r1, #40]
+    str r7, [r1, #44]
+
+    //round 4
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000008 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[16]
+    eor r5, r4 //rk[17]
+    eor r6, r5 //rk[18]
+    eor r7, r6 //rk[19]
+
+    //write to memory
+    str r4, [r1, #48]
+    str r5, [r1, #52]
+    str r6, [r1, #56]
+    str r7, [r1, #60]
+
+    //round 5
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000010 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[20]
+    eor r5, r4 //rk[21]
+    eor r6, r5 //rk[22]
+    eor r7, r6 //rk[23]
+
+    //write to memory
+    str r4, [r1, #64]
+    str r5, [r1, #68]
+    str r6, [r1, #72]
+    str r7, [r1, #76]
+
+    //round 6
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000020 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[24]
+    eor r5, r4 //rk[25]
+    eor r6, r5 //rk[26]
+    eor r7, r6 //rk[27]
+
+    //write to memory
+    str r4, [r1, #80]
+    str r5, [r1, #84]
+    str r6, [r1, #88]
+    str r7, [r1, #92]
+
+    //round 7
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000040 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[28]
+    eor r5, r4 //rk[29]
+    eor r6, r5 //rk[30]
+    eor r7, r6 //rk[31]
+
+    //write to memory
+    str r4, [r1, #96]
+    str r5, [r1, #100]
+    str r6, [r1, #104]
+    str r7, [r1, #108]
+
+    //round 8
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000080 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[32]
+    eor r5, r4 //rk[33]
+    eor r6, r5 //rk[34]
+    eor r7, r6 //rk[35]
+
+    //write to memory
+    str r4, [r1, #112]
+    str r5, [r1, #116]
+    str r6, [r1, #120]
+    str r7, [r1, #124]
+
+    add r1, #128 //rebase the output pointer; presumably keeps the remaining store offsets in the short encoding range - TODO confirm
+
+    //round 9
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x0000001B //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[36]
+    eor r5, r4 //rk[37]
+    eor r6, r5 //rk[38]
+    eor r7, r6 //rk[39]
+
+    //write to memory
+    str r4, [r1, #0]
+    str r5, [r1, #4]
+    str r6, [r1, #8]
+    str r7, [r1, #12]
+
+    //round 10
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r3, r8, lsl #2]
+    ldrb r9, [r3, r9, lsl #2]
+    ldrb r10, [r3, r10, lsl #2]
+    ldrb r11, [r3, r11, lsl #2]
+
+    eor r4, #0x00000036 //rcon
+    eor r4, r4, r8
+    eor r4, r4, r9, lsl #8
+    eor r4, r4, r10, lsl #16
+    eor r4, r4, r11, lsl #24 //rk[40]
+    eor r5, r4 //rk[41]
+    eor r6, r5 //rk[42]
+    eor r7, r6 //rk[43]
+
+    //write to memory
+    str r4, [r1, #16]
+    str r5, [r1, #20]
+    str r6, [r1, #24]
+    str r7, [r1, #28]
+
+    //function epilogue, restore state
+    pop {r4-r11}
+    bx lr
+.size aes128_keyexp_publicinputs_asm,.-aes128_keyexp_publicinputs_asm
+
+.macro aesencrypt_oddround //one full round: state r4-r7 -> r8-r11; r14 = round key pointer (advanced by 16), r12 = AES_Te0, clobbers r0-r3
+    ldr r8, [r14], #4 //each output column starts out as its round key word
+    ldr r9, [r14], #4
+    ldr r10, [r14], #4
+    ldr r11, [r14], #4
+
+    uxtb r0, r4 //byte 0 of column j feeds output column j
+    uxtb r1, r5
+    uxtb r2, r6
+    uxtb r3, r7
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    eor r8, r8, r0, ror #16 //the rotation lines the table word up with the MixColumns coefficients for this byte position
+    eor r9, r9, r1, ror #16
+    eor r10, r10, r2, ror #16
+    eor r11, r11, r3, ror #16
+
+    uxtb r0, r5, ror #8 //byte 1 is taken from column j+1 (ShiftRows)
+    uxtb r1, r6, ror #8
+    uxtb r2, r7, ror #8
+    uxtb r3, r4, ror #8
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    eor r8, r8, r0, ror #8
+    eor r9, r9, r1, ror #8
+    eor r10, r10, r2, ror #8
+    eor r11, r11, r3, ror #8
+
+    uxtb r0, r6, ror #16 //byte 2 is taken from column j+2
+    uxtb r1, r7, ror #16
+    uxtb r2, r4, ror #16
+    uxtb r3, r5, ror #16
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    eor r8, r0 //table word is used unrotated for byte 2
+    eor r9, r1
+    eor r10, r2
+    eor r11, r3
+
+    uxtb r0, r7, ror #24 //byte 3 is taken from column j+3
+    uxtb r1, r4, ror #24
+    uxtb r2, r5, ror #24
+    uxtb r3, r6, ror #24
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    eor r8, r8, r0, ror #24
+    eor r9, r9, r1, ror #24
+    eor r10, r10, r2, ror #24
+    eor r11, r11, r3, ror #24
+.endm
+
+.macro aesencrypt_evenround //one full round in the other direction: state r8-r11 -> r4-r7; r14 = round key pointer (advanced by 16), r12 = AES_Te0, clobbers r0-r3
+    ldr r4, [r14], #4 //each output column starts out as its round key word
+    ldr r5, [r14], #4
+    ldr r6, [r14], #4
+    ldr r7, [r14], #4
+
+    uxtb r0, r8 //byte 0 of column j feeds output column j
+    uxtb r1, r9
+    uxtb r2, r10
+    uxtb r3, r11
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    eor r4, r4, r0, ror #16 //the rotation lines the table word up with the MixColumns coefficients for this byte position
+    eor r5, r5, r1, ror #16
+    eor r6, r6, r2, ror #16
+    eor r7, r7, r3, ror #16
+
+    uxtb r0, r9, ror #8 //byte 1 is taken from column j+1 (ShiftRows)
+    uxtb r1, r10, ror #8
+    uxtb r2, r11, ror #8
+    uxtb r3, r8, ror #8
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    eor r4, r4, r0, ror #8
+    eor r5, r5, r1, ror #8
+    eor r6, r6, r2, ror #8
+    eor r7, r7, r3, ror #8
+
+    uxtb r0, r10, ror #16 //byte 2 is taken from column j+2
+    uxtb r1, r11, ror #16
+    uxtb r2, r8, ror #16
+    uxtb r3, r9, ror #16
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    eor r4, r0 //table word is used unrotated for byte 2
+    eor r5, r1
+    eor r6, r2
+    eor r7, r3
+
+    uxtb r0, r11, ror #24 //byte 3 is taken from column j+3
+    uxtb r1, r8, ror #24
+    uxtb r2, r9, ror #24
+    uxtb r3, r10, ror #24
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    eor r4, r4, r0, ror #24
+    eor r5, r5, r1, ror #24
+    eor r6, r6, r2, ror #24
+    eor r7, r7, r3, ror #24
+.endm
+
+.macro aesencrypt_finalround //last round without MixColumns: state r8-r11 -> r4-r7 rotated left by 8 bits, no round key added; r12 = AES_Te0, clobbers r0-r3
+    uxtb r0, r11, ror #24 //byte 3 is taken from column j+3
+    uxtb r1, r8, ror #24
+    uxtb r2, r9, ror #24
+    uxtb r3, r10, ror #24
+    ldr r4, [r12, r0, lsl #2] //only bits 0-7 (the S-box value) of this word survive the bfi inserts below
+    ldr r5, [r12, r1, lsl #2]
+    ldr r6, [r12, r2, lsl #2]
+    ldr r7, [r12, r3, lsl #2]
+
+    uxtb r0, r10, ror #16 //byte 2 is taken from column j+2
+    uxtb r1, r11, ror #16
+    uxtb r2, r8, ror #16
+    uxtb r3, r9, ror #16
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    bfi r4, r0, #24, #8 //low byte of the table word (S-box value) goes to bits 24-31
+    bfi r5, r1, #24, #8
+    bfi r6, r2, #24, #8
+    bfi r7, r3, #24, #8
+
+    uxtb r0, r8 //byte 0 stays in its own column
+    uxtb r1, r9
+    uxtb r2, r10
+    uxtb r3, r11
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    bfi r4, r0, #8, #8 //S-box value goes to bits 8-15
+    bfi r5, r1, #8, #8
+    bfi r6, r2, #8, #8
+    bfi r7, r3, #8, #8
+
+    uxtb r0, r9, ror #8 //byte 1 is taken from column j+1
+    uxtb r1, r10, ror #8
+    uxtb r2, r11, ror #8
+    uxtb r3, r8, ror #8
+    ldr r0, [r12, r0, lsl #2]
+    ldr r1, [r12, r1, lsl #2]
+    ldr r2, [r12, r2, lsl #2]
+    ldr r3, [r12, r3, lsl #2]
+    bfi r4, r0, #16, #8 //S-box value goes to bits 16-23
+    bfi r5, r1, #16, #8
+    bfi r6, r2, #16, #8
+    bfi r7, r3, #16, #8
+.endm
+
+@ void aes128_encrypt_publicinputs_asm(const uint8_t *rk,
+@       const uint8_t *in, uint8_t *out) {
+.global aes128_encrypt_publicinputs_asm
+.type   aes128_encrypt_publicinputs_asm,%function
+.align 2
+aes128_encrypt_publicinputs_asm:
+    //in: r0 = rk (reads rk[0]..rk[43], 176 bytes), r1 = in (16 bytes), r2 = out (16 bytes)
+    //function prologue, preserve registers and free r2
+    push {r2,r4-r12,r14}
+
+    //load input
+    ldr r4, [r1, #0]
+    ldr r5, [r1, #4]
+    ldr r6, [r1, #8]
+    ldr r7, [r1, #12]
+    //r1 now free to overwrite
+    //load key
+    ldr r8, [r0], #4
+    ldr r9, [r0], #4
+    ldr r10, [r0], #4
+    ldr r11, [r0], #4
+    mov.w r14, r0 //r14 = address of rk[4], advanced by the round macros
+
+    //load table address once
+    ldr r12, =AES_Te0
+
+    //initial addroundkey
+    eor r4, r8
+    eor r5, r9
+    eor r6, r10
+    eor r7, r11
+
+.rept 4 //rounds 1-8
+    aesencrypt_oddround
+    aesencrypt_evenround
+.endr
+    aesencrypt_oddround //round 9
+    aesencrypt_finalround //round 10, leaves r4-r7 rotated left by 8 bits and without the last round key
+
+    //rk[40]-rk[43]
+    ldr r0, [r14, #0]
+    ldr r1, [r14, #4]
+    ldr r2, [r14, #8]
+    ldr r3, [r14, #12]
+
+    eor r0, r0, r4, ror #8 //ror #8 moves the final-round bytes into place while the key is added
+    eor r1, r1, r5, ror #8
+    eor r2, r2, r6, ror #8
+    pop.w {r4} //r4 = out pointer (the r2 saved by the prologue is on top of the stack)
+    eor r3, r3, r7, ror #8
+
+    //write output
+    str r0, [r4, #0]
+    str r1, [r4, #4]
+    str r2, [r4, #8]
+    str r3, [r4, #12]
+
+    //function epilogue, restore state
+    pop {r4-r12,r14}
+    bx lr
+.size aes128_encrypt_publicinputs_asm,.-aes128_encrypt_publicinputs_asm
+
+.section .text.aes192
+@ void aes192_keyexp_publicinputs_asm(const uint8_t *key,
+@       uint8_t *rk) {
+.global aes192_keyexp_publicinputs_asm
+.type   aes192_keyexp_publicinputs_asm,%function
+.align 2
+aes192_keyexp_publicinputs_asm:
+    //in: r0 = key (24 bytes), r1 = rk; stores rk[6]..rk[51] (184 bytes) from r1 on, the key words rk[0]..rk[5] are not stored
+    //function prologue, preserve registers
+    push {r4-r11}
+
+    //load key
+    ldr r2, [r0, #0]
+    ldr r3, [r0, #4]
+    ldr r4, [r0, #8]
+    ldr r5, [r0, #12]
+    ldr r6, [r0, #16]
+    ldr r7, [r0, #20]
+
+    //load table address once
+    ldr r0, =AES_Te0 //r0 is reused, the key pointer is no longer needed
+
+    //round 1: rk[6] = rk[0] ^ rcon ^ SubWord(RotWord(rk[5])), then rk[i] = rk[i-6] ^ rk[i-1]
+    uxtb r8, r7, ror #8 //r8 = byte 1 of the previous word
+    uxtb r9, r7, ror #16 //r9 = byte 2
+    uxtb r10, r7, ror #24 //r10 = byte 3
+    uxtb r11, r7 //r11 = byte 0
+
+    ldrb r8, [r0, r8, lsl #2] //byte 0 of a table word is the S-box output
+    ldrb r9, [r0, r9, lsl #2]
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+
+    eor r2, #0x00000001 //rcon
+    eor r2, r2, r8
+    eor r2, r2, r9, lsl #8
+    eor r2, r2, r10, lsl #16
+    eor r2, r2, r11, lsl #24 //rk[6]
+    eor r3, r2 //rk[7]
+    eor r4, r3 //rk[8]
+    eor r5, r4 //rk[9]
+    eor r6, r5 //rk[10]
+    eor r7, r6 //rk[11]
+
+    //write to memory
+    str r2, [r1, #0]
+    str r3, [r1, #4]
+    str r4, [r1, #8]
+    str r5, [r1, #12]
+    str r6, [r1, #16]
+    str r7, [r1, #20]
+
+    //round 2
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r0, r8, lsl #2]
+    ldrb r9, [r0, r9, lsl #2]
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+
+    eor r2, #0x00000002 //rcon
+    eor r2, r2, r8
+    eor r2, r2, r9, lsl #8
+    eor r2, r2, r10, lsl #16
+    eor r2, r2, r11, lsl #24 //rk[12]
+    eor r3, r2 //rk[13]
+    eor r4, r3 //rk[14]
+    eor r5, r4 //rk[15]
+    eor r6, r5 //rk[16]
+    eor r7, r6 //rk[17]
+
+    //write to memory
+    str r2, [r1, #24]
+    str r3, [r1, #28]
+    str r4, [r1, #32]
+    str r5, [r1, #36]
+    str r6, [r1, #40]
+    str r7, [r1, #44]
+
+    //round 3
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r0, r8, lsl #2]
+    ldrb r9, [r0, r9, lsl #2]
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+
+    eor r2, #0x00000004 //rcon
+    eor r2, r2, r8
+    eor r2, r2, r9, lsl #8
+    eor r2, r2, r10, lsl #16
+    eor r2, r2, r11, lsl #24 //rk[18]
+    eor r3, r2 //rk[19]
+    eor r4, r3 //rk[20]
+    eor r5, r4 //rk[21]
+    eor r6, r5 //rk[22]
+    eor r7, r6 //rk[23]
+
+    //write to memory
+    str r2, [r1, #48]
+    str r3, [r1, #52]
+    str r4, [r1, #56]
+    str r5, [r1, #60]
+    str r6, [r1, #64]
+    str r7, [r1, #68]
+
+    //round 4
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r0, r8, lsl #2]
+    ldrb r9, [r0, r9, lsl #2]
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+
+    eor r2, #0x00000008 //rcon
+    eor r2, r2, r8
+    eor r2, r2, r9, lsl #8
+    eor r2, r2, r10, lsl #16
+    eor r2, r2, r11, lsl #24 //rk[24]
+    eor r3, r2 //rk[25]
+    eor r4, r3 //rk[26]
+    eor r5, r4 //rk[27]
+    eor r6, r5 //rk[28]
+    eor r7, r6 //rk[29]
+
+    //write to memory
+    str r2, [r1, #72]
+    str r3, [r1, #76]
+    str r4, [r1, #80]
+    str r5, [r1, #84]
+    str r6, [r1, #88]
+    str r7, [r1, #92]
+
+    //round 5
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r0, r8, lsl #2]
+    ldrb r9, [r0, r9, lsl #2]
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+
+    eor r2, #0x00000010 //rcon
+    eor r2, r2, r8
+    eor r2, r2, r9, lsl #8
+    eor r2, r2, r10, lsl #16
+    eor r2, r2, r11, lsl #24 //rk[30]
+    eor r3, r2 //rk[31]
+    eor r4, r3 //rk[32]
+    eor r5, r4 //rk[33]
+    eor r6, r5 //rk[34]
+    eor r7, r6 //rk[35]
+
+    //write to memory
+    str r2, [r1, #96]
+    str r3, [r1, #100]
+    str r4, [r1, #104]
+    str r5, [r1, #108]
+    str r6, [r1, #112]
+    str r7, [r1, #116]
+
+    add r1, #120 //rebase the output pointer; presumably keeps the remaining store offsets in the short encoding range - TODO confirm
+
+    //round 6
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r0, r8, lsl #2]
+    ldrb r9, [r0, r9, lsl #2]
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+
+    eor r2, #0x00000020 //rcon
+    eor r2, r2, r8
+    eor r2, r2, r9, lsl #8
+    eor r2, r2, r10, lsl #16
+    eor r2, r2, r11, lsl #24 //rk[36]
+    eor r3, r2 //rk[37]
+    eor r4, r3 //rk[38]
+    eor r5, r4 //rk[39]
+    eor r6, r5 //rk[40]
+    eor r7, r6 //rk[41]
+
+    //write to memory
+    str r2, [r1, #0]
+    str r3, [r1, #4]
+    str r4, [r1, #8]
+    str r5, [r1, #12]
+    str r6, [r1, #16]
+    str r7, [r1, #20]
+
+    //round 7
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r0, r8, lsl #2]
+    ldrb r9, [r0, r9, lsl #2]
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+
+    eor r2, #0x00000040 //rcon
+    eor r2, r2, r8
+    eor r2, r2, r9, lsl #8
+    eor r2, r2, r10, lsl #16
+    eor r2, r2, r11, lsl #24 //rk[42]
+    eor r3, r2 //rk[43]
+    eor r4, r3 //rk[44]
+    eor r5, r4 //rk[45]
+    eor r6, r5 //rk[46]
+    eor r7, r6 //rk[47]
+
+    //write to memory
+    str r2, [r1, #24]
+    str r3, [r1, #28]
+    str r4, [r1, #32]
+    str r5, [r1, #36]
+    str r6, [r1, #40]
+    str r7, [r1, #44]
+
+    //round 8 (only four more words are needed to reach rk[51])
+    uxtb r8, r7, ror #8
+    uxtb r9, r7, ror #16
+    uxtb r10, r7, ror #24
+    uxtb r11, r7
+
+    ldrb r8, [r0, r8, lsl #2]
+    ldrb r9, [r0, r9, lsl #2]
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+
+    eor r2, #0x00000080 //rcon
+    eor r2, r2, r8
+    eor r2, r2, r9, lsl #8
+    eor r2, r2, r10, lsl #16
+    eor r2, r2, r11, lsl #24 //rk[48]
+    eor r3, r2 //rk[49]
+    eor r4, r3 //rk[50]
+    eor r5, r4 //rk[51]
+    //write to memory
+    str r2, [r1, #48]
+    str r3, [r1, #52]
+    str r4, [r1, #56]
+    str r5, [r1, #60]
+
+    //function epilogue, restore state
+    pop {r4-r11}
+    bx lr
+.size aes192_keyexp_publicinputs_asm,.-aes192_keyexp_publicinputs_asm
+
+@ void aes192_encrypt_publicinputs_asm(const uint8_t *rk,
+@       const uint8_t *in, uint8_t *out) {
+.global aes192_encrypt_publicinputs_asm
+.type   aes192_encrypt_publicinputs_asm,%function
+.align 2
+aes192_encrypt_publicinputs_asm:
+    //in: r0 = rk (reads rk[0]..rk[51], 208 bytes), r1 = in (16 bytes), r2 = out (16 bytes)
+    //function prologue, preserve registers and free r2
+    push {r2,r4-r12,r14}
+
+    //load input
+    ldr r4, [r1, #0]
+    ldr r5, [r1, #4]
+    ldr r6, [r1, #8]
+    ldr r7, [r1, #12]
+    //r1 now free to overwrite
+    //load key
+    ldr r8, [r0], #4
+    ldr r9, [r0], #4
+    ldr r10, [r0], #4
+    ldr r11, [r0], #4
+    mov.w r14, r0 //r14 = address of rk[4], advanced by the round macros
+
+    //load table address once
+    ldr r12, =AES_Te0
+
+    //initial addroundkey
+    eor r4, r8
+    eor r5, r9
+    eor r6, r10
+    eor r7, r11
+
+.rept 5 //rounds 1-10
+    aesencrypt_oddround
+    aesencrypt_evenround
+.endr
+    aesencrypt_oddround //round 11
+    aesencrypt_finalround //round 12, leaves r4-r7 rotated left by 8 bits and without the last round key
+
+    //rk[48]-rk[51]
+    ldr r0, [r14, #0]
+    ldr r1, [r14, #4]
+    ldr r2, [r14, #8]
+    ldr r3, [r14, #12]
+
+    eor r0, r0, r4, ror #8 //ror #8 moves the final-round bytes into place while the key is added
+    eor r1, r1, r5, ror #8
+    eor r2, r2, r6, ror #8
+    pop.w {r4} //r4 = out pointer (the r2 saved by the prologue is on top of the stack)
+    eor r3, r3, r7, ror #8
+
+    //write output
+    str r0, [r4, #0]
+    str r1, [r4, #4]
+    str r2, [r4, #8]
+    str r3, [r4, #12]
+
+    //function epilogue, restore state
+    pop {r4-r12,r14}
+    bx lr
+.size aes192_encrypt_publicinputs_asm,.-aes192_encrypt_publicinputs_asm
+
+.section .text.aes256
+@ void aes256_keyexp_publicinputs_asm(const uint8_t *key,
+@       uint8_t *rk) {
+.global aes256_keyexp_publicinputs_asm
+.type   aes256_keyexp_publicinputs_asm,%function
+.align 2
+aes256_keyexp_publicinputs_asm:
+    //in: r0 = key (32 bytes), r1 = rk; stores rk[8]..rk[59] (208 bytes) from r1 on, the key words rk[0]..rk[7] are not stored
+    //function prologue, preserve registers
+    push {r4-r12,r14}
+
+    //load key
+    ldr r2, [r0, #0]
+    ldr r3, [r0, #4]
+    ldr r4, [r0, #8]
+    ldr r5, [r0, #12]
+    ldr r6, [r0, #16]
+    ldr r7, [r0, #20]
+    ldr r8, [r0, #24]
+    ldr r9, [r0, #28]
+
+    //load table address once
+    ldr r0, =AES_Te0 //r0 is reused, the key pointer is no longer needed
+
+    //round 1: first half uses SubWord(RotWord(rk[7])) ^ rcon, second half uses SubWord(rk[11]) without rotation or rcon
+    uxtb r10, r9, ror #8 //r10 = byte 1 of the previous word
+    uxtb r11, r9, ror #16 //r11 = byte 2
+    uxtb r12, r9, ror #24 //r12 = byte 3
+    uxtb r14, r9 //r14 = byte 0 (lr is used as scratch, it was saved in the prologue)
+
+    ldrb r10, [r0, r10, lsl #2] //byte 0 of a table word is the S-box output
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r2, #0x00000001 //rcon
+    eor r2, r2, r10
+    eor r2, r2, r11, lsl #8
+    eor r2, r2, r12, lsl #16
+    eor r2, r2, r14, lsl #24 //rk[8]
+    eor r3, r2 //rk[9]
+    eor r4, r3 //rk[10]
+    eor r5, r4 //rk[11]
+
+    uxtb r10, r5, ror #16 //second half: each byte keeps its position (no RotWord)
+    uxtb r11, r5, ror #8
+    uxtb r12, r5
+    uxtb r14, r5, ror #24
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r6, r6, r10, lsl #16
+    eor r6, r6, r11, lsl #8
+    eor r6, r12
+    eor r6, r6, r14, lsl #24 //rk[12]
+    eor r7, r6 //rk[13]
+    eor r8, r7 //rk[14]
+    eor r9, r8 //rk[15]
+
+    //write to memory
+    str r2, [r1, #0]
+    str r3, [r1, #4]
+    str r4, [r1, #8]
+    str r5, [r1, #12]
+    str r6, [r1, #16]
+    str r7, [r1, #20]
+    str r8, [r1, #24]
+    str r9, [r1, #28]
+
+    //round 2
+    uxtb r10, r9, ror #8
+    uxtb r11, r9, ror #16
+    uxtb r12, r9, ror #24
+    uxtb r14, r9
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r2, #0x00000002 //rcon
+    eor r2, r2, r10
+    eor r2, r2, r11, lsl #8
+    eor r2, r2, r12, lsl #16
+    eor r2, r2, r14, lsl #24 //rk[16]
+    eor r3, r2 //rk[17]
+    eor r4, r3 //rk[18]
+    eor r5, r4 //rk[19]
+
+    uxtb r10, r5, ror #16
+    uxtb r11, r5, ror #8
+    uxtb r12, r5
+    uxtb r14, r5, ror #24
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r6, r6, r10, lsl #16
+    eor r6, r6, r11, lsl #8
+    eor r6, r12
+    eor r6, r6, r14, lsl #24 //rk[20]
+    eor r7, r6 //rk[21]
+    eor r8, r7 //rk[22]
+    eor r9, r8 //rk[23]
+
+    //write to memory
+    str r2, [r1, #32]
+    str r3, [r1, #36]
+    str r4, [r1, #40]
+    str r5, [r1, #44]
+    str r6, [r1, #48]
+    str r7, [r1, #52]
+    str r8, [r1, #56]
+    str r9, [r1, #60]
+
+    //round 3
+    uxtb r10, r9, ror #8
+    uxtb r11, r9, ror #16
+    uxtb r12, r9, ror #24
+    uxtb r14, r9
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r2, #0x00000004 //rcon
+    eor r2, r2, r10
+    eor r2, r2, r11, lsl #8
+    eor r2, r2, r12, lsl #16
+    eor r2, r2, r14, lsl #24 //rk[24]
+    eor r3, r2 //rk[25]
+    eor r4, r3 //rk[26]
+    eor r5, r4 //rk[27]
+
+    uxtb r10, r5, ror #16
+    uxtb r11, r5, ror #8
+    uxtb r12, r5
+    uxtb r14, r5, ror #24
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r6, r6, r10, lsl #16
+    eor r6, r6, r11, lsl #8
+    eor r6, r12
+    eor r6, r6, r14, lsl #24 //rk[28]
+    eor r7, r6 //rk[29]
+    eor r8, r7 //rk[30]
+    eor r9, r8 //rk[31]
+
+    //write to memory
+    str r2, [r1, #64]
+    str r3, [r1, #68]
+    str r4, [r1, #72]
+    str r5, [r1, #76]
+    str r6, [r1, #80]
+    str r7, [r1, #84]
+    str r8, [r1, #88]
+    str r9, [r1, #92]
+
+    //round 4
+    uxtb r10, r9, ror #8
+    uxtb r11, r9, ror #16
+    uxtb r12, r9, ror #24
+    uxtb r14, r9
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r2, #0x00000008 //rcon
+    eor r2, r2, r10
+    eor r2, r2, r11, lsl #8
+    eor r2, r2, r12, lsl #16
+    eor r2, r2, r14, lsl #24 //rk[32]
+    eor r3, r2 //rk[33]
+    eor r4, r3 //rk[34]
+    eor r5, r4 //rk[35]
+
+    uxtb r10, r5, ror #16
+    uxtb r11, r5, ror #8
+    uxtb r12, r5
+    uxtb r14, r5, ror #24
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r6, r6, r10, lsl #16
+    eor r6, r6, r11, lsl #8
+    eor r6, r12
+    eor r6, r6, r14, lsl #24 //rk[36]
+    eor r7, r6 //rk[37]
+    eor r8, r7 //rk[38]
+    eor r9, r8 //rk[39]
+
+    //write to memory
+    str r2, [r1, #96]
+    str r3, [r1, #100]
+    str r4, [r1, #104]
+    str r5, [r1, #108]
+    str r6, [r1, #112]
+    str r7, [r1, #116]
+    str r8, [r1, #120]
+    str r9, [r1, #124]
+
+    add r1, #128 //rebase the output pointer; presumably keeps the remaining store offsets in the short encoding range - TODO confirm
+
+    //round 5
+    uxtb r10, r9, ror #8
+    uxtb r11, r9, ror #16
+    uxtb r12, r9, ror #24
+    uxtb r14, r9
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r2, #0x00000010 //rcon
+    eor r2, r2, r10
+    eor r2, r2, r11, lsl #8
+    eor r2, r2, r12, lsl #16
+    eor r2, r2, r14, lsl #24 //rk[40]
+    eor r3, r2 //rk[41]
+    eor r4, r3 //rk[42]
+    eor r5, r4 //rk[43]
+
+    uxtb r10, r5, ror #16
+    uxtb r11, r5, ror #8
+    uxtb r12, r5
+    uxtb r14, r5, ror #24
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r6, r6, r10, lsl #16
+    eor r6, r6, r11, lsl #8
+    eor r6, r12
+    eor r6, r6, r14, lsl #24 //rk[44]
+    eor r7, r6 //rk[45]
+    eor r8, r7 //rk[46]
+    eor r9, r8 //rk[47]
+
+    //write to memory
+    str r2, [r1, #0]
+    str r3, [r1, #4]
+    str r4, [r1, #8]
+    str r5, [r1, #12]
+    str r6, [r1, #16]
+    str r7, [r1, #20]
+    str r8, [r1, #24]
+    str r9, [r1, #28]
+
+    //round 6
+    uxtb r10, r9, ror #8
+    uxtb r11, r9, ror #16
+    uxtb r12, r9, ror #24
+    uxtb r14, r9
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r2, #0x00000020 //rcon
+    eor r2, r2, r10
+    eor r2, r2, r11, lsl #8
+    eor r2, r2, r12, lsl #16
+    eor r2, r2, r14, lsl #24 //rk[48]
+    eor r3, r2 //rk[49]
+    eor r4, r3 //rk[50]
+    eor r5, r4 //rk[51]
+
+    uxtb r10, r5, ror #16
+    uxtb r11, r5, ror #8
+    uxtb r12, r5
+    uxtb r14, r5, ror #24
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r6, r6, r10, lsl #16
+    eor r6, r6, r11, lsl #8
+    eor r6, r12
+    eor r6, r6, r14, lsl #24 //rk[52]
+    eor r7, r6 //rk[53]
+    eor r8, r7 //rk[54]
+    eor r9, r8 //rk[55]
+
+    //write to memory
+    str r2, [r1, #32]
+    str r3, [r1, #36]
+    str r4, [r1, #40]
+    str r5, [r1, #44]
+    str r6, [r1, #48]
+    str r7, [r1, #52]
+    str r8, [r1, #56]
+    str r9, [r1, #60]
+
+    //round 7 (only the first half is needed to reach rk[59])
+    uxtb r10, r9, ror #8
+    uxtb r11, r9, ror #16
+    uxtb r12, r9, ror #24
+    uxtb r14, r9
+
+    ldrb r10, [r0, r10, lsl #2]
+    ldrb r11, [r0, r11, lsl #2]
+    ldrb r12, [r0, r12, lsl #2]
+    ldrb r14, [r0, r14, lsl #2]
+
+    eor r2, #0x00000040 //rcon
+    eor r2, r2, r10
+    eor r2, r2, r11, lsl #8
+    eor r2, r2, r12, lsl #16
+    eor r2, r2, r14, lsl #24 //rk[56]
+    eor r3, r2 //rk[57]
+    eor r4, r3 //rk[58]
+    eor r5, r4 //rk[59]
+
+    //write to memory
+    str r2, [r1, #64]
+    str r3, [r1, #68]
+    str r4, [r1, #72]
+    str r5, [r1, #76]
+
+    //function epilogue, restore state
+    pop {r4-r12,r14}
+    bx lr
+.size aes256_keyexp_publicinputs_asm,.-aes256_keyexp_publicinputs_asm
+
+.align 2
+.ltorg @ emit the literal pool (AES_Te0 address) here; presumably the end of the section would be out of range for the load above - TODO confirm
+
+@ void aes256_encrypt_publicinputs_asm(const uint8_t *rk,
+@       const uint8_t *in, uint8_t *out) {
+.global aes256_encrypt_publicinputs_asm
+.type   aes256_encrypt_publicinputs_asm,%function
+.align 2
+aes256_encrypt_publicinputs_asm:
+    //in: r0 = rk (reads rk[0]..rk[59], 240 bytes), r1 = in (16 bytes), r2 = out (16 bytes)
+    //function prologue, preserve registers and free r2
+    push {r2,r4-r12,r14}
+
+    //load input
+    ldr r4, [r1, #0]
+    ldr r5, [r1, #4]
+    ldr r6, [r1, #8]
+    ldr r7, [r1, #12]
+    //r1 now free to overwrite
+    //load key
+    ldr r8, [r0], #4
+    ldr r9, [r0], #4
+    ldr r10, [r0], #4
+    ldr r11, [r0], #4
+    mov.w r14, r0 //r14 = address of rk[4], advanced by the round macros
+
+    //load table address once
+    ldr r12, =AES_Te0
+
+    //initial addroundkey
+    eor r4, r8
+    eor r5, r9
+    eor r6, r10
+    eor r7, r11
+
+.rept 6 //rounds 1-12
+    aesencrypt_oddround
+    aesencrypt_evenround
+.endr
+    aesencrypt_oddround //round 13
+    aesencrypt_finalround //round 14, leaves r4-r7 rotated left by 8 bits and without the last round key
+
+    //rk[56]-rk[59]
+    ldr r0, [r14, #0]
+    ldr r1, [r14, #4]
+    ldr r2, [r14, #8]
+    ldr r3, [r14, #12]
+
+    eor r0, r0, r4, ror #8 //ror #8 moves the final-round bytes into place while the key is added
+    eor r1, r1, r5, ror #8
+    eor r2, r2, r6, ror #8
+    pop.w {r4} //r4 = out pointer (the r2 saved by the prologue is on top of the stack)
+    eor r3, r3, r7, ror #8
+
+    //write output
+    str r0, [r4, #0]
+    str r1, [r4, #4]
+    str r2, [r4, #8]
+    str r3, [r4, #12]
+
+    //function epilogue, restore state
+    pop {r4-r12,r14}
+    bx lr
+.size aes256_encrypt_publicinputs_asm,.-aes256_encrypt_publicinputs_asm
diff --git a/common/aes-publicinputs.c b/common/aes-publicinputs.c
new file mode 100644
index 0000000..b216562
--- /dev/null
+++ b/common/aes-publicinputs.c
@@ -0,0 +1,259 @@
+/*
+ * AES implementation based on code from BearSSL (https://bearssl.org/)
+ * by Thomas Pornin.
+ *
+ *
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "aes-publicinputs.h"
+
+#ifdef PROFILE_HASHING
+#include "hal.h"
+extern unsigned long long hash_cycles;
+#endif
+
+extern void aes128_keyexp_publicinputs_asm(const uint8_t *key, uint8_t *rk);
+extern void aes192_keyexp_publicinputs_asm(const uint8_t *key, uint8_t *rk);
+extern void aes256_keyexp_publicinputs_asm(const uint8_t *key, uint8_t *rk);
+extern void aes128_encrypt_publicinputs_asm(const uint8_t *rk, const uint8_t *in, uint8_t *out);
+extern void aes192_encrypt_publicinputs_asm(const uint8_t *rk, const uint8_t *in, uint8_t *out);
+extern void aes256_encrypt_publicinputs_asm(const uint8_t *rk, const uint8_t *in, uint8_t *out);
+
+/* Reverse the byte order of a 32-bit word (0xAABBCCDD -> 0xDDCCBBAA). */
+static inline uint32_t br_swap32(uint32_t x) {
+    /* Move each byte to its mirrored position and OR the four lanes together. */
+    return (x << 24) | ((x & (uint32_t)0x0000FF00) << 8)
+        | ((x >> 8) & (uint32_t)0x0000FF00) | (x >> 24);
+}
+
+/* Increment by one the 32-bit counter that *x holds in byte-swapped form. */
+static inline void inc1_be(uint32_t *x) {
+    /* un-swap, add 1 (wraps modulo 2^32), swap back */
+    *x = br_swap32(br_swap32(*x) + 1);
+}
+
+/* Encrypt nblocks consecutive AES_BLOCKBYTES-sized blocks from in to out, one aes_encrypt_asm call per block. */
+static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const uint64_t *rkeys, void (*aes_encrypt_asm)(const uint8_t *, const uint8_t *, uint8_t *)) {
+    size_t i; /* same type as nblocks, so the loop bound cannot wrap where size_t is wider than unsigned int */
+    for (i = 0; i < nblocks; ++i) {
+        aes_encrypt_asm((const uint8_t *)rkeys, in, out); /* keep const: the routine takes a const key pointer */
+        in += AES_BLOCKBYTES;
+        out += AES_BLOCKBYTES;
+    }
+}
+
+/* Write outlen bytes of AES-CTR keystream to out: 12-byte nonce from iv, 32-bit counter word starting at 0. */
+static void aes_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const uint64_t *rkeys, void (*aes_encrypt_asm)(const uint8_t *, const uint8_t *, uint8_t *)) {
+    const uint8_t *round_keys = (const uint8_t *)rkeys;
+    uint32_t block[4] = {0};          /* words 0..2: nonce, word 3: block counter */
+    uint8_t tail[AES_BLOCKBYTES];
+
+    memcpy(block, iv, AESCTR_NONCEBYTES);
+
+    /* Blocks before the last go straight to the caller's buffer; the final 1..16 bytes are handled below. */
+    for (; outlen > AES_BLOCKBYTES; outlen -= AES_BLOCKBYTES) {
+        aes_encrypt_asm(round_keys, (const uint8_t *)block, out);
+        inc1_be(&block[3]);
+        out += AES_BLOCKBYTES;
+    }
+    if (outlen == 0) {
+        return;
+    }
+    /* Encrypt into scratch so that no more than outlen bytes of out are written. */
+    aes_encrypt_asm(round_keys, (const uint8_t *)block, tail);
+    memcpy(out, tail, outlen);
+}
+
+/* Fill r->sk_exp for AES-128: the raw key first, then the output of the assembly key expansion. */
+static void aes128_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    memcpy((uint8_t *)r->sk_exp, key, AES128_KEYBYTES); /* the first AES128_KEYBYTES bytes are the key itself */
+    aes128_keyexp_publicinputs_asm(key, ((uint8_t *)r->sk_exp) + AES128_KEYBYTES); /* asm output goes right after the key */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* with PROFILE_HASHING, add the elapsed hal_get_time() delta to the global counter */
+#endif
+}
+
+void aes128_ecb_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key) {
+    aes128_keyexp_publicinputs(r, key); /* ECB context setup is just the shared AES-128 key expansion */
+}
+
+void aes128_ctr_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key) {
+    aes128_keyexp_publicinputs(r, key); /* CTR context setup is just the shared AES-128 key expansion */
+}
+
+static void aes192_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    memcpy((uint8_t *)r->sk_exp, key, AES192_KEYBYTES); /* r->sk_exp starts with the raw AES-192 key ... */
+    aes192_keyexp_publicinputs_asm(key, ((uint8_t *)r->sk_exp) + AES192_KEYBYTES); /* ... followed by the asm key expansion output */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* with PROFILE_HASHING, add the elapsed hal_get_time() delta to the global counter */
+#endif
+}
+
+
+void aes192_ecb_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key) {
+    aes192_keyexp_publicinputs(r, key); /* ECB context setup is just the shared AES-192 key expansion */
+}
+
+void aes192_ctr_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key) {
+    aes192_keyexp_publicinputs(r, key); /* CTR context setup is just the shared AES-192 key expansion */
+}
+
+/* Fill r->sk_exp for AES-256: the raw key first, then the output of the assembly key expansion. */
+static void aes256_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    memcpy((uint8_t *)r->sk_exp, key, AES256_KEYBYTES); /* the first AES256_KEYBYTES bytes are the key itself */
+    aes256_keyexp_publicinputs_asm(key, ((uint8_t *)r->sk_exp) + AES256_KEYBYTES); /* asm output goes right after the key */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* with PROFILE_HASHING, add the elapsed hal_get_time() delta to the global counter */
+#endif
+}
+
+void aes256_ecb_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key) {
+    aes256_keyexp_publicinputs(r, key); /* ECB context setup is just the shared AES-256 key expansion */
+}
+
+void aes256_ctr_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key) {
+    aes256_keyexp_publicinputs(r, key); /* CTR context setup is just the shared AES-256 key expansion */
+}
+
+/* AES-128 ECB: encrypt nblocks AES_BLOCKBYTES-sized blocks from in to out using the expanded key in ctx. */
+void aes128_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ecb(out, in, nblocks, ctx->sk_exp, aes128_encrypt_publicinputs_asm); /* shared ECB loop with the AES-128 block routine */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+#endif
+}
+
+void aes128_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ctr(out, outlen, iv, ctx->sk_exp, aes128_encrypt_publicinputs_asm); /* outlen bytes of AES-128-CTR keystream; iv supplies the AESCTR_NONCEBYTES nonce */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+#endif
+}
+
+void aes192_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ecb(out, in, nblocks, ctx->sk_exp, aes192_encrypt_publicinputs_asm); /* AES-192 ECB over nblocks AES_BLOCKBYTES-sized blocks */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+#endif
+}
+
+void aes192_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ctr(out, outlen, iv, ctx->sk_exp, aes192_encrypt_publicinputs_asm); /* outlen bytes of AES-192-CTR keystream; iv supplies the AESCTR_NONCEBYTES nonce */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+#endif
+}
+
+void aes256_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ecb(out, in, nblocks, ctx->sk_exp, aes256_encrypt_publicinputs_asm); /* AES-256 ECB over nblocks AES_BLOCKBYTES-sized blocks */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+#endif
+}
+
+void aes256_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx_publicinputs *ctx) {
+#ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+#endif
+
+    aes_ctr(out, outlen, iv, ctx->sk_exp, aes256_encrypt_publicinputs_asm); /* outlen bytes of AES-256-CTR keystream; iv supplies the AESCTR_NONCEBYTES nonce */
+
+#ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+#endif
+}
+
+void aes128_ctx_release_publicinputs(aes128ctx_publicinputs *r) {
+    // no-op for mupq's basic AES operation
+    // this is required for compatibility with code from PQClean
+    // see https://github.com/PQClean/PQClean/pull/198
+    (void) r; // the context is left untouched; the cast only marks r as used
+}
+
+void aes192_ctx_release_publicinputs(aes192ctx_publicinputs *r) {
+    // no-op for mupq's basic AES operation
+    // this is required for compatibility with code from PQClean
+    // see https://github.com/PQClean/PQClean/pull/198
+    (void) r; // the context is left untouched; the cast only marks r as used
+}
+
+void aes256_ctx_release_publicinputs(aes256ctx_publicinputs *r) {
+    // no-op for mupq's basic AES operation
+    // this is required for compatibility with code from PQClean
+    // see https://github.com/PQClean/PQClean/pull/198
+    (void) r; // the context is left untouched; the cast only marks r as used
+}
+
diff --git a/common/aes-publicinputs.h b/common/aes-publicinputs.h
new file mode 100644
index 0000000..13fbadd
--- /dev/null
+++ b/common/aes-publicinputs.h
@@ -0,0 +1,62 @@
+#ifndef AES_PUBLICINPUTS_H
+#define AES_PUBLICINPUTS_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define AES128_KEYBYTES 16
+#define AES192_KEYBYTES 24
+#define AES256_KEYBYTES 32
+#define AESCTR_NONCEBYTES 12
+#define AES_BLOCKBYTES 16
+
+typedef struct {
+    uint64_t sk_exp[88]; /* expanded-key storage: aes-publicinputs.c stores the raw key followed by the asm key-expansion output */
+} aes128ctx_publicinputs;
+
+typedef struct {
+    uint64_t sk_exp[104]; /* expanded-key storage: aes-publicinputs.c stores the raw key followed by the asm key-expansion output */
+} aes192ctx_publicinputs;
+
+typedef struct {
+    uint64_t sk_exp[120]; /* expanded-key storage: aes-publicinputs.c stores the raw key followed by the asm key-expansion output */
+} aes256ctx_publicinputs;
+
+
+
+/** Initializes the context **/
+void aes128_ecb_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key);
+
+void aes128_ctr_keyexp_publicinputs(aes128ctx_publicinputs *r, const unsigned char *key);
+
+void aes128_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx_publicinputs *ctx);
+
+void aes128_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx_publicinputs *ctx);
+
+void aes128_ctx_release_publicinputs(aes128ctx_publicinputs *r);
+
+/** Initializes the context **/
+void aes192_ecb_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key);
+
+void aes192_ctr_keyexp_publicinputs(aes192ctx_publicinputs *r, const unsigned char *key);
+
+void aes192_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes192ctx_publicinputs *ctx);
+
+void aes192_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes192ctx_publicinputs *ctx);
+
+void aes192_ctx_release_publicinputs(aes192ctx_publicinputs *r);
+
+
+/** Initializes the context **/
+void aes256_ecb_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key);
+
+void aes256_ctr_keyexp_publicinputs(aes256ctx_publicinputs *r, const unsigned char *key);
+
+void aes256_ecb_publicinputs(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx_publicinputs *ctx);
+
+void aes256_ctr_publicinputs(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx_publicinputs *ctx);
+
+void aes256_ctx_release_publicinputs(aes256ctx_publicinputs *r);
+
+
+#endif
diff --git a/common/aes.c b/common/aes.c
new file mode 100644
index 0000000..ff84540
--- /dev/null
+++ b/common/aes.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+#include <stdint.h>
+#include <string.h>
+#include "aes.h"
+
+#ifdef PROFILE_HASHING
+#include "hal.h"
+extern unsigned long long hash_cycles;
+#endif
+
+extern void aes128_keyschedule_ffs(uint32_t* rkeys, const uint8_t* key);
+extern void aes256_keyschedule_ffs(uint32_t* rkeys, const uint8_t* key);
+
+extern void aes256_encrypt_ffs(uint8_t* ctext, uint8_t* ctext_bis, const uint8_t* ptext,
+                   const uint8_t* ptext_bis, const uint32_t* rkey);
+
+extern void aes128_encrypt_ffs(uint8_t* ctext, uint8_t* ctext_bis, const uint8_t* ptext,
+                   const uint8_t* ptext_bis, const uint32_t* rkey);
+
+/* Reverse the byte order of a 32-bit word (0xAABBCCDD -> 0xDDCCBBAA). */
+static inline uint32_t br_swap32(uint32_t x) {
+    /* Move each byte to its mirrored position and OR the four lanes together. */
+    return (x << 24) | ((x & (uint32_t)0x0000FF00) << 8)
+        | ((x >> 8) & (uint32_t)0x0000FF00) | (x >> 24);
+}
+
+static inline void inc1_be(uint32_t *x) {
+    /* byte-swapped counter word: un-swap, add 1 (wraps modulo 2^32), swap back */
+    *x = br_swap32(br_swap32(*x) + 1);
+}
+
+static inline void inc2_be(uint32_t *x) {
+    /* byte-swapped counter word: un-swap, add 2 (wraps modulo 2^32), swap back */
+    *x = br_swap32(br_swap32(*x) + 2);
+}
+
+void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+
+    aes128_keyschedule_ffs(r->sk_exp, key); /* the assembly key schedule fills r->sk_exp from the AES-128 key */
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+    #endif
+}
+
+void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks,
+                const aes128ctx *ctx){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    uint8_t buf0[AES_BLOCKBYTES], buf1[AES_BLOCKBYTES] = {0}; /* buf1 is zeroed: it is read as the dummy second plaintext */
+    /* AES-128 ECB over nblocks blocks; the assembly core always encrypts two blocks per call. */
+    while(nblocks > 0){
+        if(nblocks >= 2){
+            aes128_encrypt_ffs(out, out+AES_BLOCKBYTES, in, in+AES_BLOCKBYTES, ctx->sk_exp);
+            out += AES_BLOCKBYTES*2;
+            in += AES_BLOCKBYTES*2;
+            nblocks -= 2;
+        } else {
+            aes128_encrypt_ffs(out, buf0, in, buf1, ctx->sk_exp); /* odd tail: second lane runs on scratch, its output is discarded */
+            nblocks--;
+        }
+    }
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+    #endif
+}
+
+void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+
+    aes128_keyschedule_ffs(r->sk_exp, key); /* same assembly key schedule call as aes128_ecb_keyexp */
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+    #endif
+}
+
+/*
+ * Write outlen bytes of AES-128-CTR keystream to out.
+ * The counter block is the 12-byte nonce from iv followed by a 32-bit
+ * counter word that starts at 0; the assembly core encrypts two
+ * consecutive counter blocks per call.
+ */
+void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv,
+                const aes128ctx *ctx){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    uint32_t ctr_even[4] = {0};       /* nonce | counter 0, 2, 4, ... */
+    uint32_t ctr_odd[4] = {0};        /* nonce | counter 1, 3, 5, ... */
+    uint8_t first[AES_BLOCKBYTES];    /* scratch for the even-counter block */
+    uint8_t second[AES_BLOCKBYTES];   /* scratch for the odd-counter block */
+
+    memcpy(ctr_even, iv, AESCTR_NONCEBYTES);
+    memcpy(ctr_odd, iv, AESCTR_NONCEBYTES);
+    inc1_be(&ctr_odd[3]);
+
+    /* Bulk phase: while more than two blocks remain, both outputs of a
+     * call land directly in the caller's buffer. */
+    for (; outlen > 2*AES_BLOCKBYTES; outlen -= 2*AES_BLOCKBYTES) {
+        aes128_encrypt_ffs(out, out+AES_BLOCKBYTES, (uint8_t *)ctr_even, (uint8_t *)ctr_odd, ctx->sk_exp);
+        inc2_be(&ctr_even[3]);
+        inc2_be(&ctr_odd[3]);
+        out += 2*AES_BLOCKBYTES;
+    }
+
+    if (outlen >= AES_BLOCKBYTES) {
+        /* 16..32 bytes left: the first block is written in place, the rest comes from scratch. */
+        aes128_encrypt_ffs(out, second, (uint8_t *)ctr_even, (uint8_t *)ctr_odd, ctx->sk_exp);
+        memcpy(out + AES_BLOCKBYTES, second, outlen - AES_BLOCKBYTES);
+    } else if (outlen > 0) {
+        /* 1..15 bytes left: encrypt into scratch and copy only what was requested. */
+        aes128_encrypt_ffs(first, second, (uint8_t *)ctr_even, (uint8_t *)ctr_odd, ctx->sk_exp);
+        memcpy(out, first, outlen);
+    }
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+void aes128_ctx_release(aes128ctx *r){
+    (void) r; /* no-op: the context is left untouched; the cast only marks r as used */
+}
+
+
+void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+
+    aes256_keyschedule_ffs(r->sk_exp, key); /* the assembly key schedule fills r->sk_exp from the AES-256 key */
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+    #endif
+}
+
+void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    /* AES-256 ECB over nblocks blocks; the assembly core always encrypts two blocks per call. */
+    uint8_t buf0[AES_BLOCKBYTES], buf1[AES_BLOCKBYTES] = {0}; /* buf1 is zeroed: it is read as the dummy second plaintext */
+
+    while(nblocks > 0){
+        if(nblocks >= 2){
+            aes256_encrypt_ffs(out, out+AES_BLOCKBYTES, in, in+AES_BLOCKBYTES, ctx->sk_exp);
+            out += 2*AES_BLOCKBYTES;
+            in += 2*AES_BLOCKBYTES;
+            nblocks -= 2;
+        } else {
+            aes256_encrypt_ffs(out, buf0, in, buf1, ctx->sk_exp); /* odd tail: second lane runs on scratch, its output is discarded */
+            nblocks--;
+        }
+    }
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+    #endif
+}
+
+void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key){
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    aes256_keyschedule_ffs(r->sk_exp, key); /* same assembly key schedule call as aes256_ecb_keyexp */
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0); /* account the time spent here to hash_cycles */
+    #endif
+}
+
+void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv,
+                const aes256ctx *ctx){
+    /*
+     * Write outlen bytes of AES-256-CTR keystream to out. The counter block is
+     * the 12-byte nonce from iv followed by a 32-bit counter word that starts
+     * at 0; the assembly core encrypts two consecutive counter blocks per call.
+     */
+    #ifdef PROFILE_HASHING
+    uint64_t t0 = hal_get_time();
+    #endif
+    uint32_t ctr_even[4] = {0};       /* nonce | counter 0, 2, 4, ... */
+    uint32_t ctr_odd[4] = {0};        /* nonce | counter 1, 3, 5, ... */
+    uint8_t first[AES_BLOCKBYTES];    /* scratch for the even-counter block */
+    uint8_t second[AES_BLOCKBYTES];   /* scratch for the odd-counter block */
+
+    memcpy(ctr_even, iv, AESCTR_NONCEBYTES);
+    memcpy(ctr_odd, iv, AESCTR_NONCEBYTES);
+    inc1_be(&ctr_odd[3]);
+
+    /* Bulk phase: while more than two blocks remain, both outputs land directly in out. */
+    for (; outlen > 2*AES_BLOCKBYTES; outlen -= 2*AES_BLOCKBYTES) {
+        aes256_encrypt_ffs(out, out+AES_BLOCKBYTES, (uint8_t *)ctr_even, (uint8_t *)ctr_odd, ctx->sk_exp);
+        inc2_be(&ctr_even[3]);
+        inc2_be(&ctr_odd[3]);
+        out += 2*AES_BLOCKBYTES;
+    }
+
+    if (outlen >= AES_BLOCKBYTES) {
+        /* 16..32 bytes left: the first block is written in place, the rest comes from scratch. */
+        aes256_encrypt_ffs(out, second, (uint8_t *)ctr_even, (uint8_t *)ctr_odd, ctx->sk_exp);
+        memcpy(out + AES_BLOCKBYTES, second, outlen - AES_BLOCKBYTES);
+    } else if (outlen > 0) {
+        /* 1..15 bytes left: encrypt into scratch and copy only what was requested. */
+        aes256_encrypt_ffs(first, second, (uint8_t *)ctr_even, (uint8_t *)ctr_odd, ctx->sk_exp);
+        memcpy(out, first, outlen);
+    }
+
+    #ifdef PROFILE_HASHING
+    uint64_t t1 = hal_get_time();
+    hash_cycles += (t1-t0);
+    #endif
+}
+
+void aes256_ctx_release(aes256ctx *r){
+    (void) r; /* no-op: the context is left untouched; the cast only marks r as used */
+}
\ No newline at end of file
diff --git a/common/aes.h b/common/aes.h
new file mode 100644
index 0000000..49e9433
--- /dev/null
+++ b/common/aes.h
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+#ifndef AES_H
+#define AES_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define AES128_KEYBYTES 16
+#define AES192_KEYBYTES 24
+#define AES256_KEYBYTES 32
+#define AESCTR_NONCEBYTES 12
+#define AES_BLOCKBYTES 16
+
+typedef struct {
+    uint32_t sk_exp[2*11*AES_BLOCKBYTES/sizeof(uint32_t)]; /* 2*11*16 bytes = 88 words; filled by aes128_keyschedule_ffs in aes.c */
+} aes128ctx;
+
+
+typedef struct {
+    uint32_t sk_exp[2*15*AES_BLOCKBYTES/sizeof(uint32_t)]; /* 2*15*16 bytes = 120 words; filled by aes256_keyschedule_ffs in aes.c */
+} aes256ctx;
+
+
+
+/** Initializes the context **/
+void aes128_ecb_keyexp(aes128ctx *r, const unsigned char *key);
+
+void aes128_ctr_keyexp(aes128ctx *r, const unsigned char *key);
+
+void aes128_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes128ctx *ctx);
+
+void aes128_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes128ctx *ctx);
+
+void aes128_ctx_release(aes128ctx *r);
+
+
+/** Initializes the context **/
+void aes256_ecb_keyexp(aes256ctx *r, const unsigned char *key);
+
+void aes256_ctr_keyexp(aes256ctx *r, const unsigned char *key);
+
+void aes256_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, const aes256ctx *ctx);
+
+void aes256_ctr(unsigned char *out, size_t outlen, const unsigned char *iv, const aes256ctx *ctx);
+
+void aes256_ctx_release(aes256ctx *r);
+
+
+#endif
diff --git a/common/aestest.c b/common/aestest.c
new file mode 100644
index 0000000..bfce91f
--- /dev/null
+++ b/common/aestest.c
@@ -0,0 +1,185 @@
+#include "randombytes.h"
+#include <hal.h>
+#include <aes.h>
+#include <sendfn.h>
+#include <randombytes.h>
+#include <string.h>
+
+#include <stdio.h>
+
+
+#include <stdio.h>
+#include <string.h>
+
+const unsigned char msg[48] = "The quick brown fox jumps over the lazy dog!!!!"; /* 47 characters + NUL = three 16-byte blocks; ECB plaintext in test() */
+
+const unsigned char stream128[67] = { /* expected aes128_ctr output in test() for key and nonce below */
+  0x36, 0x88, 0x7b, 0x28, 0x99, 0x8f, 0x4d, 0x2b, 0x37, 0xff, 0x06, 0x63, 0xfc, 0x5c, 0xef, 0x2f,
+  0x43, 0xeb, 0xeb, 0x7e, 0xc1, 0x58, 0xe4, 0xb4, 0x27, 0x78, 0x2e, 0xa7, 0x90, 0x00, 0x09, 0x91,
+  0xcd, 0x0e, 0x40, 0x18, 0x8b, 0x2b, 0x52, 0x1e, 0x8d, 0xfb, 0x0e, 0x7b, 0x80, 0xe7, 0xb6, 0xd4,
+  0xba, 0x48, 0xf5, 0x19, 0xb4, 0xbf, 0xbf, 0xb1, 0x52, 0x6f, 0x12, 0xe2, 0x7b, 0x61, 0x90, 0x0d,
+  0x95, 0x83, 0x84};
+
+const unsigned char stream192[67] = { /* not referenced by test() or bench() in this file */
+  0xb0, 0x13, 0x08, 0x01, 0xfa, 0x88, 0x28, 0x98, 0xac, 0x77, 0x72, 0xaf, 0x2a, 0x8d, 0x17, 0xf3,
+  0xba, 0x37, 0xd6, 0x8e, 0x01, 0x5b, 0x25, 0xb5, 0x51, 0x40, 0xf9, 0x55, 0x73, 0x89, 0xbe, 0xd3,
+  0x5f, 0x3c, 0xde, 0xc2, 0x6b, 0xf6, 0xfc, 0x26, 0x97, 0x22, 0xfa, 0x8e, 0xcb, 0x62, 0xe8, 0xa5,
+  0x03, 0xd2, 0x8a, 0x7e, 0xaf, 0xe3, 0xe5, 0x63, 0x2e, 0xb8, 0x48, 0x20, 0x1c, 0x48, 0xd5, 0xc2,
+  0x27, 0x1e, 0x1e};
+
+const unsigned char stream256[67] = { /* expected aes256_ctr output in test() for key and nonce below */
+  0x12, 0x39, 0x28, 0xd0, 0xda, 0xd1, 0xfd, 0xe7, 0x64, 0x74, 0x10, 0x5a, 0x29, 0x17, 0x3c, 0x62,
+  0x05, 0xde, 0x4c, 0x98, 0x22, 0x9b, 0xad, 0x16, 0x32, 0x75, 0xbf, 0x26, 0xe5, 0x84, 0x7e, 0xc8,
+  0x4e, 0x73, 0x68, 0xce, 0x9a, 0x11, 0xb6, 0x55, 0x53, 0x05, 0x39, 0xa1, 0xa7, 0x1f, 0x16, 0x55,
+  0x4a, 0xd3, 0x6c, 0xc6, 0x2c, 0xb4, 0x55, 0x9f, 0x5f, 0xa3, 0xe8, 0x39, 0xfa, 0x9d, 0x96, 0xb6,
+  0xb7, 0xc9, 0xc5};
+
+const unsigned char ct128[48] = { /* expected aes128_ecb output for msg in test() */
+0x10, 0xdc, 0x43, 0x2b, 0x15, 0x11, 0x81, 0x36, 0x3f, 0x00, 0x51, 0x74, 0x81, 0x7c, 0x22, 0x87,
+0x3a, 0x3b, 0xfe, 0xd7, 0xb9, 0xa6, 0xf2, 0x3c, 0x81, 0x00, 0x63, 0xef, 0xe5, 0xb8, 0xbd, 0x36,
+0x11, 0xcc, 0xc9, 0xdf, 0x2b, 0xea, 0xbc, 0xe6, 0x11, 0x1c, 0x34, 0x79, 0xf9, 0x6b, 0x47, 0x7b};
+
+const unsigned char ct192[48] = { /* not referenced by test() or bench() in this file */
+0x63, 0xc6, 0xde, 0x28, 0x36, 0xb4, 0x29, 0xbf, 0xbe, 0x9d, 0x15, 0x8e, 0x83, 0x04, 0xa3, 0x18,
+0x34, 0x79, 0xe8, 0x02, 0x8a, 0x34, 0x50, 0x7c, 0xa9, 0x08, 0x48, 0x47, 0xee, 0x90, 0x79, 0x13,
+0x66, 0x2d, 0xa4, 0xf1, 0x3e, 0x8b, 0x76, 0xa9, 0x50, 0xf7, 0x6e, 0xa8, 0xbf, 0x29, 0xaf, 0x84};
+
+const unsigned char ct256[48] = { /* expected aes256_ecb output for msg in test() */
+0xdb, 0xb5, 0x44, 0x70, 0x68, 0xe6, 0xad, 0x6a, 0x09, 0xdf, 0xa6, 0xef, 0x85, 0x73, 0xff, 0xc0,
+0xc2, 0x91, 0x38, 0xbd, 0xd7, 0xd0, 0x22, 0x7e, 0x79, 0x71, 0xa1, 0x98, 0x6f, 0xd5, 0x80, 0xa8,
+0x1e, 0x97, 0xd7, 0x6d, 0xd2, 0x6b, 0x0e, 0x7b, 0x79, 0x76, 0x75, 0x86, 0xa5, 0x2f, 0x76, 0x0b};
+
+const unsigned char key[AES256_KEYBYTES] = { /* passed to both the AES-128 and AES-256 key expansions in test() and bench() */
+  0x66, 0xd9, 0xb7, 0x60, 0x0e, 0xda, 0xaa, 0x81, 0x42, 0xa2, 0xd6, 0x3d, 0x8f, 0x51, 0x6c, 0x6f,
+  0xb6, 0xdf, 0x5b, 0x97, 0xf3, 0xf1, 0xf7, 0x0e, 0xeb, 0xe0, 0x40, 0x4d, 0xc5, 0x24, 0xa1, 0xfa};
+const unsigned char nonce[AESCTR_NONCEBYTES] = { /* CTR nonce used by test() and bench() */
+  0x9d, 0x2d, 0x3e, 0x6e, 0x48, 0x5c, 0xf6, 0x6b, 0xb2, 0xb9, 0x25, 0xf4};
+
+static int test(void)
+{
+  unsigned char ct[67];   /* holds the 67-byte CTR streams and the 48-byte ECB outputs */
+  int failed = 0;         /* becomes 1 as soon as any comparison fails */
+  aes128ctx ctx128_ecb, ctx128_ctr;
+  aes256ctx ctx256_ecb, ctx256_ctr;
+
+  aes128_ecb_keyexp(&ctx128_ecb, key);
+  aes256_ecb_keyexp(&ctx256_ecb, key);
+  aes128_ctr_keyexp(&ctx128_ctr, key);
+  aes256_ctr_keyexp(&ctx256_ctr, key);
+
+  aes128_ctr(ct, sizeof stream128, nonce, &ctx128_ctr);
+  if(memcmp(ct, stream128, sizeof stream128) != 0) {
+    hal_send_str("ERROR AES128CTR output does not match test vector.\n");
+    failed = 1;
+  }
+
+  aes256_ctr(ct, sizeof stream256, nonce, &ctx256_ctr);
+  if(memcmp(ct, stream256, sizeof stream256) != 0) {
+    hal_send_str("ERROR AES256CTR output does not match test vector.\n");
+    failed = 1;
+  }
+
+  /* ECB: msg is exactly three blocks, so sizeof ct128 bytes are produced. */
+  aes128_ecb(ct, msg, sizeof msg / AES_BLOCKBYTES, &ctx128_ecb);
+  if(memcmp(ct, ct128, sizeof ct128) != 0) {
+    hal_send_str("ERROR AES128ECB output does not match test vector.\n");
+    failed = 1;
+  }
+
+  /* Same message again with the AES-256 context. */
+  aes256_ecb(ct, msg, sizeof msg / AES_BLOCKBYTES, &ctx256_ecb);
+  if(memcmp(ct, ct256, sizeof ct256) != 0) {
+    hal_send_str("ERROR AES256ECB output does not match test vector.\n");
+    failed = 1;
+  }
+
+  aes128_ctx_release(&ctx128_ecb);
+  aes256_ctx_release(&ctx256_ecb);
+  aes128_ctx_release(&ctx128_ctr);
+  aes256_ctx_release(&ctx256_ctr);
+
+  return failed;
+}
+
+/* Print cycle counts for the four key expansions and for ECB/CTR over 16 .. sizeof pt bytes (doubling each step). */
+static void bench(void)
+{
+  char str[100];                   /* every message is formatted with snprintf bounded by sizeof str */
+  unsigned char ct[1024*32];
+  unsigned char pt[1024*32] = {0}; /* defined plaintext instead of indeterminate stack bytes */
+  uint64_t t0, t1;
+  aes128ctx ctx128_ecb, ctx128_ctr;
+  aes256ctx ctx256_ecb, ctx256_ctr;
+  hal_send_str("-");
+  t0 = hal_get_time();
+  aes128_ecb_keyexp(&ctx128_ecb, key);
+  t1 = hal_get_time();
+  snprintf(str, sizeof str, "aes128_ecb_keyexp: %llu cycles", (unsigned long long)(t1-t0));
+  hal_send_str(str);
+
+  t0 = hal_get_time();
+  aes256_ecb_keyexp(&ctx256_ecb, key);
+  t1 = hal_get_time();
+  snprintf(str, sizeof str, "aes256_ecb_keyexp: %llu cycles", (unsigned long long)(t1-t0));
+  hal_send_str(str);
+
+  t0 = hal_get_time();
+  aes128_ctr_keyexp(&ctx128_ctr, key);
+  t1 = hal_get_time();
+  snprintf(str, sizeof str, "aes128_ctr_keyexp: %llu cycles", (unsigned long long)(t1-t0));
+  hal_send_str(str);
+
+  t0 = hal_get_time();
+  aes256_ctr_keyexp(&ctx256_ctr, key);
+  t1 = hal_get_time();
+  snprintf(str, sizeof str, "aes256_ctr_keyexp: %llu cycles", (unsigned long long)(t1-t0));
+  hal_send_str(str);
+
+  hal_send_str("-");
+  for(size_t blocks=1; blocks <= sizeof pt/16; blocks <<= 1){
+    t0 = hal_get_time();
+    aes128_ecb(ct, pt, blocks, &ctx128_ecb);
+    t1 = hal_get_time();
+    snprintf(str, sizeof str, "aes128_ecb: %llu cycles for %u bytes (%.2f cycles/byte) -- w/o key expansion", (unsigned long long)(t1-t0), (unsigned)(blocks*AES_BLOCKBYTES), (double)(t1-t0)/(blocks*AES_BLOCKBYTES));
+    hal_send_str(str);
+  }
+  hal_send_str("-");
+  for(size_t blocks=1; blocks <= sizeof pt/16; blocks <<= 1){
+    t0 = hal_get_time();
+    aes256_ecb(ct, pt, blocks, &ctx256_ecb);
+    t1 = hal_get_time();
+    snprintf(str, sizeof str, "aes256_ecb: %llu cycles for %u bytes (%.2f cycles/byte) -- w/o key expansion", (unsigned long long)(t1-t0), (unsigned)(blocks*AES_BLOCKBYTES), (double)(t1-t0)/(blocks*AES_BLOCKBYTES));
+    hal_send_str(str);
+  }
+  hal_send_str("-");
+  for(size_t blocks=1; blocks <= sizeof pt/16; blocks <<= 1){
+    t0 = hal_get_time();
+    aes128_ctr(ct, blocks*AES_BLOCKBYTES, nonce, &ctx128_ctr);
+    t1 = hal_get_time();
+    snprintf(str, sizeof str, "aes128_ctr: %llu cycles for %u bytes (%.2f cycles/byte) -- w/o key expansion", (unsigned long long)(t1-t0), (unsigned)(blocks*AES_BLOCKBYTES), (double)(t1-t0)/(blocks*AES_BLOCKBYTES));
+    hal_send_str(str);
+  }
+  hal_send_str("-");
+  for(size_t blocks=1; blocks <= sizeof pt/16; blocks <<= 1){
+    t0 = hal_get_time();
+    aes256_ctr(ct, blocks*AES_BLOCKBYTES, nonce, &ctx256_ctr);
+    t1 = hal_get_time();
+    snprintf(str, sizeof str, "aes256_ctr: %llu cycles for %u bytes (%.2f cycles/byte) -- w/o key expansion", (unsigned long long)(t1-t0), (unsigned)(blocks*AES_BLOCKBYTES), (double)(t1-t0)/(blocks*AES_BLOCKBYTES));
+    hal_send_str(str);
+  }
+}
+
+
+int main(void)
+{
+  hal_setup(CLOCK_BENCHMARK);
+  hal_send_str("===");
+  /* Run the known-answer tests first and report a single verdict,
+   * then print the cycle counts regardless of the outcome.
+   * "===" and "###" presumably delimit the output for the host side.
+   */
+  hal_send_str(test() ? "ERR" : "ALL GOOD!");
+  bench();
+  hal_send_str("###");
+  return 0;
+}
\ No newline at end of file
diff --git a/common/crypto_hashblocks_sha512.c b/common/crypto_hashblocks_sha512.c
new file mode 100644
index 0000000..b1c4664
--- /dev/null
+++ b/common/crypto_hashblocks_sha512.c
@@ -0,0 +1,101 @@
+#include "crypto_hashblocks_sha512.h"
+#include <stdint.h>
+
+static const uint64_t constants[80] = { /* SHA-512 round constants K[0..79] (FIPS 180-4, section 4.2.3); handed unchanged to the assembly core */
+  0x428a2f98d728ae22ULL
+, 0x7137449123ef65cdULL
+, 0xb5c0fbcfec4d3b2fULL
+, 0xe9b5dba58189dbbcULL
+, 0x3956c25bf348b538ULL
+, 0x59f111f1b605d019ULL
+, 0x923f82a4af194f9bULL
+, 0xab1c5ed5da6d8118ULL
+, 0xd807aa98a3030242ULL
+, 0x12835b0145706fbeULL
+, 0x243185be4ee4b28cULL
+, 0x550c7dc3d5ffb4e2ULL
+, 0x72be5d74f27b896fULL
+, 0x80deb1fe3b1696b1ULL
+, 0x9bdc06a725c71235ULL
+, 0xc19bf174cf692694ULL
+, 0xe49b69c19ef14ad2ULL
+, 0xefbe4786384f25e3ULL
+, 0x0fc19dc68b8cd5b5ULL
+, 0x240ca1cc77ac9c65ULL
+, 0x2de92c6f592b0275ULL
+, 0x4a7484aa6ea6e483ULL
+, 0x5cb0a9dcbd41fbd4ULL
+, 0x76f988da831153b5ULL
+, 0x983e5152ee66dfabULL
+, 0xa831c66d2db43210ULL
+, 0xb00327c898fb213fULL
+, 0xbf597fc7beef0ee4ULL
+, 0xc6e00bf33da88fc2ULL
+, 0xd5a79147930aa725ULL
+, 0x06ca6351e003826fULL
+, 0x142929670a0e6e70ULL
+, 0x27b70a8546d22ffcULL
+, 0x2e1b21385c26c926ULL
+, 0x4d2c6dfc5ac42aedULL
+, 0x53380d139d95b3dfULL
+, 0x650a73548baf63deULL
+, 0x766a0abb3c77b2a8ULL
+, 0x81c2c92e47edaee6ULL
+, 0x92722c851482353bULL
+, 0xa2bfe8a14cf10364ULL
+, 0xa81a664bbc423001ULL
+, 0xc24b8b70d0f89791ULL
+, 0xc76c51a30654be30ULL
+, 0xd192e819d6ef5218ULL
+, 0xd69906245565a910ULL
+, 0xf40e35855771202aULL
+, 0x106aa07032bbd1b8ULL
+, 0x19a4c116b8d2d0c8ULL
+, 0x1e376c085141ab53ULL
+, 0x2748774cdf8eeb99ULL
+, 0x34b0bcb5e19b48a8ULL
+, 0x391c0cb3c5c95a63ULL
+, 0x4ed8aa4ae3418acbULL
+, 0x5b9cca4f7763e373ULL
+, 0x682e6ff3d6b2b8a3ULL
+, 0x748f82ee5defb2fcULL
+, 0x78a5636f43172f60ULL
+, 0x84c87814a1f0ab72ULL
+, 0x8cc702081a6439ecULL
+, 0x90befffa23631e28ULL
+, 0xa4506cebde82bde9ULL
+, 0xbef9a3f7b2c67915ULL
+, 0xc67178f2e372532bULL
+, 0xca273eceea26619cULL
+, 0xd186b8c721c0c207ULL
+, 0xeada7dd6cde0eb1eULL
+, 0xf57d4f7fee6ed178ULL
+, 0x06f067aa72176fbaULL
+, 0x0a637dc5a2c898a6ULL
+, 0x113f9804bef90daeULL
+, 0x1b710b35131c471bULL
+, 0x28db77f523047d84ULL
+, 0x32caab7b40c72493ULL
+, 0x3c9ebe0a15c9bebcULL
+, 0x431d67c49c100d4cULL
+, 0x4cc5d4becb3e42b6ULL
+, 0x597f299cfc657e2aULL
+, 0x5fcb6fab3ad6faecULL
+, 0x6c44198c4a475817ULL
+};
+
+
+#define CUTOFF 32768 /* must be multiple of 128 */
+
+extern int crypto_hashblocks_sha512_m4nofpu_inner(unsigned char *,const unsigned char *,unsigned int,const uint64_t *);
+
+int crypto_hashblocks_sha512(unsigned char *statebytes,const unsigned char *in,unsigned long long inlen)
+{
+  /* Feed the assembly core exactly CUTOFF bytes per call while that much remains; afterwards inlen < CUTOFF. */
+  for (; inlen >= CUTOFF; in += CUTOFF, inlen -= CUTOFF) {
+    crypto_hashblocks_sha512_m4nofpu_inner(statebytes,in,CUTOFF,constants); /* returns 0 */
+  }
+  /* Fewer than 128 bytes left: skip the core and return the leftover byte count. */
+  if (inlen < 128) return (int)inlen;
+  return crypto_hashblocks_sha512_m4nofpu_inner(statebytes,in,(unsigned int)inlen,constants);
+}
diff --git a/common/crypto_hashblocks_sha512_inner32.s b/common/crypto_hashblocks_sha512_inner32.s
new file mode 100644
index 0000000..a900501
--- /dev/null
+++ b/common/crypto_hashblocks_sha512_inner32.s
@@ -0,0 +1,6593 @@
+
+# qhasm: int32 input_0
+
+# qhasm: int32 input_1
+
+# qhasm: int32 input_2
+
+# qhasm: int32 input_3
+
+# qhasm: stack32 input_4
+
+# qhasm: stack32 input_5
+
+# qhasm: stack32 input_6
+
+# qhasm: stack32 input_7
+
+# qhasm: int32 caller_r4
+
+# qhasm: int32 caller_r5
+
+# qhasm: int32 caller_r6
+
+# qhasm: int32 caller_r7
+
+# qhasm: int32 caller_r8
+
+# qhasm: int32 caller_r9
+
+# qhasm: int32 caller_r10
+
+# qhasm: int32 caller_r11
+
+# qhasm: int32 caller_r12
+
+# qhasm: int32 caller_r14
+
+# qhasm: startcode
+.text
+.arch armv7
+.fpu fpv4-sp-d16
+.syntax unified
+
+# qhasm: int32 two13
+
+# qhasm: int32 two23
+
+# qhasm: int32 two24
+
+# qhasm: int32 two25
+
+# qhasm: int32 lotmp
+
+# qhasm: int32 lotmp2
+
+# qhasm: int32 hitmp
+
+# qhasm: int32 hitmp2
+
+# qhasm: int32 lou0
+
+# qhasm: int32 lou1
+
+# qhasm: int32 lou2
+
+# qhasm: int32 lou3
+
+# qhasm: int32 lou4
+
+# qhasm: int32 lou5
+
+# qhasm: int32 hiu0
+
+# qhasm: int32 hiu1
+
+# qhasm: int32 hiu2
+
+# qhasm: int32 hiu3
+
+# qhasm: int32 hiu4
+
+# qhasm: int32 hiu5
+
+# qhasm: stack32 hid0
+
+# qhasm: stack32 hid1
+
+# qhasm: stack32 hid2
+
+# qhasm: stack32 hid3
+
+# qhasm: stack32 hid4
+
+# qhasm: stack32 hid5
+
+# qhasm: stack32 hid6
+
+# qhasm: stack32 hid7
+
+# qhasm: stack32 hid8
+
+# qhasm: stack32 hid9
+
+# qhasm: stack32 hid10
+
+# qhasm: stack32 hid11
+
+# qhasm: stack32 hid12
+
+# qhasm: stack32 hid13
+
+# qhasm: stack32 hid14
+
+# qhasm: stack32 hid15
+
+# qhasm: stack32 lod0
+
+# qhasm: stack32 lod1
+
+# qhasm: stack32 lod2
+
+# qhasm: stack32 lod3
+
+# qhasm: stack32 lod4
+
+# qhasm: stack32 lod5
+
+# qhasm: stack32 lod6
+
+# qhasm: stack32 lod7
+
+# qhasm: stack32 lod8
+
+# qhasm: stack32 lod9
+
+# qhasm: stack32 lod10
+
+# qhasm: stack32 lod11
+
+# qhasm: stack32 lod12
+
+# qhasm: stack32 lod13
+
+# qhasm: stack32 lod14
+
+# qhasm: stack32 lod15
+
+# qhasm: stack32 him0
+
+# qhasm: stack32 him1
+
+# qhasm: stack32 him2
+
+# qhasm: stack32 him3
+
+# qhasm: stack32 him4
+
+# qhasm: stack32 him5
+
+# qhasm: stack32 him6
+
+# qhasm: stack32 him7
+
+# qhasm: stack32 him8
+
+# qhasm: stack32 him9
+
+# qhasm: stack32 him10
+
+# qhasm: stack32 him11
+
+# qhasm: stack32 him12
+
+# qhasm: stack32 him13
+
+# qhasm: stack32 him14
+
+# qhasm: stack32 him15
+
+# qhasm: stack32 lom0
+
+# qhasm: stack32 lom1
+
+# qhasm: stack32 lom2
+
+# qhasm: stack32 lom3
+
+# qhasm: stack32 lom4
+
+# qhasm: stack32 lom5
+
+# qhasm: stack32 lom6
+
+# qhasm: stack32 lom7
+
+# qhasm: stack32 lom8
+
+# qhasm: stack32 lom9
+
+# qhasm: stack32 lom10
+
+# qhasm: stack32 lom11
+
+# qhasm: stack32 lom12
+
+# qhasm: stack32 lom13
+
+# qhasm: stack32 lom14
+
+# qhasm: stack32 lom15
+
+# qhasm: stack32 o0
+
+# qhasm: stack32 o1
+
+# qhasm: stack32 o2
+
+# qhasm: stack32 o3
+
+# qhasm: stack32 o4
+
+# qhasm: rpushenter crypto_hashblocks_sha512_m4nofpu_inner
+.p2align 2
+.global crypto_hashblocks_sha512_m4nofpu_inner
+.type crypto_hashblocks_sha512_m4nofpu_inner,%function
+.thumb
+.thumb_func
+crypto_hashblocks_sha512_m4nofpu_inner:
+push {r4,r5,r6,r7,r8,r9,r10,r11,r14}
+sub.w sp,sp,#288
+
+# qhasm: o0 = input_0
+# asm 1: str <input_0=int32#1,>o0=stack32#1
+# asm 2: str <input_0=r0,>o0=[sp,#0]
+# copy-collector input: str r0,[sp,#0]
+
+# qhasm: o1 = input_1
+# asm 1: str <input_1=int32#2,>o1=stack32#2
+# asm 2: str <input_1=r1,>o1=[sp,#4]
+# copy-collector input: str r1,[sp,#4]
+
+# qhasm: input_1 = input_2 - 128
+# asm 1: sub >input_1=int32#2,<input_2=int32#3,#128
+# asm 2: sub >input_1=r1,<input_2=r2,#128
+# copy-collector output starts
+strd r0,r1,[sp,#0]
+# copy-collector output ends
+sub r1,r2,#128
+
+# qhasm: o2 = input_1
+# asm 1: str <input_1=int32#2,>o2=stack32#3
+# asm 2: str <input_1=r1,>o2=[sp,#8]
+# copy-collector input: str r1,[sp,#8]
+
+# qhasm: o3 = input_3
+# asm 1: str <input_3=int32#4,>o3=stack32#4
+# asm 2: str <input_3=r3,>o3=[sp,#12]
+# copy-collector input: str r3,[sp,#12]
+
+# qhasm: hiu0 = mem32[input_0]
+# asm 1: ldr >hiu0=int32#2,[<input_0=int32#1]
+# asm 2: ldr >hiu0=r1,[<input_0=r0]
+# copy-collector input: ldr r1,[r0]
+
+# qhasm: lou0 = mem32[input_0+4]
+# asm 1: ldr >lou0=int32#3,[<input_0=int32#1,#4]
+# asm 2: ldr >lou0=r2,[<input_0=r0,#4]
+# copy-collector input: ldr r2,[r0,#4]
+
+# qhasm: hiu1 = mem32[input_0+8]
+# asm 1: ldr >hiu1=int32#4,[<input_0=int32#1,#8]
+# asm 2: ldr >hiu1=r3,[<input_0=r0,#8]
+# copy-collector input: ldr r3,[r0,#8]
+
+# qhasm: lou1 = mem32[input_0+12]
+# asm 1: ldr >lou1=int32#5,[<input_0=int32#1,#12]
+# asm 2: ldr >lou1=r4,[<input_0=r0,#12]
+# copy-collector input: ldr r4,[r0,#12]
+
+# qhasm: hiu2 = mem32[input_0+16]
+# asm 1: ldr >hiu2=int32#6,[<input_0=int32#1,#16]
+# asm 2: ldr >hiu2=r5,[<input_0=r0,#16]
+# copy-collector input: ldr r5,[r0,#16]
+
+# qhasm: lou2 = mem32[input_0+20]
+# asm 1: ldr >lou2=int32#7,[<input_0=int32#1,#20]
+# asm 2: ldr >lou2=r6,[<input_0=r0,#20]
+# copy-collector input: ldr r6,[r0,#20]
+
+# qhasm: hiu3 = mem32[input_0+24]
+# asm 1: ldr >hiu3=int32#8,[<input_0=int32#1,#24]
+# asm 2: ldr >hiu3=r7,[<input_0=r0,#24]
+# copy-collector input: ldr r7,[r0,#24]
+
+# qhasm: lou3 = mem32[input_0+28]
+# asm 1: ldr >lou3=int32#9,[<input_0=int32#1,#28]
+# asm 2: ldr >lou3=r8,[<input_0=r0,#28]
+# copy-collector input: ldr r8,[r0,#28]
+
+# qhasm: lou0 = lou0[3]lou0[2]lou0[1]lou0[0]
+# asm 1: rev >lou0=int32#3,<lou0=int32#3
+# asm 2: rev >lou0=r2,<lou0=r2
+# copy-collector output starts
+strd r1,r3,[sp,#8]
+ldr r1,[r0]
+ldr r2,[r0,#4]
+ldr r3,[r0,#8]
+ldr r4,[r0,#12]
+ldr r5,[r0,#16]
+ldr r6,[r0,#20]
+ldr.w r7,[r0,#24]
+ldr r8,[r0,#28]
+# copy-collector output ends
+rev r2,r2
+
+# qhasm: hiu0 = hiu0[3]hiu0[2]hiu0[1]hiu0[0]
+# asm 1: rev >hiu0=int32#2,<hiu0=int32#2
+# asm 2: rev >hiu0=r1,<hiu0=r1
+rev r1,r1
+
+# qhasm: lou1 = lou1[3]lou1[2]lou1[1]lou1[0]
+# asm 1: rev >lou1=int32#5,<lou1=int32#5
+# asm 2: rev >lou1=r4,<lou1=r4
+rev r4,r4
+
+# qhasm: hiu1 = hiu1[3]hiu1[2]hiu1[1]hiu1[0]
+# asm 1: rev >hiu1=int32#4,<hiu1=int32#4
+# asm 2: rev >hiu1=r3,<hiu1=r3
+rev r3,r3
+
+# qhasm: lou2 = lou2[3]lou2[2]lou2[1]lou2[0]
+# asm 1: rev >lou2=int32#7,<lou2=int32#7
+# asm 2: rev >lou2=r6,<lou2=r6
+rev r6,r6
+
+# qhasm: hiu2 = hiu2[3]hiu2[2]hiu2[1]hiu2[0]
+# asm 1: rev >hiu2=int32#6,<hiu2=int32#6
+# asm 2: rev >hiu2=r5,<hiu2=r5
+rev r5,r5
+
+# qhasm: lou3 = lou3[3]lou3[2]lou3[1]lou3[0]
+# asm 1: rev >lou3=int32#9,<lou3=int32#9
+# asm 2: rev >lou3=r8,<lou3=r8
+rev r8,r8
+
+# qhasm: hiu3 = hiu3[3]hiu3[2]hiu3[1]hiu3[0]
+# asm 1: rev >hiu3=int32#8,<hiu3=int32#8
+# asm 2: rev >hiu3=r7,<hiu3=r7
+rev.w r7,r7
+
+# qhasm: lom0 = lou0
+# asm 1: str <lou0=int32#3,>lom0=stack32#5
+# asm 2: str <lou0=r2,>lom0=[sp,#16]
+# copy-collector input: str r2,[sp,#16]
+
+# qhasm: him0 = hiu0
+# asm 1: str <hiu0=int32#2,>him0=stack32#6
+# asm 2: str <hiu0=r1,>him0=[sp,#20]
+# copy-collector input: str r1,[sp,#20]
+
+# qhasm: lom1 = lou1
+# asm 1: str <lou1=int32#5,>lom1=stack32#7
+# asm 2: str <lou1=r4,>lom1=[sp,#24]
+# copy-collector input: str r4,[sp,#24]
+
+# qhasm: him1 = hiu1
+# asm 1: str <hiu1=int32#4,>him1=stack32#8
+# asm 2: str <hiu1=r3,>him1=[sp,#28]
+# copy-collector input: str r3,[sp,#28]
+
+# qhasm: lom2 = lou2
+# asm 1: str <lou2=int32#7,>lom2=stack32#9
+# asm 2: str <lou2=r6,>lom2=[sp,#32]
+# copy-collector input: str r6,[sp,#32]
+
+# qhasm: him2 = hiu2
+# asm 1: str <hiu2=int32#6,>him2=stack32#10
+# asm 2: str <hiu2=r5,>him2=[sp,#36]
+# copy-collector input: str r5,[sp,#36]
+
+# qhasm: lom3 = lou3
+# asm 1: str <lou3=int32#9,>lom3=stack32#11
+# asm 2: str <lou3=r8,>lom3=[sp,#40]
+# copy-collector input: str r8,[sp,#40]
+
+# qhasm: him3 = hiu3
+# asm 1: str <hiu3=int32#8,>him3=stack32#12
+# asm 2: str <hiu3=r7,>him3=[sp,#44]
+# copy-collector input: str r7,[sp,#44]
+
+# qhasm: lod0 = lou0
+# asm 1: str <lou0=int32#3,>lod0=stack32#13
+# asm 2: str <lou0=r2,>lod0=[sp,#48]
+# copy-collector input: str r2,[sp,#48]
+
+# qhasm: hid0 = hiu0
+# asm 1: str <hiu0=int32#2,>hid0=stack32#14
+# asm 2: str <hiu0=r1,>hid0=[sp,#52]
+# copy-collector input: str r1,[sp,#52]
+
+# qhasm: lod1 = lou1
+# asm 1: str <lou1=int32#5,>lod1=stack32#15
+# asm 2: str <lou1=r4,>lod1=[sp,#56]
+# copy-collector input: str r4,[sp,#56]
+
+# qhasm: hid1 = hiu1
+# asm 1: str <hiu1=int32#4,>hid1=stack32#16
+# asm 2: str <hiu1=r3,>hid1=[sp,#60]
+# copy-collector input: str r3,[sp,#60]
+
+# qhasm: lod2 = lou2
+# asm 1: str <lou2=int32#7,>lod2=stack32#17
+# asm 2: str <lou2=r6,>lod2=[sp,#64]
+# copy-collector input: str r6,[sp,#64]
+
+# qhasm: hid2 = hiu2
+# asm 1: str <hiu2=int32#6,>hid2=stack32#18
+# asm 2: str <hiu2=r5,>hid2=[sp,#68]
+# copy-collector input: str r5,[sp,#68]
+
+# qhasm: lod3 = lou3
+# asm 1: str <lou3=int32#9,>lod3=stack32#19
+# asm 2: str <lou3=r8,>lod3=[sp,#72]
+# copy-collector input: str r8,[sp,#72]
+
+# qhasm: hid3 = hiu3
+# asm 1: str <hiu3=int32#8,>hid3=stack32#20
+# asm 2: str <hiu3=r7,>hid3=[sp,#76]
+# copy-collector input: str r7,[sp,#76]
+
+# qhasm: hiu0 = mem32[input_0+32]
+# asm 1: ldr >hiu0=int32#2,[<input_0=int32#1,#32]
+# asm 2: ldr >hiu0=r1,[<input_0=r0,#32]
+# copy-collector input: ldr r1,[r0,#32]
+
+# qhasm: lou0 = mem32[input_0+36]
+# asm 1: ldr >lou0=int32#3,[<input_0=int32#1,#36]
+# asm 2: ldr >lou0=r2,[<input_0=r0,#36]
+# copy-collector input: ldr r2,[r0,#36]
+
+# qhasm: hiu1 = mem32[input_0+40]
+# asm 1: ldr >hiu1=int32#4,[<input_0=int32#1,#40]
+# asm 2: ldr >hiu1=r3,[<input_0=r0,#40]
+# copy-collector input: ldr r3,[r0,#40]
+
+# qhasm: lou1 = mem32[input_0+44]
+# asm 1: ldr >lou1=int32#5,[<input_0=int32#1,#44]
+# asm 2: ldr >lou1=r4,[<input_0=r0,#44]
+# copy-collector input: ldr r4,[r0,#44]
+
+# qhasm: hiu2 = mem32[input_0+48]
+# asm 1: ldr >hiu2=int32#6,[<input_0=int32#1,#48]
+# asm 2: ldr >hiu2=r5,[<input_0=r0,#48]
+# copy-collector input: ldr r5,[r0,#48]
+
+# qhasm: lou2 = mem32[input_0+52]
+# asm 1: ldr >lou2=int32#7,[<input_0=int32#1,#52]
+# asm 2: ldr >lou2=r6,[<input_0=r0,#52]
+# copy-collector input: ldr r6,[r0,#52]
+
+# qhasm: hiu3 = mem32[input_0+56]
+# asm 1: ldr >hiu3=int32#8,[<input_0=int32#1,#56]
+# asm 2: ldr >hiu3=r7,[<input_0=r0,#56]
+# copy-collector input: ldr r7,[r0,#56]
+
+# qhasm: lou3 = mem32[input_0+60]
+# asm 1: ldr >lou3=int32#1,[<input_0=int32#1,#60]
+# asm 2: ldr >lou3=r0,[<input_0=r0,#60]
+# copy-collector input: ldr r0,[r0,#60]
+
+# qhasm: lou0 = lou0[3]lou0[2]lou0[1]lou0[0]
+# asm 1: rev >lou0=int32#3,<lou0=int32#3
+# asm 2: rev >lou0=r2,<lou0=r2
+# copy-collector output starts
+strd r2,r1,[sp,#16]
+strd r4,r3,[sp,#24]
+strd r6,r5,[sp,#32]
+strd r8,r7,[sp,#40]
+strd r2,r1,[sp,#48]
+strd r4,r3,[sp,#56]
+strd r6,r5,[sp,#64]
+strd r8,r7,[sp,#72]
+ldr r1,[r0,#32]
+ldr r2,[r0,#36]
+ldr r3,[r0,#40]
+ldr r4,[r0,#44]
+ldr r5,[r0,#48]
+ldr r6,[r0,#52]
+ldr r7,[r0,#56]
+ldr r0,[r0,#60]
+# copy-collector output ends
+rev r2,r2
+
+# qhasm: hiu0 = hiu0[3]hiu0[2]hiu0[1]hiu0[0]
+# asm 1: rev >hiu0=int32#2,<hiu0=int32#2
+# asm 2: rev >hiu0=r1,<hiu0=r1
+rev r1,r1
+
+# qhasm: lou1 = lou1[3]lou1[2]lou1[1]lou1[0]
+# asm 1: rev >lou1=int32#5,<lou1=int32#5
+# asm 2: rev >lou1=r4,<lou1=r4
+rev r4,r4
+
+# qhasm: hiu1 = hiu1[3]hiu1[2]hiu1[1]hiu1[0]
+# asm 1: rev >hiu1=int32#4,<hiu1=int32#4
+# asm 2: rev >hiu1=r3,<hiu1=r3
+rev r3,r3
+
+# qhasm: lou2 = lou2[3]lou2[2]lou2[1]lou2[0]
+# asm 1: rev >lou2=int32#7,<lou2=int32#7
+# asm 2: rev >lou2=r6,<lou2=r6
+rev r6,r6
+
+# qhasm: hiu2 = hiu2[3]hiu2[2]hiu2[1]hiu2[0]
+# asm 1: rev >hiu2=int32#6,<hiu2=int32#6
+# asm 2: rev >hiu2=r5,<hiu2=r5
+rev r5,r5
+
+# qhasm: lou3 = lou3[3]lou3[2]lou3[1]lou3[0]
+# asm 1: rev >lou3=int32#1,<lou3=int32#1
+# asm 2: rev >lou3=r0,<lou3=r0
+rev r0,r0
+
+# qhasm: hiu3 = hiu3[3]hiu3[2]hiu3[1]hiu3[0]
+# asm 1: rev >hiu3=int32#8,<hiu3=int32#8
+# asm 2: rev >hiu3=r7,<hiu3=r7
+rev r7,r7
+
+# qhasm: lom4 = lou0
+# asm 1: str <lou0=int32#3,>lom4=stack32#21
+# asm 2: str <lou0=r2,>lom4=[sp,#80]
+# copy-collector input: str r2,[sp,#80]
+
+# qhasm: him4 = hiu0
+# asm 1: str <hiu0=int32#2,>him4=stack32#22
+# asm 2: str <hiu0=r1,>him4=[sp,#84]
+# copy-collector input: str r1,[sp,#84]
+
+# qhasm: lom5 = lou1
+# asm 1: str <lou1=int32#5,>lom5=stack32#23
+# asm 2: str <lou1=r4,>lom5=[sp,#88]
+# copy-collector input: str r4,[sp,#88]
+
+# qhasm: him5 = hiu1
+# asm 1: str <hiu1=int32#4,>him5=stack32#24
+# asm 2: str <hiu1=r3,>him5=[sp,#92]
+# copy-collector input: str r3,[sp,#92]
+
+# qhasm: lom6 = lou2
+# asm 1: str <lou2=int32#7,>lom6=stack32#25
+# asm 2: str <lou2=r6,>lom6=[sp,#96]
+# copy-collector input: str r6,[sp,#96]
+
+# qhasm: him6 = hiu2
+# asm 1: str <hiu2=int32#6,>him6=stack32#26
+# asm 2: str <hiu2=r5,>him6=[sp,#100]
+# copy-collector input: str r5,[sp,#100]
+
+# qhasm: lom7 = lou3
+# asm 1: str <lou3=int32#1,>lom7=stack32#27
+# asm 2: str <lou3=r0,>lom7=[sp,#104]
+# copy-collector input: str r0,[sp,#104]
+
+# qhasm: him7 = hiu3
+# asm 1: str <hiu3=int32#8,>him7=stack32#28
+# asm 2: str <hiu3=r7,>him7=[sp,#108]
+# copy-collector input: str r7,[sp,#108]
+
+# qhasm: lod4 = lou0
+# asm 1: str <lou0=int32#3,>lod4=stack32#29
+# asm 2: str <lou0=r2,>lod4=[sp,#112]
+# copy-collector input: str r2,[sp,#112]
+
+# qhasm: hid4 = hiu0
+# asm 1: str <hiu0=int32#2,>hid4=stack32#30
+# asm 2: str <hiu0=r1,>hid4=[sp,#116]
+# copy-collector input: str r1,[sp,#116]
+
+# qhasm: lod5 = lou1
+# asm 1: str <lou1=int32#5,>lod5=stack32#31
+# asm 2: str <lou1=r4,>lod5=[sp,#120]
+# copy-collector input: str r4,[sp,#120]
+
+# qhasm: hid5 = hiu1
+# asm 1: str <hiu1=int32#4,>hid5=stack32#32
+# asm 2: str <hiu1=r3,>hid5=[sp,#124]
+# copy-collector input: str r3,[sp,#124]
+
+# qhasm: lod6 = lou2
+# asm 1: str <lou2=int32#7,>lod6=stack32#33
+# asm 2: str <lou2=r6,>lod6=[sp,#128]
+# copy-collector input: str r6,[sp,#128]
+
+# qhasm: hid6 = hiu2
+# asm 1: str <hiu2=int32#6,>hid6=stack32#34
+# asm 2: str <hiu2=r5,>hid6=[sp,#132]
+# copy-collector input: str r5,[sp,#132]
+
+# qhasm: lod7 = lou3
+# asm 1: str <lou3=int32#1,>lod7=stack32#35
+# asm 2: str <lou3=r0,>lod7=[sp,#136]
+# copy-collector input: str r0,[sp,#136]
+
+# qhasm: hid7 = hiu3
+# asm 1: str <hiu3=int32#8,>hid7=stack32#36
+# asm 2: str <hiu3=r7,>hid7=[sp,#140]
+# copy-collector input: str r7,[sp,#140]
+
+# qhasm: mainloop:
+# copy-collector output starts
+strd r2,r1,[sp,#80]
+strd r4,r3,[sp,#88]
+strd r6,r5,[sp,#96]
+strd r0,r7,[sp,#104]
+strd r2,r1,[sp,#112]
+strd r4,r3,[sp,#120]
+strd r6,r5,[sp,#128]
+strd r0,r7,[sp,#136]
+# copy-collector output ends
+._mainloop:
+
+# qhasm: input_0 = o1
+# asm 1: ldr >input_0=int32#1,<o1=stack32#2
+# asm 2: ldr >input_0=r0,<o1=[sp,#4]
+# copy-collector input: ldr r0,[sp,#4]
+
+# qhasm: hiu0 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu0=int32#2,[<input_0=int32#1],#4
+# asm 2: ldr >hiu0=r1,[<input_0=r0],#4
+# copy-collector input: ldr r1,[r0],#4
+
+# qhasm: lou0 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou0=int32#3,[<input_0=int32#1],#4
+# asm 2: ldr >lou0=r2,[<input_0=r0],#4
+# copy-collector input: ldr r2,[r0],#4
+
+# qhasm: hiu1 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu1=int32#4,[<input_0=int32#1],#4
+# asm 2: ldr >hiu1=r3,[<input_0=r0],#4
+# copy-collector input: ldr r3,[r0],#4
+
+# qhasm: lou1 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou1=int32#5,[<input_0=int32#1],#4
+# asm 2: ldr >lou1=r4,[<input_0=r0],#4
+# copy-collector input: ldr r4,[r0],#4
+
+# qhasm: hiu2 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu2=int32#6,[<input_0=int32#1],#4
+# asm 2: ldr >hiu2=r5,[<input_0=r0],#4
+# copy-collector input: ldr r5,[r0],#4
+
+# qhasm: lou2 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou2=int32#7,[<input_0=int32#1],#4
+# asm 2: ldr >lou2=r6,[<input_0=r0],#4
+# copy-collector input: ldr r6,[r0],#4
+
+# qhasm: hiu3 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu3=int32#8,[<input_0=int32#1],#4
+# asm 2: ldr >hiu3=r7,[<input_0=r0],#4
+# copy-collector input: ldr r7,[r0],#4
+
+# qhasm: lou3 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou3=int32#9,[<input_0=int32#1],#4
+# asm 2: ldr >lou3=r8,[<input_0=r0],#4
+# copy-collector input: ldr r8,[r0],#4
+
+# qhasm: lou0 = lou0[3]lou0[2]lou0[1]lou0[0]
+# asm 1: rev >lou0=int32#3,<lou0=int32#3
+# asm 2: rev >lou0=r2,<lou0=r2
+# copy-collector output starts
+ldr.w r0,[sp,#4]
+ldr r1,[r0],#4
+ldr r2,[r0],#4
+ldr r3,[r0],#4
+ldr r4,[r0],#4
+ldr r5,[r0],#4
+ldr r6,[r0],#4
+ldr r7,[r0],#4
+ldr r8,[r0],#4
+# copy-collector output ends
+rev r2,r2
+
+# qhasm: hiu0 = hiu0[3]hiu0[2]hiu0[1]hiu0[0]
+# asm 1: rev >hiu0=int32#2,<hiu0=int32#2
+# asm 2: rev >hiu0=r1,<hiu0=r1
+rev r1,r1
+
+# qhasm: lou1 = lou1[3]lou1[2]lou1[1]lou1[0]
+# asm 1: rev >lou1=int32#5,<lou1=int32#5
+# asm 2: rev >lou1=r4,<lou1=r4
+rev r4,r4
+
+# qhasm: hiu1 = hiu1[3]hiu1[2]hiu1[1]hiu1[0]
+# asm 1: rev >hiu1=int32#4,<hiu1=int32#4
+# asm 2: rev >hiu1=r3,<hiu1=r3
+rev r3,r3
+
+# qhasm: lou2 = lou2[3]lou2[2]lou2[1]lou2[0]
+# asm 1: rev >lou2=int32#7,<lou2=int32#7
+# asm 2: rev >lou2=r6,<lou2=r6
+rev r6,r6
+
+# qhasm: hiu2 = hiu2[3]hiu2[2]hiu2[1]hiu2[0]
+# asm 1: rev >hiu2=int32#6,<hiu2=int32#6
+# asm 2: rev >hiu2=r5,<hiu2=r5
+rev r5,r5
+
+# qhasm: lou3 = lou3[3]lou3[2]lou3[1]lou3[0]
+# asm 1: rev >lou3=int32#9,<lou3=int32#9
+# asm 2: rev >lou3=r8,<lou3=r8
+rev r8,r8
+
+# qhasm: hiu3 = hiu3[3]hiu3[2]hiu3[1]hiu3[0]
+# asm 1: rev >hiu3=int32#8,<hiu3=int32#8
+# asm 2: rev >hiu3=r7,<hiu3=r7
+rev.w r7,r7
+
+# qhasm: lod8 = lou0
+# asm 1: str <lou0=int32#3,>lod8=stack32#37
+# asm 2: str <lou0=r2,>lod8=[sp,#144]
+# copy-collector input: str r2,[sp,#144]
+
+# qhasm: hid8 = hiu0
+# asm 1: str <hiu0=int32#2,>hid8=stack32#38
+# asm 2: str <hiu0=r1,>hid8=[sp,#148]
+# copy-collector input: str r1,[sp,#148]
+
+# qhasm: lod9 = lou1
+# asm 1: str <lou1=int32#5,>lod9=stack32#39
+# asm 2: str <lou1=r4,>lod9=[sp,#152]
+# copy-collector input: str r4,[sp,#152]
+
+# qhasm: hid9 = hiu1
+# asm 1: str <hiu1=int32#4,>hid9=stack32#40
+# asm 2: str <hiu1=r3,>hid9=[sp,#156]
+# copy-collector input: str r3,[sp,#156]
+
+# qhasm: lod10 = lou2
+# asm 1: str <lou2=int32#7,>lod10=stack32#41
+# asm 2: str <lou2=r6,>lod10=[sp,#160]
+# copy-collector input: str r6,[sp,#160]
+
+# qhasm: hid10 = hiu2
+# asm 1: str <hiu2=int32#6,>hid10=stack32#42
+# asm 2: str <hiu2=r5,>hid10=[sp,#164]
+# copy-collector input: str r5,[sp,#164]
+
+# qhasm: lod11 = lou3
+# asm 1: str <lou3=int32#9,>lod11=stack32#43
+# asm 2: str <lou3=r8,>lod11=[sp,#168]
+# copy-collector input: str r8,[sp,#168]
+
+# qhasm: hid11 = hiu3
+# asm 1: str <hiu3=int32#8,>hid11=stack32#44
+# asm 2: str <hiu3=r7,>hid11=[sp,#172]
+# copy-collector input: str r7,[sp,#172]
+
+# qhasm: hiu0 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu0=int32#2,[<input_0=int32#1],#4
+# asm 2: ldr >hiu0=r1,[<input_0=r0],#4
+# copy-collector input: ldr r1,[r0],#4
+
+# qhasm: lou0 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou0=int32#3,[<input_0=int32#1],#4
+# asm 2: ldr >lou0=r2,[<input_0=r0],#4
+# copy-collector input: ldr r2,[r0],#4
+
+# qhasm: hiu1 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu1=int32#4,[<input_0=int32#1],#4
+# asm 2: ldr >hiu1=r3,[<input_0=r0],#4
+# copy-collector input: ldr r3,[r0],#4
+
+# qhasm: lou1 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou1=int32#5,[<input_0=int32#1],#4
+# asm 2: ldr >lou1=r4,[<input_0=r0],#4
+# copy-collector input: ldr r4,[r0],#4
+
+# qhasm: hiu2 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu2=int32#6,[<input_0=int32#1],#4
+# asm 2: ldr >hiu2=r5,[<input_0=r0],#4
+# copy-collector input: ldr r5,[r0],#4
+
+# qhasm: lou2 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou2=int32#7,[<input_0=int32#1],#4
+# asm 2: ldr >lou2=r6,[<input_0=r0],#4
+# copy-collector input: ldr r6,[r0],#4
+
+# qhasm: hiu3 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu3=int32#8,[<input_0=int32#1],#4
+# asm 2: ldr >hiu3=r7,[<input_0=r0],#4
+# copy-collector input: ldr r7,[r0],#4
+
+# qhasm: lou3 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou3=int32#9,[<input_0=int32#1],#4
+# asm 2: ldr >lou3=r8,[<input_0=r0],#4
+# copy-collector input: ldr r8,[r0],#4
+
+# qhasm: lou0 = lou0[3]lou0[2]lou0[1]lou0[0]
+# asm 1: rev >lou0=int32#3,<lou0=int32#3
+# asm 2: rev >lou0=r2,<lou0=r2
+# copy-collector output starts
+strd r2,r1,[sp,#144]
+strd r4,r3,[sp,#152]
+strd r6,r5,[sp,#160]
+strd r8,r7,[sp,#168]
+ldr r1,[r0],#4
+ldr r2,[r0],#4
+ldr r3,[r0],#4
+ldr r4,[r0],#4
+ldr r5,[r0],#4
+ldr r6,[r0],#4
+ldr r7,[r0],#4
+ldr r8,[r0],#4
+# copy-collector output ends
+rev r2,r2
+
+# qhasm: hiu0 = hiu0[3]hiu0[2]hiu0[1]hiu0[0]
+# asm 1: rev >hiu0=int32#2,<hiu0=int32#2
+# asm 2: rev >hiu0=r1,<hiu0=r1
+rev r1,r1
+
+# qhasm: lou1 = lou1[3]lou1[2]lou1[1]lou1[0]
+# asm 1: rev >lou1=int32#5,<lou1=int32#5
+# asm 2: rev >lou1=r4,<lou1=r4
+rev r4,r4
+
+# qhasm: hiu1 = hiu1[3]hiu1[2]hiu1[1]hiu1[0]
+# asm 1: rev >hiu1=int32#4,<hiu1=int32#4
+# asm 2: rev >hiu1=r3,<hiu1=r3
+rev r3,r3
+
+# qhasm: lou2 = lou2[3]lou2[2]lou2[1]lou2[0]
+# asm 1: rev >lou2=int32#7,<lou2=int32#7
+# asm 2: rev >lou2=r6,<lou2=r6
+rev r6,r6
+
+# qhasm: hiu2 = hiu2[3]hiu2[2]hiu2[1]hiu2[0]
+# asm 1: rev >hiu2=int32#6,<hiu2=int32#6
+# asm 2: rev >hiu2=r5,<hiu2=r5
+rev r5,r5
+
+# qhasm: lou3 = lou3[3]lou3[2]lou3[1]lou3[0]
+# asm 1: rev >lou3=int32#9,<lou3=int32#9
+# asm 2: rev >lou3=r8,<lou3=r8
+rev r8,r8
+
+# qhasm: hiu3 = hiu3[3]hiu3[2]hiu3[1]hiu3[0]
+# asm 1: rev >hiu3=int32#8,<hiu3=int32#8
+# asm 2: rev >hiu3=r7,<hiu3=r7
+rev.w r7,r7
+
+# qhasm: lod12 = lou0
+# asm 1: str <lou0=int32#3,>lod12=stack32#45
+# asm 2: str <lou0=r2,>lod12=[sp,#176]
+# copy-collector input: str r2,[sp,#176]
+
+# qhasm: hid12 = hiu0
+# asm 1: str <hiu0=int32#2,>hid12=stack32#46
+# asm 2: str <hiu0=r1,>hid12=[sp,#180]
+# copy-collector input: str r1,[sp,#180]
+
+# qhasm: lod13 = lou1
+# asm 1: str <lou1=int32#5,>lod13=stack32#47
+# asm 2: str <lou1=r4,>lod13=[sp,#184]
+# copy-collector input: str r4,[sp,#184]
+
+# qhasm: hid13 = hiu1
+# asm 1: str <hiu1=int32#4,>hid13=stack32#48
+# asm 2: str <hiu1=r3,>hid13=[sp,#188]
+# copy-collector input: str r3,[sp,#188]
+
+# qhasm: lod14 = lou2
+# asm 1: str <lou2=int32#7,>lod14=stack32#49
+# asm 2: str <lou2=r6,>lod14=[sp,#192]
+# copy-collector input: str r6,[sp,#192]
+
+# qhasm: hid14 = hiu2
+# asm 1: str <hiu2=int32#6,>hid14=stack32#50
+# asm 2: str <hiu2=r5,>hid14=[sp,#196]
+# copy-collector input: str r5,[sp,#196]
+
+# qhasm: lod15 = lou3
+# asm 1: str <lou3=int32#9,>lod15=stack32#51
+# asm 2: str <lou3=r8,>lod15=[sp,#200]
+# copy-collector input: str r8,[sp,#200]
+
+# qhasm: hid15 = hiu3
+# asm 1: str <hiu3=int32#8,>hid15=stack32#52
+# asm 2: str <hiu3=r7,>hid15=[sp,#204]
+# copy-collector input: str r7,[sp,#204]
+
+# qhasm: hiu0 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu0=int32#2,[<input_0=int32#1],#4
+# asm 2: ldr >hiu0=r1,[<input_0=r0],#4
+# copy-collector input: ldr r1,[r0],#4
+
+# qhasm: lou0 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou0=int32#3,[<input_0=int32#1],#4
+# asm 2: ldr >lou0=r2,[<input_0=r0],#4
+# copy-collector input: ldr r2,[r0],#4
+
+# qhasm: hiu1 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu1=int32#4,[<input_0=int32#1],#4
+# asm 2: ldr >hiu1=r3,[<input_0=r0],#4
+# copy-collector input: ldr r3,[r0],#4
+
+# qhasm: lou1 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou1=int32#5,[<input_0=int32#1],#4
+# asm 2: ldr >lou1=r4,[<input_0=r0],#4
+# copy-collector input: ldr r4,[r0],#4
+
+# qhasm: hiu2 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu2=int32#6,[<input_0=int32#1],#4
+# asm 2: ldr >hiu2=r5,[<input_0=r0],#4
+# copy-collector input: ldr r5,[r0],#4
+
+# qhasm: lou2 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou2=int32#7,[<input_0=int32#1],#4
+# asm 2: ldr >lou2=r6,[<input_0=r0],#4
+# copy-collector input: ldr r6,[r0],#4
+
+# qhasm: hiu3 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu3=int32#8,[<input_0=int32#1],#4
+# asm 2: ldr >hiu3=r7,[<input_0=r0],#4
+# copy-collector input: ldr r7,[r0],#4
+
+# qhasm: lou3 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou3=int32#9,[<input_0=int32#1],#4
+# asm 2: ldr >lou3=r8,[<input_0=r0],#4
+# copy-collector input: ldr r8,[r0],#4
+
+# qhasm: lou0 = lou0[3]lou0[2]lou0[1]lou0[0]
+# asm 1: rev >lou0=int32#3,<lou0=int32#3
+# asm 2: rev >lou0=r2,<lou0=r2
+# copy-collector output starts
+strd r2,r1,[sp,#176]
+strd r4,r3,[sp,#184]
+strd r6,r5,[sp,#192]
+strd r8,r7,[sp,#200]
+ldr r1,[r0],#4
+ldr r2,[r0],#4
+ldr r3,[r0],#4
+ldr r4,[r0],#4
+ldr r5,[r0],#4
+ldr r6,[r0],#4
+ldr r7,[r0],#4
+ldr r8,[r0],#4
+# copy-collector output ends
+rev r2,r2
+
+# qhasm: hiu0 = hiu0[3]hiu0[2]hiu0[1]hiu0[0]
+# asm 1: rev >hiu0=int32#2,<hiu0=int32#2
+# asm 2: rev >hiu0=r1,<hiu0=r1
+rev r1,r1
+
+# qhasm: lou1 = lou1[3]lou1[2]lou1[1]lou1[0]
+# asm 1: rev >lou1=int32#5,<lou1=int32#5
+# asm 2: rev >lou1=r4,<lou1=r4
+rev r4,r4
+
+# qhasm: hiu1 = hiu1[3]hiu1[2]hiu1[1]hiu1[0]
+# asm 1: rev >hiu1=int32#4,<hiu1=int32#4
+# asm 2: rev >hiu1=r3,<hiu1=r3
+rev r3,r3
+
+# qhasm: lou2 = lou2[3]lou2[2]lou2[1]lou2[0]
+# asm 1: rev >lou2=int32#7,<lou2=int32#7
+# asm 2: rev >lou2=r6,<lou2=r6
+rev r6,r6
+
+# qhasm: hiu2 = hiu2[3]hiu2[2]hiu2[1]hiu2[0]
+# asm 1: rev >hiu2=int32#6,<hiu2=int32#6
+# asm 2: rev >hiu2=r5,<hiu2=r5
+rev r5,r5
+
+# qhasm: lou3 = lou3[3]lou3[2]lou3[1]lou3[0]
+# asm 1: rev >lou3=int32#9,<lou3=int32#9
+# asm 2: rev >lou3=r8,<lou3=r8
+rev r8,r8
+
+# qhasm: hiu3 = hiu3[3]hiu3[2]hiu3[1]hiu3[0]
+# asm 1: rev >hiu3=int32#8,<hiu3=int32#8
+# asm 2: rev >hiu3=r7,<hiu3=r7
+rev.w r7,r7
+
+# qhasm: lom8 = lou0
+# asm 1: str <lou0=int32#3,>lom8=stack32#53
+# asm 2: str <lou0=r2,>lom8=[sp,#208]
+# copy-collector input: str r2,[sp,#208]
+
+# qhasm: him8 = hiu0
+# asm 1: str <hiu0=int32#2,>him8=stack32#54
+# asm 2: str <hiu0=r1,>him8=[sp,#212]
+# copy-collector input: str r1,[sp,#212]
+
+# qhasm: lom9 = lou1
+# asm 1: str <lou1=int32#5,>lom9=stack32#55
+# asm 2: str <lou1=r4,>lom9=[sp,#216]
+# copy-collector input: str r4,[sp,#216]
+
+# qhasm: him9 = hiu1
+# asm 1: str <hiu1=int32#4,>him9=stack32#56
+# asm 2: str <hiu1=r3,>him9=[sp,#220]
+# copy-collector input: str r3,[sp,#220]
+
+# qhasm: lom10 = lou2
+# asm 1: str <lou2=int32#7,>lom10=stack32#57
+# asm 2: str <lou2=r6,>lom10=[sp,#224]
+# copy-collector input: str r6,[sp,#224]
+
+# qhasm: him10 = hiu2
+# asm 1: str <hiu2=int32#6,>him10=stack32#58
+# asm 2: str <hiu2=r5,>him10=[sp,#228]
+# copy-collector input: str r5,[sp,#228]
+
+# qhasm: lom11 = lou3
+# asm 1: str <lou3=int32#9,>lom11=stack32#59
+# asm 2: str <lou3=r8,>lom11=[sp,#232]
+# copy-collector input: str r8,[sp,#232]
+
+# qhasm: him11 = hiu3
+# asm 1: str <hiu3=int32#8,>him11=stack32#60
+# asm 2: str <hiu3=r7,>him11=[sp,#236]
+# copy-collector input: str r7,[sp,#236]
+
+# qhasm: hiu0 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu0=int32#2,[<input_0=int32#1],#4
+# asm 2: ldr >hiu0=r1,[<input_0=r0],#4
+# copy-collector input: ldr r1,[r0],#4
+
+# qhasm: lou0 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou0=int32#3,[<input_0=int32#1],#4
+# asm 2: ldr >lou0=r2,[<input_0=r0],#4
+# copy-collector input: ldr r2,[r0],#4
+
+# qhasm: hiu1 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu1=int32#4,[<input_0=int32#1],#4
+# asm 2: ldr >hiu1=r3,[<input_0=r0],#4
+# copy-collector input: ldr r3,[r0],#4
+
+# qhasm: lou1 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou1=int32#5,[<input_0=int32#1],#4
+# asm 2: ldr >lou1=r4,[<input_0=r0],#4
+# copy-collector input: ldr r4,[r0],#4
+
+# qhasm: hiu2 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu2=int32#6,[<input_0=int32#1],#4
+# asm 2: ldr >hiu2=r5,[<input_0=r0],#4
+# copy-collector input: ldr r5,[r0],#4
+
+# qhasm: lou2 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou2=int32#7,[<input_0=int32#1],#4
+# asm 2: ldr >lou2=r6,[<input_0=r0],#4
+# copy-collector input: ldr r6,[r0],#4
+
+# qhasm: hiu3 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hiu3=int32#8,[<input_0=int32#1],#4
+# asm 2: ldr >hiu3=r7,[<input_0=r0],#4
+# copy-collector input: ldr r7,[r0],#4
+
+# qhasm: lou3 = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lou3=int32#9,[<input_0=int32#1],#4
+# asm 2: ldr >lou3=r8,[<input_0=r0],#4
+# copy-collector input: ldr r8,[r0],#4
+
+# qhasm: lou0 = lou0[3]lou0[2]lou0[1]lou0[0]
+# asm 1: rev >lou0=int32#3,<lou0=int32#3
+# asm 2: rev >lou0=r2,<lou0=r2
+# copy-collector output starts
+strd r2,r1,[sp,#208]
+strd r4,r3,[sp,#216]
+strd r6,r5,[sp,#224]
+strd r8,r7,[sp,#232]
+ldr r1,[r0],#4
+ldr r2,[r0],#4
+ldr r3,[r0],#4
+ldr r4,[r0],#4
+ldr r5,[r0],#4
+ldr r6,[r0],#4
+ldr r7,[r0],#4
+ldr r8,[r0],#4
+# copy-collector output ends
+rev r2,r2
+
+# qhasm: hiu0 = hiu0[3]hiu0[2]hiu0[1]hiu0[0]
+# asm 1: rev >hiu0=int32#2,<hiu0=int32#2
+# asm 2: rev >hiu0=r1,<hiu0=r1
+rev r1,r1
+
+# qhasm: lou1 = lou1[3]lou1[2]lou1[1]lou1[0]
+# asm 1: rev >lou1=int32#5,<lou1=int32#5
+# asm 2: rev >lou1=r4,<lou1=r4
+rev r4,r4
+
+# qhasm: hiu1 = hiu1[3]hiu1[2]hiu1[1]hiu1[0]
+# asm 1: rev >hiu1=int32#4,<hiu1=int32#4
+# asm 2: rev >hiu1=r3,<hiu1=r3
+rev r3,r3
+
+# qhasm: lou2 = lou2[3]lou2[2]lou2[1]lou2[0]
+# asm 1: rev >lou2=int32#7,<lou2=int32#7
+# asm 2: rev >lou2=r6,<lou2=r6
+rev r6,r6
+
+# qhasm: hiu2 = hiu2[3]hiu2[2]hiu2[1]hiu2[0]
+# asm 1: rev >hiu2=int32#6,<hiu2=int32#6
+# asm 2: rev >hiu2=r5,<hiu2=r5
+rev r5,r5
+
+# qhasm: lou3 = lou3[3]lou3[2]lou3[1]lou3[0]
+# asm 1: rev >lou3=int32#9,<lou3=int32#9
+# asm 2: rev >lou3=r8,<lou3=r8
+rev r8,r8
+
+# qhasm: hiu3 = hiu3[3]hiu3[2]hiu3[1]hiu3[0]
+# asm 1: rev >hiu3=int32#8,<hiu3=int32#8
+# asm 2: rev >hiu3=r7,<hiu3=r7
+rev.w r7,r7
+
+# qhasm: lom12 = lou0
+# asm 1: str <lou0=int32#3,>lom12=stack32#61
+# asm 2: str <lou0=r2,>lom12=[sp,#240]
+# copy-collector input: str r2,[sp,#240]
+
+# qhasm: him12 = hiu0
+# asm 1: str <hiu0=int32#2,>him12=stack32#62
+# asm 2: str <hiu0=r1,>him12=[sp,#244]
+# copy-collector input: str r1,[sp,#244]
+
+# qhasm: lom13 = lou1
+# asm 1: str <lou1=int32#5,>lom13=stack32#63
+# asm 2: str <lou1=r4,>lom13=[sp,#248]
+# copy-collector input: str r4,[sp,#248]
+
+# qhasm: him13 = hiu1
+# asm 1: str <hiu1=int32#4,>him13=stack32#64
+# asm 2: str <hiu1=r3,>him13=[sp,#252]
+# copy-collector input: str r3,[sp,#252]
+
+# qhasm: lom14 = lou2
+# asm 1: str <lou2=int32#7,>lom14=stack32#65
+# asm 2: str <lou2=r6,>lom14=[sp,#256]
+# copy-collector input: str r6,[sp,#256]
+
+# qhasm: him14 = hiu2
+# asm 1: str <hiu2=int32#6,>him14=stack32#66
+# asm 2: str <hiu2=r5,>him14=[sp,#260]
+# copy-collector input: str r5,[sp,#260]
+
+# qhasm: lom15 = lou3
+# asm 1: str <lou3=int32#9,>lom15=stack32#67
+# asm 2: str <lou3=r8,>lom15=[sp,#264]
+# copy-collector input: str r8,[sp,#264]
+
+# qhasm: him15 = hiu3
+# asm 1: str <hiu3=int32#8,>him15=stack32#68
+# asm 2: str <hiu3=r7,>him15=[sp,#268]
+# copy-collector input: str r7,[sp,#268]
+
+# qhasm: o1 = input_0
+# asm 1: str <input_0=int32#1,>o1=stack32#2
+# asm 2: str <input_0=r0,>o1=[sp,#4]
+# copy-collector input: str r0,[sp,#4]
+
+# qhasm: input_0 = 80 simple
+# asm 1: mov >input_0=int32#1,80
+# asm 2: mov >input_0=r0,80
+# copy-collector output starts
+strd r2,r1,[sp,#240]
+strd r4,r3,[sp,#248]
+strd r6,r5,[sp,#256]
+strd r8,r7,[sp,#264]
+str r0,[sp,#4]
+# copy-collector output ends
+mov r0,80
+
+# qhasm: o4 = input_0
+# asm 1: str <input_0=int32#1,>o4=stack32#69
+# asm 2: str <input_0=r0,>o4=[sp,#272]
+# copy-collector input: str r0,[sp,#272]
+
+# qhasm: innerloop:
+# copy-collector output starts
+str r0,[sp,#272]
+# copy-collector output ends
+._innerloop:
+
+# qhasm: input_0 = o3
+# asm 1: ldr >input_0=int32#1,<o3=stack32#4
+# asm 2: ldr >input_0=r0,<o3=[sp,#12]
+# copy-collector input: ldr r0,[sp,#12]
+
+# qhasm: lou0 = lod3
+# asm 1: ldr >lou0=int32#2,<lod3=stack32#19
+# asm 2: ldr >lou0=r1,<lod3=[sp,#72]
+# copy-collector input: ldr r1,[sp,#72]
+
+# qhasm: hiu0 = hid3
+# asm 1: ldr >hiu0=int32#3,<hid3=stack32#20
+# asm 2: ldr >hiu0=r2,<hid3=[sp,#76]
+# copy-collector input: ldr r2,[sp,#76]
+
+# qhasm: lou1 = lod4
+# asm 1: ldr >lou1=int32#4,<lod4=stack32#29
+# asm 2: ldr >lou1=r3,<lod4=[sp,#112]
+# copy-collector input: ldr r3,[sp,#112]
+
+# qhasm: hiu1 = hid4
+# asm 1: ldr >hiu1=int32#5,<hid4=stack32#30
+# asm 2: ldr >hiu1=r4,<hid4=[sp,#116]
+# copy-collector input: ldr r4,[sp,#116]
+
+# qhasm: lou2 = lod5
+# asm 1: ldr >lou2=int32#6,<lod5=stack32#31
+# asm 2: ldr >lou2=r5,<lod5=[sp,#120]
+# copy-collector input: ldr r5,[sp,#120]
+
+# qhasm: hiu2 = hid5
+# asm 1: ldr >hiu2=int32#7,<hid5=stack32#32
+# asm 2: ldr >hiu2=r6,<hid5=[sp,#124]
+# copy-collector input: ldr r6,[sp,#124]
+
+# qhasm: lou3 = lod6
+# asm 1: ldr >lou3=int32#8,<lod6=stack32#33
+# asm 2: ldr >lou3=r7,<lod6=[sp,#128]
+# copy-collector input: ldr r7,[sp,#128]
+
+# qhasm: hiu3 = hid6
+# asm 1: ldr >hiu3=int32#9,<hid6=stack32#34
+# asm 2: ldr >hiu3=r8,<hid6=[sp,#132]
+# copy-collector input: ldr r8,[sp,#132]
+
+# qhasm: lou4 = lod7
+# asm 1: ldr >lou4=int32#10,<lod7=stack32#35
+# asm 2: ldr >lou4=r9,<lod7=[sp,#136]
+# copy-collector input: ldr r9,[sp,#136]
+
+# qhasm: hiu4 = hid7
+# asm 1: ldr >hiu4=int32#11,<hid7=stack32#36
+# asm 2: ldr >hiu4=r10,<hid7=[sp,#140]
+# copy-collector input: ldr r10,[sp,#140]
+
+# qhasm: lou5 = lod8
+# asm 1: ldr >lou5=int32#12,<lod8=stack32#37
+# asm 2: ldr >lou5=r11,<lod8=[sp,#144]
+# copy-collector input: ldr r11,[sp,#144]
+
+# qhasm: hiu5 = hid8
+# asm 1: ldr >hiu5=int32#13,<hid8=stack32#38
+# asm 2: ldr >hiu5=r12,<hid8=[sp,#148]
+# copy-collector input: ldr r12,[sp,#148]
+
+# qhasm: two23 = 0x800000 simple
+# asm 1: mov >two23=int32#14,0x800000
+# asm 2: mov >two23=r14,0x800000
+# copy-collector output starts
+ldr r0,[sp,#12]
+ldr r1,[sp,#72]
+ldr r2,[sp,#76]
+ldr r3,[sp,#112]
+ldr r4,[sp,#116]
+ldr r5,[sp,#120]
+ldr r6,[sp,#124]
+ldr r7,[sp,#128]
+ldr r8,[sp,#132]
+ldr r9,[sp,#136]
+ldr r10,[sp,#140]
+ldr r11,[sp,#144]
+ldr r12,[sp,#148]
+# copy-collector output ends
+mov r14,0x800000
+
+# qhasm: carry?  lou4 += lou5
+# asm 1: adds >lou4=int32#10,<lou4=int32#10,<lou5=int32#12
+# asm 2: adds >lou4=r9,<lou4=r9,<lou5=r11
+adds r9,r9,r11
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#11,<hiu4=int32#11,<hiu5=int32#13
+# asm 2: adc >hiu4=r10,<hiu4=r10,<hiu5=r12
+adc r10,r10,r12
+
+# qhasm: lotmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lotmp=int32#12,[<input_0=int32#1],#4
+# asm 2: ldr >lotmp=r11,[<input_0=r0],#4
+# copy-collector input: ldr r11,[r0],#4
+
+# qhasm: hitmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hitmp=int32#13,[<input_0=int32#1],#4
+# asm 2: ldr >hitmp=r12,[<input_0=r0],#4
+# copy-collector input: ldr r12,[r0],#4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#10,<lou4=int32#10,<lotmp=int32#12
+# asm 2: adds >lou4=r9,<lou4=r9,<lotmp=r11
+# copy-collector output starts
+ldr r11,[r0],#4
+ldr r12,[r0],#4
+# copy-collector output ends
+adds r9,r9,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#11,<hiu4=int32#11,<hitmp=int32#13
+# asm 2: adc >hiu4=r10,<hiu4=r10,<hitmp=r12
+adc r10,r10,r12
+
+# qhasm: hitmp lotmp = lou1 * two23
+# asm 1: umull >lotmp=int32#13,>hitmp=int32#12,<lou1=int32#4,<two23=int32#14
+# asm 2: umull >lotmp=r12,>hitmp=r11,<lou1=r3,<two23=r14
+umull r12,r11,r3,r14
+
+# qhasm: lotmp hitmp += hiu1 * two23
+# asm 1: umlal <hitmp=int32#12,<lotmp=int32#13,<hiu1=int32#5,<two23=int32#14
+# asm 2: umlal <hitmp=r11,<lotmp=r12,<hiu1=r4,<two23=r14
+umlal r11,r12,r4,r14
+
+# qhasm: lotmp ^= (lou1 unsigned>> 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou1=int32#4,LSR #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou1=r3,LSR #18
+eor r12,r12,r3,LSR #18
+
+# qhasm: lotmp ^= (hiu1 << 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu1=int32#5,LSL #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu1=r4,LSL #14
+eor r12,r12,r4,LSL #14
+
+# qhasm: lotmp ^= (lou1 unsigned>> 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou1=int32#4,LSR #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou1=r3,LSR #14
+eor r12,r12,r3,LSR #14
+
+# qhasm: lotmp ^= (hiu1 << 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu1=int32#5,LSL #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu1=r4,LSL #18
+eor r12,r12,r4,LSL #18
+
+# qhasm: hitmp ^= (hiu1 unsigned>> 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu1=int32#5,LSR #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu1=r4,LSR #18
+eor r11,r11,r4,LSR #18
+
+# qhasm: hitmp ^= (lou1 << 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou1=int32#4,LSL #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou1=r3,LSL #14
+eor r11,r11,r3,LSL #14
+
+# qhasm: hitmp ^= (hiu1 unsigned>> 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu1=int32#5,LSR #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu1=r4,LSR #14
+eor r11,r11,r4,LSR #14
+
+# qhasm: hitmp ^= (lou1 << 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou1=int32#4,LSL #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou1=r3,LSL #18
+eor r11,r11,r3,LSL #18
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#10,<lou4=int32#10,<lotmp=int32#13
+# asm 2: adds >lou4=r9,<lou4=r9,<lotmp=r12
+adds r9,r9,r12
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#11,<hiu4=int32#11,<hitmp=int32#12
+# asm 2: adc >hiu4=r10,<hiu4=r10,<hitmp=r11
+adc r10,r10,r11
+
+# qhasm: lotmp = lou1 & lou2
+# asm 1: and >lotmp=int32#6,<lou1=int32#4,<lou2=int32#6
+# asm 2: and >lotmp=r5,<lou1=r3,<lou2=r5
+and r5,r3,r5
+
+# qhasm: lotmp2 = lou3 & ~lou1
+# asm 1: bic >lotmp2=int32#4,<lou3=int32#8,<lou1=int32#4
+# asm 2: bic >lotmp2=r3,<lou3=r7,<lou1=r3
+bic r3,r7,r3
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#4,<lotmp=int32#6,<lotmp2=int32#4
+# asm 2: eor >lotmp=r3,<lotmp=r5,<lotmp2=r3
+eor r3,r5,r3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#4,<lou4=int32#10,<lotmp=int32#4
+# asm 2: adds >lou4=r3,<lou4=r9,<lotmp=r3
+adds r3,r9,r3
+
+# qhasm: hitmp = hiu1 & hiu2
+# asm 1: and >hitmp=int32#6,<hiu1=int32#5,<hiu2=int32#7
+# asm 2: and >hitmp=r5,<hiu1=r4,<hiu2=r6
+and r5,r4,r6
+
+# qhasm: hitmp2 = hiu3 & ~hiu1
+# asm 1: bic >hitmp2=int32#5,<hiu3=int32#9,<hiu1=int32#5
+# asm 2: bic >hitmp2=r4,<hiu3=r8,<hiu1=r4
+bic r4,r8,r4
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#5,<hitmp=int32#6,<hitmp2=int32#5
+# asm 2: eor >hitmp=r4,<hitmp=r5,<hitmp2=r4
+eor r4,r5,r4
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#5,<hiu4=int32#11,<hitmp=int32#5
+# asm 2: adc >hiu4=r4,<hiu4=r10,<hitmp=r4
+adc r4,r10,r4
+
+# qhasm: carry? lou0 += lou4
+# asm 1: adds >lou0=int32#2,<lou0=int32#2,<lou4=int32#4
+# asm 2: adds >lou0=r1,<lou0=r1,<lou4=r3
+adds r1,r1,r3
+
+# qhasm: hiu0 += hiu4 + carry
+# asm 1: adc >hiu0=int32#3,<hiu0=int32#3,<hiu4=int32#5
+# asm 2: adc >hiu0=r2,<hiu0=r2,<hiu4=r4
+adc r2,r2,r4
+
+# qhasm: lod3 = lou0
+# asm 1: str <lou0=int32#2,>lod3=stack32#4
+# asm 2: str <lou0=r1,>lod3=[sp,#12]
+# copy-collector input: str r1,[sp,#12]
+
+# qhasm: hid3 = hiu0
+# asm 1: str <hiu0=int32#3,>hid3=stack32#19
+# asm 2: str <hiu0=r2,>hid3=[sp,#72]
+# copy-collector input: str r2,[sp,#72]
+
+# qhasm: lou1 = lod0
+# asm 1: ldr >lou1=int32#6,<lod0=stack32#13
+# asm 2: ldr >lou1=r5,<lod0=[sp,#48]
+# copy-collector input: ldr r5,[sp,#48]
+
+# qhasm: hiu1 = hid0
+# asm 1: ldr >hiu1=int32#7,<hid0=stack32#14
+# asm 2: ldr >hiu1=r6,<hid0=[sp,#52]
+# copy-collector input: ldr r6,[sp,#52]
+
+# qhasm: lou2 = lod1
+# asm 1: ldr >lou2=int32#8,<lod1=stack32#15
+# asm 2: ldr >lou2=r7,<lod1=[sp,#56]
+# copy-collector input: ldr r7,[sp,#56]
+
+# qhasm: hiu2 = hid1
+# asm 1: ldr >hiu2=int32#9,<hid1=stack32#16
+# asm 2: ldr >hiu2=r8,<hid1=[sp,#60]
+# copy-collector input: ldr r8,[sp,#60]
+
+# qhasm: lou3 = lod2
+# asm 1: ldr >lou3=int32#10,<lod2=stack32#17
+# asm 2: ldr >lou3=r9,<lod2=[sp,#64]
+# copy-collector input: ldr r9,[sp,#64]
+
+# qhasm: hiu3 = hid2
+# asm 1: ldr >hiu3=int32#11,<hid2=stack32#18
+# asm 2: ldr >hiu3=r10,<hid2=[sp,#68]
+# copy-collector input: ldr r10,[sp,#68]
+
+# qhasm: two25 = 0x2000000 simple
+# asm 1: mov >two25=int32#12,0x2000000
+# asm 2: mov >two25=r11,0x2000000
+# copy-collector output starts
+str r1,[sp,#12]
+str r2,[sp,#72]
+ldr r5,[sp,#48]
+ldr r6,[sp,#52]
+ldr r7,[sp,#56]
+ldr r8,[sp,#60]
+ldr r9,[sp,#64]
+ldr r10,[sp,#68]
+# copy-collector output ends
+mov r11,0x2000000
+
+# qhasm: hitmp lotmp = lou1 * two25
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<lou1=int32#6,<two25=int32#12
+# asm 2: umull >lotmp=r14,>hitmp=r12,<lou1=r5,<two25=r11
+umull r14,r12,r5,r11
+
+# qhasm: lotmp hitmp += hiu1 * two25
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<hiu1=int32#7,<two25=int32#12
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<hiu1=r6,<two25=r11
+umlal r12,r14,r6,r11
+
+# qhasm: lotmp ^= (hiu1 unsigned>> 2)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#14,<hiu1=int32#7,LSR #2
+# asm 2: eor >lotmp=r11,<lotmp=r14,<hiu1=r6,LSR #2
+eor r11,r14,r6,LSR #2
+
+# qhasm: lotmp ^= (lou1 << 30)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou1=int32#6,LSL #30
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou1=r5,LSL #30
+eor r11,r11,r5,LSL #30
+
+# qhasm: lotmp ^= (lou1 unsigned>> 28)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou1=int32#6,LSR #28
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou1=r5,LSR #28
+eor r11,r11,r5,LSR #28
+
+# qhasm: lotmp ^= (hiu1 << 4)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu1=int32#7,LSL #4
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu1=r6,LSL #4
+eor r11,r11,r6,LSL #4
+
+# qhasm: hitmp ^= (lou1 unsigned>> 2)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou1=int32#6,LSR #2
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou1=r5,LSR #2
+eor r12,r12,r5,LSR #2
+
+# qhasm: hitmp ^= (hiu1 << 30)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu1=int32#7,LSL #30
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu1=r6,LSL #30
+eor r12,r12,r6,LSL #30
+
+# qhasm: hitmp ^= (hiu1 unsigned>> 28)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu1=int32#7,LSR #28
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu1=r6,LSR #28
+eor r12,r12,r6,LSR #28
+
+# qhasm: hitmp ^= (lou1 << 4)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou1=int32#6,LSL #4
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou1=r5,LSL #4
+eor r12,r12,r5,LSL #4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#4,<lou4=int32#4,<lotmp=int32#12
+# asm 2: adds >lou4=r3,<lou4=r3,<lotmp=r11
+adds r3,r3,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#5,<hiu4=int32#5,<hitmp=int32#13
+# asm 2: adc >hiu4=r4,<hiu4=r4,<hitmp=r12
+adc r4,r4,r12
+
+# qhasm: lotmp = lou2 ^ lou3
+# asm 1: eor >lotmp=int32#12,<lou2=int32#8,<lou3=int32#10
+# asm 2: eor >lotmp=r11,<lou2=r7,<lou3=r9
+eor r11,r7,r9
+
+# qhasm: lotmp &= lou1
+# asm 1: and >lotmp=int32#6,<lotmp=int32#12,<lou1=int32#6
+# asm 2: and >lotmp=r5,<lotmp=r11,<lou1=r5
+and r5,r11,r5
+
+# qhasm: lotmp2 = lou2 & lou3
+# asm 1: and >lotmp2=int32#8,<lou2=int32#8,<lou3=int32#10
+# asm 2: and >lotmp2=r7,<lou2=r7,<lou3=r9
+and r7,r7,r9
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#6,<lotmp=int32#6,<lotmp2=int32#8
+# asm 2: eor >lotmp=r5,<lotmp=r5,<lotmp2=r7
+eor r5,r5,r7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#4,<lou4=int32#4,<lotmp=int32#6
+# asm 2: adds >lou4=r3,<lou4=r3,<lotmp=r5
+adds.w r3,r3,r5
+
+# qhasm: hitmp = hiu2 ^ hiu3
+# asm 1: eor >hitmp=int32#6,<hiu2=int32#9,<hiu3=int32#11
+# asm 2: eor >hitmp=r5,<hiu2=r8,<hiu3=r10
+eor r5,r8,r10
+
+# qhasm: hitmp &= hiu1
+# asm 1: and >hitmp=int32#6,<hitmp=int32#6,<hiu1=int32#7
+# asm 2: and >hitmp=r5,<hitmp=r5,<hiu1=r6
+and r5,r5,r6
+
+# qhasm: hitmp2 = hiu2 & hiu3
+# asm 1: and >hitmp2=int32#7,<hiu2=int32#9,<hiu3=int32#11
+# asm 2: and >hitmp2=r6,<hiu2=r8,<hiu3=r10
+and r6,r8,r10
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#6,<hitmp=int32#6,<hitmp2=int32#7
+# asm 2: eor >hitmp=r5,<hitmp=r5,<hitmp2=r6
+eor r5,r5,r6
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#5,<hiu4=int32#5,<hitmp=int32#6
+# asm 2: adc >hiu4=r4,<hiu4=r4,<hitmp=r5
+adc r4,r4,r5
+
+# qhasm: lod7 = lou4
+# asm 1: str <lou4=int32#4,>lod7=stack32#17
+# asm 2: str <lou4=r3,>lod7=[sp,#64]
+# copy-collector input: str r3,[sp,#64]
+
+# qhasm: hid7 = hiu4
+# asm 1: str <hiu4=int32#5,>hid7=stack32#18
+# asm 2: str <hiu4=r4,>hid7=[sp,#68]
+# copy-collector input: str r4,[sp,#68]
+
+# qhasm: lou1 = lod4
+# asm 1: ldr >lou1=int32#4,<lod4=stack32#29
+# asm 2: ldr >lou1=r3,<lod4=[sp,#112]
+# copy-collector input: ldr r3,[sp,#112]
+
+# qhasm: hiu1 = hid4
+# asm 1: ldr >hiu1=int32#5,<hid4=stack32#30
+# asm 2: ldr >hiu1=r4,<hid4=[sp,#116]
+# copy-collector input: ldr r4,[sp,#116]
+
+# qhasm: lou2 = lod5
+# asm 1: ldr >lou2=int32#6,<lod5=stack32#31
+# asm 2: ldr >lou2=r5,<lod5=[sp,#120]
+# copy-collector input: ldr r5,[sp,#120]
+
+# qhasm: hiu2 = hid5
+# asm 1: ldr >hiu2=int32#7,<hid5=stack32#32
+# asm 2: ldr >hiu2=r6,<hid5=[sp,#124]
+# copy-collector input: ldr r6,[sp,#124]
+
+# qhasm: lou4 = lod6
+# asm 1: ldr >lou4=int32#8,<lod6=stack32#33
+# asm 2: ldr >lou4=r7,<lod6=[sp,#128]
+# copy-collector input: ldr r7,[sp,#128]
+
+# qhasm: hiu4 = hid6
+# asm 1: ldr >hiu4=int32#9,<hid6=stack32#34
+# asm 2: ldr >hiu4=r8,<hid6=[sp,#132]
+# copy-collector input: ldr r8,[sp,#132]
+
+# qhasm: lou5 = lod9
+# asm 1: ldr >lou5=int32#12,<lod9=stack32#39
+# asm 2: ldr >lou5=r11,<lod9=[sp,#152]
+# copy-collector input: ldr r11,[sp,#152]
+
+# qhasm: hiu5 = hid9
+# asm 1: ldr >hiu5=int32#13,<hid9=stack32#40
+# asm 2: ldr >hiu5=r12,<hid9=[sp,#156]
+# copy-collector input: ldr r12,[sp,#156]
+
+# qhasm: two23 = 0x800000 simple
+# asm 1: mov >two23=int32#14,0x800000
+# asm 2: mov >two23=r14,0x800000
+# copy-collector output starts
+strd r3,r4,[sp,#64]
+ldr r3,[sp,#112]
+ldr r4,[sp,#116]
+ldr r5,[sp,#120]
+ldr r6,[sp,#124]
+ldr.w r7,[sp,#128]
+ldr r8,[sp,#132]
+ldr r11,[sp,#152]
+ldr r12,[sp,#156]
+# copy-collector output ends
+mov r14,0x800000
+
+# qhasm: carry?  lou4 += lou5
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lou5=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lou5=r11
+adds r7,r7,r11
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hiu5=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hiu5=r12
+adc r8,r8,r12
+
+# qhasm: lotmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lotmp=int32#12,[<input_0=int32#1],#4
+# asm 2: ldr >lotmp=r11,[<input_0=r0],#4
+# copy-collector input: ldr r11,[r0],#4
+
+# qhasm: hitmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hitmp=int32#13,[<input_0=int32#1],#4
+# asm 2: ldr >hitmp=r12,[<input_0=r0],#4
+# copy-collector input: ldr r12,[r0],#4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r11
+# copy-collector output starts
+ldr r11,[r0],#4
+ldr r12,[r0],#4
+# copy-collector output ends
+adds r7,r7,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r12
+adc r8,r8,r12
+
+# qhasm: hitmp lotmp = lou0 * two23
+# asm 1: umull >lotmp=int32#13,>hitmp=int32#12,<lou0=int32#2,<two23=int32#14
+# asm 2: umull >lotmp=r12,>hitmp=r11,<lou0=r1,<two23=r14
+umull r12,r11,r1,r14
+
+# qhasm: lotmp hitmp += hiu0 * two23
+# asm 1: umlal <hitmp=int32#12,<lotmp=int32#13,<hiu0=int32#3,<two23=int32#14
+# asm 2: umlal <hitmp=r11,<lotmp=r12,<hiu0=r2,<two23=r14
+umlal r11,r12,r2,r14
+
+# qhasm: lotmp ^= (lou0 unsigned>> 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou0=int32#2,LSR #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou0=r1,LSR #18
+eor r12,r12,r1,LSR #18
+
+# qhasm: lotmp ^= (hiu0 << 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu0=int32#3,LSL #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu0=r2,LSL #14
+eor r12,r12,r2,LSL #14
+
+# qhasm: lotmp ^= (lou0 unsigned>> 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou0=int32#2,LSR #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou0=r1,LSR #14
+eor r12,r12,r1,LSR #14
+
+# qhasm: lotmp ^= (hiu0 << 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu0=int32#3,LSL #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu0=r2,LSL #18
+eor r12,r12,r2,LSL #18
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu0=int32#3,LSR #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu0=r2,LSR #18
+eor r11,r11,r2,LSR #18
+
+# qhasm: hitmp ^= (lou0 << 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou0=int32#2,LSL #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou0=r1,LSL #14
+eor r11,r11,r1,LSL #14
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu0=int32#3,LSR #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu0=r2,LSR #14
+eor r11,r11,r2,LSR #14
+
+# qhasm: hitmp ^= (lou0 << 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou0=int32#2,LSL #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou0=r1,LSL #18
+eor r11,r11,r1,LSL #18
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#13
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r12
+adds r7,r7,r12
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#12
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r11
+adc r8,r8,r11
+
+# qhasm: lotmp = lou0 & lou1
+# asm 1: and >lotmp=int32#4,<lou0=int32#2,<lou1=int32#4
+# asm 2: and >lotmp=r3,<lou0=r1,<lou1=r3
+and r3,r1,r3
+
+# qhasm: lotmp2 = lou2 & ~lou0
+# asm 1: bic >lotmp2=int32#2,<lou2=int32#6,<lou0=int32#2
+# asm 2: bic >lotmp2=r1,<lou2=r5,<lou0=r1
+bic r1,r5,r1
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#4,<lotmp2=int32#2
+# asm 2: eor >lotmp=r1,<lotmp=r3,<lotmp2=r1
+eor r1,r3,r1
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#8,<lotmp=int32#2
+# asm 2: adds >lou4=r1,<lou4=r7,<lotmp=r1
+adds.w r1,r7,r1
+
+# qhasm: hitmp = hiu0 & hiu1
+# asm 1: and >hitmp=int32#4,<hiu0=int32#3,<hiu1=int32#5
+# asm 2: and >hitmp=r3,<hiu0=r2,<hiu1=r4
+and r3,r2,r4
+
+# qhasm: hitmp2 = hiu2 & ~hiu0
+# asm 1: bic >hitmp2=int32#3,<hiu2=int32#7,<hiu0=int32#3
+# asm 2: bic >hitmp2=r2,<hiu2=r6,<hiu0=r2
+bic r2,r6,r2
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#4,<hitmp2=int32#3
+# asm 2: eor >hitmp=r2,<hitmp=r3,<hitmp2=r2
+eor r2,r3,r2
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#9,<hitmp=int32#3
+# asm 2: adc >hiu4=r2,<hiu4=r8,<hitmp=r2
+adc r2,r8,r2
+
+# qhasm: carry? lou3 += lou4
+# asm 1: adds >lou3=int32#4,<lou3=int32#10,<lou4=int32#2
+# asm 2: adds >lou3=r3,<lou3=r9,<lou4=r1
+adds r3,r9,r1
+
+# qhasm: hiu3 += hiu4 + carry
+# asm 1: adc >hiu3=int32#5,<hiu3=int32#11,<hiu4=int32#3
+# asm 2: adc >hiu3=r4,<hiu3=r10,<hiu4=r2
+adc r4,r10,r2
+
+# qhasm: lod2 = lou3
+# asm 1: str <lou3=int32#4,>lod2=stack32#33
+# asm 2: str <lou3=r3,>lod2=[sp,#128]
+# copy-collector input: str r3,[sp,#128]
+
+# qhasm: hid2 = hiu3
+# asm 1: str <hiu3=int32#5,>hid2=stack32#34
+# asm 2: str <hiu3=r4,>hid2=[sp,#132]
+# copy-collector input: str r4,[sp,#132]
+
+# qhasm: lou0 = lod7
+# asm 1: ldr >lou0=int32#6,<lod7=stack32#17
+# asm 2: ldr >lou0=r5,<lod7=[sp,#64]
+# copy-collector input: ldr r5,[sp,#64]
+
+# qhasm: hiu0 = hid7
+# asm 1: ldr >hiu0=int32#7,<hid7=stack32#18
+# asm 2: ldr >hiu0=r6,<hid7=[sp,#68]
+# copy-collector input: ldr r6,[sp,#68]
+
+# qhasm: lou1 = lod0
+# asm 1: ldr >lou1=int32#8,<lod0=stack32#13
+# asm 2: ldr >lou1=r7,<lod0=[sp,#48]
+# copy-collector input: ldr r7,[sp,#48]
+
+# qhasm: hiu1 = hid0
+# asm 1: ldr >hiu1=int32#9,<hid0=stack32#14
+# asm 2: ldr >hiu1=r8,<hid0=[sp,#52]
+# copy-collector input: ldr r8,[sp,#52]
+
+# qhasm: lou2 = lod1
+# asm 1: ldr >lou2=int32#10,<lod1=stack32#15
+# asm 2: ldr >lou2=r9,<lod1=[sp,#56]
+# copy-collector input: ldr r9,[sp,#56]
+
+# qhasm: hiu2 = hid1
+# asm 1: ldr >hiu2=int32#11,<hid1=stack32#16
+# asm 2: ldr >hiu2=r10,<hid1=[sp,#60]
+# copy-collector input: ldr r10,[sp,#60]
+
+# qhasm: two25 = 0x2000000 simple
+# asm 1: mov >two25=int32#12,0x2000000
+# asm 2: mov >two25=r11,0x2000000
+# copy-collector output starts
+strd r3,r4,[sp,#128]
+ldr r5,[sp,#64]
+ldr r6,[sp,#68]
+ldr.w r7,[sp,#48]
+ldr r8,[sp,#52]
+ldr r9,[sp,#56]
+ldr r10,[sp,#60]
+# copy-collector output ends
+mov r11,0x2000000
+
+# qhasm: hitmp lotmp = lou0 * two25
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<lou0=int32#6,<two25=int32#12
+# asm 2: umull >lotmp=r14,>hitmp=r12,<lou0=r5,<two25=r11
+umull r14,r12,r5,r11
+
+# qhasm: lotmp hitmp += hiu0 * two25
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<hiu0=int32#7,<two25=int32#12
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<hiu0=r6,<two25=r11
+umlal r12,r14,r6,r11
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 2)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#14,<hiu0=int32#7,LSR #2
+# asm 2: eor >lotmp=r11,<lotmp=r14,<hiu0=r6,LSR #2
+eor r11,r14,r6,LSR #2
+
+# qhasm: lotmp ^= (lou0 << 30)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSL #30
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSL #30
+eor r11,r11,r5,LSL #30
+
+# qhasm: lotmp ^= (lou0 unsigned>> 28)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSR #28
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSR #28
+eor r11,r11,r5,LSR #28
+
+# qhasm: lotmp ^= (hiu0 << 4)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu0=int32#7,LSL #4
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu0=r6,LSL #4
+eor r11,r11,r6,LSL #4
+
+# qhasm: hitmp ^= (lou0 unsigned>> 2)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSR #2
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSR #2
+eor r12,r12,r5,LSR #2
+
+# qhasm: hitmp ^= (hiu0 << 30)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSL #30
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSL #30
+eor r12,r12,r6,LSL #30
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 28)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSR #28
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSR #28
+eor r12,r12,r6,LSR #28
+
+# qhasm: hitmp ^= (lou0 << 4)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSL #4
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSL #4
+eor r12,r12,r5,LSL #4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#12
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r11
+adds r1,r1,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#13
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r12
+adc r2,r2,r12
+
+# qhasm: lotmp = lou1 ^ lou2
+# asm 1: eor >lotmp=int32#12,<lou1=int32#8,<lou2=int32#10
+# asm 2: eor >lotmp=r11,<lou1=r7,<lou2=r9
+eor r11,r7,r9
+
+# qhasm: lotmp &= lou0
+# asm 1: and >lotmp=int32#6,<lotmp=int32#12,<lou0=int32#6
+# asm 2: and >lotmp=r5,<lotmp=r11,<lou0=r5
+and r5,r11,r5
+
+# qhasm: lotmp2 = lou1 & lou2
+# asm 1: and >lotmp2=int32#8,<lou1=int32#8,<lou2=int32#10
+# asm 2: and >lotmp2=r7,<lou1=r7,<lou2=r9
+and r7,r7,r9
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#6,<lotmp=int32#6,<lotmp2=int32#8
+# asm 2: eor >lotmp=r5,<lotmp=r5,<lotmp2=r7
+eor r5,r5,r7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#6
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r5
+adds.w r1,r1,r5
+
+# qhasm: hitmp = hiu1 ^ hiu2
+# asm 1: eor >hitmp=int32#6,<hiu1=int32#9,<hiu2=int32#11
+# asm 2: eor >hitmp=r5,<hiu1=r8,<hiu2=r10
+eor r5,r8,r10
+
+# qhasm: hitmp &= hiu0
+# asm 1: and >hitmp=int32#6,<hitmp=int32#6,<hiu0=int32#7
+# asm 2: and >hitmp=r5,<hitmp=r5,<hiu0=r6
+and r5,r5,r6
+
+# qhasm: hitmp2 = hiu1 & hiu2
+# asm 1: and >hitmp2=int32#7,<hiu1=int32#9,<hiu2=int32#11
+# asm 2: and >hitmp2=r6,<hiu1=r8,<hiu2=r10
+and r6,r8,r10
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#6,<hitmp=int32#6,<hitmp2=int32#7
+# asm 2: eor >hitmp=r5,<hitmp=r5,<hitmp2=r6
+eor r5,r5,r6
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#6
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r5
+adc r2,r2,r5
+
+# qhasm: lod6 = lou4
+# asm 1: str <lou4=int32#2,>lod6=stack32#15
+# asm 2: str <lou4=r1,>lod6=[sp,#56]
+# copy-collector input: str r1,[sp,#56]
+
+# qhasm: hid6 = hiu4
+# asm 1: str <hiu4=int32#3,>hid6=stack32#16
+# asm 2: str <hiu4=r2,>hid6=[sp,#60]
+# copy-collector input: str r2,[sp,#60]
+
+# qhasm: lou0 = lod3
+# asm 1: ldr >lou0=int32#2,<lod3=stack32#4
+# asm 2: ldr >lou0=r1,<lod3=[sp,#12]
+# copy-collector input: ldr r1,[sp,#12]
+
+# qhasm: hiu0 = hid3
+# asm 1: ldr >hiu0=int32#3,<hid3=stack32#19
+# asm 2: ldr >hiu0=r2,<hid3=[sp,#72]
+# copy-collector input: ldr r2,[sp,#72]
+
+# qhasm: lou1 = lod4
+# asm 1: ldr >lou1=int32#6,<lod4=stack32#29
+# asm 2: ldr >lou1=r5,<lod4=[sp,#112]
+# copy-collector input: ldr r5,[sp,#112]
+
+# qhasm: hiu1 = hid4
+# asm 1: ldr >hiu1=int32#7,<hid4=stack32#30
+# asm 2: ldr >hiu1=r6,<hid4=[sp,#116]
+# copy-collector input: ldr r6,[sp,#116]
+
+# qhasm: lou4 = lod5
+# asm 1: ldr >lou4=int32#8,<lod5=stack32#31
+# asm 2: ldr >lou4=r7,<lod5=[sp,#120]
+# copy-collector input: ldr r7,[sp,#120]
+
+# qhasm: hiu4 = hid5
+# asm 1: ldr >hiu4=int32#9,<hid5=stack32#32
+# asm 2: ldr >hiu4=r8,<hid5=[sp,#124]
+# copy-collector input: ldr r8,[sp,#124]
+
+# qhasm: lou5 = lod10
+# asm 1: ldr >lou5=int32#12,<lod10=stack32#41
+# asm 2: ldr >lou5=r11,<lod10=[sp,#160]
+# copy-collector input: ldr r11,[sp,#160]
+
+# qhasm: hiu5 = hid10
+# asm 1: ldr >hiu5=int32#13,<hid10=stack32#42
+# asm 2: ldr >hiu5=r12,<hid10=[sp,#164]
+# copy-collector input: ldr r12,[sp,#164]
+
+# qhasm: two23 = 0x800000 simple
+# asm 1: mov >two23=int32#14,0x800000
+# asm 2: mov >two23=r14,0x800000
+# copy-collector output starts
+strd r1,r2,[sp,#56]
+ldr r1,[sp,#12]
+ldr r2,[sp,#72]
+ldr r5,[sp,#112]
+ldr r6,[sp,#116]
+ldr.w r7,[sp,#120]
+ldr r8,[sp,#124]
+ldr r11,[sp,#160]
+ldr r12,[sp,#164]
+# copy-collector output ends
+mov r14,0x800000
+
+# qhasm: carry?  lou4 += lou5
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lou5=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lou5=r11
+adds r7,r7,r11
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hiu5=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hiu5=r12
+adc r8,r8,r12
+
+# qhasm: lotmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lotmp=int32#12,[<input_0=int32#1],#4
+# asm 2: ldr >lotmp=r11,[<input_0=r0],#4
+# copy-collector input: ldr r11,[r0],#4
+
+# qhasm: hitmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hitmp=int32#13,[<input_0=int32#1],#4
+# asm 2: ldr >hitmp=r12,[<input_0=r0],#4
+# copy-collector input: ldr r12,[r0],#4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r11
+# copy-collector output starts
+ldr r11,[r0],#4
+ldr r12,[r0],#4
+# copy-collector output ends
+adds r7,r7,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r12
+adc r8,r8,r12
+
+# qhasm: hitmp lotmp = lou3 * two23
+# asm 1: umull >lotmp=int32#13,>hitmp=int32#12,<lou3=int32#4,<two23=int32#14
+# asm 2: umull >lotmp=r12,>hitmp=r11,<lou3=r3,<two23=r14
+umull r12,r11,r3,r14
+
+# qhasm: lotmp hitmp += hiu3 * two23
+# asm 1: umlal <hitmp=int32#12,<lotmp=int32#13,<hiu3=int32#5,<two23=int32#14
+# asm 2: umlal <hitmp=r11,<lotmp=r12,<hiu3=r4,<two23=r14
+umlal r11,r12,r4,r14
+
+# qhasm: lotmp ^= (lou3 unsigned>> 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou3=int32#4,LSR #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou3=r3,LSR #18
+eor r12,r12,r3,LSR #18
+
+# qhasm: lotmp ^= (hiu3 << 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu3=int32#5,LSL #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu3=r4,LSL #14
+eor r12,r12,r4,LSL #14
+
+# qhasm: lotmp ^= (lou3 unsigned>> 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou3=int32#4,LSR #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou3=r3,LSR #14
+eor r12,r12,r3,LSR #14
+
+# qhasm: lotmp ^= (hiu3 << 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu3=int32#5,LSL #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu3=r4,LSL #18
+eor r12,r12,r4,LSL #18
+
+# qhasm: hitmp ^= (hiu3 unsigned>> 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu3=int32#5,LSR #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu3=r4,LSR #18
+eor r11,r11,r4,LSR #18
+
+# qhasm: hitmp ^= (lou3 << 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou3=int32#4,LSL #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou3=r3,LSL #14
+eor r11,r11,r3,LSL #14
+
+# qhasm: hitmp ^= (hiu3 unsigned>> 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu3=int32#5,LSR #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu3=r4,LSR #14
+eor r11,r11,r4,LSR #14
+
+# qhasm: hitmp ^= (lou3 << 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou3=int32#4,LSL #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou3=r3,LSL #18
+eor r11,r11,r3,LSL #18
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#13
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r12
+adds r7,r7,r12
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#12
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r11
+adc r8,r8,r11
+
+# qhasm: lotmp = lou3 & lou0
+# asm 1: and >lotmp=int32#2,<lou3=int32#4,<lou0=int32#2
+# asm 2: and >lotmp=r1,<lou3=r3,<lou0=r1
+and r1,r3,r1
+
+# qhasm: lotmp2 = lou1 & ~lou3
+# asm 1: bic >lotmp2=int32#4,<lou1=int32#6,<lou3=int32#4
+# asm 2: bic >lotmp2=r3,<lou1=r5,<lou3=r3
+bic r3,r5,r3
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<lotmp2=int32#4
+# asm 2: eor >lotmp=r1,<lotmp=r1,<lotmp2=r3
+eor r1,r1,r3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#8,<lotmp=int32#2
+# asm 2: adds >lou4=r1,<lou4=r7,<lotmp=r1
+adds.w r1,r7,r1
+
+# qhasm: hitmp = hiu3 & hiu0
+# asm 1: and >hitmp=int32#3,<hiu3=int32#5,<hiu0=int32#3
+# asm 2: and >hitmp=r2,<hiu3=r4,<hiu0=r2
+and r2,r4,r2
+
+# qhasm: hitmp2 = hiu1 & ~hiu3
+# asm 1: bic >hitmp2=int32#4,<hiu1=int32#7,<hiu3=int32#5
+# asm 2: bic >hitmp2=r3,<hiu1=r6,<hiu3=r4
+bic r3,r6,r4
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hitmp2=int32#4
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hitmp2=r3
+eor r2,r2,r3
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#9,<hitmp=int32#3
+# asm 2: adc >hiu4=r2,<hiu4=r8,<hitmp=r2
+adc r2,r8,r2
+
+# qhasm: carry? lou2 += lou4
+# asm 1: adds >lou2=int32#4,<lou2=int32#10,<lou4=int32#2
+# asm 2: adds >lou2=r3,<lou2=r9,<lou4=r1
+adds r3,r9,r1
+
+# qhasm: hiu2 += hiu4 + carry
+# asm 1: adc >hiu2=int32#5,<hiu2=int32#11,<hiu4=int32#3
+# asm 2: adc >hiu2=r4,<hiu2=r10,<hiu4=r2
+adc r4,r10,r2
+
+# qhasm: lod1 = lou2
+# asm 1: str <lou2=int32#4,>lod1=stack32#31
+# asm 2: str <lou2=r3,>lod1=[sp,#120]
+# copy-collector input: str r3,[sp,#120]
+
+# qhasm: hid1 = hiu2
+# asm 1: str <hiu2=int32#5,>hid1=stack32#32
+# asm 2: str <hiu2=r4,>hid1=[sp,#124]
+# copy-collector input: str r4,[sp,#124]
+
+# qhasm: lou0 = lod6
+# asm 1: ldr >lou0=int32#6,<lod6=stack32#15
+# asm 2: ldr >lou0=r5,<lod6=[sp,#56]
+# copy-collector input: ldr r5,[sp,#56]
+
+# qhasm: hiu0 = hid6
+# asm 1: ldr >hiu0=int32#7,<hid6=stack32#16
+# asm 2: ldr >hiu0=r6,<hid6=[sp,#60]
+# copy-collector input: ldr r6,[sp,#60]
+
+# qhasm: lou1 = lod7
+# asm 1: ldr >lou1=int32#8,<lod7=stack32#17
+# asm 2: ldr >lou1=r7,<lod7=[sp,#64]
+# copy-collector input: ldr r7,[sp,#64]
+
+# qhasm: hiu1 = hid7
+# asm 1: ldr >hiu1=int32#9,<hid7=stack32#18
+# asm 2: ldr >hiu1=r8,<hid7=[sp,#68]
+# copy-collector input: ldr r8,[sp,#68]
+
+# qhasm: lou3 = lod0
+# asm 1: ldr >lou3=int32#10,<lod0=stack32#13
+# asm 2: ldr >lou3=r9,<lod0=[sp,#48]
+# copy-collector input: ldr r9,[sp,#48]
+
+# qhasm: hiu3 = hid0
+# asm 1: ldr >hiu3=int32#11,<hid0=stack32#14
+# asm 2: ldr >hiu3=r10,<hid0=[sp,#52]
+# copy-collector input: ldr r10,[sp,#52]
+
+# qhasm: two25 = 0x2000000 simple
+# asm 1: mov >two25=int32#12,0x2000000
+# asm 2: mov >two25=r11,0x2000000
+# copy-collector output starts
+strd r3,r4,[sp,#120]
+ldr r5,[sp,#56]
+ldr r6,[sp,#60]
+ldr.w r7,[sp,#64]
+ldr r8,[sp,#68]
+ldr r9,[sp,#48]
+ldr r10,[sp,#52]
+# copy-collector output ends
+mov r11,0x2000000
+
+# qhasm: hitmp lotmp = lou0 * two25
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<lou0=int32#6,<two25=int32#12
+# asm 2: umull >lotmp=r14,>hitmp=r12,<lou0=r5,<two25=r11
+umull r14,r12,r5,r11
+
+# qhasm: lotmp hitmp += hiu0 * two25
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<hiu0=int32#7,<two25=int32#12
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<hiu0=r6,<two25=r11
+umlal r12,r14,r6,r11
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 2)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#14,<hiu0=int32#7,LSR #2
+# asm 2: eor >lotmp=r11,<lotmp=r14,<hiu0=r6,LSR #2
+eor r11,r14,r6,LSR #2
+
+# qhasm: lotmp ^= (lou0 << 30)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSL #30
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSL #30
+eor r11,r11,r5,LSL #30
+
+# qhasm: lotmp ^= (lou0 unsigned>> 28)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSR #28
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSR #28
+eor r11,r11,r5,LSR #28
+
+# qhasm: lotmp ^= (hiu0 << 4)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu0=int32#7,LSL #4
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu0=r6,LSL #4
+eor r11,r11,r6,LSL #4
+
+# qhasm: hitmp ^= (lou0 unsigned>> 2)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSR #2
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSR #2
+eor r12,r12,r5,LSR #2
+
+# qhasm: hitmp ^= (hiu0 << 30)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSL #30
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSL #30
+eor r12,r12,r6,LSL #30
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 28)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSR #28
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSR #28
+eor r12,r12,r6,LSR #28
+
+# qhasm: hitmp ^= (lou0 << 4)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSL #4
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSL #4
+eor r12,r12,r5,LSL #4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#12
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r11
+adds r1,r1,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#13
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r12
+adc r2,r2,r12
+
+# qhasm: lotmp = lou1 ^ lou3
+# asm 1: eor >lotmp=int32#12,<lou1=int32#8,<lou3=int32#10
+# asm 2: eor >lotmp=r11,<lou1=r7,<lou3=r9
+eor r11,r7,r9
+
+# qhasm: lotmp &= lou0
+# asm 1: and >lotmp=int32#6,<lotmp=int32#12,<lou0=int32#6
+# asm 2: and >lotmp=r5,<lotmp=r11,<lou0=r5
+and r5,r11,r5
+
+# qhasm: lotmp2 = lou1 & lou3
+# asm 1: and >lotmp2=int32#8,<lou1=int32#8,<lou3=int32#10
+# asm 2: and >lotmp2=r7,<lou1=r7,<lou3=r9
+and r7,r7,r9
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#6,<lotmp=int32#6,<lotmp2=int32#8
+# asm 2: eor >lotmp=r5,<lotmp=r5,<lotmp2=r7
+eor r5,r5,r7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#6
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r5
+adds.w r1,r1,r5
+
+# qhasm: hitmp = hiu1 ^ hiu3
+# asm 1: eor >hitmp=int32#6,<hiu1=int32#9,<hiu3=int32#11
+# asm 2: eor >hitmp=r5,<hiu1=r8,<hiu3=r10
+eor r5,r8,r10
+
+# qhasm: hitmp &= hiu0
+# asm 1: and >hitmp=int32#6,<hitmp=int32#6,<hiu0=int32#7
+# asm 2: and >hitmp=r5,<hitmp=r5,<hiu0=r6
+and r5,r5,r6
+
+# qhasm: hitmp2 = hiu1 & hiu3
+# asm 1: and >hitmp2=int32#7,<hiu1=int32#9,<hiu3=int32#11
+# asm 2: and >hitmp2=r6,<hiu1=r8,<hiu3=r10
+and r6,r8,r10
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#6,<hitmp=int32#6,<hitmp2=int32#7
+# asm 2: eor >hitmp=r5,<hitmp=r5,<hitmp2=r6
+eor r5,r5,r6
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#6
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r5
+adc r2,r2,r5
+
+# qhasm: lod5 = lou4
+# asm 1: str <lou4=int32#2,>lod5=stack32#13
+# asm 2: str <lou4=r1,>lod5=[sp,#48]
+# copy-collector input: str r1,[sp,#48]
+
+# qhasm: hid5 = hiu4
+# asm 1: str <hiu4=int32#3,>hid5=stack32#14
+# asm 2: str <hiu4=r2,>hid5=[sp,#52]
+# copy-collector input: str r2,[sp,#52]
+
+# qhasm: lou0 = lod2
+# asm 1: ldr >lou0=int32#2,<lod2=stack32#33
+# asm 2: ldr >lou0=r1,<lod2=[sp,#128]
+# copy-collector input: ldr r1,[sp,#128]
+
+# qhasm: hiu0 = hid2
+# asm 1: ldr >hiu0=int32#3,<hid2=stack32#34
+# asm 2: ldr >hiu0=r2,<hid2=[sp,#132]
+# copy-collector input: ldr r2,[sp,#132]
+
+# qhasm: lou1 = lod3
+# asm 1: ldr >lou1=int32#6,<lod3=stack32#4
+# asm 2: ldr >lou1=r5,<lod3=[sp,#12]
+# copy-collector input: ldr r5,[sp,#12]
+
+# qhasm: hiu1 = hid3
+# asm 1: ldr >hiu1=int32#7,<hid3=stack32#19
+# asm 2: ldr >hiu1=r6,<hid3=[sp,#72]
+# copy-collector input: ldr r6,[sp,#72]
+
+# qhasm: lou4 = lod4
+# asm 1: ldr >lou4=int32#8,<lod4=stack32#29
+# asm 2: ldr >lou4=r7,<lod4=[sp,#112]
+# copy-collector input: ldr r7,[sp,#112]
+
+# qhasm: hiu4 = hid4
+# asm 1: ldr >hiu4=int32#9,<hid4=stack32#30
+# asm 2: ldr >hiu4=r8,<hid4=[sp,#116]
+# copy-collector input: ldr r8,[sp,#116]
+
+# qhasm: lou5 = lod11
+# asm 1: ldr >lou5=int32#12,<lod11=stack32#43
+# asm 2: ldr >lou5=r11,<lod11=[sp,#168]
+# copy-collector input: ldr r11,[sp,#168]
+
+# qhasm: hiu5 = hid11
+# asm 1: ldr >hiu5=int32#13,<hid11=stack32#44
+# asm 2: ldr >hiu5=r12,<hid11=[sp,#172]
+# copy-collector input: ldr r12,[sp,#172]
+
+# qhasm: two23 = 0x800000 simple
+# asm 1: mov >two23=int32#14,0x800000
+# asm 2: mov >two23=r14,0x800000
+# copy-collector output starts
+strd r1,r2,[sp,#48]
+ldr r1,[sp,#128]
+ldr r2,[sp,#132]
+ldr r5,[sp,#12]
+ldr r6,[sp,#72]
+ldr.w r7,[sp,#112]
+ldr r8,[sp,#116]
+ldr r11,[sp,#168]
+ldr r12,[sp,#172]
+# copy-collector output ends
+mov r14,0x800000
+
+# qhasm: carry?  lou4 += lou5
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lou5=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lou5=r11
+adds r7,r7,r11
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hiu5=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hiu5=r12
+adc r8,r8,r12
+
+# qhasm: lotmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lotmp=int32#12,[<input_0=int32#1],#4
+# asm 2: ldr >lotmp=r11,[<input_0=r0],#4
+# copy-collector input: ldr r11,[r0],#4
+
+# qhasm: hitmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hitmp=int32#13,[<input_0=int32#1],#4
+# asm 2: ldr >hitmp=r12,[<input_0=r0],#4
+# copy-collector input: ldr r12,[r0],#4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r11
+# copy-collector output starts
+ldr r11,[r0],#4
+ldr r12,[r0],#4
+# copy-collector output ends
+adds r7,r7,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r12
+adc r8,r8,r12
+
+# qhasm: hitmp lotmp = lou2 * two23
+# asm 1: umull >lotmp=int32#13,>hitmp=int32#12,<lou2=int32#4,<two23=int32#14
+# asm 2: umull >lotmp=r12,>hitmp=r11,<lou2=r3,<two23=r14
+umull r12,r11,r3,r14
+
+# qhasm: lotmp hitmp += hiu2 * two23
+# asm 1: umlal <hitmp=int32#12,<lotmp=int32#13,<hiu2=int32#5,<two23=int32#14
+# asm 2: umlal <hitmp=r11,<lotmp=r12,<hiu2=r4,<two23=r14
+umlal r11,r12,r4,r14
+
+# qhasm: lotmp ^= (lou2 unsigned>> 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou2=int32#4,LSR #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou2=r3,LSR #18
+eor r12,r12,r3,LSR #18
+
+# qhasm: lotmp ^= (hiu2 << 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu2=int32#5,LSL #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu2=r4,LSL #14
+eor r12,r12,r4,LSL #14
+
+# qhasm: lotmp ^= (lou2 unsigned>> 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou2=int32#4,LSR #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou2=r3,LSR #14
+eor r12,r12,r3,LSR #14
+
+# qhasm: lotmp ^= (hiu2 << 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu2=int32#5,LSL #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu2=r4,LSL #18
+eor r12,r12,r4,LSL #18
+
+# qhasm: hitmp ^= (hiu2 unsigned>> 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu2=int32#5,LSR #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu2=r4,LSR #18
+eor r11,r11,r4,LSR #18
+
+# qhasm: hitmp ^= (lou2 << 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou2=int32#4,LSL #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou2=r3,LSL #14
+eor r11,r11,r3,LSL #14
+
+# qhasm: hitmp ^= (hiu2 unsigned>> 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu2=int32#5,LSR #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu2=r4,LSR #14
+eor r11,r11,r4,LSR #14
+
+# qhasm: hitmp ^= (lou2 << 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou2=int32#4,LSL #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou2=r3,LSL #18
+eor r11,r11,r3,LSL #18
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#13
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r12
+adds r7,r7,r12
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#12
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r11
+adc r8,r8,r11
+
+# qhasm: lotmp = lou2 & lou0
+# asm 1: and >lotmp=int32#2,<lou2=int32#4,<lou0=int32#2
+# asm 2: and >lotmp=r1,<lou2=r3,<lou0=r1
+and r1,r3,r1
+
+# qhasm: lotmp2 = lou1 & ~lou2
+# asm 1: bic >lotmp2=int32#4,<lou1=int32#6,<lou2=int32#4
+# asm 2: bic >lotmp2=r3,<lou1=r5,<lou2=r3
+bic r3,r5,r3
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<lotmp2=int32#4
+# asm 2: eor >lotmp=r1,<lotmp=r1,<lotmp2=r3
+eor r1,r1,r3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#8,<lotmp=int32#2
+# asm 2: adds >lou4=r1,<lou4=r7,<lotmp=r1
+adds.w r1,r7,r1
+
+# qhasm: hitmp = hiu2 & hiu0
+# asm 1: and >hitmp=int32#3,<hiu2=int32#5,<hiu0=int32#3
+# asm 2: and >hitmp=r2,<hiu2=r4,<hiu0=r2
+and r2,r4,r2
+
+# qhasm: hitmp2 = hiu1 & ~hiu2
+# asm 1: bic >hitmp2=int32#4,<hiu1=int32#7,<hiu2=int32#5
+# asm 2: bic >hitmp2=r3,<hiu1=r6,<hiu2=r4
+bic r3,r6,r4
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hitmp2=int32#4
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hitmp2=r3
+eor r2,r2,r3
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#9,<hitmp=int32#3
+# asm 2: adc >hiu4=r2,<hiu4=r8,<hitmp=r2
+adc r2,r8,r2
+
+# qhasm: carry? lou3 += lou4
+# asm 1: adds >lou3=int32#4,<lou3=int32#10,<lou4=int32#2
+# asm 2: adds >lou3=r3,<lou3=r9,<lou4=r1
+adds r3,r9,r1
+
+# qhasm: hiu3 += hiu4 + carry
+# asm 1: adc >hiu3=int32#5,<hiu3=int32#11,<hiu4=int32#3
+# asm 2: adc >hiu3=r4,<hiu3=r10,<hiu4=r2
+adc r4,r10,r2
+
+# qhasm: lod0 = lou3
+# asm 1: str <lou3=int32#4,>lod0=stack32#29
+# asm 2: str <lou3=r3,>lod0=[sp,#112]
+# copy-collector input: str r3,[sp,#112]
+
+# qhasm: hid0 = hiu3
+# asm 1: str <hiu3=int32#5,>hid0=stack32#30
+# asm 2: str <hiu3=r4,>hid0=[sp,#116]
+# copy-collector input: str r4,[sp,#116]
+
+# qhasm: lou0 = lod5
+# asm 1: ldr >lou0=int32#6,<lod5=stack32#13
+# asm 2: ldr >lou0=r5,<lod5=[sp,#48]
+# copy-collector input: ldr r5,[sp,#48]
+
+# qhasm: hiu0 = hid5
+# asm 1: ldr >hiu0=int32#7,<hid5=stack32#14
+# asm 2: ldr >hiu0=r6,<hid5=[sp,#52]
+# copy-collector input: ldr r6,[sp,#52]
+
+# qhasm: lou1 = lod6
+# asm 1: ldr >lou1=int32#8,<lod6=stack32#15
+# asm 2: ldr >lou1=r7,<lod6=[sp,#56]
+# copy-collector input: ldr r7,[sp,#56]
+
+# qhasm: hiu1 = hid6
+# asm 1: ldr >hiu1=int32#9,<hid6=stack32#16
+# asm 2: ldr >hiu1=r8,<hid6=[sp,#60]
+# copy-collector input: ldr r8,[sp,#60]
+
+# qhasm: lou2 = lod7
+# asm 1: ldr >lou2=int32#10,<lod7=stack32#17
+# asm 2: ldr >lou2=r9,<lod7=[sp,#64]
+# copy-collector input: ldr r9,[sp,#64]
+
+# qhasm: hiu2 = hid7
+# asm 1: ldr >hiu2=int32#11,<hid7=stack32#18
+# asm 2: ldr >hiu2=r10,<hid7=[sp,#68]
+# copy-collector input: ldr r10,[sp,#68]
+
+# qhasm: two25 = 0x2000000 simple
+# asm 1: mov >two25=int32#12,0x2000000
+# asm 2: mov >two25=r11,0x2000000
+# copy-collector output starts
+strd r3,r4,[sp,#112]
+ldr r5,[sp,#48]
+ldr r6,[sp,#52]
+ldr.w r7,[sp,#56]
+ldr r8,[sp,#60]
+ldr r9,[sp,#64]
+ldr r10,[sp,#68]
+# copy-collector output ends
+mov r11,0x2000000
+
+# qhasm: hitmp lotmp = lou0 * two25
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<lou0=int32#6,<two25=int32#12
+# asm 2: umull >lotmp=r14,>hitmp=r12,<lou0=r5,<two25=r11
+umull r14,r12,r5,r11
+
+# qhasm: lotmp hitmp += hiu0 * two25
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<hiu0=int32#7,<two25=int32#12
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<hiu0=r6,<two25=r11
+umlal r12,r14,r6,r11
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 2)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#14,<hiu0=int32#7,LSR #2
+# asm 2: eor >lotmp=r11,<lotmp=r14,<hiu0=r6,LSR #2
+eor r11,r14,r6,LSR #2
+
+# qhasm: lotmp ^= (lou0 << 30)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSL #30
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSL #30
+eor r11,r11,r5,LSL #30
+
+# qhasm: lotmp ^= (lou0 unsigned>> 28)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSR #28
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSR #28
+eor r11,r11,r5,LSR #28
+
+# qhasm: lotmp ^= (hiu0 << 4)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu0=int32#7,LSL #4
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu0=r6,LSL #4
+eor r11,r11,r6,LSL #4
+
+# qhasm: hitmp ^= (lou0 unsigned>> 2)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSR #2
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSR #2
+eor r12,r12,r5,LSR #2
+
+# qhasm: hitmp ^= (hiu0 << 30)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSL #30
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSL #30
+eor r12,r12,r6,LSL #30
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 28)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSR #28
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSR #28
+eor r12,r12,r6,LSR #28
+
+# qhasm: hitmp ^= (lou0 << 4)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSL #4
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSL #4
+eor r12,r12,r5,LSL #4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#12
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r11
+adds r1,r1,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#13
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r12
+adc r2,r2,r12
+
+# qhasm: lotmp = lou1 ^ lou2
+# asm 1: eor >lotmp=int32#12,<lou1=int32#8,<lou2=int32#10
+# asm 2: eor >lotmp=r11,<lou1=r7,<lou2=r9
+eor r11,r7,r9
+
+# qhasm: lotmp &= lou0
+# asm 1: and >lotmp=int32#6,<lotmp=int32#12,<lou0=int32#6
+# asm 2: and >lotmp=r5,<lotmp=r11,<lou0=r5
+and r5,r11,r5
+
+# qhasm: lotmp2 = lou1 & lou2
+# asm 1: and >lotmp2=int32#8,<lou1=int32#8,<lou2=int32#10
+# asm 2: and >lotmp2=r7,<lou1=r7,<lou2=r9
+and r7,r7,r9
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#6,<lotmp=int32#6,<lotmp2=int32#8
+# asm 2: eor >lotmp=r5,<lotmp=r5,<lotmp2=r7
+eor r5,r5,r7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#6
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r5
+adds.w r1,r1,r5
+
+# qhasm: hitmp = hiu1 ^ hiu2
+# asm 1: eor >hitmp=int32#6,<hiu1=int32#9,<hiu2=int32#11
+# asm 2: eor >hitmp=r5,<hiu1=r8,<hiu2=r10
+eor r5,r8,r10
+
+# qhasm: hitmp &= hiu0
+# asm 1: and >hitmp=int32#6,<hitmp=int32#6,<hiu0=int32#7
+# asm 2: and >hitmp=r5,<hitmp=r5,<hiu0=r6
+and r5,r5,r6
+
+# qhasm: hitmp2 = hiu1 & hiu2
+# asm 1: and >hitmp2=int32#7,<hiu1=int32#9,<hiu2=int32#11
+# asm 2: and >hitmp2=r6,<hiu1=r8,<hiu2=r10
+and r6,r8,r10
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#6,<hitmp=int32#6,<hitmp2=int32#7
+# asm 2: eor >hitmp=r5,<hitmp=r5,<hitmp2=r6
+eor r5,r5,r6
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#6
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r5
+adc r2,r2,r5
+
+# qhasm: lod4 = lou4
+# asm 1: str <lou4=int32#2,>lod4=stack32#70
+# asm 2: str <lou4=r1,>lod4=[sp,#276]
+# copy-collector input: str r1,[sp,#276]
+
+# qhasm: hid4 = hiu4
+# asm 1: str <hiu4=int32#3,>hid4=stack32#71
+# asm 2: str <hiu4=r2,>hid4=[sp,#280]
+# copy-collector input: str r2,[sp,#280]
+
+# qhasm: lou0 = lod1
+# asm 1: ldr >lou0=int32#2,<lod1=stack32#31
+# asm 2: ldr >lou0=r1,<lod1=[sp,#120]
+# copy-collector input: ldr r1,[sp,#120]
+
+# qhasm: hiu0 = hid1
+# asm 1: ldr >hiu0=int32#3,<hid1=stack32#32
+# asm 2: ldr >hiu0=r2,<hid1=[sp,#124]
+# copy-collector input: ldr r2,[sp,#124]
+
+# qhasm: lou1 = lod2
+# asm 1: ldr >lou1=int32#6,<lod2=stack32#33
+# asm 2: ldr >lou1=r5,<lod2=[sp,#128]
+# copy-collector input: ldr r5,[sp,#128]
+
+# qhasm: hiu1 = hid2
+# asm 1: ldr >hiu1=int32#7,<hid2=stack32#34
+# asm 2: ldr >hiu1=r6,<hid2=[sp,#132]
+# copy-collector input: ldr r6,[sp,#132]
+
+# qhasm: lou4 = lod3
+# asm 1: ldr >lou4=int32#8,<lod3=stack32#4
+# asm 2: ldr >lou4=r7,<lod3=[sp,#12]
+# copy-collector input: ldr r7,[sp,#12]
+
+# qhasm: hiu4 = hid3
+# asm 1: ldr >hiu4=int32#9,<hid3=stack32#19
+# asm 2: ldr >hiu4=r8,<hid3=[sp,#72]
+# copy-collector input: ldr r8,[sp,#72]
+
+# qhasm: lou5 = lod12
+# asm 1: ldr >lou5=int32#12,<lod12=stack32#45
+# asm 2: ldr >lou5=r11,<lod12=[sp,#176]
+# copy-collector input: ldr r11,[sp,#176]
+
+# qhasm: hiu5 = hid12
+# asm 1: ldr >hiu5=int32#13,<hid12=stack32#46
+# asm 2: ldr >hiu5=r12,<hid12=[sp,#180]
+# copy-collector input: ldr r12,[sp,#180]
+
+# qhasm: two23 = 0x800000 simple
+# asm 1: mov >two23=int32#14,0x800000
+# asm 2: mov >two23=r14,0x800000
+# copy-collector output starts
+strd r1,r2,[sp,#276]
+ldr r1,[sp,#120]
+ldr r2,[sp,#124]
+ldr r5,[sp,#128]
+ldr r6,[sp,#132]
+ldr.w r7,[sp,#12]
+ldr r8,[sp,#72]
+ldr r11,[sp,#176]
+ldr r12,[sp,#180]
+# copy-collector output ends
+mov r14,0x800000
+
+# qhasm: carry?  lou4 += lou5
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lou5=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lou5=r11
+adds r7,r7,r11
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hiu5=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hiu5=r12
+adc r8,r8,r12
+
+# qhasm: lotmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lotmp=int32#12,[<input_0=int32#1],#4
+# asm 2: ldr >lotmp=r11,[<input_0=r0],#4
+# copy-collector input: ldr r11,[r0],#4
+
+# qhasm: hitmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hitmp=int32#13,[<input_0=int32#1],#4
+# asm 2: ldr >hitmp=r12,[<input_0=r0],#4
+# copy-collector input: ldr r12,[r0],#4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r11
+# copy-collector output starts
+ldr r11,[r0],#4
+ldr r12,[r0],#4
+# copy-collector output ends
+adds r7,r7,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r12
+adc r8,r8,r12
+
+# qhasm: hitmp lotmp = lou3 * two23
+# asm 1: umull >lotmp=int32#13,>hitmp=int32#12,<lou3=int32#4,<two23=int32#14
+# asm 2: umull >lotmp=r12,>hitmp=r11,<lou3=r3,<two23=r14
+umull r12,r11,r3,r14
+
+# qhasm: lotmp hitmp += hiu3 * two23
+# asm 1: umlal <hitmp=int32#12,<lotmp=int32#13,<hiu3=int32#5,<two23=int32#14
+# asm 2: umlal <hitmp=r11,<lotmp=r12,<hiu3=r4,<two23=r14
+umlal r11,r12,r4,r14
+
+# qhasm: lotmp ^= (lou3 unsigned>> 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou3=int32#4,LSR #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou3=r3,LSR #18
+eor r12,r12,r3,LSR #18
+
+# qhasm: lotmp ^= (hiu3 << 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu3=int32#5,LSL #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu3=r4,LSL #14
+eor r12,r12,r4,LSL #14
+
+# qhasm: lotmp ^= (lou3 unsigned>> 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou3=int32#4,LSR #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou3=r3,LSR #14
+eor r12,r12,r3,LSR #14
+
+# qhasm: lotmp ^= (hiu3 << 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu3=int32#5,LSL #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu3=r4,LSL #18
+eor r12,r12,r4,LSL #18
+
+# qhasm: hitmp ^= (hiu3 unsigned>> 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu3=int32#5,LSR #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu3=r4,LSR #18
+eor r11,r11,r4,LSR #18
+
+# qhasm: hitmp ^= (lou3 << 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou3=int32#4,LSL #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou3=r3,LSL #14
+eor r11,r11,r3,LSL #14
+
+# qhasm: hitmp ^= (hiu3 unsigned>> 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu3=int32#5,LSR #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu3=r4,LSR #14
+eor r11,r11,r4,LSR #14
+
+# qhasm: hitmp ^= (lou3 << 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou3=int32#4,LSL #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou3=r3,LSL #18
+eor r11,r11,r3,LSL #18
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#13
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r12
+adds r7,r7,r12
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#12
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r11
+adc r8,r8,r11
+
+# qhasm: lotmp = lou3 & lou0
+# asm 1: and >lotmp=int32#2,<lou3=int32#4,<lou0=int32#2
+# asm 2: and >lotmp=r1,<lou3=r3,<lou0=r1
+and r1,r3,r1
+
+# qhasm: lotmp2 = lou1 & ~lou3
+# asm 1: bic >lotmp2=int32#4,<lou1=int32#6,<lou3=int32#4
+# asm 2: bic >lotmp2=r3,<lou1=r5,<lou3=r3
+bic r3,r5,r3
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<lotmp2=int32#4
+# asm 2: eor >lotmp=r1,<lotmp=r1,<lotmp2=r3
+eor r1,r1,r3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#8,<lotmp=int32#2
+# asm 2: adds >lou4=r1,<lou4=r7,<lotmp=r1
+adds.w r1,r7,r1
+
+# qhasm: hitmp = hiu3 & hiu0
+# asm 1: and >hitmp=int32#3,<hiu3=int32#5,<hiu0=int32#3
+# asm 2: and >hitmp=r2,<hiu3=r4,<hiu0=r2
+and r2,r4,r2
+
+# qhasm: hitmp2 = hiu1 & ~hiu3
+# asm 1: bic >hitmp2=int32#4,<hiu1=int32#7,<hiu3=int32#5
+# asm 2: bic >hitmp2=r3,<hiu1=r6,<hiu3=r4
+bic r3,r6,r4
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hitmp2=int32#4
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hitmp2=r3
+eor r2,r2,r3
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#9,<hitmp=int32#3
+# asm 2: adc >hiu4=r2,<hiu4=r8,<hitmp=r2
+adc r2,r8,r2
+
+# qhasm: carry? lou2 += lou4
+# asm 1: adds >lou2=int32#4,<lou2=int32#10,<lou4=int32#2
+# asm 2: adds >lou2=r3,<lou2=r9,<lou4=r1
+adds r3,r9,r1
+
+# qhasm: hiu2 += hiu4 + carry
+# asm 1: adc >hiu2=int32#5,<hiu2=int32#11,<hiu4=int32#3
+# asm 2: adc >hiu2=r4,<hiu2=r10,<hiu4=r2
+adc r4,r10,r2
+
+# qhasm: lod7 = lou2
+# asm 1: str <lou2=int32#4,>lod7=stack32#35
+# asm 2: str <lou2=r3,>lod7=[sp,#136]
+# copy-collector input: str r3,[sp,#136]
+
+# qhasm: hid7 = hiu2
+# asm 1: str <hiu2=int32#5,>hid7=stack32#36
+# asm 2: str <hiu2=r4,>hid7=[sp,#140]
+# copy-collector input: str r4,[sp,#140]
+
+# qhasm: lou0 = lod4
+# asm 1: ldr >lou0=int32#6,<lod4=stack32#70
+# asm 2: ldr >lou0=r5,<lod4=[sp,#276]
+# copy-collector input: ldr r5,[sp,#276]
+
+# qhasm: hiu0 = hid4
+# asm 1: ldr >hiu0=int32#7,<hid4=stack32#71
+# asm 2: ldr >hiu0=r6,<hid4=[sp,#280]
+# copy-collector input: ldr r6,[sp,#280]
+
+# qhasm: lou1 = lod5
+# asm 1: ldr >lou1=int32#8,<lod5=stack32#13
+# asm 2: ldr >lou1=r7,<lod5=[sp,#48]
+# copy-collector input: ldr r7,[sp,#48]
+
+# qhasm: hiu1 = hid5
+# asm 1: ldr >hiu1=int32#9,<hid5=stack32#14
+# asm 2: ldr >hiu1=r8,<hid5=[sp,#52]
+# copy-collector input: ldr r8,[sp,#52]
+
+# qhasm: lou3 = lod6
+# asm 1: ldr >lou3=int32#10,<lod6=stack32#15
+# asm 2: ldr >lou3=r9,<lod6=[sp,#56]
+# copy-collector input: ldr r9,[sp,#56]
+
+# qhasm: hiu3 = hid6
+# asm 1: ldr >hiu3=int32#11,<hid6=stack32#16
+# asm 2: ldr >hiu3=r10,<hid6=[sp,#60]
+# copy-collector input: ldr r10,[sp,#60]
+
+# qhasm: two25 = 0x2000000 simple
+# asm 1: mov >two25=int32#12,0x2000000
+# asm 2: mov >two25=r11,0x2000000
+# copy-collector output starts
+strd r3,r4,[sp,#136]
+ldr r5,[sp,#276]
+ldr r6,[sp,#280]
+ldr.w r7,[sp,#48]
+ldr r8,[sp,#52]
+ldr r9,[sp,#56]
+ldr r10,[sp,#60]
+# copy-collector output ends
+mov r11,0x2000000
+
+# qhasm: hitmp lotmp = lou0 * two25
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<lou0=int32#6,<two25=int32#12
+# asm 2: umull >lotmp=r14,>hitmp=r12,<lou0=r5,<two25=r11
+umull r14,r12,r5,r11
+
+# qhasm: lotmp hitmp += hiu0 * two25
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<hiu0=int32#7,<two25=int32#12
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<hiu0=r6,<two25=r11
+umlal r12,r14,r6,r11
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 2)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#14,<hiu0=int32#7,LSR #2
+# asm 2: eor >lotmp=r11,<lotmp=r14,<hiu0=r6,LSR #2
+eor r11,r14,r6,LSR #2
+
+# qhasm: lotmp ^= (lou0 << 30)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSL #30
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSL #30
+eor r11,r11,r5,LSL #30
+
+# qhasm: lotmp ^= (lou0 unsigned>> 28)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSR #28
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSR #28
+eor r11,r11,r5,LSR #28
+
+# qhasm: lotmp ^= (hiu0 << 4)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu0=int32#7,LSL #4
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu0=r6,LSL #4
+eor r11,r11,r6,LSL #4
+
+# qhasm: hitmp ^= (lou0 unsigned>> 2)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSR #2
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSR #2
+eor r12,r12,r5,LSR #2
+
+# qhasm: hitmp ^= (hiu0 << 30)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSL #30
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSL #30
+eor r12,r12,r6,LSL #30
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 28)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSR #28
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSR #28
+eor r12,r12,r6,LSR #28
+
+# qhasm: hitmp ^= (lou0 << 4)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSL #4
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSL #4
+eor r12,r12,r5,LSL #4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#12
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r11
+adds r1,r1,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#13
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r12
+adc r2,r2,r12
+
+# qhasm: lotmp = lou1 ^ lou3
+# asm 1: eor >lotmp=int32#12,<lou1=int32#8,<lou3=int32#10
+# asm 2: eor >lotmp=r11,<lou1=r7,<lou3=r9
+eor r11,r7,r9
+
+# qhasm: lotmp &= lou0
+# asm 1: and >lotmp=int32#6,<lotmp=int32#12,<lou0=int32#6
+# asm 2: and >lotmp=r5,<lotmp=r11,<lou0=r5
+and r5,r11,r5
+
+# qhasm: lotmp2 = lou1 & lou3
+# asm 1: and >lotmp2=int32#8,<lou1=int32#8,<lou3=int32#10
+# asm 2: and >lotmp2=r7,<lou1=r7,<lou3=r9
+and r7,r7,r9
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#6,<lotmp=int32#6,<lotmp2=int32#8
+# asm 2: eor >lotmp=r5,<lotmp=r5,<lotmp2=r7
+eor r5,r5,r7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#6
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r5
+adds.w r1,r1,r5
+
+# qhasm: hitmp = hiu1 ^ hiu3
+# asm 1: eor >hitmp=int32#6,<hiu1=int32#9,<hiu3=int32#11
+# asm 2: eor >hitmp=r5,<hiu1=r8,<hiu3=r10
+eor r5,r8,r10
+
+# qhasm: hitmp &= hiu0
+# asm 1: and >hitmp=int32#6,<hitmp=int32#6,<hiu0=int32#7
+# asm 2: and >hitmp=r5,<hitmp=r5,<hiu0=r6
+and r5,r5,r6
+
+# qhasm: hitmp2 = hiu1 & hiu3
+# asm 1: and >hitmp2=int32#7,<hiu1=int32#9,<hiu3=int32#11
+# asm 2: and >hitmp2=r6,<hiu1=r8,<hiu3=r10
+and r6,r8,r10
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#6,<hitmp=int32#6,<hitmp2=int32#7
+# asm 2: eor >hitmp=r5,<hitmp=r5,<hitmp2=r6
+eor r5,r5,r6
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#6
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r5
+adc r2,r2,r5
+
+# qhasm: lod3 = lou4
+# asm 1: str <lou4=int32#2,>lod3=stack32#19
+# asm 2: str <lou4=r1,>lod3=[sp,#72]
+# copy-collector input: str r1,[sp,#72]
+
+# qhasm: hid3 = hiu4
+# asm 1: str <hiu4=int32#3,>hid3=stack32#20
+# asm 2: str <hiu4=r2,>hid3=[sp,#76]
+# copy-collector input: str r2,[sp,#76]
+
+# qhasm: lou0 = lod0
+# asm 1: ldr >lou0=int32#2,<lod0=stack32#29
+# asm 2: ldr >lou0=r1,<lod0=[sp,#112]
+# copy-collector input: ldr r1,[sp,#112]
+
+# qhasm: hiu0 = hid0
+# asm 1: ldr >hiu0=int32#3,<hid0=stack32#30
+# asm 2: ldr >hiu0=r2,<hid0=[sp,#116]
+# copy-collector input: ldr r2,[sp,#116]
+
+# qhasm: lou1 = lod1
+# asm 1: ldr >lou1=int32#6,<lod1=stack32#31
+# asm 2: ldr >lou1=r5,<lod1=[sp,#120]
+# copy-collector input: ldr r5,[sp,#120]
+
+# qhasm: hiu1 = hid1
+# asm 1: ldr >hiu1=int32#7,<hid1=stack32#32
+# asm 2: ldr >hiu1=r6,<hid1=[sp,#124]
+# copy-collector input: ldr r6,[sp,#124]
+
+# qhasm: lou4 = lod2
+# asm 1: ldr >lou4=int32#8,<lod2=stack32#33
+# asm 2: ldr >lou4=r7,<lod2=[sp,#128]
+# copy-collector input: ldr r7,[sp,#128]
+
+# qhasm: hiu4 = hid2
+# asm 1: ldr >hiu4=int32#9,<hid2=stack32#34
+# asm 2: ldr >hiu4=r8,<hid2=[sp,#132]
+# copy-collector input: ldr r8,[sp,#132]
+
+# qhasm: lou5 = lod13
+# asm 1: ldr >lou5=int32#12,<lod13=stack32#47
+# asm 2: ldr >lou5=r11,<lod13=[sp,#184]
+# copy-collector input: ldr r11,[sp,#184]
+
+# qhasm: hiu5 = hid13
+# asm 1: ldr >hiu5=int32#13,<hid13=stack32#48
+# asm 2: ldr >hiu5=r12,<hid13=[sp,#188]
+# copy-collector input: ldr r12,[sp,#188]
+
+# qhasm: two23 = 0x800000 simple
+# asm 1: mov >two23=int32#14,0x800000
+# asm 2: mov >two23=r14,0x800000
+# copy-collector output starts
+strd r1,r2,[sp,#72]
+ldr r1,[sp,#112]
+ldr r2,[sp,#116]
+ldr r5,[sp,#120]
+ldr r6,[sp,#124]
+ldr.w r7,[sp,#128]
+ldr r8,[sp,#132]
+ldr r11,[sp,#184]
+ldr r12,[sp,#188]
+# copy-collector output ends
+mov r14,0x800000
+
+# qhasm: carry?  lou4 += lou5
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lou5=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lou5=r11
+adds r7,r7,r11
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hiu5=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hiu5=r12
+adc r8,r8,r12
+
+# qhasm: lotmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lotmp=int32#12,[<input_0=int32#1],#4
+# asm 2: ldr >lotmp=r11,[<input_0=r0],#4
+# copy-collector input: ldr r11,[r0],#4
+
+# qhasm: hitmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hitmp=int32#13,[<input_0=int32#1],#4
+# asm 2: ldr >hitmp=r12,[<input_0=r0],#4
+# copy-collector input: ldr r12,[r0],#4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r11
+# copy-collector output starts
+ldr r11,[r0],#4
+ldr r12,[r0],#4
+# copy-collector output ends
+adds r7,r7,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r12
+adc r8,r8,r12
+
+# qhasm: hitmp lotmp = lou2 * two23
+# asm 1: umull >lotmp=int32#13,>hitmp=int32#12,<lou2=int32#4,<two23=int32#14
+# asm 2: umull >lotmp=r12,>hitmp=r11,<lou2=r3,<two23=r14
+umull r12,r11,r3,r14
+
+# qhasm: lotmp hitmp += hiu2 * two23
+# asm 1: umlal <hitmp=int32#12,<lotmp=int32#13,<hiu2=int32#5,<two23=int32#14
+# asm 2: umlal <hitmp=r11,<lotmp=r12,<hiu2=r4,<two23=r14
+umlal r11,r12,r4,r14
+
+# qhasm: lotmp ^= (lou2 unsigned>> 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou2=int32#4,LSR #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou2=r3,LSR #18
+eor r12,r12,r3,LSR #18
+
+# qhasm: lotmp ^= (hiu2 << 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu2=int32#5,LSL #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu2=r4,LSL #14
+eor r12,r12,r4,LSL #14
+
+# qhasm: lotmp ^= (lou2 unsigned>> 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou2=int32#4,LSR #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou2=r3,LSR #14
+eor r12,r12,r3,LSR #14
+
+# qhasm: lotmp ^= (hiu2 << 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu2=int32#5,LSL #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu2=r4,LSL #18
+eor r12,r12,r4,LSL #18
+
+# qhasm: hitmp ^= (hiu2 unsigned>> 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu2=int32#5,LSR #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu2=r4,LSR #18
+eor r11,r11,r4,LSR #18
+
+# qhasm: hitmp ^= (lou2 << 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou2=int32#4,LSL #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou2=r3,LSL #14
+eor r11,r11,r3,LSL #14
+
+# qhasm: hitmp ^= (hiu2 unsigned>> 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu2=int32#5,LSR #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu2=r4,LSR #14
+eor r11,r11,r4,LSR #14
+
+# qhasm: hitmp ^= (lou2 << 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou2=int32#4,LSL #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou2=r3,LSL #18
+eor r11,r11,r3,LSL #18
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#13
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r12
+adds r7,r7,r12
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#12
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r11
+adc r8,r8,r11
+
+# qhasm: lotmp = lou2 & lou0
+# asm 1: and >lotmp=int32#2,<lou2=int32#4,<lou0=int32#2
+# asm 2: and >lotmp=r1,<lou2=r3,<lou0=r1
+and r1,r3,r1
+
+# qhasm: lotmp2 = lou1 & ~lou2
+# asm 1: bic >lotmp2=int32#4,<lou1=int32#6,<lou2=int32#4
+# asm 2: bic >lotmp2=r3,<lou1=r5,<lou2=r3
+bic r3,r5,r3
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<lotmp2=int32#4
+# asm 2: eor >lotmp=r1,<lotmp=r1,<lotmp2=r3
+eor r1,r1,r3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#8,<lotmp=int32#2
+# asm 2: adds >lou4=r1,<lou4=r7,<lotmp=r1
+adds.w r1,r7,r1
+
+# qhasm: hitmp = hiu2 & hiu0
+# asm 1: and >hitmp=int32#3,<hiu2=int32#5,<hiu0=int32#3
+# asm 2: and >hitmp=r2,<hiu2=r4,<hiu0=r2
+and r2,r4,r2
+
+# qhasm: hitmp2 = hiu1 & ~hiu2
+# asm 1: bic >hitmp2=int32#4,<hiu1=int32#7,<hiu2=int32#5
+# asm 2: bic >hitmp2=r3,<hiu1=r6,<hiu2=r4
+bic r3,r6,r4
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hitmp2=int32#4
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hitmp2=r3
+eor r2,r2,r3
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#9,<hitmp=int32#3
+# asm 2: adc >hiu4=r2,<hiu4=r8,<hitmp=r2
+adc r2,r8,r2
+
+# qhasm: carry? lou3 += lou4
+# asm 1: adds >lou3=int32#4,<lou3=int32#10,<lou4=int32#2
+# asm 2: adds >lou3=r3,<lou3=r9,<lou4=r1
+adds r3,r9,r1
+
+# qhasm: hiu3 += hiu4 + carry
+# asm 1: adc >hiu3=int32#5,<hiu3=int32#11,<hiu4=int32#3
+# asm 2: adc >hiu3=r4,<hiu3=r10,<hiu4=r2
+adc r4,r10,r2
+
+# qhasm: lod6 = lou3
+# asm 1: str <lou3=int32#4,>lod6=stack32#33
+# asm 2: str <lou3=r3,>lod6=[sp,#128]
+# copy-collector input: str r3,[sp,#128]
+
+# qhasm: hid6 = hiu3
+# asm 1: str <hiu3=int32#5,>hid6=stack32#34
+# asm 2: str <hiu3=r4,>hid6=[sp,#132]
+# copy-collector input: str r4,[sp,#132]
+
+# qhasm: lou0 = lod3
+# asm 1: ldr >lou0=int32#6,<lod3=stack32#19
+# asm 2: ldr >lou0=r5,<lod3=[sp,#72]
+# copy-collector input: ldr r5,[sp,#72]
+
+# qhasm: hiu0 = hid3
+# asm 1: ldr >hiu0=int32#7,<hid3=stack32#20
+# asm 2: ldr >hiu0=r6,<hid3=[sp,#76]
+# copy-collector input: ldr r6,[sp,#76]
+
+# qhasm: lou1 = lod4
+# asm 1: ldr >lou1=int32#8,<lod4=stack32#70
+# asm 2: ldr >lou1=r7,<lod4=[sp,#276]
+# copy-collector input: ldr r7,[sp,#276]
+
+# qhasm: hiu1 = hid4
+# asm 1: ldr >hiu1=int32#9,<hid4=stack32#71
+# asm 2: ldr >hiu1=r8,<hid4=[sp,#280]
+# copy-collector input: ldr r8,[sp,#280]
+
+# qhasm: lou2 = lod5
+# asm 1: ldr >lou2=int32#10,<lod5=stack32#13
+# asm 2: ldr >lou2=r9,<lod5=[sp,#48]
+# copy-collector input: ldr r9,[sp,#48]
+
+# qhasm: hiu2 = hid5
+# asm 1: ldr >hiu2=int32#11,<hid5=stack32#14
+# asm 2: ldr >hiu2=r10,<hid5=[sp,#52]
+# copy-collector input: ldr r10,[sp,#52]
+
+# qhasm: two25 = 0x2000000 simple
+# asm 1: mov >two25=int32#12,0x2000000
+# asm 2: mov >two25=r11,0x2000000
+# copy-collector output starts
+strd r3,r4,[sp,#128]
+ldr r5,[sp,#72]
+ldr r6,[sp,#76]
+ldr.w r7,[sp,#276]
+ldr r8,[sp,#280]
+ldr r9,[sp,#48]
+ldr r10,[sp,#52]
+# copy-collector output ends
+mov r11,0x2000000
+
+# qhasm: hitmp lotmp = lou0 * two25
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<lou0=int32#6,<two25=int32#12
+# asm 2: umull >lotmp=r14,>hitmp=r12,<lou0=r5,<two25=r11
+umull r14,r12,r5,r11
+
+# qhasm: lotmp hitmp += hiu0 * two25
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<hiu0=int32#7,<two25=int32#12
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<hiu0=r6,<two25=r11
+umlal r12,r14,r6,r11
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 2)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#14,<hiu0=int32#7,LSR #2
+# asm 2: eor >lotmp=r11,<lotmp=r14,<hiu0=r6,LSR #2
+eor r11,r14,r6,LSR #2
+
+# qhasm: lotmp ^= (lou0 << 30)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSL #30
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSL #30
+eor r11,r11,r5,LSL #30
+
+# qhasm: lotmp ^= (lou0 unsigned>> 28)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSR #28
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSR #28
+eor r11,r11,r5,LSR #28
+
+# qhasm: lotmp ^= (hiu0 << 4)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu0=int32#7,LSL #4
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu0=r6,LSL #4
+eor r11,r11,r6,LSL #4
+
+# qhasm: hitmp ^= (lou0 unsigned>> 2)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSR #2
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSR #2
+eor r12,r12,r5,LSR #2
+
+# qhasm: hitmp ^= (hiu0 << 30)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSL #30
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSL #30
+eor r12,r12,r6,LSL #30
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 28)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSR #28
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSR #28
+eor r12,r12,r6,LSR #28
+
+# qhasm: hitmp ^= (lou0 << 4)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSL #4
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSL #4
+eor r12,r12,r5,LSL #4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#12
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r11
+adds r1,r1,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#13
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r12
+adc r2,r2,r12
+
+# qhasm: lotmp = lou1 ^ lou2
+# asm 1: eor >lotmp=int32#12,<lou1=int32#8,<lou2=int32#10
+# asm 2: eor >lotmp=r11,<lou1=r7,<lou2=r9
+eor r11,r7,r9
+
+# qhasm: lotmp &= lou0
+# asm 1: and >lotmp=int32#6,<lotmp=int32#12,<lou0=int32#6
+# asm 2: and >lotmp=r5,<lotmp=r11,<lou0=r5
+and r5,r11,r5
+
+# qhasm: lotmp2 = lou1 & lou2
+# asm 1: and >lotmp2=int32#8,<lou1=int32#8,<lou2=int32#10
+# asm 2: and >lotmp2=r7,<lou1=r7,<lou2=r9
+and r7,r7,r9
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#6,<lotmp=int32#6,<lotmp2=int32#8
+# asm 2: eor >lotmp=r5,<lotmp=r5,<lotmp2=r7
+eor r5,r5,r7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#6
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r5
+adds.w r1,r1,r5
+
+# qhasm: hitmp = hiu1 ^ hiu2
+# asm 1: eor >hitmp=int32#6,<hiu1=int32#9,<hiu2=int32#11
+# asm 2: eor >hitmp=r5,<hiu1=r8,<hiu2=r10
+eor r5,r8,r10
+
+# qhasm: hitmp &= hiu0
+# asm 1: and >hitmp=int32#6,<hitmp=int32#6,<hiu0=int32#7
+# asm 2: and >hitmp=r5,<hitmp=r5,<hiu0=r6
+and r5,r5,r6
+
+# qhasm: hitmp2 = hiu1 & hiu2
+# asm 1: and >hitmp2=int32#7,<hiu1=int32#9,<hiu2=int32#11
+# asm 2: and >hitmp2=r6,<hiu1=r8,<hiu2=r10
+and r6,r8,r10
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#6,<hitmp=int32#6,<hitmp2=int32#7
+# asm 2: eor >hitmp=r5,<hitmp=r5,<hitmp2=r6
+eor r5,r5,r6
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#6
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r5
+adc r2,r2,r5
+
+# qhasm: lod2 = lou4
+# asm 1: str <lou4=int32#2,>lod2=stack32#17
+# asm 2: str <lou4=r1,>lod2=[sp,#64]
+# copy-collector input: str r1,[sp,#64]
+
+# qhasm: hid2 = hiu4
+# asm 1: str <hiu4=int32#3,>hid2=stack32#18
+# asm 2: str <hiu4=r2,>hid2=[sp,#68]
+# copy-collector input: str r2,[sp,#68]
+
+# qhasm: lou0 = lod7
+# asm 1: ldr >lou0=int32#2,<lod7=stack32#35
+# asm 2: ldr >lou0=r1,<lod7=[sp,#136]
+# copy-collector input: ldr r1,[sp,#136]
+
+# qhasm: hiu0 = hid7
+# asm 1: ldr >hiu0=int32#3,<hid7=stack32#36
+# asm 2: ldr >hiu0=r2,<hid7=[sp,#140]
+# copy-collector input: ldr r2,[sp,#140]
+
+# qhasm: lou1 = lod0
+# asm 1: ldr >lou1=int32#6,<lod0=stack32#29
+# asm 2: ldr >lou1=r5,<lod0=[sp,#112]
+# copy-collector input: ldr r5,[sp,#112]
+
+# qhasm: hiu1 = hid0
+# asm 1: ldr >hiu1=int32#7,<hid0=stack32#30
+# asm 2: ldr >hiu1=r6,<hid0=[sp,#116]
+# copy-collector input: ldr r6,[sp,#116]
+
+# qhasm: lou4 = lod1
+# asm 1: ldr >lou4=int32#8,<lod1=stack32#31
+# asm 2: ldr >lou4=r7,<lod1=[sp,#120]
+# copy-collector input: ldr r7,[sp,#120]
+
+# qhasm: hiu4 = hid1
+# asm 1: ldr >hiu4=int32#9,<hid1=stack32#32
+# asm 2: ldr >hiu4=r8,<hid1=[sp,#124]
+# copy-collector input: ldr r8,[sp,#124]
+
+# qhasm: lou5 = lod14
+# asm 1: ldr >lou5=int32#12,<lod14=stack32#49
+# asm 2: ldr >lou5=r11,<lod14=[sp,#192]
+# copy-collector input: ldr r11,[sp,#192]
+
+# qhasm: hiu5 = hid14
+# asm 1: ldr >hiu5=int32#13,<hid14=stack32#50
+# asm 2: ldr >hiu5=r12,<hid14=[sp,#196]
+# copy-collector input: ldr r12,[sp,#196]
+
+# qhasm: two23 = 0x800000 simple
+# asm 1: mov >two23=int32#14,0x800000
+# asm 2: mov >two23=r14,0x800000
+# copy-collector output starts
+strd r1,r2,[sp,#64]
+ldr r1,[sp,#136]
+ldr r2,[sp,#140]
+ldr r5,[sp,#112]
+ldr r6,[sp,#116]
+ldr.w r7,[sp,#120]
+ldr r8,[sp,#124]
+ldr r11,[sp,#192]
+ldr r12,[sp,#196]
+# copy-collector output ends
+mov r14,0x800000
+
+# qhasm: carry?  lou4 += lou5
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lou5=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lou5=r11
+adds r7,r7,r11
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hiu5=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hiu5=r12
+adc r8,r8,r12
+
+# qhasm: lotmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lotmp=int32#12,[<input_0=int32#1],#4
+# asm 2: ldr >lotmp=r11,[<input_0=r0],#4
+# copy-collector input: ldr r11,[r0],#4
+
+# qhasm: hitmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hitmp=int32#13,[<input_0=int32#1],#4
+# asm 2: ldr >hitmp=r12,[<input_0=r0],#4
+# copy-collector input: ldr r12,[r0],#4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r11
+# copy-collector output starts
+ldr r11,[r0],#4
+ldr r12,[r0],#4
+# copy-collector output ends
+adds r7,r7,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r12
+adc r8,r8,r12
+
+# qhasm: hitmp lotmp = lou3 * two23
+# asm 1: umull >lotmp=int32#13,>hitmp=int32#12,<lou3=int32#4,<two23=int32#14
+# asm 2: umull >lotmp=r12,>hitmp=r11,<lou3=r3,<two23=r14
+umull r12,r11,r3,r14
+
+# qhasm: lotmp hitmp += hiu3 * two23
+# asm 1: umlal <hitmp=int32#12,<lotmp=int32#13,<hiu3=int32#5,<two23=int32#14
+# asm 2: umlal <hitmp=r11,<lotmp=r12,<hiu3=r4,<two23=r14
+umlal r11,r12,r4,r14
+
+# qhasm: lotmp ^= (lou3 unsigned>> 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou3=int32#4,LSR #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou3=r3,LSR #18
+eor r12,r12,r3,LSR #18
+
+# qhasm: lotmp ^= (hiu3 << 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu3=int32#5,LSL #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu3=r4,LSL #14
+eor r12,r12,r4,LSL #14
+
+# qhasm: lotmp ^= (lou3 unsigned>> 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou3=int32#4,LSR #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou3=r3,LSR #14
+eor r12,r12,r3,LSR #14
+
+# qhasm: lotmp ^= (hiu3 << 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu3=int32#5,LSL #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu3=r4,LSL #18
+eor r12,r12,r4,LSL #18
+
+# qhasm: hitmp ^= (hiu3 unsigned>> 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu3=int32#5,LSR #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu3=r4,LSR #18
+eor r11,r11,r4,LSR #18
+
+# qhasm: hitmp ^= (lou3 << 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou3=int32#4,LSL #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou3=r3,LSL #14
+eor r11,r11,r3,LSL #14
+
+# qhasm: hitmp ^= (hiu3 unsigned>> 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu3=int32#5,LSR #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu3=r4,LSR #14
+eor r11,r11,r4,LSR #14
+
+# qhasm: hitmp ^= (lou3 << 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou3=int32#4,LSL #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou3=r3,LSL #18
+eor r11,r11,r3,LSL #18
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#13
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r12
+adds r7,r7,r12
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#12
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r11
+adc r8,r8,r11
+
+# qhasm: lotmp = lou3 & lou0
+# asm 1: and >lotmp=int32#2,<lou3=int32#4,<lou0=int32#2
+# asm 2: and >lotmp=r1,<lou3=r3,<lou0=r1
+and r1,r3,r1
+
+# qhasm: lotmp2 = lou1 & ~lou3
+# asm 1: bic >lotmp2=int32#4,<lou1=int32#6,<lou3=int32#4
+# asm 2: bic >lotmp2=r3,<lou1=r5,<lou3=r3
+bic r3,r5,r3
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<lotmp2=int32#4
+# asm 2: eor >lotmp=r1,<lotmp=r1,<lotmp2=r3
+eor r1,r1,r3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#8,<lotmp=int32#2
+# asm 2: adds >lou4=r1,<lou4=r7,<lotmp=r1
+adds.w r1,r7,r1
+
+# qhasm: hitmp = hiu3 & hiu0
+# asm 1: and >hitmp=int32#3,<hiu3=int32#5,<hiu0=int32#3
+# asm 2: and >hitmp=r2,<hiu3=r4,<hiu0=r2
+and r2,r4,r2
+
+# qhasm: hitmp2 = hiu1 & ~hiu3
+# asm 1: bic >hitmp2=int32#4,<hiu1=int32#7,<hiu3=int32#5
+# asm 2: bic >hitmp2=r3,<hiu1=r6,<hiu3=r4
+bic r3,r6,r4
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hitmp2=int32#4
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hitmp2=r3
+eor r2,r2,r3
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#9,<hitmp=int32#3
+# asm 2: adc >hiu4=r2,<hiu4=r8,<hitmp=r2
+adc r2,r8,r2
+
+# qhasm: carry? lou2 += lou4
+# asm 1: adds >lou2=int32#4,<lou2=int32#10,<lou4=int32#2
+# asm 2: adds >lou2=r3,<lou2=r9,<lou4=r1
+adds r3,r9,r1
+
+# qhasm: hiu2 += hiu4 + carry
+# asm 1: adc >hiu2=int32#5,<hiu2=int32#11,<hiu4=int32#3
+# asm 2: adc >hiu2=r4,<hiu2=r10,<hiu4=r2
+adc r4,r10,r2
+
+# qhasm: lod5 = lou2
+# asm 1: str <lou2=int32#4,>lod5=stack32#31
+# asm 2: str <lou2=r3,>lod5=[sp,#120]
+# copy-collector input: str r3,[sp,#120]
+
+# qhasm: hid5 = hiu2
+# asm 1: str <hiu2=int32#5,>hid5=stack32#32
+# asm 2: str <hiu2=r4,>hid5=[sp,#124]
+# copy-collector input: str r4,[sp,#124]
+
+# qhasm: lou0 = lod2
+# asm 1: ldr >lou0=int32#6,<lod2=stack32#17
+# asm 2: ldr >lou0=r5,<lod2=[sp,#64]
+# copy-collector input: ldr r5,[sp,#64]
+
+# qhasm: hiu0 = hid2
+# asm 1: ldr >hiu0=int32#7,<hid2=stack32#18
+# asm 2: ldr >hiu0=r6,<hid2=[sp,#68]
+# copy-collector input: ldr r6,[sp,#68]
+
+# qhasm: lou1 = lod3
+# asm 1: ldr >lou1=int32#8,<lod3=stack32#19
+# asm 2: ldr >lou1=r7,<lod3=[sp,#72]
+# copy-collector input: ldr r7,[sp,#72]
+
+# qhasm: hiu1 = hid3
+# asm 1: ldr >hiu1=int32#9,<hid3=stack32#20
+# asm 2: ldr >hiu1=r8,<hid3=[sp,#76]
+# copy-collector input: ldr r8,[sp,#76]
+
+# qhasm: lou3 = lod4
+# asm 1: ldr >lou3=int32#10,<lod4=stack32#70
+# asm 2: ldr >lou3=r9,<lod4=[sp,#276]
+# copy-collector input: ldr r9,[sp,#276]
+
+# qhasm: hiu3 = hid4
+# asm 1: ldr >hiu3=int32#11,<hid4=stack32#71
+# asm 2: ldr >hiu3=r10,<hid4=[sp,#280]
+# copy-collector input: ldr r10,[sp,#280]
+
+# qhasm: two25 = 0x2000000 simple
+# asm 1: mov >two25=int32#12,0x2000000
+# asm 2: mov >two25=r11,0x2000000
+# copy-collector output starts
+strd r3,r4,[sp,#120]
+ldr r5,[sp,#64]
+ldr r6,[sp,#68]
+ldr.w r7,[sp,#72]
+ldr r8,[sp,#76]
+ldr r9,[sp,#276]
+ldr r10,[sp,#280]
+# copy-collector output ends
+mov r11,0x2000000
+
+# qhasm: hitmp lotmp = lou0 * two25
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<lou0=int32#6,<two25=int32#12
+# asm 2: umull >lotmp=r14,>hitmp=r12,<lou0=r5,<two25=r11
+umull r14,r12,r5,r11
+
+# qhasm: lotmp hitmp += hiu0 * two25
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<hiu0=int32#7,<two25=int32#12
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<hiu0=r6,<two25=r11
+umlal r12,r14,r6,r11
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 2)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#14,<hiu0=int32#7,LSR #2
+# asm 2: eor >lotmp=r11,<lotmp=r14,<hiu0=r6,LSR #2
+eor r11,r14,r6,LSR #2
+
+# qhasm: lotmp ^= (lou0 << 30)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSL #30
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSL #30
+eor r11,r11,r5,LSL #30
+
+# qhasm: lotmp ^= (lou0 unsigned>> 28)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou0=int32#6,LSR #28
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou0=r5,LSR #28
+eor r11,r11,r5,LSR #28
+
+# qhasm: lotmp ^= (hiu0 << 4)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu0=int32#7,LSL #4
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu0=r6,LSL #4
+eor r11,r11,r6,LSL #4
+
+# qhasm: hitmp ^= (lou0 unsigned>> 2)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSR #2
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSR #2
+eor r12,r12,r5,LSR #2
+
+# qhasm: hitmp ^= (hiu0 << 30)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSL #30
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSL #30
+eor r12,r12,r6,LSL #30
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 28)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#7,LSR #28
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r6,LSR #28
+eor r12,r12,r6,LSR #28
+
+# qhasm: hitmp ^= (lou0 << 4)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<lou0=int32#6,LSL #4
+# asm 2: eor >hitmp=r12,<hitmp=r12,<lou0=r5,LSL #4
+eor r12,r12,r5,LSL #4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#12
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r11
+adds r1,r1,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#13
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r12
+adc r2,r2,r12
+
+# qhasm: lotmp = lou1 ^ lou3
+# asm 1: eor >lotmp=int32#12,<lou1=int32#8,<lou3=int32#10
+# asm 2: eor >lotmp=r11,<lou1=r7,<lou3=r9
+eor r11,r7,r9
+
+# qhasm: lotmp &= lou0
+# asm 1: and >lotmp=int32#6,<lotmp=int32#12,<lou0=int32#6
+# asm 2: and >lotmp=r5,<lotmp=r11,<lou0=r5
+and r5,r11,r5
+
+# qhasm: lotmp2 = lou1 & lou3
+# asm 1: and >lotmp2=int32#8,<lou1=int32#8,<lou3=int32#10
+# asm 2: and >lotmp2=r7,<lou1=r7,<lou3=r9
+and r7,r7,r9
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#6,<lotmp=int32#6,<lotmp2=int32#8
+# asm 2: eor >lotmp=r5,<lotmp=r5,<lotmp2=r7
+eor r5,r5,r7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#6
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r5
+adds.w r1,r1,r5
+
+# qhasm: hitmp = hiu1 ^ hiu3
+# asm 1: eor >hitmp=int32#6,<hiu1=int32#9,<hiu3=int32#11
+# asm 2: eor >hitmp=r5,<hiu1=r8,<hiu3=r10
+eor r5,r8,r10
+
+# qhasm: hitmp &= hiu0
+# asm 1: and >hitmp=int32#6,<hitmp=int32#6,<hiu0=int32#7
+# asm 2: and >hitmp=r5,<hitmp=r5,<hiu0=r6
+and r5,r5,r6
+
+# qhasm: hitmp2 = hiu1 & hiu3
+# asm 1: and >hitmp2=int32#7,<hiu1=int32#9,<hiu3=int32#11
+# asm 2: and >hitmp2=r6,<hiu1=r8,<hiu3=r10
+and r6,r8,r10
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#6,<hitmp=int32#6,<hitmp2=int32#7
+# asm 2: eor >hitmp=r5,<hitmp=r5,<hitmp2=r6
+eor r5,r5,r6
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#6
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r5
+adc r2,r2,r5
+
+# qhasm: lod1 = lou4
+# asm 1: str <lou4=int32#2,>lod1=stack32#15
+# asm 2: str <lou4=r1,>lod1=[sp,#56]
+# copy-collector input: str r1,[sp,#56]
+
+# qhasm: hid1 = hiu4
+# asm 1: str <hiu4=int32#3,>hid1=stack32#16
+# asm 2: str <hiu4=r2,>hid1=[sp,#60]
+# copy-collector input: str r2,[sp,#60]
+
+# qhasm: lou0 = lod6
+# asm 1: ldr >lou0=int32#2,<lod6=stack32#33
+# asm 2: ldr >lou0=r1,<lod6=[sp,#128]
+# copy-collector input: ldr r1,[sp,#128]
+
+# qhasm: hiu0 = hid6
+# asm 1: ldr >hiu0=int32#3,<hid6=stack32#34
+# asm 2: ldr >hiu0=r2,<hid6=[sp,#132]
+# copy-collector input: ldr r2,[sp,#132]
+
+# qhasm: lou1 = lod7
+# asm 1: ldr >lou1=int32#6,<lod7=stack32#35
+# asm 2: ldr >lou1=r5,<lod7=[sp,#136]
+# copy-collector input: ldr r5,[sp,#136]
+
+# qhasm: hiu1 = hid7
+# asm 1: ldr >hiu1=int32#7,<hid7=stack32#36
+# asm 2: ldr >hiu1=r6,<hid7=[sp,#140]
+# copy-collector input: ldr r6,[sp,#140]
+
+# qhasm: lou4 = lod0
+# asm 1: ldr >lou4=int32#8,<lod0=stack32#29
+# asm 2: ldr >lou4=r7,<lod0=[sp,#112]
+# copy-collector input: ldr r7,[sp,#112]
+
+# qhasm: hiu4 = hid0
+# asm 1: ldr >hiu4=int32#9,<hid0=stack32#30
+# asm 2: ldr >hiu4=r8,<hid0=[sp,#116]
+# copy-collector input: ldr r8,[sp,#116]
+
+# qhasm: lou5 = lod15
+# asm 1: ldr >lou5=int32#12,<lod15=stack32#51
+# asm 2: ldr >lou5=r11,<lod15=[sp,#200]
+# copy-collector input: ldr r11,[sp,#200]
+
+# qhasm: hiu5 = hid15
+# asm 1: ldr >hiu5=int32#13,<hid15=stack32#52
+# asm 2: ldr >hiu5=r12,<hid15=[sp,#204]
+# copy-collector input: ldr r12,[sp,#204]
+
+# qhasm: two23 = 0x800000 simple
+# asm 1: mov >two23=int32#14,0x800000
+# asm 2: mov >two23=r14,0x800000
+# copy-collector output starts
+strd r1,r2,[sp,#56]
+ldr r1,[sp,#128]
+ldr r2,[sp,#132]
+ldr r5,[sp,#136]
+ldr r6,[sp,#140]
+ldr.w r7,[sp,#112]
+ldr r8,[sp,#116]
+ldr r11,[sp,#200]
+ldr r12,[sp,#204]
+# copy-collector output ends
+mov r14,0x800000
+
+# qhasm: carry?  lou4 += lou5
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lou5=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lou5=r11
+adds r7,r7,r11
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hiu5=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hiu5=r12
+adc r8,r8,r12
+
+# qhasm: lotmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >lotmp=int32#12,[<input_0=int32#1],#4
+# asm 2: ldr >lotmp=r11,[<input_0=r0],#4
+# copy-collector input: ldr r11,[r0],#4
+
+# qhasm: hitmp = mem32[input_0]; input_0 += 4
+# asm 1: ldr >hitmp=int32#13,[<input_0=int32#1],#4
+# asm 2: ldr >hitmp=r12,[<input_0=r0],#4
+# copy-collector input: ldr r12,[r0],#4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#12
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r11
+# copy-collector output starts
+ldr r11,[r0],#4
+ldr r12,[r0],#4
+# copy-collector output ends
+adds r7,r7,r11
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#13
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r12
+adc r8,r8,r12
+
+# qhasm: hitmp lotmp = lou2 * two23
+# asm 1: umull >lotmp=int32#13,>hitmp=int32#12,<lou2=int32#4,<two23=int32#14
+# asm 2: umull >lotmp=r12,>hitmp=r11,<lou2=r3,<two23=r14
+umull r12,r11,r3,r14
+
+# qhasm: lotmp hitmp += hiu2 * two23
+# asm 1: umlal <hitmp=int32#12,<lotmp=int32#13,<hiu2=int32#5,<two23=int32#14
+# asm 2: umlal <hitmp=r11,<lotmp=r12,<hiu2=r4,<two23=r14
+umlal r11,r12,r4,r14
+
+# qhasm: lotmp ^= (lou2 unsigned>> 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou2=int32#4,LSR #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou2=r3,LSR #18
+eor r12,r12,r3,LSR #18
+
+# qhasm: lotmp ^= (hiu2 << 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu2=int32#5,LSL #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu2=r4,LSL #14
+eor r12,r12,r4,LSL #14
+
+# qhasm: lotmp ^= (lou2 unsigned>> 14)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<lou2=int32#4,LSR #14
+# asm 2: eor >lotmp=r12,<lotmp=r12,<lou2=r3,LSR #14
+eor r12,r12,r3,LSR #14
+
+# qhasm: lotmp ^= (hiu2 << 18)
+# asm 1: eor >lotmp=int32#13,<lotmp=int32#13,<hiu2=int32#5,LSL #18
+# asm 2: eor >lotmp=r12,<lotmp=r12,<hiu2=r4,LSL #18
+eor r12,r12,r4,LSL #18
+
+# qhasm: hitmp ^= (hiu2 unsigned>> 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu2=int32#5,LSR #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu2=r4,LSR #18
+eor r11,r11,r4,LSR #18
+
+# qhasm: hitmp ^= (lou2 << 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou2=int32#4,LSL #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou2=r3,LSL #14
+eor r11,r11,r3,LSL #14
+
+# qhasm: hitmp ^= (hiu2 unsigned>> 14)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<hiu2=int32#5,LSR #14
+# asm 2: eor >hitmp=r11,<hitmp=r11,<hiu2=r4,LSR #14
+eor r11,r11,r4,LSR #14
+
+# qhasm: hitmp ^= (lou2 << 18)
+# asm 1: eor >hitmp=int32#12,<hitmp=int32#12,<lou2=int32#4,LSL #18
+# asm 2: eor >hitmp=r11,<hitmp=r11,<lou2=r3,LSL #18
+eor r11,r11,r3,LSL #18
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#8,<lou4=int32#8,<lotmp=int32#13
+# asm 2: adds >lou4=r7,<lou4=r7,<lotmp=r12
+adds r7,r7,r12
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#9,<hiu4=int32#9,<hitmp=int32#12
+# asm 2: adc >hiu4=r8,<hiu4=r8,<hitmp=r11
+adc r8,r8,r11
+
+# qhasm: lotmp = lou2 & lou0
+# asm 1: and >lotmp=int32#2,<lou2=int32#4,<lou0=int32#2
+# asm 2: and >lotmp=r1,<lou2=r3,<lou0=r1
+and r1,r3,r1
+
+# qhasm: lotmp2 = lou1 & ~lou2
+# asm 1: bic >lotmp2=int32#4,<lou1=int32#6,<lou2=int32#4
+# asm 2: bic >lotmp2=r3,<lou1=r5,<lou2=r3
+bic r3,r5,r3
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<lotmp2=int32#4
+# asm 2: eor >lotmp=r1,<lotmp=r1,<lotmp2=r3
+eor r1,r1,r3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#8,<lotmp=int32#2
+# asm 2: adds >lou4=r1,<lou4=r7,<lotmp=r1
+adds.w r1,r7,r1
+
+# qhasm: hitmp = hiu2 & hiu0
+# asm 1: and >hitmp=int32#3,<hiu2=int32#5,<hiu0=int32#3
+# asm 2: and >hitmp=r2,<hiu2=r4,<hiu0=r2
+and r2,r4,r2
+
+# qhasm: hitmp2 = hiu1 & ~hiu2
+# asm 1: bic >hitmp2=int32#4,<hiu1=int32#7,<hiu2=int32#5
+# asm 2: bic >hitmp2=r3,<hiu1=r6,<hiu2=r4
+bic r3,r6,r4
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hitmp2=int32#4
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hitmp2=r3
+eor r2,r2,r3
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#9,<hitmp=int32#3
+# asm 2: adc >hiu4=r2,<hiu4=r8,<hitmp=r2
+adc r2,r8,r2
+
+# qhasm: carry? lou3 += lou4
+# asm 1: adds >lou3=int32#4,<lou3=int32#10,<lou4=int32#2
+# asm 2: adds >lou3=r3,<lou3=r9,<lou4=r1
+adds r3,r9,r1
+
+# qhasm: hiu3 += hiu4 + carry
+# asm 1: adc >hiu3=int32#5,<hiu3=int32#11,<hiu4=int32#3
+# asm 2: adc >hiu3=r4,<hiu3=r10,<hiu4=r2
+adc r4,r10,r2
+
+# qhasm: lod4 = lou3
+# asm 1: str <lou3=int32#4,>lod4=stack32#29
+# asm 2: str <lou3=r3,>lod4=[sp,#112]
+# copy-collector input: str r3,[sp,#112]
+
+# qhasm: hid4 = hiu3
+# asm 1: str <hiu3=int32#5,>hid4=stack32#30
+# asm 2: str <hiu3=r4,>hid4=[sp,#116]
+# copy-collector input: str r4,[sp,#116]
+
+# qhasm: lou0 = lod1
+# asm 1: ldr >lou0=int32#4,<lod1=stack32#15
+# asm 2: ldr >lou0=r3,<lod1=[sp,#56]
+# copy-collector input: ldr r3,[sp,#56]
+
+# qhasm: hiu0 = hid1
+# asm 1: ldr >hiu0=int32#5,<hid1=stack32#16
+# asm 2: ldr >hiu0=r4,<hid1=[sp,#60]
+# copy-collector input: ldr r4,[sp,#60]
+
+# qhasm: lou1 = lod2
+# asm 1: ldr >lou1=int32#6,<lod2=stack32#17
+# asm 2: ldr >lou1=r5,<lod2=[sp,#64]
+# copy-collector input: ldr r5,[sp,#64]
+
+# qhasm: hiu1 = hid2
+# asm 1: ldr >hiu1=int32#7,<hid2=stack32#18
+# asm 2: ldr >hiu1=r6,<hid2=[sp,#68]
+# copy-collector input: ldr r6,[sp,#68]
+
+# qhasm: lou2 = lod3
+# asm 1: ldr >lou2=int32#8,<lod3=stack32#19
+# asm 2: ldr >lou2=r7,<lod3=[sp,#72]
+# copy-collector input: ldr r7,[sp,#72]
+
+# qhasm: hiu2 = hid3
+# asm 1: ldr >hiu2=int32#9,<hid3=stack32#20
+# asm 2: ldr >hiu2=r8,<hid3=[sp,#76]
+# copy-collector input: ldr r8,[sp,#76]
+
+# qhasm: two25 = 0x2000000 simple
+# asm 1: mov >two25=int32#10,0x2000000
+# asm 2: mov >two25=r9,0x2000000
+# copy-collector output starts
+strd r3,r4,[sp,#112]
+ldr r3,[sp,#56]
+ldr r4,[sp,#60]
+ldr r5,[sp,#64]
+ldr r6,[sp,#68]
+ldr.w r7,[sp,#72]
+ldr r8,[sp,#76]
+# copy-collector output ends
+mov r9,0x2000000
+
+# qhasm: hitmp lotmp = lou0 * two25
+# asm 1: umull >lotmp=int32#12,>hitmp=int32#11,<lou0=int32#4,<two25=int32#10
+# asm 2: umull >lotmp=r11,>hitmp=r10,<lou0=r3,<two25=r9
+umull r11,r10,r3,r9
+
+# qhasm: lotmp hitmp += hiu0 * two25
+# asm 1: umlal <hitmp=int32#11,<lotmp=int32#12,<hiu0=int32#5,<two25=int32#10
+# asm 2: umlal <hitmp=r10,<lotmp=r11,<hiu0=r4,<two25=r9
+umlal r10,r11,r4,r9
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 2)
+# asm 1: eor >lotmp=int32#10,<lotmp=int32#12,<hiu0=int32#5,LSR #2
+# asm 2: eor >lotmp=r9,<lotmp=r11,<hiu0=r4,LSR #2
+eor r9,r11,r4,LSR #2
+
+# qhasm: lotmp ^= (lou0 << 30)
+# asm 1: eor >lotmp=int32#10,<lotmp=int32#10,<lou0=int32#4,LSL #30
+# asm 2: eor >lotmp=r9,<lotmp=r9,<lou0=r3,LSL #30
+eor r9,r9,r3,LSL #30
+
+# qhasm: lotmp ^= (lou0 unsigned>> 28)
+# asm 1: eor >lotmp=int32#10,<lotmp=int32#10,<lou0=int32#4,LSR #28
+# asm 2: eor >lotmp=r9,<lotmp=r9,<lou0=r3,LSR #28
+eor r9,r9,r3,LSR #28
+
+# qhasm: lotmp ^= (hiu0 << 4)
+# asm 1: eor >lotmp=int32#10,<lotmp=int32#10,<hiu0=int32#5,LSL #4
+# asm 2: eor >lotmp=r9,<lotmp=r9,<hiu0=r4,LSL #4
+eor r9,r9,r4,LSL #4
+
+# qhasm: hitmp ^= (lou0 unsigned>> 2)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<lou0=int32#4,LSR #2
+# asm 2: eor >hitmp=r10,<hitmp=r10,<lou0=r3,LSR #2
+eor r10,r10,r3,LSR #2
+
+# qhasm: hitmp ^= (hiu0 << 30)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<hiu0=int32#5,LSL #30
+# asm 2: eor >hitmp=r10,<hitmp=r10,<hiu0=r4,LSL #30
+eor r10,r10,r4,LSL #30
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 28)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<hiu0=int32#5,LSR #28
+# asm 2: eor >hitmp=r10,<hitmp=r10,<hiu0=r4,LSR #28
+eor r10,r10,r4,LSR #28
+
+# qhasm: hitmp ^= (lou0 << 4)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<lou0=int32#4,LSL #4
+# asm 2: eor >hitmp=r10,<hitmp=r10,<lou0=r3,LSL #4
+eor r10,r10,r3,LSL #4
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#10
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r9
+adds r1,r1,r9
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#11
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r10
+adc r2,r2,r10
+
+# qhasm: lotmp = lou1 ^ lou2
+# asm 1: eor >lotmp=int32#10,<lou1=int32#6,<lou2=int32#8
+# asm 2: eor >lotmp=r9,<lou1=r5,<lou2=r7
+eor r9,r5,r7
+
+# qhasm: lotmp &= lou0
+# asm 1: and >lotmp=int32#4,<lotmp=int32#10,<lou0=int32#4
+# asm 2: and >lotmp=r3,<lotmp=r9,<lou0=r3
+and r3,r9,r3
+
+# qhasm: lotmp2 = lou1 & lou2
+# asm 1: and >lotmp2=int32#6,<lou1=int32#6,<lou2=int32#8
+# asm 2: and >lotmp2=r5,<lou1=r5,<lou2=r7
+and r5,r5,r7
+
+# qhasm: lotmp ^= lotmp2
+# asm 1: eor >lotmp=int32#4,<lotmp=int32#4,<lotmp2=int32#6
+# asm 2: eor >lotmp=r3,<lotmp=r3,<lotmp2=r5
+eor r3,r3,r5
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#2,<lou4=int32#2,<lotmp=int32#4
+# asm 2: adds >lou4=r1,<lou4=r1,<lotmp=r3
+adds.w r1,r1,r3
+
+# qhasm: hitmp = hiu1 ^ hiu2
+# asm 1: eor >hitmp=int32#4,<hiu1=int32#7,<hiu2=int32#9
+# asm 2: eor >hitmp=r3,<hiu1=r6,<hiu2=r8
+eor r3,r6,r8
+
+# qhasm: hitmp &= hiu0
+# asm 1: and >hitmp=int32#4,<hitmp=int32#4,<hiu0=int32#5
+# asm 2: and >hitmp=r3,<hitmp=r3,<hiu0=r4
+and r3,r3,r4
+
+# qhasm: hitmp2 = hiu1 & hiu2
+# asm 1: and >hitmp2=int32#5,<hiu1=int32#7,<hiu2=int32#9
+# asm 2: and >hitmp2=r4,<hiu1=r6,<hiu2=r8
+and r4,r6,r8
+
+# qhasm: hitmp ^= hitmp2
+# asm 1: eor >hitmp=int32#4,<hitmp=int32#4,<hitmp2=int32#5
+# asm 2: eor >hitmp=r3,<hitmp=r3,<hitmp2=r4
+eor r3,r3,r4
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#4
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r3
+adc r2,r2,r3
+
+# qhasm: lod0 = lou4
+# asm 1: str <lou4=int32#2,>lod0=stack32#13
+# asm 2: str <lou4=r1,>lod0=[sp,#48]
+# copy-collector input: str r1,[sp,#48]
+
+# qhasm: hid0 = hiu4
+# asm 1: str <hiu4=int32#3,>hid0=stack32#14
+# asm 2: str <hiu4=r2,>hid0=[sp,#52]
+# copy-collector input: str r2,[sp,#52]
+
+# qhasm: o3 = input_0
+# asm 1: str <input_0=int32#1,>o3=stack32#4
+# asm 2: str <input_0=r0,>o3=[sp,#12]
+# copy-collector input: str r0,[sp,#12]
+
+# qhasm: input_0 = o4
+# asm 1: ldr >input_0=int32#1,<o4=stack32#69
+# asm 2: ldr >input_0=r0,<o4=[sp,#272]
+# copy-collector input: ldr r0,[sp,#272]
+
+# qhasm: =? unsigned<? input_0 -= 8
+# asm 1: subs >input_0=int32#1,<input_0=int32#1,#8
+# asm 2: subs >input_0=r0,<input_0=r0,#8
+# copy-collector output starts
+strd r1,r2,[sp,#48]
+str r0,[sp,#12]
+ldr r0,[sp,#272]
+# copy-collector output ends
+subs r0,r0,#8
+
+# qhasm: goto endinnerloop if =
+beq ._endinnerloop
+
+# qhasm: o4 = input_0
+# asm 1: str <input_0=int32#1,>o4=stack32#69
+# asm 2: str <input_0=r0,>o4=[sp,#272]
+# copy-collector input: str r0,[sp,#272]
+
+# qhasm: =? input_0 - 8
+# asm 1: cmp <input_0=int32#1,#8
+# asm 2: cmp <input_0=r0,#8
+# copy-collector output starts
+str r0,[sp,#272]
+# copy-collector output ends
+cmp r0,#8
+
+# qhasm: goto nearend if =
+beq ._nearend
+
+# qhasm: two24 = 0x1000000 simple
+# asm 1: mov >two24=int32#1,0x1000000
+# asm 2: mov >two24=r0,0x1000000
+mov r0,0x1000000
+
+# qhasm: two13 = 0x2000 simple
+# asm 1: mov >two13=int32#2,0x2000
+# asm 2: mov >two13=r1,0x2000
+mov r1,0x2000
+
+# qhasm: lou0 = lod8
+# asm 1: ldr >lou0=int32#3,<lod8=stack32#37
+# asm 2: ldr >lou0=r2,<lod8=[sp,#144]
+# copy-collector input: ldr r2,[sp,#144]
+
+# qhasm: hiu0 = hid8
+# asm 1: ldr >hiu0=int32#4,<hid8=stack32#38
+# asm 2: ldr >hiu0=r3,<hid8=[sp,#148]
+# copy-collector input: ldr r3,[sp,#148]
+
+# qhasm: lou1 = lod9
+# asm 1: ldr >lou1=int32#5,<lod9=stack32#39
+# asm 2: ldr >lou1=r4,<lod9=[sp,#152]
+# copy-collector input: ldr r4,[sp,#152]
+
+# qhasm: hiu1 = hid9
+# asm 1: ldr >hiu1=int32#6,<hid9=stack32#40
+# asm 2: ldr >hiu1=r5,<hid9=[sp,#156]
+# copy-collector input: ldr r5,[sp,#156]
+
+# qhasm: lou2 = lom14
+# asm 1: ldr >lou2=int32#7,<lom14=stack32#65
+# asm 2: ldr >lou2=r6,<lom14=[sp,#256]
+# copy-collector input: ldr r6,[sp,#256]
+
+# qhasm: hiu2 = him14
+# asm 1: ldr >hiu2=int32#8,<him14=stack32#66
+# asm 2: ldr >hiu2=r7,<him14=[sp,#260]
+# copy-collector input: ldr r7,[sp,#260]
+
+# qhasm: lou3 = lom9
+# asm 1: ldr >lou3=int32#9,<lom9=stack32#55
+# asm 2: ldr >lou3=r8,<lom9=[sp,#216]
+# copy-collector input: ldr r8,[sp,#216]
+
+# qhasm: hiu3 = him9
+# asm 1: ldr >hiu3=int32#10,<him9=stack32#56
+# asm 2: ldr >hiu3=r9,<him9=[sp,#220]
+# copy-collector input: ldr r9,[sp,#220]
+
+# qhasm: hitmp lotmp = hiu2 * two13
+# asm 1: umull >lotmp=int32#12,>hitmp=int32#11,<hiu2=int32#8,<two13=int32#2
+# asm 2: umull >lotmp=r11,>hitmp=r10,<hiu2=r7,<two13=r1
+# copy-collector output starts
+ldr r2,[sp,#144]
+ldr r3,[sp,#148]
+ldr r4,[sp,#152]
+ldr r5,[sp,#156]
+ldr r6,[sp,#256]
+ldr.w r7,[sp,#260]
+ldr r8,[sp,#216]
+ldr r9,[sp,#220]
+# copy-collector output ends
+umull r11,r10,r7,r1
+
+# qhasm: lotmp hitmp += lou2 * two13
+# asm 1: umlal <hitmp=int32#11,<lotmp=int32#12,<lou2=int32#7,<two13=int32#2
+# asm 2: umlal <hitmp=r10,<lotmp=r11,<lou2=r6,<two13=r1
+umlal r10,r11,r6,r1
+
+# qhasm: lotmp ^= (lou2 unsigned>> 6)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou2=int32#7,LSR #6
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou2=r6,LSR #6
+eor r11,r11,r6,LSR #6
+
+# qhasm: lotmp ^= (hiu2 << 26)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu2=int32#8,LSL #26
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu2=r7,LSL #26
+eor r11,r11,r7,LSL #26
+
+# qhasm: lotmp ^= (hiu2 unsigned>> 29)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu2=int32#8,LSR #29
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu2=r7,LSR #29
+eor r11,r11,r7,LSR #29
+
+# qhasm: lotmp ^= (lou2 << 3)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou2=int32#7,LSL #3
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou2=r6,LSL #3
+eor r11,r11,r6,LSL #3
+
+# qhasm: hitmp ^= (hiu2 unsigned>> 6)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<hiu2=int32#8,LSR #6
+# asm 2: eor >hitmp=r10,<hitmp=r10,<hiu2=r7,LSR #6
+eor r10,r10,r7,LSR #6
+
+# qhasm: hitmp ^= (lou2 unsigned>> 29)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<lou2=int32#7,LSR #29
+# asm 2: eor >hitmp=r10,<hitmp=r10,<lou2=r6,LSR #29
+eor r10,r10,r6,LSR #29
+
+# qhasm: hitmp ^= (hiu2 << 3)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<hiu2=int32#8,LSL #3
+# asm 2: eor >hitmp=r10,<hitmp=r10,<hiu2=r7,LSL #3
+eor r10,r10,r7,LSL #3
+
+# qhasm: carry? lou0 += lotmp
+# asm 1: adds >lou0=int32#3,<lou0=int32#3,<lotmp=int32#12
+# asm 2: adds >lou0=r2,<lou0=r2,<lotmp=r11
+adds r2,r2,r11
+
+# qhasm: hiu0 += hitmp + carry
+# asm 1: adc >hiu0=int32#4,<hiu0=int32#4,<hitmp=int32#11
+# asm 2: adc >hiu0=r3,<hiu0=r3,<hitmp=r10
+adc r3,r3,r10
+
+# qhasm: hitmp lotmp = hiu1 * two24
+# asm 1: umull >lotmp=int32#12,>hitmp=int32#11,<hiu1=int32#6,<two24=int32#1
+# asm 2: umull >lotmp=r11,>hitmp=r10,<hiu1=r5,<two24=r0
+umull r11,r10,r5,r0
+
+# qhasm: lotmp hitmp += lou1 * two24
+# asm 1: umlal <hitmp=int32#11,<lotmp=int32#12,<lou1=int32#5,<two24=int32#1
+# asm 2: umlal <hitmp=r10,<lotmp=r11,<lou1=r4,<two24=r0
+umlal r10,r11,r4,r0
+
+# qhasm: carry? lotmp ^= (lou1 unsigned>> 1)
+# asm 1: eors >lotmp=int32#12,<lotmp=int32#12,<lou1=int32#5,LSR #1
+# asm 2: eors >lotmp=r11,<lotmp=r11,<lou1=r4,LSR #1
+eors r11,r11,r4,LSR #1
+
+# qhasm: hitmp ^= (carry,hiu1 unsigned>> 1)
+# asm 1: eors >hitmp=int32#11,<hitmp=int32#11,<hiu1=int32#6,RRX
+# asm 2: eors >hitmp=r10,<hitmp=r10,<hiu1=r5,RRX
+eors r10,r10,r5,RRX
+
+# qhasm: lotmp ^= (hiu1 << 31)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu1=int32#6,LSL #31
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu1=r5,LSL #31
+eor r11,r11,r5,LSL #31
+
+# qhasm: lotmp ^= (lou1 unsigned>>7)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou1=int32#5,LSR #7
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou1=r4,LSR #7
+eor r11,r11,r4,LSR #7
+
+# qhasm: lotmp ^= (hiu1 << 25)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu1=int32#6,LSL #25
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu1=r5,LSL #25
+eor r11,r11,r5,LSL #25
+
+# qhasm: hitmp ^= (hiu1 unsigned>>7)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<hiu1=int32#6,LSR #7
+# asm 2: eor >hitmp=r10,<hitmp=r10,<hiu1=r5,LSR #7
+eor r10,r10,r5,LSR #7
+
+# qhasm: carry? lou0 += lotmp
+# asm 1: adds >lou0=int32#3,<lou0=int32#3,<lotmp=int32#12
+# asm 2: adds >lou0=r2,<lou0=r2,<lotmp=r11
+adds r2,r2,r11
+
+# qhasm: hiu0 += hitmp + carry
+# asm 1: adc >hiu0=int32#4,<hiu0=int32#4,<hitmp=int32#11
+# asm 2: adc >hiu0=r3,<hiu0=r3,<hitmp=r10
+adc r3,r3,r10
+
+# qhasm: carry? lou0 += lou3
+# asm 1: adds >lou0=int32#3,<lou0=int32#3,<lou3=int32#9
+# asm 2: adds >lou0=r2,<lou0=r2,<lou3=r8
+adds r2,r2,r8
+
+# qhasm: hiu0 += hiu3 + carry
+# asm 1: adc >hiu0=int32#4,<hiu0=int32#4,<hiu3=int32#10
+# asm 2: adc >hiu0=r3,<hiu0=r3,<hiu3=r9
+adc r3,r3,r9
+
+# qhasm: lod9 = lou3
+# asm 1: str <lou3=int32#9,>lod9=stack32#39
+# asm 2: str <lou3=r8,>lod9=[sp,#152]
+# copy-collector input: str r8,[sp,#152]
+
+# qhasm: hid9 = hiu3
+# asm 1: str <hiu3=int32#10,>hid9=stack32#40
+# asm 2: str <hiu3=r9,>hid9=[sp,#156]
+# copy-collector input: str r9,[sp,#156]
+
+# qhasm: lou3 = lom15
+# asm 1: ldr >lou3=int32#9,<lom15=stack32#67
+# asm 2: ldr >lou3=r8,<lom15=[sp,#264]
+# copy-collector input: ldr r8,[sp,#264]
+
+# qhasm: hiu3 = him15
+# asm 1: ldr >hiu3=int32#10,<him15=stack32#68
+# asm 2: ldr >hiu3=r9,<him15=[sp,#268]
+# copy-collector input: ldr r9,[sp,#268]
+
+# qhasm: lom15 = lou0
+# asm 1: str <lou0=int32#3,>lom15=stack32#37
+# asm 2: str <lou0=r2,>lom15=[sp,#144]
+# copy-collector input: str r2,[sp,#144]
+
+# qhasm: him15 = hiu0
+# asm 1: str <hiu0=int32#4,>him15=stack32#38
+# asm 2: str <hiu0=r3,>him15=[sp,#148]
+# copy-collector input: str r3,[sp,#148]
+
+# qhasm: hitmp lotmp = hiu3 * two13
+# asm 1: umull >lotmp=int32#12,>hitmp=int32#11,<hiu3=int32#10,<two13=int32#2
+# asm 2: umull >lotmp=r11,>hitmp=r10,<hiu3=r9,<two13=r1
+# copy-collector output starts
+strd r8,r9,[sp,#152]
+ldrd r8,r9,[sp,#264]
+strd r2,r3,[sp,#144]
+# copy-collector output ends
+umull r11,r10,r9,r1
+
+# qhasm: lotmp hitmp += lou3 * two13
+# asm 1: umlal <hitmp=int32#11,<lotmp=int32#12,<lou3=int32#9,<two13=int32#2
+# asm 2: umlal <hitmp=r10,<lotmp=r11,<lou3=r8,<two13=r1
+umlal r10,r11,r8,r1
+
+# qhasm: lotmp ^= (lou3 unsigned>> 6)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou3=int32#9,LSR #6
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou3=r8,LSR #6
+eor r11,r11,r8,LSR #6
+
+# qhasm: lotmp ^= (hiu3 << 26)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu3=int32#10,LSL #26
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu3=r9,LSL #26
+eor r11,r11,r9,LSL #26
+
+# qhasm: lotmp ^= (hiu3 unsigned>> 29)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu3=int32#10,LSR #29
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu3=r9,LSR #29
+eor r11,r11,r9,LSR #29
+
+# qhasm: lotmp ^= (lou3 << 3)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou3=int32#9,LSL #3
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou3=r8,LSL #3
+eor r11,r11,r8,LSL #3
+
+# qhasm: hitmp ^= (hiu3 unsigned>> 6)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<hiu3=int32#10,LSR #6
+# asm 2: eor >hitmp=r10,<hitmp=r10,<hiu3=r9,LSR #6
+eor r10,r10,r9,LSR #6
+
+# qhasm: hitmp ^= (lou3 unsigned>> 29)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<lou3=int32#9,LSR #29
+# asm 2: eor >hitmp=r10,<hitmp=r10,<lou3=r8,LSR #29
+eor r10,r10,r8,LSR #29
+
+# qhasm: hitmp ^= (hiu3 << 3)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<hiu3=int32#10,LSL #3
+# asm 2: eor >hitmp=r10,<hitmp=r10,<hiu3=r9,LSL #3
+eor r10,r10,r9,LSL #3
+
+# qhasm: carry? lou1 += lotmp
+# asm 1: adds >lou1=int32#5,<lou1=int32#5,<lotmp=int32#12
+# asm 2: adds >lou1=r4,<lou1=r4,<lotmp=r11
+adds r4,r4,r11
+
+# qhasm: hiu1 += hitmp + carry
+# asm 1: adc >hiu1=int32#6,<hiu1=int32#6,<hitmp=int32#11
+# asm 2: adc >hiu1=r5,<hiu1=r5,<hitmp=r10
+adc r5,r5,r10
+
+# qhasm: lou4 = lod10
+# asm 1: ldr >lou4=int32#11,<lod10=stack32#41
+# asm 2: ldr >lou4=r10,<lod10=[sp,#160]
+# copy-collector input: ldr r10,[sp,#160]
+
+# qhasm: hiu4 = hid10
+# asm 1: ldr >hiu4=int32#12,<hid10=stack32#42
+# asm 2: ldr >hiu4=r11,<hid10=[sp,#164]
+# copy-collector input: ldr r11,[sp,#164]
+
+# qhasm: hitmp lotmp = hiu4 * two24
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu4=int32#12,<two24=int32#1
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu4=r11,<two24=r0
+# copy-collector output starts
+ldrd r10,r11,[sp,#160]
+# copy-collector output ends
+umull r14,r12,r11,r0
+
+# qhasm: lotmp hitmp += lou4 * two24
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou4=int32#11,<two24=int32#1
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou4=r10,<two24=r0
+umlal r12,r14,r10,r0
+
+# qhasm: carry? lotmp ^= (lou4 unsigned>> 1)
+# asm 1: eors >lotmp=int32#14,<lotmp=int32#14,<lou4=int32#11,LSR #1
+# asm 2: eors >lotmp=r14,<lotmp=r14,<lou4=r10,LSR #1
+eors r14,r14,r10,LSR #1
+
+# qhasm: hitmp ^= (carry,hiu4 unsigned>> 1)
+# asm 1: eors >hitmp=int32#13,<hitmp=int32#13,<hiu4=int32#12,RRX
+# asm 2: eors >hitmp=r12,<hitmp=r12,<hiu4=r11,RRX
+eors r12,r12,r11,RRX
+
+# qhasm: lotmp ^= (hiu4 << 31)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu4=int32#12,LSL #31
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu4=r11,LSL #31
+eor r14,r14,r11,LSL #31
+
+# qhasm: lotmp ^= (lou4 unsigned>>7)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou4=int32#11,LSR #7
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou4=r10,LSR #7
+eor r14,r14,r10,LSR #7
+
+# qhasm: lotmp ^= (hiu4 << 25)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu4=int32#12,LSL #25
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu4=r11,LSL #25
+eor r14,r14,r11,LSL #25
+
+# qhasm: hitmp ^= (hiu4 unsigned>>7)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu4=int32#12,LSR #7
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu4=r11,LSR #7
+eor r12,r12,r11,LSR #7
+
+# qhasm: carry? lou1 += lotmp
+# asm 1: adds >lou1=int32#5,<lou1=int32#5,<lotmp=int32#14
+# asm 2: adds >lou1=r4,<lou1=r4,<lotmp=r14
+adds r4,r4,r14
+
+# qhasm: hiu1 += hitmp + carry
+# asm 1: adc >hiu1=int32#6,<hiu1=int32#6,<hitmp=int32#13
+# asm 2: adc >hiu1=r5,<hiu1=r5,<hitmp=r12
+adc r5,r5,r12
+
+# qhasm: lou5 = lom10
+# asm 1: ldr >lou5=int32#13,<lom10=stack32#57
+# asm 2: ldr >lou5=r12,<lom10=[sp,#224]
+# copy-collector input: ldr r12,[sp,#224]
+
+# qhasm: hiu5 = him10
+# asm 1: ldr >hiu5=int32#14,<him10=stack32#58
+# asm 2: ldr >hiu5=r14,<him10=[sp,#228]
+# copy-collector input: ldr r14,[sp,#228]
+
+# qhasm: carry? lou1 += lou5
+# asm 1: adds >lou1=int32#5,<lou1=int32#5,<lou5=int32#13
+# asm 2: adds >lou1=r4,<lou1=r4,<lou5=r12
+# copy-collector output starts
+ldrd r12,r14,[sp,#224]
+# copy-collector output ends
+adds r4,r4,r12
+
+# qhasm: hiu1 += hiu5 + carry
+# asm 1: adc >hiu1=int32#6,<hiu1=int32#6,<hiu5=int32#14
+# asm 2: adc >hiu1=r5,<hiu1=r5,<hiu5=r14
+adc r5,r5,r14
+
+# qhasm: lod10 = lou5
+# asm 1: str <lou5=int32#13,>lod10=stack32#41
+# asm 2: str <lou5=r12,>lod10=[sp,#160]
+# copy-collector input: str r12,[sp,#160]
+
+# qhasm: hid10 = hiu5
+# asm 1: str <hiu5=int32#14,>hid10=stack32#42
+# asm 2: str <hiu5=r14,>hid10=[sp,#164]
+# copy-collector input: str r14,[sp,#164]
+
+# qhasm: lom9 = lou1
+# asm 1: str <lou1=int32#5,>lom9=stack32#55
+# asm 2: str <lou1=r4,>lom9=[sp,#216]
+# copy-collector input: str r4,[sp,#216]
+
+# qhasm: him9 = hiu1
+# asm 1: str <hiu1=int32#6,>him9=stack32#56
+# asm 2: str <hiu1=r5,>him9=[sp,#220]
+# copy-collector input: str r5,[sp,#220]
+
+# qhasm: hitmp lotmp = hiu0 * two13
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu0=int32#4,<two13=int32#2
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu0=r3,<two13=r1
+# copy-collector output starts
+strd r12,r14,[sp,#160]
+strd r4,r5,[sp,#216]
+# copy-collector output ends
+umull r14,r12,r3,r1
+
+# qhasm: lotmp hitmp += lou0 * two13
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou0=int32#3,<two13=int32#2
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou0=r2,<two13=r1
+umlal r12,r14,r2,r1
+
+# qhasm: lotmp ^= (lou0 unsigned>> 6)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou0=int32#3,LSR #6
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou0=r2,LSR #6
+eor r14,r14,r2,LSR #6
+
+# qhasm: lotmp ^= (hiu0 << 26)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu0=int32#4,LSL #26
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu0=r3,LSL #26
+eor r14,r14,r3,LSL #26
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 29)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu0=int32#4,LSR #29
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu0=r3,LSR #29
+eor r14,r14,r3,LSR #29
+
+# qhasm: lotmp ^= (lou0 << 3)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou0=int32#3,LSL #3
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou0=r2,LSL #3
+eor r14,r14,r2,LSL #3
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 6)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#4,LSR #6
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r3,LSR #6
+eor r12,r12,r3,LSR #6
+
+# qhasm: hitmp ^= (lou0 unsigned>> 29)
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#13,<lou0=int32#3,LSR #29
+# asm 2: eor >hitmp=r2,<hitmp=r12,<lou0=r2,LSR #29
+eor r2,r12,r2,LSR #29
+
+# qhasm: hitmp ^= (hiu0 << 3)
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hiu0=int32#4,LSL #3
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hiu0=r3,LSL #3
+eor r2,r2,r3,LSL #3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#4,<lou4=int32#11,<lotmp=int32#14
+# asm 2: adds >lou4=r3,<lou4=r10,<lotmp=r14
+adds r3,r10,r14
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#12,<hitmp=int32#3
+# asm 2: adc >hiu4=r2,<hiu4=r11,<hitmp=r2
+adc r2,r11,r2
+
+# qhasm: lou0 = lod11
+# asm 1: ldr >lou0=int32#11,<lod11=stack32#43
+# asm 2: ldr >lou0=r10,<lod11=[sp,#168]
+# copy-collector input: ldr r10,[sp,#168]
+
+# qhasm: hiu0 = hid11
+# asm 1: ldr >hiu0=int32#12,<hid11=stack32#44
+# asm 2: ldr >hiu0=r11,<hid11=[sp,#172]
+# copy-collector input: ldr r11,[sp,#172]
+
+# qhasm: hitmp lotmp = hiu0 * two24
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu0=int32#12,<two24=int32#1
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu0=r11,<two24=r0
+# copy-collector output starts
+ldrd r10,r11,[sp,#168]
+# copy-collector output ends
+umull r14,r12,r11,r0
+
+# qhasm: lotmp hitmp += lou0 * two24
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou0=int32#11,<two24=int32#1
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou0=r10,<two24=r0
+umlal r12,r14,r10,r0
+
+# qhasm: carry? lotmp ^= (lou0 unsigned>> 1)
+# asm 1: eors >lotmp=int32#14,<lotmp=int32#14,<lou0=int32#11,LSR #1
+# asm 2: eors >lotmp=r14,<lotmp=r14,<lou0=r10,LSR #1
+eors r14,r14,r10,LSR #1
+
+# qhasm: hitmp ^= (carry,hiu0 unsigned>> 1)
+# asm 1: eors >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#12,RRX
+# asm 2: eors >hitmp=r12,<hitmp=r12,<hiu0=r11,RRX
+eors r12,r12,r11,RRX
+
+# qhasm: lotmp ^= (hiu0 << 31)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu0=int32#12,LSL #31
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu0=r11,LSL #31
+eor r14,r14,r11,LSL #31
+
+# qhasm: lotmp ^= (lou0 unsigned>>7)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou0=int32#11,LSR #7
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou0=r10,LSR #7
+eor r14,r14,r10,LSR #7
+
+# qhasm: lotmp ^= (hiu0 << 25)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu0=int32#12,LSL #25
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu0=r11,LSL #25
+eor r14,r14,r11,LSL #25
+
+# qhasm: hitmp ^= (hiu0 unsigned>>7)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#12,LSR #7
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r11,LSR #7
+eor r12,r12,r11,LSR #7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#4,<lou4=int32#4,<lotmp=int32#14
+# asm 2: adds >lou4=r3,<lou4=r3,<lotmp=r14
+adds r3,r3,r14
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hitmp=int32#13
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hitmp=r12
+adc r2,r2,r12
+
+# qhasm: lou5 = lom11
+# asm 1: ldr >lou5=int32#13,<lom11=stack32#59
+# asm 2: ldr >lou5=r12,<lom11=[sp,#232]
+# copy-collector input: ldr r12,[sp,#232]
+
+# qhasm: hiu5 = him11
+# asm 1: ldr >hiu5=int32#14,<him11=stack32#60
+# asm 2: ldr >hiu5=r14,<him11=[sp,#236]
+# copy-collector input: ldr r14,[sp,#236]
+
+# qhasm: carry? lou4 += lou5
+# asm 1: adds >lou4=int32#4,<lou4=int32#4,<lou5=int32#13
+# asm 2: adds >lou4=r3,<lou4=r3,<lou5=r12
+# copy-collector output starts
+ldrd r12,r14,[sp,#232]
+# copy-collector output ends
+adds r3,r3,r12
+
+# qhasm: hiu4 += hiu5 + carry
+# asm 1: adc >hiu4=int32#3,<hiu4=int32#3,<hiu5=int32#14
+# asm 2: adc >hiu4=r2,<hiu4=r2,<hiu5=r14
+adc r2,r2,r14
+
+# qhasm: lod11 = lou5
+# asm 1: str <lou5=int32#13,>lod11=stack32#43
+# asm 2: str <lou5=r12,>lod11=[sp,#168]
+# copy-collector input: str r12,[sp,#168]
+
+# qhasm: hid11 = hiu5
+# asm 1: str <hiu5=int32#14,>hid11=stack32#44
+# asm 2: str <hiu5=r14,>hid11=[sp,#172]
+# copy-collector input: str r14,[sp,#172]
+
+# qhasm: lom10 = lou4
+# asm 1: str <lou4=int32#4,>lom10=stack32#57
+# asm 2: str <lou4=r3,>lom10=[sp,#224]
+# copy-collector input: str r3,[sp,#224]
+
+# qhasm: him10 = hiu4
+# asm 1: str <hiu4=int32#3,>him10=stack32#58
+# asm 2: str <hiu4=r2,>him10=[sp,#228]
+# copy-collector input: str r2,[sp,#228]
+
+# qhasm: hitmp lotmp = hiu1 * two13
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu1=int32#6,<two13=int32#2
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu1=r5,<two13=r1
+# copy-collector output starts
+strd r12,r14,[sp,#168]
+strd r3,r2,[sp,#224]
+# copy-collector output ends
+umull r14,r12,r5,r1
+
+# qhasm: lotmp hitmp += lou1 * two13
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou1=int32#5,<two13=int32#2
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou1=r4,<two13=r1
+umlal r12,r14,r4,r1
+
+# qhasm: lotmp ^= (lou1 unsigned>> 6)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou1=int32#5,LSR #6
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou1=r4,LSR #6
+eor r14,r14,r4,LSR #6
+
+# qhasm: lotmp ^= (hiu1 << 26)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu1=int32#6,LSL #26
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu1=r5,LSL #26
+eor r14,r14,r5,LSL #26
+
+# qhasm: lotmp ^= (hiu1 unsigned>> 29)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu1=int32#6,LSR #29
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu1=r5,LSR #29
+eor r14,r14,r5,LSR #29
+
+# qhasm: lotmp ^= (lou1 << 3)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou1=int32#5,LSL #3
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou1=r4,LSL #3
+eor r14,r14,r4,LSL #3
+
+# qhasm: hitmp ^= (hiu1 unsigned>> 6)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu1=int32#6,LSR #6
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu1=r5,LSR #6
+eor r12,r12,r5,LSR #6
+
+# qhasm: hitmp ^= (lou1 unsigned>> 29)
+# asm 1: eor >hitmp=int32#5,<hitmp=int32#13,<lou1=int32#5,LSR #29
+# asm 2: eor >hitmp=r4,<hitmp=r12,<lou1=r4,LSR #29
+eor r4,r12,r4,LSR #29
+
+# qhasm: hitmp ^= (hiu1 << 3)
+# asm 1: eor >hitmp=int32#5,<hitmp=int32#5,<hiu1=int32#6,LSL #3
+# asm 2: eor >hitmp=r4,<hitmp=r4,<hiu1=r5,LSL #3
+eor r4,r4,r5,LSL #3
+
+# qhasm: carry? lou0 += lotmp
+# asm 1: adds >lou0=int32#6,<lou0=int32#11,<lotmp=int32#14
+# asm 2: adds >lou0=r5,<lou0=r10,<lotmp=r14
+adds r5,r10,r14
+
+# qhasm: hiu0 += hitmp + carry
+# asm 1: adc >hiu0=int32#5,<hiu0=int32#12,<hitmp=int32#5
+# asm 2: adc >hiu0=r4,<hiu0=r11,<hitmp=r4
+adc r4,r11,r4
+
+# qhasm: lou1 = lod12
+# asm 1: ldr >lou1=int32#11,<lod12=stack32#45
+# asm 2: ldr >lou1=r10,<lod12=[sp,#176]
+# copy-collector input: ldr r10,[sp,#176]
+
+# qhasm: hiu1 = hid12
+# asm 1: ldr >hiu1=int32#12,<hid12=stack32#46
+# asm 2: ldr >hiu1=r11,<hid12=[sp,#180]
+# copy-collector input: ldr r11,[sp,#180]
+
+# qhasm: hitmp lotmp = hiu1 * two24
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu1=int32#12,<two24=int32#1
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu1=r11,<two24=r0
+# copy-collector output starts
+ldrd r10,r11,[sp,#176]
+# copy-collector output ends
+umull r14,r12,r11,r0
+
+# qhasm: lotmp hitmp += lou1 * two24
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou1=int32#11,<two24=int32#1
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou1=r10,<two24=r0
+umlal r12,r14,r10,r0
+
+# qhasm: carry? lotmp ^= (lou1 unsigned>> 1)
+# asm 1: eors >lotmp=int32#14,<lotmp=int32#14,<lou1=int32#11,LSR #1
+# asm 2: eors >lotmp=r14,<lotmp=r14,<lou1=r10,LSR #1
+eors r14,r14,r10,LSR #1
+
+# qhasm: hitmp ^= (carry,hiu1 unsigned>> 1)
+# asm 1: eors >hitmp=int32#13,<hitmp=int32#13,<hiu1=int32#12,RRX
+# asm 2: eors >hitmp=r12,<hitmp=r12,<hiu1=r11,RRX
+eors r12,r12,r11,RRX
+
+# qhasm: lotmp ^= (hiu1 << 31)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu1=int32#12,LSL #31
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu1=r11,LSL #31
+eor r14,r14,r11,LSL #31
+
+# qhasm: lotmp ^= (lou1 unsigned>>7)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou1=int32#11,LSR #7
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou1=r10,LSR #7
+eor r14,r14,r10,LSR #7
+
+# qhasm: lotmp ^= (hiu1 << 25)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu1=int32#12,LSL #25
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu1=r11,LSL #25
+eor r14,r14,r11,LSL #25
+
+# qhasm: hitmp ^= (hiu1 unsigned>>7)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu1=int32#12,LSR #7
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu1=r11,LSR #7
+eor r12,r12,r11,LSR #7
+
+# qhasm: carry? lou0 += lotmp
+# asm 1: adds >lou0=int32#6,<lou0=int32#6,<lotmp=int32#14
+# asm 2: adds >lou0=r5,<lou0=r5,<lotmp=r14
+adds r5,r5,r14
+
+# qhasm: hiu0 += hitmp + carry
+# asm 1: adc >hiu0=int32#5,<hiu0=int32#5,<hitmp=int32#13
+# asm 2: adc >hiu0=r4,<hiu0=r4,<hitmp=r12
+adc r4,r4,r12
+
+# qhasm: lou5 = lom12
+# asm 1: ldr >lou5=int32#13,<lom12=stack32#61
+# asm 2: ldr >lou5=r12,<lom12=[sp,#240]
+# copy-collector input: ldr r12,[sp,#240]
+
+# qhasm: hiu5 = him12
+# asm 1: ldr >hiu5=int32#14,<him12=stack32#62
+# asm 2: ldr >hiu5=r14,<him12=[sp,#244]
+# copy-collector input: ldr r14,[sp,#244]
+
+# qhasm: carry? lou0 += lou5
+# asm 1: adds >lou0=int32#6,<lou0=int32#6,<lou5=int32#13
+# asm 2: adds >lou0=r5,<lou0=r5,<lou5=r12
+# copy-collector output starts
+ldrd r12,r14,[sp,#240]
+# copy-collector output ends
+adds r5,r5,r12
+
+# qhasm: hiu0 += hiu5 + carry
+# asm 1: adc >hiu0=int32#5,<hiu0=int32#5,<hiu5=int32#14
+# asm 2: adc >hiu0=r4,<hiu0=r4,<hiu5=r14
+adc r4,r4,r14
+
+# qhasm: lod12 = lou5
+# asm 1: str <lou5=int32#13,>lod12=stack32#45
+# asm 2: str <lou5=r12,>lod12=[sp,#176]
+# copy-collector input: str r12,[sp,#176]
+
+# qhasm: hid12 = hiu5
+# asm 1: str <hiu5=int32#14,>hid12=stack32#46
+# asm 2: str <hiu5=r14,>hid12=[sp,#180]
+# copy-collector input: str r14,[sp,#180]
+
+# qhasm: lom11 = lou0
+# asm 1: str <lou0=int32#6,>lom11=stack32#59
+# asm 2: str <lou0=r5,>lom11=[sp,#232]
+# copy-collector input: str r5,[sp,#232]
+
+# qhasm: him11 = hiu0
+# asm 1: str <hiu0=int32#5,>him11=stack32#60
+# asm 2: str <hiu0=r4,>him11=[sp,#236]
+# copy-collector input: str r4,[sp,#236]
+
+# qhasm: hitmp lotmp = hiu4 * two13
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu4=int32#3,<two13=int32#2
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu4=r2,<two13=r1
+# copy-collector output starts
+strd r12,r14,[sp,#176]
+strd r5,r4,[sp,#232]
+# copy-collector output ends
+umull r14,r12,r2,r1
+
+# qhasm: lotmp hitmp += lou4 * two13
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou4=int32#4,<two13=int32#2
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou4=r3,<two13=r1
+umlal r12,r14,r3,r1
+
+# qhasm: lotmp ^= (lou4 unsigned>> 6)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou4=int32#4,LSR #6
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou4=r3,LSR #6
+eor r14,r14,r3,LSR #6
+
+# qhasm: lotmp ^= (hiu4 << 26)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu4=int32#3,LSL #26
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu4=r2,LSL #26
+eor r14,r14,r2,LSL #26
+
+# qhasm: lotmp ^= (hiu4 unsigned>> 29)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu4=int32#3,LSR #29
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu4=r2,LSR #29
+eor r14,r14,r2,LSR #29
+
+# qhasm: lotmp ^= (lou4 << 3)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou4=int32#4,LSL #3
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou4=r3,LSL #3
+eor r14,r14,r3,LSL #3
+
+# qhasm: hitmp ^= (hiu4 unsigned>> 6)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu4=int32#3,LSR #6
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu4=r2,LSR #6
+eor r12,r12,r2,LSR #6
+
+# qhasm: hitmp ^= (lou4 unsigned>> 29)
+# asm 1: eor >hitmp=int32#4,<hitmp=int32#13,<lou4=int32#4,LSR #29
+# asm 2: eor >hitmp=r3,<hitmp=r12,<lou4=r3,LSR #29
+eor r3,r12,r3,LSR #29
+
+# qhasm: hitmp ^= (hiu4 << 3)
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#4,<hiu4=int32#3,LSL #3
+# asm 2: eor >hitmp=r2,<hitmp=r3,<hiu4=r2,LSL #3
+eor r2,r3,r2,LSL #3
+
+# qhasm: carry? lou1 += lotmp
+# asm 1: adds >lou1=int32#4,<lou1=int32#11,<lotmp=int32#14
+# asm 2: adds >lou1=r3,<lou1=r10,<lotmp=r14
+adds r3,r10,r14
+
+# qhasm: hiu1 += hitmp + carry
+# asm 1: adc >hiu1=int32#3,<hiu1=int32#12,<hitmp=int32#3
+# asm 2: adc >hiu1=r2,<hiu1=r11,<hitmp=r2
+adc r2,r11,r2
+
+# qhasm: lou4 = lod13
+# asm 1: ldr >lou4=int32#11,<lod13=stack32#47
+# asm 2: ldr >lou4=r10,<lod13=[sp,#184]
+# copy-collector input: ldr r10,[sp,#184]
+
+# qhasm: hiu4 = hid13
+# asm 1: ldr >hiu4=int32#12,<hid13=stack32#48
+# asm 2: ldr >hiu4=r11,<hid13=[sp,#188]
+# copy-collector input: ldr r11,[sp,#188]
+
+# qhasm: hitmp lotmp = hiu4 * two24
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu4=int32#12,<two24=int32#1
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu4=r11,<two24=r0
+# copy-collector output starts
+ldrd r10,r11,[sp,#184]
+# copy-collector output ends
+umull r14,r12,r11,r0
+
+# qhasm: lotmp hitmp += lou4 * two24
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou4=int32#11,<two24=int32#1
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou4=r10,<two24=r0
+umlal r12,r14,r10,r0
+
+# qhasm: carry? lotmp ^= (lou4 unsigned>> 1)
+# asm 1: eors >lotmp=int32#14,<lotmp=int32#14,<lou4=int32#11,LSR #1
+# asm 2: eors >lotmp=r14,<lotmp=r14,<lou4=r10,LSR #1
+eors r14,r14,r10,LSR #1
+
+# qhasm: hitmp ^= (carry,hiu4 unsigned>> 1)
+# asm 1: eors >hitmp=int32#13,<hitmp=int32#13,<hiu4=int32#12,RRX
+# asm 2: eors >hitmp=r12,<hitmp=r12,<hiu4=r11,RRX
+eors r12,r12,r11,RRX
+
+# qhasm: lotmp ^= (hiu4 << 31)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu4=int32#12,LSL #31
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu4=r11,LSL #31
+eor r14,r14,r11,LSL #31
+
+# qhasm: lotmp ^= (lou4 unsigned>>7)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou4=int32#11,LSR #7
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou4=r10,LSR #7
+eor r14,r14,r10,LSR #7
+
+# qhasm: lotmp ^= (hiu4 << 25)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu4=int32#12,LSL #25
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu4=r11,LSL #25
+eor r14,r14,r11,LSL #25
+
+# qhasm: hitmp ^= (hiu4 unsigned>>7)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu4=int32#12,LSR #7
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu4=r11,LSR #7
+eor r12,r12,r11,LSR #7
+
+# qhasm: carry? lou1 += lotmp
+# asm 1: adds >lou1=int32#4,<lou1=int32#4,<lotmp=int32#14
+# asm 2: adds >lou1=r3,<lou1=r3,<lotmp=r14
+adds r3,r3,r14
+
+# qhasm: hiu1 += hitmp + carry
+# asm 1: adc >hiu1=int32#3,<hiu1=int32#3,<hitmp=int32#13
+# asm 2: adc >hiu1=r2,<hiu1=r2,<hitmp=r12
+adc r2,r2,r12
+
+# qhasm: lou5 = lom13
+# asm 1: ldr >lou5=int32#13,<lom13=stack32#63
+# asm 2: ldr >lou5=r12,<lom13=[sp,#248]
+# copy-collector input: ldr r12,[sp,#248]
+
+# qhasm: hiu5 = him13
+# asm 1: ldr >hiu5=int32#14,<him13=stack32#64
+# asm 2: ldr >hiu5=r14,<him13=[sp,#252]
+# copy-collector input: ldr r14,[sp,#252]
+
+# qhasm: carry? lou1 += lou5
+# asm 1: adds >lou1=int32#4,<lou1=int32#4,<lou5=int32#13
+# asm 2: adds >lou1=r3,<lou1=r3,<lou5=r12
+# copy-collector output starts
+ldrd r12,r14,[sp,#248]
+# copy-collector output ends
+adds r3,r3,r12
+
+# qhasm: hiu1 += hiu5 + carry
+# asm 1: adc >hiu1=int32#3,<hiu1=int32#3,<hiu5=int32#14
+# asm 2: adc >hiu1=r2,<hiu1=r2,<hiu5=r14
+adc r2,r2,r14
+
+# qhasm: lod13 = lou5
+# asm 1: str <lou5=int32#13,>lod13=stack32#47
+# asm 2: str <lou5=r12,>lod13=[sp,#184]
+# copy-collector input: str r12,[sp,#184]
+
+# qhasm: hid13 = hiu5
+# asm 1: str <hiu5=int32#14,>hid13=stack32#48
+# asm 2: str <hiu5=r14,>hid13=[sp,#188]
+# copy-collector input: str r14,[sp,#188]
+
+# qhasm: lom12 = lou1
+# asm 1: str <lou1=int32#4,>lom12=stack32#61
+# asm 2: str <lou1=r3,>lom12=[sp,#240]
+# copy-collector input: str r3,[sp,#240]
+
+# qhasm: him12 = hiu1
+# asm 1: str <hiu1=int32#3,>him12=stack32#62
+# asm 2: str <hiu1=r2,>him12=[sp,#244]
+# copy-collector input: str r2,[sp,#244]
+
+# qhasm: hitmp lotmp = hiu0 * two13
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu0=int32#5,<two13=int32#2
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu0=r4,<two13=r1
+# copy-collector output starts
+strd r12,r14,[sp,#184]
+strd r3,r2,[sp,#240]
+# copy-collector output ends
+umull r14,r12,r4,r1
+
+# qhasm: lotmp hitmp += lou0 * two13
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou0=int32#6,<two13=int32#2
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou0=r5,<two13=r1
+umlal r12,r14,r5,r1
+
+# qhasm: lotmp ^= (lou0 unsigned>> 6)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou0=int32#6,LSR #6
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou0=r5,LSR #6
+eor r14,r14,r5,LSR #6
+
+# qhasm: lotmp ^= (hiu0 << 26)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu0=int32#5,LSL #26
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu0=r4,LSL #26
+eor r14,r14,r4,LSL #26
+
+# qhasm: lotmp ^= (hiu0 unsigned>> 29)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu0=int32#5,LSR #29
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu0=r4,LSR #29
+eor r14,r14,r4,LSR #29
+
+# qhasm: lotmp ^= (lou0 << 3)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou0=int32#6,LSL #3
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou0=r5,LSL #3
+eor r14,r14,r5,LSL #3
+
+# qhasm: hitmp ^= (hiu0 unsigned>> 6)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#5,LSR #6
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r4,LSR #6
+eor r12,r12,r4,LSR #6
+
+# qhasm: hitmp ^= (lou0 unsigned>> 29)
+# asm 1: eor >hitmp=int32#6,<hitmp=int32#13,<lou0=int32#6,LSR #29
+# asm 2: eor >hitmp=r5,<hitmp=r12,<lou0=r5,LSR #29
+eor r5,r12,r5,LSR #29
+
+# qhasm: hitmp ^= (hiu0 << 3)
+# asm 1: eor >hitmp=int32#5,<hitmp=int32#6,<hiu0=int32#5,LSL #3
+# asm 2: eor >hitmp=r4,<hitmp=r5,<hiu0=r4,LSL #3
+eor r4,r5,r4,LSL #3
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#6,<lou4=int32#11,<lotmp=int32#14
+# asm 2: adds >lou4=r5,<lou4=r10,<lotmp=r14
+adds r5,r10,r14
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#5,<hiu4=int32#12,<hitmp=int32#5
+# asm 2: adc >hiu4=r4,<hiu4=r11,<hitmp=r4
+adc r4,r11,r4
+
+# qhasm: lou0 = lod14
+# asm 1: ldr >lou0=int32#11,<lod14=stack32#49
+# asm 2: ldr >lou0=r10,<lod14=[sp,#192]
+# copy-collector input: ldr r10,[sp,#192]
+
+# qhasm: hiu0 = hid14
+# asm 1: ldr >hiu0=int32#12,<hid14=stack32#50
+# asm 2: ldr >hiu0=r11,<hid14=[sp,#196]
+# copy-collector input: ldr r11,[sp,#196]
+
+# qhasm: lod14 = lou2
+# asm 1: str <lou2=int32#7,>lod14=stack32#49
+# asm 2: str <lou2=r6,>lod14=[sp,#192]
+# copy-collector input: str r6,[sp,#192]
+
+# qhasm: hid14 = hiu2
+# asm 1: str <hiu2=int32#8,>hid14=stack32#50
+# asm 2: str <hiu2=r7,>hid14=[sp,#196]
+# copy-collector input: str r7,[sp,#196]
+
+# qhasm: hitmp lotmp = hiu0 * two24
+# asm 1: umull >lotmp=int32#14,>hitmp=int32#13,<hiu0=int32#12,<two24=int32#1
+# asm 2: umull >lotmp=r14,>hitmp=r12,<hiu0=r11,<two24=r0
+# copy-collector output starts
+ldrd r10,r11,[sp,#192]
+strd r6,r7,[sp,#192]
+# copy-collector output ends
+umull r14,r12,r11,r0
+
+# qhasm: lotmp hitmp += lou0 * two24
+# asm 1: umlal <hitmp=int32#13,<lotmp=int32#14,<lou0=int32#11,<two24=int32#1
+# asm 2: umlal <hitmp=r12,<lotmp=r14,<lou0=r10,<two24=r0
+umlal r12,r14,r10,r0
+
+# qhasm: carry? lotmp ^= (lou0 unsigned>> 1)
+# asm 1: eors >lotmp=int32#14,<lotmp=int32#14,<lou0=int32#11,LSR #1
+# asm 2: eors >lotmp=r14,<lotmp=r14,<lou0=r10,LSR #1
+eors r14,r14,r10,LSR #1
+
+# qhasm: hitmp ^= (carry,hiu0 unsigned>> 1)
+# asm 1: eors >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#12,RRX
+# asm 2: eors >hitmp=r12,<hitmp=r12,<hiu0=r11,RRX
+eors r12,r12,r11,RRX
+
+# qhasm: lotmp ^= (hiu0 << 31)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu0=int32#12,LSL #31
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu0=r11,LSL #31
+eor r14,r14,r11,LSL #31
+
+# qhasm: lotmp ^= (lou0 unsigned>>7)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<lou0=int32#11,LSR #7
+# asm 2: eor >lotmp=r14,<lotmp=r14,<lou0=r10,LSR #7
+eor r14,r14,r10,LSR #7
+
+# qhasm: lotmp ^= (hiu0 << 25)
+# asm 1: eor >lotmp=int32#14,<lotmp=int32#14,<hiu0=int32#12,LSL #25
+# asm 2: eor >lotmp=r14,<lotmp=r14,<hiu0=r11,LSL #25
+eor r14,r14,r11,LSL #25
+
+# qhasm: hitmp ^= (hiu0 unsigned>>7)
+# asm 1: eor >hitmp=int32#13,<hitmp=int32#13,<hiu0=int32#12,LSR #7
+# asm 2: eor >hitmp=r12,<hitmp=r12,<hiu0=r11,LSR #7
+eor r12,r12,r11,LSR #7
+
+# qhasm: carry? lou4 += lotmp
+# asm 1: adds >lou4=int32#6,<lou4=int32#6,<lotmp=int32#14
+# asm 2: adds >lou4=r5,<lou4=r5,<lotmp=r14
+adds r5,r5,r14
+
+# qhasm: hiu4 += hitmp + carry
+# asm 1: adc >hiu4=int32#5,<hiu4=int32#5,<hitmp=int32#13
+# asm 2: adc >hiu4=r4,<hiu4=r4,<hitmp=r12
+adc r4,r4,r12
+
+# qhasm: carry? lou4 += lou2
+# asm 1: adds >lou4=int32#6,<lou4=int32#6,<lou2=int32#7
+# asm 2: adds >lou4=r5,<lou4=r5,<lou2=r6
+adds.w r5,r5,r6
+
+# qhasm: hiu4 += hiu2 + carry
+# asm 1: adc >hiu4=int32#5,<hiu4=int32#5,<hiu2=int32#8
+# asm 2: adc >hiu4=r4,<hiu4=r4,<hiu2=r7
+adc r4,r4,r7
+
+# qhasm: lom13 = lou4
+# asm 1: str <lou4=int32#6,>lom13=stack32#63
+# asm 2: str <lou4=r5,>lom13=[sp,#248]
+# copy-collector input: str r5,[sp,#248]
+
+# qhasm: him13 = hiu4
+# asm 1: str <hiu4=int32#5,>him13=stack32#64
+# asm 2: str <hiu4=r4,>him13=[sp,#252]
+# copy-collector input: str r4,[sp,#252]
+
+# qhasm: hitmp lotmp = hiu1 * two13
+# asm 1: umull >lotmp=int32#8,>hitmp=int32#7,<hiu1=int32#3,<two13=int32#2
+# asm 2: umull >lotmp=r7,>hitmp=r6,<hiu1=r2,<two13=r1
+# copy-collector output starts
+strd r5,r4,[sp,#248]
+# copy-collector output ends
+umull r7,r6,r2,r1
+
+# qhasm: lotmp hitmp += lou1 * two13
+# asm 1: umlal <hitmp=int32#7,<lotmp=int32#8,<lou1=int32#4,<two13=int32#2
+# asm 2: umlal <hitmp=r6,<lotmp=r7,<lou1=r3,<two13=r1
+umlal r6,r7,r3,r1
+
+# qhasm: lotmp ^= (lou1 unsigned>> 6)
+# asm 1: eor >lotmp=int32#8,<lotmp=int32#8,<lou1=int32#4,LSR #6
+# asm 2: eor >lotmp=r7,<lotmp=r7,<lou1=r3,LSR #6
+eor r7,r7,r3,LSR #6
+
+# qhasm: lotmp ^= (hiu1 << 26)
+# asm 1: eor >lotmp=int32#8,<lotmp=int32#8,<hiu1=int32#3,LSL #26
+# asm 2: eor >lotmp=r7,<lotmp=r7,<hiu1=r2,LSL #26
+eor r7,r7,r2,LSL #26
+
+# qhasm: lotmp ^= (hiu1 unsigned>> 29)
+# asm 1: eor >lotmp=int32#8,<lotmp=int32#8,<hiu1=int32#3,LSR #29
+# asm 2: eor >lotmp=r7,<lotmp=r7,<hiu1=r2,LSR #29
+eor r7,r7,r2,LSR #29
+
+# qhasm: lotmp ^= (lou1 << 3)
+# asm 1: eor >lotmp=int32#8,<lotmp=int32#8,<lou1=int32#4,LSL #3
+# asm 2: eor >lotmp=r7,<lotmp=r7,<lou1=r3,LSL #3
+eor r7,r7,r3,LSL #3
+
+# qhasm: hitmp ^= (hiu1 unsigned>> 6)
+# asm 1: eor >hitmp=int32#7,<hitmp=int32#7,<hiu1=int32#3,LSR #6
+# asm 2: eor >hitmp=r6,<hitmp=r6,<hiu1=r2,LSR #6
+eor r6,r6,r2,LSR #6
+
+# qhasm: hitmp ^= (lou1 unsigned>> 29)
+# asm 1: eor >hitmp=int32#4,<hitmp=int32#7,<lou1=int32#4,LSR #29
+# asm 2: eor >hitmp=r3,<hitmp=r6,<lou1=r3,LSR #29
+eor r3,r6,r3,LSR #29
+
+# qhasm: hitmp ^= (hiu1 << 3)
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#4,<hiu1=int32#3,LSL #3
+# asm 2: eor >hitmp=r2,<hitmp=r3,<hiu1=r2,LSL #3
+eor r2,r3,r2,LSL #3
+
+# qhasm: carry? lou0 += lotmp
+# asm 1: adds >lou0=int32#4,<lou0=int32#11,<lotmp=int32#8
+# asm 2: adds >lou0=r3,<lou0=r10,<lotmp=r7
+adds r3,r10,r7
+
+# qhasm: hiu0 += hitmp + carry
+# asm 1: adc >hiu0=int32#3,<hiu0=int32#12,<hitmp=int32#3
+# asm 2: adc >hiu0=r2,<hiu0=r11,<hitmp=r2
+adc r2,r11,r2
+
+# qhasm: lou1 = lod15
+# asm 1: ldr >lou1=int32#7,<lod15=stack32#51
+# asm 2: ldr >lou1=r6,<lod15=[sp,#200]
+# copy-collector input: ldr r6,[sp,#200]
+
+# qhasm: hiu1 = hid15
+# asm 1: ldr >hiu1=int32#8,<hid15=stack32#52
+# asm 2: ldr >hiu1=r7,<hid15=[sp,#204]
+# copy-collector input: ldr r7,[sp,#204]
+
+# qhasm: lod15 = lou3
+# asm 1: str <lou3=int32#9,>lod15=stack32#51
+# asm 2: str <lou3=r8,>lod15=[sp,#200]
+# copy-collector input: str r8,[sp,#200]
+
+# qhasm: hid15 = hiu3
+# asm 1: str <hiu3=int32#10,>hid15=stack32#52
+# asm 2: str <hiu3=r9,>hid15=[sp,#204]
+# copy-collector input: str r9,[sp,#204]
+
+# qhasm: hitmp lotmp = hiu1 * two24
+# asm 1: umull >lotmp=int32#12,>hitmp=int32#11,<hiu1=int32#8,<two24=int32#1
+# asm 2: umull >lotmp=r11,>hitmp=r10,<hiu1=r7,<two24=r0
+# copy-collector output starts
+ldrd r6,r7,[sp,#200]
+strd r8,r9,[sp,#200]
+# copy-collector output ends
+umull r11,r10,r7,r0
+
+# qhasm: lotmp hitmp += lou1 * two24
+# asm 1: umlal <hitmp=int32#11,<lotmp=int32#12,<lou1=int32#7,<two24=int32#1
+# asm 2: umlal <hitmp=r10,<lotmp=r11,<lou1=r6,<two24=r0
+umlal r10,r11,r6,r0
+
+# qhasm: carry? lotmp ^= (lou1 unsigned>> 1)
+# asm 1: eors >lotmp=int32#12,<lotmp=int32#12,<lou1=int32#7,LSR #1
+# asm 2: eors >lotmp=r11,<lotmp=r11,<lou1=r6,LSR #1
+eors r11,r11,r6,LSR #1
+
+# qhasm: hitmp ^= (carry,hiu1 unsigned>> 1)
+# asm 1: eors >hitmp=int32#11,<hitmp=int32#11,<hiu1=int32#8,RRX
+# asm 2: eors >hitmp=r10,<hitmp=r10,<hiu1=r7,RRX
+eors r10,r10,r7,RRX
+
+# qhasm: lotmp ^= (hiu1 << 31)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu1=int32#8,LSL #31
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu1=r7,LSL #31
+eor r11,r11,r7,LSL #31
+
+# qhasm: lotmp ^= (lou1 unsigned>>7)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<lou1=int32#7,LSR #7
+# asm 2: eor >lotmp=r11,<lotmp=r11,<lou1=r6,LSR #7
+eor r11,r11,r6,LSR #7
+
+# qhasm: lotmp ^= (hiu1 << 25)
+# asm 1: eor >lotmp=int32#12,<lotmp=int32#12,<hiu1=int32#8,LSL #25
+# asm 2: eor >lotmp=r11,<lotmp=r11,<hiu1=r7,LSL #25
+eor r11,r11,r7,LSL #25
+
+# qhasm: hitmp ^= (hiu1 unsigned>>7)
+# asm 1: eor >hitmp=int32#11,<hitmp=int32#11,<hiu1=int32#8,LSR #7
+# asm 2: eor >hitmp=r10,<hitmp=r10,<hiu1=r7,LSR #7
+eor r10,r10,r7,LSR #7
+
+# qhasm: carry? lou0 += lotmp
+# asm 1: adds >lou0=int32#4,<lou0=int32#4,<lotmp=int32#12
+# asm 2: adds >lou0=r3,<lou0=r3,<lotmp=r11
+adds r3,r3,r11
+
+# qhasm: hiu0 += hitmp + carry
+# asm 1: adc >hiu0=int32#3,<hiu0=int32#3,<hitmp=int32#11
+# asm 2: adc >hiu0=r2,<hiu0=r2,<hitmp=r10
+adc r2,r2,r10
+
+# qhasm: carry? lou0 += lou3
+# asm 1: adds >lou0=int32#4,<lou0=int32#4,<lou3=int32#9
+# asm 2: adds >lou0=r3,<lou0=r3,<lou3=r8
+adds r3,r3,r8
+
+# qhasm: hiu0 += hiu3 + carry
+# asm 1: adc >hiu0=int32#3,<hiu0=int32#3,<hiu3=int32#10
+# asm 2: adc >hiu0=r2,<hiu0=r2,<hiu3=r9
+adc r2,r2,r9
+
+# qhasm: lom14 = lou0
+# asm 1: str <lou0=int32#4,>lom14=stack32#65
+# asm 2: str <lou0=r3,>lom14=[sp,#256]
+# copy-collector input: str r3,[sp,#256]
+
+# qhasm: him14 = hiu0
+# asm 1: str <hiu0=int32#3,>him14=stack32#66
+# asm 2: str <hiu0=r2,>him14=[sp,#260]
+# copy-collector input: str r2,[sp,#260]
+
+# qhasm: hitmp lotmp = hiu4 * two13
+# asm 1: umull >lotmp=int32#4,>hitmp=int32#3,<hiu4=int32#5,<two13=int32#2
+# asm 2: umull >lotmp=r3,>hitmp=r2,<hiu4=r4,<two13=r1
+# copy-collector output starts
+strd r3,r2,[sp,#256]
+# copy-collector output ends
+umull r3,r2,r4,r1
+
+# qhasm: lotmp hitmp += lou4 * two13
+# asm 1: umlal <hitmp=int32#3,<lotmp=int32#4,<lou4=int32#6,<two13=int32#2
+# asm 2: umlal <hitmp=r2,<lotmp=r3,<lou4=r5,<two13=r1
+umlal r2,r3,r5,r1
+
+# qhasm: lotmp ^= (lou4 unsigned>> 6)
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#4,<lou4=int32#6,LSR #6
+# asm 2: eor >lotmp=r1,<lotmp=r3,<lou4=r5,LSR #6
+eor r1,r3,r5,LSR #6
+
+# qhasm: lotmp ^= (hiu4 << 26)
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<hiu4=int32#5,LSL #26
+# asm 2: eor >lotmp=r1,<lotmp=r1,<hiu4=r4,LSL #26
+eor r1,r1,r4,LSL #26
+
+# qhasm: lotmp ^= (hiu4 unsigned>> 29)
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<hiu4=int32#5,LSR #29
+# asm 2: eor >lotmp=r1,<lotmp=r1,<hiu4=r4,LSR #29
+eor r1,r1,r4,LSR #29
+
+# qhasm: lotmp ^= (lou4 << 3)
+# asm 1: eor >lotmp=int32#2,<lotmp=int32#2,<lou4=int32#6,LSL #3
+# asm 2: eor >lotmp=r1,<lotmp=r1,<lou4=r5,LSL #3
+eor r1,r1,r5,LSL #3
+
+# qhasm: hitmp ^= (hiu4 unsigned>> 6)
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hiu4=int32#5,LSR #6
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hiu4=r4,LSR #6
+eor r2,r2,r4,LSR #6
+
+# qhasm: hitmp ^= (lou4 unsigned>> 29)
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<lou4=int32#6,LSR #29
+# asm 2: eor >hitmp=r2,<hitmp=r2,<lou4=r5,LSR #29
+eor r2,r2,r5,LSR #29
+
+# qhasm: hitmp ^= (hiu4 << 3)
+# asm 1: eor >hitmp=int32#3,<hitmp=int32#3,<hiu4=int32#5,LSL #3
+# asm 2: eor >hitmp=r2,<hitmp=r2,<hiu4=r4,LSL #3
+eor r2,r2,r4,LSL #3
+
+# qhasm: carry? lou1 += lotmp
+# asm 1: adds >lou1=int32#2,<lou1=int32#7,<lotmp=int32#2
+# asm 2: adds >lou1=r1,<lou1=r6,<lotmp=r1
+adds r1,r6,r1
+
+# qhasm: hiu1 += hitmp + carry
+# asm 1: adc >hiu1=int32#3,<hiu1=int32#8,<hitmp=int32#3
+# asm 2: adc >hiu1=r2,<hiu1=r7,<hitmp=r2
+adc r2,r7,r2
+
+# qhasm: lou0 = lom8
+# asm 1: ldr >lou0=int32#4,<lom8=stack32#53
+# asm 2: ldr >lou0=r3,<lom8=[sp,#208]
+# copy-collector input: ldr r3,[sp,#208]
+
+# qhasm: hiu0 = him8
+# asm 1: ldr >hiu0=int32#5,<him8=stack32#54
+# asm 2: ldr >hiu0=r4,<him8=[sp,#212]
+# copy-collector input: ldr r4,[sp,#212]
+
+# qhasm: lou2 = lom15
+# asm 1: ldr >lou2=int32#6,<lom15=stack32#37
+# asm 2: ldr >lou2=r5,<lom15=[sp,#144]
+# copy-collector input: ldr r5,[sp,#144]
+
+# qhasm: hiu2 = him15
+# asm 1: ldr >hiu2=int32#7,<him15=stack32#38
+# asm 2: ldr >hiu2=r6,<him15=[sp,#148]
+# copy-collector input: ldr r6,[sp,#148]
+
+# qhasm: lom8 = lou2
+# asm 1: str <lou2=int32#6,>lom8=stack32#53
+# asm 2: str <lou2=r5,>lom8=[sp,#208]
+# copy-collector input: str r5,[sp,#208]
+
+# qhasm: him8 = hiu2
+# asm 1: str <hiu2=int32#7,>him8=stack32#54
+# asm 2: str <hiu2=r6,>him8=[sp,#212]
+# copy-collector input: str r6,[sp,#212]
+
+# qhasm: hitmp lotmp = hiu0 * two24
+# asm 1: umull >lotmp=int32#9,>hitmp=int32#8,<hiu0=int32#5,<two24=int32#1
+# asm 2: umull >lotmp=r8,>hitmp=r7,<hiu0=r4,<two24=r0
+# copy-collector output starts
+ldr r3,[sp,#208]
+ldr r4,[sp,#212]
+ldr r5,[sp,#144]
+ldr.w r6,[sp,#148]
+strd r5,r6,[sp,#208]
+# copy-collector output ends
+umull r8,r7,r4,r0
+
+# qhasm: lotmp hitmp += lou0 * two24
+# asm 1: umlal <hitmp=int32#8,<lotmp=int32#9,<lou0=int32#4,<two24=int32#1
+# asm 2: umlal <hitmp=r7,<lotmp=r8,<lou0=r3,<two24=r0
+umlal r7,r8,r3,r0
+
+# qhasm: carry? lotmp ^= (lou0 unsigned>> 1)
+# asm 1: eors >lotmp=int32#1,<lotmp=int32#9,<lou0=int32#4,LSR #1
+# asm 2: eors >lotmp=r0,<lotmp=r8,<lou0=r3,LSR #1
+eors r0,r8,r3,LSR #1
+
+# qhasm: hitmp ^= (carry,hiu0 unsigned>> 1)
+# asm 1: eors >hitmp=int32#8,<hitmp=int32#8,<hiu0=int32#5,RRX
+# asm 2: eors >hitmp=r7,<hitmp=r7,<hiu0=r4,RRX
+eors r7,r7,r4,RRX
+
+# qhasm: lotmp ^= (hiu0 << 31)
+# asm 1: eor >lotmp=int32#1,<lotmp=int32#1,<hiu0=int32#5,LSL #31
+# asm 2: eor >lotmp=r0,<lotmp=r0,<hiu0=r4,LSL #31
+eor r0,r0,r4,LSL #31
+
+# qhasm: lotmp ^= (lou0 unsigned>>7)
+# asm 1: eor >lotmp=int32#1,<lotmp=int32#1,<lou0=int32#4,LSR #7
+# asm 2: eor >lotmp=r0,<lotmp=r0,<lou0=r3,LSR #7
+eor r0,r0,r3,LSR #7
+
+# qhasm: lotmp ^= (hiu0 << 25)
+# asm 1: eor >lotmp=int32#1,<lotmp=int32#1,<hiu0=int32#5,LSL #25
+# asm 2: eor >lotmp=r0,<lotmp=r0,<hiu0=r4,LSL #25
+eor r0,r0,r4,LSL #25
+
+# qhasm: hitmp ^= (hiu0 unsigned>>7)
+# asm 1: eor >hitmp=int32#8,<hitmp=int32#8,<hiu0=int32#5,LSR #7
+# asm 2: eor >hitmp=r7,<hitmp=r7,<hiu0=r4,LSR #7
+eor r7,r7,r4,LSR #7
+
+# qhasm: carry? lou1 += lotmp
+# asm 1: adds >lou1=int32#1,<lou1=int32#2,<lotmp=int32#1
+# asm 2: adds >lou1=r0,<lou1=r1,<lotmp=r0
+adds r0,r1,r0
+
+# qhasm: hiu1 += hitmp + carry
+# asm 1: adc >hiu1=int32#2,<hiu1=int32#3,<hitmp=int32#8
+# asm 2: adc >hiu1=r1,<hiu1=r2,<hitmp=r7
+adc r1,r2,r7
+
+# qhasm: carry? lou1 += lou2
+# asm 1: adds >lou1=int32#1,<lou1=int32#1,<lou2=int32#6
+# asm 2: adds >lou1=r0,<lou1=r0,<lou2=r5
+adds r0,r0,r5
+
+# qhasm: hiu1 += hiu2 + carry
+# asm 1: adc >hiu1=int32#2,<hiu1=int32#2,<hiu2=int32#7
+# asm 2: adc >hiu1=r1,<hiu1=r1,<hiu2=r6
+adc r1,r1,r6
+
+# qhasm: lom15 = lou1
+# asm 1: str <lou1=int32#1,>lom15=stack32#67
+# asm 2: str <lou1=r0,>lom15=[sp,#264]
+# copy-collector input: str r0,[sp,#264]
+
+# qhasm: him15 = hiu1
+# asm 1: str <hiu1=int32#2,>him15=stack32#68
+# asm 2: str <hiu1=r1,>him15=[sp,#268]
+# copy-collector input: str r1,[sp,#268]
+
+# qhasm: lod8 = lou0
+# asm 1: str <lou0=int32#4,>lod8=stack32#37
+# asm 2: str <lou0=r3,>lod8=[sp,#144]
+# copy-collector input: str r3,[sp,#144]
+
+# qhasm: hid8 = hiu0
+# asm 1: str <hiu0=int32#5,>hid8=stack32#38
+# asm 2: str <hiu0=r4,>hid8=[sp,#148]
+# copy-collector input: str r4,[sp,#148]
+
+# qhasm: goto innerloop
+# copy-collector output starts
+strd r0,r1,[sp,#264]
+strd r3,r4,[sp,#144]
+# copy-collector output ends
+b ._innerloop
+
+# qhasm: nearend:
+._nearend:
+
+# qhasm: lou0 = lom8
+# asm 1: ldr >lou0=int32#1,<lom8=stack32#53
+# asm 2: ldr >lou0=r0,<lom8=[sp,#208]
+# copy-collector input: ldr r0,[sp,#208]
+
+# qhasm: hiu0 = him8
+# asm 1: ldr >hiu0=int32#2,<him8=stack32#54
+# asm 2: ldr >hiu0=r1,<him8=[sp,#212]
+# copy-collector input: ldr r1,[sp,#212]
+
+# qhasm: lou1 = lom9
+# asm 1: ldr >lou1=int32#3,<lom9=stack32#55
+# asm 2: ldr >lou1=r2,<lom9=[sp,#216]
+# copy-collector input: ldr r2,[sp,#216]
+
+# qhasm: hiu1 = him9
+# asm 1: ldr >hiu1=int32#4,<him9=stack32#56
+# asm 2: ldr >hiu1=r3,<him9=[sp,#220]
+# copy-collector input: ldr r3,[sp,#220]
+
+# qhasm: lou2 = lom10
+# asm 1: ldr >lou2=int32#5,<lom10=stack32#57
+# asm 2: ldr >lou2=r4,<lom10=[sp,#224]
+# copy-collector input: ldr r4,[sp,#224]
+
+# qhasm: hiu2 = him10
+# asm 1: ldr >hiu2=int32#6,<him10=stack32#58
+# asm 2: ldr >hiu2=r5,<him10=[sp,#228]
+# copy-collector input: ldr r5,[sp,#228]
+
+# qhasm: lou3 = lom11
+# asm 1: ldr >lou3=int32#7,<lom11=stack32#59
+# asm 2: ldr >lou3=r6,<lom11=[sp,#232]
+# copy-collector input: ldr r6,[sp,#232]
+
+# qhasm: hiu3 = him11
+# asm 1: ldr >hiu3=int32#8,<him11=stack32#60
+# asm 2: ldr >hiu3=r7,<him11=[sp,#236]
+# copy-collector input: ldr r7,[sp,#236]
+
+# qhasm: lod8 = lou0
+# asm 1: str <lou0=int32#1,>lod8=stack32#37
+# asm 2: str <lou0=r0,>lod8=[sp,#144]
+# copy-collector input: str r0,[sp,#144]
+
+# qhasm: hid8 = hiu0
+# asm 1: str <hiu0=int32#2,>hid8=stack32#38
+# asm 2: str <hiu0=r1,>hid8=[sp,#148]
+# copy-collector input: str r1,[sp,#148]
+
+# qhasm: lod9 = lou1
+# asm 1: str <lou1=int32#3,>lod9=stack32#39
+# asm 2: str <lou1=r2,>lod9=[sp,#152]
+# copy-collector input: str r2,[sp,#152]
+
+# qhasm: hid9 = hiu1
+# asm 1: str <hiu1=int32#4,>hid9=stack32#40
+# asm 2: str <hiu1=r3,>hid9=[sp,#156]
+# copy-collector input: str r3,[sp,#156]
+
+# qhasm: lod10 = lou2
+# asm 1: str <lou2=int32#5,>lod10=stack32#41
+# asm 2: str <lou2=r4,>lod10=[sp,#160]
+# copy-collector input: str r4,[sp,#160]
+
+# qhasm: hid10 = hiu2
+# asm 1: str <hiu2=int32#6,>hid10=stack32#42
+# asm 2: str <hiu2=r5,>hid10=[sp,#164]
+# copy-collector input: str r5,[sp,#164]
+
+# qhasm: lod11 = lou3
+# asm 1: str <lou3=int32#7,>lod11=stack32#43
+# asm 2: str <lou3=r6,>lod11=[sp,#168]
+# copy-collector input: str r6,[sp,#168]
+
+# qhasm: hid11 = hiu3
+# asm 1: str <hiu3=int32#8,>hid11=stack32#44
+# asm 2: str <hiu3=r7,>hid11=[sp,#172]
+# copy-collector input: str r7,[sp,#172]
+
+# qhasm: lou0 = lom12
+# asm 1: ldr >lou0=int32#1,<lom12=stack32#61
+# asm 2: ldr >lou0=r0,<lom12=[sp,#240]
+# copy-collector input: ldr r0,[sp,#240]
+
+# qhasm: hiu0 = him12
+# asm 1: ldr >hiu0=int32#2,<him12=stack32#62
+# asm 2: ldr >hiu0=r1,<him12=[sp,#244]
+# copy-collector input: ldr r1,[sp,#244]
+
+# qhasm: lou1 = lom13
+# asm 1: ldr >lou1=int32#3,<lom13=stack32#63
+# asm 2: ldr >lou1=r2,<lom13=[sp,#248]
+# copy-collector input: ldr r2,[sp,#248]
+
+# qhasm: hiu1 = him13
+# asm 1: ldr >hiu1=int32#4,<him13=stack32#64
+# asm 2: ldr >hiu1=r3,<him13=[sp,#252]
+# copy-collector input: ldr r3,[sp,#252]
+
+# qhasm: lou2 = lom14
+# asm 1: ldr >lou2=int32#5,<lom14=stack32#65
+# asm 2: ldr >lou2=r4,<lom14=[sp,#256]
+# copy-collector input: ldr r4,[sp,#256]
+
+# qhasm: hiu2 = him14
+# asm 1: ldr >hiu2=int32#6,<him14=stack32#66
+# asm 2: ldr >hiu2=r5,<him14=[sp,#260]
+# copy-collector input: ldr r5,[sp,#260]
+
+# qhasm: lou3 = lom15
+# asm 1: ldr >lou3=int32#7,<lom15=stack32#67
+# asm 2: ldr >lou3=r6,<lom15=[sp,#264]
+# copy-collector input: ldr r6,[sp,#264]
+
+# qhasm: hiu3 = him15
+# asm 1: ldr >hiu3=int32#8,<him15=stack32#68
+# asm 2: ldr >hiu3=r7,<him15=[sp,#268]
+# copy-collector input: ldr r7,[sp,#268]
+
+# qhasm: lod12 = lou0
+# asm 1: str <lou0=int32#1,>lod12=stack32#45
+# asm 2: str <lou0=r0,>lod12=[sp,#176]
+# copy-collector input: str r0,[sp,#176]
+
+# qhasm: hid12 = hiu0
+# asm 1: str <hiu0=int32#2,>hid12=stack32#46
+# asm 2: str <hiu0=r1,>hid12=[sp,#180]
+# copy-collector input: str r1,[sp,#180]
+
+# qhasm: lod13 = lou1
+# asm 1: str <lou1=int32#3,>lod13=stack32#47
+# asm 2: str <lou1=r2,>lod13=[sp,#184]
+# copy-collector input: str r2,[sp,#184]
+
+# qhasm: hid13 = hiu1
+# asm 1: str <hiu1=int32#4,>hid13=stack32#48
+# asm 2: str <hiu1=r3,>hid13=[sp,#188]
+# copy-collector input: str r3,[sp,#188]
+
+# qhasm: lod14 = lou2
+# asm 1: str <lou2=int32#5,>lod14=stack32#49
+# asm 2: str <lou2=r4,>lod14=[sp,#192]
+# copy-collector input: str r4,[sp,#192]
+
+# qhasm: hid14 = hiu2
+# asm 1: str <hiu2=int32#6,>hid14=stack32#50
+# asm 2: str <hiu2=r5,>hid14=[sp,#196]
+# copy-collector input: str r5,[sp,#196]
+
+# qhasm: lod15 = lou3
+# asm 1: str <lou3=int32#7,>lod15=stack32#51
+# asm 2: str <lou3=r6,>lod15=[sp,#200]
+# copy-collector input: str r6,[sp,#200]
+
+# qhasm: hid15 = hiu3
+# asm 1: str <hiu3=int32#8,>hid15=stack32#52
+# asm 2: str <hiu3=r7,>hid15=[sp,#204]
+# copy-collector input: str r7,[sp,#204]
+
+# qhasm: goto innerloop
+# copy-collector output starts
+ldr r0,[sp,#208]
+ldr r1,[sp,#212]
+ldr r2,[sp,#216]
+ldr r3,[sp,#220]
+ldr r4,[sp,#224]
+ldr r5,[sp,#228]
+ldr r6,[sp,#232]
+ldr r7,[sp,#236]
+strd r0,r1,[sp,#144]
+strd r2,r3,[sp,#152]
+strd r4,r5,[sp,#160]
+strd r6,r7,[sp,#168]
+ldr r0,[sp,#240]
+ldr r1,[sp,#244]
+ldr r2,[sp,#248]
+ldr r3,[sp,#252]
+ldr r4,[sp,#256]
+ldr r5,[sp,#260]
+ldr r6,[sp,#264]
+ldr r7,[sp,#268]
+strd r0,r1,[sp,#176]
+strd r2,r3,[sp,#184]
+strd r4,r5,[sp,#192]
+strd r6,r7,[sp,#200]
+# copy-collector output ends
+b ._innerloop
+
+# qhasm: endinnerloop:
+._endinnerloop:
+
+# qhasm: input_0 = o3
+# asm 1: ldr >input_0=int32#1,<o3=stack32#4
+# asm 2: ldr >input_0=r0,<o3=[sp,#12]
+# copy-collector input: ldr r0,[sp,#12]
+
+# qhasm: input_0 -= 640
+# asm 1: sub >input_0=int32#1,<input_0=int32#1,#640
+# asm 2: sub >input_0=r0,<input_0=r0,#640
+# copy-collector output starts
+ldr r0,[sp,#12]
+# copy-collector output ends
+sub r0,r0,#640
+
+# qhasm: o3 = input_0
+# asm 1: str <input_0=int32#1,>o3=stack32#4
+# asm 2: str <input_0=r0,>o3=[sp,#12]
+# copy-collector input: str r0,[sp,#12]
+
+# qhasm: lou0 = lod0
+# asm 1: ldr >lou0=int32#1,<lod0=stack32#13
+# asm 2: ldr >lou0=r0,<lod0=[sp,#48]
+# copy-collector input: ldr r0,[sp,#48]
+
+# qhasm: hiu0 = hid0
+# asm 1: ldr >hiu0=int32#2,<hid0=stack32#14
+# asm 2: ldr >hiu0=r1,<hid0=[sp,#52]
+# copy-collector input: ldr r1,[sp,#52]
+
+# qhasm: lou1 = lod1
+# asm 1: ldr >lou1=int32#3,<lod1=stack32#15
+# asm 2: ldr >lou1=r2,<lod1=[sp,#56]
+# copy-collector input: ldr r2,[sp,#56]
+
+# qhasm: hiu1 = hid1
+# asm 1: ldr >hiu1=int32#4,<hid1=stack32#16
+# asm 2: ldr >hiu1=r3,<hid1=[sp,#60]
+# copy-collector input: ldr r3,[sp,#60]
+
+# qhasm: lou2 = lod2
+# asm 1: ldr >lou2=int32#5,<lod2=stack32#17
+# asm 2: ldr >lou2=r4,<lod2=[sp,#64]
+# copy-collector input: ldr r4,[sp,#64]
+
+# qhasm: hiu2 = hid2
+# asm 1: ldr >hiu2=int32#6,<hid2=stack32#18
+# asm 2: ldr >hiu2=r5,<hid2=[sp,#68]
+# copy-collector input: ldr r5,[sp,#68]
+
+# qhasm: lou3 = lod3
+# asm 1: ldr >lou3=int32#7,<lod3=stack32#19
+# asm 2: ldr >lou3=r6,<lod3=[sp,#72]
+# copy-collector input: ldr r6,[sp,#72]
+
+# qhasm: hiu3 = hid3
+# asm 1: ldr >hiu3=int32#8,<hid3=stack32#20
+# asm 2: ldr >hiu3=r7,<hid3=[sp,#76]
+# copy-collector input: ldr r7,[sp,#76]
+
+# qhasm: lotmp = lom0
+# asm 1: ldr >lotmp=int32#9,<lom0=stack32#5
+# asm 2: ldr >lotmp=r8,<lom0=[sp,#16]
+# copy-collector input: ldr r8,[sp,#16]
+
+# qhasm: hitmp = him0
+# asm 1: ldr >hitmp=int32#10,<him0=stack32#6
+# asm 2: ldr >hitmp=r9,<him0=[sp,#20]
+# copy-collector input: ldr r9,[sp,#20]
+
+# qhasm: carry? lou0 += lotmp
+# asm 1: adds >lou0=int32#1,<lou0=int32#1,<lotmp=int32#9
+# asm 2: adds >lou0=r0,<lou0=r0,<lotmp=r8
+# copy-collector output starts
+str r0,[sp,#12]
+ldr r0,[sp,#48]
+ldr r1,[sp,#52]
+ldr r2,[sp,#56]
+ldr r3,[sp,#60]
+ldr r4,[sp,#64]
+ldr r5,[sp,#68]
+ldr r6,[sp,#72]
+ldr r7,[sp,#76]
+ldr r8,[sp,#16]
+ldr r9,[sp,#20]
+# copy-collector output ends
+adds r0,r0,r8
+
+# qhasm: hiu0 += hitmp + carry
+# asm 1: adc >hiu0=int32#2,<hiu0=int32#2,<hitmp=int32#10
+# asm 2: adc >hiu0=r1,<hiu0=r1,<hitmp=r9
+adc r1,r1,r9
+
+# qhasm: lotmp = lom1
+# asm 1: ldr >lotmp=int32#9,<lom1=stack32#7
+# asm 2: ldr >lotmp=r8,<lom1=[sp,#24]
+# copy-collector input: ldr r8,[sp,#24]
+
+# qhasm: hitmp = him1
+# asm 1: ldr >hitmp=int32#10,<him1=stack32#8
+# asm 2: ldr >hitmp=r9,<him1=[sp,#28]
+# copy-collector input: ldr r9,[sp,#28]
+
+# qhasm: carry? lou1 += lotmp
+# asm 1: adds >lou1=int32#3,<lou1=int32#3,<lotmp=int32#9
+# asm 2: adds >lou1=r2,<lou1=r2,<lotmp=r8
+# copy-collector output starts
+ldrd r8,r9,[sp,#24]
+# copy-collector output ends
+adds r2,r2,r8
+
+# qhasm: hiu1 += hitmp + carry
+# asm 1: adc >hiu1=int32#4,<hiu1=int32#4,<hitmp=int32#10
+# asm 2: adc >hiu1=r3,<hiu1=r3,<hitmp=r9
+adc r3,r3,r9
+
+# qhasm: lotmp = lom2
+# asm 1: ldr >lotmp=int32#9,<lom2=stack32#9
+# asm 2: ldr >lotmp=r8,<lom2=[sp,#32]
+# copy-collector input: ldr r8,[sp,#32]
+
+# qhasm: hitmp = him2
+# asm 1: ldr >hitmp=int32#10,<him2=stack32#10
+# asm 2: ldr >hitmp=r9,<him2=[sp,#36]
+# copy-collector input: ldr r9,[sp,#36]
+
+# qhasm: carry? lou2 += lotmp
+# asm 1: adds >lou2=int32#5,<lou2=int32#5,<lotmp=int32#9
+# asm 2: adds >lou2=r4,<lou2=r4,<lotmp=r8
+# copy-collector output starts
+ldrd r8,r9,[sp,#32]
+# copy-collector output ends
+adds r4,r4,r8
+
+# qhasm: hiu2 += hitmp + carry
+# asm 1: adc >hiu2=int32#6,<hiu2=int32#6,<hitmp=int32#10
+# asm 2: adc >hiu2=r5,<hiu2=r5,<hitmp=r9
+adc r5,r5,r9
+
+# qhasm: lotmp = lom3
+# asm 1: ldr >lotmp=int32#9,<lom3=stack32#11
+# asm 2: ldr >lotmp=r8,<lom3=[sp,#40]
+# copy-collector input: ldr r8,[sp,#40]
+
+# qhasm: hitmp = him3
+# asm 1: ldr >hitmp=int32#10,<him3=stack32#12
+# asm 2: ldr >hitmp=r9,<him3=[sp,#44]
+# copy-collector input: ldr r9,[sp,#44]
+
+# qhasm: carry? lou3 += lotmp
+# asm 1: adds >lou3=int32#7,<lou3=int32#7,<lotmp=int32#9
+# asm 2: adds >lou3=r6,<lou3=r6,<lotmp=r8
+# copy-collector output starts
+ldrd r8,r9,[sp,#40]
+# copy-collector output ends
+adds r6,r6,r8
+
+# qhasm: hiu3 += hitmp + carry
+# asm 1: adc >hiu3=int32#8,<hiu3=int32#8,<hitmp=int32#10
+# asm 2: adc >hiu3=r7,<hiu3=r7,<hitmp=r9
+adc r7,r7,r9
+
+# qhasm: lom0 = lou0
+# asm 1: str <lou0=int32#1,>lom0=stack32#5
+# asm 2: str <lou0=r0,>lom0=[sp,#16]
+# copy-collector input: str r0,[sp,#16]
+
+# qhasm: him0 = hiu0
+# asm 1: str <hiu0=int32#2,>him0=stack32#6
+# asm 2: str <hiu0=r1,>him0=[sp,#20]
+# copy-collector input: str r1,[sp,#20]
+
+# qhasm: lom1 = lou1
+# asm 1: str <lou1=int32#3,>lom1=stack32#7
+# asm 2: str <lou1=r2,>lom1=[sp,#24]
+# copy-collector input: str r2,[sp,#24]
+
+# qhasm: him1 = hiu1
+# asm 1: str <hiu1=int32#4,>him1=stack32#8
+# asm 2: str <hiu1=r3,>him1=[sp,#28]
+# copy-collector input: str r3,[sp,#28]
+
+# qhasm: lom2 = lou2
+# asm 1: str <lou2=int32#5,>lom2=stack32#9
+# asm 2: str <lou2=r4,>lom2=[sp,#32]
+# copy-collector input: str r4,[sp,#32]
+
+# qhasm: him2 = hiu2
+# asm 1: str <hiu2=int32#6,>him2=stack32#10
+# asm 2: str <hiu2=r5,>him2=[sp,#36]
+# copy-collector input: str r5,[sp,#36]
+
+# qhasm: lom3 = lou3
+# asm 1: str <lou3=int32#7,>lom3=stack32#11
+# asm 2: str <lou3=r6,>lom3=[sp,#40]
+# copy-collector input: str r6,[sp,#40]
+
+# qhasm: him3 = hiu3
+# asm 1: str <hiu3=int32#8,>him3=stack32#12
+# asm 2: str <hiu3=r7,>him3=[sp,#44]
+# copy-collector input: str r7,[sp,#44]
+
+# qhasm: lod0 = lou0
+# asm 1: str <lou0=int32#1,>lod0=stack32#13
+# asm 2: str <lou0=r0,>lod0=[sp,#48]
+# copy-collector input: str r0,[sp,#48]
+
+# qhasm: hid0 = hiu0
+# asm 1: str <hiu0=int32#2,>hid0=stack32#14
+# asm 2: str <hiu0=r1,>hid0=[sp,#52]
+# copy-collector input: str r1,[sp,#52]
+
+# qhasm: lod1 = lou1
+# asm 1: str <lou1=int32#3,>lod1=stack32#15
+# asm 2: str <lou1=r2,>lod1=[sp,#56]
+# copy-collector input: str r2,[sp,#56]
+
+# qhasm: hid1 = hiu1
+# asm 1: str <hiu1=int32#4,>hid1=stack32#16
+# asm 2: str <hiu1=r3,>hid1=[sp,#60]
+# copy-collector input: str r3,[sp,#60]
+
+# qhasm: lod2 = lou2
+# asm 1: str <lou2=int32#5,>lod2=stack32#17
+# asm 2: str <lou2=r4,>lod2=[sp,#64]
+# copy-collector input: str r4,[sp,#64]
+
+# qhasm: hid2 = hiu2
+# asm 1: str <hiu2=int32#6,>hid2=stack32#18
+# asm 2: str <hiu2=r5,>hid2=[sp,#68]
+# copy-collector input: str r5,[sp,#68]
+
+# qhasm: lod3 = lou3
+# asm 1: str <lou3=int32#7,>lod3=stack32#19
+# asm 2: str <lou3=r6,>lod3=[sp,#72]
+# copy-collector input: str r6,[sp,#72]
+
+# qhasm: hid3 = hiu3
+# asm 1: str <hiu3=int32#8,>hid3=stack32#20
+# asm 2: str <hiu3=r7,>hid3=[sp,#76]
+# copy-collector input: str r7,[sp,#76]
+
+# qhasm: lou0 = lod4
+# asm 1: ldr >lou0=int32#1,<lod4=stack32#29
+# asm 2: ldr >lou0=r0,<lod4=[sp,#112]
+# copy-collector input: ldr r0,[sp,#112]
+
+# qhasm: hiu0 = hid4
+# asm 1: ldr >hiu0=int32#2,<hid4=stack32#30
+# asm 2: ldr >hiu0=r1,<hid4=[sp,#116]
+# copy-collector input: ldr r1,[sp,#116]
+
+# qhasm: lou1 = lod5
+# asm 1: ldr >lou1=int32#3,<lod5=stack32#31
+# asm 2: ldr >lou1=r2,<lod5=[sp,#120]
+# copy-collector input: ldr r2,[sp,#120]
+
+# qhasm: hiu1 = hid5
+# asm 1: ldr >hiu1=int32#4,<hid5=stack32#32
+# asm 2: ldr >hiu1=r3,<hid5=[sp,#124]
+# copy-collector input: ldr r3,[sp,#124]
+
+# qhasm: lou2 = lod6
+# asm 1: ldr >lou2=int32#5,<lod6=stack32#33
+# asm 2: ldr >lou2=r4,<lod6=[sp,#128]
+# copy-collector input: ldr r4,[sp,#128]
+
+# qhasm: hiu2 = hid6
+# asm 1: ldr >hiu2=int32#6,<hid6=stack32#34
+# asm 2: ldr >hiu2=r5,<hid6=[sp,#132]
+# copy-collector input: ldr r5,[sp,#132]
+
+# qhasm: lou3 = lod7
+# asm 1: ldr >lou3=int32#7,<lod7=stack32#35
+# asm 2: ldr >lou3=r6,<lod7=[sp,#136]
+# copy-collector input: ldr r6,[sp,#136]
+
+# qhasm: hiu3 = hid7
+# asm 1: ldr >hiu3=int32#8,<hid7=stack32#36
+# asm 2: ldr >hiu3=r7,<hid7=[sp,#140]
+# copy-collector input: ldr r7,[sp,#140]
+
+# qhasm: lotmp = lom4
+# asm 1: ldr >lotmp=int32#9,<lom4=stack32#21
+# asm 2: ldr >lotmp=r8,<lom4=[sp,#80]
+# copy-collector input: ldr r8,[sp,#80]
+
+# qhasm: hitmp = him4
+# asm 1: ldr >hitmp=int32#10,<him4=stack32#22
+# asm 2: ldr >hitmp=r9,<him4=[sp,#84]
+# copy-collector input: ldr r9,[sp,#84]
+
+# qhasm: carry? lou0 += lotmp
+# asm 1: adds >lou0=int32#1,<lou0=int32#1,<lotmp=int32#9
+# asm 2: adds >lou0=r0,<lou0=r0,<lotmp=r8
+# copy-collector output starts
+strd r0,r1,[sp,#16]
+strd r2,r3,[sp,#24]
+strd r4,r5,[sp,#32]
+strd r6,r7,[sp,#40]
+strd r0,r1,[sp,#48]
+strd r2,r3,[sp,#56]
+strd r4,r5,[sp,#64]
+strd r6,r7,[sp,#72]
+ldr r0,[sp,#112]
+ldr r1,[sp,#116]
+ldr r2,[sp,#120]
+ldr r3,[sp,#124]
+ldr r4,[sp,#128]
+ldr r5,[sp,#132]
+ldr r6,[sp,#136]
+ldr r7,[sp,#140]
+ldr r8,[sp,#80]
+ldr r9,[sp,#84]
+# copy-collector output ends
+adds r0,r0,r8
+
+# qhasm: hiu0 += hitmp + carry
+# asm 1: adc >hiu0=int32#2,<hiu0=int32#2,<hitmp=int32#10
+# asm 2: adc >hiu0=r1,<hiu0=r1,<hitmp=r9
+adc r1,r1,r9
+
+# qhasm: lotmp = lom5
+# asm 1: ldr >lotmp=int32#9,<lom5=stack32#23
+# asm 2: ldr >lotmp=r8,<lom5=[sp,#88]
+# copy-collector input: ldr r8,[sp,#88]
+
+# qhasm: hitmp = him5
+# asm 1: ldr >hitmp=int32#10,<him5=stack32#24
+# asm 2: ldr >hitmp=r9,<him5=[sp,#92]
+# copy-collector input: ldr r9,[sp,#92]
+
+# qhasm: carry? lou1 += lotmp
+# asm 1: adds >lou1=int32#3,<lou1=int32#3,<lotmp=int32#9
+# asm 2: adds >lou1=r2,<lou1=r2,<lotmp=r8
+# copy-collector output starts
+ldrd r8,r9,[sp,#88]
+# copy-collector output ends
+adds r2,r2,r8
+
+# qhasm: hiu1 += hitmp + carry
+# asm 1: adc >hiu1=int32#4,<hiu1=int32#4,<hitmp=int32#10
+# asm 2: adc >hiu1=r3,<hiu1=r3,<hitmp=r9
+adc r3,r3,r9
+
+# qhasm: lotmp = lom6
+# asm 1: ldr >lotmp=int32#9,<lom6=stack32#25
+# asm 2: ldr >lotmp=r8,<lom6=[sp,#96]
+# copy-collector input: ldr r8,[sp,#96]
+
+# qhasm: hitmp = him6
+# asm 1: ldr >hitmp=int32#10,<him6=stack32#26
+# asm 2: ldr >hitmp=r9,<him6=[sp,#100]
+# copy-collector input: ldr r9,[sp,#100]
+
+# qhasm: carry? lou2 += lotmp
+# asm 1: adds >lou2=int32#5,<lou2=int32#5,<lotmp=int32#9
+# asm 2: adds >lou2=r4,<lou2=r4,<lotmp=r8
+# copy-collector output starts
+ldrd r8,r9,[sp,#96]
+# copy-collector output ends
+adds r4,r4,r8
+
+# qhasm: hiu2 += hitmp + carry
+# asm 1: adc >hiu2=int32#6,<hiu2=int32#6,<hitmp=int32#10
+# asm 2: adc >hiu2=r5,<hiu2=r5,<hitmp=r9
+adc r5,r5,r9
+
+# qhasm: lotmp = lom7
+# asm 1: ldr >lotmp=int32#9,<lom7=stack32#27
+# asm 2: ldr >lotmp=r8,<lom7=[sp,#104]
+# copy-collector input: ldr r8,[sp,#104]
+
+# qhasm: hitmp = him7
+# asm 1: ldr >hitmp=int32#10,<him7=stack32#28
+# asm 2: ldr >hitmp=r9,<him7=[sp,#108]
+# copy-collector input: ldr r9,[sp,#108]
+
+# qhasm: carry? lou3 += lotmp
+# asm 1: adds >lou3=int32#7,<lou3=int32#7,<lotmp=int32#9
+# asm 2: adds >lou3=r6,<lou3=r6,<lotmp=r8
+# copy-collector output starts
+ldrd r8,r9,[sp,#104]
+# copy-collector output ends
+adds r6,r6,r8
+
+# qhasm: hiu3 += hitmp + carry
+# asm 1: adc >hiu3=int32#8,<hiu3=int32#8,<hitmp=int32#10
+# asm 2: adc >hiu3=r7,<hiu3=r7,<hitmp=r9
+adc r7,r7,r9
+
+# qhasm: lom4 = lou0
+# asm 1: str <lou0=int32#1,>lom4=stack32#21
+# asm 2: str <lou0=r0,>lom4=[sp,#80]
+# copy-collector input: str r0,[sp,#80]
+
+# qhasm: him4 = hiu0
+# asm 1: str <hiu0=int32#2,>him4=stack32#22
+# asm 2: str <hiu0=r1,>him4=[sp,#84]
+# copy-collector input: str r1,[sp,#84]
+
+# qhasm: lom5 = lou1
+# asm 1: str <lou1=int32#3,>lom5=stack32#23
+# asm 2: str <lou1=r2,>lom5=[sp,#88]
+# copy-collector input: str r2,[sp,#88]
+
+# qhasm: him5 = hiu1
+# asm 1: str <hiu1=int32#4,>him5=stack32#24
+# asm 2: str <hiu1=r3,>him5=[sp,#92]
+# copy-collector input: str r3,[sp,#92]
+
+# qhasm: lom6 = lou2
+# asm 1: str <lou2=int32#5,>lom6=stack32#25
+# asm 2: str <lou2=r4,>lom6=[sp,#96]
+# copy-collector input: str r4,[sp,#96]
+
+# qhasm: him6 = hiu2
+# asm 1: str <hiu2=int32#6,>him6=stack32#26
+# asm 2: str <hiu2=r5,>him6=[sp,#100]
+# copy-collector input: str r5,[sp,#100]
+
+# qhasm: lom7 = lou3
+# asm 1: str <lou3=int32#7,>lom7=stack32#27
+# asm 2: str <lou3=r6,>lom7=[sp,#104]
+# copy-collector input: str r6,[sp,#104]
+
+# qhasm: him7 = hiu3
+# asm 1: str <hiu3=int32#8,>him7=stack32#28
+# asm 2: str <hiu3=r7,>him7=[sp,#108]
+# copy-collector input: str r7,[sp,#108]
+
+# qhasm: lod4 = lou0
+# asm 1: str <lou0=int32#1,>lod4=stack32#29
+# asm 2: str <lou0=r0,>lod4=[sp,#112]
+# copy-collector input: str r0,[sp,#112]
+
+# qhasm: hid4 = hiu0
+# asm 1: str <hiu0=int32#2,>hid4=stack32#30
+# asm 2: str <hiu0=r1,>hid4=[sp,#116]
+# copy-collector input: str r1,[sp,#116]
+
+# qhasm: lod5 = lou1
+# asm 1: str <lou1=int32#3,>lod5=stack32#31
+# asm 2: str <lou1=r2,>lod5=[sp,#120]
+# copy-collector input: str r2,[sp,#120]
+
+# qhasm: hid5 = hiu1
+# asm 1: str <hiu1=int32#4,>hid5=stack32#32
+# asm 2: str <hiu1=r3,>hid5=[sp,#124]
+# copy-collector input: str r3,[sp,#124]
+
+# qhasm: lod6 = lou2
+# asm 1: str <lou2=int32#5,>lod6=stack32#33
+# asm 2: str <lou2=r4,>lod6=[sp,#128]
+# copy-collector input: str r4,[sp,#128]
+
+# qhasm: hid6 = hiu2
+# asm 1: str <hiu2=int32#6,>hid6=stack32#34
+# asm 2: str <hiu2=r5,>hid6=[sp,#132]
+# copy-collector input: str r5,[sp,#132]
+
+# qhasm: lod7 = lou3
+# asm 1: str <lou3=int32#7,>lod7=stack32#35
+# asm 2: str <lou3=r6,>lod7=[sp,#136]
+# copy-collector input: str r6,[sp,#136]
+
+# qhasm: hid7 = hiu3
+# asm 1: str <hiu3=int32#8,>hid7=stack32#36
+# asm 2: str <hiu3=r7,>hid7=[sp,#140]
+# copy-collector input: str r7,[sp,#140]
+
+# qhasm: input_0 = o2
+# asm 1: ldr >input_0=int32#1,<o2=stack32#3
+# asm 2: ldr >input_0=r0,<o2=[sp,#8]
+# copy-collector input: ldr r0,[sp,#8]
+
+# qhasm: =? unsigned<? input_0 -= 128
+# asm 1: subs >input_0=int32#1,<input_0=int32#1,#128
+# asm 2: subs >input_0=r0,<input_0=r0,#128
+# copy-collector output starts
+strd r0,r1,[sp,#80]
+strd r2,r3,[sp,#88]
+strd r4,r5,[sp,#96]
+strd r6,r7,[sp,#104]
+strd r0,r1,[sp,#112]
+strd r2,r3,[sp,#120]
+strd r4,r5,[sp,#128]
+strd r6,r7,[sp,#136]
+ldr r0,[sp,#8]
+# copy-collector output ends
+subs r0,r0,#128
+
+# qhasm: o2 = input_0
+# asm 1: str <input_0=int32#1,>o2=stack32#3
+# asm 2: str <input_0=r0,>o2=[sp,#8]
+# copy-collector input: str r0,[sp,#8]
+
+# qhasm: goto mainloop if !unsigned<
+# copy-collector output starts
+str r0,[sp,#8]
+# copy-collector output ends
+bhs ._mainloop
+
+# qhasm: endmainloop:
+._endmainloop:
+
+# qhasm: input_1 = o0
+# asm 1: ldr >input_1=int32#2,<o0=stack32#1
+# asm 2: ldr >input_1=r1,<o0=[sp,#0]
+# copy-collector input: ldr r1,[sp,#0]
+
+# qhasm: lou0 = lom0
+# asm 1: ldr >lou0=int32#3,<lom0=stack32#5
+# asm 2: ldr >lou0=r2,<lom0=[sp,#16]
+# copy-collector input: ldr r2,[sp,#16]
+
+# qhasm: hiu0 = him0
+# asm 1: ldr >hiu0=int32#4,<him0=stack32#6
+# asm 2: ldr >hiu0=r3,<him0=[sp,#20]
+# copy-collector input: ldr r3,[sp,#20]
+
+# qhasm: lou1 = lom1
+# asm 1: ldr >lou1=int32#5,<lom1=stack32#7
+# asm 2: ldr >lou1=r4,<lom1=[sp,#24]
+# copy-collector input: ldr r4,[sp,#24]
+
+# qhasm: hiu1 = him1
+# asm 1: ldr >hiu1=int32#6,<him1=stack32#8
+# asm 2: ldr >hiu1=r5,<him1=[sp,#28]
+# copy-collector input: ldr r5,[sp,#28]
+
+# qhasm: lou2 = lom2
+# asm 1: ldr >lou2=int32#7,<lom2=stack32#9
+# asm 2: ldr >lou2=r6,<lom2=[sp,#32]
+# copy-collector input: ldr r6,[sp,#32]
+
+# qhasm: hiu2 = him2
+# asm 1: ldr >hiu2=int32#8,<him2=stack32#10
+# asm 2: ldr >hiu2=r7,<him2=[sp,#36]
+# copy-collector input: ldr r7,[sp,#36]
+
+# qhasm: lou3 = lom3
+# asm 1: ldr >lou3=int32#9,<lom3=stack32#11
+# asm 2: ldr >lou3=r8,<lom3=[sp,#40]
+# copy-collector input: ldr r8,[sp,#40]
+
+# qhasm: hiu3 = him3
+# asm 1: ldr >hiu3=int32#10,<him3=stack32#12
+# asm 2: ldr >hiu3=r9,<him3=[sp,#44]
+# copy-collector input: ldr r9,[sp,#44]
+
+# qhasm: lou0 = lou0[3]lou0[2]lou0[1]lou0[0]
+# asm 1: rev >lou0=int32#3,<lou0=int32#3
+# asm 2: rev >lou0=r2,<lou0=r2
+# copy-collector output starts
+ldr r1,[sp,#0]
+ldr r2,[sp,#16]
+ldr r3,[sp,#20]
+ldr r4,[sp,#24]
+ldr r5,[sp,#28]
+ldr r6,[sp,#32]
+ldr r7,[sp,#36]
+ldr r8,[sp,#40]
+ldr r9,[sp,#44]
+# copy-collector output ends
+rev r2,r2
+
+# qhasm: hiu0 = hiu0[3]hiu0[2]hiu0[1]hiu0[0]
+# asm 1: rev >hiu0=int32#4,<hiu0=int32#4
+# asm 2: rev >hiu0=r3,<hiu0=r3
+rev r3,r3
+
+# qhasm: lou1 = lou1[3]lou1[2]lou1[1]lou1[0]
+# asm 1: rev >lou1=int32#5,<lou1=int32#5
+# asm 2: rev >lou1=r4,<lou1=r4
+rev r4,r4
+
+# qhasm: hiu1 = hiu1[3]hiu1[2]hiu1[1]hiu1[0]
+# asm 1: rev >hiu1=int32#6,<hiu1=int32#6
+# asm 2: rev >hiu1=r5,<hiu1=r5
+rev r5,r5
+
+# qhasm: lou2 = lou2[3]lou2[2]lou2[1]lou2[0]
+# asm 1: rev >lou2=int32#7,<lou2=int32#7
+# asm 2: rev >lou2=r6,<lou2=r6
+rev r6,r6
+
+# qhasm: hiu2 = hiu2[3]hiu2[2]hiu2[1]hiu2[0]
+# asm 1: rev >hiu2=int32#8,<hiu2=int32#8
+# asm 2: rev >hiu2=r7,<hiu2=r7
+rev r7,r7
+
+# qhasm: lou3 = lou3[3]lou3[2]lou3[1]lou3[0]
+# asm 1: rev >lou3=int32#9,<lou3=int32#9
+# asm 2: rev >lou3=r8,<lou3=r8
+rev r8,r8
+
+# qhasm: hiu3 = hiu3[3]hiu3[2]hiu3[1]hiu3[0]
+# asm 1: rev >hiu3=int32#10,<hiu3=int32#10
+# asm 2: rev >hiu3=r9,<hiu3=r9
+rev r9,r9
+
+# qhasm: mem32[input_1] = hiu0
+# asm 1: str <hiu0=int32#4,[<input_1=int32#2]
+# asm 2: str <hiu0=r3,[<input_1=r1]
+# copy-collector input: str r3,[r1]
+
+# qhasm: mem32[input_1+4] = lou0
+# asm 1: str <lou0=int32#3,[<input_1=int32#2,#4]
+# asm 2: str <lou0=r2,[<input_1=r1,#4]
+# copy-collector input: str r2,[r1,#4]
+
+# qhasm: mem32[input_1+8] = hiu1
+# asm 1: str <hiu1=int32#6,[<input_1=int32#2,#8]
+# asm 2: str <hiu1=r5,[<input_1=r1,#8]
+# copy-collector input: str r5,[r1,#8]
+
+# qhasm: mem32[input_1+12] = lou1
+# asm 1: str <lou1=int32#5,[<input_1=int32#2,#12]
+# asm 2: str <lou1=r4,[<input_1=r1,#12]
+# copy-collector input: str r4,[r1,#12]
+
+# qhasm: mem32[input_1+16] = hiu2
+# asm 1: str <hiu2=int32#8,[<input_1=int32#2,#16]
+# asm 2: str <hiu2=r7,[<input_1=r1,#16]
+# copy-collector input: str r7,[r1,#16]
+
+# qhasm: mem32[input_1+20] = lou2
+# asm 1: str <lou2=int32#7,[<input_1=int32#2,#20]
+# asm 2: str <lou2=r6,[<input_1=r1,#20]
+# copy-collector input: str r6,[r1,#20]
+
+# qhasm: mem32[input_1+24] = hiu3
+# asm 1: str <hiu3=int32#10,[<input_1=int32#2,#24]
+# asm 2: str <hiu3=r9,[<input_1=r1,#24]
+# copy-collector input: str r9,[r1,#24]
+
+# qhasm: mem32[input_1+28] = lou3
+# asm 1: str <lou3=int32#9,[<input_1=int32#2,#28]
+# asm 2: str <lou3=r8,[<input_1=r1,#28]
+# copy-collector input: str r8,[r1,#28]
+
+# qhasm: lou0 = lom4
+# asm 1: ldr >lou0=int32#3,<lom4=stack32#21
+# asm 2: ldr >lou0=r2,<lom4=[sp,#80]
+# copy-collector input: ldr r2,[sp,#80]
+
+# qhasm: hiu0 = him4
+# asm 1: ldr >hiu0=int32#4,<him4=stack32#22
+# asm 2: ldr >hiu0=r3,<him4=[sp,#84]
+# copy-collector input: ldr r3,[sp,#84]
+
+# qhasm: lou1 = lom5
+# asm 1: ldr >lou1=int32#5,<lom5=stack32#23
+# asm 2: ldr >lou1=r4,<lom5=[sp,#88]
+# copy-collector input: ldr r4,[sp,#88]
+
+# qhasm: hiu1 = him5
+# asm 1: ldr >hiu1=int32#6,<him5=stack32#24
+# asm 2: ldr >hiu1=r5,<him5=[sp,#92]
+# copy-collector input: ldr r5,[sp,#92]
+
+# qhasm: lou2 = lom6
+# asm 1: ldr >lou2=int32#7,<lom6=stack32#25
+# asm 2: ldr >lou2=r6,<lom6=[sp,#96]
+# copy-collector input: ldr r6,[sp,#96]
+
+# qhasm: hiu2 = him6
+# asm 1: ldr >hiu2=int32#8,<him6=stack32#26
+# asm 2: ldr >hiu2=r7,<him6=[sp,#100]
+# copy-collector input: ldr r7,[sp,#100]
+
+# qhasm: lou3 = lom7
+# asm 1: ldr >lou3=int32#9,<lom7=stack32#27
+# asm 2: ldr >lou3=r8,<lom7=[sp,#104]
+# copy-collector input: ldr r8,[sp,#104]
+
+# qhasm: hiu3 = him7
+# asm 1: ldr >hiu3=int32#10,<him7=stack32#28
+# asm 2: ldr >hiu3=r9,<him7=[sp,#108]
+# copy-collector input: ldr r9,[sp,#108]
+
+# qhasm: lou0 = lou0[3]lou0[2]lou0[1]lou0[0]
+# asm 1: rev >lou0=int32#3,<lou0=int32#3
+# asm 2: rev >lou0=r2,<lou0=r2
+# copy-collector output starts
+str r3,[r1]
+str r2,[r1,#4]
+str r5,[r1,#8]
+str r4,[r1,#12]
+str r7,[r1,#16]
+str r6,[r1,#20]
+str r9,[r1,#24]
+str r8,[r1,#28]
+ldr r2,[sp,#80]
+ldr r3,[sp,#84]
+ldr r4,[sp,#88]
+ldr r5,[sp,#92]
+ldr r6,[sp,#96]
+ldr r7,[sp,#100]
+ldr r8,[sp,#104]
+ldr r9,[sp,#108]
+# copy-collector output ends
+rev r2,r2
+
+# qhasm: hiu0 = hiu0[3]hiu0[2]hiu0[1]hiu0[0]
+# asm 1: rev >hiu0=int32#4,<hiu0=int32#4
+# asm 2: rev >hiu0=r3,<hiu0=r3
+rev r3,r3
+
+# qhasm: lou1 = lou1[3]lou1[2]lou1[1]lou1[0]
+# asm 1: rev >lou1=int32#5,<lou1=int32#5
+# asm 2: rev >lou1=r4,<lou1=r4
+rev r4,r4
+
+# qhasm: hiu1 = hiu1[3]hiu1[2]hiu1[1]hiu1[0]
+# asm 1: rev >hiu1=int32#6,<hiu1=int32#6
+# asm 2: rev >hiu1=r5,<hiu1=r5
+rev r5,r5
+
+# qhasm: lou2 = lou2[3]lou2[2]lou2[1]lou2[0]
+# asm 1: rev >lou2=int32#7,<lou2=int32#7
+# asm 2: rev >lou2=r6,<lou2=r6
+rev r6,r6
+
+# qhasm: hiu2 = hiu2[3]hiu2[2]hiu2[1]hiu2[0]
+# asm 1: rev >hiu2=int32#8,<hiu2=int32#8
+# asm 2: rev >hiu2=r7,<hiu2=r7
+rev r7,r7
+
+# qhasm: lou3 = lou3[3]lou3[2]lou3[1]lou3[0]
+# asm 1: rev >lou3=int32#9,<lou3=int32#9
+# asm 2: rev >lou3=r8,<lou3=r8
+rev r8,r8
+
+# qhasm: hiu3 = hiu3[3]hiu3[2]hiu3[1]hiu3[0]
+# asm 1: rev >hiu3=int32#10,<hiu3=int32#10
+# asm 2: rev >hiu3=r9,<hiu3=r9
+rev r9,r9
+
+# qhasm: mem32[input_1+32] = hiu0
+# asm 1: str <hiu0=int32#4,[<input_1=int32#2,#32]
+# asm 2: str <hiu0=r3,[<input_1=r1,#32]
+# copy-collector input: str r3,[r1,#32]
+
+# qhasm: mem32[input_1+36] = lou0
+# asm 1: str <lou0=int32#3,[<input_1=int32#2,#36]
+# asm 2: str <lou0=r2,[<input_1=r1,#36]
+# copy-collector input: str r2,[r1,#36]
+
+# qhasm: mem32[input_1+40] = hiu1
+# asm 1: str <hiu1=int32#6,[<input_1=int32#2,#40]
+# asm 2: str <hiu1=r5,[<input_1=r1,#40]
+# copy-collector input: str r5,[r1,#40]
+
+# qhasm: mem32[input_1+44] = lou1
+# asm 1: str <lou1=int32#5,[<input_1=int32#2,#44]
+# asm 2: str <lou1=r4,[<input_1=r1,#44]
+# copy-collector input: str r4,[r1,#44]
+
+# qhasm: mem32[input_1+48] = hiu2
+# asm 1: str <hiu2=int32#8,[<input_1=int32#2,#48]
+# asm 2: str <hiu2=r7,[<input_1=r1,#48]
+# copy-collector input: str r7,[r1,#48]
+
+# qhasm: mem32[input_1+52] = lou2
+# asm 1: str <lou2=int32#7,[<input_1=int32#2,#52]
+# asm 2: str <lou2=r6,[<input_1=r1,#52]
+# copy-collector input: str r6,[r1,#52]
+
+# qhasm: mem32[input_1+56] = hiu3
+# asm 1: str <hiu3=int32#10,[<input_1=int32#2,#56]
+# asm 2: str <hiu3=r9,[<input_1=r1,#56]
+# copy-collector input: str r9,[r1,#56]
+
+# qhasm: mem32[input_1+60] = lou3
+# asm 1: str <lou3=int32#9,[<input_1=int32#2,#60]
+# asm 2: str <lou3=r8,[<input_1=r1,#60]
+# copy-collector input: str r8,[r1,#60]
+
+# qhasm: input_0 += 128
+# asm 1: add >input_0=int32#1,<input_0=int32#1,#128
+# asm 2: add >input_0=r0,<input_0=r0,#128
+# copy-collector output starts
+str r3,[r1,#32]
+str r2,[r1,#36]
+str r5,[r1,#40]
+str r4,[r1,#44]
+str r7,[r1,#48]
+str r6,[r1,#52]
+str r9,[r1,#56]
+str r8,[r1,#60]
+# copy-collector output ends
+add r0,r0,#128
+
+# qhasm: rpopreturn input_0
+add.w sp,sp,#288
+pop {r4,r5,r6,r7,r8,r9,r10,r11,r14}
+bx lr
diff --git a/common/hal-mps2.c b/common/hal-mps2.c
new file mode 100644
index 0000000..62e1924
--- /dev/null
+++ b/common/hal-mps2.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+#include <hal.h>
+#include <CMSDK_CM4.h>
+
+#define BAUD 38400
+
+/* Default clock on the MPS2 boards seems to be 25MHz */
+#ifndef SYSTEM_CLOCK
+#define SYSTEM_CLOCK 25000000UL
+#endif
+
+/* The startup file calls a SystemInit function: sets up FPU, UART0, SysTick. */
+void SystemInit(void)
+{
+  /* Enable the FPU */
+  SCB->CPACR |= ((3UL << 10 * 2) |               /* set CP10 Full Access */
+                 (3UL << 11 * 2));               /* set CP11 Full Access */
+  /* Enable UART */
+  /* TODO: Validate this on a *real* MPS2 board (works in QEMU) */
+  CMSDK_GPIO0->ALTFUNCSET |= 1u;                 /* alternate-function bit 0 */
+  CMSDK_GPIO0->ALTFUNCSET |= 2u;                 /* alternate-function bit 1 */
+  CMSDK_UART0->BAUDDIV = SYSTEM_CLOCK / BAUD;    /* clock cycles per bit */
+  CMSDK_UART0->CTRL |= 1 << CMSDK_UART_CTRL_RXEN_Pos;
+  CMSDK_UART0->CTRL |= 1 << CMSDK_UART_CTRL_TXEN_Pos;
+  /* Enable SysTick Timer */
+  SysTick->LOAD = 0xFFFFFFu;                     /* 2^24 - 1: hal_get_time() counts periods of 2^24 ticks */
+  NVIC_SetPriority(SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* numerically largest priority value */
+  NVIC_EnableIRQ(SysTick_IRQn);                  /* NOTE(review): TICKINT below also enables the interrupt; confirm this call is needed */
+  SysTick->VAL = 0UL;
+  SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk | SysTick_CTRL_TICKINT_Msk | SysTick_CTRL_ENABLE_Msk;
+}
+
+static volatile unsigned long long overflowcnt = 0;
+
+/* SysTick interrupt: record one more expiry of the timer set up in SystemInit */
+void SysTick_Handler(void)
+{
+  overflowcnt += 1;
+}
+
+uint64_t hal_get_time()
+{
+  unsigned long long wraps, now;
+  do {
+    /* sample the wrap counter first, then the SysTick value */
+    wraps = overflowcnt;
+    now = (wraps + 1) * 16777216llu - SysTick->VAL;
+  } while (overflowcnt != wraps); /* a tick slipped in between: sample again */
+  return now;
+}
+
+void hal_setup(const enum clock_mode clock)
+{
+  (void) clock; /* unused: this function configures nothing on this target */
+}
+
+static inline void uart_putc(int c)
+{
+  while ((CMSDK_UART0->STATE & CMSDK_UART_STATE_TXBF_Msk) != 0) { /* busy-wait while the TXBF state bit is set */ }
+  CMSDK_UART0->DATA = c & 0xFFu; /* only the low 8 bits are written */
+}
+
+void hal_send_str(const char* in)
+{
+  /* Send the string byte by byte (the terminating NUL is not sent),
+   * followed by a single newline. */
+  for (const char* p = in; *p != '\0'; ++p) {
+    uart_putc(*p);
+  }
+  uart_putc('\n');
+}
+
+#if !defined(NO_SEMIHOSTING_EXIT)
+// TODO(dsprenkels) Currently, we only exit the QEMU host when the program
+// exits successfully.  We should also populate some interrupt handlers that
+// occur on errors and/or other exceptions.
+
+// These two syscall values are used at the end of the program, when we want
+// to tell the QEMU host that we are done.  I took them from
+// <https://github.com/rust-embedded/cortex-m-semihosting/blob/8ab74cdb8c9ab669ded328072447ea6f6054ffe6/src/debug.rs#L25-L50>.
+static const uint32_t REPORT_EXCEPTION = 0x18;
+static const uint32_t ApplicationExit = 0x20026;
+
+// Semihosting call to QEMU or the debugger: r0 = nr, r1 = arg, bkpt 0xAB; returns r0.
+static uint32_t semihosting_syscall(uint32_t nr, const uint32_t arg) {
+	__asm__ volatile (
+		"mov r0, %[nr]\n"
+		"mov r1, %[arg]\n"
+		"bkpt 0xAB\n"
+		"mov %[nr], r0\n"
+	: [nr] "+r" (nr) : [arg] "r" (arg) : "r0", "r1", "memory"); // named register clobbers; the host may access memory
+	return nr;
+}
+
+// Register a destructor that reports ApplicationExit to QEMU (or the
+// debugger) through the REPORT_EXCEPTION semihosting call.
+static void __attribute__ ((destructor)) semihosting_exit(void) {
+	semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+void NMI_Handler(void) { /* print the handler name over the UART, then ask the host to stop */
+  hal_send_str("NMI_Handler");
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit); /* NOTE(review): reports ApplicationExit even though this is an error path */
+}
+
+void HardFault_Handler(void) { /* same pattern as NMI_Handler */
+  hal_send_str("HardFault_Handler");
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+void MemManage_Handler(void) { /* same pattern as NMI_Handler */
+  hal_send_str("MemManage_Handler");
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+void BusFault_Handler(void) { /* same pattern as NMI_Handler */
+  hal_send_str("BusFault_Handler");
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+void UsageFault_Handler(void) { /* same pattern as NMI_Handler */
+  hal_send_str("UsageFault_Handler");
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+void SVC_Handler(void) { /* same pattern as NMI_Handler */
+  hal_send_str("SVC_Handler");
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+void DebugMon_Handler(void) { /* same pattern as NMI_Handler */
+  hal_send_str("DebugMon_Handler");
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+void PendSV_Handler(void) { /* same pattern as NMI_Handler */
+  hal_send_str("PendSV_Handler");
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+void Default_Handler(void) { /* no message is printed: only the exit request is made */
+  semihosting_syscall(REPORT_EXCEPTION, ApplicationExit);
+}
+
+#endif /* !defined(NO_SEMIHOSTING_EXIT) */
+
+/* End of BSS is where the heap starts (defined in the linker script) */
+extern char end;
+static char* heap_end = &end;
+
+void* __wrap__sbrk (int incr)
+{
+  /* Move the program break by incr bytes and return the previous break.
+   * No limit is enforced here. */
+  char* const old_break = heap_end;
+
+  heap_end = old_break + incr;
+  return (void *) old_break;
+}
+
+size_t hal_get_stack_size(void)
+{
+  register char* sp_now;
+  __asm__ volatile ("mov %0, sp" : "=r" (sp_now)); /* read the stack pointer */
+  return sp_now - heap_end; /* distance from the current break up to sp */
+}
+
+const uint32_t stackpattern = 0xDEADBEEFlu;
+
+static void* last_sp = NULL;
+
+void hal_spraystack(void)
+{
+  /* Fill memory from heap_end up to the current sp with stackpattern. */
+  char* _heap_end = heap_end;
+  asm volatile ("mov %0, sp\n"                 /* last_sp = sp */
+                ".L%=:\n\t"
+                "str %2, [%1], #4\n\t"         /* store the pattern, advance the cursor by 4 */
+                "cmp %1, %0\n\t"
+                "blt .L%=\n\t"                 /* repeat while cursor < last_sp */
+                : "+r" (last_sp), "+r" (_heap_end) : "r" (stackpattern) : "cc", "memory");
+}
+
+size_t hal_checkstack(void)
+{ /* Returns the byte count from the first word above heap_end that no longer holds stackpattern up to last_sp. */
+  char* cur = heap_end; /* scratch cursor: the asm advances it, so it must be a read-write operand */
+  size_t result = (size_t) ((uintptr_t) last_sp - (uintptr_t) cur); /* size of the sprayed region */
+  asm volatile(".L%=:\n\t"
+               "ldr ip, [%1], #4\n\t"      /* load the next word, advance the cursor */
+               "cmp ip, %3\n\t"
+               "ite eq\n\t"
+               "subeq %0, #4\n\t"          /* still the pattern: this word was not used */
+               "bne .LE%=\n\t"             /* first overwritten word: stop scanning */
+               "cmp %1, %2\n\t"
+               "blt .L%=\n\t"              /* continue while cursor < last_sp */
+               ".LE%=:\n"
+               : "+r"(result), "+r"(cur) : "r" (last_sp), "r" (stackpattern) : "ip", "cc", "memory");
+  return result;
+}
+
+/* Implement some system calls to shut up the linker warnings */
+
+#include <errno.h>
+#undef errno
+extern int errno;
+
+int __wrap__open(char *file, int flags, int mode)
+{
+  errno = ENOSYS; /* stub: always fails */
+  (void) file;
+  (void) flags;
+  (void) mode;
+  return -1;
+}
+
+int __wrap__close(int fd)
+{
+  errno = ENOSYS; /* stub: always fails */
+  (void) fd;
+  return -1;
+}
+
+#include <sys/stat.h>
+
+int __wrap__fstat(int fd, struct stat* buf)
+{
+  errno = ENOSYS; /* stub: always fails */
+  (void) fd;
+  (void) buf;
+  return -1;
+}
+
+int __wrap__getpid(void)
+{
+  errno = ENOSYS; /* stub: always fails */
+  return -1;
+}
+
+int __wrap__isatty(int file)
+{
+  errno = ENOSYS; /* stub: sets ENOSYS and returns 0 */
+  (void) file;
+  return 0;
+}
+
+int __wrap__kill(int pid, int sig)
+{
+  errno = ENOSYS; /* stub: always fails */
+  (void) pid;
+  (void) sig;
+  return -1;
+}
+
+int __wrap__lseek(int fd, int ptr, int dir)
+{
+  errno = ENOSYS; /* stub: always fails */
+  (void) fd;
+  (void) ptr;
+  (void) dir;
+  return -1;
+}
+
+int __wrap__read(int fd, char* ptr, int len)
+{
+  errno = ENOSYS; /* stub: always fails */
+  (void) fd;
+  (void) ptr;
+  (void) len;
+  return -1;
+}
+
+int __wrap__write(int fd, const char* ptr, int len)
+{
+  errno = ENOSYS; /* stub: always fails */
+  (void) fd;
+  (void) ptr;
+  (void) len;
+  return -1;
+}
diff --git a/common/hal-opencm3.c b/common/hal-opencm3.c
new file mode 100644
index 0000000..a72f44a
--- /dev/null
+++ b/common/hal-opencm3.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: Apache-2.0
+#include "hal.h"
+#include <sys/cdefs.h>
+
+#define SERIAL_BAUD 38400
+
+#include <libopencm3/cm3/dwt.h>
+#include <libopencm3/cm3/nvic.h>
+#include <libopencm3/cm3/systick.h>
+#include <libopencm3/stm32/flash.h>
+#include <libopencm3/stm32/gpio.h>
+#include <libopencm3/stm32/rcc.h>
+#include <libopencm3/stm32/usart.h>
+
+#if defined(STM32F767ZI)
+#include <libopencm3/stm32/f7/rcc.h>
+#include <libopencm3/stm32/rng.h>
+
+#define SERIAL_GPIO GPIOD
+#define SERIAL_USART USART3
+#define SERIAL_PINS (GPIO8 | GPIO9)
+#define STM32
+
+#else
+#error Unsupported libopencm3 board
+#endif
+
+#define _RCC_CAT(A, B) A##_##B
+#define RCC_ID(NAME) _RCC_CAT(RCC, NAME)
+
+__attribute__((unused)) static uint32_t _clock_freq;
+
+static void clock_setup(enum clock_mode clock) {
+#if defined(STM32F7)
+  /* Pick the clock preset: the 24MHZ table entry for CLOCK_BENCHMARK, the
+   * 216MHZ entry for every other mode. */
+  if (clock == CLOCK_BENCHMARK) {
+    rcc_clock_setup_hsi(&rcc_3v3[RCC_CLOCK_3V3_24MHZ]);
+  } else {
+    /* CLOCK_FAST and any unrecognised value */
+    rcc_clock_setup_hsi(&rcc_3v3[RCC_CLOCK_3V3_216MHZ]);
+  }
+
+  rcc_periph_clock_enable(RCC_RNG);
+  flash_art_enable();
+  flash_prefetch_enable();
+#else
+#error Unsupported platform
+#endif
+}
+
+void usart_setup() { // configure SERIAL_USART on SERIAL_PINS of SERIAL_GPIO
+#if defined(STM32F7)
+  rcc_periph_clock_enable(RCC_GPIOD);  // clock for the port named by SERIAL_GPIO
+  rcc_periph_clock_enable(RCC_USART3); // clock for the peripheral named by SERIAL_USART
+#else
+#error Unsupported platform
+#endif
+
+#if defined(STM32F7)
+  gpio_set_output_options(SERIAL_GPIO, GPIO_OTYPE_OD, GPIO_OSPEED_100MHZ,
+                          SERIAL_PINS);
+  gpio_set_af(SERIAL_GPIO, GPIO_AF7, SERIAL_PINS);
+  gpio_mode_setup(SERIAL_GPIO, GPIO_MODE_AF, GPIO_PUPD_PULLUP, SERIAL_PINS);
+  usart_set_baudrate(SERIAL_USART, SERIAL_BAUD);
+  usart_set_databits(SERIAL_USART, 8);                // 8 data bits,
+  usart_set_stopbits(SERIAL_USART, USART_STOPBITS_1); // 1 stop bit,
+  usart_set_mode(SERIAL_USART, USART_MODE_TX_RX);
+  usart_set_parity(SERIAL_USART, USART_PARITY_NONE);  // no parity,
+  usart_set_flow_control(SERIAL_USART, USART_FLOWCONTROL_NONE); // no flow control
+  usart_disable_rx_interrupt(SERIAL_USART); // no USART interrupts: hal_send_str
+  usart_disable_tx_interrupt(SERIAL_USART); // uses usart_send_blocking
+  usart_enable(SERIAL_USART);
+#endif
+}
+
+void systick_setup() { // start SysTick with its interrupt enabled
+  /* Systick is always the same on libopencm3 */
+  systick_set_clocksource(STK_CSR_CLKSOURCE_AHB);
+  systick_set_reload(0xFFFFFFu); // 2^24 - 1: hal_get_time() counts periods of 2^24 ticks
+  systick_interrupt_enable();
+  systick_counter_enable();
+}
+static volatile unsigned long long overflowcnt = 0;
+void hal_setup(const enum clock_mode clock) {
+  clock_setup(clock);
+  usart_setup();
+  systick_setup();
+  rng_enable();
+
+  // Spin until the systick handler has advanced overflowcnt once. This
+  // improves reliability of the benchmarking scripts since it makes it much
+  // less likely that the host will miss the start of the output.
+  const unsigned long long seen = overflowcnt;
+  while (overflowcnt == seen) {
+  }
+}
+
+void hal_send_str(const char *in) {
+  // Send each byte up to (not including) the NUL.
+  for (const char *p = in; *p != '\0'; ++p) {
+    usart_send_blocking(SERIAL_USART, *p);
+  }
+  // Terminate the line.
+  usart_send_blocking(SERIAL_USART, '\n');
+}
+
+void sys_tick_handler(void) { ++overflowcnt; } // presumably the libopencm3 SysTick ISR hook; counts timer expiries
+
+uint64_t hal_get_time() {
+  unsigned long long wraps, now;
+  do {
+    // sample the wrap counter first, then the systick value
+    wraps = overflowcnt;
+    now = (wraps + 1) * 16777216llu - systick_get_value();
+    // if a tick slipped in between the two reads, sample again
+  } while (overflowcnt != wraps);
+  return now;
+}
+
+/* End of BSS is where the heap starts (defined in the linker script) */
+extern char end;
+static char *heap_end = &end;
+
+void *__wrap__sbrk(int incr) {
+  // Move the program break by incr bytes and return the previous break.
+  // No limit is enforced here.
+  char *const old_break = heap_end;
+
+  heap_end = old_break + incr;
+  return (void *)old_break;
+}
+
+size_t hal_get_stack_size(void) {
+  register char *sp_now;
+  asm volatile("mov %0, sp" : "=r"(sp_now)); // read the stack pointer
+  return sp_now - heap_end; // distance from the current break up to sp
+}
+
+const uint32_t stackpattern = 0xDEADBEEFlu;
+
+static void *last_sp = NULL;
+
+void hal_spraystack(void) {
+  // Fill memory from heap_end up to the current sp with stackpattern.
+  char *_heap_end = heap_end;
+  asm volatile("mov %0, sp\n"          // last_sp = sp
+               ".L%=:\n\t"
+               "str %2, [%1], #4\n\t"  // store the pattern, advance the cursor by 4
+               "cmp %1, %0\n\t"
+               "blt .L%=\n\t"          // repeat while cursor < last_sp
+               : "+r"(last_sp), "+r"(_heap_end)
+               : "r"(stackpattern)
+               : "cc", "memory");
+}
+
+size_t hal_checkstack(void) {
+  char *cur = heap_end; // scratch cursor: the asm advances it, so it must be a read-write operand
+  size_t result = (size_t)((uintptr_t)last_sp - (uintptr_t)cur); // size of the sprayed region
+  asm volatile(".L%=:\n\t"
+               "ldr ip, [%1], #4\n\t"  // load the next word, advance the cursor
+               "cmp ip, %3\n\t"
+               "ite eq\n\t"
+               "subeq %0, #4\n\t"      // still the pattern: this word was not used
+               "bne .LE%=\n\t"         // first overwritten word: stop scanning
+               "cmp %1, %2\n\t"
+               "blt .L%=\n\t"          // continue while cursor < last_sp
+               ".LE%=:\n"
+               : "+r"(result), "+r"(cur)
+               : "r"(last_sp), "r"(stackpattern)
+               : "ip", "cc", "memory");
+  return result; // bytes from the first overwritten word up to last_sp
+}
+
+/* Implement some system calls to shut up the linker warnings */
+
+#include <errno.h>
+#undef errno
+extern int errno;
+
+int __wrap__open(char *file, int flags, int mode) {
+  errno = ENOSYS; // stub: always fails
+  (void)file;
+  (void)flags;
+  (void)mode;
+  return -1;
+}
+
+int __wrap__close(int fd) {
+  (void)fd;
+  errno = ENOSYS; // stub: always fails
+  return -1;
+}
+
+#include <sys/stat.h>
+
+int __wrap__fstat(int fd, struct stat *buf) {
+  errno = ENOSYS; // stub: always fails
+  (void)fd;
+  (void)buf;
+  return -1;
+}
+
+int __wrap__getpid(void) {
+  errno = ENOSYS; // stub: always fails
+  return -1;
+}
+
+int __wrap__isatty(int file) {
+  errno = ENOSYS; // stub: sets ENOSYS and returns 0
+  (void)file;
+  return 0;
+}
+
+int __wrap__kill(int pid, int sig) {
+  errno = ENOSYS; // stub: always fails
+  (void)pid;
+  (void)sig;
+  return -1;
+}
+
+int __wrap__lseek(int fd, int ptr, int dir) {
+  errno = ENOSYS; // stub: always fails
+  (void)fd;
+  (void)ptr;
+  (void)dir;
+  return -1;
+}
+
+int __wrap__read(int fd, char *ptr, int len) {
+  errno = ENOSYS; // stub: always fails
+  (void)fd;
+  (void)ptr;
+  (void)len;
+  return -1;
+}
+
+int __wrap__write(int fd, const char *ptr, int len) {
+  errno = ENOSYS; // stub: always fails
+  (void)fd;
+  (void)ptr;
+  (void)len;
+  return -1;
+}
diff --git a/common/keccakf1600.S b/common/keccakf1600.S
new file mode 100644
index 0000000..7075ed0
--- /dev/null
+++ b/common/keccakf1600.S
@@ -0,0 +1,1134 @@
+@
+@ Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+@ Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+@ denoted as "the implementer".
+@ Additional optimizations by Alexandre Adomnicai.
+@
+@ For more information, feedback or questions, please refer to our websites:
+@ http://keccak.noekeon.org/
+@ http://keyak.noekeon.org/
+@ http://ketje.noekeon.org/
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+@
+
+@ WARNING: These functions work only on little endian CPU with ARMv7m architecture (ARM Cortex-M3, ...).
+
+
+	.thumb
+	.syntax unified
+.text
+
+	@ Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+.macro	toBitInterleaving	x0,x1,s0,s1,t,over
+	@ even-indexed bits of x0 -> s0[15:0] (overwrites s0 if over != 0, else XORed in)
+	and		\t,\x0,#0x55555555
+	orr		\t,\t,\t, LSR #1
+	and		\t,\t,#0x33333333
+	orr		\t,\t,\t, LSR #2
+	and		\t,\t,#0x0F0F0F0F
+	orr		\t,\t,\t, LSR #4
+	and		\t,\t,#0x00FF00FF
+	bfi		\t,\t,#8, #8
+	.if \over != 0
+	lsr		\s0,\t, #8
+	.else
+	eor		\s0,\s0,\t, LSR #8
+	.endif
+	@ even-indexed bits of x1 -> XORed into s0[31:16]
+	and		\t,\x1,#0x55555555
+	orr		\t,\t,\t, LSR #1
+	and		\t,\t,#0x33333333
+	orr		\t,\t,\t, LSR #2
+	and		\t,\t,#0x0F0F0F0F
+	orr		\t,\t,\t, LSR #4
+	and		\t,\t,#0x00FF00FF
+	orr		\t,\t,\t, LSR #8
+	eor		\s0,\s0,\t, LSL #16
+	@ odd-indexed bits of x0 -> s1[15:0] (overwrites s1 if over != 0, else XORed in)
+	and		\t,\x0,#0xAAAAAAAA
+	orr		\t,\t,\t, LSL #1
+	and		\t,\t,#0xCCCCCCCC
+	orr		\t,\t,\t, LSL #2
+	and		\t,\t,#0xF0F0F0F0
+	orr		\t,\t,\t, LSL #4
+	and		\t,\t,#0xFF00FF00
+	orr		\t,\t,\t, LSL #8
+	.if \over != 0
+	lsr		\s1,\t, #16
+	.else
+	eor		\s1,\s1,\t, LSR #16
+	.endif
+	@ odd-indexed bits of x1 -> XORed into s1[31:16]
+	and		\t,\x1,#0xAAAAAAAA
+	orr		\t,\t,\t, LSL #1
+	and		\t,\t,#0xCCCCCCCC
+	orr		\t,\t,\t, LSL #2
+	and		\t,\t,#0xF0F0F0F0
+	orr		\t,\t,\t, LSL #4
+	and		\t,\t,#0xFF00FF00
+	orr		\t,\t,\t, LSL #8
+	bfc		\t, #0, #16
+	eors	\s1,\s1,\t
+	.endm
+
+	@ Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+.macro	fromBitInterleaving		x0, x1, t
+	@ x0 and x1 are converted in place, t is a scratch register
+	movs	\t, \x0					@ t = x0
+	bfi		\x0, \x1, #16, #16		@ x0 = (x0 & 0x0000FFFF) | (x1 << 16)
+	bfc		\x1, #0, #16			@ x1 = (t >> 16) | (x1 & 0xFFFF0000), completed by the orr below
+	orr		\x1, \x1, \t, LSR #16
+
+    eor		\t, \x0, \x0, LSR #8    @ t = (x0 ^ (x0 >>  8)) & 0x0000FF00UL, then x0 = x0 ^ t ^ (t <<  8)
+	and		\t, #0x0000FF00
+    eors	\x0, \x0, \t
+    eor		\x0, \x0, \t, LSL #8
+
+    eor		\t, \x0, \x0, LSR #4	@ t = (x0 ^ (x0 >>  4)) & 0x00F000F0UL, then x0 = x0 ^ t ^ (t <<  4)
+	and		\t, #0x00F000F0
+    eors	\x0, \x0, \t
+    eor		\x0, \x0, \t, LSL #4
+
+    eor		\t, \x0, \x0, LSR #2	@ t = (x0 ^ (x0 >>  2)) & 0x0C0C0C0CUL, then x0 = x0 ^ t ^ (t <<  2)
+	and		\t, #0x0C0C0C0C
+    eors	\x0, \x0, \t
+    eor		\x0, \x0, \t, LSL #2
+
+    eor		\t, \x0, \x0, LSR #1	@ t = (x0 ^ (x0 >>  1)) & 0x22222222UL, then x0 = x0 ^ t ^ (t <<  1)
+	and		\t, #0x22222222
+    eors	\x0, \x0, \t
+    eor		\x0, \x0, \t, LSL #1
+
+    eor		\t, \x1, \x1, LSR #8    @ t = (x1 ^ (x1 >>  8)) & 0x0000FF00UL, then x1 = x1 ^ t ^ (t <<  8)
+	and		\t, #0x0000FF00
+    eors	\x1, \x1, \t
+    eor		\x1, \x1, \t, LSL #8
+
+    eor		\t, \x1, \x1, LSR #4	@ t = (x1 ^ (x1 >>  4)) & 0x00F000F0UL, then x1 = x1 ^ t ^ (t <<  4)
+	and		\t, #0x00F000F0
+    eors	\x1, \x1, \t
+    eor		\x1, \x1, \t, LSL #4
+
+    eor		\t, \x1, \x1, LSR #2	@ t = (x1 ^ (x1 >>  2)) & 0x0C0C0C0CUL, then x1 = x1 ^ t ^ (t <<  2)
+	and		\t, #0x0C0C0C0C
+    eors	\x1, \x1, \t
+    eor		\x1, \x1, \t, LSL #2
+
+    eor		\t, \x1, \x1, LSR #1	@ t = (x1 ^ (x1 >>  1)) & 0x22222222UL, then x1 = x1 ^ t ^ (t <<  1)
+	and		\t, #0x22222222
+    eors	\x1, \x1, \t
+    eor		\x1, \x1, \t, LSL #1
+	.endm
+
+@	--- byte offsets of the 32-bit words in the state (two words, suffix 0 and 1, per lane name)
+.equ Aba0, 0*4
+.equ Aba1, 1*4
+.equ Abe0, 2*4
+.equ Abe1, 3*4
+.equ Abi0, 4*4
+.equ Abi1, 5*4
+.equ Abo0, 6*4
+.equ Abo1, 7*4
+.equ Abu0, 8*4
+.equ Abu1, 9*4
+.equ Aga0, 10*4
+.equ Aga1, 11*4
+.equ Age0, 12*4
+.equ Age1, 13*4
+.equ Agi0, 14*4
+.equ Agi1, 15*4
+.equ Ago0, 16*4
+.equ Ago1, 17*4
+.equ Agu0, 18*4
+.equ Agu1, 19*4
+.equ Aka0, 20*4
+.equ Aka1, 21*4
+.equ Ake0, 22*4
+.equ Ake1, 23*4
+.equ Aki0, 24*4
+.equ Aki1, 25*4
+.equ Ako0, 26*4
+.equ Ako1, 27*4
+.equ Aku0, 28*4
+.equ Aku1, 29*4
+.equ Ama0, 30*4
+.equ Ama1, 31*4
+.equ Ame0, 32*4
+.equ Ame1, 33*4
+.equ Ami0, 34*4
+.equ Ami1, 35*4
+.equ Amo0, 36*4
+.equ Amo1, 37*4
+.equ Amu0, 38*4
+.equ Amu1, 39*4
+.equ Asa0, 40*4
+.equ Asa1, 41*4
+.equ Ase0, 42*4
+.equ Ase1, 43*4
+.equ Asi0, 44*4
+.equ Asi1, 45*4
+.equ Aso0, 46*4
+.equ Aso1, 47*4
+.equ Asu0, 48*4
+.equ Asu1, 49*4
+
+@	--- byte offsets on the stack (sp-relative, e.g. [sp, #mRC])
+.equ mDa0, 0*4
+.equ mDa1, 1*4
+.equ mDo0, 2*4
+.equ mDo1, 3*4
+.equ mDi0, 4*4
+.equ mRC	, 5*4
+.equ mSize, 6*4
+
+/******************************************************************************
+ * Bitwise exclusive-OR where both operands are misaligned (i.e. src1 and src2
+ * are rotated by rot1 and rot2, respectively).
+ * src2 is rotated right by (rot1 - rot2) mod 32, so dst keeps src1's rotation.
+ *  - dst           destination register
+ *  - src1-src2     source registers
+ *  - rot1-rot2     rotation values
+ *****************************************************************************/
+.macro eorror   dst, src1, src2, rot1, rot2
+.if \rot1 >= \rot2
+    eor  \dst, \src1, \src2, ror \rot1-\rot2
+.else
+    eor  \dst, \src1, \src2, ror 32+\rot1-\rot2
+.endif
+.endm
+
+
+/******************************************************************************
+ * Bit clear instruction where both operands are misaligned (i.e. src1 and src2
+ * are rotated by rot1 and rot2, respectively): dst = src1 AND NOT src2.
+ * src2 is rotated right by (rot1 - rot2) mod 32, so dst keeps src1's rotation.
+ *  - dst           destination register
+ *  - src1-src2     source registers
+ *  - rot1-rot2     rotation values
+ *****************************************************************************/
+.macro bicror   dst, src1, src2, rot1, rot2
+.if \rot1 >= \rot2
+    bic  \dst, \src1, \src2, ror \rot1-\rot2
+.else
+    bic  \dst, \src1, \src2, ror 32+\rot1-\rot2
+.endif
+.endm
+
+
+/******************************************************************************
+ * Load 5 words from memory and XOR them all together. It is used to compute
+ * the parity columns for the Theta step.
+ * Operands may be misaligned (rotated by a certain amount of bits); the result
+ * keeps the rotation of the first word (rot1). Clobbers r1, r5, r11 and r12.
+ *  - dst           destination register
+ *  - src1-src5     byte offsets from r0 of the words to load
+ *  - rot1-rot5     rotation values
+ *****************************************************************************/
+.macro xor5   dst, src1, src2, src3, src4, src5, rot1, rot2, rot3, rot4, rot5
+    ldr.w   \dst, [r0, #\src1]
+    ldr.w     r1, [r0, #\src2]
+    ldr.w     r5, [r0, #\src3]
+    ldr      r11, [r0, #\src4]
+    ldr      r12, [r0, #\src5]
+    eorror  \dst, \dst,  r1, \rot1, \rot2
+    eorror  \dst, \dst,  r5, \rot1, \rot3
+    eorror  \dst, \dst, r11, \rot1, \rot4
+    eorror  \dst, \dst, r12, \rot1, \rot5
+.endm
+
+
+/******************************************************************************
+ * Same as xor5, except that a previous result is stored to memory after the
+ * loads. This allows to have the str instruction for free.
+ *  - dst           destination register
+ *  - src1-src5     byte offsets from r0 of the words to load
+ *  - rot1-rot5     rotation values
+ *  - strreg        register from previous calculations to be stored in memory
+ *  - stradr        base register for the store
+ *  - strofs        byte offset from stradr for the str instruction
+ *****************************************************************************/
+.macro xor5str   dst, src1, src2, src3, src4, src5, rot1, rot2, rot3, rot4, rot5, strreg, stradr, strofs
+    ldr.w    \dst, [r0, #\src1]
+    ldr.w      r1, [r0, #\src2]
+    ldr.w      r5, [r0, #\src3]
+    ldr       r11, [r0, #\src4]
+    ldr       r12, [r0, #\src5]
+    str.w \strreg, [\stradr, #\strofs]
+    eorror   \dst, \dst,  r1, \rot1, \rot2
+    eorror   \dst, \dst,  r5, \rot1, \rot3
+    eorror   \dst, \dst, r11, \rot1, \rot4
+    eorror   \dst, \dst, r12, \rot1, \rot5
+.endm
+
+
+/******************************************************************************
+ * XOR where the 2nd operand is rotated by 1 bit to the left: src2 ror (rot-1).
+ *  - dst           destination register
+ *  - src1-src2     source registers
+ *  - rot           differential rotation btw src1 & src2 (i.e. rot=rot1-rot2)
+ *****************************************************************************/
+.macro xorrol   dst, src1, src2, rot
+    eor  \dst, \src1, \src2, ror \rot-1
+.endm
+
+
+/******************************************************************************
+ * Chi step on misaligned operands: stores src1 ^ (~src2 & src3), built in r1.
+ *  - resofs        memory offset within the internal state to store the result
+ *  - src1-src3     source registers
+ *  - rot1-rot3     rotation values (the result keeps src3's rotation, rot3)
+ *****************************************************************************/
+.macro xandnotlazystr   resofs, src1, src2, src3, rot1, rot2, rot3
+    bicror  r1, \src3, \src2, \rot3, \rot2
+    eorror  r1, r1, \src1, \rot3, \rot1
+    str.w   r1, [r0, #\resofs]
+.endm
+
+
+/******************************************************************************
+ * Same as xandnotlazystr but without the str instruction which will be carried
+ * out later in order to take advantage of future ldr instructions (result: r1).
+ *  - src1-src3     source registers
+ *  - rot1-rot3     rotation values
+ *****************************************************************************/
+.macro xandnotlazy  src1, src2, src3, rot1, rot2, rot3
+    bicror  r1, \src3, \src2, \rot3, \rot2
+    eorror  r1, r1, \src1, \rot3, \rot1
+.endm
+
+
+/******************************************************************************
+ * Same as xandnotlazystr with an additional rotation in order to explicitly
+ * compute the Rho step. It is useful in KeccakRound3 in order to return to the
+ * classical representation every 4 rounds.
+ *  - resofs        memory offset within the internal state to store the result
+ *  - src1-src3     source registers
+ *  - rot1-rot3     rotation values
+ *****************************************************************************/
+.macro xandnotstr  resofs, src1, src2, src3, rot1, rot2, rot3
+    bicror  r1, \src3, \src2, \rot3, \rot2
+    eorror  r1,    r1, \src1, \rot3, \rot1
+.if \rot3 > 0
+    ror     r1, r1, #32-\rot3
+.endif
+    str.w   r1, [r0, #\resofs]
+.endm
+
+
+/******************************************************************************
+ * Same as xandnotstr but without the str instruction which will be carried
+ * out later in order to take advantage of future ldr instructions (result: r1).
+ *  - src1-src3     source registers
+ *  - rot1-rot3     rotation values
+ *****************************************************************************/
+.macro xandnot  src1, src2, src3, rot1, rot2, rot3
+    bicror  r1, \src3, \src2, \rot3, \rot2
+    eorror  r1,    r1, \src1, \rot3, \rot1
+.if \rot3 > 0
+    ror     r1, r1, #32-\rot3
+.endif
+.endm
+
+
+/******************************************************************************
+ * Same as xandnot followed by the Iota step. Note that the source registers
+ * are not specified since they are always r3, r4 and r5.
+ *  - out           output reg (useful to store the result in the next round)
+ *  - rot3, rot2    rotation values (passed in this order)
+ *  - rcofs         offset of the round constant from the pointer at [sp, #mRC]
+ *  - last          1 on the last round of the quadruple round routine: moves
+ *                  that pointer by 32 and compares the word there with 0xFF
+ *****************************************************************************/
+.macro xandnotiota    out, rot3, rot2, rcofs, last
+    bicror  r5, r5, r4, \rot3, \rot2
+    ldr     r1, [sp, #mRC]
+    ldr     r4, [r1, #\rcofs]
+.if  \last == 1
+    ldr     r7, [r1, #32]!
+    str     r1, [sp, #mRC]
+    cmp     r7, #0xFF
+.endif
+.if \rot3 > 0
+    eor    r3, r3, r5, ror 32-\rot3
+.else
+    eor.w  r3, r3, r5
+.endif
+    eor.w  \out, r4, r3
+.endm
+
+
+/******************************************************************************
+ * Add the parity bits to the state registers r3-r7. If a state register is
+ * misaligned due to previous lazy rotations (dly > 0), it is realigned with
+ * the barrel shifter while adding: rN = parN ^ (rN ror (32 - dlyN)).
+ *  - par1-par5     registers containing the parity bits
+ *  - dly1-dly5     rotation values to compute the (delayed) Rho step
+ *****************************************************************************/
+.macro addparity par1, dly1, par2, dly2, par3, dly3, par4, dly4, par5, dly5
+.if \dly1 > 0
+    eor    r3, \par1, r3, ror 32-\dly1
+.else
+    eor.w  r3, \par1, r3
+.endif
+.if \dly2 > 0
+    eor    r4, \par2, r4, ror 32-\dly2
+.else
+    eor.w  r4, \par2, r4
+.endif
+.if \dly3 > 0
+    eor    r5, \par3, r5, ror 32-\dly3
+.else
+    eor.w  r5, \par3, r5
+.endif
+.if \dly4 > 0
+    eor    r6, \par4, r6, ror 32-\dly4
+.else
+    eor.w  r6, \par4, r6
+.endif
+.if \dly5 > 0
+    eor    r7, \par5, r7, ror 32-\dly5
+.else
+    eor.w  r7, \par5, r7
+.endif
+.endm
+
+
+/******************************************************************************
+ * Apply Theta, Pi, Chi and Iota steps to half a plane (i.e. 5 32-bit words) of
+ * the internal state. Rho is computed explicitly iff \lazy == 0; otherwise it
+ * is delayed to the next round (lazy rotations via the inline barrel shifter).
+ *  - src1-src5     state offsets (from r0) of the words, loaded into r3-r7
+ *  - par1-par5     registers containing the parity bits
+ *  - rot2-rot5     rotation values to compute the current Rho step
+ *  - dly1-dly5     rotation values to compute the delayed Rho step
+ *  - ofs           offset of the round constant from the pointer at [sp, #mRC]
+ *  - last          1 on the last round of the quadruple round routine
+ *  - lazy          1 to leave the outputs misaligned (lazy Rho), 0 otherwise
+ *  - strofs        state offset (from r0) where r1 from the previous step goes
+ *  - reg           output reg related to the Iota step (to be stored later)
+ *****************************************************************************/
+.macro    KeccakThetaRhoPiChiIota   src1, par1,       dly1, \
+                                    src2, par2, rot2, dly2, \
+                                    src3, par3, rot3, dly3, \
+                                    src4, par4, rot4, dly4, \
+                                    src5, par5, rot5, dly5, \
+                                    ofs,  last, lazy, strofs, reg
+    ldr.w       r3, [r0, #\src1]
+    ldr       r4, [r0, #\src2]
+    ldr       r5, [r0, #\src3]
+    ldr       r6, [r0, #\src4]
+    ldr       r7, [r0, #\src5]
+    str.w       r1, [r0, #\strofs]
+    addparity   \par1, \dly1, \par2, \dly2, \par3, \dly3, \par4, \dly4, \par5, \dly5
+.if \lazy == 1
+    xandnotlazystr  \src2, r4, r5, r6, \rot2, \rot3, \rot4
+    xandnotlazystr  \src3, r5, r6, r7, \rot3, \rot4, \rot5
+    xandnotlazystr  \src4, r6, r7, r3, \rot4, \rot5,     0
+    xandnotlazystr  \src5, r7, r3, r4, \rot5,     0, \rot2
+.else
+    xandnotstr     \src2, r4, r5, r6, \rot2, \rot3, \rot4
+    xandnotstr     \src3, r5, r6, r7, \rot3, \rot4, \rot5
+    xandnotstr     \src4, r6, r7, r3, \rot4, \rot5,     0
+    xandnotstr     \src5, r7, r3, r4, \rot5,     0, \rot2
+.endif
+    xandnotiota    \reg, \rot3, \rot2, \ofs, \last
+.endm
+
+
+/******************************************************************************
+ * Apply Theta, Pi, and Chi steps to half a plane (i.e. 5 32-bit words) of the
+ * internal state.
+ * Note that the Rho step is calculated if and only if \lazy == 0, otherwise it
+ * is delayed until the next round using ''lazy reductions'' thanks to the
+ * inline barrel shifter.
+ *  - src1-src5     byte offsets from r0 of the five words to load
+ *  - dst1-dst4     byte offsets from r0 where outputs 1-4 are stored; dst5 is
+ *                  not referenced in the body (the fifth output is not stored)
+ *  - par1-par5     registers containing the parity bits
+ *  - rot1-rot5     rotation values to compute the current Rho step
+ *  - dly1-dly5     rotation values to compute the delayed Rho step
+ *  - lazy          Boolean to indicate whether lazy rotations are used or not
+ *  - strofs        byte offset from r0 where r1 is stored before processing
+ *****************************************************************************/
+.macro    KeccakThetaRhoPiChi   src1, dst1, par1, rot1, dly1, \
+                                src2, dst2, par2, rot2, dly2, \
+                                src3, dst3, par3, rot3, dly3, \
+                                src4, dst4, par4, rot4, dly4, \
+                                src5, dst5, par5, rot5, dly5, \
+                                lazy, strofs
+    ldr.w       r3, [r0, #\src1]    @ r3-r7 = the five input words, read relative to r0
+    ldr.w       r4, [r0, #\src2]
+    ldr.w       r5, [r0, #\src3]
+    ldr.w       r6, [r0, #\src4]
+    ldr.w       r7, [r0, #\src5]
+    str.w       r1, [r0, #\strofs]  @ store r1 (presumably the fifth output of the preceding step)
+    addparity   \par1, \dly1, \par2, \dly2, \par3, \dly3, \par4, \dly4, \par5, \dly5
+.if \lazy == 1
+    xandnotlazystr  \dst1, r3, r4, r5, \rot1, \rot2, \rot3
+    xandnotlazystr  \dst2, r4, r5, r6, \rot2, \rot3, \rot4
+    xandnotlazystr  \dst3, r5, r6, r7, \rot3, \rot4, \rot5
+    xandnotlazystr  \dst4, r6, r7, r3, \rot4, \rot5, \rot1
+    xandnotlazy            r7, r3, r4, \rot5, \rot1, \rot2    @ fifth output: no store offset is passed
+.else
+    xandnotstr      \dst1, r3, r4, r5, \rot1, \rot2, \rot3
+    xandnotstr      \dst2, r4, r5, r6, \rot2, \rot3, \rot4
+    xandnotstr      \dst3, r5, r6, r7, \rot3, \rot4, \rot5
+    xandnotstr      \dst4, r6, r7, r3, \rot4, \rot5, \rot1
+    xandnot                r7, r3, r4, \rot5, \rot1, \rot2    @ fifth output: no store offset is passed
+.endif
+.endm
+
+
+/******************************************************************************
+ * 1st round of the 4 unrolled rounds routine due to in-place processing.
+ * At the beginning of such rounds, the internal state is expected to match the
+ * classical representation (i.e. without transition and no delayed Rho step),
+ * hence every xor5 rotation and every dly argument below is 0. */
+.macro    KeccakRound0
+    xor5      r3, Abu0, Agu0, Aku0, Amu0, Asu0, 0, 0, 0, 0, 0
+    xor5      r7, Abe1, Age1, Ake1, Ame1, Ase1, 0, 0, 0, 0, 0
+    xorrol    r6, r3, r7, 32
+    xor5str   r4, Abi1, Agi1, Aki1, Ami1, Asi1, 0, 0, 0, 0, 0, r6, sp, mDa0
+    eor.w     r6, r3, r4
+    xor5str   r3, Abo0, Ago0, Ako0, Amo0, Aso0, 0, 0, 0, 0, 0, r6, sp, mDo1
+    eor.w     r2, r7, r3
+    xor5      r7, Aba0, Aga0, Aka0, Ama0, Asa0, 0, 0, 0, 0, 0
+    xorrol   r10, r7, r4, 32
+    xor5      r4, Abo1, Ago1, Ako1, Amo1, Aso1, 0, 0, 0, 0, 0
+    eor      r14, r4, r7
+    xor5      r7, Abe0, Age0, Ake0, Ame0, Ase0, 0, 0, 0, 0, 0
+    xorrol    r6, r7, r4, 32
+    xor5str   r4, Abu1, Agu1, Aku1, Amu1, Asu1, 0, 0, 0, 0, 0, r6, sp, mDi0
+    eor.w     r8, r4, r7
+    xor5str   r7, Abi0, Agi0, Aki0, Ami0, Asi0, 0, 0, 0, 0, 0, r8, sp, mDa1
+    xorrol    r9, r7, r4, 32
+    xor5str   r4, Aba1, Aga1, Aka1, Ama1, Asa1, 0, 0, 0, 0, 0, r9, sp, mDo0
+    eor      r11, r4, r7
+    xorrol   r12, r3, r4, 32
+    KeccakThetaRhoPiChi Abo0, Aka1,  r9, 14, 0, \
+                        Agu0, Ame1, r12, 10, 0, \
+                        Aka1, Asi1,  r8,  2, 0, \
+                        Ame1, Abo0, r11, 23, 0, \
+                        Asi1, Agu0,  r2, 31, 0, \
+                        1, Aka1
+    KeccakThetaRhoPiChi Abe0, Asa1, r10,  0, 0, \
+                        Agi1, Abe0,  r2,  3, 0, \
+                        Ako0, Agi1,  r9, 12, 0, \
+                        Amu1, Ako0, r14,  4, 0, \
+                        Asa1, Amu1,  r8,  9, 0, \
+                        1, Agu0
+    ldr         r8, [sp, #mDa0]    @ reload the stack slot mDa0 (presumably filled by the xor5str above)
+    KeccakThetaRhoPiChi Abu1, Aga0, r14, 14, 0, \
+                        Aga0, Ake0,  r8, 18, 0, \
+                        Ake0, Ami1, r10,  5, 0, \
+                        Ami1, Aso0,  r2,  8, 0, \
+                        Aso0, Abu1,  r9, 28, 0, \
+                        1, Amu1
+    KeccakThetaRhoPiChi Abi1, Ama0,  r2, 31, 0, \
+                        Ago0, Ase1,  r9, 27, 0, \
+                        Aku0, Abi1, r12, 19, 0, \
+                        Ama0, Ago0,  r8, 20, 0, \
+                        Ase1, Aku0, r11,  1, 0, \
+                        1, Abu1
+    ldr         r9, [sp, #mDo1]    @ reload the stack slot mDo1
+    KeccakThetaRhoPiChiIota Aba0,  r8,  0,    \
+                            Age0, r10, 22, 0, \
+                            Aki1,  r2, 22, 0, \
+                            Amo1,  r9, 11, 0, \
+                            Asu0, r12,  7, 0, \
+                            0, 0, 1, Aku0, r1
+    ldr.w       r2, [sp, #mDi0]    @ reload the stack slot mDi0
+    KeccakThetaRhoPiChi Abo1, Aka0,  r9, 14, 0, \
+                        Agu1, Ame0, r14, 10, 0, \
+                        Aka0, Asi0,  r8,  1, 0, \
+                        Ame0, Abo1, r10, 22, 0, \
+                        Asi0, Agu1,  r2, 30, 0, \
+                        1, Aba0
+    KeccakThetaRhoPiChi Abe1, Asa0, r11,  1, 0, \
+                        Agi0, Abe1,  r2,  3, 0, \
+                        Ako1, Agi0,  r9, 13, 0, \
+                        Amu0, Ako1, r12,  4, 0, \
+                        Asa0, Amu0,  r8,  9, 0, \
+                        1, Agu1
+    ldr         r8, [sp, #mDa1]    @ reload the stack slot mDa1
+    KeccakThetaRhoPiChi Abu0, Aga1, r12, 13, 0, \
+                        Aga1, Ake1,  r8, 18, 0, \
+                        Ake1, Ami0, r11,  5, 0, \
+                        Ami0, Aso1,  r2,  7, 0, \
+                        Aso1, Abu0,  r9, 28, 0, \
+                        1, Amu0
+    KeccakThetaRhoPiChi Abi0, Ama1,  r2, 31, 0, \
+                        Ago1, Ase0,  r9, 28, 0, \
+                        Aku1, Abi0, r14, 20, 0, \
+                        Ama1, Ago1,  r8, 21, 0, \
+                        Ase0, Aku1, r10,  1, 0, \
+                        1, Abu0
+    ldr         r9, [sp, #mDo0]    @ reload the stack slot mDo0
+    KeccakThetaRhoPiChiIota Aba1,  r8,  0,    \
+                            Age1, r11, 22, 0, \
+                            Aki0,  r2, 21, 0, \
+                            Amo0,  r9, 10, 0, \
+                            Asu1, r14,  7, 0, \
+                            4, 0, 1, Aku1, r14
+.endm
+
+
+
+/******************************************************************************
+ * 2nd round of the 4 unrolled rounds routine due to in-place processing.
+ * Non-zero xor5 rotations and dly arguments apply the Rho step delayed above. */
+.macro    KeccakRound1
+    xor5str     r3, Asu0, Agu0, Amu0, Abu1, Aku1, 22, 10,  3, 18, 28, r14, r0, Aba1    @ r14, r0, Aba1: presumably stores the Iota output left in r14 by the previous round
+    xor5        r7, Age1, Ame0, Abe0, Ake1, Ase1, 10, 22,  4,  7, 20
+    ror         r3, 32-22
+    xorrol      r6, r3, r7, 32-10
+    xor5str     r4, Aki0, Asi0, Agi1, Ami0, Abi1,  7, 30,  9, 28,  1, r6, sp, mDa0
+    eor         r6, r3, r4, ror 32-7
+    xor5str     r3, Amo1, Abo0, Ako1, Aso0, Ago1,  0, 14,  1, 14, 31, r6, sp, mDo1
+    eor         r2, r3, r7, ror 32-10
+    xor5        r7, Aba0, Aka1, Asa0, Aga0, Ama1,  0,  2, 13,  5, 20
+    xorrol     r10, r7, r4, 32-7
+    xor5        r4, Amo0, Abo1, Ako0, Aso1, Ago0,  0, 14,  0, 13, 31
+    eor        r14, r4, r7
+    xor5        r7, Age0, Ame1, Abe1, Ake0, Ase0, 11, 23,  4,  8, 21
+    ror         r7, 32-11
+    xorrol      r6, r7, r4, 32
+    xor5str     r4, Asu1, Agu1, Amu1, Abu0, Aku0, 22, 10,  3, 18, 27, r6, sp, mDi0
+    eor         r8, r7, r4, ror 32-22
+    xor5str     r7, Aki1, Asi1, Agi0, Ami1, Abi0,  7, 31,  9, 28,  1, r8, sp, mDa1
+    ror         r7, 32-7
+    xorrol      r9, r7, r4, 32-22
+    xor5str     r4, Aba1, Aka0, Asa1, Aga1, Ama0,  0,  1, 12,  5, 19, r9, sp, mDo0
+    eor        r11, r4, r7
+    xorrol     r12, r3, r4, 32
+    KeccakThetaRhoPiChi Amo1, Asa1,  r9, 14,  0, \
+                        Agu0, Ake1, r12, 10, 10, \
+                        Asa1, Abi1,  r8,  2, 12, \
+                        Ake1, Amo1, r11, 23,  7, \
+                        Abi1, Agu0,  r2, 31,  1, \
+                        1, Asa1
+    KeccakThetaRhoPiChi Age0, Ama0, r10,  0, 11, \
+                        Asi0, Age0,  r2,  3, 30, \
+                        Ako1, Asi0,  r9, 12,  1, \
+                        Abu0, Ako1, r14,  4, 18, \
+                        Ama0, Abu0,  r8,  9, 19, \
+                        1, Agu0
+    ldr         r8, [sp, #mDa0]    @ reload the stack slot mDa0 (presumably filled by the xor5str above)
+    KeccakThetaRhoPiChi Asu1, Aka1, r14, 14, 22, \
+                        Aka1, Abe1,  r8, 18,  2, \
+                        Abe1, Ami0, r10,  5,  4, \
+                        Ami0, Ago1,  r2,  8, 28, \
+                        Ago1, Asu1,  r9, 28, 31, \
+                        1, Abu0
+    KeccakThetaRhoPiChi Aki0, Aga0,  r2, 31,  7, \
+                        Abo0, Ase1,  r9, 27, 14, \
+                        Amu0, Aki0, r12, 19,  3, \
+                        Aga0, Abo0,  r8, 20,  5, \
+                        Ase1, Amu0, r11,  1, 20, \
+                        1, Asu1
+    ldr         r9, [sp, #mDo1]    @ reload the stack slot mDo1
+    KeccakThetaRhoPiChiIota Aba0,  r8,  0,     \
+                            Ame1, r10, 22, 23, \
+                            Agi1,  r2, 22,  9, \
+                            Aso1,  r9, 11, 13, \
+                            Aku1, r12,  7, 28, \
+                            8, 0, 1, Amu0, r1
+    ldr.w         r2, [sp, #mDi0]    @ reload the stack slot mDi0
+    KeccakThetaRhoPiChi Amo0, Asa0,  r9, 14,  0, \
+                        Agu1, Ake0, r14, 10, 10, \
+                        Asa0, Abi0,  r8,  1, 13, \
+                        Ake0, Amo0, r10, 22,  8, \
+                        Abi0, Agu1,  r2, 30,  1, \
+                        1, Aba0
+    KeccakThetaRhoPiChi Age1, Ama1, r11,  1, 10, \
+                        Asi1, Age1,  r2,  3, 31, \
+                        Ako0, Asi1,  r9, 13,  0, \
+                        Abu1, Ako0, r12,  4, 18, \
+                        Ama1, Abu1,  r8,  9, 20, \
+                        1, Agu1
+    ldr         r8, [sp, #mDa1]    @ reload the stack slot mDa1
+    KeccakThetaRhoPiChi Asu0, Aka0, r12, 13, 22, \
+                        Aka0, Abe0,  r8, 18,  1, \
+                        Abe0, Ami1, r11,  5,  4, \
+                        Ami1, Ago0,  r2,  7, 28, \
+                        Ago0, Asu0,  r9, 28, 31, \
+                        1, Abu1
+    KeccakThetaRhoPiChi Aki1, Aga1,  r2, 31,  7, \
+                        Abo1, Ase0,  r9, 28, 14, \
+                        Amu1, Aki1, r14, 20,  3, \
+                        Aga1, Abo1,  r8, 21,  5, \
+                        Ase0, Amu1, r10,  1, 21, \
+                        1, Asu0
+    ldr         r9, [sp, #mDo0]    @ reload the stack slot mDo0
+    KeccakThetaRhoPiChiIota Aba1,  r8,  0,     \
+                            Ame0, r11, 22, 22, \
+                            Agi0,  r2, 21,  9, \
+                            Aso0,  r9, 10, 14, \
+                            Aku0, r14,  7, 27, \
+                            12, 0, 1, Amu1, r14
+.endm
+
+/******************************************************************************
+ * 3rd round of the 4 unrolled rounds routine due to in-place processing.
+ * Same rotation and dly tables as the 2nd round; only the lane offsets differ. */
+.macro    KeccakRound2
+    xor5str     r3, Aku1, Agu0, Abu1, Asu1, Amu1, 22, 10,  3, 18, 28, r14, r0, Aba1    @ r14, r0, Aba1: presumably stores the Iota output left in r14 by the previous round
+    xor5        r7, Ame0, Ake0, Age0, Abe0, Ase1, 10, 22,  4,  7, 20
+    ror         r3, 32-22
+    xorrol      r6, r3, r7, 32-10
+    xor5str     r4, Agi0, Abi0, Asi0, Ami1, Aki0,  7, 30,  9, 28,  1, r6, sp, mDa0
+    eor         r6, r3, r4, ror 32-7
+    xor5str     r3, Aso1, Amo1, Ako0, Ago1, Abo1,  0, 14,  1, 14, 31, r6, sp, mDo1
+    eor         r2, r3, r7, ror 32-10
+    xor5        r7, Aba0, Asa1, Ama1, Aka1, Aga1,  0,  2, 13,  5, 20
+    xorrol     r10, r7, r4, 32-7
+    xor5        r4, Aso0, Amo0, Ako1, Ago0, Abo0,  0, 14,  0, 13, 31
+    eor        r14, r4, r7
+    xor5        r7, Ame1, Ake1, Age1, Abe1, Ase0, 11, 23,  4,  8, 21
+    ror         r7, 32-11
+    xorrol      r6, r7, r4, 32
+    xor5str     r4, Aku0, Agu1, Abu0, Asu0, Amu0, 22, 10,  3, 18, 27, r6, sp, mDi0
+    eor         r8, r7, r4, ror 32-22
+    xor5str     r7, Agi1, Abi1, Asi1, Ami0, Aki1,  7, 31,  9, 28,  1, r8, sp, mDa1
+    ror         r7, 32-7
+    xorrol      r9, r7, r4, 32-22
+    xor5str     r4, Aba1, Asa0, Ama0, Aka0, Aga0,  0,  1, 12,  5, 19, r9, sp, mDo0
+    eor        r11, r4, r7
+    xorrol     r12, r3, r4, 32
+    KeccakThetaRhoPiChi Aso1, Ama0,  r9, 14,  0, \
+                        Agu0, Abe0, r12, 10, 10, \
+                        Ama0, Aki0,  r8,  2, 12, \
+                        Abe0, Aso1, r11, 23,  7, \
+                        Aki0, Agu0,  r2, 31,  1, \
+                        1, Ama0
+    KeccakThetaRhoPiChi Ame1, Aga0, r10,  0, 11, \
+                        Abi0, Ame1,  r2,  3, 30, \
+                        Ako0, Abi0,  r9, 12,  1, \
+                        Asu0, Ako0, r14,  4, 18, \
+                        Aga0, Asu0,  r8,  9, 19, \
+                        1, Agu0
+    ldr     r8, [sp, #mDa0]    @ reload the stack slot mDa0 (presumably filled by the xor5str above)
+    KeccakThetaRhoPiChi Aku0, Asa1, r14, 14, 22, \
+                        Asa1, Age1,  r8, 18,  2, \
+                        Age1, Ami1, r10,  5,  4, \
+                        Ami1, Abo1,  r2,  8, 28, \
+                        Abo1, Aku0,  r9, 28, 31, \
+                        1, Asu0
+    KeccakThetaRhoPiChi Agi0, Aka1,  r2, 31,  7, \
+                        Amo1, Ase1,  r9, 27, 14, \
+                        Abu1, Agi0, r12, 19,  3, \
+                        Aka1, Amo1,  r8, 20,  5, \
+                        Ase1, Abu1, r11,  1, 20, \
+                        1, Aku0
+    ldr     r9, [sp, #mDo1]    @ reload the stack slot mDo1
+    KeccakThetaRhoPiChiIota Aba0, r8,  0,     \
+                            Ake1, r10,22, 23, \
+                            Asi0, r2, 22,  9, \
+                            Ago0, r9, 11, 13, \
+                            Amu1, r12, 7, 28, \
+                            16, 0, 1, Abu1, r1
+    ldr.w   r2, [sp, #mDi0]    @ reload the stack slot mDi0
+    KeccakThetaRhoPiChi Aso0, Ama1,  r9, 14,  0, \
+                        Agu1, Abe1, r14, 10, 10, \
+                        Ama1, Aki1,  r8,  1, 13, \
+                        Abe1, Aso0, r10, 22,  8, \
+                        Aki1, Agu1,  r2, 30,  1, \
+                        1, Aba0
+    KeccakThetaRhoPiChi Ame0, Aga1, r11,  1, 10, \
+                        Abi1, Ame0,  r2,  3, 31, \
+                        Ako1, Abi1,  r9, 13,  0, \
+                        Asu1, Ako1, r12,  4, 18, \
+                        Aga1, Asu1,  r8,  9, 20, \
+                        1, Agu1
+    ldr     r8, [sp, #mDa1]    @ reload the stack slot mDa1
+    KeccakThetaRhoPiChi Aku1, Asa0, r12, 13, 22, \
+                        Asa0, Age0,  r8, 18,  1, \
+                        Age0, Ami0, r11,  5,  4, \
+                        Ami0, Abo0,  r2,  7, 28, \
+                        Abo0, Aku1,  r9, 28, 31, \
+                        1, Asu1
+    KeccakThetaRhoPiChi Agi1, Aka0,  r2, 31,  7, \
+                        Amo0, Ase0,  r9, 28, 14, \
+                        Abu0, Agi1, r14, 20,  3, \
+                        Aka0, Amo0,  r8, 21,  5, \
+                        Ase0, Abu0, r10,  1, 21, \
+                        1, Aku1
+    ldr     r9, [sp, #mDo0]    @ reload the stack slot mDo0
+    KeccakThetaRhoPiChiIota Aba1,  r8,  0,     \
+                            Ake0, r11, 22, 22, \
+                            Asi1,  r2, 21,  9, \
+                            Ago1,  r9, 10, 14, \
+                            Amu0, r14,  7, 27, \
+                            20, 0, 1, Abu0, r14
+
+.endm
+
+
+/******************************************************************************
+ * 4th round of the 4 unrolled rounds routine due to in-place processing.
+ * Note that the Rho step is *not* delayed (lazy = 0 in every call below) so
+ * that the internal state is compliant w/ the classical representation at the
+ * end of the routine. Only the final Iota call passes last = 1. */
+.macro    KeccakRound3
+    xor5str     r3, Amu1, Agu0, Asu1, Aku0, Abu0, 22, 10,  3, 18, 28, r14, r0, Aba1    @ r14, r0, Aba1: presumably stores the Iota output left in r14 by the previous round
+    xor5        r7, Ake0, Abe1, Ame1, Age0, Ase1, 10, 22,  4,  7, 20
+    ror         r3, 32-22
+    xorrol      r6, r3, r7, 32-10
+    xor5str     r4, Asi1, Aki1, Abi0, Ami0, Agi0,  7, 30,  9, 28,  1, r6, sp, mDa0
+    eor         r6, r3, r4, ror 32-7
+    xor5str     r3, Ago0, Aso1, Ako1, Abo1, Amo0,  0, 14,  1, 14, 31, r6, sp, mDo1
+    eor         r2, r3, r7, ror 32-10
+    xor5        r7, Aba0, Ama0, Aga1, Asa1, Aka0,  0,  2, 13,  5, 20
+    xorrol     r10, r7, r4, 32-7
+    xor5        r4, Ago1, Aso0, Ako0, Abo0, Amo1,  0, 14,  0, 13, 31
+    eor        r14, r4, r7
+    xor5        r7, Ake1, Abe0, Ame0, Age1, Ase0, 11, 23,  4,  8, 21
+    ror         r7, #32-11
+    xorrol      r6, r7, r4, 32
+    xor5str     r4, Amu0, Agu1, Asu0, Aku1, Abu1, 22, 10,  3, 18, 27, r6, sp, mDi0
+    eor         r8, r7, r4, ror 32-22
+    xor5str     r7, Asi0, Aki0, Abi1, Ami1, Agi1,  7, 31,  9, 28,  1, r8, sp, mDa1
+    ror         r7, 32-7
+    xorrol      r9, r7, r4, 32-22
+    xor5str     r4, Aba1, Ama1, Aga0, Asa0, Aka1,  0,  1, 12,  5, 19, r9, sp, mDo0
+    eor        r11, r4, r7
+    xorrol     r12, r3, r4, 32
+    KeccakThetaRhoPiChi     Ago0, Aga0,  r9, 14,  0, \
+                            Agu0, Age0, r12, 10, 10, \
+                            Aga0, Agi0,  r8,  2, 12, \
+                            Age0, Ago0, r11, 23,  7, \
+                            Agi0, Agu0,  r2, 31,  1, \
+                            0, Aga0
+    KeccakThetaRhoPiChi     Ake1, Aka1, r10,  0, 11, \
+                            Aki1, Ake1,  r2,  3, 30, \
+                            Ako1, Aki1,  r9, 12,  1, \
+                            Aku1, Ako1, r14,  4, 18, \
+                            Aka1, Aku1,  r8,  9, 19, \
+                            0, Agu0
+    ldr     r8, [sp, #mDa0]    @ reload the stack slot mDa0 (presumably filled by the xor5str above)
+    KeccakThetaRhoPiChi     Amu0, Ama0, r14, 14, 22, \
+                            Ama0, Ame0,  r8, 18,  2, \
+                            Ame0, Ami0, r10,  5,  4, \
+                            Ami0, Amo0,  r2,  8, 28, \
+                            Amo0, Amu0,  r9, 28, 31, \
+                            0, Aku1
+    KeccakThetaRhoPiChi     Asi1, Asa1,  r2, 31,  7, \
+                            Aso1, Ase1,  r9, 27, 14, \
+                            Asu1, Asi1, r12, 19,  3, \
+                            Asa1, Aso1,  r8, 20,  5, \
+                            Ase1, Asu1, r11,  1, 20, \
+                            0, Amu0
+    ldr     r9, [sp, #mDo1]    @ reload the stack slot mDo1
+    KeccakThetaRhoPiChiIota Aba0,  r8,  0,     \
+                            Abe0, r10, 22, 23, \
+                            Abi0,  r2, 22,  9, \
+                            Abo0,  r9, 11, 13, \
+                            Abu0, r12,  7, 28, \
+                            24, 0, 0, Asu1, r1
+    ldr.w   r2, [sp, #mDi0]    @ reload the stack slot mDi0
+    KeccakThetaRhoPiChi     Ago1, Aga1,  r9, 14,  0, \
+                            Agu1, Age1, r14, 10, 10, \
+                            Aga1, Agi1,  r8,  1, 13, \
+                            Age1, Ago1, r10, 22,  8, \
+                            Agi1, Agu1,  r2, 30,  1, \
+                            0, Aba0
+    KeccakThetaRhoPiChi     Ake0, Aka0, r11,  1, 10, \
+                            Aki0, Ake0,  r2,  3, 31, \
+                            Ako0, Aki0,  r9, 13,  0, \
+                            Aku0, Ako0, r12,  4, 18, \
+                            Aka0, Aku0,  r8,  9, 20, \
+                            0, Agu1
+    ldr     r8, [sp, #mDa1]    @ reload the stack slot mDa1
+    KeccakThetaRhoPiChi     Amu1, Ama1, r12, 13, 22, \
+                            Ama1, Ame1,  r8, 18,  1, \
+                            Ame1, Ami1, r11,  5,  4, \
+                            Ami1, Amo1,  r2,  7, 28, \
+                            Amo1, Amu1,  r9, 28, 31, \
+                            0, Aku0
+    KeccakThetaRhoPiChi     Asi0, Asa0,  r2, 31,  7, \
+                            Aso0, Ase0,  r9, 28, 14, \
+                            Asu0, Asi0, r14, 20,  3, \
+                            Asa0, Aso0,  r8, 21,  5, \
+                            Ase0, Asu0, r10,  1, 21, \
+                            0, Amu1
+    ldr     r9, [sp, #mDo0]    @ reload the stack slot mDo0
+    KeccakThetaRhoPiChiIota Aba1,  r8, 0,      \
+                            Abe1, r11, 22, 22, \
+                            Abi1,  r2, 21,  9, \
+                            Abo1,  r9, 10, 14, \
+                            Abu1, r14,  7, 27, \
+                            28, 1, 0, Asu0, r1
+    str.w r1, [r0, #Aba1]    @ store the reg output (r1) of the final Iota call at Aba1
+.endm
+
+
+@----------------------------------------------------------------------------
+@
+@ void KeccakF1600_Initialize( void )
+@ No-op: returns to the caller immediately without touching any state.
+.align 8
+.global   KeccakF1600_Initialize
+KeccakF1600_Initialize:
+	bx		lr
+
+
+
+@----------------------------------------------------------------------------
+@ Absorbs length bytes of data into the state, starting at byte position offset.
+@ void KeccakF1600_StateXORBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+@ Order: partial first lane (offset not a multiple of 8), whole 8-byte lanes, trailing bytes.
+.align 8
+.global   KeccakF1600_StateXORBytes
+KeccakF1600_StateXORBytes:
+	cbz		r3, KeccakF1600_StateXORBytes_Exit1	@ .if length != 0
+	push	{r4 - r8, lr}							@ then
+	bic		r4, r2, #7								@ offset &= ~7
+	adds	r0, r0, r4								@ add whole lane offset to state pointer
+	ands	r2, r2, #7								@ offset &= 7 (part not lane aligned)
+	beq		KeccakF1600_StateXORBytes_CheckLanes	@ .if offset != 0
+	movs	r4, r3									@ then, do remaining bytes in first lane
+	rsb		r5, r2, #8								@ max size in lane = 8 - offset
+	cmp		r4, r5									@ r4 = min(length, 8 - offset)
+	ble		KeccakF1600_StateXORBytes_BytesAlign	@ NOTE(review): signed compare, so length is assumed < 2^31
+	movs	r4, r5
+KeccakF1600_StateXORBytes_BytesAlign:
+	sub		r8, r3, r4								@ size left
+	movs	r3, r4									@ length argument for the in-lane helper
+	bl		__KeccakF1600_StateXORBytesInLane
+	mov		r3, r8									@ r3 = bytes still to process
+KeccakF1600_StateXORBytes_CheckLanes:
+	lsrs	r2, r3, #3								@ .if length >= 8
+	beq		KeccakF1600_StateXORBytes_Bytes
+	mov		r8, r3									@ keep length in r8: the helper changes r2-r7
+	bl		__KeccakF1600_StateXORLanes
+	and		r3, r8, #7								@ bytes left after the whole lanes
+KeccakF1600_StateXORBytes_Bytes:
+	cbz		r3, KeccakF1600_StateXORBytes_Exit
+	movs	r2, #0									@ trailing bytes start at offset 0 of the next lane
+	bl		__KeccakF1600_StateXORBytesInLane
+KeccakF1600_StateXORBytes_Exit:
+	pop		{r4 - r8, pc}
+KeccakF1600_StateXORBytes_Exit1:
+	bx		lr
+
+
+@----------------------------------------------------------------------------
+@
+@ __KeccakF1600_StateXORLanes
+@ Combines laneCount 8-byte lanes of data into the state via toBitInterleaving.
+@ Input:
+@  r0 state pointer
+@  r1 data pointer
+@  r2 laneCount (must be >= 1: the count is tested after the first lane)
+@
+@ Output:
+@  r0 state pointer next lane
+@  r1 data pointer next byte to input
+@
+@ Changed: r2-r7
+@
+.align 8
+__KeccakF1600_StateXORLanes:
+__KeccakF1600_StateXORLanes_LoopAligned:
+	ldr		r4, [r1], #4			@ r4 = first data word of the lane, data pointer += 4
+	ldr		r5, [r1], #4			@ r5 = second data word of the lane, data pointer += 4
+	ldrd    r6, r7, [r0]			@ r6:r7 = current state lane
+	toBitInterleaving	r4, r5, r6, r7, r3, 0	@ macro defined elsewhere; r3 is presumably scratch
+	strd	r6, r7, [r0], #8		@ write the lane back, state pointer += 8
+	subs	r2, r2, #1
+	bne		__KeccakF1600_StateXORLanes_LoopAligned
+	bx		lr
+
+
+@----------------------------------------------------------------------------
+@
+@ __KeccakF1600_StateXORBytesInLane
+@ Builds a zero-padded 8-byte lane from the data on the stack, then absorbs it.
+@ Input:
+@  r0 state pointer
+@  r1 data pointer
+@  r2 offset in lane
+@  r3 length (must be >= 1: the count is tested after the first byte)
+@
+@ Output:
+@  r0 state pointer next lane
+@  r1 data pointer next byte to input
+@
+@  Changed: r2-r7
+@
+.align 8
+__KeccakF1600_StateXORBytesInLane:
+	movs	r4, #0
+	movs	r5, #0
+	push	{ r4 - r5 }				@ 8-byte zeroed scratch lane on the stack
+	add		r2, r2, sp				@ r2 = address of byte offset inside the scratch lane
+__KeccakF1600_StateXORBytesInLane_Loop:
+	ldrb	r5, [r1], #1			@ copy one data byte into the scratch lane
+	strb	r5, [r2], #1
+	subs	r3, r3, #1
+	bne		__KeccakF1600_StateXORBytesInLane_Loop
+	pop		{ r4 - r5 }				@ r4:r5 = scratch lane, zero outside the copied bytes
+	ldrd    r6, r7, [r0]			@ r6:r7 = current state lane
+	toBitInterleaving	r4, r5, r6, r7, r3, 0	@ macro defined elsewhere; r3 is presumably scratch
+	strd	r6, r7, [r0], #8		@ write the lane back, state pointer += 8
+	bx		lr
+
+
+
+
+@----------------------------------------------------------------------------
+@ Copies length bytes of the state, starting at byte position offset, into data.
+@ void KeccakF1600_StateExtractBytes(void *state, unsigned char *data, unsigned int offset, unsigned int length)
+@ Order: partial first lane (offset not a multiple of 8), whole 8-byte lanes, trailing bytes.
+.align 8
+.global   KeccakF1600_StateExtractBytes
+KeccakF1600_StateExtractBytes:
+	cbz		r3, KeccakF1600_StateExtractBytes_Exit1	@ .if length != 0
+	push	{r4 - r8, lr}							@ then
+	bic		r4, r2, #7								@ offset &= ~7
+	adds	r0, r0, r4								@ add whole lane offset to state pointer
+	ands	r2, r2, #7								@ offset &= 7 (part not lane aligned)
+	beq		KeccakF1600_StateExtractBytes_CheckLanes	@ .if offset != 0
+	movs	r4, r3									@ then, do remaining bytes in first lane
+	rsb		r5, r2, #8								@ max size in lane = 8 - offset
+	cmp		r4, r5									@ r4 = min(length, 8 - offset)
+	ble		KeccakF1600_StateExtractBytes_BytesAlign	@ NOTE(review): signed compare, so length is assumed < 2^31
+	movs	r4, r5
+KeccakF1600_StateExtractBytes_BytesAlign:
+	sub		r8, r3, r4								@ size left
+	movs	r3, r4									@ length argument for the in-lane helper
+	bl		__KeccakF1600_StateExtractBytesInLane
+	mov		r3, r8									@ r3 = bytes still to extract
+KeccakF1600_StateExtractBytes_CheckLanes:
+	lsrs	r2, r3, #3								@ .if length >= 8
+	beq		KeccakF1600_StateExtractBytes_Bytes
+	mov		r8, r3									@ keep length in r8: the helper changes r2-r5
+	bl		__KeccakF1600_StateExtractLanes
+	and		r3, r8, #7								@ bytes left after the whole lanes
+KeccakF1600_StateExtractBytes_Bytes:
+	cbz		r3, KeccakF1600_StateExtractBytes_Exit
+	movs	r2, #0									@ trailing bytes start at offset 0 of the next lane
+	bl		__KeccakF1600_StateExtractBytesInLane
+KeccakF1600_StateExtractBytes_Exit:
+	pop		{r4 - r8, pc}
+KeccakF1600_StateExtractBytes_Exit1:
+	bx		lr
+
+
+@----------------------------------------------------------------------------
+@
+@ __KeccakF1600_StateExtractLanes
+@ Writes laneCount 8-byte lanes of the state to data via fromBitInterleaving.
+@ Input:
+@  r0 state pointer
+@  r1 data pointer
+@  r2 laneCount (must be >= 1: the count is tested after the first lane)
+@
+@ Output:
+@  r0 state pointer next lane
+@  r1 data pointer next byte to output
+@
+@ Changed: r2-r5
+@
+.align 8
+__KeccakF1600_StateExtractLanes:
+__KeccakF1600_StateExtractLanes_LoopAligned:
+	ldrd	r4, r5, [r0], #8		@ r4:r5 = state lane, state pointer += 8
+	fromBitInterleaving	r4, r5, r3	@ macro defined elsewhere; r3 is presumably scratch
+	str		r4, [r1], #4			@ first output word, data pointer += 4
+	subs	r2, r2, #1				@ sets the flags for the bne; the str below leaves them unchanged
+	str		r5, [r1], #4			@ second output word, data pointer += 4
+	bne		__KeccakF1600_StateExtractLanes_LoopAligned
+	bx		lr
+
+
+@----------------------------------------------------------------------------
+@
+@ __KeccakF1600_StateExtractBytesInLane
+@ Converts one state lane onto the stack, then copies length bytes of it to data.
+@ Input:
+@  r0 state pointer
+@  r1 data pointer
+@  r2 offset in lane
+@  r3 length (must be >= 1: the count is tested after the first byte)
+@
+@ Output:
+@  r0 state pointer next lane
+@  r1 data pointer next byte to output
+@
+@  Changed: r2-r6
+@
+.align 8
+__KeccakF1600_StateExtractBytesInLane:
+	ldrd	r4, r5, [r0], #8		@ r4:r5 = state lane, state pointer += 8
+	fromBitInterleaving	r4, r5, r6	@ macro defined elsewhere; r6 is presumably scratch
+	push	{r4, r5}				@ place the converted lane on the stack for byte access
+	add		r2, sp, r2				@ r2 = address of byte offset inside that lane
+__KeccakF1600_StateExtractBytesInLane_Loop:
+	ldrb	r4, [r2], #1			@ copy one lane byte to the output
+	subs	r3, r3, #1				@ sets the flags for the bne; the strb below leaves them unchanged
+	strb	r4, [r1], #1
+	bne		__KeccakF1600_StateExtractBytesInLane_Loop
+	add		sp, #8					@ drop the 8-byte lane pushed above
+	bx		lr
+
+
+
+.align 8
+KeccakF1600_StatePermute_RoundConstantsWithTerminator:
+	@ per round: word 0 = even-indexed bits, word 1 = odd-indexed bits of the 64-bit round constant
+		.long 		0x00000001,	0x00000000	@ rounds 0-3 (one group per pass of the four unrolled rounds)
+		.long 		0x00000000,	0x00000089
+		.long 		0x00000000,	0x8000008b
+		.long 		0x00000000,	0x80008080
+
+		.long 		0x00000001,	0x0000008b	@ rounds 4-7
+		.long 		0x00000001,	0x00008000
+		.long 		0x00000001,	0x80008088
+		.long 		0x00000001,	0x80000082
+
+		.long 		0x00000000,	0x0000000b	@ rounds 8-11
+		.long 		0x00000000,	0x0000000a
+		.long 		0x00000001,	0x00008082
+		.long 		0x00000000,	0x00008003
+
+		.long 		0x00000001,	0x0000808b	@ rounds 12-15
+		.long 		0x00000001,	0x8000000b
+		.long 		0x00000001,	0x8000008a
+		.long 		0x00000001,	0x80000081
+
+		.long 		0x00000000,	0x80000081	@ rounds 16-19
+		.long 		0x00000000,	0x80000008
+		.long 		0x00000000,	0x00000083
+		.long 		0x00000000,	0x80008003
+
+		.long 		0x00000001,	0x80008088	@ rounds 20-23
+		.long 		0x00000000,	0x80000088
+		.long 		0x00000001,	0x00008000
+		.long 		0x00000000,	0x80008082
+
+		.long 		0x000000FF	@terminator
+
+@----------------------------------------------------------------------------
+@ Runs the four unrolled round macros in a loop over the state pointed to by r0.
+@ void KeccakF1600_StatePermute( void *state )
+@ r4-r12 and lr are saved on entry and restored on exit.
+.align 8
+.global   KeccakF1600_StatePermute
+KeccakF1600_StatePermute:
+	adr		r1, KeccakF1600_StatePermute_RoundConstantsWithTerminator	@ r1 = address of the round-constant table
+	push	{ r4 - r12, lr }
+	sub		sp, #mSize					@ reserve the scratch frame (mSize is defined elsewhere)
+	str		r1, [sp, #mRC]				@ keep the round-constant pointer in the frame
+KeccakF1600_StatePermute_RoundLoop:
+	KeccakRound0
+	KeccakRound1
+	KeccakRound2
+	KeccakRound3
+	bne		KeccakF1600_StatePermute_RoundLoop	@ flags presumably come from the last Iota step of KeccakRound3 - TODO confirm
+	add		sp, #mSize					@ release the scratch frame
+	pop		{ r4 - r12, pc }
+
diff --git a/common/keccaktest.c b/common/keccaktest.c
new file mode 100644
index 0000000..389bb9c
--- /dev/null
+++ b/common/keccaktest.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+#include "randombytes.h"
+#include <hal.h>
+#include <fips202.h>
+#include <sendfn.h>
+#include <randombytes.h>
+#include <string.h>
+
+#include <stdio.h>
+
+
+#include <stdio.h>
+#include <string.h>
+
+const unsigned char msg1[] = {  /* 128-byte input message hashed by test() */
+  0x84u, 0xb6u, 0x0cu, 0xb3u, 0x72u, 0x0bu, 0xf2u, 0x97u, 0x48u, 0x48u, 0x3cu, 0xf7u, 0xabu, 0xd0u, 0xd1u,
+  0xf1u, 0xd9u, 0x38u, 0x04u, 0x59u, 0xdfu, 0xa9u, 0x68u, 0x46u, 0x0cu, 0x86u, 0xe5u, 0xd1u, 0xa5u, 0x4fu,
+  0x0bu, 0x19u, 0xdau, 0xc6u, 0xa7u, 0x8bu, 0xf9u, 0x50u, 0x94u, 0x60u, 0xe2u, 0x9du, 0xd4u, 0x66u, 0xbbu,
+  0x8bu, 0xdfu, 0x04u, 0xe5u, 0x48u, 0x3bu, 0x78u, 0x2eu, 0xb7u, 0x4du, 0x64u, 0x48u, 0x16u, 0x6fu, 0x89u,
+  0x7au, 0xddu, 0x43u, 0xd2u, 0x95u, 0xe9u, 0x46u, 0x94u, 0x2au, 0xd9u, 0xa8u, 0x14u, 0xfau, 0xb9u, 0x5bu,
+  0x4au, 0xaeu, 0xdeu, 0x6au, 0xe4u, 0xc8u, 0x10u, 0x8cu, 0x8eu, 0xdau, 0xefu, 0xf9u, 0x71u, 0xf5u, 0x8fu,
+  0x7cu, 0xf9u, 0x65u, 0x66u, 0xc9u, 0xdcu, 0x9bu, 0x68u, 0x12u, 0x58u, 0x6bu, 0x70u, 0xd5u, 0xbcu, 0x78u,
+  0xe2u, 0xf8u, 0x29u, 0xecu, 0x8eu, 0x17u, 0x9au, 0x6cu, 0xd8u, 0x1du, 0x22u, 0x4bu, 0x16u, 0x11u, 0x75u,
+  0xfdu, 0x3au, 0x33u, 0xaau, 0xcfu, 0xb1u, 0x48u, 0x3fu,
+};
+
+const unsigned char md1[] = {  /* 32-byte digest that test() expects from sha3_256(msg1) */
+  0x88u, 0x14u, 0x63u, 0x0au, 0x39u, 0xdcu, 0xb9u, 0x97u, 0x92u, 0xccu, 0x4eu,
+  0x08u, 0xcau, 0xe5u, 0xddu, 0x07u, 0x89u, 0x73u, 0xd1u, 0x5cu, 0xd1u, 0x9fu,
+  0x17u, 0xbau, 0xcfu, 0x04u, 0xdeu, 0xdau, 0x9eu, 0x62u, 0xc4u, 0x5fu,
+};
+
+/* Hash msg1 with sha3_256 and compare with md1; returns 0 on match, 1 on mismatch. */
+static int test(void)
+{
+  unsigned char digest[32];
+  sha3_256(digest, msg1, sizeof(msg1));
+  if (memcmp(digest, md1, sizeof(digest)) == 0) {
+    return 0;
+  }
+  hal_send_str("ERROR SHA3-256 output does not match test vector.\n");
+  return 1;
+}
+
+/* Time sha3_256/384/512 and shake128/256 over a 32 KiB input; always returns 0. */
+static int bench(void)
+{
+  static unsigned char msg[1024*32], md[1024*32]; /* static: 64 KiB off the stack, input zeroed */
+  char str[128];
+  uint64_t t0, t1;
+#define TESTMD(MD) \
+  hal_send_str("-"); \
+  t0 = hal_get_time(); \
+  MD(md, msg, sizeof(msg)); \
+  t1 = hal_get_time(); \
+  snprintf(str, sizeof(str), #MD": %llu cycles", (unsigned long long)(t1-t0)); \
+  hal_send_str(str)
+  TESTMD(sha3_256);
+  TESTMD(sha3_384);
+  TESTMD(sha3_512);
+#define TESTXOF(XOF) \
+  hal_send_str("-"); \
+  t0 = hal_get_time(); \
+  XOF(md, sizeof(md), msg, sizeof(msg)); \
+  t1 = hal_get_time(); \
+  snprintf(str, sizeof(str), #XOF": %llu cycles", (unsigned long long)(t1-t0)); \
+  hal_send_str(str)
+  TESTXOF(shake128);
+  TESTXOF(shake256);
+  return 0;
+}
+
+/* Entry point: set up the HAL, run the SHA3-256 self-test, then the benchmarks.
+ * All output goes through hal_send_str(), framed by "===" and "###". */
+int main(void)
+{
+  hal_setup(CLOCK_BENCHMARK);
+  hal_send_str("===");
+  /* test() returns non-zero when the digest does not match */
+  hal_send_str(test() ? "ERR" : "ALL GOOD!");
+  /* the return value of bench() is not checked */
+  bench();
+  hal_send_str("###");
+  return 0;
+}
diff --git a/common/mps2/CMSDK_CM4.h b/common/mps2/CMSDK_CM4.h
new file mode 100644
index 0000000..7a6fd8e
--- /dev/null
+++ b/common/mps2/CMSDK_CM4.h
@@ -0,0 +1,1289 @@
+/* MPS2 CMSIS Library
+*
+* Copyright (c) 2006-2018 ARM Limited
+* SPDX-License-Identifier: BSD-3-Clause
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+* 1. Redistributions of source code must retain the above copyright notice,
+* this list of conditions and the following disclaimer.
+*
+* 2. Redistributions in binary form must reproduce the above copyright notice,
+* this list of conditions and the following disclaimer in the documentation
+* and/or other materials provided with the distribution.
+*
+* 3. Neither the name of the copyright holder nor the names of its contributors
+* may be used to endorse or promote products derived from this software without
+* specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************
+* @file     CMSDK_CM4.h
+* @brief    CMSIS Cortex-M4 Core Peripheral Access Layer Header File for
+*           Device CMSDK_CM4
+*
+*******************************************************************************/
+
+
+#ifndef CMSDK_CM4_H
+#define CMSDK_CM4_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* -------------------------  Interrupt Number Definition  ------------------------ */
+
+typedef enum IRQn {
+    /* -------------------  Cortex-M4 Processor Exception Numbers  -------------------- */
+    NonMaskableInt_IRQn           = -14,        /*  2 Non Maskable Interrupt          */
+    HardFault_IRQn                = -13,        /*  3 HardFault Interrupt             */
+    MemoryManagement_IRQn         = -12,        /*  4 Memory Management Interrupt     */
+    BusFault_IRQn                 = -11,        /*  5 Bus Fault Interrupt             */
+    UsageFault_IRQn               = -10,        /*  6 Usage Fault Interrupt           */
+    SVCall_IRQn                   =  -5,        /* 11 SV Call Interrupt               */
+    DebugMonitor_IRQn             =  -4,        /* 12 Debug Monitor Interrupt         */
+    PendSV_IRQn                   =  -2,        /* 14 Pend SV Interrupt               */
+    SysTick_IRQn                  =  -1,        /* 15 System Tick Interrupt           */
+
+    /******  CMSDK Specific Interrupt Numbers *********************************************************/
+    UARTRX0_IRQn                  = 0,       /*!< UART 0 RX Interrupt                               */
+    UARTTX0_IRQn                  = 1,       /*!< UART 0 TX Interrupt                               */
+    UARTRX1_IRQn                  = 2,       /*!< UART 1 RX Interrupt                               */
+    UARTTX1_IRQn                  = 3,       /*!< UART 1 TX Interrupt                               */
+    UARTRX2_IRQn                  = 4,       /*!< UART 2 RX Interrupt                               */
+    UARTTX2_IRQn                  = 5,       /*!< UART 2 TX Interrupt                               */
+    PORT0_ALL_IRQn                = 6,       /*!< Port 0 combined Interrupt                         */
+    PORT1_ALL_IRQn                = 7,       /*!< Port 1 combined Interrupt                         */
+    TIMER0_IRQn                   = 8,       /*!< TIMER 0 Interrupt                                 */
+    TIMER1_IRQn                   = 9,       /*!< TIMER 1 Interrupt                                 */
+    DUALTIMER_IRQn                = 10,      /*!< Dual Timer Interrupt                              */
+    SPI_IRQn                      = 11,      /*!< SPI Interrupt                                     */
+    UARTOVF_IRQn                  = 12,      /*!< UART 0,1,2 Overflow Interrupt                     */
+    ETHERNET_IRQn                 = 13,      /*!< Ethernet Interrupt                                */
+    I2S_IRQn                      = 14,      /*!< I2S Interrupt                                     */
+    TSC_IRQn                      = 15,      /*!< Touch Screen Interrupt                            */
+    PORT2_ALL_IRQn                = 16,      /*!< Port 2 combined Interrupt                         */
+    PORT3_ALL_IRQn                = 17,      /*!< Port 3 combined Interrupt                         */
+    UARTRX3_IRQn                  = 18,      /*!< UART 3 RX Interrupt                               */
+    UARTTX3_IRQn                  = 19,      /*!< UART 3 TX Interrupt                               */
+    UARTRX4_IRQn                  = 20,      /*!< UART 4 RX Interrupt                               */
+    UARTTX4_IRQn                  = 21,      /*!< UART 4 TX Interrupt                               */
+    ADCSPI_IRQn                   = 22,      /*!< SHIELD ADC SPI Interrupt                          */
+    SHIELDSPI_IRQn                = 23,      /*!< SHIELD SPI Combined Interrupt                     */
+    PORT0_0_IRQn                  = 24,      /*!<  GPIO Port 0 pin 0 Interrupt                      */
+    PORT0_1_IRQn                  = 25,      /*!<  GPIO Port 0 pin 1 Interrupt                      */
+    PORT0_2_IRQn                  = 26,      /*!<  GPIO Port 0 pin 2 Interrupt                      */
+    PORT0_3_IRQn                  = 27,      /*!<  GPIO Port 0 pin 3 Interrupt                      */
+    PORT0_4_IRQn                  = 28,      /*!<  GPIO Port 0 pin 4 Interrupt                      */
+    PORT0_5_IRQn                  = 29,      /*!<  GPIO Port 0 pin 5 Interrupt                      */
+    PORT0_6_IRQn                  = 30,      /*!<  GPIO Port 0 pin 6 Interrupt                      */
+    PORT0_7_IRQn                  = 31,      /*!<  GPIO Port 0 pin 7 Interrupt                      */
+} IRQn_Type;
+
+
+/*
+ * ==========================================================================
+ * ----------- Processor and Core Peripheral Section ------------------------
+ * ==========================================================================
+ */
+
+/* Configuration of the Cortex-M4 Processor and Core Peripherals */
+#define __CM4_REV                 0x0001    /*!< Core Revision r0p1                               */
+#define __NVIC_PRIO_BITS          3         /*!< Number of Bits used for Priority Levels          */
+#define __Vendor_SysTickConfig    0         /*!< Set to 1 if different SysTick Config is used     */
+#define __MPU_PRESENT             1         /*!< MPU present or not                               */
+#define __FPU_PRESENT             1         /*!< FPU present or not                               */
+
+/*@}*/ /* end of group CMSDK_CM4_CMSIS */
+
+
+#include "core_cm4.h"                       /* Cortex-M4 processor and core peripherals           */
+
+
+/******************************************************************************/
+/*                Device Specific Peripheral registers structures             */
+/******************************************************************************/
+/** @addtogroup CMSDK_CM4_Peripherals CMSDK_CM4 Peripherals
+  CMSDK_CM4 Device Specific Peripheral registers structures
+  @{
+*/
+
+#if defined ( __CC_ARM   )
+#pragma anon_unions
+#endif
+
+/*------------- Universal Asynchronous Receiver Transmitter (UART) -----------*/
+/** @addtogroup CMSDK_UART CMSDK Universal Asynchronous Receiver/Transmitter
+  memory mapped structure for CMSDK_UART
+  @{
+*/
+typedef struct {
+    __IO   uint32_t  DATA;          /*!< Offset: 0x000 Data Register    (R/W) */
+    __IO   uint32_t  STATE;         /*!< Offset: 0x004 Status Register  (R/W) */
+    __IO   uint32_t  CTRL;          /*!< Offset: 0x008 Control Register (R/W) */
+    union {                         /* read view and write view of the same word at offset 0x00C */
+        __I    uint32_t  INTSTATUS;   /*!< Offset: 0x00C Interrupt Status Register (R/ ) */
+        __O    uint32_t  INTCLEAR;    /*!< Offset: 0x00C Interrupt Clear Register ( /W) */
+    };
+    __IO   uint32_t  BAUDDIV;       /*!< Offset: 0x010 Baudrate Divider Register (R/W) */
+
+} CMSDK_UART_TypeDef;
+
+/* CMSDK_UART DATA Register Definitions */
+
+#define CMSDK_UART_DATA_Pos               0                                            /*!< CMSDK_UART DATA: DATA Position */
+#define CMSDK_UART_DATA_Msk               (0xFFul << CMSDK_UART_DATA_Pos)              /*!< CMSDK_UART DATA: DATA Mask */
+
+#define CMSDK_UART_STATE_RXOR_Pos         3                                            /*!< CMSDK_UART STATE: RXOR Position */
+#define CMSDK_UART_STATE_RXOR_Msk         (0x1ul << CMSDK_UART_STATE_RXOR_Pos)         /*!< CMSDK_UART STATE: RXOR Mask */
+
+#define CMSDK_UART_STATE_TXOR_Pos         2                                            /*!< CMSDK_UART STATE: TXOR Position */
+#define CMSDK_UART_STATE_TXOR_Msk         (0x1ul << CMSDK_UART_STATE_TXOR_Pos)         /*!< CMSDK_UART STATE: TXOR Mask */
+
+#define CMSDK_UART_STATE_RXBF_Pos         1                                            /*!< CMSDK_UART STATE: RXBF Position */
+#define CMSDK_UART_STATE_RXBF_Msk         (0x1ul << CMSDK_UART_STATE_RXBF_Pos)         /*!< CMSDK_UART STATE: RXBF Mask */
+
+#define CMSDK_UART_STATE_TXBF_Pos         0                                            /*!< CMSDK_UART STATE: TXBF Position */
+#define CMSDK_UART_STATE_TXBF_Msk         (0x1ul << CMSDK_UART_STATE_TXBF_Pos )        /*!< CMSDK_UART STATE: TXBF Mask */
+
+#define CMSDK_UART_CTRL_HSTM_Pos          6                                            /*!< CMSDK_UART CTRL: HSTM Position */
+#define CMSDK_UART_CTRL_HSTM_Msk          (0x01ul << CMSDK_UART_CTRL_HSTM_Pos)         /*!< CMSDK_UART CTRL: HSTM Mask */
+
+#define CMSDK_UART_CTRL_RXORIRQEN_Pos     5                                            /*!< CMSDK_UART CTRL: RXORIRQEN Position */
+#define CMSDK_UART_CTRL_RXORIRQEN_Msk     (0x01ul << CMSDK_UART_CTRL_RXORIRQEN_Pos)    /*!< CMSDK_UART CTRL: RXORIRQEN Mask */
+
+#define CMSDK_UART_CTRL_TXORIRQEN_Pos     4                                            /*!< CMSDK_UART CTRL: TXORIRQEN Position */
+#define CMSDK_UART_CTRL_TXORIRQEN_Msk     (0x01ul << CMSDK_UART_CTRL_TXORIRQEN_Pos)    /*!< CMSDK_UART CTRL: TXORIRQEN Mask */
+
+#define CMSDK_UART_CTRL_RXIRQEN_Pos       3                                            /*!< CMSDK_UART CTRL: RXIRQEN Position */
+#define CMSDK_UART_CTRL_RXIRQEN_Msk       (0x01ul << CMSDK_UART_CTRL_RXIRQEN_Pos)      /*!< CMSDK_UART CTRL: RXIRQEN Mask */
+
+#define CMSDK_UART_CTRL_TXIRQEN_Pos       2                                            /*!< CMSDK_UART CTRL: TXIRQEN Position */
+#define CMSDK_UART_CTRL_TXIRQEN_Msk       (0x01ul << CMSDK_UART_CTRL_TXIRQEN_Pos)      /*!< CMSDK_UART CTRL: TXIRQEN Mask */
+
+#define CMSDK_UART_CTRL_RXEN_Pos          1                                            /*!< CMSDK_UART CTRL: RXEN Position */
+#define CMSDK_UART_CTRL_RXEN_Msk          (0x01ul << CMSDK_UART_CTRL_RXEN_Pos)         /*!< CMSDK_UART CTRL: RXEN Mask */
+
+#define CMSDK_UART_CTRL_TXEN_Pos          0                                            /*!< CMSDK_UART CTRL: TXEN Position */
+#define CMSDK_UART_CTRL_TXEN_Msk          (0x01ul << CMSDK_UART_CTRL_TXEN_Pos)         /*!< CMSDK_UART CTRL: TXEN Mask */
+
+#define CMSDK_UART_INTSTATUS_RXORIRQ_Pos  3                                            /*!< CMSDK_UART INTSTATUS: RXORIRQ Position */
+#define CMSDK_UART_CTRL_RXORIRQ_Msk       (0x01ul << CMSDK_UART_INTSTATUS_RXORIRQ_Pos) /*!< CMSDK_UART INTSTATUS: RXORIRQ Mask (macro name says CTRL) */
+
+#define CMSDK_UART_CTRL_TXORIRQ_Pos       2                                            /*!< CMSDK_UART INTSTATUS: TXORIRQ Position (macro name says CTRL; CTRL bit 2 is TXIRQEN) */
+#define CMSDK_UART_CTRL_TXORIRQ_Msk       (0x01ul << CMSDK_UART_CTRL_TXORIRQ_Pos)      /*!< CMSDK_UART INTSTATUS: TXORIRQ Mask */
+
+#define CMSDK_UART_CTRL_RXIRQ_Pos         1                                            /*!< CMSDK_UART INTSTATUS: RXIRQ Position (macro name says CTRL; CTRL bit 1 is RXEN) */
+#define CMSDK_UART_CTRL_RXIRQ_Msk         (0x01ul << CMSDK_UART_CTRL_RXIRQ_Pos)        /*!< CMSDK_UART INTSTATUS: RXIRQ Mask */
+
+#define CMSDK_UART_CTRL_TXIRQ_Pos         0                                            /*!< CMSDK_UART INTSTATUS: TXIRQ Position (macro name says CTRL; CTRL bit 0 is TXEN) */
+#define CMSDK_UART_CTRL_TXIRQ_Msk         (0x01ul << CMSDK_UART_CTRL_TXIRQ_Pos)        /*!< CMSDK_UART INTSTATUS: TXIRQ Mask */
+
+#define CMSDK_UART_BAUDDIV_Pos            0                                            /*!< CMSDK_UART BAUDDIV: BAUDDIV Position */
+#define CMSDK_UART_BAUDDIV_Msk            (0xFFFFFul << CMSDK_UART_BAUDDIV_Pos)        /*!< CMSDK_UART BAUDDIV: BAUDDIV Mask */
+
+/*@}*/ /* end of group CMSDK_UART */
+
+
+/*----------------------------- Timer (TIMER) -------------------------------*/
+/** @addtogroup CMSDK_TIMER CMSDK Timer
+  @{
+*/
+typedef struct {
+    __IO   uint32_t  CTRL;          /*!< Offset: 0x000 Control Register (R/W) */
+    __IO   uint32_t  VALUE;         /*!< Offset: 0x004 Current Value Register (R/W) */
+    __IO   uint32_t  RELOAD;        /*!< Offset: 0x008 Reload Value Register  (R/W) */
+    union {                         /* read view and write view of the same word at offset 0x00C */
+        __I    uint32_t  INTSTATUS;   /*!< Offset: 0x00C Interrupt Status Register (R/ ) */
+        __O    uint32_t  INTCLEAR;    /*!< Offset: 0x00C Interrupt Clear Register ( /W) */
+    };
+
+} CMSDK_TIMER_TypeDef;
+
+/* CMSDK_TIMER CTRL Register Definitions */
+
+#define CMSDK_TIMER_CTRL_IRQEN_Pos          3                                              /*!< CMSDK_TIMER CTRL: IRQEN Position */
+#define CMSDK_TIMER_CTRL_IRQEN_Msk          (0x01ul << CMSDK_TIMER_CTRL_IRQEN_Pos)         /*!< CMSDK_TIMER CTRL: IRQEN Mask */
+
+#define CMSDK_TIMER_CTRL_SELEXTCLK_Pos      2                                              /*!< CMSDK_TIMER CTRL: SELEXTCLK Position */
+#define CMSDK_TIMER_CTRL_SELEXTCLK_Msk      (0x01ul << CMSDK_TIMER_CTRL_SELEXTCLK_Pos)     /*!< CMSDK_TIMER CTRL: SELEXTCLK Mask */
+
+#define CMSDK_TIMER_CTRL_SELEXTEN_Pos       1                                              /*!< CMSDK_TIMER CTRL: SELEXTEN Position */
+#define CMSDK_TIMER_CTRL_SELEXTEN_Msk       (0x01ul << CMSDK_TIMER_CTRL_SELEXTEN_Pos)      /*!< CMSDK_TIMER CTRL: SELEXTEN Mask */
+
+#define CMSDK_TIMER_CTRL_EN_Pos             0                                              /*!< CMSDK_TIMER CTRL: EN Position */
+#define CMSDK_TIMER_CTRL_EN_Msk             (0x01ul << CMSDK_TIMER_CTRL_EN_Pos)            /*!< CMSDK_TIMER CTRL: EN Mask */
+
+#define CMSDK_TIMER_VAL_CURRENT_Pos         0                                              /*!< CMSDK_TIMER VALUE: CURRENT Position */
+#define CMSDK_TIMER_VAL_CURRENT_Msk         (0xFFFFFFFFul << CMSDK_TIMER_VAL_CURRENT_Pos)  /*!< CMSDK_TIMER VALUE: CURRENT Mask */
+
+#define CMSDK_TIMER_RELOAD_VAL_Pos          0                                              /*!< CMSDK_TIMER RELOAD: RELOAD Position */
+#define CMSDK_TIMER_RELOAD_VAL_Msk          (0xFFFFFFFFul << CMSDK_TIMER_RELOAD_VAL_Pos)   /*!< CMSDK_TIMER RELOAD: RELOAD Mask */
+
+#define CMSDK_TIMER_INTSTATUS_Pos           0                                              /*!< CMSDK_TIMER INTSTATUS: INTSTATUS Position */
+#define CMSDK_TIMER_INTSTATUS_Msk           (0x01ul << CMSDK_TIMER_INTSTATUS_Pos)          /*!< CMSDK_TIMER INTSTATUS: INTSTATUS Mask */
+
+#define CMSDK_TIMER_INTCLEAR_Pos            0                                              /*!< CMSDK_TIMER INTCLEAR: INTCLEAR Position */
+#define CMSDK_TIMER_INTCLEAR_Msk            (0x01ul << CMSDK_TIMER_INTCLEAR_Pos)           /*!< CMSDK_TIMER INTCLEAR: INTCLEAR Mask */
+
+/*@}*/ /* end of group CMSDK_TIMER */
+
+
+/*------------- Timer (TIM) --------------------------------------------------*/
+// <g> Timer (TIM)
+
+/** @addtogroup CMSDK_DualTIMER CMSDK Dual Timer
+  @{
+*/
+
+typedef struct {
+    __IO uint32_t Timer1Load;                  /* Offset: 0x000 (R/W) Timer 1 Load */
+    __I  uint32_t Timer1Value;                 /* Offset: 0x004 (R/ ) Timer 1 Counter Current Value */
+    __IO uint32_t Timer1Control;               /* Offset: 0x008 (R/W) Timer 1 Control */
+    /* <o.7> TimerEn: Timer Enable         */
+    /* <o.6> TimerMode: Timer Mode         */
+    /*   <0=> Freerunning-mode             */
+    /*   <1=> Periodic mode                */
+    /* <o.5> IntEnable: Interrupt Enable   */
+    /* <o.2..3> TimerPre: Timer Prescale   */
+    /*   <0=> / 1                          */
+    /*   <1=> / 16                         */
+    /*   <2=> / 256                        */
+    /*   <3=> Undefined!                   */
+    /* <o.1> TimerSize: Timer Size         */
+    /*   <0=> 16-bit counter               */
+    /*   <1=> 32-bit counter               */
+    /* <o.0> OneShot: One-shot mode        */
+    /*   <0=> Wrapping mode                */
+    /*   <1=> One-shot mode                */
+    /* </h>                                */
+    __O  uint32_t Timer1IntClr;                /* Offset: 0x00C ( /W) Timer 1 Interrupt Clear */
+    __I  uint32_t Timer1RIS;                   /* Offset: 0x010 (R/ ) Timer 1 Raw Interrupt Status */
+    __I  uint32_t Timer1MIS;                   /* Offset: 0x014 (R/ ) Timer 1 Masked Interrupt Status */
+    __IO uint32_t Timer1BGLoad;                /* Offset: 0x018 (R/W) Background Load Register */
+    uint32_t RESERVED0;                        /* Offset: 0x01C padding so that Timer 2 starts at 0x020 */
+    __IO uint32_t Timer2Load;                  /* Offset: 0x020 (R/W) Timer 2 Load */
+    __I  uint32_t Timer2Value;                 /* Offset: 0x024 (R/ ) Timer 2 Counter Current Value */
+    __IO uint32_t Timer2Control;               /* Offset: 0x028 (R/W) Timer 2 Control */
+    /* <o.7> TimerEn: Timer Enable         */
+    /* <o.6> TimerMode: Timer Mode         */
+    /*   <0=> Freerunning-mode             */
+    /*   <1=> Periodic mode                */
+    /* <o.5> IntEnable: Interrupt Enable   */
+    /* <o.2..3> TimerPre: Timer Prescale   */
+    /*   <0=> / 1                          */
+    /*   <1=> / 16                         */
+    /*   <2=> / 256                        */
+    /*   <3=> Undefined!                   */
+    /* <o.1> TimerSize: Timer Size         */
+    /*   <0=> 16-bit counter               */
+    /*   <1=> 32-bit counter               */
+    /* <o.0> OneShot: One-shot mode        */
+    /*   <0=> Wrapping mode                */
+    /*   <1=> One-shot mode                */
+    /* </h>                                */
+    __O  uint32_t Timer2IntClr;                /* Offset: 0x02C ( /W) Timer 2 Interrupt Clear */
+    __I  uint32_t Timer2RIS;                   /* Offset: 0x030 (R/ ) Timer 2 Raw Interrupt Status */
+    __I  uint32_t Timer2MIS;                   /* Offset: 0x034 (R/ ) Timer 2 Masked Interrupt Status */
+    __IO uint32_t Timer2BGLoad;                /* Offset: 0x038 (R/W) Background Load Register */
+    uint32_t RESERVED1[945];                   /* Offset: 0x03C - 0xEFC padding (945 words) so that ITCR lands at 0xF00 */
+    __IO uint32_t ITCR;                        /* Offset: 0xF00 (R/W) Integration Test Control Register */
+    __O  uint32_t ITOP;                        /* Offset: 0xF04 ( /W) Integration Test Output Set Register */
+} CMSDK_DUALTIMER_BOTH_TypeDef;
+
+#define CMSDK_DUALTIMER1_LOAD_Pos            0                                               /*!< CMSDK_DUALTIMER1 LOAD: LOAD Position */
+#define CMSDK_DUALTIMER1_LOAD_Msk            (0xFFFFFFFFul << CMSDK_DUALTIMER1_LOAD_Pos)     /*!< CMSDK_DUALTIMER1 LOAD: LOAD Mask */
+
+#define CMSDK_DUALTIMER1_VALUE_Pos           0                                               /*!< CMSDK_DUALTIMER1 VALUE: VALUE Position */
+#define CMSDK_DUALTIMER1_VALUE_Msk           (0xFFFFFFFFul << CMSDK_DUALTIMER1_VALUE_Pos)    /*!< CMSDK_DUALTIMER1 VALUE: VALUE Mask */
+
+#define CMSDK_DUALTIMER1_CTRL_EN_Pos         7                                               /*!< CMSDK_DUALTIMER1 CTRL_EN: CTRL Enable Position */
+#define CMSDK_DUALTIMER1_CTRL_EN_Msk         (0x1ul << CMSDK_DUALTIMER1_CTRL_EN_Pos)         /*!< CMSDK_DUALTIMER1 CTRL_EN: CTRL Enable Mask */
+
+#define CMSDK_DUALTIMER1_CTRL_MODE_Pos       6                                               /*!< CMSDK_DUALTIMER1 CTRL_MODE: CTRL MODE Position */
+#define CMSDK_DUALTIMER1_CTRL_MODE_Msk       (0x1ul << CMSDK_DUALTIMER1_CTRL_MODE_Pos)       /*!< CMSDK_DUALTIMER1 CTRL_MODE: CTRL MODE Mask */
+
+#define CMSDK_DUALTIMER1_CTRL_INTEN_Pos      5                                               /*!< CMSDK_DUALTIMER1 CTRL_INTEN: CTRL Int Enable Position */
+#define CMSDK_DUALTIMER1_CTRL_INTEN_Msk      (0x1ul << CMSDK_DUALTIMER1_CTRL_INTEN_Pos)      /*!< CMSDK_DUALTIMER1 CTRL_INTEN: CTRL Int Enable Mask */
+
+#define CMSDK_DUALTIMER1_CTRL_PRESCALE_Pos   2                                               /*!< CMSDK_DUALTIMER1 CTRL_PRESCALE: CTRL PRESCALE Position */
+#define CMSDK_DUALTIMER1_CTRL_PRESCALE_Msk   (0x3ul << CMSDK_DUALTIMER1_CTRL_PRESCALE_Pos)   /*!< CMSDK_DUALTIMER1 CTRL_PRESCALE: CTRL PRESCALE Mask */
+
+#define CMSDK_DUALTIMER1_CTRL_SIZE_Pos       1                                               /*!< CMSDK_DUALTIMER1 CTRL_SIZE: CTRL SIZE Position */
+#define CMSDK_DUALTIMER1_CTRL_SIZE_Msk       (0x1ul << CMSDK_DUALTIMER1_CTRL_SIZE_Pos)       /*!< CMSDK_DUALTIMER1 CTRL_SIZE: CTRL SIZE Mask */
+
+#define CMSDK_DUALTIMER1_CTRL_ONESHOOT_Pos   0                                               /*!< CMSDK_DUALTIMER1 CTRL_ONESHOOT: CTRL ONESHOOT Position */
+#define CMSDK_DUALTIMER1_CTRL_ONESHOOT_Msk   (0x1ul << CMSDK_DUALTIMER1_CTRL_ONESHOOT_Pos)   /*!< CMSDK_DUALTIMER1 CTRL_ONESHOOT: CTRL ONESHOOT Mask */
+
+#define CMSDK_DUALTIMER1_INTCLR_Pos          0                                               /*!< CMSDK_DUALTIMER1 INTCLR: INT Clear Position */
+#define CMSDK_DUALTIMER1_INTCLR_Msk          (0x1ul << CMSDK_DUALTIMER1_INTCLR_Pos)          /*!< CMSDK_DUALTIMER1 INTCLR: INT Clear  Mask */
+
+#define CMSDK_DUALTIMER1_RAWINTSTAT_Pos      0                                               /*!< CMSDK_DUALTIMER1 RAWINTSTAT: Raw Int Status Position */
+#define CMSDK_DUALTIMER1_RAWINTSTAT_Msk      (0x1ul << CMSDK_DUALTIMER1_RAWINTSTAT_Pos)      /*!< CMSDK_DUALTIMER1 RAWINTSTAT: Raw Int Status Mask */
+
+#define CMSDK_DUALTIMER1_MASKINTSTAT_Pos     0                                               /*!< CMSDK_DUALTIMER1 MASKINTSTAT: Mask Int Status Position */
+#define CMSDK_DUALTIMER1_MASKINTSTAT_Msk     (0x1ul << CMSDK_DUALTIMER1_MASKINTSTAT_Pos)     /*!< CMSDK_DUALTIMER1 MASKINTSTAT: Mask Int Status Mask */
+
+#define CMSDK_DUALTIMER1_BGLOAD_Pos          0                                               /*!< CMSDK_DUALTIMER1 BGLOAD: Background Load Position */
+#define CMSDK_DUALTIMER1_BGLOAD_Msk          (0xFFFFFFFFul << CMSDK_DUALTIMER1_BGLOAD_Pos)   /*!< CMSDK_DUALTIMER1 BGLOAD: Background Load Mask */
+
+#define CMSDK_DUALTIMER2_LOAD_Pos            0                                               /*!< CMSDK_DUALTIMER2 LOAD: LOAD Position */
+#define CMSDK_DUALTIMER2_LOAD_Msk            (0xFFFFFFFFul << CMSDK_DUALTIMER2_LOAD_Pos)     /*!< CMSDK_DUALTIMER2 LOAD: LOAD Mask */
+
+#define CMSDK_DUALTIMER2_VALUE_Pos           0                                               /*!< CMSDK_DUALTIMER2 VALUE: VALUE Position */
+#define CMSDK_DUALTIMER2_VALUE_Msk           (0xFFFFFFFFul << CMSDK_DUALTIMER2_VALUE_Pos)    /*!< CMSDK_DUALTIMER2 VALUE: VALUE Mask */
+
+#define CMSDK_DUALTIMER2_CTRL_EN_Pos         7                                               /*!< CMSDK_DUALTIMER2 CTRL_EN: CTRL Enable Position */
+#define CMSDK_DUALTIMER2_CTRL_EN_Msk         (0x1ul << CMSDK_DUALTIMER2_CTRL_EN_Pos)         /*!< CMSDK_DUALTIMER2 CTRL_EN: CTRL Enable Mask */
+
+#define CMSDK_DUALTIMER2_CTRL_MODE_Pos       6                                               /*!< CMSDK_DUALTIMER2 CTRL_MODE: CTRL MODE Position */
+#define CMSDK_DUALTIMER2_CTRL_MODE_Msk       (0x1ul << CMSDK_DUALTIMER2_CTRL_MODE_Pos)       /*!< CMSDK_DUALTIMER2 CTRL_MODE: CTRL MODE Mask */
+
+#define CMSDK_DUALTIMER2_CTRL_INTEN_Pos      5                                               /*!< CMSDK_DUALTIMER2 CTRL_INTEN: CTRL Int Enable Position */
+#define CMSDK_DUALTIMER2_CTRL_INTEN_Msk      (0x1ul << CMSDK_DUALTIMER2_CTRL_INTEN_Pos)      /*!< CMSDK_DUALTIMER2 CTRL_INTEN: CTRL Int Enable Mask */
+
+#define CMSDK_DUALTIMER2_CTRL_PRESCALE_Pos   2                                               /*!< CMSDK_DUALTIMER2 CTRL_PRESCALE: CTRL PRESCALE Position */
+#define CMSDK_DUALTIMER2_CTRL_PRESCALE_Msk   (0x3ul << CMSDK_DUALTIMER2_CTRL_PRESCALE_Pos)   /*!< CMSDK_DUALTIMER2 CTRL_PRESCALE: CTRL PRESCALE Mask */
+
+#define CMSDK_DUALTIMER2_CTRL_SIZE_Pos       1                                               /*!< CMSDK_DUALTIMER2 CTRL_SIZE: CTRL SIZE Position */
+#define CMSDK_DUALTIMER2_CTRL_SIZE_Msk       (0x1ul << CMSDK_DUALTIMER2_CTRL_SIZE_Pos)       /*!< CMSDK_DUALTIMER2 CTRL_SIZE: CTRL SIZE Mask */
+
+#define CMSDK_DUALTIMER2_CTRL_ONESHOOT_Pos   0                                               /*!< CMSDK_DUALTIMER2 CTRL_ONESHOOT: CTRL ONESHOOT Position */
+#define CMSDK_DUALTIMER2_CTRL_ONESHOOT_Msk   (0x1ul << CMSDK_DUALTIMER2_CTRL_ONESHOOT_Pos)   /*!< CMSDK_DUALTIMER2 CTRL_ONESHOOT: CTRL ONESHOOT Mask */
+
+#define CMSDK_DUALTIMER2_INTCLR_Pos          0                                               /*!< CMSDK_DUALTIMER2 INTCLR: INT Clear Position */
+#define CMSDK_DUALTIMER2_INTCLR_Msk          (0x1ul << CMSDK_DUALTIMER2_INTCLR_Pos)          /*!< CMSDK_DUALTIMER2 INTCLR: INT Clear  Mask */
+
+#define CMSDK_DUALTIMER2_RAWINTSTAT_Pos      0                                               /*!< CMSDK_DUALTIMER2 RAWINTSTAT: Raw Int Status Position */
+#define CMSDK_DUALTIMER2_RAWINTSTAT_Msk      (0x1ul << CMSDK_DUALTIMER2_RAWINTSTAT_Pos)      /*!< CMSDK_DUALTIMER2 RAWINTSTAT: Raw Int Status Mask */
+
+#define CMSDK_DUALTIMER2_MASKINTSTAT_Pos     0                                               /*!< CMSDK_DUALTIMER2 MASKINTSTAT: Mask Int Status Position */
+#define CMSDK_DUALTIMER2_MASKINTSTAT_Msk     (0x1ul << CMSDK_DUALTIMER2_MASKINTSTAT_Pos)     /*!< CMSDK_DUALTIMER2 MASKINTSTAT: Mask Int Status Mask */
+
+#define CMSDK_DUALTIMER2_BGLOAD_Pos          0                                               /*!< CMSDK_DUALTIMER2 BGLOAD: Background Load Position */
+#define CMSDK_DUALTIMER2_BGLOAD_Msk          (0xFFFFFFFFul << CMSDK_DUALTIMER2_BGLOAD_Pos)   /*!< CMSDK_DUALTIMER2 BGLOAD: Background Load Mask */
+
+typedef struct {
+    __IO uint32_t TimerLoad;                   /* Offset: 0x000 (R/W) Timer Load */
+    __I  uint32_t TimerValue;                  /* Offset: 0x004 (R/ ) Timer Counter Current Value */
+    __IO uint32_t TimerControl;                /* Offset: 0x008 (R/W) Timer Control */
+    /* <o.7> TimerEn: Timer Enable         */
+    /* <o.6> TimerMode: Timer Mode         */
+    /*   <0=> Freerunning-mode             */
+    /*   <1=> Periodic mode                */
+    /* <o.5> IntEnable: Interrupt Enable   */
+    /* <o.2..3> TimerPre: Timer Prescale   */
+    /*   <0=> / 1                          */
+    /*   <1=> / 16                         */
+    /*   <2=> / 256                        */
+    /*   <3=> Undefined!                   */
+    /* <o.1> TimerSize: Timer Size         */
+    /*   <0=> 16-bit counter               */
+    /*   <1=> 32-bit counter               */
+    /* <o.0> OneShot: One-shot mode        */
+    /*   <0=> Wrapping mode                */
+    /*   <1=> One-shot mode                */
+    /* </h>                                */
+    __O  uint32_t TimerIntClr;                 /* Offset: 0x00C ( /W) Timer Interrupt Clear */
+    __I  uint32_t TimerRIS;                    /* Offset: 0x010 (R/ ) Timer Raw Interrupt Status */
+    __I  uint32_t TimerMIS;                    /* Offset: 0x014 (R/ ) Timer Masked Interrupt Status */
+    __IO uint32_t TimerBGLoad;                 /* Offset: 0x018 (R/W) Background Load Register */
+} CMSDK_DUALTIMER_SINGLE_TypeDef;
+
+#define CMSDK_DUALTIMER_LOAD_Pos             0                                               /*!< CMSDK_DUALTIMER LOAD: LOAD Position */
+#define CMSDK_DUALTIMER_LOAD_Msk             (0xFFFFFFFFul << CMSDK_DUALTIMER_LOAD_Pos)      /*!< CMSDK_DUALTIMER LOAD: LOAD Mask */
+
+#define CMSDK_DUALTIMER_VALUE_Pos            0                                               /*!< CMSDK_DUALTIMER VALUE: VALUE Position */
+#define CMSDK_DUALTIMER_VALUE_Msk            (0xFFFFFFFFul << CMSDK_DUALTIMER_VALUE_Pos)     /*!< CMSDK_DUALTIMER VALUE: VALUE Mask */
+
+#define CMSDK_DUALTIMER_CTRL_EN_Pos          7                                               /*!< CMSDK_DUALTIMER CTRL_EN: CTRL Enable Position */
+#define CMSDK_DUALTIMER_CTRL_EN_Msk          (0x1ul << CMSDK_DUALTIMER_CTRL_EN_Pos)          /*!< CMSDK_DUALTIMER CTRL_EN: CTRL Enable Mask */
+
+#define CMSDK_DUALTIMER_CTRL_MODE_Pos        6                                               /*!< CMSDK_DUALTIMER CTRL_MODE: CTRL MODE Position */
+#define CMSDK_DUALTIMER_CTRL_MODE_Msk        (0x1ul << CMSDK_DUALTIMER_CTRL_MODE_Pos)        /*!< CMSDK_DUALTIMER CTRL_MODE: CTRL MODE Mask */
+
+#define CMSDK_DUALTIMER_CTRL_INTEN_Pos       5                                               /*!< CMSDK_DUALTIMER CTRL_INTEN: CTRL Int Enable Position */
+#define CMSDK_DUALTIMER_CTRL_INTEN_Msk       (0x1ul << CMSDK_DUALTIMER_CTRL_INTEN_Pos)       /*!< CMSDK_DUALTIMER CTRL_INTEN: CTRL Int Enable Mask */
+
+#define CMSDK_DUALTIMER_CTRL_PRESCALE_Pos    2                                               /*!< CMSDK_DUALTIMER CTRL_PRESCALE: CTRL PRESCALE Position */
+#define CMSDK_DUALTIMER_CTRL_PRESCALE_Msk    (0x3ul << CMSDK_DUALTIMER_CTRL_PRESCALE_Pos)    /*!< CMSDK_DUALTIMER CTRL_PRESCALE: CTRL PRESCALE Mask */
+
+#define CMSDK_DUALTIMER_CTRL_SIZE_Pos        1                                               /*!< CMSDK_DUALTIMER CTRL_SIZE: CTRL SIZE Position */
+#define CMSDK_DUALTIMER_CTRL_SIZE_Msk        (0x1ul << CMSDK_DUALTIMER_CTRL_SIZE_Pos)        /*!< CMSDK_DUALTIMER CTRL_SIZE: CTRL SIZE Mask */
+
+#define CMSDK_DUALTIMER_CTRL_ONESHOOT_Pos    0                                               /*!< CMSDK_DUALTIMER CTRL_ONESHOOT: CTRL ONESHOOT Position */
+#define CMSDK_DUALTIMER_CTRL_ONESHOOT_Msk    (0x1ul << CMSDK_DUALTIMER_CTRL_ONESHOOT_Pos)    /*!< CMSDK_DUALTIMER CTRL_ONESHOOT: CTRL ONESHOOT Mask */
+
+#define CMSDK_DUALTIMER_INTCLR_Pos           0                                               /*!< CMSDK_DUALTIMER INTCLR: INT Clear Position */
+#define CMSDK_DUALTIMER_INTCLR_Msk           (0x1ul << CMSDK_DUALTIMER_INTCLR_Pos)           /*!< CMSDK_DUALTIMER INTCLR: INT Clear  Mask */
+
+#define CMSDK_DUALTIMER_RAWINTSTAT_Pos       0                                               /*!< CMSDK_DUALTIMER RAWINTSTAT: Raw Int Status Position */
+#define CMSDK_DUALTIMER_RAWINTSTAT_Msk       (0x1ul << CMSDK_DUALTIMER_RAWINTSTAT_Pos)       /*!< CMSDK_DUALTIMER RAWINTSTAT: Raw Int Status Mask */
+
+#define CMSDK_DUALTIMER_MASKINTSTAT_Pos      0                                               /*!< CMSDK_DUALTIMER MASKINTSTAT: Mask Int Status Position */
+#define CMSDK_DUALTIMER_MASKINTSTAT_Msk      (0x1ul << CMSDK_DUALTIMER_MASKINTSTAT_Pos)      /*!< CMSDK_DUALTIMER MASKINTSTAT: Mask Int Status Mask */
+
+#define CMSDK_DUALTIMER_BGLOAD_Pos           0                                               /*!< CMSDK_DUALTIMER BGLOAD: Background Load Position */
+#define CMSDK_DUALTIMER_BGLOAD_Msk           (0xFFFFFFFFul << CMSDK_DUALTIMER_BGLOAD_Pos)    /*!< CMSDK_DUALTIMER BGLOAD: Background Load Mask */
+
+/*@}*/ /* end of group CMSDK_DualTIMER */
+
+
+/*-------------------- General Purpose Input Output (GPIO) -------------------*/
+/** @addtogroup CMSDK_GPIO CMSDK GPIO
+  @{
+*/
+typedef struct {
+    __IO   uint32_t  DATA;                     /* Offset: 0x000 (R/W) DATA Register */
+    __IO   uint32_t  DATAOUT;                  /* Offset: 0x004 (R/W) Data Output Latch Register */
+    uint32_t  RESERVED0[2];                    /* Offset: 0x008 - 0x00C padding so that OUTENABLESET lands at 0x010 */
+    __IO   uint32_t  OUTENABLESET;             /* Offset: 0x010 (R/W) Output Enable Set Register */
+    __IO   uint32_t  OUTENABLECLR;             /* Offset: 0x014 (R/W) Output Enable Clear Register */
+    __IO   uint32_t  ALTFUNCSET;               /* Offset: 0x018 (R/W) Alternate Function Set Register */
+    __IO   uint32_t  ALTFUNCCLR;               /* Offset: 0x01C (R/W) Alternate Function Clear Register */
+    __IO   uint32_t  INTENSET;                 /* Offset: 0x020 (R/W) Interrupt Enable Set Register */
+    __IO   uint32_t  INTENCLR;                 /* Offset: 0x024 (R/W) Interrupt Enable Clear Register */
+    __IO   uint32_t  INTTYPESET;               /* Offset: 0x028 (R/W) Interrupt Type Set Register */
+    __IO   uint32_t  INTTYPECLR;               /* Offset: 0x02C (R/W) Interrupt Type Clear Register */
+    __IO   uint32_t  INTPOLSET;                /* Offset: 0x030 (R/W) Interrupt Polarity Set Register */
+    __IO   uint32_t  INTPOLCLR;                /* Offset: 0x034 (R/W) Interrupt Polarity Clear Register */
+    union {                                    /* read view and write view of the same word at offset 0x038 */
+        __I    uint32_t  INTSTATUS;              /* Offset: 0x038 (R/ ) Interrupt Status Register */
+        __O    uint32_t  INTCLEAR;               /* Offset: 0x038 ( /W) Interrupt Clear Register */
+    };
+    uint32_t RESERVED1[241];                   /* Offset: 0x03C - 0x3FC padding (241 words) so that LB_MASKED lands at 0x400 */
+    __IO   uint32_t LB_MASKED[256];            /* Offset: 0x400 - 0x7FC Lower byte Masked Access Register (R/W) */
+    __IO   uint32_t UB_MASKED[256];            /* Offset: 0x800 - 0xBFC Upper byte Masked Access Register (R/W) */
+} CMSDK_GPIO_TypeDef;
+
+#define CMSDK_GPIO_DATA_Pos            0                                          /*!< CMSDK_GPIO DATA: DATA Position */
+#define CMSDK_GPIO_DATA_Msk            (0xFFFFul << CMSDK_GPIO_DATA_Pos)          /*!< CMSDK_GPIO DATA: DATA Mask */
+
+#define CMSDK_GPIO_DATAOUT_Pos         0                                          /*!< CMSDK_GPIO DATAOUT: DATAOUT Position */
+#define CMSDK_GPIO_DATAOUT_Msk         (0xFFFFul << CMSDK_GPIO_DATAOUT_Pos)       /*!< CMSDK_GPIO DATAOUT: DATAOUT Mask */
+
+#define CMSDK_GPIO_OUTENSET_Pos        0                                          /*!< CMSDK_GPIO OUTENABLESET: OUTENSET Position */
+#define CMSDK_GPIO_OUTENSET_Msk        (0xFFFFul << CMSDK_GPIO_OUTENSET_Pos)      /*!< CMSDK_GPIO OUTENABLESET: OUTENSET Mask (uses its own _Pos; CMSDK_GPIO_OUTEN_Pos is not defined) */
+
+#define CMSDK_GPIO_OUTENCLR_Pos        0                                          /*!< CMSDK_GPIO OUTENABLECLR: OUTENCLR Position */
+#define CMSDK_GPIO_OUTENCLR_Msk        (0xFFFFul << CMSDK_GPIO_OUTENCLR_Pos)      /*!< CMSDK_GPIO OUTENABLECLR: OUTENCLR Mask */
+
+#define CMSDK_GPIO_ALTFUNCSET_Pos      0                                          /*!< CMSDK_GPIO ALTFUNCSET: ALTFUNCSET Position */
+#define CMSDK_GPIO_ALTFUNCSET_Msk      (0xFFFFul << CMSDK_GPIO_ALTFUNCSET_Pos)    /*!< CMSDK_GPIO ALTFUNCSET: ALTFUNCSET Mask */
+
+#define CMSDK_GPIO_ALTFUNCCLR_Pos      0                                          /*!< CMSDK_GPIO ALTFUNCCLR: ALTFUNCCLR Position */
+#define CMSDK_GPIO_ALTFUNCCLR_Msk      (0xFFFFul << CMSDK_GPIO_ALTFUNCCLR_Pos)    /*!< CMSDK_GPIO ALTFUNCCLR: ALTFUNCCLR Mask */
+
+#define CMSDK_GPIO_INTENSET_Pos        0                                          /*!< CMSDK_GPIO INTENSET: INTENSET Position */
+#define CMSDK_GPIO_INTENSET_Msk        (0xFFFFul << CMSDK_GPIO_INTENSET_Pos)      /*!< CMSDK_GPIO INTENSET: INTENSET Mask */
+
+#define CMSDK_GPIO_INTENCLR_Pos        0                                          /*!< CMSDK_GPIO INTENCLR: INTENCLR Position */
+#define CMSDK_GPIO_INTENCLR_Msk        (0xFFFFul << CMSDK_GPIO_INTENCLR_Pos)      /*!< CMSDK_GPIO INTENCLR: INTENCLR Mask */
+
+#define CMSDK_GPIO_INTTYPESET_Pos      0                                          /*!< CMSDK_GPIO INTTYPESET: INTTYPESET Position */
+#define CMSDK_GPIO_INTTYPESET_Msk      (0xFFFFul << CMSDK_GPIO_INTTYPESET_Pos)    /*!< CMSDK_GPIO INTTYPESET: INTTYPESET Mask */
+
+#define CMSDK_GPIO_INTTYPECLR_Pos      0                                          /*!< CMSDK_GPIO INTTYPECLR: INTTYPECLR Position */
+#define CMSDK_GPIO_INTTYPECLR_Msk      (0xFFFFul << CMSDK_GPIO_INTTYPECLR_Pos)    /*!< CMSDK_GPIO INTTYPECLR: INTTYPECLR Mask */
+
+#define CMSDK_GPIO_INTPOLSET_Pos       0                                          /*!< CMSDK_GPIO INTPOLSET: INTPOLSET Position */
+#define CMSDK_GPIO_INTPOLSET_Msk       (0xFFFFul << CMSDK_GPIO_INTPOLSET_Pos)     /*!< CMSDK_GPIO INTPOLSET: INTPOLSET Mask */
+
+#define CMSDK_GPIO_INTPOLCLR_Pos       0                                          /*!< CMSDK_GPIO INTPOLCLR: INTPOLCLR Position */
+#define CMSDK_GPIO_INTPOLCLR_Msk       (0xFFFFul << CMSDK_GPIO_INTPOLCLR_Pos)     /*!< CMSDK_GPIO INTPOLCLR: INTPOLCLR Mask */
+
+#define CMSDK_GPIO_INTSTATUS_Pos       0                                          /*!< CMSDK_GPIO INTSTATUS: INTSTATUS Position */
+#define CMSDK_GPIO_INTSTATUS_Msk       (0xFFFFul << CMSDK_GPIO_INTSTATUS_Pos)     /*!< CMSDK_GPIO INTSTATUS: INTSTATUS Mask (one bit per pin, 16 pins like DATA) */
+
+#define CMSDK_GPIO_INTCLEAR_Pos        0                                          /*!< CMSDK_GPIO INTCLEAR: INTCLEAR Position */
+#define CMSDK_GPIO_INTCLEAR_Msk        (0xFFFFul << CMSDK_GPIO_INTCLEAR_Pos)      /*!< CMSDK_GPIO INTCLEAR: INTCLEAR Mask (one bit per pin, 16 pins like DATA) */
+
+#define CMSDK_GPIO_MASKLOWBYTE_Pos     0                                          /*!< CMSDK_GPIO MASKLOWBYTE: MASKLOWBYTE Position */
+#define CMSDK_GPIO_MASKLOWBYTE_Msk     (0x00FFul << CMSDK_GPIO_MASKLOWBYTE_Pos)   /*!< CMSDK_GPIO MASKLOWBYTE: MASKLOWBYTE Mask */
+
+#define CMSDK_GPIO_MASKHIGHBYTE_Pos    0                                          /*!< CMSDK_GPIO MASKHIGHBYTE: MASKHIGHBYTE Position (the mask constant below already selects bits 15:8) */
+#define CMSDK_GPIO_MASKHIGHBYTE_Msk    (0xFF00ul << CMSDK_GPIO_MASKHIGHBYTE_Pos)  /*!< CMSDK_GPIO MASKHIGHBYTE: MASKHIGHBYTE Mask */
+
+/*@}*/ /* end of group CMSDK_GPIO */
+
+
+/*------------- System Control (SYSCON) --------------------------------------*/
+/** @addtogroup CMSDK_SYSCON CMSDK System Control
+  @{
+*/
+typedef struct {                               /* CMSDK system controller register map; offsets are relative to the peripheral base */
+    __IO   uint32_t  REMAP;                    /* Offset: 0x000 (R/W) Remap Control Register */
+    __IO   uint32_t  PMUCTRL;                  /* Offset: 0x004 (R/W) PMU Control Register */
+    __IO   uint32_t  RESETOP;                  /* Offset: 0x008 (R/W) Reset Option Register */
+    __IO   uint32_t  EMICTRL;                  /* Offset: 0x00C (R/W) EMI Control Register */
+    __IO   uint32_t  RSTINFO;                  /* Offset: 0x010 (R/W) Reset Information Register (SYSRESETREQ / WDOGRESETREQ / LOCKUPRESET flags) */
+} CMSDK_SYSCON_TypeDef;
+
+#define CMSDK_SYSCON_REMAP_Pos                 0                                                /*!< CMSDK_SYSCON REMAP: REMAP Position */
+#define CMSDK_SYSCON_REMAP_Msk                 (0x1ul << CMSDK_SYSCON_REMAP_Pos)                /*!< CMSDK_SYSCON REMAP: REMAP Mask */
+
+#define CMSDK_SYSCON_PMUCTRL_EN_Pos            0                                                /*!< CMSDK_SYSCON PMUCTRL: PMUCTRL ENABLE Position */
+#define CMSDK_SYSCON_PMUCTRL_EN_Msk            (0x1ul << CMSDK_SYSCON_PMUCTRL_EN_Pos)           /*!< CMSDK_SYSCON PMUCTRL: PMUCTRL ENABLE Mask */
+
+#define CMSDK_SYSCON_LOCKUPRST_RESETOP_Pos     0                                                /*!< CMSDK_SYSCON RESETOP: LOCKUP RESET ENABLE Position */
+#define CMSDK_SYSCON_LOCKUPRST_RESETOP_Msk     (0x1ul << CMSDK_SYSCON_LOCKUPRST_RESETOP_Pos)    /*!< CMSDK_SYSCON RESETOP: LOCKUP RESET ENABLE Mask */
+
+#define CMSDK_SYSCON_EMICTRL_SIZE_Pos          24                                               /*!< CMSDK_SYSCON EMICTRL: SIZE Position */
+#define CMSDK_SYSCON_EMICTRL_SIZE_Msk          (0x1ul << CMSDK_SYSCON_EMICTRL_SIZE_Pos)         /*!< CMSDK_SYSCON EMICTRL: SIZE Mask */
+
+#define CMSDK_SYSCON_EMICTRL_TACYC_Pos         16                                               /*!< CMSDK_SYSCON EMICTRL: TURNAROUNDCYCLE Position */
+#define CMSDK_SYSCON_EMICTRL_TACYC_Msk         (0x7ul << CMSDK_SYSCON_EMICTRL_TACYC_Pos)        /*!< CMSDK_SYSCON EMICTRL: TURNAROUNDCYCLE Mask */
+
+#define CMSDK_SYSCON_EMICTRL_WCYC_Pos          8                                                /*!< CMSDK_SYSCON EMICTRL: WRITECYCLE Position */
+#define CMSDK_SYSCON_EMICTRL_WCYC_Msk          (0x3ul << CMSDK_SYSCON_EMICTRL_WCYC_Pos)         /*!< CMSDK_SYSCON EMICTRL: WRITECYCLE Mask */
+
+#define CMSDK_SYSCON_EMICTRL_RCYC_Pos          0                                                /*!< CMSDK_SYSCON EMICTRL: READCYCLE Position */
+#define CMSDK_SYSCON_EMICTRL_RCYC_Msk          (0x7ul << CMSDK_SYSCON_EMICTRL_RCYC_Pos)         /*!< CMSDK_SYSCON EMICTRL: READCYCLE Mask */
+
+#define CMSDK_SYSCON_RSTINFO_SYSRESETREQ_Pos   0                                                /*!< CMSDK_SYSCON RSTINFO: SYSRESETREQ Position */
+#define CMSDK_SYSCON_RSTINFO_SYSRESETREQ_Msk   (0x1ul << CMSDK_SYSCON_RSTINFO_SYSRESETREQ_Pos)  /*!< CMSDK_SYSCON RSTINFO: SYSRESETREQ Mask */
+
+#define CMSDK_SYSCON_RSTINFO_WDOGRESETREQ_Pos  1                                                /*!< CMSDK_SYSCON RSTINFO: WDOGRESETREQ Position */
+#define CMSDK_SYSCON_RSTINFO_WDOGRESETREQ_Msk  (0x1ul << CMSDK_SYSCON_RSTINFO_WDOGRESETREQ_Pos) /*!< CMSDK_SYSCON RSTINFO: WDOGRESETREQ Mask */
+
+#define CMSDK_SYSCON_RSTINFO_LOCKUPRESET_Pos   2                                                /*!< CMSDK_SYSCON RSTINFO: LOCKUPRESET Position */
+#define CMSDK_SYSCON_RSTINFO_LOCKUPRESET_Msk   (0x1ul << CMSDK_SYSCON_RSTINFO_LOCKUPRESET_Pos)  /*!< CMSDK_SYSCON RSTINFO: LOCKUPRESET Mask */
+
+/*@}*/ /* end of group CMSDK_SYSCON */
+
+/*------------- PL230 uDMA (PL230) --------------------------------------*/
+/** @addtogroup CMSDK_PL230 CMSDK uDMA controller
+  @{
+*/
+typedef struct {                               /* PL230 uDMA controller register map; offsets are relative to the peripheral base */
+    __I    uint32_t  DMA_STATUS;               /* Offset: 0x000 (R/ ) DMA status Register */
+    __O    uint32_t  DMA_CFG;                  /* Offset: 0x004 ( /W) DMA configuration Register */
+    __IO   uint32_t  CTRL_BASE_PTR;            /* Offset: 0x008 (R/W) Channel Control Data Base Pointer Register */
+    __I    uint32_t  ALT_CTRL_BASE_PTR;        /* Offset: 0x00C (R/ ) Channel Alternate Control Data Base Pointer Register */
+    __I    uint32_t  DMA_WAITONREQ_STATUS;     /* Offset: 0x010 (R/ ) Channel Wait On Request Status Register */
+    __O    uint32_t  CHNL_SW_REQUEST;          /* Offset: 0x014 ( /W) Channel Software Request Register */
+    __IO   uint32_t  CHNL_USEBURST_SET;        /* Offset: 0x018 (R/W) Channel UseBurst Set Register */
+    __O    uint32_t  CHNL_USEBURST_CLR;        /* Offset: 0x01C ( /W) Channel UseBurst Clear Register */
+    __IO   uint32_t  CHNL_REQ_MASK_SET;        /* Offset: 0x020 (R/W) Channel Request Mask Set Register */
+    __O    uint32_t  CHNL_REQ_MASK_CLR;        /* Offset: 0x024 ( /W) Channel Request Mask Clear Register */
+    __IO   uint32_t  CHNL_ENABLE_SET;          /* Offset: 0x028 (R/W) Channel Enable Set Register */
+    __O    uint32_t  CHNL_ENABLE_CLR;          /* Offset: 0x02C ( /W) Channel Enable Clear Register */
+    __IO   uint32_t  CHNL_PRI_ALT_SET;         /* Offset: 0x030 (R/W) Channel Primary-Alternate Set Register */
+    __O    uint32_t  CHNL_PRI_ALT_CLR;         /* Offset: 0x034 ( /W) Channel Primary-Alternate Clear Register */
+    __IO   uint32_t  CHNL_PRIORITY_SET;        /* Offset: 0x038 (R/W) Channel Priority Set Register */
+    __O    uint32_t  CHNL_PRIORITY_CLR;        /* Offset: 0x03C ( /W) Channel Priority Clear Register */
+    uint32_t  RESERVED0[3];                    /* Offset: 0x040 - 0x048 Reserved (padding up to 0x04C) */
+    __IO   uint32_t  ERR_CLR;                  /* Offset: 0x04C (R/W) Bus Error Clear Register   */
+
+} CMSDK_PL230_TypeDef;
+
+#define PL230_DMA_CHNL_BITS 0                                                                                 /*!< Added to 5 to form the bit position of the CTRL_BASE_PTR field */
+
+#define CMSDK_PL230_DMA_STATUS_MSTREN_Pos          0                                                          /*!< CMSDK_PL230 DMA STATUS: MSTREN Position */
+#define CMSDK_PL230_DMA_STATUS_MSTREN_Msk          (0x00000001ul << CMSDK_PL230_DMA_STATUS_MSTREN_Pos)        /*!< CMSDK_PL230 DMA STATUS: MSTREN Mask */
+
+#define CMSDK_PL230_DMA_STATUS_STATE_Pos           4                                                          /*!< CMSDK_PL230 DMA STATUS: STATE Position (dma_status bits 7:4) */
+#define CMSDK_PL230_DMA_STATUS_STATE_Msk           (0x0000000Ful << CMSDK_PL230_DMA_STATUS_STATE_Pos)         /*!< CMSDK_PL230 DMA STATUS: STATE Mask */
+
+#define CMSDK_PL230_DMA_STATUS_CHNLS_MINUS1_Pos    16                                                         /*!< CMSDK_PL230 DMA STATUS: CHNLS_MINUS1 Position (dma_status bits 20:16) */
+#define CMSDK_PL230_DMA_STATUS_CHNLS_MINUS1_Msk    (0x0000001Ful << CMSDK_PL230_DMA_STATUS_CHNLS_MINUS1_Pos)  /*!< CMSDK_PL230 DMA STATUS: CHNLS_MINUS1 Mask */
+
+#define CMSDK_PL230_DMA_STATUS_TEST_STATUS_Pos     28                                                         /*!< CMSDK_PL230 DMA STATUS: TEST_STATUS Position (dma_status bits 31:28) */
+#define CMSDK_PL230_DMA_STATUS_TEST_STATUS_Msk     (0x0000000Ful << CMSDK_PL230_DMA_STATUS_TEST_STATUS_Pos)   /*!< CMSDK_PL230 DMA STATUS: TEST_STATUS Mask */
+
+#define CMSDK_PL230_DMA_CFG_MSTREN_Pos             0                                                          /*!< CMSDK_PL230 DMA CFG: MSTREN Position */
+#define CMSDK_PL230_DMA_CFG_MSTREN_Msk             (0x00000001ul << CMSDK_PL230_DMA_CFG_MSTREN_Pos)           /*!< CMSDK_PL230 DMA CFG: MSTREN Mask */
+
+#define CMSDK_PL230_DMA_CFG_CPCCACHE_Pos           7                                                          /*!< CMSDK_PL230 DMA CFG: CPCCACHE Position (chnl_prot_ctrl, dma_cfg bit 7) */
+#define CMSDK_PL230_DMA_CFG_CPCCACHE_Msk           (0x00000001ul << CMSDK_PL230_DMA_CFG_CPCCACHE_Pos)         /*!< CMSDK_PL230 DMA CFG: CPCCACHE Mask */
+
+#define CMSDK_PL230_DMA_CFG_CPCBUF_Pos             6                                                          /*!< CMSDK_PL230 DMA CFG: CPCBUF Position (chnl_prot_ctrl, dma_cfg bit 6) */
+#define CMSDK_PL230_DMA_CFG_CPCBUF_Msk             (0x00000001ul << CMSDK_PL230_DMA_CFG_CPCBUF_Pos)           /*!< CMSDK_PL230 DMA CFG: CPCBUF Mask */
+
+#define CMSDK_PL230_DMA_CFG_CPCPRIV_Pos            5                                                          /*!< CMSDK_PL230 DMA CFG: CPCPRIV Position (chnl_prot_ctrl, dma_cfg bit 5) */
+#define CMSDK_PL230_DMA_CFG_CPCPRIV_Msk            (0x00000001ul << CMSDK_PL230_DMA_CFG_CPCPRIV_Pos)          /*!< CMSDK_PL230 DMA CFG: CPCPRIV Mask */
+
+#define CMSDK_PL230_CTRL_BASE_PTR_Pos              (PL230_DMA_CHNL_BITS + 5)                                  /*!< CMSDK_PL230 CTRL_BASE_PTR: BASE_PTR Position (parenthesized so it is safe in any expression) */
+#define CMSDK_PL230_CTRL_BASE_PTR_Msk              (0x0FFFFFFFul << CMSDK_PL230_CTRL_BASE_PTR_Pos)            /*!< CMSDK_PL230 CTRL_BASE_PTR: BASE_PTR Mask */
+
+#define CMSDK_PL230_ALT_CTRL_BASE_PTR_Pos          0                                                          /*!< CMSDK_PL230 ALT_CTRL_BASE_PTR: ALT_CTRL_BASE_PTR Position */
+#define CMSDK_PL230_ALT_CTRL_BASE_PTR_Msk          (0xFFFFFFFFul << CMSDK_PL230_ALT_CTRL_BASE_PTR_Pos)        /*!< CMSDK_PL230 ALT_CTRL_BASE_PTR: ALT_CTRL_BASE_PTR Mask */
+
+#define CMSDK_PL230_DMA_WAITONREQ_STATUS_Pos       0                                                          /*!< CMSDK_PL230 DMA_WAITONREQ_STATUS: DMA_WAITONREQ_STATUS Position */
+#define CMSDK_PL230_DMA_WAITONREQ_STATUS_Msk       (0xFFFFFFFFul << CMSDK_PL230_DMA_WAITONREQ_STATUS_Pos)     /*!< CMSDK_PL230 DMA_WAITONREQ_STATUS: DMA_WAITONREQ_STATUS Mask */
+
+#define CMSDK_PL230_CHNL_SW_REQUEST_Pos            0                                                          /*!< CMSDK_PL230 CHNL_SW_REQUEST: CHNL_SW_REQUEST Position */
+#define CMSDK_PL230_CHNL_SW_REQUEST_Msk            (0xFFFFFFFFul << CMSDK_PL230_CHNL_SW_REQUEST_Pos)          /*!< CMSDK_PL230 CHNL_SW_REQUEST: CHNL_SW_REQUEST Mask */
+
+#define CMSDK_PL230_CHNL_USEBURST_SET_Pos          0                                                          /*!< CMSDK_PL230 CHNL_USEBURST: SET Position */
+#define CMSDK_PL230_CHNL_USEBURST_SET_Msk          (0xFFFFFFFFul << CMSDK_PL230_CHNL_USEBURST_SET_Pos)        /*!< CMSDK_PL230 CHNL_USEBURST: SET Mask */
+
+#define CMSDK_PL230_CHNL_USEBURST_CLR_Pos          0                                                          /*!< CMSDK_PL230 CHNL_USEBURST: CLR Position */
+#define CMSDK_PL230_CHNL_USEBURST_CLR_Msk          (0xFFFFFFFFul << CMSDK_PL230_CHNL_USEBURST_CLR_Pos)        /*!< CMSDK_PL230 CHNL_USEBURST: CLR Mask */
+
+#define CMSDK_PL230_CHNL_REQ_MASK_SET_Pos          0                                                          /*!< CMSDK_PL230 CHNL_REQ_MASK: SET Position */
+#define CMSDK_PL230_CHNL_REQ_MASK_SET_Msk          (0xFFFFFFFFul << CMSDK_PL230_CHNL_REQ_MASK_SET_Pos)        /*!< CMSDK_PL230 CHNL_REQ_MASK: SET Mask */
+
+#define CMSDK_PL230_CHNL_REQ_MASK_CLR_Pos          0                                                          /*!< CMSDK_PL230 CHNL_REQ_MASK: CLR Position */
+#define CMSDK_PL230_CHNL_REQ_MASK_CLR_Msk          (0xFFFFFFFFul << CMSDK_PL230_CHNL_REQ_MASK_CLR_Pos)        /*!< CMSDK_PL230 CHNL_REQ_MASK: CLR Mask */
+
+#define CMSDK_PL230_CHNL_ENABLE_SET_Pos            0                                                          /*!< CMSDK_PL230 CHNL_ENABLE: SET Position */
+#define CMSDK_PL230_CHNL_ENABLE_SET_Msk            (0xFFFFFFFFul << CMSDK_PL230_CHNL_ENABLE_SET_Pos)          /*!< CMSDK_PL230 CHNL_ENABLE: SET Mask */
+
+#define CMSDK_PL230_CHNL_ENABLE_CLR_Pos            0                                                          /*!< CMSDK_PL230 CHNL_ENABLE: CLR Position */
+#define CMSDK_PL230_CHNL_ENABLE_CLR_Msk            (0xFFFFFFFFul << CMSDK_PL230_CHNL_ENABLE_CLR_Pos)          /*!< CMSDK_PL230 CHNL_ENABLE: CLR Mask */
+
+#define CMSDK_PL230_CHNL_PRI_ALT_SET_Pos           0                                                          /*!< CMSDK_PL230 CHNL_PRI_ALT: SET Position */
+#define CMSDK_PL230_CHNL_PRI_ALT_SET_Msk           (0xFFFFFFFFul << CMSDK_PL230_CHNL_PRI_ALT_SET_Pos)         /*!< CMSDK_PL230 CHNL_PRI_ALT: SET Mask */
+
+#define CMSDK_PL230_CHNL_PRI_ALT_CLR_Pos           0                                                          /*!< CMSDK_PL230 CHNL_PRI_ALT: CLR Position */
+#define CMSDK_PL230_CHNL_PRI_ALT_CLR_Msk           (0xFFFFFFFFul << CMSDK_PL230_CHNL_PRI_ALT_CLR_Pos)         /*!< CMSDK_PL230 CHNL_PRI_ALT: CLR Mask */
+
+#define CMSDK_PL230_CHNL_PRIORITY_SET_Pos          0                                                          /*!< CMSDK_PL230 CHNL_PRIORITY: SET Position */
+#define CMSDK_PL230_CHNL_PRIORITY_SET_Msk          (0xFFFFFFFFul << CMSDK_PL230_CHNL_PRIORITY_SET_Pos)        /*!< CMSDK_PL230 CHNL_PRIORITY: SET Mask */
+
+#define CMSDK_PL230_CHNL_PRIORITY_CLR_Pos          0                                                          /*!< CMSDK_PL230 CHNL_PRIORITY: CLR Position */
+#define CMSDK_PL230_CHNL_PRIORITY_CLR_Msk          (0xFFFFFFFFul << CMSDK_PL230_CHNL_PRIORITY_CLR_Pos)        /*!< CMSDK_PL230 CHNL_PRIORITY: CLR Mask */
+
+#define CMSDK_PL230_ERR_CLR_Pos                    0                                                          /*!< CMSDK_PL230 ERR: CLR Position */
+#define CMSDK_PL230_ERR_CLR_Msk                    (0x00000001ul << CMSDK_PL230_ERR_CLR_Pos)                  /*!< CMSDK_PL230 ERR: CLR Mask */
+
+
+/*@}*/ /* end of group CMSDK_PL230 */
+
+
+/*------------- PrimeCell UART (PL110) --------------------------------------*/
+/** @addtogroup CMSDK_PL110 CMSDK PrimeCell UART
+  @{
+*/
+
+typedef struct {                // PrimeCell UART register block: consecutive 32-bit registers, offsets relative to the peripheral base
+    __IO uint32_t UARTDR;       /* Offset: 0x000 */ // <h> Data
+    //   <o.11>   OE: Overrun error <r>
+    //   <o.10>   BE: Break error <r>
+    //   <o.9>    PE: Parity error <r>
+    //   <o.8>    FE: Framing error <r>
+    //   <o.0..7> DATA: Received or Transmitting data (0..255)
+    // </h>
+    union {                     // Both members alias the same word at offset 0x004 (status on read, clear on write)
+        __I  uint32_t UARTRSR;      /* Offset: 0x004 */ // <h> Receive Status <r>
+        //   <o.3>    OE: Overrun error <r>
+        //   <o.2>    BE: Break error <r>
+        //   <o.1>    PE: Parity error <r>
+        //   <o.0>    FE: Framing error <r>
+        // </h>
+        __O  uint32_t UARTECR;      /* Offset: 0x004 */ // <h> Error Clear <w>
+        //   <o.3>    OE: Overrun error <w>
+        //   <o.2>    BE: Break error <w>
+        //   <o.1>    PE: Parity error <w>
+        //   <o.0>    FE: Framing error <w>
+        // </h>
+    };
+    uint32_t RESERVED0[4];      /* Offset: 0x008 - 0x014 Reserved (padding up to 0x018) */
+    __IO uint32_t UARTFR;       /* Offset: 0x018 */ // <h> Flags <r>
+    //   <o.8>    RI: Ring indicator <r>
+    //   <o.7>    TXFE: Transmit FIFO empty <r>
+    //   <o.6>    RXFF: Receive FIFO full <r>
+    //   <o.5>    TXFF: Transmit FIFO full <r>
+    //   <o.4>    RXFE: Receive FIFO empty <r>
+    //   <o.3>    BUSY: UART busy <r>
+    //   <o.2>    DCD: Data carrier detect <r>
+    //   <o.1>    DSR: Data set ready <r>
+    //   <o.0>    CTS: Clear to send <r>
+    // </h>
+    uint32_t RESERVED1;         /* Offset: 0x01C Reserved */
+    __IO uint32_t UARTILPR;     /* Offset: 0x020 */ // <h> IrDA Low-power Counter
+    //   <o.0..7> ILPDVSR: 8-bit low-power divisor value (0..255)
+    // </h>
+    __IO uint32_t UARTIBRD;     /* Offset: 0x024 */ // <h> Integer Baud Rate
+    //   <o.0..15> BAUD DIVINT: Integer baud rate divisor (0..65535)
+    // </h>
+    __IO uint32_t UARTFBRD;     /* Offset: 0x028 */ // <h> Fractional Baud Rate
+    //   <o.0..5> BAUD DIVFRAC: Fractional baud rate divisor (0..63)
+    // </h>
+    __IO uint32_t UARTLCR_H;    /* Offset: 0x02C */ // <h> Line Control
+    //   <o.8>    SPS: Stick parity select (NOTE(review): the PL011 TRM documents SPS at bit 7 - confirm)
+    //   <o.5..6> WLEN: Word length
+    //     <0=> 5 bits
+    //     <1=> 6 bits
+    //     <2=> 7 bits
+    //     <3=> 8 bits
+    //   <o.4>    FEN: Enable FIFOs
+    //   <o.3>    STP2: Two stop bits select
+    //   <o.2>    EPS: Even parity select
+    //   <o.1>    PEN: Parity enable
+    //   <o.0>    BRK: Send break
+    // </h>
+    __IO uint32_t UARTCR;       /* Offset: 0x030 */ // <h> Control
+    //   <o.15>   CTSEn: CTS hardware flow control enable
+    //   <o.14>   RTSEn: RTS hardware flow control enable
+    //   <o.13>   Out2: Complement of Out2 modem status output
+    //   <o.12>   Out1: Complement of Out1 modem status output
+    //   <o.11>   RTS: Request to send
+    //   <o.10>   DTR: Data transmit ready
+    //   <o.9>    RXE: Receive enable
+    //   <o.8>    TXE: Transmit enable
+    //   <o.7>    LBE: Loop-back enable
+    //   <o.2>    SIRLP: IrDA SIR low power mode
+    //   <o.1>    SIREN: SIR enable
+    //   <o.0>    UARTEN: UART enable
+    // </h>
+    __IO uint32_t UARTIFLS;     /* Offset: 0x034 */ // <h> Interrupt FIFO Level Select
+    //   <o.3..5> RXIFLSEL: Receive interrupt FIFO level select
+    //     <0=> >= 1/8 full
+    //     <1=> >= 1/4 full
+    //     <2=> >= 1/2 full
+    //     <3=> >= 3/4 full
+    //     <4=> >= 7/8 full
+    //     <5=> reserved
+    //     <6=> reserved
+    //     <7=> reserved
+    //   <o.0..2> TXIFLSEL: Transmit interrupt FIFO level select
+    //     <0=> <= 1/8 full
+    //     <1=> <= 1/4 full
+    //     <2=> <= 1/2 full
+    //     <3=> <= 3/4 full
+    //     <4=> <= 7/8 full
+    //     <5=> reserved
+    //     <6=> reserved
+    //     <7=> reserved
+    // </h>
+    __IO uint32_t UARTIMSC;     /* Offset: 0x038 */ // <h> Interrupt Mask Set / Clear
+    //   <o.10>   OEIM: Overrun error interrupt mask
+    //   <o.9>    BEIM: Break error interrupt mask
+    //   <o.8>    PEIM: Parity error interrupt mask
+    //   <o.7>    FEIM: Framing error interrupt mask
+    //   <o.6>    RTIM: Receive timeout interrupt mask
+    //   <o.5>    TXIM: Transmit interrupt mask
+    //   <o.4>    RXIM: Receive interrupt mask
+    //   <o.3>    DSRMIM: nUARTDSR modem interrupt mask
+    //   <o.2>    DCDMIM: nUARTDCD modem interrupt mask
+    //   <o.1>    CTSMIM: nUARTCTS modem interrupt mask
+    //   <o.0>    RIMIM: nUARTRI modem interrupt mask
+    // </h>
+    __IO uint32_t UARTRIS;      /* Offset: 0x03C */ // <h> Raw Interrupt Status <r>
+    //   <o.10>   OERIS: Overrun error interrupt status <r>
+    //   <o.9>    BERIS: Break error interrupt status <r>
+    //   <o.8>    PERIS: Parity error interrupt status <r>
+    //   <o.7>    FERIS: Framing error interrupt status <r>
+    //   <o.6>    RTRIS: Receive timeout interrupt status <r>
+    //   <o.5>    TXRIS: Transmit interrupt status <r>
+    //   <o.4>    RXRIS: Receive interrupt status <r>
+    //   <o.3>    DSRRMIS: nUARTDSR modem interrupt status <r>
+    //   <o.2>    DCDRMIS: nUARTDCD modem interrupt status <r>
+    //   <o.1>    CTSRMIS: nUARTCTS modem interrupt status <r>
+    //   <o.0>    RIRMIS: nUARTRI modem interrupt status <r>
+    // </h>
+    __IO uint32_t UARTMIS;      /* Offset: 0x040 */ // <h> Masked Interrupt Status <r>
+    //   <o.10>   OEMIS: Overrun error masked interrupt status <r>
+    //   <o.9>    BEMIS: Break error masked interrupt status <r>
+    //   <o.8>    PEMIS: Parity error masked interrupt status <r>
+    //   <o.7>    FEMIS: Framing error masked interrupt status <r>
+    //   <o.6>    RTMIS: Receive timeout masked interrupt status <r>
+    //   <o.5>    TXMIS: Transmit masked interrupt status <r>
+    //   <o.4>    RXMIS: Receive masked interrupt status <r>
+    //   <o.3>    DSRMMIS: nUARTDSR modem masked interrupt status <r>
+    //   <o.2>    DCDMMIS: nUARTDCD modem masked interrupt status <r>
+    //   <o.1>    CTSMMIS: nUARTCTS modem masked interrupt status <r>
+    //   <o.0>    RIMMIS: nUARTRI modem masked interrupt status <r>
+    // </h>
+    __O  uint32_t UARTICR;      /* Offset: 0x044 */ // <h> Interrupt Clear <w>
+    //   <o.10>   OEIC: Overrun error interrupt clear <w>
+    //   <o.9>    BEIC: Break error interrupt clear <w>
+    //   <o.8>    PEIC: Parity error interrupt clear <w>
+    //   <o.7>    FEIC: Framing error interrupt clear <w>
+    //   <o.6>    RTIC: Receive timeout interrupt clear <w>
+    //   <o.5>    TXIC: Transmit interrupt clear <w>
+    //   <o.4>    RXIC: Receive interrupt clear <w>
+    //   <o.3>    DSRMIC: nUARTDSR modem interrupt clear <w>
+    //   <o.2>    DCDMIC: nUARTDCD modem interrupt clear <w>
+    //   <o.1>    CTSMIC: nUARTCTS modem interrupt clear <w>
+    //   <o.0>    RIMIC: nUARTRI modem interrupt clear <w>
+    // </h>
+    __IO uint32_t UARTDMACR;    /* Offset: 0x048 */ // <h> DMA Control
+    //   <o.2>    DMAONERR: DMA on error
+    //   <o.1>    TXDMAE: Transmit DMA enable
+    //   <o.0>    RXDMAE: Receive DMA enable
+    // </h>
+} PL110_UART_TypeDef;
+
+#define CMSDK_PL110_DATAOVRRUN_Pos            11                                             /*!< CMSDK_PL110 DATAOVRRUN: Data Overrun Position */
+#define CMSDK_PL110_DATAOVRRUN_Msk           (0x1ul << CMSDK_PL110_DATAOVRRUN_Pos)           /*!< CMSDK_PL110 DATAOVRRUN: Data Overrun Mask */
+
+#define CMSDK_PL110_DATABREAKERR_Pos          10                                             /*!< CMSDK_PL110 DATABREAKERR: Data Break Error Position */
+#define CMSDK_PL110_DATABREAKERR_Msk         (0x1ul << CMSDK_PL110_DATABREAKERR_Pos)         /*!< CMSDK_PL110 DATABREAKERR: Data Break Error Mask */
+
+#define CMSDK_PL110_DATAPARITYERR_Pos         9                                              /*!< CMSDK_PL110 DATAPARITYERR: Data Parity Error Position */
+#define CMSDK_PL110_DATAPARITYERR_Msk        (0x1ul << CMSDK_PL110_DATAPARITYERR_Pos)        /*!< CMSDK_PL110 DATAPARITYERR: Data Parity Error Mask */
+
+#define CMSDK_PL110_DATAFRAMEERR_Pos          8                                              /*!< CMSDK_PL110 DATAFRAMEERR: Data Frame Error Position */
+#define CMSDK_PL110_DATAFRAMEERR_Msk         (0x1ul << CMSDK_PL110_DATAFRAMEERR_Pos)         /*!< CMSDK_PL110 DATAFRAMEERR: Data Frame Error Mask */
+
+#define CMSDK_PL110_RECOVRRUN_Pos             3                                              /*!< CMSDK_PL110 RECOVRRUN: Receive Overrun Position */
+#define CMSDK_PL110_RECOVRRUN_Msk            (0x1ul << CMSDK_PL110_RECOVRRUN_Pos)            /*!< CMSDK_PL110 RECOVRRUN: Receive Overrun Mask */
+
+#define CMSDK_PL110_RECBREAKERR_Pos           2                                              /*!< CMSDK_PL110 RECBREAKERR: Receive Break Error Position */
+#define CMSDK_PL110_RECBREAKERR_Msk          (0x1ul << CMSDK_PL110_RECBREAKERR_Pos)          /*!< CMSDK_PL110 RECBREAKERR: Receive Break Error Mask */
+
+#define CMSDK_PL110_RECPARITYERR_Pos          1                                              /*!< CMSDK_PL110 RECPARITYERR: Receive Parity Error Position */
+#define CMSDK_PL110_RECPARITYERR_Msk         (0x1ul << CMSDK_PL110_RECPARITYERR_Pos)         /*!< CMSDK_PL110 RECPARITYERR: Receive Parity Error Mask */
+
+#define CMSDK_PL110_RECFRAMEERR_Pos           0                                              /*!< CMSDK_PL110 RECFRAMEERR: Receive Frame Error Position */
+#define CMSDK_PL110_RECFRAMEERR_Msk          (0x1ul << CMSDK_PL110_RECFRAMEERR_Pos)          /*!< CMSDK_PL110 RECFRAMEERR: Receive Frame Error Mask */
+
+#define CMSDK_PL110_ERRCLROVRRUN_Pos          3                                              /*!< CMSDK_PL110 ERRCLROVRRUN: Clear Overrun Position */
+#define CMSDK_PL110_ERRCLROVRRUN_Msk         (0x1ul << CMSDK_PL110_ERRCLROVRRUN_Pos)         /*!< CMSDK_PL110 ERRCLROVRRUN: Clear Overrun Mask */
+
+#define CMSDK_PL110_ERRCLRBREAKERR_Pos        2                                              /*!< CMSDK_PL110 ERRCLRBREAKERR: Clear Break Error Position */
+#define CMSDK_PL110_ERRCLRBREAKERR_Msk       (0x1ul << CMSDK_PL110_ERRCLRBREAKERR_Pos)       /*!< CMSDK_PL110 ERRCLRBREAKERR: Clear Break Error Mask */
+
+#define CMSDK_PL110_ERRCLRPARITYERR_Pos       1                                              /*!< CMSDK_PL110 ERRCLRPARITYERR: Clear Parity Error Position */
+#define CMSDK_PL110_ERRCLRPARITYERR_Msk      (0x1ul << CMSDK_PL110_ERRCLRPARITYERR_Pos)      /*!< CMSDK_PL110 ERRCLRPARITYERR: Clear Parity Error Mask */
+
+#define CMSDK_PL110_ERRCLRFRAMEERR_Pos        0                                              /*!< CMSDK_PL110 ERRCLRFRAMEERR: Clear Frame Error Position */
+#define CMSDK_PL110_ERRCLRFRAMEERR_Msk       (0x1ul << CMSDK_PL110_ERRCLRFRAMEERR_Pos)       /*!< CMSDK_PL110 ERRCLRFRAMEERR: Clear Frame Error Mask */
+
+#define CMSDK_PL110_FLAG_RINGIND_Pos          8                                              /*!< CMSDK_PL110 FLAG_RINGIND: Ring Indicator Position */
+#define CMSDK_PL110_FLAG_RINGIND_Msk         (0x1ul << CMSDK_PL110_FLAG_RINGIND_Pos)         /*!< CMSDK_PL110 FLAG_RINGIND: Ring Indicator Mask */
+
+#define CMSDK_PL110_FLAG_TXFEMPTY_Pos         7                                              /*!< CMSDK_PL110 FLAG_TXFEMPTY: Transmit FIFO Empty Position */
+#define CMSDK_PL110_FLAG_TXFEMPTY_Msk        (0x1ul << CMSDK_PL110_FLAG_TXFEMPTY_Pos)        /*!< CMSDK_PL110 FLAG_TXFEMPTY: Transmit FIFO Empty Mask */
+
+#define CMSDK_PL110_FLAG_RXFFULL_Pos          6                                              /*!< CMSDK_PL110 FLAG_RXFFULL: Receive FIFO Full Position */
+#define CMSDK_PL110_FLAG_RXFFULL_Msk         (0x1ul << CMSDK_PL110_FLAG_RXFFULL_Pos)         /*!< CMSDK_PL110 FLAG_RXFFULL: Receive FIFO Full Mask */
+
+#define CMSDK_PL110_FLAG_TXFFULL_Pos          5                                              /*!< CMSDK_PL110 FLAG_TXFFULL: Transmit FIFO Full Position */
+#define CMSDK_PL110_FLAG_TXFFULL_Msk         (0x1ul << CMSDK_PL110_FLAG_TXFFULL_Pos)         /*!< CMSDK_PL110 FLAG_TXFFULL: Transmit FIFO Full Mask */
+
+#define CMSDK_PL110_FLAG_RXFEMPTY_Pos         4                                              /*!< CMSDK_PL110 FLAG_RXFEMPTY: Receive FIFO Empty Position */
+#define CMSDK_PL110_FLAG_RXFEMPTY_Msk        (0x1ul << CMSDK_PL110_FLAG_RXFEMPTY_Pos)        /*!< CMSDK_PL110 FLAG_RXFEMPTY: Receive FIFO Empty Mask */
+
+#define CMSDK_PL110_FLAG_UARTBUSY_Pos         3                                              /*!< CMSDK_PL110 FLAG_UARTBUSY: UART Busy Position */
+#define CMSDK_PL110_FLAG_UARTBUSY_Msk        (0x1ul << CMSDK_PL110_FLAG_UARTBUSY_Pos)        /*!< CMSDK_PL110 FLAG_UARTBUSY: UART Busy Mask */
+
+#define CMSDK_PL110_FLAG_CARRIERDETECT_Pos    2                                              /*!< CMSDK_PL110 FLAG_CARRIERDETECT: Carrier Detect Position */
+#define CMSDK_PL110_FLAG_CARRIERDETECT_Msk   (0x1ul << CMSDK_PL110_FLAG_CARRIERDETECT_Pos)   /*!< CMSDK_PL110 FLAG_CARRIERDETECT: Carrier Detect Mask */
+
+#define CMSDK_PL110_FLAG_DATASETREADY_Pos     1                                              /*!< CMSDK_PL110 FLAG_DATASETREADY: Data Set Ready Position */
+#define CMSDK_PL110_FLAG_DATASETREADY_Msk    (0x1ul << CMSDK_PL110_FLAG_DATASETREADY_Pos)    /*!< CMSDK_PL110 FLAG_DATASETREADY: Data Set Ready Mask */
+
+#define CMSDK_PL110_FLAG_CLR2SEND_Pos         0                                              /*!< CMSDK_PL110 FLAG_CLR2SEND: Clear To Send Position */
+#define CMSDK_PL110_FLAG_CLR2SEND_Msk        (0x1ul << CMSDK_PL110_FLAG_CLR2SEND_Pos)        /*!< CMSDK_PL110 FLAG_CLR2SEND: Clear To Send Mask */
+
+#define CMSDK_PL110_IRDALOWPOWERCOUNT_Pos    0                                               /*!< CMSDK_PL110 IRDALOWPOWERCOUNT: IrDA 8-bit low-power divisor value Position */
+#define CMSDK_PL110_IRDALOWPOWERCOUNT_Msk    (0xFFul << CMSDK_PL110_IRDALOWPOWERCOUNT_Pos)   /*!< CMSDK_PL110 IRDALOWPOWERCOUNT: IrDA 8-bit low-power divisor value Mask */
+
+#define CMSDK_PL110_INTDIVIDER_Pos           0                                               /*!< CMSDK_PL110 INTDIVIDER: Integer Divider Position */
+#define CMSDK_PL110_INTDIVIDER_Msk           (0xFFFFul << CMSDK_PL110_INTDIVIDER_Pos)        /*!< CMSDK_PL110 INTDIVIDER: Integer Divider Mask */
+
+#define CMSDK_PL110_FRACTDIVIDER_Pos         0                                               /*!< CMSDK_PL110 FRACTDIVIDER: Fractional Divider Position */
+#define CMSDK_PL110_FRACTDIVIDER_Msk         (0x3Ful << CMSDK_PL110_FRACTDIVIDER_Pos)        /*!< CMSDK_PL110 FRACTDIVIDER: Fractional Divider Mask */
+
+#define CMSDK_PL110_STICKPARITYSEL_Pos       8                                               /*!< CMSDK_PL110 STICKPARITYSEL: Stick parity select Position */
+#define CMSDK_PL110_STICKPARITYSEL_Msk       (0x1ul << CMSDK_PL110_STICKPARITYSEL_Pos)       /*!< CMSDK_PL110 STICKPARITYSEL: Stick parity select Mask */
+
+#define CMSDK_PL110_WORDLEN_Pos              5                                               /*!< CMSDK_PL110 WORDLEN: Word Length Select Position */
+#define CMSDK_PL110_WORDLEN_Msk              (0x3ul << CMSDK_PL110_WORDLEN_Pos)              /*!< CMSDK_PL110 WORDLEN: Word Length Select Mask */
+
+#define CMSDK_PL110_ENFIFOS_Pos              4                                               /*!< CMSDK_PL110 ENFIFOS: Enable FIFOs Position */
+#define CMSDK_PL110_ENFIFOS_Msk              (0x1ul << CMSDK_PL110_ENFIFOS_Pos)              /*!< CMSDK_PL110 ENFIFOS: Enable FIFOs Mask */
+
+#define CMSDK_PL110_2STOPBITS_Pos            3                                               /*!< CMSDK_PL110 2STOPBITS: Two Stop Bits Select Position */
+#define CMSDK_PL110_2STOPBITS_Msk            (0x1ul << CMSDK_PL110_2STOPBITS_Pos)            /*!< CMSDK_PL110 2STOPBITS: Two Stop Bits Select Mask */
+
+#define CMSDK_PL110_EVENPARITY_Pos           2                                               /*!< CMSDK_PL110 EVENPARITY: Even Parity Select Position */
+#define CMSDK_PL110_EVENPARITY_Msk           (0x1ul << CMSDK_PL110_EVENPARITY_Pos)           /*!< CMSDK_PL110 EVENPARITY: Even Parity Select Mask */
+
+#define CMSDK_PL110_PARITYEN_Pos             1                                               /*!< CMSDK_PL110 PARITYEN: Parity Enable Position */
+#define CMSDK_PL110_PARITYEN_Msk             (0x1ul << CMSDK_PL110_PARITYEN_Pos)             /*!< CMSDK_PL110 PARITYEN: Parity Enable Mask */
+
+#define CMSDK_PL110_SENDBREAK_Pos            0                                               /*!< CMSDK_PL110 SENDBREAK: Send Break Position */
+#define CMSDK_PL110_SENDBREAK_Msk            (0x1ul << CMSDK_PL110_SENDBREAK_Pos)            /*!< CMSDK_PL110 SENDBREAK: Send Break Mask */
+
+#define CMSDK_PL110_CTS_FLOWCTRL_Pos         15                                              /*!< CMSDK_PL110 CTS_FLOWCTRL: Enable CTS Flow Control Position */
+#define CMSDK_PL110_CTS_FLOWCTRL_Msk         (0x1ul << CMSDK_PL110_CTS_FLOWCTRL_Pos)         /*!< CMSDK_PL110 CTS_FLOWCTRL: Enable CTS Flow Control Mask */
+
+#define CMSDK_PL110_RTS_FLOWCTRL_Pos         14                                              /*!< CMSDK_PL110 RTS_FLOWCTRL: Enable RTS Flow Control Position */
+#define CMSDK_PL110_RTS_FLOWCTRL_Msk         (0x1ul << CMSDK_PL110_RTS_FLOWCTRL_Pos)         /*!< CMSDK_PL110 RTS_FLOWCTRL: Enable RTS Flow Control Mask */
+
+#define CMSDK_PL110_OUT2_Pos                 13                                              /*!< CMSDK_PL110 OUT2: Complement of Out2 modem status output Position */
+#define CMSDK_PL110_OUT2_Msk                 (0x1ul << CMSDK_PL110_OUT2_Pos)                 /*!< CMSDK_PL110 OUT2: Complement of Out2 modem status output Mask */
+
+#define CMSDK_PL110_OUT1_Pos                 12                                              /*!< CMSDK_PL110 OUT1: Complement of Out1 modem status output Position */
+#define CMSDK_PL110_OUT1_Msk                 (0x1ul << CMSDK_PL110_OUT1_Pos)                 /*!< CMSDK_PL110 OUT1: Complement of Out1 modem status output Mask */
+
+#define CMSDK_PL110_REQ2SEND_Pos             11                                              /*!< CMSDK_PL110 REQ2SEND: Request To Send Position */
+#define CMSDK_PL110_REQ2SEND_Msk             (0x1ul << CMSDK_PL110_REQ2SEND_Pos)             /*!< CMSDK_PL110 REQ2SEND: Request To Send Mask */
+
+#define CMSDK_PL110_DATATRANSREADY_Pos       10                                              /*!< CMSDK_PL110 DATATRANSREADY: Transmit Ready Position */
+#define CMSDK_PL110_DATATRANSREADY_Msk       (0x1ul << CMSDK_PL110_DATATRANSREADY_Pos)       /*!< CMSDK_PL110 DATATRANSREADY: Transmit Ready Mask */
+
+#define CMSDK_PL110_RXEN_Pos                 9                                               /*!< CMSDK_PL110 RXEN: Receive Enable Position */
+#define CMSDK_PL110_RXEN_Msk                 (0x1ul << CMSDK_PL110_RXEN_Pos)                 /*!< CMSDK_PL110 RXEN: Receive Enable Mask */
+
+#define CMSDK_PL110_TXEN_Pos                 8                                               /*!< CMSDK_PL110 TXEN: Transmit Enable Position */
+#define CMSDK_PL110_TXEN_Msk                 (0x1ul << CMSDK_PL110_TXEN_Pos)                 /*!< CMSDK_PL110 TXEN: Transmit Enable Mask */
+
+#define CMSDK_PL110_LOOPBACKEN_Pos           7                                               /*!< CMSDK_PL110 LOOPBACKEN: Loopback Enable Position */
+#define CMSDK_PL110_LOOPBACKEN_Msk           (0x1ul << CMSDK_PL110_LOOPBACKEN_Pos)           /*!< CMSDK_PL110 LOOPBACKEN: Loopback Enable Mask */
+
+#define CMSDK_PL110_IRDASIRLPM_Pos           2                                               /*!< CMSDK_PL110 IRDASIRLPM: IRDA SIR Low Power Position */
+#define CMSDK_PL110_IRDASIRLPM_Msk           (0x1ul << CMSDK_PL110_IRDASIRLPM_Pos)           /*!< CMSDK_PL110 IRDASIRLPM: IRDA SIR Low Power Mask */
+
+#define CMSDK_PL110_SIREN_Pos                1                                               /*!< CMSDK_PL110 SIREN: SIR Enable Position */
+#define CMSDK_PL110_SIREN_Msk                (0x1ul << CMSDK_PL110_SIREN_Pos)                /*!< CMSDK_PL110 SIREN: SIR Enable Mask */
+
+#define CMSDK_PL110_UARTEN_Pos               0                                               /*!< CMSDK_PL110 UARTEN: UART Enable Position */
+#define CMSDK_PL110_UARTEN_Msk               (0x1ul << CMSDK_PL110_UARTEN_Pos)               /*!< CMSDK_PL110 UARTEN: UART Enable Mask */
+
+#define CMSDK_PL110_RECINTFIFOLEVEL_Pos           3                                                         /*!< CMSDK_PL110 RECINTFIFOLEVEL: Set Receive Int FIFO Level Position */
+#define CMSDK_PL110_RECINTFIFOLEVEL_Msk           (0x7ul << CMSDK_PL110_RECINTFIFOLEVEL_Pos)                /*!< CMSDK_PL110 RECINTFIFOLEVEL: Set Receive Int FIFO Level Mask */
+
+#define CMSDK_PL110_TRANSINTFIFOLEVEL_Pos         0                                                         /*!< CMSDK_PL110 TRANSINTFIFOLEVEL: Set Transmit Int FIFO Level Position */
+#define CMSDK_PL110_TRANSINTFIFOLEVEL_Msk         (0x7ul << CMSDK_PL110_TRANSINTFIFOLEVEL_Pos)              /*!< CMSDK_PL110 TRANSINTFIFOLEVEL: Set Transmit Int FIFO Level Mask */
+
+#define CMSDK_PL110_SETMASK_OVRRUNERRINT_Pos      10                                                        /*!< CMSDK_PL110 SETMASK_OVRRUNERRINT: Set Overrun Error Int Mask Position */
+#define CMSDK_PL110_SETMASK_OVRRUNERRINT_Msk      (0x1ul << CMSDK_PL110_SETMASK_OVRRUNERRINT_Pos)           /*!< CMSDK_PL110 SETMASK_OVRRUNERRINT: Set Overrun Error Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_BREAKERRINT_Pos       9                                                         /*!< CMSDK_PL110 SETMASK_BREAKERRINT: Set Break Error Int Mask Position */
+#define CMSDK_PL110_SETMASK_BREAKERRINT_Msk       (0x1ul << CMSDK_PL110_SETMASK_BREAKERRINT_Pos)            /*!< CMSDK_PL110 SETMASK_BREAKERRINT: Set Break Error Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_PARITYERRINT_Pos      8                                                         /*!< CMSDK_PL110 SETMASK_PARITYERRINT: Set Parity Error Int Mask Position */
+#define CMSDK_PL110_SETMASK_PARITYERRINT_Msk      (0x1ul << CMSDK_PL110_SETMASK_PARITYERRINT_Pos)           /*!< CMSDK_PL110 SETMASK_PARITYERRINT: Set Parity Error Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_FRAMEERRINT_Pos       7                                                         /*!< CMSDK_PL110 SETMASK_FRAMEERRINT: Set Frame Error Int Mask Position */
+#define CMSDK_PL110_SETMASK_FRAMEERRINT_Msk       (0x1ul << CMSDK_PL110_SETMASK_FRAMEERRINT_Pos)            /*!< CMSDK_PL110 SETMASK_FRAMEERRINT: Set Frame Error Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_RECTRANSINT_Pos       6                                                         /*!< CMSDK_PL110 SETMASK_RECTRANSINT: Set Transmit Receive Comb Int Mask Position */
+#define CMSDK_PL110_SETMASK_RECTRANSINT_Msk       (0x1ul << CMSDK_PL110_SETMASK_RECTRANSINT_Pos)            /*!< CMSDK_PL110 SETMASK_RECTRANSINT: Set Transmit Receive Comb Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_TRANSINT_Pos          5                                                         /*!< CMSDK_PL110 SETMASK_TRANSINT: Set Transmit Int Mask Position */
+#define CMSDK_PL110_SETMASK_TRANSINT_Msk         (0x1ul << CMSDK_PL110_SETMASK_TRANSINT_Pos)                /*!< CMSDK_PL110 SETMASK_TRANSINT: Set Transmit Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_RECINT_Pos            4                                                         /*!< CMSDK_PL110 SETMASK_RECINT: Set Receive Int Mask Position */
+#define CMSDK_PL110_SETMASK_RECINT_Msk            (0x1ul << CMSDK_PL110_SETMASK_RECINT_Pos)                 /*!< CMSDK_PL110 SETMASK_RECINT: Set Receive Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_UART_DSRMODINT_Pos    3                                                         /*!< CMSDK_PL110 SETMASK_UART_DSRMODINT: Set Data Set Ready Modem Int Mask Position */
+#define CMSDK_PL110_SETMASK_UART_DSRMODINT_Msk    (0x1ul << CMSDK_PL110_SETMASK_UART_DSRMODINT_Pos)         /*!< CMSDK_PL110 SETMASK_UART_DSRMODINT: Set Data Set Ready Modem Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_UART_DCDMODINT_Pos    2                                                         /*!< CMSDK_PL110 SETMASK_UART_DCDMODINT: Set Data Carrier Detect Modem Int Mask Position */
+#define CMSDK_PL110_SETMASK_UART_DCDMODINT_Msk    (0x1ul << CMSDK_PL110_SETMASK_UART_DCDMODINT_Pos)         /*!< CMSDK_PL110 SETMASK_UART_DCDMODINT: Set Data Carrier Detect Modem Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_UART_CTSMODINT_Pos    1                                                         /*!< CMSDK_PL110 SETMASK_UART_CTSMODINT: Set Clear To Send Modem Int Mask Position */
+#define CMSDK_PL110_SETMASK_UART_CTSMODINT_Msk    (0x1ul << CMSDK_PL110_SETMASK_UART_CTSMODINT_Pos)         /*!< CMSDK_PL110 SETMASK_UART_CTSMODINT: Set Clear To Send Modem Int Mask Mask */
+
+#define CMSDK_PL110_SETMASK_UART_RIMODINT_Pos     0                                                         /*!< CMSDK_PL110 SETMASK_UART_RIMODINT: Set nUARTRI Modem Int Mask Position */
+#define CMSDK_PL110_SETMASK_UART_RIMODINT_Msk     (0x1ul << CMSDK_PL110_SETMASK_UART_RIMODINT_Pos)          /*!< CMSDK_PL110 SETMASK_UART_RIMODINT: Set nUARTRI Modem Int Mask Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_OVRRUNERRINT_Pos      10                                                     /*!< CMSDK_PL110 RAWINTSTAT_OVRRUNERRINT: Raw Overrun Error Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_OVRRUNERRINT_Msk      (0x1ul << CMSDK_PL110_RAWINTSTAT_OVRRUNERRINT_Pos)     /*!< CMSDK_PL110 RAWINTSTAT_OVRRUNERRINT: Raw Overrun Error Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_BREAKERRINT_Pos       9                                                      /*!< CMSDK_PL110 RAWINTSTAT_BREAKERRINT: Raw Break Error Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_BREAKERRINT_Msk       (0x1ul << CMSDK_PL110_RAWINTSTAT_BREAKERRINT_Pos)      /*!< CMSDK_PL110 RAWINTSTAT_BREAKERRINT: Raw Break Error Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_PARITYERRINT_Pos      8                                                      /*!< CMSDK_PL110 RAWINTSTAT_PARITYERRINT: Raw Parity Error Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_PARITYERRINT_Msk      (0x1ul << CMSDK_PL110_RAWINTSTAT_PARITYERRINT_Pos)     /*!< CMSDK_PL110 RAWINTSTAT_PARITYERRINT: Raw Parity Error Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_FRAMEERRINT_Pos       7                                                      /*!< CMSDK_PL110 RAWINTSTAT_FRAMEERRINT: Raw Frame Error Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_FRAMEERRINT_Msk       (0x1ul << CMSDK_PL110_RAWINTSTAT_FRAMEERRINT_Pos)      /*!< CMSDK_PL110 RAWINTSTAT_FRAMEERRINT: Raw Frame Error Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_RECTRANSINT_Pos       6                                                      /*!< CMSDK_PL110 RAWINTSTAT_RECTRANSINT: Raw Transmit Receive Comb Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_RECTRANSINT_Msk       (0x1ul << CMSDK_PL110_RAWINTSTAT_RECTRANSINT_Pos)      /*!< CMSDK_PL110 RAWINTSTAT_RECTRANSINT: Raw Transmit Receive Comb Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_TRANSINT_Pos          5                                                      /*!< CMSDK_PL110 RAWINTSTAT_TRANSINT: Raw Transmit Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_TRANSINT_Msk         (0x1ul << CMSDK_PL110_RAWINTSTAT_TRANSINT_Pos)          /*!< CMSDK_PL110 RAWINTSTAT_TRANSINT: Raw Transmit Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_RECINT_Pos            4                                                      /*!< CMSDK_PL110 RAWINTSTAT_RECINT: Raw Receive Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_RECINT_Msk            (0x1ul << CMSDK_PL110_RAWINTSTAT_RECINT_Pos)           /*!< CMSDK_PL110 RAWINTSTAT_RECINT: Raw Receive Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_UART_DSRMODINT_Pos    3                                                      /*!< CMSDK_PL110 RAWINTSTAT_UART_DSRMODINT: Raw Data Set Ready Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_UART_DSRMODINT_Msk    (0x1ul << CMSDK_PL110_RAWINTSTAT_UART_DSRMODINT_Pos)   /*!< CMSDK_PL110 RAWINTSTAT_UART_DSRMODINT: Raw Data Set Ready Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_UART_DCDMODINT_Pos    2                                                      /*!< CMSDK_PL110 RAWINTSTAT_UART_DCDMODINT: Raw Data Carrier Detect Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_UART_DCDMODINT_Msk    (0x1ul << CMSDK_PL110_RAWINTSTAT_UART_DCDMODINT_Pos)   /*!< CMSDK_PL110 RAWINTSTAT_UART_DCDMODINT: Raw Data Carrier Detect Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_UART_CTSMODINT_Pos    1                                                      /*!< CMSDK_PL110 RAWINTSTAT_UART_CTSMODINT: Raw Clear To Send Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_UART_CTSMODINT_Msk    (0x1ul << CMSDK_PL110_RAWINTSTAT_UART_CTSMODINT_Pos)   /*!< CMSDK_PL110 RAWINTSTAT_UART_CTSMODINT: Raw Clear To Send Int Status Mask */
+
+#define CMSDK_PL110_RAWINTSTAT_UART_RIMODINT_Pos     0                                                      /*!< CMSDK_PL110 RAWINTSTAT_UART_RIMODINT: Raw nUARTRI Modem Int Status Position */
+#define CMSDK_PL110_RAWINTSTAT_UART_RIMODINT_Msk     (0x1ul << CMSDK_PL110_RAWINTSTAT_UART_RIMODINT_Pos)    /*!< CMSDK_PL110 RAWINTSTAT_UART_RIMODINT: Raw nUARTRI Modem Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_OVRRUNERRINT_Pos      10                                                     /*!< CMSDK_PL110 MSKINTSTAT_OVRRUNERRINT: Masked Overrun Error Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_OVRRUNERRINT_Msk      (0x1ul << CMSDK_PL110_MSKINTSTAT_OVRRUNERRINT_Pos)     /*!< CMSDK_PL110 MSKINTSTAT_OVRRUNERRINT: Masked Overrun Error Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_BREAKERRINT_Pos       9                                                      /*!< CMSDK_PL110 MSKINTSTAT_BREAKERRINT: Masked Break Error Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_BREAKERRINT_Msk       (0x1ul << CMSDK_PL110_MSKINTSTAT_BREAKERRINT_Pos)      /*!< CMSDK_PL110 MSKINTSTAT_BREAKERRINT: Masked Break Error Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_PARITYERRINT_Pos      8                                                      /*!< CMSDK_PL110 MSKINTSTAT_PARITYERRINT: Masked Parity Error Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_PARITYERRINT_Msk      (0x1ul << CMSDK_PL110_MSKINTSTAT_PARITYERRINT_Pos)     /*!< CMSDK_PL110 MSKINTSTAT_PARITYERRINT: Masked Parity Error Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_FRAMEERRINT_Pos       7                                                      /*!< CMSDK_PL110 MSKINTSTAT_FRAMEERRINT: Masked Frame Error Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_FRAMEERRINT_Msk       (0x1ul << CMSDK_PL110_MSKINTSTAT_FRAMEERRINT_Pos)      /*!< CMSDK_PL110 MSKINTSTAT_FRAMEERRINT: Masked Frame Error Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_RECTRANSINT_Pos       6                                                      /*!< CMSDK_PL110 MSKINTSTAT_RECTRANSINT: Masked Transmit Receive Comb Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_RECTRANSINT_Msk       (0x1ul << CMSDK_PL110_MSKINTSTAT_RECTRANSINT_Pos)      /*!< CMSDK_PL110 MSKINTSTAT_RECTRANSINT: Masked Transmit Receive Comb Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_TRANSINT_Pos          5                                                      /*!< CMSDK_PL110 MSKINTSTAT_TRANSINT: Masked Transmit Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_TRANSINT_Msk         (0x1ul << CMSDK_PL110_MSKINTSTAT_TRANSINT_Pos)          /*!< CMSDK_PL110 MSKINTSTAT_TRANSINT: Masked Transmit Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_RECINT_Pos            4                                                      /*!< CMSDK_PL110 MSKINTSTAT_RECINT: Masked Receive Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_RECINT_Msk            (0x1ul << CMSDK_PL110_MSKINTSTAT_RECINT_Pos)           /*!< CMSDK_PL110 MSKINTSTAT_RECINT: Masked Receive Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_UART_DSRMODINT_Pos    3                                                      /*!< CMSDK_PL110 MSKINTSTAT_UART_DSRMODINT: Masked Data Set Ready Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_UART_DSRMODINT_Msk    (0x1ul << CMSDK_PL110_MSKINTSTAT_UART_DSRMODINT_Pos)   /*!< CMSDK_PL110 MSKINTSTAT_UART_DSRMODINT: Masked Data Set Ready Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_UART_DCDMODINT_Pos    2                                                      /*!< CMSDK_PL110 MSKINTSTAT_UART_DCDMODINT: Masked Data Carrier Detect Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_UART_DCDMODINT_Msk    (0x1ul << CMSDK_PL110_MSKINTSTAT_UART_DCDMODINT_Pos)   /*!< CMSDK_PL110 MSKINTSTAT_UART_DCDMODINT: Masked Data Carrier Detect Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_UART_CTSMODINT_Pos    1                                                      /*!< CMSDK_PL110 MSKINTSTAT_UART_CTSMODINT: Masked Clear To Send Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_UART_CTSMODINT_Msk    (0x1ul << CMSDK_PL110_MSKINTSTAT_UART_CTSMODINT_Pos)   /*!< CMSDK_PL110 MSKINTSTAT_UART_CTSMODINT: Masked Clear To Send Int Status Mask */
+
+#define CMSDK_PL110_MSKINTSTAT_UART_RIMODINT_Pos     0                                                      /*!< CMSDK_PL110 MSKINTSTAT_UART_RIMODINT: Masked nUARTRI Modem Int Status Position */
+#define CMSDK_PL110_MSKINTSTAT_UART_RIMODINT_Msk     (0x1ul << CMSDK_PL110_MSKINTSTAT_UART_RIMODINT_Pos)    /*!< CMSDK_PL110 MSKINTSTAT_UART_RIMODINT: Masked nUARTRI Modem Int Status Mask */
+
+#define CMSDK_PL110_INTCLR_OVRRUNERRINT_Pos      10                                                  /*!< CMSDK_PL110 INTCLR_OVRRUNERRINT: Clear Overrun Error Int Position */
+#define CMSDK_PL110_INTCLR_OVRRUNERRINT_Msk      (0x1ul << CMSDK_PL110_INTCLR_OVRRUNERRINT_Pos)      /*!< CMSDK_PL110 INTCLR_OVRRUNERRINT: Clear Overrun Error Int Mask */
+
+#define CMSDK_PL110_INTCLR_BREAKERRINT_Pos       9                                                   /*!< CMSDK_PL110 INTCLR_BREAKERRINT: Clear Break Error Int Position */
+#define CMSDK_PL110_INTCLR_BREAKERRINT_Msk       (0x1ul << CMSDK_PL110_INTCLR_BREAKERRINT_Pos)       /*!< CMSDK_PL110 INTCLR_BREAKERRINT: Clear Break Error Int Mask */
+
+#define CMSDK_PL110_INTCLR_PARITYERRINT_Pos      8                                                   /*!< CMSDK_PL110 INTCLR_PARITYERRINT: Clear Parity Error Int Position */
+#define CMSDK_PL110_INTCLR_PARITYERRINT_Msk      (0x1ul << CMSDK_PL110_INTCLR_PARITYERRINT_Pos)      /*!< CMSDK_PL110 INTCLR_PARITYERRINT: Clear Parity Error Int Mask */
+
+#define CMSDK_PL110_INTCLR_FRAMEERRINT_Pos       7                                                   /*!< CMSDK_PL110 INTCLR_FRAMEERRINT: Clear Frame Error Int Position */
+#define CMSDK_PL110_INTCLR_FRAMEERRINT_Msk       (0x1ul << CMSDK_PL110_INTCLR_FRAMEERRINT_Pos)       /*!< CMSDK_PL110 INTCLR_FRAMEERRINT: Clear Frame Error Int Mask */
+
+#define CMSDK_PL110_INTCLR_RECTRANSINT_Pos       6                                                   /*!< CMSDK_PL110 INTCLR_RECTRANSINT: Clear Receive Transmit Comb Int Position */
+#define CMSDK_PL110_INTCLR_RECTRANSINT_Msk       (0x1ul << CMSDK_PL110_INTCLR_RECTRANSINT_Pos)       /*!< CMSDK_PL110 INTCLR_RECTRANSINT: Clear Receive Transmit Comb Int Mask */
+
+#define CMSDK_PL110_INTCLR_TRANSINT_Pos          5                                                   /*!< CMSDK_PL110 INTCLR_TRANSINT: Clear Transmit Int Position */
+#define CMSDK_PL110_INTCLR_TRANSINT_Msk         (0x1ul << CMSDK_PL110_INTCLR_TRANSINT_Pos)           /*!< CMSDK_PL110 INTCLR_TRANSINT: Clear Transmit Int Mask */
+
+#define CMSDK_PL110_INTCLR_RECINT_Pos            4                                                   /*!< CMSDK_PL110 INTCLR_RECINT: Clear Receive Int Position */
+#define CMSDK_PL110_INTCLR_RECINT_Msk            (0x1ul << CMSDK_PL110_INTCLR_RECINT_Pos)            /*!< CMSDK_PL110 INTCLR_RECINT: Clear Receive Int Mask */
+
+#define CMSDK_PL110_INTCLR_UART_DSRMODINT_Pos    3                                                   /*!< CMSDK_PL110 INTCLR_UART_DSRMODINT: Clear Data Set Ready Int Position */
+#define CMSDK_PL110_INTCLR_UART_DSRMODINT_Msk    (0x1ul << CMSDK_PL110_INTCLR_UART_DSRMODINT_Pos)    /*!< CMSDK_PL110 INTCLR_UART_DSRMODINT: Clear Data Set Ready Int Mask */
+
+#define CMSDK_PL110_INTCLR_UART_DCDMODINT_Pos    2                                                   /*!< CMSDK_PL110 INTCLR_UART_DCDMODINT: Clear Data Carrier Detect Int Position */
+#define CMSDK_PL110_INTCLR_UART_DCDMODINT_Msk    (0x1ul << CMSDK_PL110_INTCLR_UART_DCDMODINT_Pos)    /*!< CMSDK_PL110 INTCLR_UART_DCDMODINT: Clear Data Carrier Detect Int Mask */
+
+#define CMSDK_PL110_INTCLR_UART_CTSMODINT_Pos    1                                                   /*!< CMSDK_PL110 INTCLR_UART_CTSMODINT: Clear Clear To Send Int Position */
+#define CMSDK_PL110_INTCLR_UART_CTSMODINT_Msk    (0x1ul << CMSDK_PL110_INTCLR_UART_CTSMODINT_Pos)    /*!< CMSDK_PL110 INTCLR_UART_CTSMODINT: Clear Clear To Send Int Mask */
+
+#define CMSDK_PL110_INTCLR_UART_RIMODINT_Pos     0                                                   /*!< CMSDK_PL110 INTCLR_UART_RIMODINT: Clear nUARTRI Modem Int Position */
+#define CMSDK_PL110_INTCLR_UART_RIMODINT_Msk     (0x1ul << CMSDK_PL110_INTCLR_UART_RIMODINT_Pos)     /*!< CMSDK_PL110 INTCLR_UART_RIMODINT: Clear nUARTRI Modem Int Mask */
+
+#define CMSDK_PL110_DMA_ERR_Pos                  2                                                   /*!< CMSDK_PL110 DMA_ERR: DMA Error Position */
+#define CMSDK_PL110_DMA_ERR_Msk                  (0x1ul << CMSDK_PL110_DMA_ERR_Pos)                  /*!< CMSDK_PL110 DMA_ERR: DMA Error Mask */
+
+#define CMSDK_PL110_DMA_TRANS_EN_Pos             1                                                   /*!< CMSDK_PL110 DMA_TRANS_EN: DMA Transmit Enable Position */
+#define CMSDK_PL110_DMA_TRANS_EN_Msk             (0x1ul << CMSDK_PL110_DMA_TRANS_EN_Pos)             /*!< CMSDK_PL110 DMA_TRANS_EN: DMA Transmit Enable Mask */
+
+#define CMSDK_PL110_DMA_REC_EN_Pos               0                                                   /*!< CMSDK_PL110 DMA_REC_EN: DMA Receive Enable Position */
+#define CMSDK_PL110_DMA_REC_EN_Msk               (0x1ul << CMSDK_PL110_DMA_REC_EN_Pos)               /*!< CMSDK_PL110 DMA_REC_EN: DMA Receive Enable Mask */
+
+
+/*@}*/ /* end of group  CMSDK_PL110 */
+
+/*------------------- Watchdog ----------------------------------------------*/
+/** @addtogroup CMSDK_Watchdog CMSDK Watchdog
+  @{
+*/
+typedef struct {
+    /* Watchdog register block; the RESERVED arrays pad LOCK to 0xC00 and ITCR/ITOP to 0xF00/0xF04. */
+    __IO    uint32_t  LOAD;                   /* Offset: 0x000 (R/W) Watchdog Load Register */
+    __I     uint32_t  VALUE;                  /* Offset: 0x004 (R/ ) Watchdog Value Register */
+    __IO    uint32_t  CTRL;                   /* Offset: 0x008 (R/W) Watchdog Control Register */
+    /*    CTRL bit 1 (RESEN): Reset enable                                    */
+    /*    CTRL bit 0 (INTEN): Interrupt enable                                */
+    /*    (bit positions match CMSDK_Watchdog_CTRL_RESEN_Pos / _INTEN_Pos)    */
+    __O     uint32_t  INTCLR;                 /* Offset: 0x00C ( /W) Watchdog Clear Interrupt Register */
+    __I     uint32_t  RAWINTSTAT;             /* Offset: 0x010 (R/ ) Watchdog Raw Interrupt Status Register */
+    __I     uint32_t  MASKINTSTAT;            /* Offset: 0x014 (R/ ) Watchdog Interrupt Status Register */
+    uint32_t  RESERVED0[762];                 /* Offset: 0x018-0xBFC Reserved (762 words of padding) */
+    __IO    uint32_t  LOCK;                   /* Offset: 0xC00 (R/W) Watchdog Lock Register */
+    uint32_t  RESERVED1[191];                 /* Offset: 0xC04-0xEFC Reserved (191 words of padding) */
+    __IO    uint32_t  ITCR;                   /* Offset: 0xF00 (R/W) Watchdog Integration Test Control Register */
+    __O     uint32_t  ITOP;                   /* Offset: 0xF04 ( /W) Watchdog Integration Test Output Set Register */
+} CMSDK_WATCHDOG_TypeDef;
+
+#define CMSDK_Watchdog_LOAD_Pos               0                                              /*!< CMSDK_Watchdog LOAD: LOAD Position */
+#define CMSDK_Watchdog_LOAD_Msk              (0xFFFFFFFFul << CMSDK_Watchdog_LOAD_Pos)       /*!< CMSDK_Watchdog LOAD: LOAD Mask */
+
+#define CMSDK_Watchdog_VALUE_Pos              0                                              /*!< CMSDK_Watchdog VALUE: VALUE Position */
+#define CMSDK_Watchdog_VALUE_Msk             (0xFFFFFFFFul << CMSDK_Watchdog_VALUE_Pos)      /*!< CMSDK_Watchdog VALUE: VALUE Mask */
+
+#define CMSDK_Watchdog_CTRL_RESEN_Pos         1                                              /*!< CMSDK_Watchdog CTRL_RESEN: Enable Reset Output Position */
+#define CMSDK_Watchdog_CTRL_RESEN_Msk        (0x1ul << CMSDK_Watchdog_CTRL_RESEN_Pos)        /*!< CMSDK_Watchdog CTRL_RESEN: Enable Reset Output Mask */
+
+#define CMSDK_Watchdog_CTRL_INTEN_Pos         0                                              /*!< CMSDK_Watchdog CTRL_INTEN: Int Enable Position */
+#define CMSDK_Watchdog_CTRL_INTEN_Msk        (0x1ul << CMSDK_Watchdog_CTRL_INTEN_Pos)        /*!< CMSDK_Watchdog CTRL_INTEN: Int Enable Mask */
+
+#define CMSDK_Watchdog_INTCLR_Pos             0                                              /*!< CMSDK_Watchdog INTCLR: Int Clear Position */
+#define CMSDK_Watchdog_INTCLR_Msk            (0x1ul << CMSDK_Watchdog_INTCLR_Pos)            /*!< CMSDK_Watchdog INTCLR: Int Clear Mask */
+
+#define CMSDK_Watchdog_RAWINTSTAT_Pos         0                                              /*!< CMSDK_Watchdog RAWINTSTAT: Raw Int Status Position */
+#define CMSDK_Watchdog_RAWINTSTAT_Msk        (0x1ul << CMSDK_Watchdog_RAWINTSTAT_Pos)        /*!< CMSDK_Watchdog RAWINTSTAT: Raw Int Status Mask */
+
+#define CMSDK_Watchdog_MASKINTSTAT_Pos        0                                              /*!< CMSDK_Watchdog MASKINTSTAT: Mask Int Status Position */
+#define CMSDK_Watchdog_MASKINTSTAT_Msk       (0x1ul << CMSDK_Watchdog_MASKINTSTAT_Pos)       /*!< CMSDK_Watchdog MASKINTSTAT: Mask Int Status Mask */
+
+#define CMSDK_Watchdog_LOCK_Pos               0                                              /*!< CMSDK_Watchdog LOCK: LOCK Position */
+#define CMSDK_Watchdog_LOCK_Msk              (0x1ul << CMSDK_Watchdog_LOCK_Pos)              /*!< CMSDK_Watchdog LOCK: LOCK Mask */
+
+#define CMSDK_Watchdog_INTEGTESTEN_Pos        0                                              /*!< CMSDK_Watchdog INTEGTESTEN: Integration Test Enable Position */
+#define CMSDK_Watchdog_INTEGTESTEN_Msk       (0x1ul << CMSDK_Watchdog_INTEGTESTEN_Pos)       /*!< CMSDK_Watchdog INTEGTESTEN: Integration Test Enable Mask */
+
+#define CMSDK_Watchdog_INTEGTESTOUTSET_Pos    1                                              /*!< CMSDK_Watchdog INTEGTESTOUTSET: Integration Test Output Set Position */
+#define CMSDK_Watchdog_INTEGTESTOUTSET_Msk   (0x1ul << CMSDK_Watchdog_INTEGTESTOUTSET_Pos)   /*!< CMSDK_Watchdog INTEGTESTOUTSET: Integration Test Output Set Mask */
+
+/*@}*/ /* end of group  CMSDK_Watchdog */
+
+/*------------------- PrimeCell APB GPIO --------------------------------------*/
+/** @addtogroup CMSDK_PL061 CMSDK APB GPIO
+  @{
+*/
+typedef struct {
+    /* PrimeCell PL061 GPIO register block; offsets follow from the 32-bit field layout below. */
+    __IO    uint32_t  DATA[256];              /* Offset: 0x000-0x3FC (R/W) Data Register */
+    __IO    uint32_t  DIR;                    /* Offset: 0x400 (R/W) Data Direction Register */
+    __IO    uint32_t  INTSENSE;               /* Offset: 0x404 (R/W) Interrupt Sense Register */
+    __IO    uint32_t  INTBOTHEDGE;            /* Offset: 0x408 (R/W) Interrupt Both Edges Register */
+    __IO    uint32_t  INTEVENT;               /* Offset: 0x40C (R/W) Interrupt Event Register */
+    __IO    uint32_t  INTMASK;                /* Offset: 0x410 (R/W) Interrupt Mask Register */
+    __I     uint32_t  RAWINTSTAT;             /* Offset: 0x414 (R/ ) Raw Interrupt Status Register */
+    __I     uint32_t  MASKINTSTAT;            /* Offset: 0x418 (R/ ) Masked Interrupt Status Register */
+    __O     uint32_t  INTCLR;                 /* Offset: 0x41C ( /W) Interrupt Clear Register */
+    __IO    uint32_t  MODECTRL;               /* Offset: 0x420 (R/W) Mode Control Register */
+    /* Status registers are read-only and INTCLR is write-only; __I on INTCLR made it const and unwritable. */
+} APBGPIO_TypeDef;
+
+#define CMSDK_PL061_DATA_Pos               0                                              /*!< CMSDK_PL061 DATA: DATA Position */
+#define CMSDK_PL061_DATA_Msk              (0xFFFFFFFFul << CMSDK_PL061_DATA_Pos)          /*!< CMSDK_PL061 DATA: DATA Mask */
+
+#define CMSDK_PL061_DIR_Pos                0                                              /*!< CMSDK_PL061 DIR: Data Direction Position */
+#define CMSDK_PL061_DIR_Msk               (0x1ul << CMSDK_PL061_DIR_Pos)                  /*!< CMSDK_PL061 DIR: Data Direction  Mask */
+
+#define CMSDK_PL061_INTSENSE_Pos           0                                              /*!< CMSDK_PL061 INTSENSE: INT SENSE Position */
+#define CMSDK_PL061_INTSENSE_Msk          (0x1ul << CMSDK_PL061_INTSENSE_Pos)             /*!< CMSDK_PL061 INTSENSE: INT SENSE Mask */
+
+#define CMSDK_PL061_INTBOTHEDGE_Pos        0                                              /*!< CMSDK_PL061 INTBOTHEDGE: INT BOTH EDGE Position */
+#define CMSDK_PL061_INTBOTHEDGE_Msk       (0x1ul << CMSDK_PL061_INTBOTHEDGE_Pos)          /*!< CMSDK_PL061 INTBOTHEDGE: INT BOTH EDGE Mask */
+
+#define CMSDK_PL061_INTEVENT_Pos           0                                              /*!< CMSDK_PL061 INTEVENT: INT EVENT Position */
+#define CMSDK_PL061_INTEVENT_Msk          (0x1ul << CMSDK_PL061_INTEVENT_Pos)             /*!< CMSDK_PL061 INTEVENT: INT EVENT Mask */
+
+#define CMSDK_PL061_INTMASK_Pos            0                                              /*!< CMSDK_PL061 INTMASK: INT MASK Position */
+#define CMSDK_PL061_INTMASK_Msk           (0x1ul << CMSDK_PL061_INTMASK_Pos)              /*!< CMSDK_PL061 INTMASK: INT MASK Mask */
+
+#define CMSDK_PL061_RAWINTSTAT_Pos         0                                              /*!< CMSDK_PL061 RAWINTSTAT: Raw Int Status Position */
+#define CMSDK_PL061_RAWINTSTAT_Msk        (0x1ul << CMSDK_PL061_RAWINTSTAT_Pos)           /*!< CMSDK_PL061 RAWINTSTAT: Raw Int Status Mask */
+
+#define CMSDK_PL061_MASKINTSTAT_Pos        0                                              /*!< CMSDK_PL061 MASKINTSTAT: Mask Int Status Position */
+#define CMSDK_PL061_MASKINTSTAT_Msk       (0x1ul << CMSDK_PL061_MASKINTSTAT_Pos)          /*!< CMSDK_PL061 MASKINTSTAT: Mask Int Status Mask */
+
+#define CMSDK_PL061_INTCLR_Pos             0                                              /*!< CMSDK_PL061 INTCLR: Int Clear Position */
+#define CMSDK_PL061_INTCLR_Msk            (0x1ul << CMSDK_PL061_INTCLR_Pos)               /*!< CMSDK_PL061 INTCLR: Int Clear Mask */
+
+#define CMSDK_PL061_MODECTRL_HWEN_Pos      0                                              /*!< CMSDK_PL061 MODECTRL_HWEN: Mode Control Hardware Enable Position */
+#define CMSDK_PL061_MODECTRL_HWEN_Msk     (0x1ul << CMSDK_PL061_MODECTRL_HWEN_Pos)        /*!< CMSDK_PL061 MODECTRL_HWEN: Mode Control Hardware Enable Mask */
+
+
+/*@}*/ /* end of group  CMSDK_PL061 */
+
+
+#if defined ( __CC_ARM   )
+#pragma no_anon_unions
+#endif
+
+/*@}*/ /* end of group CMSDK_CM4_Peripherals */
+
+
+/******************************************************************************/
+/*                         Peripheral memory map                              */
+/******************************************************************************/
+/** @addtogroup CMSDK_CM4_MemoryMap CMSDK_CM4 Memory Mapping
+  @{
+*/
+
+/* Peripheral and SRAM base address */
+#define CMSDK_FLASH_BASE        (0x00000000UL) /*!< (FLASH     ) Base Address */
+#define CMSDK_SRAM_BASE         (0x20000000UL) /*!< (SRAM      ) Base Address */
+#define CMSDK_PERIPH_BASE       (0x40000000UL) /*!< (Peripheral) Base Address */
+
+/* Base addresses                                                             */
+#define CMSDK_RAM_BASE          (0x20000000UL) /*!< RAM base; same value as CMSDK_SRAM_BASE */
+#define CMSDK_APB_BASE          (0x40000000UL) /*!< APB peripherals base; same value as CMSDK_PERIPH_BASE */
+#define CMSDK_AHB_BASE          (0x40010000UL) /*!< AHB peripherals base; 0x10000 above CMSDK_APB_BASE */
+
+/* APB peripherals                                                           */
+#define CMSDK_TIMER0_BASE       (CMSDK_APB_BASE + 0x0000UL) /*!< Timer 0 */
+#define CMSDK_TIMER1_BASE       (CMSDK_APB_BASE + 0x1000UL) /*!< Timer 1 */
+#define CMSDK_DUALTIMER_BASE    (CMSDK_APB_BASE + 0x2000UL) /*!< Dual timer block */
+#define CMSDK_DUALTIMER_1_BASE  (CMSDK_DUALTIMER_BASE) /*!< First timer: starts at the block base */
+#define CMSDK_DUALTIMER_2_BASE  (CMSDK_DUALTIMER_BASE + 0x20UL) /*!< Second timer: 0x20 bytes into the block */
+#define CMSDK_UART0_BASE        (CMSDK_APB_BASE + 0x4000UL) /*!< UART 0 */
+#define CMSDK_UART1_BASE        (CMSDK_APB_BASE + 0x5000UL) /*!< UART 1 */
+#define CMSDK_UART2_BASE        (CMSDK_APB_BASE + 0x6000UL) /*!< UART 2 */
+#define CMSDK_UART3_BASE        (CMSDK_APB_BASE + 0x7000UL) /*!< UART 3 */
+#define CMSDK_WATCHDOG_BASE     (CMSDK_APB_BASE + 0x8000UL) /*!< Watchdog */
+#define CMSDK_UART4_BASE        (CMSDK_APB_BASE + 0x9000UL) /*!< UART 4; sits after the watchdog, not next to UART 3 */
+#define CMSDK_PL230_BASE        (CMSDK_APB_BASE + 0xF000UL) /*!< PL230; exposed below as CMSDK_DMA */
+
+/* AHB peripherals                                                           */
+#define CMSDK_GPIO0_BASE        (CMSDK_AHB_BASE + 0x0000UL) /*!< GPIO 0 */
+#define CMSDK_GPIO1_BASE        (CMSDK_AHB_BASE + 0x1000UL) /*!< GPIO 1 */
+#define CMSDK_GPIO2_BASE        (CMSDK_AHB_BASE + 0x2000UL) /*!< GPIO 2 */
+#define CMSDK_GPIO3_BASE        (CMSDK_AHB_BASE + 0x3000UL) /*!< GPIO 3 */
+#define CMSDK_SYSCTRL_BASE      (CMSDK_AHB_BASE + 0xF000UL) /*!< System control; exposed below as CMSDK_SYSCON */
+/*@}*/ /* end of group CMSDK_CM4_MemoryMap */
+
+
+/******************************************************************************/
+/*                         Peripheral declaration                             */
+/******************************************************************************/
+/** @addtogroup CMSDK_CM4_PeripheralDecl CMSDK_CM4 Peripheral Declaration
+  @{
+*/
+
+#define CMSDK_UART0             ((CMSDK_UART_TypeDef   *) CMSDK_UART0_BASE ) /*!< UART 0 registers */
+#define CMSDK_UART1             ((CMSDK_UART_TypeDef   *) CMSDK_UART1_BASE ) /*!< UART 1 registers */
+#define CMSDK_UART2             ((CMSDK_UART_TypeDef   *) CMSDK_UART2_BASE ) /*!< UART 2 registers */
+#define CMSDK_UART3             ((CMSDK_UART_TypeDef   *) CMSDK_UART3_BASE   ) /*!< UART 3 registers */
+#define CMSDK_UART4             ((CMSDK_UART_TypeDef   *) CMSDK_UART4_BASE   ) /*!< UART 4 registers */
+#define CMSDK_TIMER0            ((CMSDK_TIMER_TypeDef  *) CMSDK_TIMER0_BASE  ) /*!< Timer 0 registers */
+#define CMSDK_TIMER1            ((CMSDK_TIMER_TypeDef  *) CMSDK_TIMER1_BASE  ) /*!< Timer 1 registers */
+#define CMSDK_DUALTIMER         ((CMSDK_DUALTIMER_BOTH_TypeDef  *) CMSDK_DUALTIMER_BASE ) /*!< Dual timer block viewed as one structure */
+#define CMSDK_DUALTIMER1        ((CMSDK_DUALTIMER_SINGLE_TypeDef  *) CMSDK_DUALTIMER_1_BASE ) /*!< First timer of the dual timer block */
+#define CMSDK_DUALTIMER2        ((CMSDK_DUALTIMER_SINGLE_TypeDef  *) CMSDK_DUALTIMER_2_BASE ) /*!< Second timer of the dual timer block */
+#define CMSDK_WATCHDOG          ((CMSDK_WATCHDOG_TypeDef  *) CMSDK_WATCHDOG_BASE   ) /*!< Watchdog registers */
+#define CMSDK_DMA               ((CMSDK_PL230_TypeDef  *) CMSDK_PL230_BASE ) /*!< PL230 registers (named DMA here) */
+#define CMSDK_GPIO0             ((CMSDK_GPIO_TypeDef   *) CMSDK_GPIO0_BASE ) /*!< GPIO 0 registers */
+#define CMSDK_GPIO1             ((CMSDK_GPIO_TypeDef   *) CMSDK_GPIO1_BASE ) /*!< GPIO 1 registers */
+#define CMSDK_GPIO2             ((CMSDK_GPIO_TypeDef   *) CMSDK_GPIO2_BASE ) /*!< GPIO 2 registers */
+#define CMSDK_GPIO3             ((CMSDK_GPIO_TypeDef   *) CMSDK_GPIO3_BASE ) /*!< GPIO 3 registers */
+#define CMSDK_SYSCON            ((CMSDK_SYSCON_TypeDef *) CMSDK_SYSCTRL_BASE ) /*!< System control registers (note: base macro is SYSCTRL, not SYSCON) */
+/*@}*/ /* end of group CMSDK_CM4_PeripheralDecl */
+
+/*@}*/ /* end of group CMSDK_CM4_Definitions */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* CMSDK_CM4_H */
diff --git a/common/mps2/LICENSE.txt b/common/mps2/LICENSE.txt
new file mode 100644
index 0000000..8dada3e
--- /dev/null
+++ b/common/mps2/LICENSE.txt
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/common/mps2/MPS2.ld b/common/mps2/MPS2.ld
new file mode 100644
index 0000000..55b8716
--- /dev/null
+++ b/common/mps2/MPS2.ld
@@ -0,0 +1,208 @@
+/*
+ * MPS2 CMSIS Library
+ */
+/*
+ * Copyright (c) 2009-2019 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * This file is derivative of CMSIS V5.00 gcc_arm.ld
+ */
+/* Linker script for mbed FVP Cortex-M on MPS2 */
+
+/* Linker script to configure memory regions. */
+/* The length of the VECTORS region is a bit larger than
+ * is necessary based on the number of exception handlers.
+ */
+
+#include "memory_zones.h"
+#include "cmsis_nvic.h"
+
+#if !defined(MBED_CONF_TARGET_BOOT_STACK_SIZE) /* fall back to a 0x400-byte boot stack unless the build defines one */
+    #define MBED_CONF_TARGET_BOOT_STACK_SIZE 0x400
+#endif
+
+MEMORY /* ZBT_* bounds are not defined in this script; presumably they come from memory_zones.h */
+{
+  FLASH (rx)            : ORIGIN = ZBT_SRAM1_START, LENGTH = ZBT_SRAM1_SIZE /* read/execute region named FLASH, mapped onto ZBT_SRAM1 */
+  RAM (rwx)             : ORIGIN = ZBT_SRAM2_START, LENGTH = ZBT_SRAM2_SIZE /* read/write/execute region named RAM, mapped onto ZBT_SRAM2 */
+}
+
+/* Linker script to place sections and symbol values. The memory regions
+ * FLASH and RAM that sections are placed into are defined in MEMORY above.
+ * It references following symbols, which must be defined in code:
+ *   Reset_Handler : Entry of reset handler
+ *
+ * It defines following symbols, which code can use without definition:
+ *   __exidx_start
+ *   __exidx_end
+ *   __etext
+ *   __data_start__
+ *   __preinit_array_start
+ *   __preinit_array_end
+ *   __init_array_start
+ *   __init_array_end
+ *   __fini_array_start
+ *   __fini_array_end
+ *   __data_end__
+ *   __bss_start__
+ *   __bss_end__
+ *   __end__
+ *   end
+ *   __HeapLimit
+ *   __StackLimit
+ *   __StackTop
+ *   __stack
+ */
+ENTRY(Reset_Handler) /* program entry symbol; Reset_Handler must be defined in code */
+
+STACK_SIZE = MBED_CONF_TARGET_BOOT_STACK_SIZE; /* bytes reserved for the stack at the top of RAM (see __StackLimit) */
+
+/* Size of the vector table in SRAM; not referenced again in this script (NVIC_VECTORS_SIZE presumably comes from cmsis_nvic.h) */
+M_VECTOR_RAM_SIZE = NVIC_VECTORS_SIZE;
+
+SECTIONS
+{
+    .isr_vector : /* first output section assigned to FLASH */
+    {
+        __vector_table = .; /* symbol for the address where the vector table starts */
+        KEEP(*(.vector_table)) /* KEEP: retain the input section even under --gc-sections */
+         . = ALIGN(8); /* pad the end of the section to an 8-byte boundary */
+    } > FLASH
+
+    .text : /* code, constructor/destructor tables and read-only data, all in FLASH */
+    {
+        . = ALIGN(8);
+        *(.text*)
+
+        KEEP(*(.init))
+        KEEP(*(.fini))
+
+        /* .ctors: crtbegin entries first, then everything except crtend, sorted .ctors.*, and the remainder last */
+        *crtbegin.o(.ctors)
+        *crtbegin?.o(.ctors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+        *(SORT(.ctors.*))
+        *(.ctors)
+
+        /* .dtors: same ordering scheme as .ctors */
+        *crtbegin.o(.dtors)
+        *crtbegin?.o(.dtors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+        *(SORT(.dtors.*))
+        *(.dtors)
+
+        *(.rodata*)
+
+        KEEP(*(.eh_frame*))
+    } > FLASH
+
+    .ARM.extab : /* ARM exception-table input sections, kept in FLASH */
+    {
+        *(.ARM.extab* .gnu.linkonce.armextab.*)
+    } > FLASH
+
+    __exidx_start = .; /* start of the .ARM.exidx index section */
+    .ARM.exidx :
+    {
+        *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+    } > FLASH
+    __exidx_end = .; /* end of the .ARM.exidx index section */
+
+    .data : /* initialized data plus the preinit/init/fini function-pointer arrays */
+    {
+#ifndef DATA_IN_FLASH
+        PROVIDE(__etext = LOADADDR(.data)); /* load address of .data (in FLASH, see "AT > FLASH" below) */
+        . = ALIGN(8);
+#endif
+        __data_start__ = .; /* run-time start address of .data */
+        *(vtable)
+        *(.data)
+        *(.data*)
+
+        . = ALIGN(8);
+        /* preinit data */
+        PROVIDE (__preinit_array_start = .);
+        KEEP(*(.preinit_array))
+        PROVIDE (__preinit_array_end = .);
+
+        . = ALIGN(8);
+        /* init data */
+        PROVIDE (__init_array_start = .);
+        KEEP(*(SORT(.init_array.*)))
+        KEEP(*(.init_array))
+        PROVIDE (__init_array_end = .);
+
+
+        . = ALIGN(8);
+        /* fini data */
+        PROVIDE (__fini_array_start = .);
+        KEEP(*(SORT(.fini_array.*)))
+        KEEP(*(.fini_array))
+        PROVIDE (__fini_array_end = .);
+
+        . = ALIGN(8);
+        /* All data end */
+        __data_end__ = .;
+
+#ifdef DATA_IN_FLASH /* data lives directly in FLASH: no separate load address, and __etext is not provided */
+    } > FLASH
+#else /* default: runs from RAM, load image stored in FLASH (presumably copied by startup code) */
+    } > RAM AT > FLASH
+#endif
+
+    .bss : /* zero-initialized data; follows the same region choice as .data */
+    {
+        . = ALIGN(8);
+        __START_BSS = .; /* same address as __bss_start__ */
+        __bss_start__ = .;
+        *(.bss)
+        *(.bss*)
+        *(COMMON)
+        . = ALIGN(8);
+        __bss_end__ = .;
+        __END_BSS = .; /* same address as __bss_end__ */
+
+#ifdef DATA_IN_FLASH
+    } > FLASH
+#else
+    } > RAM
+#endif
+
+    bss_size = __bss_end__ - __bss_start__; /* .bss length in bytes, including the alignment padding */
+
+    .heap (COPY): /* heap area in RAM; COPY is a GNU ld section type for non-allocated sections */
+    {
+        . = ALIGN(8);
+        __end__ = .;
+        PROVIDE(end = .);
+        __HeapBase = .; /* heap begins here, 8-byte aligned */
+        *(.heap*)
+        . = ORIGIN(RAM) + LENGTH(RAM) - STACK_SIZE; /* extend the heap up to the base of the reserved stack */
+        __HeapLimit = .; /* equals the value later assigned to __StackLimit */
+        __heap_limit = .; /* Add for _sbrk */
+    } > RAM
+
+    /* Set stack top to the end of RAM, and the stack limit STACK_SIZE bytes
+     * below it (there is no stack_dummy section in this script) */
+    __StackTop = ORIGIN(RAM) + LENGTH(RAM);
+    __StackLimit = __StackTop - STACK_SIZE;
+    PROVIDE(__stack = __StackTop);
+
+    /* Check if data + heap + stack exceeds RAM limit. NOTE(review): .heap sets __HeapLimit to this same address, so the check looks always true - confirm */
+    ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack")
+
+}   /* End of sections */
diff --git a/common/mps2/cmsis_armclang.h b/common/mps2/cmsis_armclang.h
new file mode 100644
index 0000000..90de9db
--- /dev/null
+++ b/common/mps2/cmsis_armclang.h
@@ -0,0 +1,1467 @@
+/**************************************************************************//**
+ * @file     cmsis_armclang.h
+ * @brief    CMSIS compiler armclang (Arm Compiler 6) header file
+ * @version  V5.3.1
+ * @date     26. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*lint -esym(9058, IRQn)*/ /* disable MISRA 2012 Rule 2.4 for IRQn */
+
+#ifndef __CMSIS_ARMCLANG_H
+#define __CMSIS_ARMCLANG_H
+
+#pragma clang system_header   /* treat file as system include file */
+
+#ifndef __ARM_COMPAT_H
+#include <arm_compat.h>    /* Compatibility header for Arm Compiler 5 intrinsics */
+#endif
+
+/* CMSIS compiler specific defines (each one is skipped if the includer already defined it) */
+#ifndef   __ASM                     /* inline assembly keyword */
+  #define __ASM                                  __asm
+#endif
+#ifndef   __INLINE                  /* inline hint */
+  #define __INLINE                               __inline
+#endif
+#ifndef   __STATIC_INLINE           /* static inline function */
+  #define __STATIC_INLINE                        static __inline
+#endif
+#ifndef   __STATIC_FORCEINLINE      /* static inline with the always_inline attribute */
+  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static __inline
+#endif
+#ifndef   __NO_RETURN               /* marks a function that does not return */
+  #define __NO_RETURN                            __attribute__((__noreturn__))
+#endif
+#ifndef   __USED                    /* marks a symbol as used so it is not discarded */
+  #define __USED                                 __attribute__((used))
+#endif
+#ifndef   __WEAK                    /* weak symbol */
+  #define __WEAK                                 __attribute__((weak))
+#endif
+#ifndef   __PACKED                  /* packed layout with 1-byte alignment */
+  #define __PACKED                               __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_STRUCT           /* struct keyword carrying the same attributes as __PACKED */
+  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_UNION            /* union keyword carrying the same attributes as __PACKED */
+  #define __PACKED_UNION                         union __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __UNALIGNED_UINT32        /* deprecated: lvalue access to a uint32_t through a packed struct pointer */
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32 */
+  struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+#endif
+#ifndef   __UNALIGNED_UINT16_WRITE  /* store val as a uint16_t at addr via a packed struct; yields void */
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */
+  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT16_READ   /* load a uint16_t from addr via a const packed struct */
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */
+  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __UNALIGNED_UINT32_WRITE  /* store val as a uint32_t at addr via a packed struct; yields void */
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */
+  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT32_READ   /* load a uint32_t from addr via a const packed struct */
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wpacked"
+/*lint -esym(9058, T_UINT32_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_READ */
+  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+  #pragma clang diagnostic pop
+  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __ALIGNED                 /* request an alignment of x for a declaration */
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+#ifndef   __RESTRICT                /* restrict pointer qualifier */
+  #define __RESTRICT                             __restrict
+#endif
+#ifndef   __COMPILER_BARRIER        /* empty asm statement with a "memory" clobber; emits no instruction text */
+  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+
+#ifndef __PROGRAM_START           /* symbol used as the program start; defaults to __main */
+#define __PROGRAM_START           __main
+#endif
+
+#ifndef __INITIAL_SP              /* initial stack pointer symbol; presumably the linker-defined top of region ARM_LIB_STACK */
+#define __INITIAL_SP              Image$$ARM_LIB_STACK$$ZI$$Limit
+#endif
+
+#ifndef __STACK_LIMIT             /* stack limit symbol; presumably the linker-defined base of region ARM_LIB_STACK */
+#define __STACK_LIMIT             Image$$ARM_LIB_STACK$$ZI$$Base
+#endif
+
+#ifndef __VECTOR_TABLE            /* name of the vector table object */
+#define __VECTOR_TABLE            __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE  /* keeps the vector table (used) and places it in section "RESET" */
+#define __VECTOR_TABLE_ATTRIBUTE  __attribute__((used, section("RESET")))
+#endif
+
+/* ###########################  Core Function Access  ########################### */
+/** \ingroup  CMSIS_Core_FunctionInterface
+    \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+  @{
+ */
+
+/**
+  \brief   Enable IRQ Interrupts
+  \details Enables IRQ interrupts by clearing the I-bit in the CPSR (the PRIMASK bit on Cortex-M).
+           Can only be executed in Privileged modes.
+ */
+/* intrinsic void __enable_irq();  see arm_compat.h */
+
+
+/**
+  \brief   Disable IRQ Interrupts
+  \details Disables IRQ interrupts by setting the I-bit in the CPSR (the PRIMASK bit on Cortex-M).
+           Can only be executed in Privileged modes.
+ */
+/* intrinsic void __disable_irq();  see arm_compat.h */
+
+
+/**
+  \brief   Read the CONTROL register
+  \details Copies the CONTROL special register into a general-purpose register with MRS.
+  \return               Current CONTROL register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_CONTROL(void)
+{
+  uint32_t control;
+
+  __ASM volatile ("MRS %0, control" : "=r" (control));
+  return control;
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Read the non-secure CONTROL register
+  \details Reads control_ns with MRS; only compiled when __ARM_FEATURE_CMSE == 3.
+  \return               Current non-secure CONTROL register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void)
+{
+  uint32_t control_ns;
+
+  __ASM volatile ("MRS %0, control_ns" : "=r" (control_ns));
+  return control_ns;
+}
+#endif
+
+
+/**
+  \brief   Write the CONTROL register
+  \details Writes \p control to CONTROL with MSR (memory clobber); no ISB is issued afterwards.
+  \param [in]    control  New CONTROL register value
+ */
+__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control)
+{
+  __ASM volatile ("MSR control, %0" : /* no outputs */ : "r" (control) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Write the non-secure CONTROL register
+  \details Writes \p control to control_ns with MSR; only compiled when __ARM_FEATURE_CMSE == 3.
+  \param [in]    control  New non-secure CONTROL register value
+ */
+__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control)
+{
+  __ASM volatile ("MSR control_ns, %0" : /* no outputs */ : "r" (control) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Read the IPSR register
+  \details Copies the IPSR special register into a general-purpose register with MRS.
+  \return               Current IPSR register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_IPSR(void)
+{
+  uint32_t ipsr;
+
+  __ASM volatile ("MRS %0, ipsr" : "=r" (ipsr));
+  return ipsr;
+}
+
+
+/**
+  \brief   Read the APSR register
+  \details Copies the APSR special register into a general-purpose register with MRS.
+  \return               Current APSR register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_APSR(void)
+{
+  uint32_t apsr;
+
+  __ASM volatile ("MRS %0, apsr" : "=r" (apsr));
+  return apsr;
+}
+
+
+/**
+  \brief   Read the xPSR register
+  \details Copies the xPSR special register into a general-purpose register with MRS.
+  \return               Current xPSR register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_xPSR(void)
+{
+  uint32_t xpsr;
+
+  __ASM volatile ("MRS %0, xpsr" : "=r" (xpsr));
+  return xpsr;
+}
+
+
+/**
+  \brief   Read the Process Stack Pointer
+  \details Copies the Process Stack Pointer (PSP) into a general-purpose register with MRS.
+  \return               Current PSP value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSP(void)
+{
+  uint32_t psp;
+
+  __ASM volatile ("MRS %0, psp" : "=r" (psp));
+  return psp;
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Read the non-secure Process Stack Pointer
+  \details Reads psp_ns with MRS; only compiled when __ARM_FEATURE_CMSE == 3.
+  \return               Current non-secure PSP value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void)
+{
+  uint32_t psp_ns;
+
+  __ASM volatile ("MRS %0, psp_ns" : "=r" (psp_ns));
+  return psp_ns;
+}
+#endif
+
+
+/**
+  \brief   Write the Process Stack Pointer
+  \details Writes \p topOfProcStack to the Process Stack Pointer (PSP) with MSR.
+  \param [in]    topOfProcStack  New PSP value
+ */
+__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp, %0" : /* no outputs */ : "r" (topOfProcStack));
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Write the non-secure Process Stack Pointer
+  \details Writes \p topOfProcStack to psp_ns with MSR; only compiled when __ARM_FEATURE_CMSE == 3.
+  \param [in]    topOfProcStack  New non-secure PSP value
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp_ns, %0" : /* no outputs */ : "r" (topOfProcStack));
+}
+#endif
+
+
+/**
+  \brief   Read the Main Stack Pointer
+  \details Copies the Main Stack Pointer (MSP) into a general-purpose register with MRS.
+  \return               Current MSP value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSP(void)
+{
+  uint32_t msp;
+
+  __ASM volatile ("MRS %0, msp" : "=r" (msp));
+  return msp;
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Read the non-secure Main Stack Pointer
+  \details Reads msp_ns with MRS; only compiled when __ARM_FEATURE_CMSE == 3.
+  \return               Current non-secure MSP value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void)
+{
+  uint32_t msp_ns;
+
+  __ASM volatile ("MRS %0, msp_ns" : "=r" (msp_ns));
+  return msp_ns;
+}
+#endif
+
+
+/**
+  \brief   Write the Main Stack Pointer
+  \details Writes \p topOfMainStack to the Main Stack Pointer (MSP) with MSR.
+  \param [in]    topOfMainStack  New MSP value
+ */
+__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp, %0" : /* no outputs */ : "r" (topOfMainStack));
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Main Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Main Stack Pointer (MSP) when in secure state.
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : );   /* input-only asm: output and clobber lists are empty */
+}
+#endif
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Stack Pointer (SP) when in secure state.
+  \return               SP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void)
+{
+  uint32_t regValue;
+  /* MRS copies sp_ns into the register bound to the output operand. */
+  __ASM volatile ("MRS %0, sp_ns" : "=r" (regValue));
+  return regValue;
+}
+
+
+/**
+  \brief   Set Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Stack Pointer (SP) when in secure state.
+  \param [in]    topOfStack  Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack)
+{
+  __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : );   /* input-only asm: output and clobber lists are empty */
+}
+#endif
+
+
+/**
+  \brief   Get Priority Mask
+  \details Returns the current state of the priority mask bit from the Priority Mask Register.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void)
+{
+  uint32_t regValue;
+  /* MRS copies primask into the register bound to the output operand. */
+  __ASM volatile ("MRS %0, primask" : "=r" (regValue));
+  return regValue;
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Priority Mask (non-secure)
+  \details Returns the current state of the non-secure priority mask bit from the Priority Mask Register when in secure state.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void)
+{
+  uint32_t regValue;
+  /* MRS copies primask_ns into the register bound to the output operand. */
+  __ASM volatile ("MRS %0, primask_ns" : "=r" (regValue));
+  return regValue;
+}
+#endif
+
+
+/**
+  \brief   Set Priority Mask
+  \details Assigns the given value to the Priority Mask Register.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory");   /* "memory" clobber: also a compiler barrier around the register write */
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Priority Mask (non-secure)
+  \details Assigns the given value to the non-secure Priority Mask Register when in secure state.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory");   /* "memory" clobber: also a compiler barrier around the register write */
+}
+#endif
+
+
+#if ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+     (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+/**
+  \brief   Enable fault interrupts (FIQ)
+  \details Enables fault interrupts by clearing the special-purpose register FAULTMASK.
+           Can only be executed in Privileged modes.
+ */
+#define __enable_fault_irq                __enable_fiq   /* see arm_compat.h */
+
+
+/**
+  \brief   Disable fault interrupts (FIQ)
+  \details Disables fault interrupts by setting the special-purpose register FAULTMASK.
+           Can only be executed in Privileged modes.
+ */
+#define __disable_fault_irq               __disable_fiq   /* see arm_compat.h */
+
+
+/**
+  \brief   Get Base Priority
+  \details Returns the current value of the Base Priority register.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void)
+{
+  uint32_t regValue;
+  /* MRS copies basepri into the register bound to the output operand. */
+  __ASM volatile ("MRS %0, basepri" : "=r" (regValue));
+  return regValue;
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Base Priority (non-secure)
+  \details Returns the current value of the non-secure Base Priority register when in secure state.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void)
+{
+  uint32_t regValue;
+  /* MRS copies basepri_ns into the register bound to the output operand. */
+  __ASM volatile ("MRS %0, basepri_ns" : "=r" (regValue));
+  return regValue;
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority
+  \details Assigns the given value to the Base Priority register.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory");   /* "memory" clobber: also a compiler barrier around the register write */
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Base Priority (non-secure)
+  \details Assigns the given value to the non-secure Base Priority register when in secure state.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory");   /* "memory" clobber: also a compiler barrier around the register write */
+}
+#endif
+
+
+/**
+  \brief   Set Base Priority with condition
+  \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+           or the new value increases the BASEPRI priority level.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory");   /* written through the basepri_max alias; "memory" clobber acts as a compiler barrier */
+}
+
+
+/**
+  \brief   Get Fault Mask
+  \details Returns the current value of the Fault Mask register.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void)
+{
+  uint32_t regValue;
+  /* MRS copies faultmask into the register bound to the output operand. */
+  __ASM volatile ("MRS %0, faultmask" : "=r" (regValue));
+  return regValue;
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Fault Mask (non-secure)
+  \details Returns the current value of the non-secure Fault Mask register when in secure state.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void)
+{
+  uint32_t regValue;
+  /* MRS copies faultmask_ns into the register bound to the output operand. */
+  __ASM volatile ("MRS %0, faultmask_ns" : "=r" (regValue));
+  return regValue;
+}
+#endif
+
+
+/**
+  \brief   Set Fault Mask
+  \details Assigns the given value to the Fault Mask register.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory");   /* "memory" clobber: also a compiler barrier around the register write */
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Fault Mask (non-secure)
+  \details Assigns the given value to the non-secure Fault Mask register when in secure state.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory");   /* "memory" clobber: also a compiler barrier around the register write */
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+           (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+
+/**
+  \brief   Get Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+  
+  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // No Mainline macro is set and __ARM_FEATURE_CMSE is undefined or < 3: the non-secure PSPLIM is RAZ/WI, so no MRS is issued
+  return 0U;
+#else /* a Mainline macro is set, or __ARM_FEATURE_CMSE >= 3 */
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim"  : "=r" (result) );
+  return result;
+#endif /* PSPLIM accessible */
+}
+
+#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always in non-secure
+  mode.
+
+  \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) )
+  // No Mainline macro is set: the non-secure PSPLIM is RAZ/WI, so no MRS is issued
+  return 0U;
+#else /* a Mainline macro is set */
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim_ns"  : "=r" (result) );
+  return result;
+#endif /* non-secure PSPLIM accessible */
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+  
+  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // No Mainline macro is set and __ARM_FEATURE_CMSE is undefined or < 3: the non-secure PSPLIM is RAZ/WI, so the write is dropped
+  (void)ProcStackPtrLimit;   /* marks the parameter as deliberately unused */
+#else /* a Mainline macro is set, or __ARM_FEATURE_CMSE >= 3 */
+  __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit));
+#endif /* PSPLIM accessible */
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Process Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored in non-secure
+  mode.
+
+  \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) )
+  // No Mainline macro is set: the non-secure PSPLIM is RAZ/WI, so the write is dropped
+  (void)ProcStackPtrLimit;   /* marks the parameter as deliberately unused */
+#else /* a Mainline macro is set */
+  __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit));   /* NOTE(review): only this accessor ends its asm text with "\n" */
+#endif /* non-secure PSPLIM accessible */
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the Main Stack Pointer Limit (MSPLIM).
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // No Mainline macro is set and __ARM_FEATURE_CMSE is undefined or < 3: the non-secure MSPLIM is RAZ/WI, so no MRS is issued
+  return 0U;
+#else /* a Mainline macro is set, or __ARM_FEATURE_CMSE >= 3 */
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim" : "=r" (result) );
+  return result;
+#endif /* MSPLIM accessible */
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Get Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence zero is returned always.
+
+  \details Returns the current value of the non-secure Main Stack Pointer Limit(MSPLIM) when in secure state.
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) )
+  // No Mainline macro is set: the non-secure MSPLIM is RAZ/WI, so no MRS is issued
+  return 0U;
+#else /* a Mainline macro is set */
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) );
+  return result;
+#endif /* non-secure MSPLIM accessible */
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer Limit
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM).
+  \param [in]    MainStackPtrLimit  Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  // No Mainline macro is set and __ARM_FEATURE_CMSE is undefined or < 3: the non-secure MSPLIM is RAZ/WI, so the write is dropped
+  (void)MainStackPtrLimit;   /* marks the parameter as deliberately unused */
+#else /* a Mainline macro is set, or __ARM_FEATURE_CMSE >= 3 */
+  __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit));
+#endif /* MSPLIM accessible */
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Main Stack Pointer Limit (non-secure)
+  Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+  Stack Pointer Limit register hence the write is silently ignored.
+
+  \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state.
+  \param [in]    MainStackPtrLimit  Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
+{
+#if (!((defined (__ARM_ARCH_8M_MAIN__   ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+       (defined (__ARM_ARCH_8_1M_MAIN__ ) && (__ARM_ARCH_8_1M_MAIN__ == 1))   ) )
+  // No Mainline macro is set: the non-secure MSPLIM is RAZ/WI, so the write is dropped
+  (void)MainStackPtrLimit;   /* marks the parameter as deliberately unused */
+#else /* a Mainline macro is set */
+  __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit));
+#endif /* non-secure MSPLIM accessible */
+}
+#endif
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+/**
+  \brief   Get FPSCR
+  \details Returns the current value of the Floating Point Status/Control register.
+  \return               Floating Point Status/Control register value
+ */
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#define __get_FPSCR      (uint32_t)__builtin_arm_get_fpscr   /* object-like: __get_FPSCR() casts the builtin's result */
+#else /* FPU absent or not used */
+#define __get_FPSCR()      ((uint32_t)0U)   /* stub: always evaluates to 0 */
+#endif /* __FPU_PRESENT && __FPU_USED */
+
+/**
+  \brief   Set FPSCR
+  \details Assigns the given value to the Floating Point Status/Control register.
+  \param [in]    fpscr  Floating Point Status/Control value to set
+ */
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#define __set_FPSCR      __builtin_arm_set_fpscr   /* object-like alias: __set_FPSCR(x) calls the builtin */
+#else /* FPU absent or not used */
+#define __set_FPSCR(x)      ((void)(x))   /* stub: evaluates the argument and discards it */
+#endif /* __FPU_PRESENT && __FPU_USED */
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ##########################  Core Instruction Access  ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+  Access to dedicated instructions
+  @{
+*/
+
+/* Define macros for porting to both thumb1 and thumb2.
+ * For thumb1, use low register (r0-r7), specified by constraint "l"
+ * Otherwise, use general registers, specified by constraint "r" */
+#if defined (__thumb__) && !defined (__thumb2__)
+#define __CMSIS_GCC_OUT_REG(r) "=l" (r)   /* write-only operand, "l" constraint */
+#define __CMSIS_GCC_RW_REG(r) "+l" (r)   /* read-write operand, "l" constraint */
+#define __CMSIS_GCC_USE_REG(r) "l" (r)   /* input operand, "l" constraint */
+#else /* not thumb1-only */
+#define __CMSIS_GCC_OUT_REG(r) "=r" (r)   /* write-only operand, "r" constraint */
+#define __CMSIS_GCC_RW_REG(r) "+r" (r)   /* read-write operand, "r" constraint */
+#define __CMSIS_GCC_USE_REG(r) "r" (r)   /* input operand, "r" constraint */
+#endif /* __thumb__ && !__thumb2__ */
+
+/**
+  \brief   No Operation
+  \details No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP          __builtin_arm_nop   /* object-like alias: __NOP() calls the builtin */
+
+/**
+  \brief   Wait For Interrupt
+  \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs.
+ */
+#define __WFI          __builtin_arm_wfi   /* object-like alias: __WFI() calls the builtin */
+
+
+/**
+  \brief   Wait For Event
+  \details Wait For Event is a hint instruction that permits the processor to enter
+           a low-power state until one of a number of events occurs.
+ */
+#define __WFE          __builtin_arm_wfe   /* object-like alias: __WFE() calls the builtin */
+
+
+/**
+  \brief   Send Event
+  \details Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV          __builtin_arm_sev   /* object-like alias: __SEV() calls the builtin */
+
+
+/**
+  \brief   Instruction Synchronization Barrier
+  \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+           so that all instructions following the ISB are fetched from cache or memory,
+           after the instruction has been completed.
+ */
+#define __ISB()        __builtin_arm_isb(0xF)   /* 0xF is the barrier option operand (SY, full system, per ACLE -- TODO confirm) */
+
+/**
+  \brief   Data Synchronization Barrier
+  \details Acts as a special kind of Data Memory Barrier.
+           It completes when all explicit memory accesses before this instruction complete.
+ */
+#define __DSB()        __builtin_arm_dsb(0xF)   /* 0xF is the barrier option operand (SY, full system, per ACLE -- TODO confirm) */
+
+
+/**
+  \brief   Data Memory Barrier
+  \details Ensures the apparent order of the explicit memory operations before
+           and after the instruction, without ensuring their completion.
+ */
+#define __DMB()        __builtin_arm_dmb(0xF)   /* 0xF is the barrier option operand (SY, full system, per ACLE -- TODO confirm) */
+
+
+/**
+  \brief   Reverse byte order (32 bit)
+  \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REV(value)   __builtin_bswap32(value)   /* delegates to the compiler's 32-bit byte-swap builtin */
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REV16(value) __ROR(__REV(value), 16)   /* swap all four bytes, then rotate by 16 so each halfword returns to its own position */
+
+
+/**
+  \brief   Reverse byte order (16 bit)
+  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __REVSH(value) (int16_t)__builtin_bswap16(value)   /* swap the two bytes, then convert the result to int16_t */
+
+
+/**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  const uint32_t shift = op2 % 32U;   /* rotation count reduced to 0..31 */
+  if (shift != 0U)
+  {
+    return (op1 << (32U - shift)) | (op1 >> shift);
+  }
+  return op1;                         /* a zero count would otherwise need a shift by 32 */
+}
+
+
+/**
+  \brief   Breakpoint
+  \details Causes the processor to enter Debug state.
+           Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+  \param [in]    value  is ignored by the processor.
+                 If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value)     __ASM volatile ("bkpt "#value)   /* #value is stringized into the instruction text, so pass a literal, not a variable */
+
+
+/**
+  \brief   Reverse bit order of value
+  \details Reverses the bit order of the given value.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+#define __RBIT            __builtin_arm_rbit   /* object-like alias: __RBIT(value) calls the builtin */
+
+/**
+  \brief   Count leading zeros
+  \details Counts the number of leading zeros of a data value.
+  \param [in]  value  Value to count the leading zeros
+  \return             number of leading zeros in value
+ */
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+  /* __builtin_clz(0) is formally undefined behaviour, although on ARM the
+     builtin produces a CLZ instruction. Zero is therefore answered here
+     without calling the builtin:
+       - the result stays ARM-compatible (32) even if this header happens to be
+         compiled for a non-ARM target, and
+       - the compiler cannot apply optimisations based on "value was passed to
+         __builtin_clz, so it is non-zero".
+     ARM Compiler 6.10 (and possibly earlier) optimises the test away again,
+     leaving a single CLZ instruction. */
+  if (value != 0U)
+  {
+    return __builtin_clz(value);
+  }
+  return 32U;
+}
+
+
+#if ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+     (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+
+/**
+  \brief   LDR Exclusive (8 bit)
+  \details Executes a exclusive LDR instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+#define __LDREXB        (uint8_t)__builtin_arm_ldrex   /* __LDREXB(ptr) expands to (uint8_t)__builtin_arm_ldrex(ptr) */
+
+
+/**
+  \brief   LDR Exclusive (16 bit)
+  \details Executes a exclusive LDR instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+#define __LDREXH        (uint16_t)__builtin_arm_ldrex   /* __LDREXH(ptr) expands to (uint16_t)__builtin_arm_ldrex(ptr) */
+
+
+/**
+  \brief   LDR Exclusive (32 bit)
+  \details Executes a exclusive LDR instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+#define __LDREXW        (uint32_t)__builtin_arm_ldrex   /* __LDREXW(ptr) expands to (uint32_t)__builtin_arm_ldrex(ptr) */
+
+
+/**
+  \brief   STR Exclusive (8 bit)
+  \details Executes a exclusive STR instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXB        (uint32_t)__builtin_arm_strex   /* same expansion as __STREXH/__STREXW; access width presumably follows the pointer type -- confirm */
+
+
+/**
+  \brief   STR Exclusive (16 bit)
+  \details Executes a exclusive STR instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXH        (uint32_t)__builtin_arm_strex   /* same expansion as __STREXB/__STREXW; access width presumably follows the pointer type -- confirm */
+
+
+/**
+  \brief   STR Exclusive (32 bit)
+  \details Executes a exclusive STR instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define __STREXW        (uint32_t)__builtin_arm_strex   /* same expansion as __STREXB/__STREXH; access width presumably follows the pointer type -- confirm */
+
+
+/**
+  \brief   Remove the exclusive lock
+  \details Removes the exclusive lock which is created by LDREX.
+ */
+#define __CLREX             __builtin_arm_clrex   /* object-like alias: __CLREX() calls the builtin */
+
+#endif /* ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+           (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+
+#if ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+     (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+#define __SSAT             __builtin_arm_ssat   /* object-like alias: __SSAT(value, sat) calls the builtin */
+
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]  value  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+#define __USAT             __builtin_arm_usat   /* object-like alias: __USAT(value, sat) calls the builtin */
+
+
+/**
+  \brief   Rotate Right with Extend (32 bit)
+  \details Moves each bit of a bitstring right by one bit.
+           The carry input is shifted in at the left end of the bitstring.
+  \param [in]    value  Value to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value)
+{
+  uint32_t rotated;
+  /* Operand constraints are supplied by the __CMSIS_GCC_* helper macros. */
+  __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (rotated) : __CMSIS_GCC_USE_REG (value));
+  return rotated;
+}
+
+
+/**
+  \brief   LDRT Unprivileged (8 bit)
+  \details Executes a Unprivileged LDRT instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr)
+{
+  uint32_t loaded;
+  /* The asm output is a 32-bit variable; it is narrowed to 8 bits on return. */
+  __ASM volatile ("ldrbt %0, %1" : "=r" (loaded) : "Q" (*ptr));
+  return (uint8_t)loaded;
+}
+
+
+/**
+  \brief   LDRT Unprivileged (16 bit)
+  \details Executes a Unprivileged LDRT instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr)
+{
+  uint32_t loaded;
+  /* The asm output is a 32-bit variable; it is narrowed to 16 bits on return. */
+  __ASM volatile ("ldrht %0, %1" : "=r" (loaded) : "Q" (*ptr));
+  return (uint16_t)loaded;
+}
+
+
+/**
+  \brief   LDRT Unprivileged (32 bit)
+  \details Executes a Unprivileged LDRT instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr)
+{
+  uint32_t loaded;
+  /* *ptr is passed as a "Q" memory operand; the loaded word is the output. */
+  __ASM volatile ("ldrt %0, %1" : "=r" (loaded) : "Q" (*ptr));
+  return loaded;
+}
+
+
+/**
+  \brief   STRT Unprivileged (8 bit)
+  \details Executes a Unprivileged STRT instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr)
+{
+  __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );   /* %0 = *ptr as "Q" memory output, %1 = value widened to 32 bits */
+}
+
+
+/**
+  \brief   STRT Unprivileged (16 bit)
+  \details Executes a Unprivileged STRT instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr)
+{
+  __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );   /* %0 = *ptr as "Q" memory output, %1 = value widened to 32 bits */
+}
+
+
+/**
+  \brief   STRT Unprivileged (32 bit)
+  \details Executes a Unprivileged STRT instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr)
+{
+  __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) );   /* %0 = *ptr as "Q" memory output, %1 = value to store */
+}
+
+#else /* ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+          (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+          (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+          (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value.
+  \param [in]    val  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+  /* A bit position outside 1..32 leaves the input untouched. */
+  if ((sat == 0U) || (sat > 32U))
+  {
+    return val;
+  }
+  else
+  {
+    /* Bounds of a sat-bit signed value: upper = 2^(sat-1) - 1, lower = -1 - upper. */
+    const int32_t upper = (int32_t)((1U << (sat - 1U)) - 1U);
+    const int32_t lower = -1 - upper;
+    if (val > upper) { return upper; }
+    if (val < lower) { return lower; }
+    return val;
+  }
+}
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value.
+  \param [in]    val  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31)
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+  /* A bit position above 31 returns the input converted to unsigned. */
+  if (sat > 31U)
+  {
+    return (uint32_t)val;
+  }
+  else
+  {
+    /* Upper bound of a sat-bit unsigned value: 2^sat - 1. */
+    const uint32_t ceiling = ((1U << sat) - 1U);
+    if (val > (int32_t)ceiling) { return ceiling; }
+    if (val < 0)                { return 0U; }
+    return (uint32_t)val;
+  }
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__       ) && (__ARM_ARCH_7M__        == 1)) || \
+           (defined (__ARM_ARCH_7EM__      ) && (__ARM_ARCH_7EM__       == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+     (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     )
+           
+/**
+  \brief   Load-Acquire (8 bit)
+  \details Executes a LDAB instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr)
+{
+  uint32_t loaded;
+  /* 32-bit asm output, narrowed to 8 bits on return; "memory" is clobbered. */
+  __ASM volatile ("ldab %0, %1" : "=r" (loaded) : "Q" (*ptr) : "memory");
+  return (uint8_t)loaded;
+}
+
+
+/**
+  \brief   Load-Acquire (16 bit)
+  \details Executes a LDAH instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr)
+{
+  uint32_t loaded;
+  /* 32-bit asm output, narrowed to 16 bits on return; "memory" is clobbered. */
+  __ASM volatile ("ldah %0, %1" : "=r" (loaded) : "Q" (*ptr) : "memory");
+  return (uint16_t)loaded;
+}
+
+
+/**
+  \brief   Load-Acquire (32 bit)
+  \details Executes a LDA instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr)
+{
+  uint32_t loaded;
+  /* *ptr is a "Q" memory operand; "memory" is clobbered. */
+  __ASM volatile ("lda %0, %1" : "=r" (loaded) : "Q" (*ptr) : "memory");
+  return loaded;
+}
+
+
+/**
+  \brief   Store-Release (8 bit)
+  \details Executes a STLB instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr)
+{
+  __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );   /* %0 = *ptr as "Q" memory output, %1 = value widened to 32 bits; "memory" clobbered */
+}
+
+
+/**
+  \brief   Store-Release (16 bit)
+  \details Executes a STLH instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr)
+{
+  __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );   /* %0 = *ptr as "Q" memory output, %1 = value widened to 32 bits; "memory" clobbered */
+}
+
+
+/**
+  \brief   Store-Release (32 bit)
+  \details Executes a STL instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr)
+{
+  __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );   /* %0 = *ptr as "Q" memory output, %1 = value (the cast is a no-op for uint32_t); "memory" clobbered */
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (8 bit)
+  \details Executes a LDAB exclusive instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+#define     __LDAEXB                 (uint8_t)__builtin_arm_ldaex   /* __LDAEXB(ptr) expands to (uint8_t)__builtin_arm_ldaex(ptr) */
+
+
+/**
+  \brief   Load-Acquire Exclusive (16 bit)
+  \details Executes a LDAH exclusive instruction for 16 bit values. Expands to __builtin_arm_ldaex with the result cast to uint16_t.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+#define     __LDAEXH                 (uint16_t)__builtin_arm_ldaex
+
+
+/**
+  \brief   Load-Acquire Exclusive (32 bit)
+  \details Executes a LDA exclusive instruction for 32 bit values. Expands to __builtin_arm_ldaex with the result cast to uint32_t.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+#define     __LDAEX                  (uint32_t)__builtin_arm_ldaex
+
+
+/**
+  \brief   Store-Release Exclusive (8 bit)
+  \details Executes a STLB exclusive instruction for 8 bit values. Expands to __builtin_arm_stlex with the status result cast to uint32_t.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEXB                 (uint32_t)__builtin_arm_stlex
+
+
+/**
+  \brief   Store-Release Exclusive (16 bit)
+  \details Executes a STLH exclusive instruction for 16 bit values. Expands to __builtin_arm_stlex with the status result cast to uint32_t.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEXH                 (uint32_t)__builtin_arm_stlex
+
+
+/**
+  \brief   Store-Release Exclusive (32 bit)
+  \details Executes a STL exclusive instruction for 32 bit values. Expands to __builtin_arm_stlex with the status result cast to uint32_t.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+#define     __STLEX                  (uint32_t)__builtin_arm_stlex
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__  ) && (__ARM_ARCH_8M_MAIN__   == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__  ) && (__ARM_ARCH_8M_BASE__   == 1)) || \
+           (defined (__ARM_ARCH_8_1M_MAIN__) && (__ARM_ARCH_8_1M_MAIN__ == 1))     ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ###################  Compiler specific Intrinsics  ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+  Access to dedicated SIMD instructions
+  @{
+*/
+
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+/* Each CMSIS SIMD/DSP intrinsic name below is a plain alias for the __builtin_arm_* compiler builtin with the same mnemonic. */
+#define     __SADD8                 __builtin_arm_sadd8
+#define     __QADD8                 __builtin_arm_qadd8
+#define     __SHADD8                __builtin_arm_shadd8
+#define     __UADD8                 __builtin_arm_uadd8
+#define     __UQADD8                __builtin_arm_uqadd8
+#define     __UHADD8                __builtin_arm_uhadd8
+#define     __SSUB8                 __builtin_arm_ssub8
+#define     __QSUB8                 __builtin_arm_qsub8
+#define     __SHSUB8                __builtin_arm_shsub8
+#define     __USUB8                 __builtin_arm_usub8
+#define     __UQSUB8                __builtin_arm_uqsub8
+#define     __UHSUB8                __builtin_arm_uhsub8
+#define     __SADD16                __builtin_arm_sadd16
+#define     __QADD16                __builtin_arm_qadd16
+#define     __SHADD16               __builtin_arm_shadd16
+#define     __UADD16                __builtin_arm_uadd16
+#define     __UQADD16               __builtin_arm_uqadd16
+#define     __UHADD16               __builtin_arm_uhadd16
+#define     __SSUB16                __builtin_arm_ssub16
+#define     __QSUB16                __builtin_arm_qsub16
+#define     __SHSUB16               __builtin_arm_shsub16
+#define     __USUB16                __builtin_arm_usub16
+#define     __UQSUB16               __builtin_arm_uqsub16
+#define     __UHSUB16               __builtin_arm_uhsub16
+#define     __SASX                  __builtin_arm_sasx
+#define     __QASX                  __builtin_arm_qasx
+#define     __SHASX                 __builtin_arm_shasx
+#define     __UASX                  __builtin_arm_uasx
+#define     __UQASX                 __builtin_arm_uqasx
+#define     __UHASX                 __builtin_arm_uhasx
+#define     __SSAX                  __builtin_arm_ssax
+#define     __QSAX                  __builtin_arm_qsax
+#define     __SHSAX                 __builtin_arm_shsax
+#define     __USAX                  __builtin_arm_usax
+#define     __UQSAX                 __builtin_arm_uqsax
+#define     __UHSAX                 __builtin_arm_uhsax
+#define     __USAD8                 __builtin_arm_usad8
+#define     __USADA8                __builtin_arm_usada8
+#define     __SSAT16                __builtin_arm_ssat16
+#define     __USAT16                __builtin_arm_usat16
+#define     __UXTB16                __builtin_arm_uxtb16
+#define     __UXTAB16               __builtin_arm_uxtab16
+#define     __SXTB16                __builtin_arm_sxtb16
+#define     __SXTAB16               __builtin_arm_sxtab16
+#define     __SMUAD                 __builtin_arm_smuad
+#define     __SMUADX                __builtin_arm_smuadx
+#define     __SMLAD                 __builtin_arm_smlad
+#define     __SMLADX                __builtin_arm_smladx
+#define     __SMLALD                __builtin_arm_smlald
+#define     __SMLALDX               __builtin_arm_smlaldx
+#define     __SMUSD                 __builtin_arm_smusd
+#define     __SMUSDX                __builtin_arm_smusdx
+#define     __SMLSD                 __builtin_arm_smlsd
+#define     __SMLSDX                __builtin_arm_smlsdx
+#define     __SMLSLD                __builtin_arm_smlsld
+#define     __SMLSLDX               __builtin_arm_smlsldx
+#define     __SEL                   __builtin_arm_sel
+#define     __QADD                  __builtin_arm_qadd
+#define     __QSUB                  __builtin_arm_qsub
+/* __PKHBT: low halfword of ARG1 combined with the high halfword of (ARG2 << ARG3), computed in plain C on uint32_t. */
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+/* __PKHTB: high halfword of ARG1 combined with the low halfword of (ARG2 >> ARG3), computed in plain C on uint32_t. */
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+/* __SXTB16_RORn: __SXTB16 applied to the result of __ROR(ARG1, ARG2); __ROR is defined outside this excerpt. */
+#define __SXTB16_RORn(ARG1, ARG2)        __SXTB16(__ROR(ARG1, ARG2))
+/** \brief Executes an SMMLA instruction with op1, op2 and op3 as source operands and returns the destination register value. */
+__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
+{
+  int32_t result;
+  /* NOTE(review): presumably op3 plus the most significant 32 bits of op1*op2, going by the mnemonic - confirm against the Arm ISA reference. */
+  __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (result): "r"  (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+#endif /* (__ARM_FEATURE_DSP == 1) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#endif /* __CMSIS_ARMCLANG_H */
diff --git a/common/mps2/cmsis_compiler.h b/common/mps2/cmsis_compiler.h
new file mode 100644
index 0000000..adbf296
--- /dev/null
+++ b/common/mps2/cmsis_compiler.h
@@ -0,0 +1,283 @@
+/**************************************************************************//**
+ * @file     cmsis_compiler.h
+ * @brief    CMSIS compiler generic header file
+ * @version  V5.1.0
+ * @date     09. October 2018
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_COMPILER_H
+#define __CMSIS_COMPILER_H
+
+#include <stdint.h>
+
+/*
+ * Arm Compiler 4/5
+ */
+#if   defined ( __CC_ARM )
+  #include "cmsis_armcc.h"
+
+
+/*
+ * Arm Compiler 6.6 LTM (armclang): __ARMCC_VERSION from 6010050 up to, but not including, 6100100
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) && (__ARMCC_VERSION < 6100100)
+  #include "cmsis_armclang_ltm.h"
+
+/*
+ * Arm Compiler 6.10.1 and above (armclang): __ARMCC_VERSION >= 6100100
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100)
+  #include "cmsis_armclang.h"
+
+
+/*
+ * GNU Compiler
+ */
+#elif defined ( __GNUC__ )
+  #include "cmsis_gcc.h"
+
+
+/*
+ * IAR Compiler
+ */
+#elif defined ( __ICCARM__ )
+  #include <cmsis_iccarm.h>
+
+
+/*
+ * TI Arm Compiler: includes <cmsis_ccs.h>, then defines any CMSIS macro that is still undefined
+ */
+#elif defined ( __TI_ARM__ )
+  #include <cmsis_ccs.h>
+
+  #ifndef   __ASM
+    #define __ASM                                  __asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    #define __NO_RETURN                            __attribute__((noreturn))
+  #endif
+  #ifndef   __USED
+    #define __USED                                 __attribute__((used))
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __attribute__((weak))
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               __attribute__((packed))
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        struct __attribute__((packed))
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         union __attribute__((packed))
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void*)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #define __ALIGNED(x)                           __attribute__((aligned(x)))
+  #endif
+  #ifndef   __RESTRICT
+    #define __RESTRICT                             __restrict
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+/*
+ * TASKING Compiler: no CMSIS header is included here; only macro defaults are defined
+ */
+#elif defined ( __TASKING__ )
+  /*
+   * The CMSIS functions have been implemented as intrinsics in the compiler.
+   * Please use "carm -?i" to get an up-to-date list of all intrinsics,
+   * including the CMSIS ones.
+   */
+
+  #ifndef   __ASM
+    #define __ASM                                  __asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    #define __NO_RETURN                            __attribute__((noreturn))
+  #endif
+  #ifndef   __USED
+    #define __USED                                 __attribute__((used))
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __attribute__((weak))
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               __packed__
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        struct __packed__
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         union __packed__
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    struct __packed__ T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #define __ALIGNED(x)              __align(x)
+  #endif
+  #ifndef   __RESTRICT
+    #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+    #define __RESTRICT
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+/*
+ * COSMIC Compiler: includes <cmsis_csm.h>, then defines any CMSIS macro that is still undefined
+ */
+#elif defined ( __CSMC__ )
+   #include <cmsis_csm.h>
+
+ #ifndef   __ASM
+    #define __ASM                                  _asm
+  #endif
+  #ifndef   __INLINE
+    #define __INLINE                               inline
+  #endif
+  #ifndef   __STATIC_INLINE
+    #define __STATIC_INLINE                        static inline
+  #endif
+  #ifndef   __STATIC_FORCEINLINE
+    #define __STATIC_FORCEINLINE                   __STATIC_INLINE
+  #endif
+  #ifndef   __NO_RETURN
+    // Non-returning functions are detected automatically, so __NO_RETURN expands to nothing and no warning is issued
+    #define __NO_RETURN
+  #endif
+  #ifndef   __USED
+    #warning No compiler specific solution for __USED. __USED is ignored.
+    #define __USED
+  #endif
+  #ifndef   __WEAK
+    #define __WEAK                                 __weak
+  #endif
+  #ifndef   __PACKED
+    #define __PACKED                               @packed
+  #endif
+  #ifndef   __PACKED_STRUCT
+    #define __PACKED_STRUCT                        @packed struct
+  #endif
+  #ifndef   __PACKED_UNION
+    #define __PACKED_UNION                         @packed union
+  #endif
+  #ifndef   __UNALIGNED_UINT32        /* deprecated */
+    @packed struct T_UINT32 { uint32_t v; };
+    #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT16_WRITE
+    __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+    #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT16_READ
+    __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+    #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __UNALIGNED_UINT32_WRITE
+    __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+    #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+  #endif
+  #ifndef   __UNALIGNED_UINT32_READ
+    __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+    #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+  #endif
+  #ifndef   __ALIGNED
+    #warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored.
+    #define __ALIGNED(x)
+  #endif
+  #ifndef   __RESTRICT
+    #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+    #define __RESTRICT
+  #endif
+  #ifndef   __COMPILER_BARRIER
+    #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+    #define __COMPILER_BARRIER()                   (void)0
+  #endif
+
+
+#else
+  #error Unknown compiler.
+#endif
+
+
+#endif /* __CMSIS_COMPILER_H */
+
diff --git a/common/mps2/cmsis_gcc.h b/common/mps2/cmsis_gcc.h
new file mode 100644
index 0000000..a2778f5
--- /dev/null
+++ b/common/mps2/cmsis_gcc.h
@@ -0,0 +1,2177 @@
+/**************************************************************************//**
+ * @file     cmsis_gcc.h
+ * @brief    CMSIS compiler GCC header file
+ * @version  V5.3.0
+ * @date     26. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_GCC_H
+#define __CMSIS_GCC_H
+
+/* Save the current diagnostic state, then silence conversion and unused-parameter warnings for the code that follows */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-conversion"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+
+/* Fallback for compilers without __has_builtin: every query evaluates to 0 */
+#ifndef __has_builtin
+  #define __has_builtin(x) (0)
+#endif
+
+/* CMSIS compiler specific defines; each macro is guarded by #ifndef, so an earlier definition takes precedence */
+#ifndef   __ASM
+  #define __ASM                                  __asm
+#endif
+#ifndef   __INLINE
+  #define __INLINE                               inline
+#endif
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE                        static inline
+#endif
+#ifndef   __STATIC_FORCEINLINE                 
+  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static inline
+#endif                                           
+#ifndef   __NO_RETURN
+  #define __NO_RETURN                            __attribute__((__noreturn__))
+#endif
+#ifndef   __USED
+  #define __USED                                 __attribute__((used))
+#endif
+#ifndef   __WEAK
+  #define __WEAK                                 __attribute__((weak))
+#endif
+#ifndef   __PACKED
+  #define __PACKED                               __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_STRUCT
+  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __PACKED_UNION
+  #define __PACKED_UNION                         union __attribute__((packed, aligned(1)))
+#endif
+#ifndef   __UNALIGNED_UINT32        /* deprecated */
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32(x)                  (((struct T_UINT32 *)(x))->v)
+#endif
+#ifndef   __UNALIGNED_UINT16_WRITE  /* stores a uint16_t through a pointer to a packed one-member struct */
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT16_READ   /* reads a uint16_t through a pointer to a packed one-member struct */
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __UNALIGNED_UINT32_WRITE  /* stores a uint32_t through a pointer to a packed one-member struct */
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+#ifndef   __UNALIGNED_UINT32_READ   /* reads a uint32_t through a pointer to a packed one-member struct */
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+  #pragma GCC diagnostic pop
+  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+#ifndef   __ALIGNED
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+#ifndef   __RESTRICT
+  #define __RESTRICT                             __restrict
+#endif
+#ifndef   __COMPILER_BARRIER        /* empty asm statement whose only effect is its "memory" clobber */
+  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+
+#ifndef __PROGRAM_START
+
+/**
+  \brief   Initializes data and bss sections
+  \details This default implementation copies every region listed in the copy table,
+           zero-fills every region listed in the zero table, and then calls _start().
+           It relies on .copy.table and .zero.table being specified properly
+           in the used linker script.
+ */
+__STATIC_FORCEINLINE __NO_RETURN void __cmsis_start(void)
+{
+  extern void _start(void) __NO_RETURN;
+  /* One copy-table entry: wlen 32-bit words are copied from src to dest. */
+  typedef struct {
+    uint32_t const* src;
+    uint32_t* dest;
+    uint32_t  wlen;
+  } __copy_table_t;
+  /* One zero-table entry: wlen 32-bit words starting at dest are cleared. */
+  typedef struct {
+    uint32_t* dest;
+    uint32_t  wlen;
+  } __zero_table_t;
+  /* Table bounds; only the addresses of these external symbols are used. */
+  extern const __copy_table_t __copy_table_start__;
+  extern const __copy_table_t __copy_table_end__;
+  extern const __zero_table_t __zero_table_start__;
+  extern const __zero_table_t __zero_table_end__;
+  /* Walk the copy table entry by entry, copying word by word. */
+  for (__copy_table_t const* pTable = &__copy_table_start__; pTable < &__copy_table_end__; ++pTable) {
+    for(uint32_t i=0u; i<pTable->wlen; ++i) {
+      pTable->dest[i] = pTable->src[i];
+    }
+  }
+  /* Walk the zero table entry by entry, clearing word by word. */
+  for (__zero_table_t const* pTable = &__zero_table_start__; pTable < &__zero_table_end__; ++pTable) {
+    for(uint32_t i=0u; i<pTable->wlen; ++i) {
+      pTable->dest[i] = 0u;
+    }
+  }
+  /* Hand over to _start(), which is declared __NO_RETURN above. */
+  _start();
+}
+/* __cmsis_start is the program entry only when __PROGRAM_START was not defined before this point. */
+#define __PROGRAM_START           __cmsis_start
+#endif
+/* Default symbol names for the initial stack pointer, stack limit and vector table; each applies only if not already defined. */
+#ifndef __INITIAL_SP
+#define __INITIAL_SP              __StackTop
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT             __StackLimit
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE            __Vectors
+#endif
+/* By default the vector table is marked used and placed in the ".vectors" section. */
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE  __attribute__((used, section(".vectors")))
+#endif
+
+/* ###########################  Core Function Access  ########################### */
+/** \ingroup  CMSIS_Core_FunctionInterface
+    \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+  @{
+ */
+
+/**
+  \brief   Enable IRQ Interrupts
+  \details Enables IRQ interrupts by executing CPSIE i, which on M-profile cores clears PRIMASK (there is no CPSR I-bit on these cores).
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_irq(void)
+{
+  __ASM volatile ("cpsie i" : : : "memory");
+}
+
+
+/**
+  \brief   Disable IRQ Interrupts
+  \details Disables IRQ interrupts by executing CPSID i, which on M-profile cores sets PRIMASK (there is no CPSR I-bit on these cores).
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_irq(void)
+{
+  __ASM volatile ("cpsid i" : : : "memory");
+}
+
+
+/**
+  \brief   Get Control Register
+  \details Reads the Control Register with an MRS instruction and returns its content.
+  \return               Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_CONTROL(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Control Register (non-secure)
+  \details Returns the content of the non-secure Control Register (MRS from control_ns) when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3.
+  \return               non-secure Control Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_CONTROL_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Control Register
+  \details Writes the given value to the Control Register with an MSR instruction; the asm carries a "memory" clobber.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __set_CONTROL(uint32_t control)
+{
+  __ASM volatile ("MSR control, %0" : : "r" (control) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Control Register (non-secure)
+  \details Writes the given value to the non-secure Control Register (MSR to control_ns) when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3.
+  \param [in]    control  Control Register value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_CONTROL_NS(uint32_t control)
+{
+  __ASM volatile ("MSR control_ns, %0" : : "r" (control) : "memory");
+}
+#endif
+
+
+/**
+  \brief   Get IPSR Register
+  \details Reads the IPSR Register with an MRS instruction and returns its content.
+  \return               IPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_IPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, ipsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get APSR Register
+  \details Reads the APSR Register with an MRS instruction and returns its content.
+  \return               APSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_APSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, apsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get xPSR Register
+  \details Reads the xPSR Register with an MRS instruction and returns its content.
+  \return               xPSR Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_xPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, xpsr" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Get Process Stack Pointer
+  \details Reads the Process Stack Pointer (PSP) with an MRS instruction and returns its current value.
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp"  : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Process Stack Pointer (MRS from psp_ns) when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3.
+  \return               PSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, psp_ns"  : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Process Stack Pointer
+  \details Assigns the given value to the Process Stack Pointer (PSP) with an MSR instruction; the asm declares no clobbers.
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_PSP(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp, %0" : : "r" (topOfProcStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Process Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Process Stack Pointer (MSR to psp_ns) when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3.
+  \param [in]    topOfProcStack  Process Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSP_NS(uint32_t topOfProcStack)
+{
+  __ASM volatile ("MSR psp_ns, %0" : : "r" (topOfProcStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Main Stack Pointer
+  \details Reads the Main Stack Pointer (MSP) with an MRS instruction and returns its current value.
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSP(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Main Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Main Stack Pointer (MRS from msp_ns) when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3.
+  \return               MSP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, msp_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Main Stack Pointer
+  \details Assigns the given value to the Main Stack Pointer (MSP) with an MSR instruction; the asm declares no clobbers.
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __set_MSP(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp, %0" : : "r" (topOfMainStack) : );
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Main Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Main Stack Pointer (MSR to msp_ns) when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3.
+  \param [in]    topOfMainStack  Main Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSP_NS(uint32_t topOfMainStack)
+{
+  __ASM volatile ("MSR msp_ns, %0" : : "r" (topOfMainStack) : );
+}
+#endif
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Stack Pointer (non-secure)
+  \details Returns the current value of the non-secure Stack Pointer (MRS from sp_ns) when in secure state.
+  \return               SP Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_SP_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, sp_ns" : "=r" (result) );
+  return(result);
+}
+
+
+/**
+  \brief   Set Stack Pointer (non-secure)
+  \details Assigns the given value to the non-secure Stack Pointer (MSR to sp_ns) when in secure state; the asm declares no clobbers.
+  \param [in]    topOfStack  Stack Pointer value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_SP_NS(uint32_t topOfStack)
+{
+  __ASM volatile ("MSR sp_ns, %0" : : "r" (topOfStack) : );
+}
+#endif
+
+
+/**
+  \brief   Get Priority Mask
+  \details Reads the Priority Mask Register with an MRS instruction and returns the current state of the priority mask bit.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PRIMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Priority Mask (non-secure)
+  \details Returns the current state of the non-secure priority mask bit (MRS from primask_ns) when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3.
+  \return               Priority Mask value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PRIMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, primask_ns" : "=r" (result) );
+  return(result);
+}
+#endif
+
+
+/**
+  \brief   Set Priority Mask
+  \details Assigns the given value to the Priority Mask Register with an MSR instruction; the asm carries a "memory" clobber.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __set_PRIMASK(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory");
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Priority Mask (non-secure)
+  \details Assigns the given value to the non-secure Priority Mask Register (MSR to primask_ns) when in secure state. Only compiled when __ARM_FEATURE_CMSE == 3.
+  \param [in]    priMask  Priority Mask
+ */
+__STATIC_FORCEINLINE void __TZ_set_PRIMASK_NS(uint32_t priMask)
+{
+  __ASM volatile ("MSR primask_ns, %0" : : "r" (priMask) : "memory");
+}
+#endif
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+/**
+  \brief   Enable FIQ
+  \details Enables fault interrupts by executing CPSIE f, which on M-profile cores clears FAULTMASK (there is no CPSR F-bit on these cores).
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_fault_irq(void)
+{
+  __ASM volatile ("cpsie f" : : : "memory");
+}
+
+
+/**
+  \brief   Disable FIQ
+  \details Disables fault interrupts by executing CPSID f, which on M-profile cores sets FAULTMASK (there is no CPSR F-bit on these cores).
+           Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_fault_irq(void)
+{
+  __ASM volatile ("cpsid f" : : : "memory");
+}
+
+
+/**
+  \brief   Get Base Priority
+  \details Reads the Base Priority register with an MRS instruction and returns its current value.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_BASEPRI(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri" : "=r" (result) );
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Base Priority (non-secure)
+  \details Returns the current value of the non-secure Base Priority register (BASEPRI_NS, read with MRS) when in secure state.
+  \return               Base Priority register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_BASEPRI_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, basepri_ns" : "=r" (result) );  /* "=r": value lands in any core register */
+  return(result);
+}
+#endif /* (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) */
+
+
+/**
+  \brief   Set Base Priority
+  \details Assigns the given value to the Base Priority register (BASEPRI) with an MSR instruction.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri, %0" : : "r" (basePri) : "memory");  /* "memory" clobber: compiler-level barrier around the write */
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Base Priority (non-secure)
+  \details Assigns the given value to the non-secure Base Priority register (BASEPRI_NS, written with MSR) when in secure state.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_BASEPRI_NS(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_ns, %0" : : "r" (basePri) : "memory");  /* "memory" clobber: compiler-level barrier around the write */
+}
+#endif /* (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) */
+
+
+/**
+  \brief   Set Base Priority with condition
+  \details Writes the given value through the BASEPRI_MAX alias: the register is only updated if BASEPRI
+           masking is disabled, or the new value increases the BASEPRI priority level.
+  \param [in]    basePri  Base Priority value to set
+ */
+__STATIC_FORCEINLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+  __ASM volatile ("MSR basepri_max, %0" : : "r" (basePri) : "memory");  /* "memory" clobber: compiler-level barrier around the write */
+}
+
+
+/**
+  \brief   Get Fault Mask
+  \details Returns the current value of the Fault Mask register (FAULTMASK), read with an MRS instruction.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FAULTMASK(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask" : "=r" (result) );  /* "=r": value lands in any core register */
+  return(result);
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Fault Mask (non-secure)
+  \details Returns the current value of the non-secure Fault Mask register (FAULTMASK_NS, read with MRS) when in secure state.
+  \return               Fault Mask register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_FAULTMASK_NS(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, faultmask_ns" : "=r" (result) );  /* "=r": value lands in any core register */
+  return(result);
+}
+#endif /* (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) */
+
+
+/**
+  \brief   Set Fault Mask
+  \details Assigns the given value to the Fault Mask register (FAULTMASK) with an MSR instruction.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory");  /* "memory" clobber: compiler-level barrier around the write */
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Set Fault Mask (non-secure)
+  \details Assigns the given value to the non-secure Fault Mask register (FAULTMASK_NS, written with MSR) when in secure state.
+  \param [in]    faultMask  Fault Mask value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_FAULTMASK_NS(uint32_t faultMask)
+{
+  __ASM volatile ("MSR faultmask_ns, %0" : : "r" (faultMask) : "memory");  /* "memory" clobber: compiler-level barrier around the write */
+}
+#endif /* (defined (__ARM_FEATURE_CMSE ) && (__ARM_FEATURE_CMSE == 3)) */
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+
+/**
+  \brief   Get Process Stack Pointer Limit
+  \details Returns the current value of the Process Stack Pointer Limit (PSPLIM).
+  \note    Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+           Stack Pointer Limit register, hence zero is always returned in non-secure mode.
+           The register is only read when built for ARMv8-M Mainline or with
+           __ARM_FEATURE_CMSE == 3; otherwise the function is a constant "return 0U".
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_PSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  /* without main extensions, the non-secure PSPLIM is RAZ/WI (read-as-zero, writes ignored) */
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim"  : "=r" (result) );
+  return result;
+#endif
+}
+
+#if (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3))
+/**
+  \brief   Get Process Stack Pointer Limit (non-secure)
+  \details Returns the current value of the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \note    Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+           Stack Pointer Limit register, hence zero is always returned there and no MRS
+           instruction is emitted.
+  \return               PSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_PSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  /* without main extensions, the non-secure PSPLIM is RAZ/WI (read-as-zero, writes ignored) */
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, psplim_ns"  : "=r" (result) );
+  return result;
+#endif
+}
+#endif /* (defined (__ARM_FEATURE_CMSE) && (__ARM_FEATURE_CMSE == 3)) */
+
+
+/**
+  \brief   Set Process Stack Pointer Limit
+  \details Assigns the given value to the Process Stack Pointer Limit (PSPLIM).
+  \note    Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+           Stack Pointer Limit register, hence the write is silently ignored in non-secure
+           mode. The MSR is only emitted when built for ARMv8-M Mainline or with
+           __ARM_FEATURE_CMSE == 3; otherwise the argument is discarded.
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_PSPLIM(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  /* without main extensions, the non-secure PSPLIM is RAZ/WI (read-as-zero, writes ignored) */
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim, %0" : : "r" (ProcStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Process Stack Pointer Limit (non-secure)
+  \details Assigns the given value to the non-secure Process Stack Pointer Limit (PSPLIM) when in secure state.
+  \note    Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+           Stack Pointer Limit register, hence the write is silently ignored there and
+           the argument is discarded.
+  \param [in]    ProcStackPtrLimit  Process Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_PSPLIM_NS(uint32_t ProcStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  /* without main extensions, the non-secure PSPLIM is RAZ/WI (read-as-zero, writes ignored) */
+  (void)ProcStackPtrLimit;
+#else
+  __ASM volatile ("MSR psplim_ns, %0\n" : : "r" (ProcStackPtrLimit));
+#endif
+}
+#endif /* (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3)) */
+
+
+/**
+  \brief   Get Main Stack Pointer Limit
+  \details Returns the current value of the Main Stack Pointer Limit (MSPLIM).
+  \note    Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+           Stack Pointer Limit register, hence zero is always returned in non-secure mode.
+           The register is only read when built for ARMv8-M Mainline or with
+           __ARM_FEATURE_CMSE == 3; otherwise the function is a constant "return 0U".
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_MSPLIM(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  /* without main extensions, the non-secure MSPLIM is RAZ/WI (read-as-zero, writes ignored) */
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim" : "=r" (result) );
+  return result;
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Get Main Stack Pointer Limit (non-secure)
+  \details Returns the current value of the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state.
+  \note    Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+           Stack Pointer Limit register, hence zero is always returned there and no MRS
+           instruction is emitted.
+  \return               MSPLIM Register value
+ */
+__STATIC_FORCEINLINE uint32_t __TZ_get_MSPLIM_NS(void)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  /* without main extensions, the non-secure MSPLIM is RAZ/WI (read-as-zero, writes ignored) */
+  return 0U;
+#else
+  uint32_t result;
+  __ASM volatile ("MRS %0, msplim_ns" : "=r" (result) );
+  return result;
+#endif
+}
+#endif /* (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3)) */
+
+
+/**
+  \brief   Set Main Stack Pointer Limit
+  \details Assigns the given value to the Main Stack Pointer Limit (MSPLIM).
+  \note    Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+           Stack Pointer Limit register, hence the write is silently ignored in non-secure
+           mode. The MSR is only emitted when built for ARMv8-M Mainline or with
+           __ARM_FEATURE_CMSE == 3; otherwise the argument is discarded.
+  \param [in]    MainStackPtrLimit  Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __set_MSPLIM(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) && \
+    (!defined (__ARM_FEATURE_CMSE) || (__ARM_FEATURE_CMSE < 3)))
+  /* without main extensions, the non-secure MSPLIM is RAZ/WI (read-as-zero, writes ignored) */
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+
+
+#if (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3))
+/**
+  \brief   Set Main Stack Pointer Limit (non-secure)
+  \details Assigns the given value to the non-secure Main Stack Pointer Limit (MSPLIM) when in secure state.
+  \note    Devices without ARMv8-M Main Extensions (i.e. Cortex-M23) lack the non-secure
+           Stack Pointer Limit register, hence the write is silently ignored there and
+           the argument is discarded.
+  \param [in]    MainStackPtrLimit  Main Stack Pointer Limit value to set
+ */
+__STATIC_FORCEINLINE void __TZ_set_MSPLIM_NS(uint32_t MainStackPtrLimit)
+{
+#if (!(defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)))
+  /* without main extensions, the non-secure MSPLIM is RAZ/WI (read-as-zero, writes ignored) */
+  (void)MainStackPtrLimit;
+#else
+  __ASM volatile ("MSR msplim_ns, %0" : : "r" (MainStackPtrLimit));
+#endif
+}
+#endif /* (defined (__ARM_FEATURE_CMSE  ) && (__ARM_FEATURE_CMSE   == 3)) */
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+
+/**
+  \brief   Get FPSCR
+  \details Returns the current value of the Floating Point Status/Control register; returns 0 unless both __FPU_PRESENT and __FPU_USED are 1.
+  \return               Floating Point Status/Control register value
+ */
+__STATIC_FORCEINLINE uint32_t __get_FPSCR(void)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#if __has_builtin(__builtin_arm_get_fpscr)
+/* Re-enable using built-in when GCC has been fixed:
+   || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2) */
+  /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
+  return __builtin_arm_get_fpscr();
+#else
+  uint32_t result;
+
+  __ASM volatile ("VMRS %0, fpscr" : "=r" (result) );  /* fallback when the builtin is unavailable */
+  return(result);
+#endif
+#else  /* FPU absent or unused */
+  return(0U);
+#endif
+}
+
+
+/**
+  \brief   Set FPSCR
+  \details Assigns the given value to the Floating Point Status/Control register; the value is discarded unless both __FPU_PRESENT and __FPU_USED are 1.
+  \param [in]    fpscr  Floating Point Status/Control value to set
+ */
+__STATIC_FORCEINLINE void __set_FPSCR(uint32_t fpscr)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+     (defined (__FPU_USED   ) && (__FPU_USED    == 1U))     )
+#if __has_builtin(__builtin_arm_set_fpscr)
+/* Re-enable using built-in when GCC has been fixed:
+   || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2) */
+  /* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
+  __builtin_arm_set_fpscr(fpscr);
+#else
+  __ASM volatile ("VMSR fpscr, %0" : : "r" (fpscr) : "vfpcc", "memory");  /* "vfpcc": the FP condition flags are overwritten */
+#endif
+#else  /* FPU absent or unused */
+  (void)fpscr;
+#endif
+}
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ##########################  Core Instruction Access  ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+  Access to dedicated instructions
+  @{
+*/
+
+/* Operand-constraint helpers (output, read-write, input) for porting to both thumb1 and thumb2.
+ * For thumb1 (__thumb__ without __thumb2__), use low register (r0-r7), specified by constraint "l"
+ * Otherwise, use general registers, specified by constraint "r" */
+#if defined (__thumb__) && !defined (__thumb2__)
+#define __CMSIS_GCC_OUT_REG(r) "=l" (r)
+#define __CMSIS_GCC_RW_REG(r) "+l" (r)
+#define __CMSIS_GCC_USE_REG(r) "l" (r)
+#else
+#define __CMSIS_GCC_OUT_REG(r) "=r" (r)
+#define __CMSIS_GCC_RW_REG(r) "+r" (r)
+#define __CMSIS_GCC_USE_REG(r) "r" (r)
+#endif
+
+/**
+  \brief   No Operation
+  \details Emits a single "nop". It carries no "memory" clobber, so it is not a compiler barrier. Can be used for code alignment purposes.
+ */
+#define __NOP()                             __ASM volatile ("nop")
+
+/**
+  \brief   Wait For Interrupt
+  \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs. The "memory" clobber also makes it a compiler barrier.
+ */
+#define __WFI()                             __ASM volatile ("wfi":::"memory")
+
+
+/**
+  \brief   Wait For Event
+  \details Wait For Event is a hint instruction that permits the processor to enter a low-power state
+           until one of a number of events occurs. The "memory" clobber also makes it a compiler barrier.
+ */
+#define __WFE()                             __ASM volatile ("wfe":::"memory")
+
+
+/**
+  \brief   Send Event
+  \details Send Event is a hint instruction. It causes an event to be signaled to the CPU. It carries no "memory" clobber, so it is not a compiler barrier.
+ */
+#define __SEV()                             __ASM volatile ("sev")
+
+
+/**
+  \brief   Instruction Synchronization Barrier
+  \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+           so that all instructions following the ISB are fetched from cache or memory,
+           after the instruction has been completed. Issued with option 0xF; also a compiler barrier.
+ */
+__STATIC_FORCEINLINE void __ISB(void)
+{
+  __ASM volatile ("isb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Data Synchronization Barrier
+  \details Acts as a special kind of Data Memory Barrier. Issued with option 0xF; also a compiler barrier.
+           It completes when all explicit memory accesses before this instruction complete.
+ */
+__STATIC_FORCEINLINE void __DSB(void)
+{
+  __ASM volatile ("dsb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Data Memory Barrier
+  \details Ensures the apparent order of the explicit memory operations before and after the
+           instruction, without ensuring their completion. Issued with option 0xF; also a compiler barrier.
+ */
+__STATIC_FORCEINLINE void __DMB(void)
+{
+  __ASM volatile ("dmb 0xF":::"memory");
+}
+
+
+/**
+  \brief   Reverse byte order (32 bit)
+  \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV(uint32_t value)
+{
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+  return __builtin_bswap32(value);  /* GCC >= 4.5: use the builtin */
+#else
+  uint32_t result;
+
+  __ASM ("rev %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );  /* older GCC: explicit REV instruction */
+  return result;
+#endif
+}
+
+
+/**
+  \brief   Reverse byte order within each halfword (2 x 16 bit)
+  \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value)
+{
+  uint32_t result;
+
+  __ASM ("rev16 %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );  /* not volatile: result depends only on value */
+  return result;
+}
+
+
+/**
+  \brief   Reverse byte order (16 bit, signed result)
+  \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE int16_t __REVSH(int16_t value)
+{
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+  return (int16_t)__builtin_bswap16(value);  /* GCC >= 4.8: use the builtin */
+#else
+  int16_t result;
+
+  __ASM ("revsh %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );  /* older GCC: explicit REVSH instruction */
+  return result;
+#endif
+}
+
+
+/**
+  \brief   Rotate Right in unsigned value (32 bit)
+  \details Rotates op1 right by (op2 modulo 32) bit positions; bits leaving the low end
+           re-enter at the high end. A rotation by a multiple of 32 returns op1 unchanged.
+  \param [in]    op1  Value to rotate
+  \param [in]    op2  Number of Bits to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+  const uint32_t shift = op2 % 32U;
+
+  /* A zero rotation is handled apart so that op1 is never shifted left by 32 bits. */
+  return (shift == 0U) ? op1
+                       : ((op1 >> shift) | (op1 << (32U - shift)));
+}
+
+
+/**
+  \brief   Breakpoint
+  \details Causes the processor to enter Debug state.
+           Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+  \param [in]    value  is ignored by the processor. It is stringized into the instruction text, so pass a literal.
+                 If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value)                       __ASM volatile ("bkpt "#value)
+
+
+/**
+  \brief   Reverse bit order of value
+  \details Reverses the bit order of the given value (RBIT instruction where available, otherwise a C loop).
+  \param [in]    value  Value to reverse
+  \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value)
+{
+  uint32_t result;
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+   __ASM ("rbit %0, %1" : "=r" (result) : "r" (value) );
+#else
+  uint32_t s = (4U /*sizeof(value)*/ * 8U) - 1U; /* extra shift needed at end */
+
+  result = value;                      /* result will be reversed bits of value; first get LSB of value */
+  for (value >>= 1U; value != 0U; value >>= 1U)
+  {
+    result <<= 1U;
+    result |= value & 1U;
+    s--;
+  }
+  result <<= s;                        /* shift when value's highest bits are zero */
+#endif
+  return result;
+}
+
+
+/**
+  \brief   Count leading zeros
+  \details Counts the number of leading zeros of a data value.
+  \param [in]  value  Value to count the leading zeros
+  \return             number of leading zeros in value (0..31), or 32 when value is 0
+ */
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
+{
+  /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
+     __builtin_clz(0) is undefined behaviour, so handle this case specially.
+     This guarantees ARM-compatible results if happening to compile on a non-ARM
+     target, and ensures the compiler doesn't decide to activate any
+     optimisations using the logic "value was passed to __builtin_clz, so it
+     is non-zero".
+     ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
+     single CLZ instruction.
+   */
+  if (value == 0U)
+  {
+    return 32U;
+  }
+  return (uint8_t)__builtin_clz(value);  /* result is 0..31 here, so the explicit narrowing is lossless */
+}
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+/**
+  \brief   LDR Exclusive (8 bit)
+  \details Executes an exclusive LDR instruction (LDREXB) for 8 bit value.
+  \param [in]   addr  Pointer to data
+  \return             value of type uint8_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDREXB(volatile uint8_t *addr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrexb %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrexb %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+   return ((uint8_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDR Exclusive (16 bit)
+  \details Executes an exclusive LDR instruction (LDREXH) for 16 bit values.
+  \param [in]   addr  Pointer to data
+  \return        value of type uint16_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDREXH(volatile uint16_t *addr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrexh %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrexh %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+   return ((uint16_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDR Exclusive (32 bit)
+  \details Executes an exclusive LDR instruction (LDREX) for 32 bit values.
+  \param [in]   addr  Pointer to data
+  \return        value of type uint32_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDREXW(volatile uint32_t *addr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldrex %0, %1" : "=r" (result) : "Q" (*addr) );  /* "Q": memory operand addressed by a single base register */
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (8 bit)
+  \details Executes an exclusive STR instruction (STREXB) for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]   addr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXB(uint8_t value, volatile uint8_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strexb %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );  /* "=&r": status register is early-clobber, never shared with an input */
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (16 bit)
+  \details Executes an exclusive STR instruction (STREXH) for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]   addr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXH(uint16_t value, volatile uint16_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strexh %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );  /* "=&r": status register is early-clobber, never shared with an input */
+   return(result);
+}
+
+
+/**
+  \brief   STR Exclusive (32 bit)
+  \details Executes an exclusive STR instruction (STREX) for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]   addr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STREXW(uint32_t value, volatile uint32_t *addr)
+{
+   uint32_t result;
+
+   __ASM volatile ("strex %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" (value) );  /* "=&r": status register is early-clobber, never shared with an input */
+   return(result);
+}
+
+
+/**
+  \brief   Remove the exclusive lock
+  \details Removes the exclusive lock which is created by LDREX, by executing CLREX. Also a compiler barrier.
+ */
+__STATIC_FORCEINLINE void __CLREX(void)
+{
+  __ASM volatile ("clrex" ::: "memory");
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+     (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+     (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    )
+/**
+  \brief   Signed Saturate
+  \details Saturates a signed value with the SSAT instruction (GNU statement-expression macro).
+  \param [in]  ARG1  Value to be saturated
+  \param [in]  ARG2  Bit position to saturate to (1..32); passed with the "I" constraint, so it must be a compile-time constant
+  \return             Saturated value
+ */
+#define __SSAT(ARG1, ARG2) \
+__extension__ \
+({                          \
+  int32_t __RES, __ARG1 = (ARG1); \
+  __ASM volatile ("ssat %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
+  __RES; \
+ })
+
+
+/**
+  \brief   Unsigned Saturate
+  \details Saturates an unsigned value with the USAT instruction (GNU statement-expression macro).
+  \param [in]  ARG1  Value to be saturated
+  \param [in]  ARG2  Bit position to saturate to (0..31); passed with the "I" constraint, so it must be a compile-time constant
+  \return             Saturated value
+ */
+#define __USAT(ARG1, ARG2) \
+ __extension__ \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1); \
+  __ASM volatile ("usat %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
+  __RES; \
+ })
+
+
+/**
+  \brief   Rotate Right with Extend (32 bit)
+  \details Moves each bit of a bitstring right by one bit, using the RRX instruction.
+           The carry input is shifted in at the left end of the bitstring.
+  \param [in]    value  Value to rotate
+  \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __RRX(uint32_t value)
+{
+  uint32_t result;
+
+  __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );  /* the carry flag is read implicitly; it is not an asm operand */
+  return(result);
+}
+
+
+/**
+  \brief   LDRT Unprivileged (8 bit)
+  \details Executes an unprivileged load (LDRBT) for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDRBT(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*ptr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrbt %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" );
+#endif
+   return ((uint8_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (16 bit)
+  \details Executes an unprivileged load (LDRHT) for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDRHT(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+   __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*ptr) );
+#else
+    /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+       accepted by assembler. So has to use following less efficient pattern.
+    */
+   __ASM volatile ("ldrht %0, [%1]" : "=r" (result) : "r" (ptr) : "memory" );
+#endif
+   return ((uint16_t) result);    /* Add explicit type cast here */
+}
+
+
+/**
+  \brief   LDRT Unprivileged (32 bit)
+  \details Executes an unprivileged load (LDRT) for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDRT(volatile uint32_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*ptr) );  /* "Q": memory operand addressed by a single base register */
+   return(result);
+}
+
+
+/**
+  \brief   STRT Unprivileged (8 bit)
+  \details Executes an unprivileged store (STRBT) for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRBT(uint8_t value, volatile uint8_t *ptr)
+{
+   __ASM volatile ("strbt %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );  /* value is widened to 32 bits for the register operand */
+}
+
+
+/**
+  \brief   STRT Unprivileged (16 bit)
+  \details Executes an unprivileged store (STRHT) for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRHT(uint16_t value, volatile uint16_t *ptr)
+{
+   __ASM volatile ("strht %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) );  /* value is widened to 32 bits for the register operand */
+}
+
+
+/**
+  \brief   STRT Unprivileged (32 bit)
+  \details Executes an unprivileged store (STRT) for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STRT(uint32_t value, volatile uint32_t *ptr)
+{
+   __ASM volatile ("strt %1, %0" : "=Q" (*ptr) : "r" (value) );  /* "=Q": *ptr is the memory output operand */
+}
+
+#else  /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+/**
+  \brief   Signed Saturate
+  \details Clamps a signed value to the range of a sat-bit signed integer (C fallback without the SSAT instruction).
+  \param [in]    val  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (1..32); any other value returns val unchanged
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+  int32_t upper;
+  int32_t lower;
+
+  if ((sat < 1U) || (sat > 32U))
+  {
+    return val;                                /* width out of range: pass through */
+  }
+  upper = (int32_t)((1U << (sat - 1U)) - 1U);  /*  2^(sat-1) - 1 */
+  lower = -1 - upper;                          /* -2^(sat-1)     */
+  if (val > upper)
+  {
+    return upper;
+  }
+  return (val < lower) ? lower : val;
+}
+
+/**
+  \brief   Unsigned Saturate
+  \details Clamps a signed value to the range of a sat-bit unsigned integer (C fallback without the USAT instruction).
+  \param [in]    val  Value to be saturated
+  \param [in]    sat  Bit position to saturate to (0..31); larger values return val converted unchanged
+  \return             Saturated value
+ */
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+  uint32_t ceiling;
+
+  if (sat > 31U)
+  {
+    return (uint32_t)val;
+  }
+  /* largest value representable in sat bits */
+  ceiling = (1U << sat) - 1U;
+  if (val > (int32_t)ceiling)
+  {
+    return ceiling;
+  }
+  return (val < 0) ? 0U : (uint32_t)val;
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__      ) && (__ARM_ARCH_7M__      == 1)) || \
+           (defined (__ARM_ARCH_7EM__     ) && (__ARM_ARCH_7EM__     == 1)) || \
+           (defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1))    ) */
+
+
+#if ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+     (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    )
+/**
+  \brief   Load-Acquire (8 bit)
+  \details Executes an LDAB instruction for 8 bit value; the loaded word is narrowed to uint8_t on return.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAB(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldab %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );  /* "memory" clobber: compiler-level barrier */
+   return ((uint8_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (16 bit)
+  \details Executes an LDAH instruction for 16 bit values; the loaded word is narrowed to uint16_t on return.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAH(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldah %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );  /* "memory" clobber: compiler-level barrier */
+   return ((uint16_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire (32 bit)
+  \details Executes an LDA instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDA(volatile uint32_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("lda %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );  /* "memory" clobber: compiler-level barrier */
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release (8 bit)
+  \details Executes an STLB instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLB(uint8_t value, volatile uint8_t *ptr)
+{
+   __ASM volatile ("stlb %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );  /* "memory" clobber: compiler-level barrier */
+}
+
+
+/**
+  \brief   Store-Release (16 bit)
+  \details Executes an STLH instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STLH(uint16_t value, volatile uint16_t *ptr)
+{
+   __ASM volatile ("stlh %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );  /* "memory" clobber: compiler-level barrier */
+}
+
+
+/**
+  \brief   Store-Release (32 bit)
+  \details Executes an STL instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+ */
+__STATIC_FORCEINLINE void __STL(uint32_t value, volatile uint32_t *ptr)
+{
+   __ASM volatile ("stl %1, %0" : "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );  /* "memory" clobber: compiler-level barrier */
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (8 bit)
+  \details Executes an LDAEXB (load-acquire exclusive) instruction for 8 bit value.
+  \param [in]    ptr  Pointer to data
+  \return             value of type uint8_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint8_t __LDAEXB(volatile uint8_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldaexb %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );  /* "memory" clobber: compiler-level barrier */
+   return ((uint8_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (16 bit)
+  \details Executes an LDAEXH (load-acquire exclusive) instruction for 16 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint16_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint16_t __LDAEXH(volatile uint16_t *ptr)
+{
+    uint32_t result;
+
+   __ASM volatile ("ldaexh %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );  /* "memory" clobber: compiler-level barrier */
+   return ((uint16_t) result);
+}
+
+
+/**
+  \brief   Load-Acquire Exclusive (32 bit)
+  \details Executes a LDAEX instruction for 32 bit values.
+  \param [in]    ptr  Pointer to data
+  \return        value of type uint32_t at (*ptr)
+ */
+__STATIC_FORCEINLINE uint32_t __LDAEX(volatile uint32_t *ptr)
+{
+    uint32_t result;
+   /* "Q" hands *ptr to the instruction as a memory operand whose address is held in a single register. */
+   __ASM volatile ("ldaex %0, %1" : "=r" (result) : "Q" (*ptr) : "memory" );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (8 bit)
+  \details Executes a STLEXB instruction for 8 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEXB(uint8_t value, volatile uint8_t *ptr)
+{
+   uint32_t result;
+   /* "=&r" (early clobber) keeps the status result out of the registers used by the input operands. */
+   __ASM volatile ("stlexb %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (16 bit)
+  \details Executes a STLEXH instruction for 16 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEXH(uint16_t value, volatile uint16_t *ptr)
+{
+   uint32_t result;
+   /* result receives the instruction's status operand (%0); value is zero-extended to 32 bit for the register operand. */
+   __ASM volatile ("stlexh %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+   return(result);
+}
+
+
+/**
+  \brief   Store-Release Exclusive (32 bit)
+  \details Executes a STLEX instruction for 32 bit values.
+  \param [in]  value  Value to store
+  \param [in]    ptr  Pointer to location
+  \return          0  Function succeeded
+  \return          1  Function failed
+ */
+__STATIC_FORCEINLINE uint32_t __STLEX(uint32_t value, volatile uint32_t *ptr)
+{
+   uint32_t result;
+   /* result receives the instruction's status operand (%0); "=&r" keeps it out of the input operands' registers. */
+   __ASM volatile ("stlex %0, %2, %1" : "=&r" (result), "=Q" (*ptr) : "r" ((uint32_t)value) : "memory" );
+   return(result);
+}
+
+#endif /* ((defined (__ARM_ARCH_8M_MAIN__ ) && (__ARM_ARCH_8M_MAIN__ == 1)) || \
+           (defined (__ARM_ARCH_8M_BASE__ ) && (__ARM_ARCH_8M_BASE__ == 1))    ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ###################  Compiler specific Intrinsics  ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+  Access to dedicated SIMD instructions
+  @{
+*/
+
+#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
+
+__STATIC_FORCEINLINE uint32_t __SADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SADD8 also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single QADD8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SHADD8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because UADD8 also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UQADD8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UHADD8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SSUB8 also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single QSUB8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SHSUB8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because USUB8 also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UQSUB8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UHSUB8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+
+__STATIC_FORCEINLINE uint32_t __SADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SADD16 also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single QADD16 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SHADD16 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because UADD16 also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UQADD16 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uqadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UHADD16 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SSUB16 also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single QSUB16 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SHSUB16 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because USUB16 also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UQSUB16 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UHSUB16 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SASX also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single QASX on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SHASX on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because UASX also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UQASX on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHASX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UHASX on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SSAX also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single QSAX on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SHSAX on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because USAX also writes the APSR.GE flags, which are not listed as an output - confirm. */
+  __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UQSAX on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UHSAX on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USAD8(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single USAD8 on two register operands; the asm is not volatile, so the compiler may merge or remove calls whose result is unused. */
+  __ASM ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+  /* Single USADA8 with op1, op2, op3 as the three source registers in that order; non-volatile asm. */
+  __ASM ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+#define __SSAT16(ARG1, ARG2) \
+({  /* statement expression; ARG2 feeds an "I" (immediate) operand, so it must be a compile-time constant */ \
+  int32_t __RES, __ARG1 = (ARG1); \
+  __ASM volatile ("ssat16 %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
+  __RES; \
+ })
+
+#define __USAT16(ARG1, ARG2) \
+({  /* statement expression; ARG2 feeds an "I" (immediate) operand, so it must be a compile-time constant */ \
+  uint32_t __RES, __ARG1 = (ARG1); \
+  __ASM volatile ("usat16 %0, %1, %2" : "=r" (__RES) :  "I" (ARG2), "r" (__ARG1) : "cc" ); \
+  __RES; \
+ })
+
+__STATIC_FORCEINLINE uint32_t __UXTB16(uint32_t op1)
+{
+  uint32_t result;
+  /* Single UXTB16 on one register operand, without a rotation; non-volatile asm. */
+  __ASM ("uxtb16 %0, %1" : "=r" (result) : "r" (op1));
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single UXTAB16 with op1 as first and op2 as second source register, without a rotation; non-volatile asm. */
+  __ASM ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1)
+{
+  uint32_t result;
+  /* Single SXTB16 on one register operand, without a rotation; non-volatile asm. */
+  __ASM ("sxtb16 %0, %1" : "=r" (result) : "r" (op1));
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTB16_RORn(uint32_t op1, uint32_t rotate)
+{
+  uint32_t result;
+  /* Rotate op1 right by 'rotate', then apply SXTB16. The "i" operand must be a compile-time constant and SXTB16 only encodes ROR #8/#16/#24, so every other case rotates in C and uses the plain __SXTB16. */
+  if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U))) { __ASM ("sxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (rotate) ); }
+  else { result = __SXTB16((op1 >> (rotate & 31U)) | (op1 << ((32U - rotate) & 31U))); }
+  return result;
+}
+
+__STATIC_FORCEINLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SXTAB16 with op1 as first and op2 as second source register, without a rotation; non-volatile asm. */
+  __ASM ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUAD  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SMUAD can set the APSR.Q flag, which is not modelled as an output - confirm. */
+  __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SMUADX can set the APSR.Q flag, which is not modelled as an output - confirm. */
+  __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+  /* op3 is passed as the third source register. NOTE(review): volatile presumably because SMLAD can set APSR.Q - confirm. */
+  __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+  /* op3 is passed as the third source register. NOTE(review): volatile presumably because SMLADX can set APSR.Q - confirm. */
+  __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+  /* The 64-bit accumulator is split into two 32-bit registers; "0"/"1" tie each input half to the matching output, and the halves swap on big-endian builds. */
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+  /* The 64-bit accumulator is split into two 32-bit registers; "0"/"1" tie each input half to the matching output, and the halves swap on big-endian builds. */
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSD  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SMUSD on two register operands, emitted as volatile asm so it is never merged or removed by the compiler. */
+  __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* Single SMUSDX on two register operands, emitted as volatile asm so it is never merged or removed by the compiler. */
+  __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+  /* op3 is passed as the third source register. NOTE(review): volatile presumably because SMLSD can set APSR.Q - confirm. */
+  __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+  uint32_t result;
+  /* op3 is passed as the third source register. NOTE(review): volatile presumably because SMLSDX can set APSR.Q - confirm. */
+  __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+  /* The 64-bit accumulator is split into two 32-bit registers; "0"/"1" tie each input half to the matching output, and the halves swap on big-endian builds. */
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+  union llreg_u{
+    uint32_t w32[2];
+    uint64_t w64;
+  } llr;
+  llr.w64 = acc;
+  /* The 64-bit accumulator is split into two 32-bit registers; "0"/"1" tie each input half to the matching output, and the halves swap on big-endian builds. */
+#ifndef __ARMEB__   /* Little endian */
+  __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else               /* Big endian */
+  __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+  return(llr.w64);
+}
+
+__STATIC_FORCEINLINE uint32_t __SEL  (uint32_t op1, uint32_t op2)
+{
+  uint32_t result;
+  /* NOTE(review): volatile presumably because SEL depends on the APSR.GE flags, which are not listed as an input - confirm. */
+  __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE  int32_t __QADD( int32_t op1,  int32_t op2)
+{
+  int32_t result;
+  /* NOTE(review): volatile presumably because QADD can set the APSR.Q flag, which is not modelled as an output - confirm. */
+  __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+__STATIC_FORCEINLINE  int32_t __QSUB( int32_t op1,  int32_t op2)
+{
+  int32_t result;
+  /* NOTE(review): volatile presumably because QSUB can set the APSR.Q flag, which is not modelled as an output - confirm. */
+  __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+  return(result);
+}
+
+#if 0  /* asm-based variants are compiled out; the C macros after the #endif are the active definitions */
+#define __PKHBT(ARG1,ARG2,ARG3) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  __ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
+
+#define __PKHTB(ARG1,ARG2,ARG3) \
+({                          \
+  uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+  if (ARG3 == 0) \
+    __ASM ("pkhtb %0, %1, %2" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2)  ); \
+  else \
+    __ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) :  "r" (__ARG1), "r" (__ARG2), "I" (ARG3)  ); \
+  __RES; \
+ })
+#endif
+/* Portable versions: ARG1 supplies one halfword, the shifted ARG2 the other. NOTE(review): >> on uint32_t is a logical shift whereas the disabled asm uses asr - confirm both agree for the shift amounts in use. */
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+
+__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
+{
+ int32_t result;
+ /* Single SMMLA with op1, op2, op3 as the three source registers in that order; non-volatile asm. */
+ __ASM ("smmla %0, %1, %2, %3" : "=r" (result): "r"  (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+#endif /* (__ARM_FEATURE_DSP == 1) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#pragma GCC diagnostic pop
+
+#endif /* __CMSIS_GCC_H */
diff --git a/common/mps2/cmsis_nvic.h b/common/mps2/cmsis_nvic.h
new file mode 100644
index 0000000..d71cfe6
--- /dev/null
+++ b/common/mps2/cmsis_nvic.h
@@ -0,0 +1,47 @@
+/* MPS2 CMSIS Library
+*
+* Copyright (c) 2006-2018 ARM Limited
+* SPDX-License-Identifier: BSD-3-Clause
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+* 1. Redistributions of source code must retain the above copyright notice,
+* this list of conditions and the following disclaimer.
+*
+* 2. Redistributions in binary form must reproduce the above copyright notice,
+* this list of conditions and the following disclaimer in the documentation
+* and/or other materials provided with the distribution.
+*
+* 3. Neither the name of the copyright holder nor the names of its contributors
+* may be used to endorse or promote products derived from this software without
+* specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+* POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#ifndef MBED_CMSIS_NVIC_H
+#define MBED_CMSIS_NVIC_H
+
+#include "memory_zones.h"
+
+#define NVIC_NUM_VECTORS        (16 + 48)          // NOTE(review): presumably 16 core exception slots + 48 device interrupts - confirm
+#define NVIC_RAM_VECTOR_ADDRESS ZBT_SRAM2_START    // Location of vectors in RAM
+
+/*
+ * Size of the whole vector table in bytes. Each vector is on 32 bits.
+ */
+#define NVIC_VECTORS_SIZE       (NVIC_NUM_VECTORS * 4)
+
+#endif
diff --git a/common/mps2/cmsis_version.h b/common/mps2/cmsis_version.h
new file mode 100644
index 0000000..2f048e4
--- /dev/null
+++ b/common/mps2/cmsis_version.h
@@ -0,0 +1,39 @@
+/**************************************************************************//**
+ * @file     cmsis_version.h
+ * @brief    CMSIS Core(M) Version definitions
+ * @version  V5.0.4
+ * @date     23. July 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2019 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header   /* treat file as system include file */
+#endif
+
+#ifndef __CMSIS_VERSION_H
+#define __CMSIS_VERSION_H
+
+/*  CMSIS Version definitions: __CM_CMSIS_VERSION packs MAIN into bits [31:16] and SUB into bits [15:0] */
+#define __CM_CMSIS_VERSION_MAIN  ( 5U)                                      /*!< [31:16] CMSIS Core(M) main version */
+#define __CM_CMSIS_VERSION_SUB   ( 4U)                                      /*!< [15:0]  CMSIS Core(M) sub version */
+#define __CM_CMSIS_VERSION       ((__CM_CMSIS_VERSION_MAIN << 16U) | \
+                                   __CM_CMSIS_VERSION_SUB           )       /*!< CMSIS Core(M) version number */
+#endif
diff --git a/common/mps2/core_cm4.h b/common/mps2/core_cm4.h
new file mode 100644
index 0000000..4e0e886
--- /dev/null
+++ b/common/mps2/core_cm4.h
@@ -0,0 +1,2129 @@
+/**************************************************************************//**
+ * @file     core_cm4.h
+ * @brief    CMSIS Cortex-M4 Core Peripheral Access Layer Header File
+ * @version  V5.1.1
+ * @date     27. March 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header   /* treat file as system include file */
+#endif
+
+#ifndef __CORE_CM4_H_GENERIC
+#define __CORE_CM4_H_GENERIC
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/**
+  \page CMSIS_MISRA_Exceptions  MISRA-C:2004 Compliance Exceptions
+  CMSIS violates the following MISRA-C:2004 rules:
+
+   \li Required Rule 8.5, object/function definition in header file.<br>
+     Function definitions in header files are used to allow 'inlining'.
+
+   \li Required Rule 18.4, declaration of union type or object of union type: '{...}'.<br>
+     Unions are used for effective representation of core registers.
+
+   \li Advisory Rule 19.7, Function-like macro defined.<br>
+     Function-like macros are used to allow more efficient code.
+ */
+
+
+/*******************************************************************************
+ *                 CMSIS definitions
+ ******************************************************************************/
+/**
+  \ingroup Cortex_M4
+  @{
+ */
+
+#include "cmsis_version.h"
+
+/* CMSIS CM4 definitions */
+#define __CM4_CMSIS_VERSION_MAIN  (__CM_CMSIS_VERSION_MAIN)              /*!< \deprecated [31:16] CMSIS HAL main version */
+#define __CM4_CMSIS_VERSION_SUB   (__CM_CMSIS_VERSION_SUB)               /*!< \deprecated [15:0]  CMSIS HAL sub version */
+#define __CM4_CMSIS_VERSION       ((__CM4_CMSIS_VERSION_MAIN << 16U) | \
+                                    __CM4_CMSIS_VERSION_SUB           )  /*!< \deprecated CMSIS HAL version number */
+
+#define __CORTEX_M                (4U)                                   /*!< Cortex-M Core */
+
+/** __FPU_USED indicates whether an FPU is used or not: it is 1U only if the compiler's FPU feature macro is set and __FPU_PRESENT == 1U.
+    For this, __FPU_PRESENT has to be checked prior to making use of FPU specific registers and functions.
+*/
+#if defined ( __CC_ARM )
+  #if defined __TARGET_FPU_VFP
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
+  #if defined __ARM_FP
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __GNUC__ )
+  #if defined (__VFP_FP__) && !defined(__SOFTFP__)
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __ICCARM__ )
+  #if defined __ARMVFP__
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __TI_ARM__ )
+  #if defined __TI_VFP_SUPPORT__
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __TASKING__ )
+  #if defined __FPU_VFP__
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#elif defined ( __CSMC__ )
+  #if ( __CSMC__ & 0x400U)
+    #if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)
+      #define __FPU_USED       1U
+    #else
+      #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+      #define __FPU_USED       0U
+    #endif
+  #else
+    #define __FPU_USED         0U
+  #endif
+
+#endif
+
+#include "cmsis_compiler.h"               /* CMSIS compiler specific defines */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CM4_H_GENERIC */
+
+#ifndef __CMSIS_GENERIC
+
+#ifndef __CORE_CM4_H_DEPENDANT
+#define __CORE_CM4_H_DEPENDANT
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* check device defines and use defaults */
+#if defined __CHECK_DEVICE_DEFINES
+  #ifndef __CM4_REV
+    #define __CM4_REV               0x0000U
+    #warning "__CM4_REV not defined in device header file; using default!"
+  #endif
+
+  #ifndef __FPU_PRESENT
+    #define __FPU_PRESENT             0U
+    #warning "__FPU_PRESENT not defined in device header file; using default!"
+  #endif
+
+  #ifndef __MPU_PRESENT
+    #define __MPU_PRESENT             0U
+    #warning "__MPU_PRESENT not defined in device header file; using default!"
+  #endif
+
+  #ifndef __VTOR_PRESENT
+    #define __VTOR_PRESENT             1U
+    #warning "__VTOR_PRESENT not defined in device header file; using default!"
+  #endif
+  
+  #ifndef __NVIC_PRIO_BITS
+    #define __NVIC_PRIO_BITS          3U
+    #warning "__NVIC_PRIO_BITS not defined in device header file; using default!"
+  #endif
+
+  #ifndef __Vendor_SysTickConfig
+    #define __Vendor_SysTickConfig    0U
+    #warning "__Vendor_SysTickConfig not defined in device header file; using default!"
+  #endif
+#endif
+
+/* IO definitions (access restrictions to peripheral registers) */
+/**
+    \defgroup CMSIS_glob_defs CMSIS Global Defines
+
+    <strong>IO Type Qualifiers</strong> are used
+    \li to specify the access to peripheral variables.
+    \li for automatic generation of peripheral register debug information.
+*/
+#ifdef __cplusplus
+  #define   __I     volatile             /*!< Defines 'read only' permissions */
+#else
+  #define   __I     volatile const       /*!< Defines 'read only' permissions */
+#endif
+#define     __O     volatile             /*!< Defines 'write only' permissions */
+#define     __IO    volatile             /*!< Defines 'read / write' permissions */
+
+/* following defines should be used for structure members */
+#define     __IM     volatile const      /*!< Defines 'read only' structure member permissions */
+#define     __OM     volatile            /*!< Defines 'write only' structure member permissions */
+#define     __IOM    volatile            /*!< Defines 'read / write' structure member permissions */
+
+/*@} end of group Cortex_M4 */
+
+
+
+/*******************************************************************************
+ *                 Register Abstraction
+  Core Register contain:
+  - Core Register
+  - Core NVIC Register
+  - Core SCB Register
+  - Core SysTick Register
+  - Core Debug Register
+  - Core MPU Register
+  - Core FPU Register
+ ******************************************************************************/
+/**
+  \defgroup CMSIS_core_register Defines and Type Definitions
+  \brief Type definitions and defines for Cortex-M processor based devices.
+*/
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_CORE  Status and Control Registers
+  \brief      Core Register type definitions.
+  @{
+ */
+
+/**
+  \brief  Union type to access the Application Program Status Register (APSR), either per field (b) or as one 32-bit word (w).
+ */
+typedef union
+{
+  struct
+  {
+    uint32_t _reserved0:16;              /*!< bit:  0..15  Reserved */
+    uint32_t GE:4;                       /*!< bit: 16..19  Greater than or Equal flags */
+    uint32_t _reserved1:7;               /*!< bit: 20..26  Reserved */
+    uint32_t Q:1;                        /*!< bit:     27  Saturation condition flag */
+    uint32_t V:1;                        /*!< bit:     28  Overflow condition code flag */
+    uint32_t C:1;                        /*!< bit:     29  Carry condition code flag */
+    uint32_t Z:1;                        /*!< bit:     30  Zero condition code flag */
+    uint32_t N:1;                        /*!< bit:     31  Negative condition code flag */
+  } b;                                   /*!< Structure used for bit  access */
+  uint32_t w;                            /*!< Type      used for word access */
+} APSR_Type;
+
+/* APSR Register Definitions */
+#define APSR_N_Pos                         31U                                            /*!< APSR: N Position */
+#define APSR_N_Msk                         (1UL << APSR_N_Pos)                            /*!< APSR: N Mask */
+
+#define APSR_Z_Pos                         30U                                            /*!< APSR: Z Position */
+#define APSR_Z_Msk                         (1UL << APSR_Z_Pos)                            /*!< APSR: Z Mask */
+
+#define APSR_C_Pos                         29U                                            /*!< APSR: C Position */
+#define APSR_C_Msk                         (1UL << APSR_C_Pos)                            /*!< APSR: C Mask */
+
+#define APSR_V_Pos                         28U                                            /*!< APSR: V Position */
+#define APSR_V_Msk                         (1UL << APSR_V_Pos)                            /*!< APSR: V Mask */
+
+#define APSR_Q_Pos                         27U                                            /*!< APSR: Q Position */
+#define APSR_Q_Msk                         (1UL << APSR_Q_Pos)                            /*!< APSR: Q Mask */
+
+#define APSR_GE_Pos                        16U                                            /*!< APSR: GE Position */
+#define APSR_GE_Msk                        (0xFUL << APSR_GE_Pos)                         /*!< APSR: GE Mask */
+
+
+/**
+  \brief  Union type to access the Interrupt Program Status Register (IPSR), either as the ISR bit-field (member b) or as one 32-bit word (member w).
+ */
+typedef union
+{
+  struct
+  {
+    uint32_t ISR:9;                      /*!< bit:  0.. 8  Exception number */
+    uint32_t _reserved0:23;              /*!< bit:  9..31  Reserved */
+  } b;                                   /*!< Bit-field view: one member per field */
+  uint32_t w;                            /*!< Word view: the whole register as one 32-bit value */
+} IPSR_Type;
+
+/* IPSR Register Definitions (the shift is left commented out because the field starts at bit 0) */
+#define IPSR_ISR_Pos                        0U                                            /*!< IPSR: ISR Position */
+#define IPSR_ISR_Msk                       (0x1FFUL /*<< IPSR_ISR_Pos*/)                  /*!< IPSR: ISR Mask (9 bits wide) */
+
+
+/**
+  \brief  Union type to access the Special-Purpose Program Status Registers (xPSR); the ICI/IT field is split into ICI_IT_1 (bits 10..15) and ICI_IT_2 (bits 25..26).
+ */
+typedef union
+{
+  struct
+  {
+    uint32_t ISR:9;                      /*!< bit:  0.. 8  Exception number */
+    uint32_t _reserved0:1;               /*!< bit:      9  Reserved */
+    uint32_t ICI_IT_1:6;                 /*!< bit: 10..15  ICI/IT part 1 */
+    uint32_t GE:4;                       /*!< bit: 16..19  Greater than or Equal flags */
+    uint32_t _reserved1:4;               /*!< bit: 20..23  Reserved */
+    uint32_t T:1;                        /*!< bit:     24  Thumb bit */
+    uint32_t ICI_IT_2:2;                 /*!< bit: 25..26  ICI/IT part 2 */
+    uint32_t Q:1;                        /*!< bit:     27  Saturation condition flag */
+    uint32_t V:1;                        /*!< bit:     28  Overflow condition code flag */
+    uint32_t C:1;                        /*!< bit:     29  Carry condition code flag */
+    uint32_t Z:1;                        /*!< bit:     30  Zero condition code flag */
+    uint32_t N:1;                        /*!< bit:     31  Negative condition code flag */
+  } b;                                   /*!< Bit-field view: one member per flag/field */
+  uint32_t w;                            /*!< Word view: the whole register as one 32-bit value */
+} xPSR_Type;
+
+/* xPSR Register Definitions: <field>_Pos is the lowest bit number of the field, <field>_Msk its mask already shifted into place */
+#define xPSR_N_Pos                         31U                                            /*!< xPSR: N Position */
+#define xPSR_N_Msk                         (1UL << xPSR_N_Pos)                            /*!< xPSR: N Mask */
+
+#define xPSR_Z_Pos                         30U                                            /*!< xPSR: Z Position */
+#define xPSR_Z_Msk                         (1UL << xPSR_Z_Pos)                            /*!< xPSR: Z Mask */
+
+#define xPSR_C_Pos                         29U                                            /*!< xPSR: C Position */
+#define xPSR_C_Msk                         (1UL << xPSR_C_Pos)                            /*!< xPSR: C Mask */
+
+#define xPSR_V_Pos                         28U                                            /*!< xPSR: V Position */
+#define xPSR_V_Msk                         (1UL << xPSR_V_Pos)                            /*!< xPSR: V Mask */
+
+#define xPSR_Q_Pos                         27U                                            /*!< xPSR: Q Position */
+#define xPSR_Q_Msk                         (1UL << xPSR_Q_Pos)                            /*!< xPSR: Q Mask */
+
+#define xPSR_ICI_IT_2_Pos                  25U                                            /*!< xPSR: ICI/IT part 2 Position */
+#define xPSR_ICI_IT_2_Msk                  (3UL << xPSR_ICI_IT_2_Pos)                     /*!< xPSR: ICI/IT part 2 Mask (2 bits wide) */
+
+#define xPSR_T_Pos                         24U                                            /*!< xPSR: T Position */
+#define xPSR_T_Msk                         (1UL << xPSR_T_Pos)                            /*!< xPSR: T Mask */
+
+#define xPSR_GE_Pos                        16U                                            /*!< xPSR: GE Position */
+#define xPSR_GE_Msk                        (0xFUL << xPSR_GE_Pos)                         /*!< xPSR: GE Mask (4 bits wide) */
+
+#define xPSR_ICI_IT_1_Pos                  10U                                            /*!< xPSR: ICI/IT part 1 Position */
+#define xPSR_ICI_IT_1_Msk                  (0x3FUL << xPSR_ICI_IT_1_Pos)                  /*!< xPSR: ICI/IT part 1 Mask (6 bits wide) */
+
+#define xPSR_ISR_Pos                        0U                                            /*!< xPSR: ISR Position */
+#define xPSR_ISR_Msk                       (0x1FFUL /*<< xPSR_ISR_Pos*/)                  /*!< xPSR: ISR Mask (9 bits wide) */
+
+
+/**
+  \brief  Union type to access the Control Registers (CONTROL), either per bit (member b) or as one 32-bit word (member w).
+ */
+typedef union
+{
+  struct
+  {
+    uint32_t nPRIV:1;                    /*!< bit:      0  Execution privilege in Thread mode */
+    uint32_t SPSEL:1;                    /*!< bit:      1  Stack to be used */
+    uint32_t FPCA:1;                     /*!< bit:      2  FP extension active flag */
+    uint32_t _reserved0:29;              /*!< bit:  3..31  Reserved */
+  } b;                                   /*!< Bit-field view: one member per control bit */
+  uint32_t w;                            /*!< Word view: the whole register as one 32-bit value */
+} CONTROL_Type;
+
+/* CONTROL Register Definitions: <field>_Pos is the bit number, <field>_Msk the single-bit mask at that position */
+#define CONTROL_FPCA_Pos                    2U                                            /*!< CONTROL: FPCA Position */
+#define CONTROL_FPCA_Msk                   (1UL << CONTROL_FPCA_Pos)                      /*!< CONTROL: FPCA Mask */
+
+#define CONTROL_SPSEL_Pos                   1U                                            /*!< CONTROL: SPSEL Position */
+#define CONTROL_SPSEL_Msk                  (1UL << CONTROL_SPSEL_Pos)                     /*!< CONTROL: SPSEL Mask */
+
+#define CONTROL_nPRIV_Pos                   0U                                            /*!< CONTROL: nPRIV Position */
+#define CONTROL_nPRIV_Msk                  (1UL /*<< CONTROL_nPRIV_Pos*/)                 /*!< CONTROL: nPRIV Mask */
+
+/*@} end of group CMSIS_CORE */
+
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_NVIC  Nested Vectored Interrupt Controller (NVIC)
+  \brief      Type definitions for the NVIC Registers
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Nested Vectored Interrupt Controller (NVIC); reserved arrays pad each register bank to its documented offset.
+ */
+typedef struct
+{
+  __IOM uint32_t ISER[8U];               /*!< Offset: 0x000 (R/W)  Interrupt Set Enable Register */
+        uint32_t RESERVED0[24U];         /*!< Offset: 0x020        Reserved (pads up to ICER at 0x080) */
+  __IOM uint32_t ICER[8U];               /*!< Offset: 0x080 (R/W)  Interrupt Clear Enable Register */
+        uint32_t RESERVED1[24U];         /*!< Offset: 0x0A0        Reserved (pads up to ISPR at 0x100) */
+  __IOM uint32_t ISPR[8U];               /*!< Offset: 0x100 (R/W)  Interrupt Set Pending Register */
+        uint32_t RESERVED2[24U];         /*!< Offset: 0x120        Reserved (pads up to ICPR at 0x180) */
+  __IOM uint32_t ICPR[8U];               /*!< Offset: 0x180 (R/W)  Interrupt Clear Pending Register */
+        uint32_t RESERVED3[24U];         /*!< Offset: 0x1A0        Reserved (pads up to IABR at 0x200) */
+  __IOM uint32_t IABR[8U];               /*!< Offset: 0x200 (R/W)  Interrupt Active bit Register */
+        uint32_t RESERVED4[56U];         /*!< Offset: 0x220        Reserved (pads up to IP at 0x300) */
+  __IOM uint8_t  IP[240U];               /*!< Offset: 0x300 (R/W)  Interrupt Priority Register (8Bit wide) */
+        uint32_t RESERVED5[644U];        /*!< Offset: 0x3F0        Reserved (pads up to STIR at 0xE00) */
+  __OM  uint32_t STIR;                   /*!< Offset: 0xE00 ( /W)  Software Trigger Interrupt Register */
+}  NVIC_Type;
+
+/* Software Triggered Interrupt Register Definitions */
+#define NVIC_STIR_INTID_Pos                 0U                                         /*!< STIR: INTID Position */
+#define NVIC_STIR_INTID_Msk                (0x1FFUL /*<< NVIC_STIR_INTID_Pos*/)        /*!< STIR: INTID Mask (9 bits wide) */
+
+/*@} end of group CMSIS_NVIC */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_SCB     System Control Block (SCB)
+  \brief    Type definitions for the System Control Block Registers
+  @{
+ */
+
+/**
+  \brief  Structure type to access the System Control Block (SCB); member order and sizes reproduce the register offsets given in the member comments.
+ */
+typedef struct
+{
+  __IM  uint32_t CPUID;                  /*!< Offset: 0x000 (R/ )  CPUID Base Register */
+  __IOM uint32_t ICSR;                   /*!< Offset: 0x004 (R/W)  Interrupt Control and State Register */
+  __IOM uint32_t VTOR;                   /*!< Offset: 0x008 (R/W)  Vector Table Offset Register */
+  __IOM uint32_t AIRCR;                  /*!< Offset: 0x00C (R/W)  Application Interrupt and Reset Control Register */
+  __IOM uint32_t SCR;                    /*!< Offset: 0x010 (R/W)  System Control Register */
+  __IOM uint32_t CCR;                    /*!< Offset: 0x014 (R/W)  Configuration Control Register */
+  __IOM uint8_t  SHP[12U];               /*!< Offset: 0x018 (R/W)  System Handlers Priority Registers (4-7, 8-11, 12-15) */
+  __IOM uint32_t SHCSR;                  /*!< Offset: 0x024 (R/W)  System Handler Control and State Register */
+  __IOM uint32_t CFSR;                   /*!< Offset: 0x028 (R/W)  Configurable Fault Status Register */
+  __IOM uint32_t HFSR;                   /*!< Offset: 0x02C (R/W)  HardFault Status Register */
+  __IOM uint32_t DFSR;                   /*!< Offset: 0x030 (R/W)  Debug Fault Status Register */
+  __IOM uint32_t MMFAR;                  /*!< Offset: 0x034 (R/W)  MemManage Fault Address Register */
+  __IOM uint32_t BFAR;                   /*!< Offset: 0x038 (R/W)  BusFault Address Register */
+  __IOM uint32_t AFSR;                   /*!< Offset: 0x03C (R/W)  Auxiliary Fault Status Register */
+  __IM  uint32_t PFR[2U];                /*!< Offset: 0x040 (R/ )  Processor Feature Register */
+  __IM  uint32_t DFR;                    /*!< Offset: 0x048 (R/ )  Debug Feature Register */
+  __IM  uint32_t ADR;                    /*!< Offset: 0x04C (R/ )  Auxiliary Feature Register */
+  __IM  uint32_t MMFR[4U];               /*!< Offset: 0x050 (R/ )  Memory Model Feature Register */
+  __IM  uint32_t ISAR[5U];               /*!< Offset: 0x060 (R/ )  Instruction Set Attributes Register */
+        uint32_t RESERVED0[5U];          /*!< Offset: 0x074        Reserved (pads up to CPACR at 0x088) */
+  __IOM uint32_t CPACR;                  /*!< Offset: 0x088 (R/W)  Coprocessor Access Control Register */
+} SCB_Type;
+
+/* SCB CPUID Register Definitions (SCB_Type member CPUID, offset 0x000) */
+#define SCB_CPUID_IMPLEMENTER_Pos          24U                                            /*!< SCB CPUID: IMPLEMENTER Position */
+#define SCB_CPUID_IMPLEMENTER_Msk          (0xFFUL << SCB_CPUID_IMPLEMENTER_Pos)          /*!< SCB CPUID: IMPLEMENTER Mask */
+
+#define SCB_CPUID_VARIANT_Pos              20U                                            /*!< SCB CPUID: VARIANT Position */
+#define SCB_CPUID_VARIANT_Msk              (0xFUL << SCB_CPUID_VARIANT_Pos)               /*!< SCB CPUID: VARIANT Mask */
+
+#define SCB_CPUID_ARCHITECTURE_Pos         16U                                            /*!< SCB CPUID: ARCHITECTURE Position */
+#define SCB_CPUID_ARCHITECTURE_Msk         (0xFUL << SCB_CPUID_ARCHITECTURE_Pos)          /*!< SCB CPUID: ARCHITECTURE Mask */
+
+#define SCB_CPUID_PARTNO_Pos                4U                                            /*!< SCB CPUID: PARTNO Position */
+#define SCB_CPUID_PARTNO_Msk               (0xFFFUL << SCB_CPUID_PARTNO_Pos)              /*!< SCB CPUID: PARTNO Mask */
+
+#define SCB_CPUID_REVISION_Pos              0U                                            /*!< SCB CPUID: REVISION Position */
+#define SCB_CPUID_REVISION_Msk             (0xFUL /*<< SCB_CPUID_REVISION_Pos*/)          /*!< SCB CPUID: REVISION Mask */
+
+/* SCB Interrupt Control State Register Definitions (SCB_Type member ICSR, offset 0x004) */
+#define SCB_ICSR_NMIPENDSET_Pos            31U                                            /*!< SCB ICSR: NMIPENDSET Position */
+#define SCB_ICSR_NMIPENDSET_Msk            (1UL << SCB_ICSR_NMIPENDSET_Pos)               /*!< SCB ICSR: NMIPENDSET Mask */
+
+#define SCB_ICSR_PENDSVSET_Pos             28U                                            /*!< SCB ICSR: PENDSVSET Position */
+#define SCB_ICSR_PENDSVSET_Msk             (1UL << SCB_ICSR_PENDSVSET_Pos)                /*!< SCB ICSR: PENDSVSET Mask */
+
+#define SCB_ICSR_PENDSVCLR_Pos             27U                                            /*!< SCB ICSR: PENDSVCLR Position */
+#define SCB_ICSR_PENDSVCLR_Msk             (1UL << SCB_ICSR_PENDSVCLR_Pos)                /*!< SCB ICSR: PENDSVCLR Mask */
+
+#define SCB_ICSR_PENDSTSET_Pos             26U                                            /*!< SCB ICSR: PENDSTSET Position */
+#define SCB_ICSR_PENDSTSET_Msk             (1UL << SCB_ICSR_PENDSTSET_Pos)                /*!< SCB ICSR: PENDSTSET Mask */
+
+#define SCB_ICSR_PENDSTCLR_Pos             25U                                            /*!< SCB ICSR: PENDSTCLR Position */
+#define SCB_ICSR_PENDSTCLR_Msk             (1UL << SCB_ICSR_PENDSTCLR_Pos)                /*!< SCB ICSR: PENDSTCLR Mask */
+
+#define SCB_ICSR_ISRPREEMPT_Pos            23U                                            /*!< SCB ICSR: ISRPREEMPT Position */
+#define SCB_ICSR_ISRPREEMPT_Msk            (1UL << SCB_ICSR_ISRPREEMPT_Pos)               /*!< SCB ICSR: ISRPREEMPT Mask */
+
+#define SCB_ICSR_ISRPENDING_Pos            22U                                            /*!< SCB ICSR: ISRPENDING Position */
+#define SCB_ICSR_ISRPENDING_Msk            (1UL << SCB_ICSR_ISRPENDING_Pos)               /*!< SCB ICSR: ISRPENDING Mask */
+
+#define SCB_ICSR_VECTPENDING_Pos           12U                                            /*!< SCB ICSR: VECTPENDING Position */
+#define SCB_ICSR_VECTPENDING_Msk           (0x1FFUL << SCB_ICSR_VECTPENDING_Pos)          /*!< SCB ICSR: VECTPENDING Mask */
+
+#define SCB_ICSR_RETTOBASE_Pos             11U                                            /*!< SCB ICSR: RETTOBASE Position */
+#define SCB_ICSR_RETTOBASE_Msk             (1UL << SCB_ICSR_RETTOBASE_Pos)                /*!< SCB ICSR: RETTOBASE Mask */
+
+#define SCB_ICSR_VECTACTIVE_Pos             0U                                            /*!< SCB ICSR: VECTACTIVE Position */
+#define SCB_ICSR_VECTACTIVE_Msk            (0x1FFUL /*<< SCB_ICSR_VECTACTIVE_Pos*/)       /*!< SCB ICSR: VECTACTIVE Mask */
+
+/* SCB Vector Table Offset Register Definitions (SCB_Type member VTOR, offset 0x008) */
+#define SCB_VTOR_TBLOFF_Pos                 7U                                            /*!< SCB VTOR: TBLOFF Position */
+#define SCB_VTOR_TBLOFF_Msk                (0x1FFFFFFUL << SCB_VTOR_TBLOFF_Pos)           /*!< SCB VTOR: TBLOFF Mask */
+
+/* SCB Application Interrupt and Reset Control Register Definitions (SCB_Type member AIRCR, offset 0x00C) */
+#define SCB_AIRCR_VECTKEY_Pos              16U                                            /*!< SCB AIRCR: VECTKEY Position */
+#define SCB_AIRCR_VECTKEY_Msk              (0xFFFFUL << SCB_AIRCR_VECTKEY_Pos)            /*!< SCB AIRCR: VECTKEY Mask */
+
+#define SCB_AIRCR_VECTKEYSTAT_Pos          16U                                            /*!< SCB AIRCR: VECTKEYSTAT Position (same bits as VECTKEY) */
+#define SCB_AIRCR_VECTKEYSTAT_Msk          (0xFFFFUL << SCB_AIRCR_VECTKEYSTAT_Pos)        /*!< SCB AIRCR: VECTKEYSTAT Mask (same bits as VECTKEY) */
+
+#define SCB_AIRCR_ENDIANESS_Pos            15U                                            /*!< SCB AIRCR: ENDIANESS Position */
+#define SCB_AIRCR_ENDIANESS_Msk            (1UL << SCB_AIRCR_ENDIANESS_Pos)               /*!< SCB AIRCR: ENDIANESS Mask */
+
+#define SCB_AIRCR_PRIGROUP_Pos              8U                                            /*!< SCB AIRCR: PRIGROUP Position */
+#define SCB_AIRCR_PRIGROUP_Msk             (7UL << SCB_AIRCR_PRIGROUP_Pos)                /*!< SCB AIRCR: PRIGROUP Mask */
+
+#define SCB_AIRCR_SYSRESETREQ_Pos           2U                                            /*!< SCB AIRCR: SYSRESETREQ Position */
+#define SCB_AIRCR_SYSRESETREQ_Msk          (1UL << SCB_AIRCR_SYSRESETREQ_Pos)             /*!< SCB AIRCR: SYSRESETREQ Mask */
+
+#define SCB_AIRCR_VECTCLRACTIVE_Pos         1U                                            /*!< SCB AIRCR: VECTCLRACTIVE Position */
+#define SCB_AIRCR_VECTCLRACTIVE_Msk        (1UL << SCB_AIRCR_VECTCLRACTIVE_Pos)           /*!< SCB AIRCR: VECTCLRACTIVE Mask */
+
+#define SCB_AIRCR_VECTRESET_Pos             0U                                            /*!< SCB AIRCR: VECTRESET Position */
+#define SCB_AIRCR_VECTRESET_Msk            (1UL /*<< SCB_AIRCR_VECTRESET_Pos*/)           /*!< SCB AIRCR: VECTRESET Mask */
+
+/* SCB System Control Register Definitions (SCB_Type member SCR, offset 0x010) */
+#define SCB_SCR_SEVONPEND_Pos               4U                                            /*!< SCB SCR: SEVONPEND Position */
+#define SCB_SCR_SEVONPEND_Msk              (1UL << SCB_SCR_SEVONPEND_Pos)                 /*!< SCB SCR: SEVONPEND Mask */
+
+#define SCB_SCR_SLEEPDEEP_Pos               2U                                            /*!< SCB SCR: SLEEPDEEP Position */
+#define SCB_SCR_SLEEPDEEP_Msk              (1UL << SCB_SCR_SLEEPDEEP_Pos)                 /*!< SCB SCR: SLEEPDEEP Mask */
+
+#define SCB_SCR_SLEEPONEXIT_Pos             1U                                            /*!< SCB SCR: SLEEPONEXIT Position */
+#define SCB_SCR_SLEEPONEXIT_Msk            (1UL << SCB_SCR_SLEEPONEXIT_Pos)               /*!< SCB SCR: SLEEPONEXIT Mask */
+
+/* SCB Configuration Control Register Definitions (SCB_Type member CCR, offset 0x014) */
+#define SCB_CCR_STKALIGN_Pos                9U                                            /*!< SCB CCR: STKALIGN Position */
+#define SCB_CCR_STKALIGN_Msk               (1UL << SCB_CCR_STKALIGN_Pos)                  /*!< SCB CCR: STKALIGN Mask */
+
+#define SCB_CCR_BFHFNMIGN_Pos               8U                                            /*!< SCB CCR: BFHFNMIGN Position */
+#define SCB_CCR_BFHFNMIGN_Msk              (1UL << SCB_CCR_BFHFNMIGN_Pos)                 /*!< SCB CCR: BFHFNMIGN Mask */
+
+#define SCB_CCR_DIV_0_TRP_Pos               4U                                            /*!< SCB CCR: DIV_0_TRP Position */
+#define SCB_CCR_DIV_0_TRP_Msk              (1UL << SCB_CCR_DIV_0_TRP_Pos)                 /*!< SCB CCR: DIV_0_TRP Mask */
+
+#define SCB_CCR_UNALIGN_TRP_Pos             3U                                            /*!< SCB CCR: UNALIGN_TRP Position */
+#define SCB_CCR_UNALIGN_TRP_Msk            (1UL << SCB_CCR_UNALIGN_TRP_Pos)               /*!< SCB CCR: UNALIGN_TRP Mask */
+
+#define SCB_CCR_USERSETMPEND_Pos            1U                                            /*!< SCB CCR: USERSETMPEND Position */
+#define SCB_CCR_USERSETMPEND_Msk           (1UL << SCB_CCR_USERSETMPEND_Pos)              /*!< SCB CCR: USERSETMPEND Mask */
+
+#define SCB_CCR_NONBASETHRDENA_Pos          0U                                            /*!< SCB CCR: NONBASETHRDENA Position */
+#define SCB_CCR_NONBASETHRDENA_Msk         (1UL /*<< SCB_CCR_NONBASETHRDENA_Pos*/)        /*!< SCB CCR: NONBASETHRDENA Mask */
+
+/* SCB System Handler Control and State Register Definitions (SCB_Type member SHCSR, offset 0x024) */
+#define SCB_SHCSR_USGFAULTENA_Pos          18U                                            /*!< SCB SHCSR: USGFAULTENA Position */
+#define SCB_SHCSR_USGFAULTENA_Msk          (1UL << SCB_SHCSR_USGFAULTENA_Pos)             /*!< SCB SHCSR: USGFAULTENA Mask */
+
+#define SCB_SHCSR_BUSFAULTENA_Pos          17U                                            /*!< SCB SHCSR: BUSFAULTENA Position */
+#define SCB_SHCSR_BUSFAULTENA_Msk          (1UL << SCB_SHCSR_BUSFAULTENA_Pos)             /*!< SCB SHCSR: BUSFAULTENA Mask */
+
+#define SCB_SHCSR_MEMFAULTENA_Pos          16U                                            /*!< SCB SHCSR: MEMFAULTENA Position */
+#define SCB_SHCSR_MEMFAULTENA_Msk          (1UL << SCB_SHCSR_MEMFAULTENA_Pos)             /*!< SCB SHCSR: MEMFAULTENA Mask */
+
+#define SCB_SHCSR_SVCALLPENDED_Pos         15U                                            /*!< SCB SHCSR: SVCALLPENDED Position */
+#define SCB_SHCSR_SVCALLPENDED_Msk         (1UL << SCB_SHCSR_SVCALLPENDED_Pos)            /*!< SCB SHCSR: SVCALLPENDED Mask */
+
+#define SCB_SHCSR_BUSFAULTPENDED_Pos       14U                                            /*!< SCB SHCSR: BUSFAULTPENDED Position */
+#define SCB_SHCSR_BUSFAULTPENDED_Msk       (1UL << SCB_SHCSR_BUSFAULTPENDED_Pos)          /*!< SCB SHCSR: BUSFAULTPENDED Mask */
+
+#define SCB_SHCSR_MEMFAULTPENDED_Pos       13U                                            /*!< SCB SHCSR: MEMFAULTPENDED Position */
+#define SCB_SHCSR_MEMFAULTPENDED_Msk       (1UL << SCB_SHCSR_MEMFAULTPENDED_Pos)          /*!< SCB SHCSR: MEMFAULTPENDED Mask */
+
+#define SCB_SHCSR_USGFAULTPENDED_Pos       12U                                            /*!< SCB SHCSR: USGFAULTPENDED Position */
+#define SCB_SHCSR_USGFAULTPENDED_Msk       (1UL << SCB_SHCSR_USGFAULTPENDED_Pos)          /*!< SCB SHCSR: USGFAULTPENDED Mask */
+
+#define SCB_SHCSR_SYSTICKACT_Pos           11U                                            /*!< SCB SHCSR: SYSTICKACT Position */
+#define SCB_SHCSR_SYSTICKACT_Msk           (1UL << SCB_SHCSR_SYSTICKACT_Pos)              /*!< SCB SHCSR: SYSTICKACT Mask */
+
+#define SCB_SHCSR_PENDSVACT_Pos            10U                                            /*!< SCB SHCSR: PENDSVACT Position */
+#define SCB_SHCSR_PENDSVACT_Msk            (1UL << SCB_SHCSR_PENDSVACT_Pos)               /*!< SCB SHCSR: PENDSVACT Mask */
+
+#define SCB_SHCSR_MONITORACT_Pos            8U                                            /*!< SCB SHCSR: MONITORACT Position */
+#define SCB_SHCSR_MONITORACT_Msk           (1UL << SCB_SHCSR_MONITORACT_Pos)              /*!< SCB SHCSR: MONITORACT Mask */
+
+#define SCB_SHCSR_SVCALLACT_Pos             7U                                            /*!< SCB SHCSR: SVCALLACT Position */
+#define SCB_SHCSR_SVCALLACT_Msk            (1UL << SCB_SHCSR_SVCALLACT_Pos)               /*!< SCB SHCSR: SVCALLACT Mask */
+
+#define SCB_SHCSR_USGFAULTACT_Pos           3U                                            /*!< SCB SHCSR: USGFAULTACT Position */
+#define SCB_SHCSR_USGFAULTACT_Msk          (1UL << SCB_SHCSR_USGFAULTACT_Pos)             /*!< SCB SHCSR: USGFAULTACT Mask */
+
+#define SCB_SHCSR_BUSFAULTACT_Pos           1U                                            /*!< SCB SHCSR: BUSFAULTACT Position */
+#define SCB_SHCSR_BUSFAULTACT_Msk          (1UL << SCB_SHCSR_BUSFAULTACT_Pos)             /*!< SCB SHCSR: BUSFAULTACT Mask */
+
+#define SCB_SHCSR_MEMFAULTACT_Pos           0U                                            /*!< SCB SHCSR: MEMFAULTACT Position */
+#define SCB_SHCSR_MEMFAULTACT_Msk          (1UL /*<< SCB_SHCSR_MEMFAULTACT_Pos*/)         /*!< SCB SHCSR: MEMFAULTACT Mask */
+
+/* SCB Configurable Fault Status Register Definitions (SCB_Type member CFSR, offset 0x028): three sub-fields packed into one word */
+#define SCB_CFSR_USGFAULTSR_Pos            16U                                            /*!< SCB CFSR: Usage Fault Status Register Position */
+#define SCB_CFSR_USGFAULTSR_Msk            (0xFFFFUL << SCB_CFSR_USGFAULTSR_Pos)          /*!< SCB CFSR: Usage Fault Status Register Mask */
+
+#define SCB_CFSR_BUSFAULTSR_Pos             8U                                            /*!< SCB CFSR: Bus Fault Status Register Position */
+#define SCB_CFSR_BUSFAULTSR_Msk            (0xFFUL << SCB_CFSR_BUSFAULTSR_Pos)            /*!< SCB CFSR: Bus Fault Status Register Mask */
+
+#define SCB_CFSR_MEMFAULTSR_Pos             0U                                            /*!< SCB CFSR: Memory Manage Fault Status Register Position */
+#define SCB_CFSR_MEMFAULTSR_Msk            (0xFFUL /*<< SCB_CFSR_MEMFAULTSR_Pos*/)        /*!< SCB CFSR: Memory Manage Fault Status Register Mask */
+
+/* MemManage Fault Status Register (MMFSR, the SCB_CFSR_MEMFAULTSR sub-field of SCB CFSR); bit positions are relative to SCB_CFSR_MEMFAULTSR_Pos */
+#define SCB_CFSR_MMARVALID_Pos             (SCB_CFSR_MEMFAULTSR_Pos + 7U)                 /*!< SCB CFSR (MMFSR): MMARVALID Position */
+#define SCB_CFSR_MMARVALID_Msk             (1UL << SCB_CFSR_MMARVALID_Pos)                /*!< SCB CFSR (MMFSR): MMARVALID Mask */
+
+#define SCB_CFSR_MLSPERR_Pos               (SCB_CFSR_MEMFAULTSR_Pos + 5U)                 /*!< SCB CFSR (MMFSR): MLSPERR Position */
+#define SCB_CFSR_MLSPERR_Msk               (1UL << SCB_CFSR_MLSPERR_Pos)                  /*!< SCB CFSR (MMFSR): MLSPERR Mask */
+
+#define SCB_CFSR_MSTKERR_Pos               (SCB_CFSR_MEMFAULTSR_Pos + 4U)                 /*!< SCB CFSR (MMFSR): MSTKERR Position */
+#define SCB_CFSR_MSTKERR_Msk               (1UL << SCB_CFSR_MSTKERR_Pos)                  /*!< SCB CFSR (MMFSR): MSTKERR Mask */
+
+#define SCB_CFSR_MUNSTKERR_Pos             (SCB_CFSR_MEMFAULTSR_Pos + 3U)                 /*!< SCB CFSR (MMFSR): MUNSTKERR Position */
+#define SCB_CFSR_MUNSTKERR_Msk             (1UL << SCB_CFSR_MUNSTKERR_Pos)                /*!< SCB CFSR (MMFSR): MUNSTKERR Mask */
+
+#define SCB_CFSR_DACCVIOL_Pos              (SCB_CFSR_MEMFAULTSR_Pos + 1U)                 /*!< SCB CFSR (MMFSR): DACCVIOL Position */
+#define SCB_CFSR_DACCVIOL_Msk              (1UL << SCB_CFSR_DACCVIOL_Pos)                 /*!< SCB CFSR (MMFSR): DACCVIOL Mask */
+
+#define SCB_CFSR_IACCVIOL_Pos              (SCB_CFSR_MEMFAULTSR_Pos + 0U)                 /*!< SCB CFSR (MMFSR): IACCVIOL Position */
+#define SCB_CFSR_IACCVIOL_Msk              (1UL /*<< SCB_CFSR_IACCVIOL_Pos*/)             /*!< SCB CFSR (MMFSR): IACCVIOL Mask (shift omitted: position is 0) */
+
+/* BusFault Status Register (BFSR, the SCB_CFSR_BUSFAULTSR sub-field of SCB CFSR); bit positions are relative to SCB_CFSR_BUSFAULTSR_Pos */
+#define SCB_CFSR_BFARVALID_Pos            (SCB_CFSR_BUSFAULTSR_Pos + 7U)                  /*!< SCB CFSR (BFSR): BFARVALID Position */
+#define SCB_CFSR_BFARVALID_Msk            (1UL << SCB_CFSR_BFARVALID_Pos)                 /*!< SCB CFSR (BFSR): BFARVALID Mask */
+
+#define SCB_CFSR_LSPERR_Pos               (SCB_CFSR_BUSFAULTSR_Pos + 5U)                  /*!< SCB CFSR (BFSR): LSPERR Position */
+#define SCB_CFSR_LSPERR_Msk               (1UL << SCB_CFSR_LSPERR_Pos)                    /*!< SCB CFSR (BFSR): LSPERR Mask */
+
+#define SCB_CFSR_STKERR_Pos               (SCB_CFSR_BUSFAULTSR_Pos + 4U)                  /*!< SCB CFSR (BFSR): STKERR Position */
+#define SCB_CFSR_STKERR_Msk               (1UL << SCB_CFSR_STKERR_Pos)                    /*!< SCB CFSR (BFSR): STKERR Mask */
+
+#define SCB_CFSR_UNSTKERR_Pos             (SCB_CFSR_BUSFAULTSR_Pos + 3U)                  /*!< SCB CFSR (BFSR): UNSTKERR Position */
+#define SCB_CFSR_UNSTKERR_Msk             (1UL << SCB_CFSR_UNSTKERR_Pos)                  /*!< SCB CFSR (BFSR): UNSTKERR Mask */
+
+#define SCB_CFSR_IMPRECISERR_Pos          (SCB_CFSR_BUSFAULTSR_Pos + 2U)                  /*!< SCB CFSR (BFSR): IMPRECISERR Position */
+#define SCB_CFSR_IMPRECISERR_Msk          (1UL << SCB_CFSR_IMPRECISERR_Pos)               /*!< SCB CFSR (BFSR): IMPRECISERR Mask */
+
+#define SCB_CFSR_PRECISERR_Pos            (SCB_CFSR_BUSFAULTSR_Pos + 1U)                  /*!< SCB CFSR (BFSR): PRECISERR Position */
+#define SCB_CFSR_PRECISERR_Msk            (1UL << SCB_CFSR_PRECISERR_Pos)                 /*!< SCB CFSR (BFSR): PRECISERR Mask */
+
+#define SCB_CFSR_IBUSERR_Pos              (SCB_CFSR_BUSFAULTSR_Pos + 0U)                  /*!< SCB CFSR (BFSR): IBUSERR Position */
+#define SCB_CFSR_IBUSERR_Msk              (1UL << SCB_CFSR_IBUSERR_Pos)                   /*!< SCB CFSR (BFSR): IBUSERR Mask */
+
+/* UsageFault Status Register (UFSR, the SCB_CFSR_USGFAULTSR sub-field of SCB CFSR); bit positions are relative to SCB_CFSR_USGFAULTSR_Pos */
+#define SCB_CFSR_DIVBYZERO_Pos            (SCB_CFSR_USGFAULTSR_Pos + 9U)                  /*!< SCB CFSR (UFSR): DIVBYZERO Position */
+#define SCB_CFSR_DIVBYZERO_Msk            (1UL << SCB_CFSR_DIVBYZERO_Pos)                 /*!< SCB CFSR (UFSR): DIVBYZERO Mask */
+
+#define SCB_CFSR_UNALIGNED_Pos            (SCB_CFSR_USGFAULTSR_Pos + 8U)                  /*!< SCB CFSR (UFSR): UNALIGNED Position */
+#define SCB_CFSR_UNALIGNED_Msk            (1UL << SCB_CFSR_UNALIGNED_Pos)                 /*!< SCB CFSR (UFSR): UNALIGNED Mask */
+
+#define SCB_CFSR_NOCP_Pos                 (SCB_CFSR_USGFAULTSR_Pos + 3U)                  /*!< SCB CFSR (UFSR): NOCP Position */
+#define SCB_CFSR_NOCP_Msk                 (1UL << SCB_CFSR_NOCP_Pos)                      /*!< SCB CFSR (UFSR): NOCP Mask */
+
+#define SCB_CFSR_INVPC_Pos                (SCB_CFSR_USGFAULTSR_Pos + 2U)                  /*!< SCB CFSR (UFSR): INVPC Position */
+#define SCB_CFSR_INVPC_Msk                (1UL << SCB_CFSR_INVPC_Pos)                     /*!< SCB CFSR (UFSR): INVPC Mask */
+
+#define SCB_CFSR_INVSTATE_Pos             (SCB_CFSR_USGFAULTSR_Pos + 1U)                  /*!< SCB CFSR (UFSR): INVSTATE Position */
+#define SCB_CFSR_INVSTATE_Msk             (1UL << SCB_CFSR_INVSTATE_Pos)                  /*!< SCB CFSR (UFSR): INVSTATE Mask */
+
+#define SCB_CFSR_UNDEFINSTR_Pos           (SCB_CFSR_USGFAULTSR_Pos + 0U)                  /*!< SCB CFSR (UFSR): UNDEFINSTR Position */
+#define SCB_CFSR_UNDEFINSTR_Msk           (1UL << SCB_CFSR_UNDEFINSTR_Pos)                /*!< SCB CFSR (UFSR): UNDEFINSTR Mask */
+
+/* SCB Hard Fault Status Register Definitions (SCB_Type member HFSR, offset 0x02C) */
+#define SCB_HFSR_DEBUGEVT_Pos              31U                                            /*!< SCB HFSR: DEBUGEVT Position */
+#define SCB_HFSR_DEBUGEVT_Msk              (1UL << SCB_HFSR_DEBUGEVT_Pos)                 /*!< SCB HFSR: DEBUGEVT Mask */
+
+#define SCB_HFSR_FORCED_Pos                30U                                            /*!< SCB HFSR: FORCED Position */
+#define SCB_HFSR_FORCED_Msk                (1UL << SCB_HFSR_FORCED_Pos)                   /*!< SCB HFSR: FORCED Mask */
+
+#define SCB_HFSR_VECTTBL_Pos                1U                                            /*!< SCB HFSR: VECTTBL Position */
+#define SCB_HFSR_VECTTBL_Msk               (1UL << SCB_HFSR_VECTTBL_Pos)                  /*!< SCB HFSR: VECTTBL Mask */
+
+/* SCB Debug Fault Status Register Definitions (SCB_Type member DFSR, offset 0x030) */
+#define SCB_DFSR_EXTERNAL_Pos               4U                                            /*!< SCB DFSR: EXTERNAL Position */
+#define SCB_DFSR_EXTERNAL_Msk              (1UL << SCB_DFSR_EXTERNAL_Pos)                 /*!< SCB DFSR: EXTERNAL Mask */
+
+#define SCB_DFSR_VCATCH_Pos                 3U                                            /*!< SCB DFSR: VCATCH Position */
+#define SCB_DFSR_VCATCH_Msk                (1UL << SCB_DFSR_VCATCH_Pos)                   /*!< SCB DFSR: VCATCH Mask */
+
+#define SCB_DFSR_DWTTRAP_Pos                2U                                            /*!< SCB DFSR: DWTTRAP Position */
+#define SCB_DFSR_DWTTRAP_Msk               (1UL << SCB_DFSR_DWTTRAP_Pos)                  /*!< SCB DFSR: DWTTRAP Mask */
+
+#define SCB_DFSR_BKPT_Pos                   1U                                            /*!< SCB DFSR: BKPT Position */
+#define SCB_DFSR_BKPT_Msk                  (1UL << SCB_DFSR_BKPT_Pos)                     /*!< SCB DFSR: BKPT Mask */
+
+#define SCB_DFSR_HALTED_Pos                 0U                                            /*!< SCB DFSR: HALTED Position */
+#define SCB_DFSR_HALTED_Msk                (1UL /*<< SCB_DFSR_HALTED_Pos*/)               /*!< SCB DFSR: HALTED Mask */
+
+/*@} end of group CMSIS_SCB */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_SCnSCB System Controls not in SCB (SCnSCB)
+  \brief    Type definitions for the System Control and ID Register not in the SCB
+  @{
+ */
+
+/**
+  \brief  Structure type to access the System Control and ID Register not in the SCB (one reserved word, then ICTR and ACTLR).
+ */
+typedef struct
+{
+        uint32_t RESERVED0[1U];          /*!< Offset: 0x000        Reserved (pads up to ICTR at 0x004) */
+  __IM  uint32_t ICTR;                   /*!< Offset: 0x004 (R/ )  Interrupt Controller Type Register */
+  __IOM uint32_t ACTLR;                  /*!< Offset: 0x008 (R/W)  Auxiliary Control Register */
+} SCnSCB_Type;
+
+/* Interrupt Controller Type Register Definitions (SCnSCB_Type member ICTR, offset 0x004) */
+#define SCnSCB_ICTR_INTLINESNUM_Pos         0U                                         /*!< ICTR: INTLINESNUM Position */
+#define SCnSCB_ICTR_INTLINESNUM_Msk        (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/)  /*!< ICTR: INTLINESNUM Mask */
+
+/* Auxiliary Control Register Definitions (SCnSCB_Type member ACTLR, offset 0x008) */
+#define SCnSCB_ACTLR_DISOOFP_Pos            9U                                         /*!< ACTLR: DISOOFP Position */
+#define SCnSCB_ACTLR_DISOOFP_Msk           (1UL << SCnSCB_ACTLR_DISOOFP_Pos)           /*!< ACTLR: DISOOFP Mask */
+
+#define SCnSCB_ACTLR_DISFPCA_Pos            8U                                         /*!< ACTLR: DISFPCA Position */
+#define SCnSCB_ACTLR_DISFPCA_Msk           (1UL << SCnSCB_ACTLR_DISFPCA_Pos)           /*!< ACTLR: DISFPCA Mask */
+
+#define SCnSCB_ACTLR_DISFOLD_Pos            2U                                         /*!< ACTLR: DISFOLD Position */
+#define SCnSCB_ACTLR_DISFOLD_Msk           (1UL << SCnSCB_ACTLR_DISFOLD_Pos)           /*!< ACTLR: DISFOLD Mask */
+
+#define SCnSCB_ACTLR_DISDEFWBUF_Pos         1U                                         /*!< ACTLR: DISDEFWBUF Position */
+#define SCnSCB_ACTLR_DISDEFWBUF_Msk        (1UL << SCnSCB_ACTLR_DISDEFWBUF_Pos)        /*!< ACTLR: DISDEFWBUF Mask */
+
+#define SCnSCB_ACTLR_DISMCYCINT_Pos         0U                                         /*!< ACTLR: DISMCYCINT Position */
+#define SCnSCB_ACTLR_DISMCYCINT_Msk        (1UL /*<< SCnSCB_ACTLR_DISMCYCINT_Pos*/)    /*!< ACTLR: DISMCYCINT Mask */
+
+/*@} end of group CMSIS_SCnSCB */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_SysTick     System Tick Timer (SysTick)
+  \brief    Type definitions for the System Timer Registers.
+  @{
+ */
+
+/**
+  \brief  Structure type to access the System Timer (SysTick). Four consecutive 32-bit registers at offsets 0x000 to 0x00C; CALIB is the only member declared read-only.
+ */
+typedef struct
+{
+  __IOM uint32_t CTRL;                   /*!< Offset: 0x000 (R/W)  SysTick Control and Status Register */
+  __IOM uint32_t LOAD;                   /*!< Offset: 0x004 (R/W)  SysTick Reload Value Register */
+  __IOM uint32_t VAL;                    /*!< Offset: 0x008 (R/W)  SysTick Current Value Register */
+  __IM  uint32_t CALIB;                  /*!< Offset: 0x00C (R/ )  SysTick Calibration Register */
+} SysTick_Type;
+
+/* SysTick Control / Status Register Definitions: single-bit fields COUNTFLAG (bit 16), CLKSOURCE (bit 2), TICKINT (bit 1) and ENABLE (bit 0) */
+#define SysTick_CTRL_COUNTFLAG_Pos         16U                                            /*!< SysTick CTRL: COUNTFLAG Position */
+#define SysTick_CTRL_COUNTFLAG_Msk         (1UL << SysTick_CTRL_COUNTFLAG_Pos)            /*!< SysTick CTRL: COUNTFLAG Mask */
+
+#define SysTick_CTRL_CLKSOURCE_Pos          2U                                            /*!< SysTick CTRL: CLKSOURCE Position */
+#define SysTick_CTRL_CLKSOURCE_Msk         (1UL << SysTick_CTRL_CLKSOURCE_Pos)            /*!< SysTick CTRL: CLKSOURCE Mask */
+
+#define SysTick_CTRL_TICKINT_Pos            1U                                            /*!< SysTick CTRL: TICKINT Position */
+#define SysTick_CTRL_TICKINT_Msk           (1UL << SysTick_CTRL_TICKINT_Pos)              /*!< SysTick CTRL: TICKINT Mask */
+
+#define SysTick_CTRL_ENABLE_Pos             0U                                            /*!< SysTick CTRL: ENABLE Position */
+#define SysTick_CTRL_ENABLE_Msk            (1UL /*<< SysTick_CTRL_ENABLE_Pos*/)           /*!< SysTick CTRL: ENABLE Mask */
+
+/* SysTick Reload Register Definitions: RELOAD is a 24-bit field in bits [23:0] */
+#define SysTick_LOAD_RELOAD_Pos             0U                                            /*!< SysTick LOAD: RELOAD Position */
+#define SysTick_LOAD_RELOAD_Msk            (0xFFFFFFUL /*<< SysTick_LOAD_RELOAD_Pos*/)    /*!< SysTick LOAD: RELOAD Mask */
+
+/* SysTick Current Register Definitions: CURRENT is a 24-bit field in bits [23:0] */
+#define SysTick_VAL_CURRENT_Pos             0U                                            /*!< SysTick VAL: CURRENT Position */
+#define SysTick_VAL_CURRENT_Msk            (0xFFFFFFUL /*<< SysTick_VAL_CURRENT_Pos*/)    /*!< SysTick VAL: CURRENT Mask */
+
+/* SysTick Calibration Register Definitions: NOREF (bit 31), SKEW (bit 30) and the 24-bit TENMS field in bits [23:0] */
+#define SysTick_CALIB_NOREF_Pos            31U                                            /*!< SysTick CALIB: NOREF Position */
+#define SysTick_CALIB_NOREF_Msk            (1UL << SysTick_CALIB_NOREF_Pos)               /*!< SysTick CALIB: NOREF Mask */
+
+#define SysTick_CALIB_SKEW_Pos             30U                                            /*!< SysTick CALIB: SKEW Position */
+#define SysTick_CALIB_SKEW_Msk             (1UL << SysTick_CALIB_SKEW_Pos)                /*!< SysTick CALIB: SKEW Mask */
+
+#define SysTick_CALIB_TENMS_Pos             0U                                            /*!< SysTick CALIB: TENMS Position */
+#define SysTick_CALIB_TENMS_Msk            (0xFFFFFFUL /*<< SysTick_CALIB_TENMS_Pos*/)    /*!< SysTick CALIB: TENMS Mask */
+
+/*@} end of group CMSIS_SysTick */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_ITM     Instrumentation Trace Macrocell (ITM)
+  \brief    Type definitions for the Instrumentation Trace Macrocell (ITM)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Instrumentation Trace Macrocell Register (ITM). The RESERVEDn arrays only pad the layout so that each named member lands on the offset given in its comment.
+ */
+typedef struct
+{
+  __OM  union
+  {
+    __OM  uint8_t    u8;                 /*!< Offset: 0x000 ( /W)  ITM Stimulus Port 8-bit */
+    __OM  uint16_t   u16;                /*!< Offset: 0x000 ( /W)  ITM Stimulus Port 16-bit */
+    __OM  uint32_t   u32;                /*!< Offset: 0x000 ( /W)  ITM Stimulus Port 32-bit */
+  }  PORT [32U];                         /*!< Offset: 0x000 ( /W)  ITM Stimulus Port Registers */
+        uint32_t RESERVED0[864U];        /*   Offset: 0x080        Padding between the 32 port words and TER (0x080 - 0xDFF) */
+  __IOM uint32_t TER;                    /*!< Offset: 0xE00 (R/W)  ITM Trace Enable Register */
+        uint32_t RESERVED1[15U];         /*   Offset: 0xE04        Padding up to TPR (0xE04 - 0xE3F) */
+  __IOM uint32_t TPR;                    /*!< Offset: 0xE40 (R/W)  ITM Trace Privilege Register */
+        uint32_t RESERVED2[15U];         /*   Offset: 0xE44        Padding up to TCR (0xE44 - 0xE7F) */
+  __IOM uint32_t TCR;                    /*!< Offset: 0xE80 (R/W)  ITM Trace Control Register */
+        uint32_t RESERVED3[32U];         /*   Offset: 0xE84        Padding, first part (0xE84 - 0xF03) */
+        uint32_t RESERVED4[43U];         /*   Offset: 0xF04        Padding, second part, up to LAR (0xF04 - 0xFAF) */
+  __OM  uint32_t LAR;                    /*!< Offset: 0xFB0 ( /W)  ITM Lock Access Register */
+  __IM  uint32_t LSR;                    /*!< Offset: 0xFB4 (R/ )  ITM Lock Status Register */
+        uint32_t RESERVED5[6U];          /*   Offset: 0xFB8        Padding up to PID4 (0xFB8 - 0xFCF) */
+  __IM  uint32_t PID4;                   /*!< Offset: 0xFD0 (R/ )  ITM Peripheral Identification Register #4 */
+  __IM  uint32_t PID5;                   /*!< Offset: 0xFD4 (R/ )  ITM Peripheral Identification Register #5 */
+  __IM  uint32_t PID6;                   /*!< Offset: 0xFD8 (R/ )  ITM Peripheral Identification Register #6 */
+  __IM  uint32_t PID7;                   /*!< Offset: 0xFDC (R/ )  ITM Peripheral Identification Register #7 */
+  __IM  uint32_t PID0;                   /*!< Offset: 0xFE0 (R/ )  ITM Peripheral Identification Register #0 */
+  __IM  uint32_t PID1;                   /*!< Offset: 0xFE4 (R/ )  ITM Peripheral Identification Register #1 */
+  __IM  uint32_t PID2;                   /*!< Offset: 0xFE8 (R/ )  ITM Peripheral Identification Register #2 */
+  __IM  uint32_t PID3;                   /*!< Offset: 0xFEC (R/ )  ITM Peripheral Identification Register #3 */
+  __IM  uint32_t CID0;                   /*!< Offset: 0xFF0 (R/ )  ITM Component  Identification Register #0 */
+  __IM  uint32_t CID1;                   /*!< Offset: 0xFF4 (R/ )  ITM Component  Identification Register #1 */
+  __IM  uint32_t CID2;                   /*!< Offset: 0xFF8 (R/ )  ITM Component  Identification Register #2 */
+  __IM  uint32_t CID3;                   /*!< Offset: 0xFFC (R/ )  ITM Component  Identification Register #3 */
+} ITM_Type;
+
+/* ITM Trace Privilege Register Definitions: PRIVMASK covers all 32 bits of TPR */
+#define ITM_TPR_PRIVMASK_Pos                0U                                            /*!< ITM TPR: PRIVMASK Position */
+#define ITM_TPR_PRIVMASK_Msk               (0xFFFFFFFFUL /*<< ITM_TPR_PRIVMASK_Pos*/)     /*!< ITM TPR: PRIVMASK Mask */
+
+/* ITM Trace Control Register Definitions: TraceBusID is 7 bits wide [22:16], GTSFREQ [11:10] and TSPrescale [9:8] are 2 bits wide, every other field is a single bit */
+#define ITM_TCR_BUSY_Pos                   23U                                            /*!< ITM TCR: BUSY Position */
+#define ITM_TCR_BUSY_Msk                   (1UL << ITM_TCR_BUSY_Pos)                      /*!< ITM TCR: BUSY Mask */
+
+#define ITM_TCR_TraceBusID_Pos             16U                                            /*!< ITM TCR: ATBID Position */
+#define ITM_TCR_TraceBusID_Msk             (0x7FUL << ITM_TCR_TraceBusID_Pos)             /*!< ITM TCR: ATBID Mask */
+
+#define ITM_TCR_GTSFREQ_Pos                10U                                            /*!< ITM TCR: Global timestamp frequency Position */
+#define ITM_TCR_GTSFREQ_Msk                (3UL << ITM_TCR_GTSFREQ_Pos)                   /*!< ITM TCR: Global timestamp frequency Mask */
+
+#define ITM_TCR_TSPrescale_Pos              8U                                            /*!< ITM TCR: TSPrescale Position */
+#define ITM_TCR_TSPrescale_Msk             (3UL << ITM_TCR_TSPrescale_Pos)                /*!< ITM TCR: TSPrescale Mask */
+
+#define ITM_TCR_SWOENA_Pos                  4U                                            /*!< ITM TCR: SWOENA Position */
+#define ITM_TCR_SWOENA_Msk                 (1UL << ITM_TCR_SWOENA_Pos)                    /*!< ITM TCR: SWOENA Mask */
+
+#define ITM_TCR_DWTENA_Pos                  3U                                            /*!< ITM TCR: DWTENA Position */
+#define ITM_TCR_DWTENA_Msk                 (1UL << ITM_TCR_DWTENA_Pos)                    /*!< ITM TCR: DWTENA Mask */
+
+#define ITM_TCR_SYNCENA_Pos                 2U                                            /*!< ITM TCR: SYNCENA Position */
+#define ITM_TCR_SYNCENA_Msk                (1UL << ITM_TCR_SYNCENA_Pos)                   /*!< ITM TCR: SYNCENA Mask */
+
+#define ITM_TCR_TSENA_Pos                   1U                                            /*!< ITM TCR: TSENA Position */
+#define ITM_TCR_TSENA_Msk                  (1UL << ITM_TCR_TSENA_Pos)                     /*!< ITM TCR: TSENA Mask */
+
+#define ITM_TCR_ITMENA_Pos                  0U                                            /*!< ITM TCR: ITM Enable bit Position */
+#define ITM_TCR_ITMENA_Msk                 (1UL /*<< ITM_TCR_ITMENA_Pos*/)                /*!< ITM TCR: ITM Enable bit Mask */
+
+/* ITM Lock Status Register Definitions: three single-bit flags, ByteAcc (bit 2), Access (bit 1) and Present (bit 0) */
+#define ITM_LSR_ByteAcc_Pos                 2U                                            /*!< ITM LSR: ByteAcc Position */
+#define ITM_LSR_ByteAcc_Msk                (1UL << ITM_LSR_ByteAcc_Pos)                   /*!< ITM LSR: ByteAcc Mask */
+
+#define ITM_LSR_Access_Pos                  1U                                            /*!< ITM LSR: Access Position */
+#define ITM_LSR_Access_Msk                 (1UL << ITM_LSR_Access_Pos)                    /*!< ITM LSR: Access Mask */
+
+#define ITM_LSR_Present_Pos                 0U                                            /*!< ITM LSR: Present Position */
+#define ITM_LSR_Present_Msk                (1UL /*<< ITM_LSR_Present_Pos*/)               /*!< ITM LSR: Present Mask */
+
+/*@}*/ /* end of group CMSIS_ITM */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_DWT     Data Watchpoint and Trace (DWT)
+  \brief    Type definitions for the Data Watchpoint and Trace (DWT)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Data Watchpoint and Trace Register (DWT). Control and counter registers come first, then four COMPn/MASKn/FUNCTIONn comparator groups spaced 0x10 bytes apart by one reserved word each.
+ */
+typedef struct
+{
+  __IOM uint32_t CTRL;                   /*!< Offset: 0x000 (R/W)  Control Register */
+  __IOM uint32_t CYCCNT;                 /*!< Offset: 0x004 (R/W)  Cycle Count Register */
+  __IOM uint32_t CPICNT;                 /*!< Offset: 0x008 (R/W)  CPI Count Register */
+  __IOM uint32_t EXCCNT;                 /*!< Offset: 0x00C (R/W)  Exception Overhead Count Register */
+  __IOM uint32_t SLEEPCNT;               /*!< Offset: 0x010 (R/W)  Sleep Count Register */
+  __IOM uint32_t LSUCNT;                 /*!< Offset: 0x014 (R/W)  LSU Count Register */
+  __IOM uint32_t FOLDCNT;                /*!< Offset: 0x018 (R/W)  Folded-instruction Count Register */
+  __IM  uint32_t PCSR;                   /*!< Offset: 0x01C (R/ )  Program Counter Sample Register */
+  __IOM uint32_t COMP0;                  /*!< Offset: 0x020 (R/W)  Comparator Register 0 */
+  __IOM uint32_t MASK0;                  /*!< Offset: 0x024 (R/W)  Mask Register 0 */
+  __IOM uint32_t FUNCTION0;              /*!< Offset: 0x028 (R/W)  Function Register 0 */
+        uint32_t RESERVED0[1U];          /*   Offset: 0x02C        Padding so comparator group 1 starts at 0x030 */
+  __IOM uint32_t COMP1;                  /*!< Offset: 0x030 (R/W)  Comparator Register 1 */
+  __IOM uint32_t MASK1;                  /*!< Offset: 0x034 (R/W)  Mask Register 1 */
+  __IOM uint32_t FUNCTION1;              /*!< Offset: 0x038 (R/W)  Function Register 1 */
+        uint32_t RESERVED1[1U];          /*   Offset: 0x03C        Padding so comparator group 2 starts at 0x040 */
+  __IOM uint32_t COMP2;                  /*!< Offset: 0x040 (R/W)  Comparator Register 2 */
+  __IOM uint32_t MASK2;                  /*!< Offset: 0x044 (R/W)  Mask Register 2 */
+  __IOM uint32_t FUNCTION2;              /*!< Offset: 0x048 (R/W)  Function Register 2 */
+        uint32_t RESERVED2[1U];          /*   Offset: 0x04C        Padding so comparator group 3 starts at 0x050 */
+  __IOM uint32_t COMP3;                  /*!< Offset: 0x050 (R/W)  Comparator Register 3 */
+  __IOM uint32_t MASK3;                  /*!< Offset: 0x054 (R/W)  Mask Register 3 */
+  __IOM uint32_t FUNCTION3;              /*!< Offset: 0x058 (R/W)  Function Register 3 */
+} DWT_Type;
+
+/* DWT Control Register Definitions: NUMCOMP [31:28], POSTINIT [8:5] and POSTPRESET [4:1] are 4 bits wide, SYNCTAP [11:10] is 2 bits wide, all other fields are single bits; no field is defined here for bits 23 and 15:13 */
+#define DWT_CTRL_NUMCOMP_Pos               28U                                         /*!< DWT CTRL: NUMCOMP Position */
+#define DWT_CTRL_NUMCOMP_Msk               (0xFUL << DWT_CTRL_NUMCOMP_Pos)             /*!< DWT CTRL: NUMCOMP Mask */
+
+#define DWT_CTRL_NOTRCPKT_Pos              27U                                         /*!< DWT CTRL: NOTRCPKT Position */
+#define DWT_CTRL_NOTRCPKT_Msk              (0x1UL << DWT_CTRL_NOTRCPKT_Pos)            /*!< DWT CTRL: NOTRCPKT Mask */
+
+#define DWT_CTRL_NOEXTTRIG_Pos             26U                                         /*!< DWT CTRL: NOEXTTRIG Position */
+#define DWT_CTRL_NOEXTTRIG_Msk             (0x1UL << DWT_CTRL_NOEXTTRIG_Pos)           /*!< DWT CTRL: NOEXTTRIG Mask */
+
+#define DWT_CTRL_NOCYCCNT_Pos              25U                                         /*!< DWT CTRL: NOCYCCNT Position */
+#define DWT_CTRL_NOCYCCNT_Msk              (0x1UL << DWT_CTRL_NOCYCCNT_Pos)            /*!< DWT CTRL: NOCYCCNT Mask */
+
+#define DWT_CTRL_NOPRFCNT_Pos              24U                                         /*!< DWT CTRL: NOPRFCNT Position */
+#define DWT_CTRL_NOPRFCNT_Msk              (0x1UL << DWT_CTRL_NOPRFCNT_Pos)            /*!< DWT CTRL: NOPRFCNT Mask */
+
+#define DWT_CTRL_CYCEVTENA_Pos             22U                                         /*!< DWT CTRL: CYCEVTENA Position */
+#define DWT_CTRL_CYCEVTENA_Msk             (0x1UL << DWT_CTRL_CYCEVTENA_Pos)           /*!< DWT CTRL: CYCEVTENA Mask */
+
+#define DWT_CTRL_FOLDEVTENA_Pos            21U                                         /*!< DWT CTRL: FOLDEVTENA Position */
+#define DWT_CTRL_FOLDEVTENA_Msk            (0x1UL << DWT_CTRL_FOLDEVTENA_Pos)          /*!< DWT CTRL: FOLDEVTENA Mask */
+
+#define DWT_CTRL_LSUEVTENA_Pos             20U                                         /*!< DWT CTRL: LSUEVTENA Position */
+#define DWT_CTRL_LSUEVTENA_Msk             (0x1UL << DWT_CTRL_LSUEVTENA_Pos)           /*!< DWT CTRL: LSUEVTENA Mask */
+
+#define DWT_CTRL_SLEEPEVTENA_Pos           19U                                         /*!< DWT CTRL: SLEEPEVTENA Position */
+#define DWT_CTRL_SLEEPEVTENA_Msk           (0x1UL << DWT_CTRL_SLEEPEVTENA_Pos)         /*!< DWT CTRL: SLEEPEVTENA Mask */
+
+#define DWT_CTRL_EXCEVTENA_Pos             18U                                         /*!< DWT CTRL: EXCEVTENA Position */
+#define DWT_CTRL_EXCEVTENA_Msk             (0x1UL << DWT_CTRL_EXCEVTENA_Pos)           /*!< DWT CTRL: EXCEVTENA Mask */
+
+#define DWT_CTRL_CPIEVTENA_Pos             17U                                         /*!< DWT CTRL: CPIEVTENA Position */
+#define DWT_CTRL_CPIEVTENA_Msk             (0x1UL << DWT_CTRL_CPIEVTENA_Pos)           /*!< DWT CTRL: CPIEVTENA Mask */
+
+#define DWT_CTRL_EXCTRCENA_Pos             16U                                         /*!< DWT CTRL: EXCTRCENA Position */
+#define DWT_CTRL_EXCTRCENA_Msk             (0x1UL << DWT_CTRL_EXCTRCENA_Pos)           /*!< DWT CTRL: EXCTRCENA Mask */
+
+#define DWT_CTRL_PCSAMPLENA_Pos            12U                                         /*!< DWT CTRL: PCSAMPLENA Position */
+#define DWT_CTRL_PCSAMPLENA_Msk            (0x1UL << DWT_CTRL_PCSAMPLENA_Pos)          /*!< DWT CTRL: PCSAMPLENA Mask */
+
+#define DWT_CTRL_SYNCTAP_Pos               10U                                         /*!< DWT CTRL: SYNCTAP Position */
+#define DWT_CTRL_SYNCTAP_Msk               (0x3UL << DWT_CTRL_SYNCTAP_Pos)             /*!< DWT CTRL: SYNCTAP Mask */
+
+#define DWT_CTRL_CYCTAP_Pos                 9U                                         /*!< DWT CTRL: CYCTAP Position */
+#define DWT_CTRL_CYCTAP_Msk                (0x1UL << DWT_CTRL_CYCTAP_Pos)              /*!< DWT CTRL: CYCTAP Mask */
+
+#define DWT_CTRL_POSTINIT_Pos               5U                                         /*!< DWT CTRL: POSTINIT Position */
+#define DWT_CTRL_POSTINIT_Msk              (0xFUL << DWT_CTRL_POSTINIT_Pos)            /*!< DWT CTRL: POSTINIT Mask */
+
+#define DWT_CTRL_POSTPRESET_Pos             1U                                         /*!< DWT CTRL: POSTPRESET Position */
+#define DWT_CTRL_POSTPRESET_Msk            (0xFUL << DWT_CTRL_POSTPRESET_Pos)          /*!< DWT CTRL: POSTPRESET Mask */
+
+#define DWT_CTRL_CYCCNTENA_Pos              0U                                         /*!< DWT CTRL: CYCCNTENA Position */
+#define DWT_CTRL_CYCCNTENA_Msk             (0x1UL /*<< DWT_CTRL_CYCCNTENA_Pos*/)       /*!< DWT CTRL: CYCCNTENA Mask */
+
+/* DWT CPI Count Register Definitions: 8-bit count in bits [7:0] */
+#define DWT_CPICNT_CPICNT_Pos               0U                                         /*!< DWT CPICNT: CPICNT Position */
+#define DWT_CPICNT_CPICNT_Msk              (0xFFUL /*<< DWT_CPICNT_CPICNT_Pos*/)       /*!< DWT CPICNT: CPICNT Mask */
+
+/* DWT Exception Overhead Count Register Definitions: 8-bit count in bits [7:0] */
+#define DWT_EXCCNT_EXCCNT_Pos               0U                                         /*!< DWT EXCCNT: EXCCNT Position */
+#define DWT_EXCCNT_EXCCNT_Msk              (0xFFUL /*<< DWT_EXCCNT_EXCCNT_Pos*/)       /*!< DWT EXCCNT: EXCCNT Mask */
+
+/* DWT Sleep Count Register Definitions: 8-bit count in bits [7:0] */
+#define DWT_SLEEPCNT_SLEEPCNT_Pos           0U                                         /*!< DWT SLEEPCNT: SLEEPCNT Position */
+#define DWT_SLEEPCNT_SLEEPCNT_Msk          (0xFFUL /*<< DWT_SLEEPCNT_SLEEPCNT_Pos*/)   /*!< DWT SLEEPCNT: SLEEPCNT Mask */
+
+/* DWT LSU Count Register Definitions: 8-bit count in bits [7:0] */
+#define DWT_LSUCNT_LSUCNT_Pos               0U                                         /*!< DWT LSUCNT: LSUCNT Position */
+#define DWT_LSUCNT_LSUCNT_Msk              (0xFFUL /*<< DWT_LSUCNT_LSUCNT_Pos*/)       /*!< DWT LSUCNT: LSUCNT Mask */
+
+/* DWT Folded-instruction Count Register Definitions: 8-bit count in bits [7:0] */
+#define DWT_FOLDCNT_FOLDCNT_Pos             0U                                         /*!< DWT FOLDCNT: FOLDCNT Position */
+#define DWT_FOLDCNT_FOLDCNT_Msk            (0xFFUL /*<< DWT_FOLDCNT_FOLDCNT_Pos*/)     /*!< DWT FOLDCNT: FOLDCNT Mask */
+
+/* DWT Comparator Mask Register Definitions: 5-bit MASK field in bits [4:0]; the name carries no comparator index, so it presumably applies to MASK0..MASK3 alike (confirm against the device manual) */
+#define DWT_MASK_MASK_Pos                   0U                                         /*!< DWT MASK: MASK Position */
+#define DWT_MASK_MASK_Msk                  (0x1FUL /*<< DWT_MASK_MASK_Pos*/)           /*!< DWT MASK: MASK Mask */
+
+/* DWT Comparator Function Register Definitions: DATAVADDR1 [19:16], DATAVADDR0 [15:12] and FUNCTION [3:0] are 4 bits wide, DATAVSIZE [11:10] is 2 bits wide, the remaining fields are single bits */
+#define DWT_FUNCTION_MATCHED_Pos           24U                                         /*!< DWT FUNCTION: MATCHED Position */
+#define DWT_FUNCTION_MATCHED_Msk           (0x1UL << DWT_FUNCTION_MATCHED_Pos)         /*!< DWT FUNCTION: MATCHED Mask */
+
+#define DWT_FUNCTION_DATAVADDR1_Pos        16U                                         /*!< DWT FUNCTION: DATAVADDR1 Position */
+#define DWT_FUNCTION_DATAVADDR1_Msk        (0xFUL << DWT_FUNCTION_DATAVADDR1_Pos)      /*!< DWT FUNCTION: DATAVADDR1 Mask */
+
+#define DWT_FUNCTION_DATAVADDR0_Pos        12U                                         /*!< DWT FUNCTION: DATAVADDR0 Position */
+#define DWT_FUNCTION_DATAVADDR0_Msk        (0xFUL << DWT_FUNCTION_DATAVADDR0_Pos)      /*!< DWT FUNCTION: DATAVADDR0 Mask */
+
+#define DWT_FUNCTION_DATAVSIZE_Pos         10U                                         /*!< DWT FUNCTION: DATAVSIZE Position */
+#define DWT_FUNCTION_DATAVSIZE_Msk         (0x3UL << DWT_FUNCTION_DATAVSIZE_Pos)       /*!< DWT FUNCTION: DATAVSIZE Mask */
+
+#define DWT_FUNCTION_LNK1ENA_Pos            9U                                         /*!< DWT FUNCTION: LNK1ENA Position */
+#define DWT_FUNCTION_LNK1ENA_Msk           (0x1UL << DWT_FUNCTION_LNK1ENA_Pos)         /*!< DWT FUNCTION: LNK1ENA Mask */
+
+#define DWT_FUNCTION_DATAVMATCH_Pos         8U                                         /*!< DWT FUNCTION: DATAVMATCH Position */
+#define DWT_FUNCTION_DATAVMATCH_Msk        (0x1UL << DWT_FUNCTION_DATAVMATCH_Pos)      /*!< DWT FUNCTION: DATAVMATCH Mask */
+
+#define DWT_FUNCTION_CYCMATCH_Pos           7U                                         /*!< DWT FUNCTION: CYCMATCH Position */
+#define DWT_FUNCTION_CYCMATCH_Msk          (0x1UL << DWT_FUNCTION_CYCMATCH_Pos)        /*!< DWT FUNCTION: CYCMATCH Mask */
+
+#define DWT_FUNCTION_EMITRANGE_Pos          5U                                         /*!< DWT FUNCTION: EMITRANGE Position */
+#define DWT_FUNCTION_EMITRANGE_Msk         (0x1UL << DWT_FUNCTION_EMITRANGE_Pos)       /*!< DWT FUNCTION: EMITRANGE Mask */
+
+#define DWT_FUNCTION_FUNCTION_Pos           0U                                         /*!< DWT FUNCTION: FUNCTION Position */
+#define DWT_FUNCTION_FUNCTION_Msk          (0xFUL /*<< DWT_FUNCTION_FUNCTION_Pos*/)    /*!< DWT FUNCTION: FUNCTION Mask */
+
+/*@}*/ /* end of group CMSIS_DWT */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_TPI     Trace Port Interface (TPI)
+  \brief    Type definitions for the Trace Port Interface (TPI)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Trace Port Interface Register (TPI). The RESERVEDn arrays only pad the layout so each named member lands on its documented offset; note that the numbering skips RESERVED6.
+ */
+typedef struct
+{
+  __IM  uint32_t SSPSR;                  /*!< Offset: 0x000 (R/ )  Supported Parallel Port Size Register */
+  __IOM uint32_t CSPSR;                  /*!< Offset: 0x004 (R/W)  Current Parallel Port Size Register */
+        uint32_t RESERVED0[2U];          /*   Offset: 0x008        Padding up to ACPR (0x008 - 0x00F) */
+  __IOM uint32_t ACPR;                   /*!< Offset: 0x010 (R/W)  Asynchronous Clock Prescaler Register */
+        uint32_t RESERVED1[55U];         /*   Offset: 0x014        Padding up to SPPR (0x014 - 0x0EF) */
+  __IOM uint32_t SPPR;                   /*!< Offset: 0x0F0 (R/W)  Selected Pin Protocol Register */
+        uint32_t RESERVED2[131U];        /*   Offset: 0x0F4        Padding up to FFSR (0x0F4 - 0x2FF) */
+  __IM  uint32_t FFSR;                   /*!< Offset: 0x300 (R/ )  Formatter and Flush Status Register */
+  __IOM uint32_t FFCR;                   /*!< Offset: 0x304 (R/W)  Formatter and Flush Control Register */
+  __IM  uint32_t FSCR;                   /*!< Offset: 0x308 (R/ )  Formatter Synchronization Counter Register */
+        uint32_t RESERVED3[759U];        /*   Offset: 0x30C        Padding up to TRIGGER (0x30C - 0xEE7) */
+  __IM  uint32_t TRIGGER;                /*!< Offset: 0xEE8 (R/ )  TRIGGER Register */
+  __IM  uint32_t FIFO0;                  /*!< Offset: 0xEEC (R/ )  Integration ETM Data */
+  __IM  uint32_t ITATBCTR2;              /*!< Offset: 0xEF0 (R/ )  ITATBCTR2 */
+        uint32_t RESERVED4[1U];          /*   Offset: 0xEF4        Padding word between ITATBCTR2 and ITATBCTR0 */
+  __IM  uint32_t ITATBCTR0;              /*!< Offset: 0xEF8 (R/ )  ITATBCTR0 */
+  __IM  uint32_t FIFO1;                  /*!< Offset: 0xEFC (R/ )  Integration ITM Data */
+  __IOM uint32_t ITCTRL;                 /*!< Offset: 0xF00 (R/W)  Integration Mode Control */
+        uint32_t RESERVED5[39U];         /*   Offset: 0xF04        Padding up to CLAIMSET (0xF04 - 0xF9F) */
+  __IOM uint32_t CLAIMSET;               /*!< Offset: 0xFA0 (R/W)  Claim tag set */
+  __IOM uint32_t CLAIMCLR;               /*!< Offset: 0xFA4 (R/W)  Claim tag clear */
+        uint32_t RESERVED7[8U];          /*   Offset: 0xFA8        Padding up to DEVID (0xFA8 - 0xFC7) */
+  __IM  uint32_t DEVID;                  /*!< Offset: 0xFC8 (R/ )  TPIU_DEVID */
+  __IM  uint32_t DEVTYPE;                /*!< Offset: 0xFCC (R/ )  TPIU_DEVTYPE */
+} TPI_Type;
+
+/* TPI Asynchronous Clock Prescaler Register Definitions: PRESCALER is a 13-bit field in bits [12:0] */
+#define TPI_ACPR_PRESCALER_Pos              0U                                         /*!< TPI ACPR: PRESCALER Position */
+#define TPI_ACPR_PRESCALER_Msk             (0x1FFFUL /*<< TPI_ACPR_PRESCALER_Pos*/)    /*!< TPI ACPR: PRESCALER Mask */
+
+/* TPI Selected Pin Protocol Register Definitions: TXMODE is a 2-bit field in bits [1:0] */
+#define TPI_SPPR_TXMODE_Pos                 0U                                         /*!< TPI SPPR: TXMODE Position */
+#define TPI_SPPR_TXMODE_Msk                (0x3UL /*<< TPI_SPPR_TXMODE_Pos*/)          /*!< TPI SPPR: TXMODE Mask */
+
+/* TPI Formatter and Flush Status Register Definitions: four single-bit flags in bits 3..0 */
+#define TPI_FFSR_FtNonStop_Pos              3U                                         /*!< TPI FFSR: FtNonStop Position */
+#define TPI_FFSR_FtNonStop_Msk             (0x1UL << TPI_FFSR_FtNonStop_Pos)           /*!< TPI FFSR: FtNonStop Mask */
+
+#define TPI_FFSR_TCPresent_Pos              2U                                         /*!< TPI FFSR: TCPresent Position */
+#define TPI_FFSR_TCPresent_Msk             (0x1UL << TPI_FFSR_TCPresent_Pos)           /*!< TPI FFSR: TCPresent Mask */
+
+#define TPI_FFSR_FtStopped_Pos              1U                                         /*!< TPI FFSR: FtStopped Position */
+#define TPI_FFSR_FtStopped_Msk             (0x1UL << TPI_FFSR_FtStopped_Pos)           /*!< TPI FFSR: FtStopped Mask */
+
+#define TPI_FFSR_FlInProg_Pos               0U                                         /*!< TPI FFSR: FlInProg Position */
+#define TPI_FFSR_FlInProg_Msk              (0x1UL /*<< TPI_FFSR_FlInProg_Pos*/)        /*!< TPI FFSR: FlInProg Mask */
+
+/* TPI Formatter and Flush Control Register Definitions: single-bit fields TrigIn (bit 8) and EnFCont (bit 1) */
+#define TPI_FFCR_TrigIn_Pos                 8U                                         /*!< TPI FFCR: TrigIn Position */
+#define TPI_FFCR_TrigIn_Msk                (0x1UL << TPI_FFCR_TrigIn_Pos)              /*!< TPI FFCR: TrigIn Mask */
+
+#define TPI_FFCR_EnFCont_Pos                1U                                         /*!< TPI FFCR: EnFCont Position */
+#define TPI_FFCR_EnFCont_Msk               (0x1UL << TPI_FFCR_EnFCont_Pos)             /*!< TPI FFCR: EnFCont Mask */
+
+/* TPI TRIGGER Register Definitions: single TRIGGER flag in bit 0 */
+#define TPI_TRIGGER_TRIGGER_Pos             0U                                         /*!< TPI TRIGGER: TRIGGER Position */
+#define TPI_TRIGGER_TRIGGER_Msk            (0x1UL /*<< TPI_TRIGGER_TRIGGER_Pos*/)      /*!< TPI TRIGGER: TRIGGER Mask */
+
+/* TPI Integration ETM Data Register Definitions (FIFO0): ITM_ATVALID (bit 29), ITM_bytecount [28:27], ETM_ATVALID (bit 26), ETM_bytecount [25:24], then three data bytes ETM2 [23:16], ETM1 [15:8], ETM0 [7:0] */
+#define TPI_FIFO0_ITM_ATVALID_Pos          29U                                         /*!< TPI FIFO0: ITM_ATVALID Position */
+#define TPI_FIFO0_ITM_ATVALID_Msk          (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos)        /*!< TPI FIFO0: ITM_ATVALID Mask */
+
+#define TPI_FIFO0_ITM_bytecount_Pos        27U                                         /*!< TPI FIFO0: ITM_bytecount Position */
+#define TPI_FIFO0_ITM_bytecount_Msk        (0x3UL << TPI_FIFO0_ITM_bytecount_Pos)      /*!< TPI FIFO0: ITM_bytecount Mask */
+
+#define TPI_FIFO0_ETM_ATVALID_Pos          26U                                         /*!< TPI FIFO0: ETM_ATVALID Position */
+#define TPI_FIFO0_ETM_ATVALID_Msk          (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos)        /*!< TPI FIFO0: ETM_ATVALID Mask */
+
+#define TPI_FIFO0_ETM_bytecount_Pos        24U                                         /*!< TPI FIFO0: ETM_bytecount Position */
+#define TPI_FIFO0_ETM_bytecount_Msk        (0x3UL << TPI_FIFO0_ETM_bytecount_Pos)      /*!< TPI FIFO0: ETM_bytecount Mask */
+
+#define TPI_FIFO0_ETM2_Pos                 16U                                         /*!< TPI FIFO0: ETM2 Position */
+#define TPI_FIFO0_ETM2_Msk                 (0xFFUL << TPI_FIFO0_ETM2_Pos)              /*!< TPI FIFO0: ETM2 Mask */
+
+#define TPI_FIFO0_ETM1_Pos                  8U                                         /*!< TPI FIFO0: ETM1 Position */
+#define TPI_FIFO0_ETM1_Msk                 (0xFFUL << TPI_FIFO0_ETM1_Pos)              /*!< TPI FIFO0: ETM1 Mask */
+
+#define TPI_FIFO0_ETM0_Pos                  0U                                         /*!< TPI FIFO0: ETM0 Position */
+#define TPI_FIFO0_ETM0_Msk                 (0xFFUL /*<< TPI_FIFO0_ETM0_Pos*/)          /*!< TPI FIFO0: ETM0 Mask */
+
+/* TPI ITATBCTR2 Register Definitions: ATREADY2 and ATREADY1 are both defined at bit 0 with mask 0x1, i.e. the two names select the same bit */
+#define TPI_ITATBCTR2_ATREADY2_Pos          0U                                         /*!< TPI ITATBCTR2: ATREADY2 Position */
+#define TPI_ITATBCTR2_ATREADY2_Msk         (0x1UL /*<< TPI_ITATBCTR2_ATREADY2_Pos*/)   /*!< TPI ITATBCTR2: ATREADY2 Mask */
+
+#define TPI_ITATBCTR2_ATREADY1_Pos          0U                                         /*!< TPI ITATBCTR2: ATREADY1 Position */
+#define TPI_ITATBCTR2_ATREADY1_Msk         (0x1UL /*<< TPI_ITATBCTR2_ATREADY1_Pos*/)   /*!< TPI ITATBCTR2: ATREADY1 Mask */
+
+/* TPI Integration ITM Data Register Definitions (FIFO1): same flag/bytecount layout as FIFO0 in bits 29..24, but the three data bytes are ITM2 [23:16], ITM1 [15:8], ITM0 [7:0] */
+#define TPI_FIFO1_ITM_ATVALID_Pos          29U                                         /*!< TPI FIFO1: ITM_ATVALID Position */
+#define TPI_FIFO1_ITM_ATVALID_Msk          (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos)        /*!< TPI FIFO1: ITM_ATVALID Mask */
+
+#define TPI_FIFO1_ITM_bytecount_Pos        27U                                         /*!< TPI FIFO1: ITM_bytecount Position */
+#define TPI_FIFO1_ITM_bytecount_Msk        (0x3UL << TPI_FIFO1_ITM_bytecount_Pos)      /*!< TPI FIFO1: ITM_bytecount Mask */
+
+#define TPI_FIFO1_ETM_ATVALID_Pos          26U                                         /*!< TPI FIFO1: ETM_ATVALID Position */
+#define TPI_FIFO1_ETM_ATVALID_Msk          (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos)        /*!< TPI FIFO1: ETM_ATVALID Mask */
+
+#define TPI_FIFO1_ETM_bytecount_Pos        24U                                         /*!< TPI FIFO1: ETM_bytecount Position */
+#define TPI_FIFO1_ETM_bytecount_Msk        (0x3UL << TPI_FIFO1_ETM_bytecount_Pos)      /*!< TPI FIFO1: ETM_bytecount Mask */
+
+#define TPI_FIFO1_ITM2_Pos                 16U                                         /*!< TPI FIFO1: ITM2 Position */
+#define TPI_FIFO1_ITM2_Msk                 (0xFFUL << TPI_FIFO1_ITM2_Pos)              /*!< TPI FIFO1: ITM2 Mask */
+
+#define TPI_FIFO1_ITM1_Pos                  8U                                         /*!< TPI FIFO1: ITM1 Position */
+#define TPI_FIFO1_ITM1_Msk                 (0xFFUL << TPI_FIFO1_ITM1_Pos)              /*!< TPI FIFO1: ITM1 Mask */
+
+#define TPI_FIFO1_ITM0_Pos                  0U                                         /*!< TPI FIFO1: ITM0 Position */
+#define TPI_FIFO1_ITM0_Msk                 (0xFFUL /*<< TPI_FIFO1_ITM0_Pos*/)          /*!< TPI FIFO1: ITM0 Mask */
+
+/* TPI ITATBCTR0 Register Definitions: ATREADY2 and ATREADY1 are both defined at bit 0 with mask 0x1, i.e. the two names select the same bit */
+#define TPI_ITATBCTR0_ATREADY2_Pos          0U                                         /*!< TPI ITATBCTR0: ATREADY2 Position */
+#define TPI_ITATBCTR0_ATREADY2_Msk         (0x1UL /*<< TPI_ITATBCTR0_ATREADY2_Pos*/)   /*!< TPI ITATBCTR0: ATREADY2 Mask */
+
+#define TPI_ITATBCTR0_ATREADY1_Pos          0U                                         /*!< TPI ITATBCTR0: ATREADY1 Position */
+#define TPI_ITATBCTR0_ATREADY1_Msk         (0x1UL /*<< TPI_ITATBCTR0_ATREADY1_Pos*/)   /*!< TPI ITATBCTR0: ATREADY1 Mask */
+
+/* TPI Integration Mode Control Register Definitions: Mode is a 2-bit field in bits [1:0] */
+#define TPI_ITCTRL_Mode_Pos                 0U                                         /*!< TPI ITCTRL: Mode Position */
+#define TPI_ITCTRL_Mode_Msk                (0x3UL /*<< TPI_ITCTRL_Mode_Pos*/)          /*!< TPI ITCTRL: Mode Mask */
+
+/* TPI DEVID Register Definitions: NRZVALID (bit 11), MANCVALID (bit 10), PTINVALID (bit 9), MinBufSz [8:6], AsynClkIn (bit 5) and NrTraceInput [4:0] */
+#define TPI_DEVID_NRZVALID_Pos             11U                                         /*!< TPI DEVID: NRZVALID Position */
+#define TPI_DEVID_NRZVALID_Msk             (0x1UL << TPI_DEVID_NRZVALID_Pos)           /*!< TPI DEVID: NRZVALID Mask */
+
+#define TPI_DEVID_MANCVALID_Pos            10U                                         /*!< TPI DEVID: MANCVALID Position */
+#define TPI_DEVID_MANCVALID_Msk            (0x1UL << TPI_DEVID_MANCVALID_Pos)          /*!< TPI DEVID: MANCVALID Mask */
+
+#define TPI_DEVID_PTINVALID_Pos             9U                                         /*!< TPI DEVID: PTINVALID Position */
+#define TPI_DEVID_PTINVALID_Msk            (0x1UL << TPI_DEVID_PTINVALID_Pos)          /*!< TPI DEVID: PTINVALID Mask */
+
+#define TPI_DEVID_MinBufSz_Pos              6U                                         /*!< TPI DEVID: MinBufSz Position */
+#define TPI_DEVID_MinBufSz_Msk             (0x7UL << TPI_DEVID_MinBufSz_Pos)           /*!< TPI DEVID: MinBufSz Mask */
+
+#define TPI_DEVID_AsynClkIn_Pos             5U                                         /*!< TPI DEVID: AsynClkIn Position */
+#define TPI_DEVID_AsynClkIn_Msk            (0x1UL << TPI_DEVID_AsynClkIn_Pos)          /*!< TPI DEVID: AsynClkIn Mask */
+
+#define TPI_DEVID_NrTraceInput_Pos          0U                                         /*!< TPI DEVID: NrTraceInput Position */
+#define TPI_DEVID_NrTraceInput_Msk         (0x1FUL /*<< TPI_DEVID_NrTraceInput_Pos*/)  /*!< TPI DEVID: NrTraceInput Mask */
+
+/* TPI DEVTYPE Register Definitions (SubType: bits [7:4], MajorType: bits [3:0]; each mask is already shifted to its field) */
+#define TPI_DEVTYPE_SubType_Pos             4U                                         /*!< TPI DEVTYPE: SubType Position */
+#define TPI_DEVTYPE_SubType_Msk            (0xFUL << TPI_DEVTYPE_SubType_Pos)          /*!< TPI DEVTYPE: SubType Mask */
+
+#define TPI_DEVTYPE_MajorType_Pos           0U                                         /*!< TPI DEVTYPE: MajorType Position */
+#define TPI_DEVTYPE_MajorType_Msk          (0xFUL /*<< TPI_DEVTYPE_MajorType_Pos*/)    /*!< TPI DEVTYPE: MajorType Mask */
+
+/*@}*/ /* end of group CMSIS_TPI */
+
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_MPU     Memory Protection Unit (MPU)
+  \brief    Type definitions for the Memory Protection Unit (MPU)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Memory Protection Unit (MPU).
+ */
+typedef struct
+{
+  __IM  uint32_t TYPE;                   /*!< Offset: 0x000 (R/ )  MPU Type Register */
+  __IOM uint32_t CTRL;                   /*!< Offset: 0x004 (R/W)  MPU Control Register */
+  __IOM uint32_t RNR;                    /*!< Offset: 0x008 (R/W)  MPU Region Number Register */
+  __IOM uint32_t RBAR;                   /*!< Offset: 0x00C (R/W)  MPU Region Base Address Register */
+  __IOM uint32_t RASR;                   /*!< Offset: 0x010 (R/W)  MPU Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A1;                /*!< Offset: 0x014 (R/W)  MPU Alias 1 Region Base Address Register */
+  __IOM uint32_t RASR_A1;                /*!< Offset: 0x018 (R/W)  MPU Alias 1 Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A2;                /*!< Offset: 0x01C (R/W)  MPU Alias 2 Region Base Address Register */
+  __IOM uint32_t RASR_A2;                /*!< Offset: 0x020 (R/W)  MPU Alias 2 Region Attribute and Size Register */
+  __IOM uint32_t RBAR_A3;                /*!< Offset: 0x024 (R/W)  MPU Alias 3 Region Base Address Register */
+  __IOM uint32_t RASR_A3;                /*!< Offset: 0x028 (R/W)  MPU Alias 3 Region Attribute and Size Register */
+} MPU_Type;
+
+#define MPU_TYPE_RALIASES                  4U
+
+/* MPU Type Register Definitions */
+#define MPU_TYPE_IREGION_Pos               16U                                            /*!< MPU TYPE: IREGION Position */
+#define MPU_TYPE_IREGION_Msk               (0xFFUL << MPU_TYPE_IREGION_Pos)               /*!< MPU TYPE: IREGION Mask */
+
+#define MPU_TYPE_DREGION_Pos                8U                                            /*!< MPU TYPE: DREGION Position */
+#define MPU_TYPE_DREGION_Msk               (0xFFUL << MPU_TYPE_DREGION_Pos)               /*!< MPU TYPE: DREGION Mask */
+
+#define MPU_TYPE_SEPARATE_Pos               0U                                            /*!< MPU TYPE: SEPARATE Position */
+#define MPU_TYPE_SEPARATE_Msk              (1UL /*<< MPU_TYPE_SEPARATE_Pos*/)             /*!< MPU TYPE: SEPARATE Mask */
+
+/* MPU Control Register Definitions */
+#define MPU_CTRL_PRIVDEFENA_Pos             2U                                            /*!< MPU CTRL: PRIVDEFENA Position */
+#define MPU_CTRL_PRIVDEFENA_Msk            (1UL << MPU_CTRL_PRIVDEFENA_Pos)               /*!< MPU CTRL: PRIVDEFENA Mask */
+
+#define MPU_CTRL_HFNMIENA_Pos               1U                                            /*!< MPU CTRL: HFNMIENA Position */
+#define MPU_CTRL_HFNMIENA_Msk              (1UL << MPU_CTRL_HFNMIENA_Pos)                 /*!< MPU CTRL: HFNMIENA Mask */
+
+#define MPU_CTRL_ENABLE_Pos                 0U                                            /*!< MPU CTRL: ENABLE Position */
+#define MPU_CTRL_ENABLE_Msk                (1UL /*<< MPU_CTRL_ENABLE_Pos*/)               /*!< MPU CTRL: ENABLE Mask */
+
+/* MPU Region Number Register Definitions */
+#define MPU_RNR_REGION_Pos                  0U                                            /*!< MPU RNR: REGION Position */
+#define MPU_RNR_REGION_Msk                 (0xFFUL /*<< MPU_RNR_REGION_Pos*/)             /*!< MPU RNR: REGION Mask */
+
+/* MPU Region Base Address Register Definitions */
+#define MPU_RBAR_ADDR_Pos                   5U                                            /*!< MPU RBAR: ADDR Position */
+#define MPU_RBAR_ADDR_Msk                  (0x7FFFFFFUL << MPU_RBAR_ADDR_Pos)             /*!< MPU RBAR: ADDR Mask */
+
+#define MPU_RBAR_VALID_Pos                  4U                                            /*!< MPU RBAR: VALID Position */
+#define MPU_RBAR_VALID_Msk                 (1UL << MPU_RBAR_VALID_Pos)                    /*!< MPU RBAR: VALID Mask */
+
+#define MPU_RBAR_REGION_Pos                 0U                                            /*!< MPU RBAR: REGION Position */
+#define MPU_RBAR_REGION_Msk                (0xFUL /*<< MPU_RBAR_REGION_Pos*/)             /*!< MPU RBAR: REGION Mask */
+
+/* MPU Region Attribute and Size Register Definitions */
+#define MPU_RASR_ATTRS_Pos                 16U                                            /*!< MPU RASR: MPU Region Attribute field Position */
+#define MPU_RASR_ATTRS_Msk                 (0xFFFFUL << MPU_RASR_ATTRS_Pos)               /*!< MPU RASR: MPU Region Attribute field Mask */
+
+#define MPU_RASR_XN_Pos                    28U                                            /*!< MPU RASR: ATTRS.XN Position */
+#define MPU_RASR_XN_Msk                    (1UL << MPU_RASR_XN_Pos)                       /*!< MPU RASR: ATTRS.XN Mask */
+
+#define MPU_RASR_AP_Pos                    24U                                            /*!< MPU RASR: ATTRS.AP Position */
+#define MPU_RASR_AP_Msk                    (0x7UL << MPU_RASR_AP_Pos)                     /*!< MPU RASR: ATTRS.AP Mask */
+
+#define MPU_RASR_TEX_Pos                   19U                                            /*!< MPU RASR: ATTRS.TEX Position */
+#define MPU_RASR_TEX_Msk                   (0x7UL << MPU_RASR_TEX_Pos)                    /*!< MPU RASR: ATTRS.TEX Mask */
+
+#define MPU_RASR_S_Pos                     18U                                            /*!< MPU RASR: ATTRS.S Position */
+#define MPU_RASR_S_Msk                     (1UL << MPU_RASR_S_Pos)                        /*!< MPU RASR: ATTRS.S Mask */
+
+#define MPU_RASR_C_Pos                     17U                                            /*!< MPU RASR: ATTRS.C Position */
+#define MPU_RASR_C_Msk                     (1UL << MPU_RASR_C_Pos)                        /*!< MPU RASR: ATTRS.C Mask */
+
+#define MPU_RASR_B_Pos                     16U                                            /*!< MPU RASR: ATTRS.B Position */
+#define MPU_RASR_B_Msk                     (1UL << MPU_RASR_B_Pos)                        /*!< MPU RASR: ATTRS.B Mask */
+
+#define MPU_RASR_SRD_Pos                    8U                                            /*!< MPU RASR: Sub-Region Disable Position */
+#define MPU_RASR_SRD_Msk                   (0xFFUL << MPU_RASR_SRD_Pos)                   /*!< MPU RASR: Sub-Region Disable Mask */
+
+#define MPU_RASR_SIZE_Pos                   1U                                            /*!< MPU RASR: Region Size Field Position */
+#define MPU_RASR_SIZE_Msk                  (0x1FUL << MPU_RASR_SIZE_Pos)                  /*!< MPU RASR: Region Size Field Mask */
+
+#define MPU_RASR_ENABLE_Pos                 0U                                            /*!< MPU RASR: Region enable bit Position */
+#define MPU_RASR_ENABLE_Msk                (1UL /*<< MPU_RASR_ENABLE_Pos*/)               /*!< MPU RASR: Region enable bit Mask */
+
+/*@} end of group CMSIS_MPU */
+#endif /* defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U) */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_FPU     Floating Point Unit (FPU)
+  \brief    Type definitions for the Floating Point Unit (FPU)
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Floating Point Unit (FPU).
+ */
+typedef struct
+{
+        uint32_t RESERVED0[1U];         /*!< Offset: 0x000        Reserved; pads the layout so FPCCR lands at 0x004 */
+  __IOM uint32_t FPCCR;                  /*!< Offset: 0x004 (R/W)  Floating-Point Context Control Register */
+  __IOM uint32_t FPCAR;                  /*!< Offset: 0x008 (R/W)  Floating-Point Context Address Register */
+  __IOM uint32_t FPDSCR;                 /*!< Offset: 0x00C (R/W)  Floating-Point Default Status Control Register */
+  __IM  uint32_t MVFR0;                  /*!< Offset: 0x010 (R/ )  Media and FP Feature Register 0 */
+  __IM  uint32_t MVFR1;                  /*!< Offset: 0x014 (R/ )  Media and FP Feature Register 1 */
+  __IM  uint32_t MVFR2;                  /*!< Offset: 0x018 (R/ )  Media and FP Feature Register 2 */
+} FPU_Type;
+
+/* Floating-Point Context Control Register Definitions */
+#define FPU_FPCCR_ASPEN_Pos                31U                                            /*!< FPCCR: ASPEN bit Position */
+#define FPU_FPCCR_ASPEN_Msk                (1UL << FPU_FPCCR_ASPEN_Pos)                   /*!< FPCCR: ASPEN bit Mask */
+
+#define FPU_FPCCR_LSPEN_Pos                30U                                            /*!< FPCCR: LSPEN Position */
+#define FPU_FPCCR_LSPEN_Msk                (1UL << FPU_FPCCR_LSPEN_Pos)                   /*!< FPCCR: LSPEN bit Mask */
+
+#define FPU_FPCCR_MONRDY_Pos                8U                                            /*!< FPCCR: MONRDY Position */
+#define FPU_FPCCR_MONRDY_Msk               (1UL << FPU_FPCCR_MONRDY_Pos)                  /*!< FPCCR: MONRDY bit Mask */
+
+#define FPU_FPCCR_BFRDY_Pos                 6U                                            /*!< FPCCR: BFRDY Position */
+#define FPU_FPCCR_BFRDY_Msk                (1UL << FPU_FPCCR_BFRDY_Pos)                   /*!< FPCCR: BFRDY bit Mask */
+
+#define FPU_FPCCR_MMRDY_Pos                 5U                                            /*!< FPCCR: MMRDY Position */
+#define FPU_FPCCR_MMRDY_Msk                (1UL << FPU_FPCCR_MMRDY_Pos)                   /*!< FPCCR: MMRDY bit Mask */
+
+#define FPU_FPCCR_HFRDY_Pos                 4U                                            /*!< FPCCR: HFRDY Position */
+#define FPU_FPCCR_HFRDY_Msk                (1UL << FPU_FPCCR_HFRDY_Pos)                   /*!< FPCCR: HFRDY bit Mask */
+
+#define FPU_FPCCR_THREAD_Pos                3U                                            /*!< FPCCR: processor mode bit Position */
+#define FPU_FPCCR_THREAD_Msk               (1UL << FPU_FPCCR_THREAD_Pos)                  /*!< FPCCR: processor mode active bit Mask */
+
+#define FPU_FPCCR_USER_Pos                  1U                                            /*!< FPCCR: privilege level bit Position */
+#define FPU_FPCCR_USER_Msk                 (1UL << FPU_FPCCR_USER_Pos)                    /*!< FPCCR: privilege level bit Mask */
+
+#define FPU_FPCCR_LSPACT_Pos                0U                                            /*!< FPCCR: Lazy state preservation active bit Position */
+#define FPU_FPCCR_LSPACT_Msk               (1UL /*<< FPU_FPCCR_LSPACT_Pos*/)              /*!< FPCCR: Lazy state preservation active bit Mask */
+
+/* Floating-Point Context Address Register Definitions */
+#define FPU_FPCAR_ADDRESS_Pos               3U                                            /*!< FPCAR: ADDRESS bit Position */
+#define FPU_FPCAR_ADDRESS_Msk              (0x1FFFFFFFUL << FPU_FPCAR_ADDRESS_Pos)        /*!< FPCAR: ADDRESS bit Mask */
+
+/* Floating-Point Default Status Control Register Definitions */
+#define FPU_FPDSCR_AHP_Pos                 26U                                            /*!< FPDSCR: AHP bit Position */
+#define FPU_FPDSCR_AHP_Msk                 (1UL << FPU_FPDSCR_AHP_Pos)                    /*!< FPDSCR: AHP bit Mask */
+
+#define FPU_FPDSCR_DN_Pos                  25U                                            /*!< FPDSCR: DN bit Position */
+#define FPU_FPDSCR_DN_Msk                  (1UL << FPU_FPDSCR_DN_Pos)                     /*!< FPDSCR: DN bit Mask */
+
+#define FPU_FPDSCR_FZ_Pos                  24U                                            /*!< FPDSCR: FZ bit Position */
+#define FPU_FPDSCR_FZ_Msk                  (1UL << FPU_FPDSCR_FZ_Pos)                     /*!< FPDSCR: FZ bit Mask */
+
+#define FPU_FPDSCR_RMode_Pos               22U                                            /*!< FPDSCR: RMode bit Position */
+#define FPU_FPDSCR_RMode_Msk               (3UL << FPU_FPDSCR_RMode_Pos)                  /*!< FPDSCR: RMode bit Mask */
+
+/* Media and FP Feature Register 0 Definitions */
+#define FPU_MVFR0_FP_rounding_modes_Pos    28U                                            /*!< MVFR0: FP rounding modes bits Position */
+#define FPU_MVFR0_FP_rounding_modes_Msk    (0xFUL << FPU_MVFR0_FP_rounding_modes_Pos)     /*!< MVFR0: FP rounding modes bits Mask */
+
+#define FPU_MVFR0_Short_vectors_Pos        24U                                            /*!< MVFR0: Short vectors bits Position */
+#define FPU_MVFR0_Short_vectors_Msk        (0xFUL << FPU_MVFR0_Short_vectors_Pos)         /*!< MVFR0: Short vectors bits Mask */
+
+#define FPU_MVFR0_Square_root_Pos          20U                                            /*!< MVFR0: Square root bits Position */
+#define FPU_MVFR0_Square_root_Msk          (0xFUL << FPU_MVFR0_Square_root_Pos)           /*!< MVFR0: Square root bits Mask */
+
+#define FPU_MVFR0_Divide_Pos               16U                                            /*!< MVFR0: Divide bits Position */
+#define FPU_MVFR0_Divide_Msk               (0xFUL << FPU_MVFR0_Divide_Pos)                /*!< MVFR0: Divide bits Mask */
+
+#define FPU_MVFR0_FP_excep_trapping_Pos    12U                                            /*!< MVFR0: FP exception trapping bits Position */
+#define FPU_MVFR0_FP_excep_trapping_Msk    (0xFUL << FPU_MVFR0_FP_excep_trapping_Pos)     /*!< MVFR0: FP exception trapping bits Mask */
+
+#define FPU_MVFR0_Double_precision_Pos      8U                                            /*!< MVFR0: Double-precision bits Position */
+#define FPU_MVFR0_Double_precision_Msk     (0xFUL << FPU_MVFR0_Double_precision_Pos)      /*!< MVFR0: Double-precision bits Mask */
+
+#define FPU_MVFR0_Single_precision_Pos      4U                                            /*!< MVFR0: Single-precision bits Position */
+#define FPU_MVFR0_Single_precision_Msk     (0xFUL << FPU_MVFR0_Single_precision_Pos)      /*!< MVFR0: Single-precision bits Mask */
+
+#define FPU_MVFR0_A_SIMD_registers_Pos      0U                                            /*!< MVFR0: A_SIMD registers bits Position */
+#define FPU_MVFR0_A_SIMD_registers_Msk     (0xFUL /*<< FPU_MVFR0_A_SIMD_registers_Pos*/)  /*!< MVFR0: A_SIMD registers bits Mask */
+
+/* Media and FP Feature Register 1 Definitions */
+#define FPU_MVFR1_FP_fused_MAC_Pos         28U                                            /*!< MVFR1: FP fused MAC bits Position */
+#define FPU_MVFR1_FP_fused_MAC_Msk         (0xFUL << FPU_MVFR1_FP_fused_MAC_Pos)          /*!< MVFR1: FP fused MAC bits Mask */
+
+#define FPU_MVFR1_FP_HPFP_Pos              24U                                            /*!< MVFR1: FP HPFP bits Position */
+#define FPU_MVFR1_FP_HPFP_Msk              (0xFUL << FPU_MVFR1_FP_HPFP_Pos)               /*!< MVFR1: FP HPFP bits Mask */
+
+#define FPU_MVFR1_D_NaN_mode_Pos            4U                                            /*!< MVFR1: D_NaN mode bits Position */
+#define FPU_MVFR1_D_NaN_mode_Msk           (0xFUL << FPU_MVFR1_D_NaN_mode_Pos)            /*!< MVFR1: D_NaN mode bits Mask */
+
+#define FPU_MVFR1_FtZ_mode_Pos              0U                                            /*!< MVFR1: FtZ mode bits Position */
+#define FPU_MVFR1_FtZ_mode_Msk             (0xFUL /*<< FPU_MVFR1_FtZ_mode_Pos*/)          /*!< MVFR1: FtZ mode bits Mask */
+
+/* Media and FP Feature Register 2 Definitions */
+
+#define FPU_MVFR2_VFP_Misc_Pos              4U                                            /*!< MVFR2: VFP Misc bits Position */
+#define FPU_MVFR2_VFP_Misc_Msk             (0xFUL << FPU_MVFR2_VFP_Misc_Pos)              /*!< MVFR2: VFP Misc bits Mask */
+
+/*@} end of group CMSIS_FPU */
+
+
+/**
+  \ingroup  CMSIS_core_register
+  \defgroup CMSIS_CoreDebug       Core Debug Registers (CoreDebug)
+  \brief    Type definitions for the Core Debug Registers
+  @{
+ */
+
+/**
+  \brief  Structure type to access the Core Debug Registers (CoreDebug): four consecutive 32-bit registers.
+ */
+typedef struct
+{
+  __IOM uint32_t DHCSR;                  /*!< Offset: 0x000 (R/W)  Debug Halting Control and Status Register */
+  __OM  uint32_t DCRSR;                  /*!< Offset: 0x004 ( /W)  Debug Core Register Selector Register */
+  __IOM uint32_t DCRDR;                  /*!< Offset: 0x008 (R/W)  Debug Core Register Data Register */
+  __IOM uint32_t DEMCR;                  /*!< Offset: 0x00C (R/W)  Debug Exception and Monitor Control Register */
+} CoreDebug_Type;
+
+/* Debug Halting Control and Status Register Definitions */
+#define CoreDebug_DHCSR_DBGKEY_Pos         16U                                            /*!< CoreDebug DHCSR: DBGKEY Position */
+#define CoreDebug_DHCSR_DBGKEY_Msk         (0xFFFFUL << CoreDebug_DHCSR_DBGKEY_Pos)       /*!< CoreDebug DHCSR: DBGKEY Mask */
+
+#define CoreDebug_DHCSR_S_RESET_ST_Pos     25U                                            /*!< CoreDebug DHCSR: S_RESET_ST Position */
+#define CoreDebug_DHCSR_S_RESET_ST_Msk     (1UL << CoreDebug_DHCSR_S_RESET_ST_Pos)        /*!< CoreDebug DHCSR: S_RESET_ST Mask */
+
+#define CoreDebug_DHCSR_S_RETIRE_ST_Pos    24U                                            /*!< CoreDebug DHCSR: S_RETIRE_ST Position */
+#define CoreDebug_DHCSR_S_RETIRE_ST_Msk    (1UL << CoreDebug_DHCSR_S_RETIRE_ST_Pos)       /*!< CoreDebug DHCSR: S_RETIRE_ST Mask */
+
+#define CoreDebug_DHCSR_S_LOCKUP_Pos       19U                                            /*!< CoreDebug DHCSR: S_LOCKUP Position */
+#define CoreDebug_DHCSR_S_LOCKUP_Msk       (1UL << CoreDebug_DHCSR_S_LOCKUP_Pos)          /*!< CoreDebug DHCSR: S_LOCKUP Mask */
+
+#define CoreDebug_DHCSR_S_SLEEP_Pos        18U                                            /*!< CoreDebug DHCSR: S_SLEEP Position */
+#define CoreDebug_DHCSR_S_SLEEP_Msk        (1UL << CoreDebug_DHCSR_S_SLEEP_Pos)           /*!< CoreDebug DHCSR: S_SLEEP Mask */
+
+#define CoreDebug_DHCSR_S_HALT_Pos         17U                                            /*!< CoreDebug DHCSR: S_HALT Position */
+#define CoreDebug_DHCSR_S_HALT_Msk         (1UL << CoreDebug_DHCSR_S_HALT_Pos)            /*!< CoreDebug DHCSR: S_HALT Mask */
+
+#define CoreDebug_DHCSR_S_REGRDY_Pos       16U                                            /*!< CoreDebug DHCSR: S_REGRDY Position */
+#define CoreDebug_DHCSR_S_REGRDY_Msk       (1UL << CoreDebug_DHCSR_S_REGRDY_Pos)          /*!< CoreDebug DHCSR: S_REGRDY Mask */
+
+#define CoreDebug_DHCSR_C_SNAPSTALL_Pos     5U                                            /*!< CoreDebug DHCSR: C_SNAPSTALL Position */
+#define CoreDebug_DHCSR_C_SNAPSTALL_Msk    (1UL << CoreDebug_DHCSR_C_SNAPSTALL_Pos)       /*!< CoreDebug DHCSR: C_SNAPSTALL Mask */
+
+#define CoreDebug_DHCSR_C_MASKINTS_Pos      3U                                            /*!< CoreDebug DHCSR: C_MASKINTS Position */
+#define CoreDebug_DHCSR_C_MASKINTS_Msk     (1UL << CoreDebug_DHCSR_C_MASKINTS_Pos)        /*!< CoreDebug DHCSR: C_MASKINTS Mask */
+
+#define CoreDebug_DHCSR_C_STEP_Pos          2U                                            /*!< CoreDebug DHCSR: C_STEP Position */
+#define CoreDebug_DHCSR_C_STEP_Msk         (1UL << CoreDebug_DHCSR_C_STEP_Pos)            /*!< CoreDebug DHCSR: C_STEP Mask */
+
+#define CoreDebug_DHCSR_C_HALT_Pos          1U                                            /*!< CoreDebug DHCSR: C_HALT Position */
+#define CoreDebug_DHCSR_C_HALT_Msk         (1UL << CoreDebug_DHCSR_C_HALT_Pos)            /*!< CoreDebug DHCSR: C_HALT Mask */
+
+#define CoreDebug_DHCSR_C_DEBUGEN_Pos       0U                                            /*!< CoreDebug DHCSR: C_DEBUGEN Position */
+#define CoreDebug_DHCSR_C_DEBUGEN_Msk      (1UL /*<< CoreDebug_DHCSR_C_DEBUGEN_Pos*/)     /*!< CoreDebug DHCSR: C_DEBUGEN Mask */
+
+/* Debug Core Register Selector Register Definitions */
+#define CoreDebug_DCRSR_REGWnR_Pos         16U                                            /*!< CoreDebug DCRSR: REGWnR Position */
+#define CoreDebug_DCRSR_REGWnR_Msk         (1UL << CoreDebug_DCRSR_REGWnR_Pos)            /*!< CoreDebug DCRSR: REGWnR Mask */
+
+#define CoreDebug_DCRSR_REGSEL_Pos          0U                                            /*!< CoreDebug DCRSR: REGSEL Position */
+#define CoreDebug_DCRSR_REGSEL_Msk         (0x1FUL /*<< CoreDebug_DCRSR_REGSEL_Pos*/)     /*!< CoreDebug DCRSR: REGSEL Mask */
+
+/* Debug Exception and Monitor Control Register Definitions */
+#define CoreDebug_DEMCR_TRCENA_Pos         24U                                            /*!< CoreDebug DEMCR: TRCENA Position */
+#define CoreDebug_DEMCR_TRCENA_Msk         (1UL << CoreDebug_DEMCR_TRCENA_Pos)            /*!< CoreDebug DEMCR: TRCENA Mask */
+
+#define CoreDebug_DEMCR_MON_REQ_Pos        19U                                            /*!< CoreDebug DEMCR: MON_REQ Position */
+#define CoreDebug_DEMCR_MON_REQ_Msk        (1UL << CoreDebug_DEMCR_MON_REQ_Pos)           /*!< CoreDebug DEMCR: MON_REQ Mask */
+
+#define CoreDebug_DEMCR_MON_STEP_Pos       18U                                            /*!< CoreDebug DEMCR: MON_STEP Position */
+#define CoreDebug_DEMCR_MON_STEP_Msk       (1UL << CoreDebug_DEMCR_MON_STEP_Pos)          /*!< CoreDebug DEMCR: MON_STEP Mask */
+
+#define CoreDebug_DEMCR_MON_PEND_Pos       17U                                            /*!< CoreDebug DEMCR: MON_PEND Position */
+#define CoreDebug_DEMCR_MON_PEND_Msk       (1UL << CoreDebug_DEMCR_MON_PEND_Pos)          /*!< CoreDebug DEMCR: MON_PEND Mask */
+
+#define CoreDebug_DEMCR_MON_EN_Pos         16U                                            /*!< CoreDebug DEMCR: MON_EN Position */
+#define CoreDebug_DEMCR_MON_EN_Msk         (1UL << CoreDebug_DEMCR_MON_EN_Pos)            /*!< CoreDebug DEMCR: MON_EN Mask */
+
+#define CoreDebug_DEMCR_VC_HARDERR_Pos     10U                                            /*!< CoreDebug DEMCR: VC_HARDERR Position */
+#define CoreDebug_DEMCR_VC_HARDERR_Msk     (1UL << CoreDebug_DEMCR_VC_HARDERR_Pos)        /*!< CoreDebug DEMCR: VC_HARDERR Mask */
+
+#define CoreDebug_DEMCR_VC_INTERR_Pos       9U                                            /*!< CoreDebug DEMCR: VC_INTERR Position */
+#define CoreDebug_DEMCR_VC_INTERR_Msk      (1UL << CoreDebug_DEMCR_VC_INTERR_Pos)         /*!< CoreDebug DEMCR: VC_INTERR Mask */
+
+#define CoreDebug_DEMCR_VC_BUSERR_Pos       8U                                            /*!< CoreDebug DEMCR: VC_BUSERR Position */
+#define CoreDebug_DEMCR_VC_BUSERR_Msk      (1UL << CoreDebug_DEMCR_VC_BUSERR_Pos)         /*!< CoreDebug DEMCR: VC_BUSERR Mask */
+
+#define CoreDebug_DEMCR_VC_STATERR_Pos      7U                                            /*!< CoreDebug DEMCR: VC_STATERR Position */
+#define CoreDebug_DEMCR_VC_STATERR_Msk     (1UL << CoreDebug_DEMCR_VC_STATERR_Pos)        /*!< CoreDebug DEMCR: VC_STATERR Mask */
+
+#define CoreDebug_DEMCR_VC_CHKERR_Pos       6U                                            /*!< CoreDebug DEMCR: VC_CHKERR Position */
+#define CoreDebug_DEMCR_VC_CHKERR_Msk      (1UL << CoreDebug_DEMCR_VC_CHKERR_Pos)         /*!< CoreDebug DEMCR: VC_CHKERR Mask */
+
+#define CoreDebug_DEMCR_VC_NOCPERR_Pos      5U                                            /*!< CoreDebug DEMCR: VC_NOCPERR Position */
+#define CoreDebug_DEMCR_VC_NOCPERR_Msk     (1UL << CoreDebug_DEMCR_VC_NOCPERR_Pos)        /*!< CoreDebug DEMCR: VC_NOCPERR Mask */
+
+#define CoreDebug_DEMCR_VC_MMERR_Pos        4U                                            /*!< CoreDebug DEMCR: VC_MMERR Position */
+#define CoreDebug_DEMCR_VC_MMERR_Msk       (1UL << CoreDebug_DEMCR_VC_MMERR_Pos)          /*!< CoreDebug DEMCR: VC_MMERR Mask */
+
+#define CoreDebug_DEMCR_VC_CORERESET_Pos    0U                                            /*!< CoreDebug DEMCR: VC_CORERESET Position */
+#define CoreDebug_DEMCR_VC_CORERESET_Msk   (1UL /*<< CoreDebug_DEMCR_VC_CORERESET_Pos*/)  /*!< CoreDebug DEMCR: VC_CORERESET Mask */
+
+/*@} end of group CMSIS_CoreDebug */
+
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_core_bitfield     Core register bit field macros
+  \brief      Macros for use with bit field definitions (xxx_Pos, xxx_Msk).
+  @{
+ */
+
+/**
+  \brief   Mask and shift a bit field value for use in a register bit range.
+  \param[in] field  Name of the register bit field.
+  \param[in] value  Value of the bit field. This parameter is interpreted as an uint32_t type.
+  \return           Masked and shifted value.
+*/
+#define _VAL2FLD(field, value)    (((uint32_t)(value) << field ## _Pos) & field ## _Msk)
+
+/**
+  \brief     Mask and shift a register value to extract a bit field value.
+  \param[in] field  Name of the register bit field.
+  \param[in] value  Value of register. This parameter is interpreted as an uint32_t type.
+  \return           Masked and shifted bit field value.
+*/
+#define _FLD2VAL(field, value)    (((uint32_t)(value) & field ## _Msk) >> field ## _Pos)
+
+/*@} end of group CMSIS_core_bitfield */
+
+
+/**
+  \ingroup    CMSIS_core_register
+  \defgroup   CMSIS_core_base     Core Definitions
+  \brief      Definitions for base addresses, unions, and structures.
+  @{
+ */
+
+/* Memory mapping of Core Hardware */
+#define SCS_BASE            (0xE000E000UL)                            /*!< System Control Space Base Address */
+#define ITM_BASE            (0xE0000000UL)                            /*!< ITM Base Address */
+#define DWT_BASE            (0xE0001000UL)                            /*!< DWT Base Address */
+#define TPI_BASE            (0xE0040000UL)                            /*!< TPI Base Address */
+#define CoreDebug_BASE      (0xE000EDF0UL)                            /*!< Core Debug Base Address */
+#define SysTick_BASE        (SCS_BASE +  0x0010UL)                    /*!< SysTick Base Address */
+#define NVIC_BASE           (SCS_BASE +  0x0100UL)                    /*!< NVIC Base Address */
+#define SCB_BASE            (SCS_BASE +  0x0D00UL)                    /*!< System Control Block Base Address */
+
+#define SCnSCB              ((SCnSCB_Type    *)     SCS_BASE      )   /*!< System control Register not in SCB */
+#define SCB                 ((SCB_Type       *)     SCB_BASE      )   /*!< SCB configuration struct */
+#define SysTick             ((SysTick_Type   *)     SysTick_BASE  )   /*!< SysTick configuration struct */
+#define NVIC                ((NVIC_Type      *)     NVIC_BASE     )   /*!< NVIC configuration struct */
+#define ITM                 ((ITM_Type       *)     ITM_BASE      )   /*!< ITM configuration struct */
+#define DWT                 ((DWT_Type       *)     DWT_BASE      )   /*!< DWT configuration struct */
+#define TPI                 ((TPI_Type       *)     TPI_BASE      )   /*!< TPI configuration struct */
+#define CoreDebug           ((CoreDebug_Type *)     CoreDebug_BASE)   /*!< Core Debug configuration struct */
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+  #define MPU_BASE          (SCS_BASE +  0x0D90UL)                    /*!< Memory Protection Unit */
+  #define MPU               ((MPU_Type       *)     MPU_BASE      )   /*!< Memory Protection Unit */
+#endif
+
+#define FPU_BASE            (SCS_BASE +  0x0F30UL)                    /*!< Floating Point Unit */
+#define FPU                 ((FPU_Type       *)     FPU_BASE      )   /*!< Floating Point Unit */
+
+/*@} */
+
+
+
+/*******************************************************************************
+ *                Hardware Abstraction Layer
+  Core Function Interface contains:
+  - Core NVIC Functions
+  - Core SysTick Functions
+  - Core Debug Functions
+  - Core Register Access Functions
+ ******************************************************************************/
+/**
+  \defgroup CMSIS_Core_FunctionInterface Functions and Instructions Reference
+*/
+
+
+
+/* ##########################   NVIC functions  #################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_NVICFunctions NVIC Functions
+  \brief    Functions that manage interrupts and exceptions via the NVIC.
+  @{
+ */
+
+#ifdef CMSIS_NVIC_VIRTUAL
+  #ifndef CMSIS_NVIC_VIRTUAL_HEADER_FILE
+    #define CMSIS_NVIC_VIRTUAL_HEADER_FILE "cmsis_nvic_virtual.h"
+  #endif
+  #include CMSIS_NVIC_VIRTUAL_HEADER_FILE
+#else
+  #define NVIC_SetPriorityGrouping    __NVIC_SetPriorityGrouping
+  #define NVIC_GetPriorityGrouping    __NVIC_GetPriorityGrouping
+  #define NVIC_EnableIRQ              __NVIC_EnableIRQ
+  #define NVIC_GetEnableIRQ           __NVIC_GetEnableIRQ
+  #define NVIC_DisableIRQ             __NVIC_DisableIRQ
+  #define NVIC_GetPendingIRQ          __NVIC_GetPendingIRQ
+  #define NVIC_SetPendingIRQ          __NVIC_SetPendingIRQ
+  #define NVIC_ClearPendingIRQ        __NVIC_ClearPendingIRQ
+  #define NVIC_GetActive              __NVIC_GetActive
+  #define NVIC_SetPriority            __NVIC_SetPriority
+  #define NVIC_GetPriority            __NVIC_GetPriority
+  #define NVIC_SystemReset            __NVIC_SystemReset
+#endif /* CMSIS_NVIC_VIRTUAL */
+
+#ifdef CMSIS_VECTAB_VIRTUAL
+  #ifndef CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+    #define CMSIS_VECTAB_VIRTUAL_HEADER_FILE "cmsis_vectab_virtual.h"
+  #endif
+  #include CMSIS_VECTAB_VIRTUAL_HEADER_FILE
+#else
+  #define NVIC_SetVector              __NVIC_SetVector
+  #define NVIC_GetVector              __NVIC_GetVector
+#endif  /* (CMSIS_VECTAB_VIRTUAL) */
+
+#define NVIC_USER_IRQ_OFFSET          16
+
+
+/* The following EXC_RETURN values are saved to the LR on exception entry */
+#define EXC_RETURN_HANDLER         (0xFFFFFFF1UL)     /* return to Handler mode, uses MSP after return                               */
+#define EXC_RETURN_THREAD_MSP      (0xFFFFFFF9UL)     /* return to Thread mode, uses MSP after return                                */
+#define EXC_RETURN_THREAD_PSP      (0xFFFFFFFDUL)     /* return to Thread mode, uses PSP after return                                */
+#define EXC_RETURN_HANDLER_FPU     (0xFFFFFFE1UL)     /* return to Handler mode, uses MSP after return, restore floating-point state */
+#define EXC_RETURN_THREAD_MSP_FPU  (0xFFFFFFE9UL)     /* return to Thread mode, uses MSP after return, restore floating-point state  */
+#define EXC_RETURN_THREAD_PSP_FPU  (0xFFFFFFEDUL)     /* return to Thread mode, uses PSP after return, restore floating-point state  */
+
+
+/**
+  \brief   Set Priority Grouping
+  \details Writes PriorityGroup into the PRIGROUP field (bits [10:8]) of SCB->AIRCR.
+           The write carries the key 0x5FA in the VECTKEY field (the unlock sequence).
+           Only the three least significant bits of PriorityGroup are used (0..7).
+           All other AIRCR bits are written back exactly as they were read.
+           The update is a read-modify-write: one read of AIRCR, then one write.
+  \param [in]      PriorityGroup  Priority grouping field.
+ */
+__STATIC_INLINE void __NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
+{
+  const uint32_t group    = (PriorityGroup & (uint32_t)0x07UL);                         /* keep bits [2:0] only   */
+  const uint32_t replaced = (uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk); /* fields being rewritten */
+  const uint32_t key      = ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos);               /* write key              */
+  uint32_t       aircr;
+
+  aircr  = SCB->AIRCR;                                                                  /* current configuration  */
+  aircr &= ~replaced;
+  aircr |= (key | (group << SCB_AIRCR_PRIGROUP_Pos));                                   /* key and new grouping   */
+  SCB->AIRCR = aircr;
+}
+
+
+/**
+  \brief   Get Priority Grouping
+  \details Reads the priority grouping field from the SCB->AIRCR register.
+  \return                Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field).
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriorityGrouping(void)
+{
+  return ((uint32_t)((SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos));
+}
+
+
+/**
+  \brief   Enable Interrupt
+  \details Enables a device specific interrupt in the NVIC interrupt controller.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative; a negative IRQn is ignored (no register is written).
+ */
+__STATIC_INLINE void __NVIC_EnableIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    __COMPILER_BARRIER();
+    NVIC->ISER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); /* word IRQn/32, bit IRQn%32 */
+    __COMPILER_BARRIER();
+  }
+}
+
+
+/**
+  \brief   Get Interrupt Enable status
+  \details Tests the enable bit of a device specific interrupt in the NVIC->ISER register bank.
+  \param [in]      IRQn  Device specific interrupt number.
+  \return             0  Interrupt is not enabled (also returned for a negative IRQn).
+  \return             1  Interrupt is enabled.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetEnableIRQ(IRQn_Type IRQn)
+{
+  uint32_t enabled = 0U;                                       /* result for a negative IRQn */
+  if ((int32_t)(IRQn) >= 0)
+  {
+    const uint32_t irq  = (uint32_t)IRQn;
+    const uint32_t mask = (uint32_t)(1UL << (irq & 0x1FUL));   /* bit inside the 32-bit word */
+    enabled = ((NVIC->ISER[irq >> 5UL] & mask) != 0UL) ? 1U : 0U;
+  }
+  return (enabled);
+}
+
+
+/**
+  \brief   Disable Interrupt
+  \details Disables a device specific interrupt in the NVIC interrupt controller.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative; a negative IRQn is ignored (no register is written).
+ */
+__STATIC_INLINE void __NVIC_DisableIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    NVIC->ICER[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); /* word IRQn/32, bit IRQn%32 */
+    __DSB();                                                                                /* both barriers are issued after the ICER write */
+    __ISB();
+  }
+}
+
+
+/**
+  \brief   Get Pending Interrupt
+  \details Tests the pending bit of a device specific interrupt in the NVIC->ISPR register bank.
+  \param [in]      IRQn  Device specific interrupt number.
+  \return             0  Interrupt status is not pending (also returned for a negative IRQn).
+  \return             1  Interrupt status is pending.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPendingIRQ(IRQn_Type IRQn)
+{
+  uint32_t pending = 0U;                                       /* result for a negative IRQn */
+  if ((int32_t)(IRQn) >= 0)
+  {
+    const uint32_t irq  = (uint32_t)IRQn;
+    const uint32_t mask = (uint32_t)(1UL << (irq & 0x1FUL));   /* bit inside the 32-bit word */
+    pending = ((NVIC->ISPR[irq >> 5UL] & mask) != 0UL) ? 1U : 0U;
+  }
+  return (pending);
+}
+
+
+/**
+  \brief   Set Pending Interrupt
+  \details Sets the pending bit of a device specific interrupt in the NVIC pending register.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative; a negative IRQn is ignored (no register is written).
+ */
+__STATIC_INLINE void __NVIC_SetPendingIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    NVIC->ISPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); /* word IRQn/32, bit IRQn%32 */
+  }
+}
+
+
+/**
+  \brief   Clear Pending Interrupt
+  \details Clears the pending bit of a device specific interrupt in the NVIC pending register.
+  \param [in]      IRQn  Device specific interrupt number.
+  \note    IRQn must not be negative; a negative IRQn is ignored (no register is written).
+ */
+__STATIC_INLINE void __NVIC_ClearPendingIRQ(IRQn_Type IRQn)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    NVIC->ICPR[(((uint32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)IRQn) & 0x1FUL)); /* word IRQn/32, bit IRQn%32 */
+  }
+}
+
+
+/**
+  \brief   Get Active Interrupt
+  \details Tests the active bit of a device specific interrupt in the NVIC->IABR register bank.
+  \param [in]      IRQn  Device specific interrupt number.
+  \return             0  Interrupt status is not active (also returned for a negative IRQn).
+  \return             1  Interrupt status is active.
+  \note    IRQn must not be negative.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetActive(IRQn_Type IRQn)
+{
+  uint32_t active = 0U;                                        /* result for a negative IRQn */
+  if ((int32_t)(IRQn) >= 0)
+  {
+    const uint32_t irq  = (uint32_t)IRQn;
+    const uint32_t mask = (uint32_t)(1UL << (irq & 0x1FUL));   /* bit inside the 32-bit word */
+    active = ((NVIC->IABR[irq >> 5UL] & mask) != 0UL) ? 1U : 0U;
+  }
+  return (active);
+}
+
+
+/**
+  \brief   Set Interrupt Priority
+  \details Sets the priority of a device specific interrupt or a processor exception.
+           A non-negative IRQn selects a device specific interrupt (stored in NVIC->IP[IRQn]),
+           a negative IRQn selects a processor exception (stored in SCB->SHP[(IRQn & 0xF) - 4]).
+  \param [in]      IRQn  Interrupt number.
+  \param [in]  priority  Priority to set; shifted left by (8 - __NVIC_PRIO_BITS) and truncated to 8 bits.
+  \note    The priority cannot be set for every processor exception.
+ */
+__STATIC_INLINE void __NVIC_SetPriority(IRQn_Type IRQn, uint32_t priority)
+{
+  if ((int32_t)(IRQn) >= 0)
+  {
+    NVIC->IP[((uint32_t)IRQn)]               = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+  }
+  else
+  {
+    SCB->SHP[(((uint32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8U - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL);
+  }
+}
+
+
+/**
+  \brief   Get Interrupt Priority
+  \details Reads the priority of a device specific interrupt or a processor exception.
+           A non-negative IRQn selects a device specific interrupt (read from NVIC->IP[IRQn]),
+           a negative IRQn selects a processor exception (read from SCB->SHP[(IRQn & 0xF) - 4]).
+  \param [in]   IRQn  Interrupt number.
+  \return             Interrupt Priority.
+                      The stored byte is shifted right by (8 - __NVIC_PRIO_BITS) to align it to the implemented priority bits.
+ */
+__STATIC_INLINE uint32_t __NVIC_GetPriority(IRQn_Type IRQn)
+{
+
+  if ((int32_t)(IRQn) >= 0)
+  {
+    return(((uint32_t)NVIC->IP[((uint32_t)IRQn)]               >> (8U - __NVIC_PRIO_BITS)));
+  }
+  else
+  {
+    return(((uint32_t)SCB->SHP[(((uint32_t)IRQn) & 0xFUL)-4UL] >> (8U - __NVIC_PRIO_BITS)));
+  }
+}
+
+
+/**
+  \brief   Encode Priority
+  \details Packs a preemptive priority value and a subpriority value into one
+           priority value according to the given priority group.
+           The preemptive field is limited to min(7 - PriorityGroup, __NVIC_PRIO_BITS) bits;
+           the subpriority field gets (PriorityGroup - 7 + __NVIC_PRIO_BITS) bits, or none if that is negative.
+  \param [in]     PriorityGroup  Used priority group (only the low 3 bits are used).
+  \param [in]   PreemptPriority  Preemptive priority value (starting from 0).
+  \param [in]       SubPriority  Subpriority value (starting from 0).
+  \return                        Encoded priority. Value can be used in the function \ref NVIC_SetPriority().
+ */
+__STATIC_INLINE uint32_t NVIC_EncodePriority (uint32_t PriorityGroup, uint32_t PreemptPriority, uint32_t SubPriority)
+{
+  const uint32_t prioBits    = (uint32_t)(__NVIC_PRIO_BITS);
+  const uint32_t group       = (PriorityGroup & (uint32_t)0x07UL);   /* only values 0..7 are used */
+  const uint32_t available   = (uint32_t)(7UL - group);               /* bits left for preemption  */
+  uint32_t       preemptBits = prioBits;
+  uint32_t       subBits     = (uint32_t)0UL;
+
+  if (available <= prioBits)               { preemptBits = available; }
+  if ((group + prioBits) >= (uint32_t)7UL) { subBits = (uint32_t)((group - 7UL) + prioBits); }
+
+  return (((PreemptPriority & (uint32_t)((1UL << preemptBits) - 1UL)) << subBits) |
+          ((SubPriority     & (uint32_t)((1UL << subBits)     - 1UL))));
+}
+
+
+/**
+  \brief   Decode Priority
+  \details Splits an interrupt priority value into its preemptive priority value and
+           subpriority value according to the given priority group.
+           The preemptive field has min(7 - PriorityGroup, __NVIC_PRIO_BITS) bits; the subpriority
+           field has (PriorityGroup - 7 + __NVIC_PRIO_BITS) bits, or none if that is negative.
+  \param [in]         Priority   Priority value, which can be retrieved with the function \ref NVIC_GetPriority().
+  \param [in]     PriorityGroup  Used priority group (only the low 3 bits are used).
+  \param [out] pPreemptPriority  Preemptive priority value (starting from 0).
+  \param [out]     pSubPriority  Subpriority value (starting from 0).
+ */
+__STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGroup, uint32_t* const pPreemptPriority, uint32_t* const pSubPriority)
+{
+  const uint32_t prioBits    = (uint32_t)(__NVIC_PRIO_BITS);
+  const uint32_t group       = (PriorityGroup & (uint32_t)0x07UL);   /* only values 0..7 are used */
+  uint32_t       preemptBits = ((7UL - group) > prioBits) ? prioBits : (uint32_t)(7UL - group);
+  uint32_t       subBits     = (uint32_t)0UL;
+
+  if ((group + prioBits) >= (uint32_t)7UL) { subBits = (uint32_t)((group - 7UL) + prioBits); }
+
+  *pPreemptPriority = (Priority >> subBits) & (uint32_t)((1UL << preemptBits) - 1UL);
+  *pSubPriority     = (Priority           ) & (uint32_t)((1UL << subBits)     - 1UL);
+}
+
+
+/**
+  \brief   Set Interrupt Vector
+  \details Sets an interrupt vector in SRAM based interrupt vector table.
+           The interrupt number can be positive to specify a device specific interrupt,
+           or negative to specify a processor exception.
+           VTOR must have been relocated to SRAM before.
+  \param [in]   IRQn      Interrupt number
+  \param [in]   vector    Address of interrupt handler function
+ */
+__STATIC_INLINE void __NVIC_SetVector(IRQn_Type IRQn, uint32_t vector)
+{
+  uint32_t *vectors = (uint32_t *)SCB->VTOR;                /* table base is taken from SCB->VTOR */
+  vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET] = vector;   /* table index = IRQn + NVIC_USER_IRQ_OFFSET */
+  /* ARM Application Note 321 states that the M4 does not require the architectural barrier */
+}
+
+
+/**
+  \brief   Get Interrupt Vector
+  \details Reads an interrupt vector from the interrupt vector table located at SCB->VTOR.
+           The interrupt number can be positive to specify a device specific interrupt,
+           or negative to specify a processor exception.
+  \param [in]   IRQn      Interrupt number.
+  \return                 Address of interrupt handler function
+ */
+__STATIC_INLINE uint32_t __NVIC_GetVector(IRQn_Type IRQn)
+{
+  uint32_t *vectors = (uint32_t *)SCB->VTOR;                /* table base is taken from SCB->VTOR */
+  return vectors[(int32_t)IRQn + NVIC_USER_IRQ_OFFSET];     /* table index = IRQn + NVIC_USER_IRQ_OFFSET */
+}
+
+
+/**
+  \brief   System Reset
+  \details Initiates a system reset request to reset the MCU, then spins in an endless loop; this function does not return.
+ */
+__NO_RETURN __STATIC_INLINE void __NVIC_SystemReset(void)
+{
+  __DSB();                                                          /* Ensure all outstanding memory accesses, including
+                                                                       buffered writes, are completed before reset */
+  SCB->AIRCR  = (uint32_t)((0x5FAUL << SCB_AIRCR_VECTKEY_Pos)    |
+                           (SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) |
+                            SCB_AIRCR_SYSRESETREQ_Msk    );         /* Keep priority group unchanged */
+  __DSB();                                                          /* Ensure completion of memory access */
+
+  for(;;)                                                           /* wait until reset */
+  {
+    __NOP();
+  }
+}
+
+/*@} end of CMSIS_Core_NVICFunctions */
+
+
+/* ##########################  MPU functions  #################################### */
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1U)
+
+#include "mpu_armv7.h"
+
+#endif
+
+
+/* ##########################  FPU functions  #################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_FpuFunctions FPU Functions
+  \brief    Function that provides FPU type.
+  @{
+ */
+
+/**
+  \brief   get FPU type
+  \details Reads FPU->MVFR0 and classifies the FPU from its single/double precision fields.
+  \returns
+   - \b  0: No FPU
+   - \b  1: Single precision FPU
+   - \b  2: Double + Single precision FPU (never returned by this implementation)
+ */
+__STATIC_INLINE uint32_t SCB_GetFPUType(void)
+{
+  const uint32_t precisionMask = (FPU_MVFR0_Single_precision_Msk | FPU_MVFR0_Double_precision_Msk);
+  const uint32_t mvfr0         = FPU->MVFR0;
+  uint32_t       fpuType       = 0U;           /* No FPU */
+
+  /* Only this one precision-field pattern is reported as an FPU. */
+  if ((mvfr0 & precisionMask) == 0x020U)
+  {
+    fpuType = 1U;                              /* Single precision FPU */
+  }
+
+  return fpuType;
+}
+
+
+/*@} end of CMSIS_Core_FpuFunctions */
+
+
+
+/* ##################################    SysTick function  ############################################ */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_Core_SysTickFunctions SysTick Functions
+  \brief    Functions that configure the System.
+  @{
+ */
+
+#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U)
+
+/**
+  \brief   System Tick Configuration
+  \details Initializes the System Timer and its interrupt, and starts the System Tick Timer.
+           Counter is in free running mode to generate periodic interrupts.
+  \param [in]  ticks  Number of ticks between two interrupts (the reload value written is ticks - 1).
+  \return          0  Function succeeded.
+  \return          1  Function failed: (ticks - 1) exceeds SysTick_LOAD_RELOAD_Msk; no register is modified.
+  \note    When the variable <b>__Vendor_SysTickConfig</b> is set to 1, then the
+           function <b>SysTick_Config</b> is not included. In this case, the file <b><i>device</i>.h</b>
+           must contain a vendor-specific implementation of this function.
+ */
+__STATIC_INLINE uint32_t SysTick_Config(uint32_t ticks)
+{
+  if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk)
+  {
+    return (1UL);                                                   /* Reload value impossible */
+  }
+
+  SysTick->LOAD  = (uint32_t)(ticks - 1UL);                         /* set reload register */
+  NVIC_SetPriority (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for Systick Interrupt */
+  SysTick->VAL   = 0UL;                                             /* Load the SysTick Counter Value */
+  SysTick->CTRL  = SysTick_CTRL_CLKSOURCE_Msk |
+                   SysTick_CTRL_TICKINT_Msk   |
+                   SysTick_CTRL_ENABLE_Msk;                         /* Enable SysTick IRQ and SysTick Timer */
+  return (0UL);                                                     /* Function successful */
+}
+
+#endif
+
+/*@} end of CMSIS_Core_SysTickFunctions */
+
+
+
+/* ##################################### Debug In/Output function ########################################### */
+/**
+  \ingroup  CMSIS_Core_FunctionInterface
+  \defgroup CMSIS_core_DebugFunctions ITM Functions
+  \brief    Functions that access the ITM debug interface.
+  @{
+ */
+
+extern volatile int32_t ITM_RxBuffer;                              /*!< External variable to receive characters. */
+#define                 ITM_RXBUFFER_EMPTY  ((int32_t)0x5AA55AA5U) /*!< Value identifying \ref ITM_RxBuffer is ready for next character. */
+
+
+/**
+  \brief   ITM Send Character
+  \details Transmits a character via the ITM channel 0, and
+           \li Just returns when no debugger is connected that has booked the output.
+           \li Is blocking when a debugger is connected, but the previous character sent has not been transmitted.
+  \param [in]     ch  Character to transmit (only the low 8 bits are written to the port).
+  \returns            The value of ch, unchanged, whether or not it was transmitted.
+ */
+__STATIC_INLINE uint32_t ITM_SendChar (uint32_t ch)
+{
+  if (((ITM->TCR & ITM_TCR_ITMENA_Msk) != 0UL) &&      /* ITM enabled */
+      ((ITM->TER & 1UL               ) != 0UL)   )     /* ITM Port #0 enabled */
+  {
+    while (ITM->PORT[0U].u32 == 0UL)                   /* busy-wait while port 0 reads as zero */
+    {
+      __NOP();
+    }
+    ITM->PORT[0U].u8 = (uint8_t)ch;                    /* byte-wide write of the character */
+  }
+  return (ch);
+}
+
+
+/**
+  \brief   ITM Receive Character
+  \details Takes a character from the external variable \ref ITM_RxBuffer and marks the buffer empty again.
+  \return             Received character.
+  \return         -1  No character pending.
+ */
+__STATIC_INLINE int32_t ITM_ReceiveChar (void)
+{
+  int32_t received;
+
+  if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY)
+  {
+    return (-1);                             /* no character available */
+  }
+  received     = ITM_RxBuffer;
+  ITM_RxBuffer = ITM_RXBUFFER_EMPTY;         /* ready for next character */
+  return (received);
+}
+
+
+/**
+  \brief   ITM Check Character
+  \details Checks whether a character is pending for reading in the variable \ref ITM_RxBuffer,
+           i.e. whether it differs from \ref ITM_RXBUFFER_EMPTY. The buffer is not modified.
+  \return          0  No character available.
+  \return          1  Character available.
+ */
+__STATIC_INLINE int32_t ITM_CheckChar (void)
+{
+  int32_t available = 1;                     /*    character available */
+
+  if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY)
+  {
+    available = 0;                           /* no character available */
+  }
+
+  return (available);
+}
+
+/*@} end of CMSIS_core_DebugFunctions */
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CM4_H_DEPENDANT */
+
+#endif /* __CMSIS_GENERIC */
diff --git a/common/mps2/memory_zones.h b/common/mps2/memory_zones.h
new file mode 100644
index 0000000..432d393
--- /dev/null
+++ b/common/mps2/memory_zones.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017-2019 ARM Limited
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing software
+ * distributed under the License is distributed on an "AS IS" BASIS
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file contains the information of memory zones for code and data on
+ * ARM Versatile Express Cortex-M Prototyping Systems (V2M-MPS2) TRM.
+ * It is used in startup code and linker scripts of supported compilers (ARM and
+ * GCC_ARM).
+ *
+ * WARNING: IAR does not include this file and re-defines these values in the
+ * MPS2.icf file. Please make sure that the two files share the same values.
+ *
+ * These memory zones are defined in section 4.2 of ARM V2M-MPS2 RTL and
+ * Fast Model Reference Guide.
+ */
+
+#ifndef MEMORY_ZONES_H
+#define MEMORY_ZONES_H
+
+/*
+ * Code memory zones
+ * Please note that MPS2 on Fast Models does not implement persistent flash memory.
+ * The FLASH memory can be simulated via the 4MB ZBT_SRAM1 block,
+ * only to keep the same name as in the CMSDK RTL and Fast Models Reference
+ * Guide.
+ */
+#define ZBT_SRAM1_START  0x00000000
+#define ZBT_SRAM1_SIZE   0x00400000 /* 4 MiB */
+
+/* Data memory zones */
+#define ZBT_SRAM2_START  0x20000000
+#define ZBT_SRAM2_SIZE   0x00400000 /* 4 MiB */
+
+#endif /* MEMORY_ZONES_H */
+
diff --git a/common/mps2/mpu_armv7.h b/common/mps2/mpu_armv7.h
new file mode 100644
index 0000000..791a8da
--- /dev/null
+++ b/common/mps2/mpu_armv7.h
@@ -0,0 +1,275 @@
+/******************************************************************************
+ * @file     mpu_armv7.h
+ * @brief    CMSIS MPU API for Armv7-M MPU
+ * @version  V5.1.1
+ * @date     10. February 2020
+ ******************************************************************************/
+/*
+ * Copyright (c) 2017-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+#if   defined ( __ICCARM__ )
+  #pragma system_include         /* treat file as system include file for MISRA check */
+#elif defined (__clang__)
+  #pragma clang system_header    /* treat file as system include file */
+#endif
+ 
+#ifndef ARM_MPU_ARMV7_H
+#define ARM_MPU_ARMV7_H
+
+#define ARM_MPU_REGION_SIZE_32B      ((uint8_t)0x04U) ///!< MPU Region Size 32 Bytes
+#define ARM_MPU_REGION_SIZE_64B      ((uint8_t)0x05U) ///!< MPU Region Size 64 Bytes
+#define ARM_MPU_REGION_SIZE_128B     ((uint8_t)0x06U) ///!< MPU Region Size 128 Bytes
+#define ARM_MPU_REGION_SIZE_256B     ((uint8_t)0x07U) ///!< MPU Region Size 256 Bytes
+#define ARM_MPU_REGION_SIZE_512B     ((uint8_t)0x08U) ///!< MPU Region Size 512 Bytes
+#define ARM_MPU_REGION_SIZE_1KB      ((uint8_t)0x09U) ///!< MPU Region Size 1 KByte
+#define ARM_MPU_REGION_SIZE_2KB      ((uint8_t)0x0AU) ///!< MPU Region Size 2 KBytes
+#define ARM_MPU_REGION_SIZE_4KB      ((uint8_t)0x0BU) ///!< MPU Region Size 4 KBytes
+#define ARM_MPU_REGION_SIZE_8KB      ((uint8_t)0x0CU) ///!< MPU Region Size 8 KBytes
+#define ARM_MPU_REGION_SIZE_16KB     ((uint8_t)0x0DU) ///!< MPU Region Size 16 KBytes
+#define ARM_MPU_REGION_SIZE_32KB     ((uint8_t)0x0EU) ///!< MPU Region Size 32 KBytes
+#define ARM_MPU_REGION_SIZE_64KB     ((uint8_t)0x0FU) ///!< MPU Region Size 64 KBytes
+#define ARM_MPU_REGION_SIZE_128KB    ((uint8_t)0x10U) ///!< MPU Region Size 128 KBytes
+#define ARM_MPU_REGION_SIZE_256KB    ((uint8_t)0x11U) ///!< MPU Region Size 256 KBytes
+#define ARM_MPU_REGION_SIZE_512KB    ((uint8_t)0x12U) ///!< MPU Region Size 512 KBytes
+#define ARM_MPU_REGION_SIZE_1MB      ((uint8_t)0x13U) ///!< MPU Region Size 1 MByte
+#define ARM_MPU_REGION_SIZE_2MB      ((uint8_t)0x14U) ///!< MPU Region Size 2 MBytes
+#define ARM_MPU_REGION_SIZE_4MB      ((uint8_t)0x15U) ///!< MPU Region Size 4 MBytes
+#define ARM_MPU_REGION_SIZE_8MB      ((uint8_t)0x16U) ///!< MPU Region Size 8 MBytes
+#define ARM_MPU_REGION_SIZE_16MB     ((uint8_t)0x17U) ///!< MPU Region Size 16 MBytes
+#define ARM_MPU_REGION_SIZE_32MB     ((uint8_t)0x18U) ///!< MPU Region Size 32 MBytes
+#define ARM_MPU_REGION_SIZE_64MB     ((uint8_t)0x19U) ///!< MPU Region Size 64 MBytes
+#define ARM_MPU_REGION_SIZE_128MB    ((uint8_t)0x1AU) ///!< MPU Region Size 128 MBytes
+#define ARM_MPU_REGION_SIZE_256MB    ((uint8_t)0x1BU) ///!< MPU Region Size 256 MBytes
+#define ARM_MPU_REGION_SIZE_512MB    ((uint8_t)0x1CU) ///!< MPU Region Size 512 MBytes
+#define ARM_MPU_REGION_SIZE_1GB      ((uint8_t)0x1DU) ///!< MPU Region Size 1 GByte
+#define ARM_MPU_REGION_SIZE_2GB      ((uint8_t)0x1EU) ///!< MPU Region Size 2 GBytes
+#define ARM_MPU_REGION_SIZE_4GB      ((uint8_t)0x1FU) ///!< MPU Region Size 4 GBytes
+
+#define ARM_MPU_AP_NONE 0U ///!< MPU Access Permission no access
+#define ARM_MPU_AP_PRIV 1U ///!< MPU Access Permission privileged access only
+#define ARM_MPU_AP_URO  2U ///!< MPU Access Permission unprivileged access read-only
+#define ARM_MPU_AP_FULL 3U ///!< MPU Access Permission full access
+#define ARM_MPU_AP_PRO  5U ///!< MPU Access Permission privileged access read-only
+#define ARM_MPU_AP_RO   6U ///!< MPU Access Permission read-only access
+
+/** MPU Region Base Address Register Value
+* Combines the masked base address, the masked region number and MPU_RBAR_VALID_Msk.
+* \param Region The region to be configured, number 0 to 15.
+* \param BaseAddress The base address for the region.
+*/
+#define ARM_MPU_RBAR(Region, BaseAddress) \
+  (((BaseAddress) & MPU_RBAR_ADDR_Msk) |  \
+   ((Region) & MPU_RBAR_REGION_Msk)    |  \
+   (MPU_RBAR_VALID_Msk))
+
+/**
+* MPU Memory Access Attributes
+* 
+* \param TypeExtField      Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral.
+* \param IsShareable       Region is shareable between multiple bus masters.
+* \param IsCacheable       Region is cacheable, i.e. its value may be kept in cache.
+* \param IsBufferable      Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
+*/  
+#define ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable)   \
+  ((((TypeExtField) << MPU_RASR_TEX_Pos) & MPU_RASR_TEX_Msk)                  | \
+   (((IsShareable)  << MPU_RASR_S_Pos)   & MPU_RASR_S_Msk)                    | \
+   (((IsCacheable)  << MPU_RASR_C_Pos)   & MPU_RASR_C_Msk)                    | \
+   (((IsBufferable) << MPU_RASR_B_Pos)   & MPU_RASR_B_Msk))
+
+/**
+* MPU Region Attribute and Size Register Value
+* The resulting value always has MPU_RASR_ENABLE_Msk set.
+* \param DisableExec       Instruction access disable bit, 1= disable instruction fetches.
+* \param AccessPermission  Data access permissions, allows you to configure read/write access for User and Privileged mode.
+* \param AccessAttributes  Memory access attribution, see \ref ARM_MPU_ACCESS_.
+* \param SubRegionDisable  Sub-region disable field.
+* \param Size              Region size of the region to be configured, for example 4K, 8K.
+*/
+#define ARM_MPU_RASR_EX(DisableExec, AccessPermission, AccessAttributes, SubRegionDisable, Size)    \
+  ((((DisableExec)      << MPU_RASR_XN_Pos)   & MPU_RASR_XN_Msk)                                  | \
+   (((AccessPermission) << MPU_RASR_AP_Pos)   & MPU_RASR_AP_Msk)                                  | \
+   (((AccessAttributes) & (MPU_RASR_TEX_Msk | MPU_RASR_S_Msk | MPU_RASR_C_Msk | MPU_RASR_B_Msk))) | \
+   (((SubRegionDisable) << MPU_RASR_SRD_Pos)  & MPU_RASR_SRD_Msk)                                 | \
+   (((Size)             << MPU_RASR_SIZE_Pos) & MPU_RASR_SIZE_Msk)                                | \
+   (((MPU_RASR_ENABLE_Msk))))
+
+/**
+* MPU Region Attribute and Size Register Value
+* 
+* \param DisableExec       Instruction access disable bit, 1= disable instruction fetches.
+* \param AccessPermission  Data access permissions, allows you to configure read/write access for User and Privileged mode.
+* \param TypeExtField      Type extension field, allows you to configure memory access type, for example strongly ordered, peripheral.
+* \param IsShareable       Region is shareable between multiple bus masters.
+* \param IsCacheable       Region is cacheable, i.e. its value may be kept in cache.
+* \param IsBufferable      Region is bufferable, i.e. using write-back caching. Cacheable but non-bufferable regions use write-through policy.
+* \param SubRegionDisable  Sub-region disable field.
+* \param Size              Region size of the region to be configured, for example 4K, 8K.
+*/                         
+#define ARM_MPU_RASR(DisableExec, AccessPermission, TypeExtField, IsShareable, IsCacheable, IsBufferable, SubRegionDisable, Size) \
+  ARM_MPU_RASR_EX(DisableExec, AccessPermission, ARM_MPU_ACCESS_(TypeExtField, IsShareable, IsCacheable, IsBufferable), SubRegionDisable, Size)
+
+/**
+* MPU Memory Access Attribute for strongly ordered memory.
+*  - TEX: 000b
+*  - Shareable
+*  - Non-cacheable
+*  - Non-bufferable
+*/ 
+#define ARM_MPU_ACCESS_ORDERED ARM_MPU_ACCESS_(0U, 1U, 0U, 0U)
+
+/**
+* MPU Memory Access Attribute for device memory.
+*  - TEX: 000b (if shareable) or 010b (if non-shareable)
+*  - Shareable or non-shareable
+*  - Non-cacheable
+*  - Bufferable (if shareable) or non-bufferable (if non-shareable)
+*
+* \param IsShareable Configures the device memory as shareable or non-shareable.
+*/ 
+#define ARM_MPU_ACCESS_DEVICE(IsShareable) ((IsShareable) ? ARM_MPU_ACCESS_(0U, 1U, 0U, 1U) : ARM_MPU_ACCESS_(2U, 0U, 0U, 0U))
+
+/**
+* MPU Memory Access Attribute for normal memory.
+*  - TEX: 1BBb (reflecting outer cacheability rules)
+*  - Shareable or non-shareable
+*  - Cacheable or non-cacheable (reflecting inner cacheability rules)
+*  - Bufferable or non-bufferable (reflecting inner cacheability rules)
+*
+* \param OuterCp Configures the outer cache policy.
+* \param InnerCp Configures the inner cache policy.
+* \param IsShareable Configures the memory as shareable or non-shareable.
+*/ 
+#define ARM_MPU_ACCESS_NORMAL(OuterCp, InnerCp, IsShareable) ARM_MPU_ACCESS_((4U | (OuterCp)), IsShareable, ((InnerCp) >> 1U), ((InnerCp) & 1U))
+
+/**
+* MPU Memory Access Attribute non-cacheable policy.
+*/
+#define ARM_MPU_CACHEP_NOCACHE 0U
+
+/**
+* MPU Memory Access Attribute write-back, write and read allocate policy.
+*/
+#define ARM_MPU_CACHEP_WB_WRA 1U
+
+/**
+* MPU Memory Access Attribute write-through, no write allocate policy.
+*/
+#define ARM_MPU_CACHEP_WT_NWA 2U
+
+/**
+* MPU Memory Access Attribute write-back, no write allocate policy.
+*/
+#define ARM_MPU_CACHEP_WB_NWA 3U
+
+
+/**
+* Struct for a single MPU Region: one RBAR/RASR value pair held as two 32-bit words.
+*/
+typedef struct {
+  uint32_t RBAR; //!< The region base address register value (RBAR)
+  uint32_t RASR; //!< The region attribute and size register value (RASR) \ref MPU_RASR
+} ARM_MPU_Region_t;
+    
+/** Enable the MPU; also sets SCB_SHCSR_MEMFAULTENA_Msk in SCB->SHCSR when that mask is defined.
+* \param MPU_Control Default access permissions for unconfigured regions (written to MPU->CTRL together with the enable bit).
+*/
+__STATIC_INLINE void ARM_MPU_Enable(uint32_t MPU_Control)
+{
+  __DMB();
+  MPU->CTRL = MPU_Control | MPU_CTRL_ENABLE_Msk; /* overwrites CTRL: caller's control bits plus the enable bit */
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;
+#endif
+  __DSB();
+  __ISB();
+}
+
+/** Disable the MPU; first clears SCB_SHCSR_MEMFAULTENA_Msk in SCB->SHCSR when that mask is defined.
+*/
+__STATIC_INLINE void ARM_MPU_Disable(void)
+{
+  __DMB();
+#ifdef SCB_SHCSR_MEMFAULTENA_Msk
+  SCB->SHCSR &= ~SCB_SHCSR_MEMFAULTENA_Msk;
+#endif
+  MPU->CTRL  &= ~MPU_CTRL_ENABLE_Msk; /* only the enable bit is cleared; other CTRL bits are kept */
+  __DSB();
+  __ISB();
+}
+
+/** Clear and disable the given MPU region.
+* \param rnr Region number to be cleared.
+*/
+__STATIC_INLINE void ARM_MPU_ClrRegion(uint32_t rnr)
+{
+  MPU->RNR = rnr;   /* write the region number first ...        */
+  MPU->RASR = 0U;   /* ... then zero the attribute/size register */
+}
+
+/** Configure an MPU region (MPU->RNR is not written by this function).
+* \param rbar Value for RBAR register.
+* \param rasr Value for RASR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegion(uint32_t rbar, uint32_t rasr)
+{
+  MPU->RBAR = rbar;
+  MPU->RASR = rasr;
+}
+
+/** Configure the given MPU region (writes RNR, then RBAR, then RASR).
+* \param rnr Region number to be configured.
+* \param rbar Value for RBAR register.
+* \param rasr Value for RASR register.
+*/
+__STATIC_INLINE void ARM_MPU_SetRegionEx(uint32_t rnr, uint32_t rbar, uint32_t rasr)
+{
+  MPU->RNR = rnr;
+  MPU->RBAR = rbar;
+  MPU->RASR = rasr;
+}
+
+/** Word-wise copy with strictly ordered memory access, e.g. for register targets.
+* \param dst Destination the words are written to, in ascending order.
+* \param src Source the words are read from.
+* \param len Number of 32-bit words to copy.
+*/
+__STATIC_INLINE void ARM_MPU_OrderedMemcpy(volatile uint32_t* dst, const uint32_t* __RESTRICT src, uint32_t len)
+{
+  const uint32_t* const end = src + len;   /* one past the last source word */
+  while (src != end)
+  {
+    *dst++ = *src++;
+  }
+}
+
+/** Load the given number of MPU regions from a table, at most MPU_TYPE_RALIASES entries per ordered copy.
+* \param table Pointer to the MPU configuration table.
+* \param cnt Amount of regions to be configured.
+*/
+__STATIC_INLINE void ARM_MPU_Load(ARM_MPU_Region_t const* table, uint32_t cnt) 
+{
+  const uint32_t rowWordSize = sizeof(ARM_MPU_Region_t)/4U; /* 32-bit words per table entry */
+  while (cnt > MPU_TYPE_RALIASES) {                          /* full batches of MPU_TYPE_RALIASES entries */
+    ARM_MPU_OrderedMemcpy(&(MPU->RBAR), &(table->RBAR), MPU_TYPE_RALIASES*rowWordSize);
+    table += MPU_TYPE_RALIASES;
+    cnt -= MPU_TYPE_RALIASES;
+  }
+  ARM_MPU_OrderedMemcpy(&(MPU->RBAR), &(table->RBAR), cnt*rowWordSize); /* remaining entries; every copy starts at MPU->RBAR */
+}
+
+#endif
diff --git a/common/mps2/startup_MPS2.S b/common/mps2/startup_MPS2.S
new file mode 100644
index 0000000..fd38c39
--- /dev/null
+++ b/common/mps2/startup_MPS2.S
@@ -0,0 +1,206 @@
+/*
+ * MPS2 CMSIS Library
+ */
+/*
+ * Copyright (c) 2009-2018 ARM Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * This file is derivative of CMSIS V5.00 startup_ARMCM3.S
+ */
+    .syntax unified
+    .arch armv7-m
+
+    .section .vector_table,"a",%progbits  /* allocatable data section; its placement is decided outside this file */
+    .align 2
+    .globl __isr_vector
+__isr_vector:
+    .long    __StackTop            /* Top of Stack */
+    .long    Reset_Handler         /* Reset Handler */
+    .long    NMI_Handler           /* NMI Handler */
+    .long    HardFault_Handler     /* Hard Fault Handler */
+    .long    MemManage_Handler     /* MPU Fault Handler */
+    .long    BusFault_Handler      /* Bus Fault Handler */
+    .long    UsageFault_Handler    /* Usage Fault Handler */
+    .long    0                     /* Reserved */
+    .long    0                     /* Reserved */
+    .long    0                     /* Reserved */
+    .long    0                     /* Reserved */
+    .long    SVC_Handler           /* SVCall Handler */
+    .long    DebugMon_Handler      /* Debug Monitor Handler */
+    .long    0                     /* Reserved */
+    .long    PendSV_Handler        /* PendSV Handler */
+    .long    SysTick_Handler       /* SysTick Handler */
+
+    /* External Interrupts */
+    .long     UARTRX0_Handler           /* UART 0 RX Handler                 */
+    .long     UARTTX0_Handler           /* UART 0 TX Handler                 */
+    .long     UARTRX1_Handler           /* UART 1 RX Handler                 */
+    .long     UARTTX1_Handler           /* UART 1 TX Handler                 */
+    .long     UARTRX2_Handler           /* UART 2 RX Handler                 */
+    .long     UARTTX2_Handler           /* UART 2 TX Handler                 */
+    .long     PORT0_COMB_Handler        /* GPIO Port 0 Combined Handler      */
+    .long     PORT1_COMB_Handler        /* GPIO Port 1 Combined Handler      */
+    .long     TIMER0_Handler            /* TIMER 0 handler                   */
+    .long     TIMER1_Handler            /* TIMER 1 handler                   */
+    .long     DUALTIMER_HANDLER         /* Dual timer handler                */
+    .long     SPI_Handler               /* SPI exceptions Handler            */
+    .long     UARTOVF_Handler           /* UART 0,1,2 Overflow Handler       */
+    .long     ETHERNET_Handler          /* Ethernet Overflow Handler         */
+    .long     I2S_Handler               /* I2S Handler                       */
+    .long     TSC_Handler               /* Touch Screen handler              */
+    .long     PORT2_COMB_Handler        /* GPIO Port 2 Combined Handler      */
+    .long     PORT3_COMB_Handler        /* GPIO Port 3 Combined Handler      */
+    .long     UARTRX3_Handler           /* UART 3 RX Handler                 */
+    .long     UARTTX3_Handler           /* UART 3 TX Handler                 */
+    .long     UARTRX4_Handler           /* UART 4 RX Handler                 */
+    .long     UARTTX4_Handler           /* UART 4 TX Handler                 */
+    .long     ADCSPI_Handler            /* SHIELD ADC SPI exceptions Handler */
+    .long     SHIELDSPI_Handler         /* SHIELD SPI exceptions Handler     */
+    .long     PORT0_0_Handler           /* GPIO Port 0 pin 0 Handler         */
+    .long     PORT0_1_Handler           /* GPIO Port 0 pin 1 Handler         */
+    .long     PORT0_2_Handler           /* GPIO Port 0 pin 2 Handler         */
+    .long     PORT0_3_Handler           /* GPIO Port 0 pin 3 Handler         */
+    .long     PORT0_4_Handler           /* GPIO Port 0 pin 4 Handler         */
+    .long     PORT0_5_Handler           /* GPIO Port 0 pin 5 Handler         */
+    .long     PORT0_6_Handler           /* GPIO Port 0 pin 6 Handler         */
+    .long     PORT0_7_Handler           /* GPIO Port 0 pin 7 Handler         */
+
+    .size    __isr_vector, . - __isr_vector  /* table = 16 core entries + 32 external interrupt entries */
+
+    .section .text.Reset_Handler
+    .thumb
+    .thumb_func
+    .align  2
+    .globl   Reset_Handler
+    .type    Reset_Handler, %function
+Reset_Handler:
+/*
+ * Loop to copy data from read only memory to RAM. The ranges
+ * of copy from/to are specified by following symbols evaluated in
+ * linker script.
+ * __etext: End of code section, i.e., begin of data sections to copy from.
+ * __data_start__/__data_end__: RAM address range that data should be
+ * copied to. Both must be aligned to 4 bytes boundary.
+ */
+#if !defined(DATA_IN_FLASH)
+    ldr    r1, =__etext                /* r1 = copy source */
+    ldr    r2, =__data_start__         /* r2 = copy destination */
+    ldr    r3, =__data_end__
+
+    subs   r3, r2                      /* r3 = __data_end__ - __data_start__ (bytes to copy) */
+    ble    .Lflash_to_ram_loop_end     /* skip the copy when the size is <= 0 */
+
+    movs    r4, 0                      /* r4 = byte offset used for both source and destination */
+.Lflash_to_ram_loop:
+    ldr    r0, [r1,r4]                 /* one 32-bit word per iteration */
+    str    r0, [r2,r4]
+    adds   r4, 4
+    cmp    r4, r3
+    blt    .Lflash_to_ram_loop         /* repeat while offset < size */
+.Lflash_to_ram_loop_end:
+#endif /* DATA_IN_FLASH */
+
+/* Initialize .bss */
+init_bss:
+    ldr   r1, =__bss_start__           /* r1 = write pointer */
+    ldr   r2, =__bss_end__             /* loaded but not read anywhere below */
+    ldr   r3, =bss_size                /* r3 = byte count; bss_size is not defined in this file (presumably the linker script) */
+
+    cmp   r3, #0
+    beq   system_startup               /* nothing to clear */
+
+    mov   r4, #0
+zero:
+    strb  r4, [r1], #1                 /* clear one byte, post-increment the pointer */
+    subs  r3, r3, #1
+    bne   zero
+
+system_startup:
+    ldr    r0, =SystemInit
+    blx    r0                          /* call SystemInit and return here */
+    ldr    r0, =_start
+    bx    r0                           /* plain branch to _start (bx, not blx) */
+    .pool
+    .size Reset_Handler, . - Reset_Handler
+
+    .text
+/*
+ * Macro to define default handlers. Default handler
+ * will be weak symbol and just dead loops. They can be
+ * overwritten by other handlers
+ */
+    .macro    def_default_handler    handler_name
+    .align 1
+    .thumb_func
+    .weak    \handler_name                  /* weak: a non-weak definition elsewhere takes precedence */
+    .type    \handler_name, %function
+\handler_name :
+    b    .                                  /* branch to self: spin forever */
+    .size    \handler_name, . - \handler_name
+    .endm
+
+    def_default_handler    NMI_Handler
+    def_default_handler    HardFault_Handler
+    def_default_handler    MemManage_Handler
+    def_default_handler    BusFault_Handler
+    def_default_handler    UsageFault_Handler
+    def_default_handler    SVC_Handler
+    def_default_handler    DebugMon_Handler
+    def_default_handler    PendSV_Handler
+    def_default_handler    SysTick_Handler
+    def_default_handler    Default_Handler  /* shared target for the external-interrupt aliases defined below */
+
+    .macro    def_irq_default_handler    handler_name
+    .weak     \handler_name                    /* weak so another definition of the same name can replace it */
+    .set      \handler_name, Default_Handler   /* alias the name to Default_Handler (no separate code is emitted) */
+    .endm
+
+    /* External interrupts */
+    def_irq_default_handler     UARTRX0_Handler           /* 0:  UART 0 RX Handler                 */
+    def_irq_default_handler     UARTTX0_Handler           /* 1:  UART 0 TX Handler                 */
+    def_irq_default_handler     UARTRX1_Handler           /* 2:  UART 1 RX Handler                 */
+    def_irq_default_handler     UARTTX1_Handler           /* 3:  UART 1 TX Handler                 */
+    def_irq_default_handler     UARTRX2_Handler           /* 4:  UART 2 RX Handler                 */
+    def_irq_default_handler     UARTTX2_Handler           /* 5:  UART 2 TX Handler                 */
+    def_irq_default_handler     PORT0_COMB_Handler        /* 6:  GPIO Port 0 Combined Handler      */
+    def_irq_default_handler     PORT1_COMB_Handler        /* 7:  GPIO Port 1 Combined Handler      */
+    def_irq_default_handler     TIMER0_Handler            /* 8:  TIMER 0 handler                   */
+    def_irq_default_handler     TIMER1_Handler            /* 9:  TIMER 1 handler                   */
+    def_irq_default_handler     DUALTIMER_HANDLER         /* 10: Dual timer handler                */
+    def_irq_default_handler     SPI_Handler               /* 11: SPI exceptions Handler            */
+    def_irq_default_handler     UARTOVF_Handler           /* 12: UART 0,1,2 Overflow Handler       */
+    def_irq_default_handler     ETHERNET_Handler          /* 13: Ethernet Overflow Handler         */
+    def_irq_default_handler     I2S_Handler               /* 14: I2S Handler                       */
+    def_irq_default_handler     TSC_Handler               /* 15: Touch Screen handler              */
+    def_irq_default_handler     PORT2_COMB_Handler        /* 16: GPIO Port 2 Combined Handler      */
+    def_irq_default_handler     PORT3_COMB_Handler        /* 17: GPIO Port 3 Combined Handler      */
+    def_irq_default_handler     UARTRX3_Handler           /* 18: UART 3 RX Handler                 */
+    def_irq_default_handler     UARTTX3_Handler           /* 19: UART 3 TX Handler                 */
+    def_irq_default_handler     UARTRX4_Handler           /* 20: UART 4 RX Handler                 */
+    def_irq_default_handler     UARTTX4_Handler           /* 21: UART 4 TX Handler                 */
+    def_irq_default_handler     ADCSPI_Handler            /* 22: SHIELD ADC SPI exceptions Handler */
+    def_irq_default_handler     SHIELDSPI_Handler         /* 23: SHIELD SPI exceptions Handler     */
+    def_irq_default_handler     PORT0_0_Handler           /* 24: GPIO Port 0 pin 0 Handler         */
+    def_irq_default_handler     PORT0_1_Handler           /* 25: GPIO Port 0 pin 1 Handler         */
+    def_irq_default_handler     PORT0_2_Handler           /* 26: GPIO Port 0 pin 2 Handler         */
+    def_irq_default_handler     PORT0_3_Handler           /* 27: GPIO Port 0 pin 3 Handler         */
+    def_irq_default_handler     PORT0_4_Handler           /* 28: GPIO Port 0 pin 4 Handler         */
+    def_irq_default_handler     PORT0_5_Handler           /* 29: GPIO Port 0 pin 5 Handler         */
+    def_irq_default_handler     PORT0_6_Handler           /* 30: GPIO Port 0 pin 6 Handler         */
+    def_irq_default_handler     PORT0_7_Handler           /* 31: GPIO Port 0 pin 7 Handler         */
+
+    .end
diff --git a/common/randombytes.c b/common/randombytes.c
new file mode 100644
index 0000000..b27d4f0
--- /dev/null
+++ b/common/randombytes.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+#include "randombytes.h"
+
+#if defined(STM32F7)
+
+#include <libopencm3/stm32/rng.h>
+
+// TODO Maybe we do not want to use the hardware RNG for all randomness, but
+// instead only read a seed and then expand that using fips202.
+
+int randombytes(uint8_t *obuf, size_t len) {  // fill obuf[0..len) from rng_get_random_blocking(); always returns 0
+  union {
+    unsigned char aschar[4];
+    uint32_t asint;
+  } random;  // one 32-bit RNG word, also viewed as its four bytes
+
+  while (len > 4) {  // whole 4-byte chunks; a final chunk of exactly 4 is left for the tail below
+    random.asint = rng_get_random_blocking();
+    *obuf++ = random.aschar[0];
+    *obuf++ = random.aschar[1];
+    *obuf++ = random.aschar[2];
+    *obuf++ = random.aschar[3];
+    len -= 4;
+  }
+  if (len > 0) {
+    for (random.asint = rng_get_random_blocking(); len > 0; --len) {  // one more RNG word covers the last 1..4 bytes
+      *obuf++ = random.aschar[len - 1];  // tail bytes are taken from index len-1 down to 0
+    }
+  }
+
+  return 0;
+}
+
+#else /* NONRANDOM FALLBACK IMPLEMENTATION */
+#warning Using a non-random randombytes
+
+#include <string.h>
+
+static uint32_t seed[32] = {3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3,
+                            2, 3, 8, 4, 6, 2, 6, 4, 3, 3, 8, 3, 2, 7, 9, 5};  // fixed constants, never modified in this file
+static uint32_t in[12];  // generator input; words 0..3 are incremented as a counter by randombytes_regen
+static uint8_t out_buf[sizeof(uint32_t) * 16];  // 64 bytes: room for two 8-word surf() outputs
+static int32_t outleft = 0;  // number of bytes in out_buf not yet handed out
+
+#define ROTATE(x, b) (((x) << (b)) | ((x) >> (32 - (b))))  // rotate left by b; written for 32-bit x and b in 1..31
+#define MUSH(i, b) x = t[i] += (((x ^ seed[i]) + sum) ^ ROTATE(x, b));  // one mixing step; relies on x, t and sum in the expanding scope
+
+static void surf(uint32_t out[8]) {  // compute 8 output words from the static in[] and seed[] arrays
+  uint32_t t[12];
+  uint32_t x;
+  uint32_t sum = 0;
+  int32_t r;
+  int32_t i;
+  int32_t loop;
+
+  for (i = 0; i < 12; ++i) {
+    t[i] = in[i] ^ seed[12 + i];  // working state starts as in[] XOR seed[12..23]
+  }
+  for (i = 0; i < 8; ++i) {
+    out[i] = seed[24 + i];  // output starts as seed[24..31]
+  }
+  x = t[11];
+  for (loop = 0; loop < 2; ++loop) {  // two passes of 16 rounds each
+    for (r = 0; r < 16; ++r) {
+      sum += 0x9e3779b9;  // per-round additive constant; unsigned arithmetic wraps mod 2^32
+      MUSH(0, 5)
+      MUSH(1, 7)
+      MUSH(2, 9)
+      MUSH(3, 13)
+      MUSH(4, 5)
+      MUSH(5, 7)
+      MUSH(6, 9)
+      MUSH(7, 13)
+      MUSH(8, 5)
+      MUSH(9, 7)
+      MUSH(10, 9)
+      MUSH(11, 13)
+    }
+    for (i = 0; i < 8; ++i) {
+      out[i] ^= t[i + 4];  // fold t[4..11] into the output after each pass
+    }
+  }
+}
+
+void randombytes_regen(void);  // prototype for the non-static definition that follows
+void randombytes_regen(void) {  // refill out_buf with two consecutive surf() outputs
+  uint32_t out[8];
+  if (!++in[0]) {  // increment in[0..3] as one multi-word counter: carry when a word wraps to 0
+    if (!++in[1]) {
+      if (!++in[2]) {
+        ++in[3];
+      }
+    }
+  }
+  surf(out);
+  memcpy(out_buf, out, sizeof(out));  // first 32 bytes of out_buf
+  if (!++in[0]) {  // advance the counter again so the second block differs
+    if (!++in[1]) {
+      if (!++in[2]) {
+        ++in[3];
+      }
+    }
+  }
+  surf(out);
+  memcpy(out_buf + sizeof(out), out, sizeof(out));  // second 32 bytes of out_buf
+  outleft = sizeof(out_buf);  // the whole buffer is available again
+}
+
+int randombytes(uint8_t *buf, size_t xlen) {  // deterministic fallback: fill buf[0..xlen); always returns 0
+  while (xlen > 0) {
+    if (!outleft) {
+      randombytes_regen();  // buffer exhausted: generate the next sizeof(out_buf) bytes
+    }
+    *buf = out_buf[--outleft];  // bytes are handed out from the end of out_buf towards index 0
+    ++buf;
+    --xlen;
+  }
+  return 0;
+}
+
+#endif
diff --git a/common/test.c b/common/test.c
new file mode 100644
index 0000000..1a24c27
--- /dev/null
+++ b/common/test.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+#include <hal.h>
+#include <randombytes.h>
+#include <sendfn.h>
+#include <stdint.h>
+
+#if defined(SRAM_TIMING_TEST)
+#define TEST_BLOCK_SIZE 4096
+
+/* Don't use opencm3 here, since not all platforms might use opencm3, but all
+   have these DWT registers */
+#define DWT_CTRL           (*(volatile uint32_t*)(0xE0001000u + 0x00))
+#define DWT_CYCCNT         (*(volatile uint32_t*)(0xE0001000u + 0x04))
+#define DWT_CTRL_CYCCNTENA (1 << 0)
+#define SCS_DEMCR          (*(volatile uint32_t*)(0xE000E000u + 0xDFC))
+#define SCS_DEMCR_TRCENA   (1 << 24)
+/* Need a really precise cycle counter. */
+static void cyccnt_enable()  /* one-time setup of the DWT cycle counter */
+{
+	SCS_DEMCR |= SCS_DEMCR_TRCENA;  /* set TRCENA (bit 24) in DEMCR; presumably required before DWT can be used */
+	DWT_CYCCNT = 0;  /* start counting from zero */
+	DWT_CTRL |= DWT_CTRL_CYCCNTENA;  /* set CYCCNTENA (bit 0) to start the counter */
+}
+static inline void cyccnt_start()  /* begin a measurement by resetting the counter */
+{
+	DWT_CYCCNT = 0;  /* the counter is left enabled; only its value is cleared */
+}
+static inline uint32_t cyccnt_get()  /* cycles counted since the last cyccnt_start() */
+{
+  return DWT_CYCCNT;  /* volatile read of the live counter value */
+}
+
+__attribute__((noinline))
+static uint32_t test_load(volatile unsigned* ram_block)  /* cycle count for loading TEST_BLOCK_SIZE bytes, one word at a time */
+{
+  asm volatile("cpsid if");  /* mask interrupts and faults for the duration of the measurement */
+  cyccnt_start();
+#define NL "\n\t"
+  asm volatile("_MEMLOOP%=:" NL
+               "ldr r12, [%0], #4" NL  /* load a word into scratch r12, post-increment the pointer by 4 */
+               "cmp %0, %1" NL
+               "bne _MEMLOOP%=" NL  /* loop until the pointer equals the end address */
+               :"+r" (ram_block): "r" (ram_block + (TEST_BLOCK_SIZE / sizeof(unsigned))): "r12", "cc");
+  uint32_t result = cyccnt_get();  /* read the counter before interrupts are re-enabled */
+  asm volatile("cpsie if");
+  return result;
+}
+
+__attribute__((noinline))
+static uint32_t test_unalignedload(volatile void* ram_block)  /* like test_load, but every word load starts 2 bytes past the block's word boundaries */
+{
+  volatile unsigned char* ram_block8 = ram_block;
+  ram_block8 += 2;  /* shift the start by 2 bytes */
+  asm volatile("cpsid if");  /* mask interrupts and faults for the duration of the measurement */
+  cyccnt_start();
+#define NL "\n\t"
+  asm volatile("_MEMLOOP%=:" NL
+               "ldr r12, [%0], #4" NL
+               "cmp %0, %1" NL
+               "blt _MEMLOOP%=" NL  /* signed less-than compare on the addresses */
+               :"+r" (ram_block8): "r" (ram_block8 + TEST_BLOCK_SIZE): "r12", "cc");  /* last load ends at offset TEST_BLOCK_SIZE + 2 of ram_block */
+  uint32_t result = cyccnt_get();
+  asm volatile("cpsie if");
+  return result;
+}
+
+__attribute__((noinline))
+static uint32_t test_store(volatile unsigned* ram_block)  /* cycle count for storing TEST_BLOCK_SIZE bytes, one word at a time */
+{
+  cyccnt_start();  /* NOTE(review): unlike test_load, interrupts are not masked here -- confirm this is intended */
+#define NL "\n\t"
+  asm volatile("_MEMLOOP%=:" NL
+               "str r12, [%0], #4" NL  /* r12 is never initialised: whatever value it holds is stored */
+               "cmp %0, %1" NL
+               "bne _MEMLOOP%=" NL
+               :"+r" (ram_block): "r" (ram_block + (TEST_BLOCK_SIZE / sizeof(unsigned))): "r12", "cc");
+  return cyccnt_get();
+}
+
+__attribute__((noinline))
+static uint32_t test_loadstore(volatile unsigned* ram_block)  /* cycle count for a store followed by a load of each word in the block */
+{
+  cyccnt_start();  /* NOTE(review): unlike test_load, interrupts are not masked here -- confirm this is intended */
+#define NL "\n\t"
+  asm volatile("_MEMLOOP%=:" NL
+               "str r12, [%0]" NL  /* store to the current word without advancing */
+               "add r12, r12, #1" NL  /* result is overwritten by the load on the next line */
+               "ldr r12, [%0], #4" NL  /* load the same word back, then advance by 4 */
+               "cmp %0, %1" NL
+               "bne _MEMLOOP%=" NL
+               :"+r" (ram_block): "r" (ram_block + (TEST_BLOCK_SIZE / sizeof(unsigned))): "r12", "cc");
+  return cyccnt_get();
+}
+
+static void memory_timing_test(void)  /* run the four access tests on each available RAM block and report the cycle counts */
+{
+  cyccnt_enable();  /* the cycle counter must be running before any test_* call */
+#define RAMBLK(BLK)                                                       \
+  static volatile unsigned ram ## BLK ## _block[TEST_BLOCK_SIZE / sizeof(unsigned) + 1] __attribute__((section(".ram" #BLK)))
+
+#define TEST(BLK) \
+  test_load(ram ## BLK ## _block); \
+  test_unalignedload(ram ## BLK ## _block);  \
+  test_store(ram ## BLK ## _block); \
+  test_loadstore(ram ## BLK ## _block); \
+  send_unsigned("ram" #BLK " load", test_load(ram ## BLK ## _block)); \
+  send_unsigned("ram" #BLK " unalignedload", test_unalignedload(ram ## BLK ## _block));   \
+  send_unsigned("ram" #BLK " store", test_store(ram ## BLK ## _block)); \
+  send_unsigned("ram" #BLK " loadstore", test_loadstore(ram ## BLK ## _block));  /* first four calls discard their result (presumably a warm-up); the next four are reported */
+
+  static volatile unsigned ram1_block[TEST_BLOCK_SIZE / sizeof(unsigned) + 1];  /* +1 word: test_unalignedload reads 2 bytes beyond TEST_BLOCK_SIZE */
+  TEST(1);  /* ram1_block carries no section attribute, unlike the blocks below */
+#if defined(HAS_SRAM2)
+  RAMBLK(2);  /* block placed in section ".ram2" */
+  TEST(2);
+#endif
+#if defined(HAS_SRAM3)
+  RAMBLK(3);  /* block placed in section ".ram3" */
+  TEST(3);
+#endif
+#if defined(HAS_CCM)
+  static volatile unsigned ramccm_block[TEST_BLOCK_SIZE / sizeof(unsigned) + 1] __attribute__((section(".ccmram")));  /* declared by hand: the section is ".ccmram", not ".ramccm" as RAMBLK would produce */
+  TEST(ccm);
+#endif
+}
+#endif
+
+#ifndef CLOCK_TEST
+#define CLOCK_TEST CLOCK_BENCHMARK
+#endif
+
+void stacktest(size_t size)  /* allocate and zero a local array of `size` 32-bit words */
+{
+  volatile uint32_t mem[size] __attribute__((unused));  /* variable-length array; volatile so each store below is a real access */
+  for (unsigned i = 0; i < size; ++i) {
+    mem[i] = 0;  /* write every element of the array */
+  }
+}
+
+int main(void)  /* smoke test: greeting, stack size, one random word, two stack-usage measurements, optional SRAM timing */
+{
+  hal_setup(CLOCK_TEST);  /* CLOCK_TEST defaults to CLOCK_BENCHMARK unless defined by the build */
+  hal_send_str("Hello world");
+  send_unsigned("Stack Size", hal_get_stack_size());
+  unsigned rnd;
+  randombytes((unsigned char*) &rnd, sizeof(unsigned));  /* fill rnd byte-wise from randombytes() */
+  send_unsigned("Random number", rnd);
+  size_t stack;
+  hal_spraystack();  /* spray, run the workload, then check: each measurement follows this pattern */
+  stacktest(100);  /* 100-word local array */
+  stack = hal_checkstack();
+  send_unsigned("stackusage1", stack);
+  hal_spraystack();
+  stacktest(200);  /* 200-word local array */
+  stack = hal_checkstack();
+  send_unsigned("stackusage2", stack);
+#if defined(SRAM_TIMING_TEST)
+  memory_timing_test();  /* only compiled in when SRAM_TIMING_TEST is defined */
+#endif
+  return 0;
+}
diff --git a/common/testfast.c b/common/testfast.c
new file mode 120000
index 0000000..aeebb26
--- /dev/null
+++ b/common/testfast.c
@@ -0,0 +1 @@
+test.c
\ No newline at end of file
diff --git a/convert_benchmarks.py b/convert_benchmarks.py
new file mode 100755
index 0000000..6d9667f
--- /dev/null
+++ b/convert_benchmarks.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+import sys
+from mupq import mupq
+    
+def usage():  # print the command-line synopsis and exit with status 1
+    print("Usage: python3 convert_benchmarks.py csv|md")
+    sys.exit(1)  # raises SystemExit, so callers never continue past usage()
+ 
+if __name__ == "__main__":
+    if len(sys.argv) != 2:  # exactly one argument, the output format, is required
+        usage()
+    if sys.argv[1] == "csv":  # choose the mupq converter class for the requested format
+        converter = mupq.CsvConverter()
+    elif sys.argv[1] == "md":
+        converter = mupq.MarkdownConverter()
+    else:
+        usage()  # unknown format; usage() exits, so converter is always bound below
+    converter.convert()
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/api.h b/crypto_kem/ml-kem-1024/m4fspeed/api.h
new file mode 100644
index 0000000..92ea9be
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/api.h
@@ -0,0 +1,20 @@
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+
+#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES  /* the four CRYPTO_* sizes alias the KYBER_* values from params.h */
+#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
+#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
+#define CRYPTO_BYTES           KYBER_SSBYTES  /* shared-secret length */
+
+#define CRYPTO_ALGNAME "Kyber1024"
+
+int crypto_kem_keypair(unsigned char *pk, unsigned char *sk);  /* both buffers are outputs; presumably CRYPTO_PUBLICKEYBYTES / CRYPTO_SECRETKEYBYTES long -- confirm in kem.c */
+
+int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);  /* ct and ss are outputs, pk is read-only input */
+
+int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);  /* ss is the output, ct and sk are read-only inputs */
+
+
+#endif
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/cbd.c b/crypto_kem/ml-kem-1024/m4fspeed/cbd.c
new file mode 120000
index 0000000..801f7f8
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/cbd.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/cbd.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/cbd.h b/crypto_kem/ml-kem-1024/m4fspeed/cbd.h
new file mode 120000
index 0000000..4f9e3af
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/cbd.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/cbd.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S b/crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S
new file mode 120000
index 0000000..bdef6f4
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/cmov_int16.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/cmov_int16.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S b/crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S
new file mode 120000
index 0000000..aa55564
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/fastaddsub.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastaddsub.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S b/crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S
new file mode 120000
index 0000000..4384e1d
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/fastbasemul.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastbasemul.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S b/crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S
new file mode 120000
index 0000000..ede60d7
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/fastinvntt.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastinvntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/fastntt.S b/crypto_kem/ml-kem-1024/m4fspeed/fastntt.S
new file mode 120000
index 0000000..d34524f
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/fastntt.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/indcpa.c b/crypto_kem/ml-kem-1024/m4fspeed/indcpa.c
new file mode 120000
index 0000000..25db6b1
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/indcpa.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/indcpa.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/indcpa.h b/crypto_kem/ml-kem-1024/m4fspeed/indcpa.h
new file mode 120000
index 0000000..e6f3662
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/indcpa.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/indcpa.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/kem.c b/crypto_kem/ml-kem-1024/m4fspeed/kem.c
new file mode 120000
index 0000000..489b6f9
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/kem.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/kem.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/macros.i b/crypto_kem/ml-kem-1024/m4fspeed/macros.i
new file mode 120000
index 0000000..a7d8e74
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/macros.i
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/macros.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/matacc.c b/crypto_kem/ml-kem-1024/m4fspeed/matacc.c
new file mode 120000
index 0000000..71d7234
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/matacc.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/matacc.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/matacc.h b/crypto_kem/ml-kem-1024/m4fspeed/matacc.h
new file mode 120000
index 0000000..19b6772
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/matacc.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/matacc.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/matacc.i b/crypto_kem/ml-kem-1024/m4fspeed/matacc.i
new file mode 120000
index 0000000..39b6e23
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/matacc.i
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/matacc.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S b/crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S
new file mode 120000
index 0000000..3c7d05e
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/matacc_asm.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/matacc_asm.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/ntt.c b/crypto_kem/ml-kem-1024/m4fspeed/ntt.c
new file mode 120000
index 0000000..971c6b0
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/ntt.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/ntt.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/ntt.h b/crypto_kem/ml-kem-1024/m4fspeed/ntt.h
new file mode 120000
index 0000000..11e111d
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/ntt.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/ntt.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/params.h b/crypto_kem/ml-kem-1024/m4fspeed/params.h
new file mode 100644
index 0000000..a3153e7
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/params.h
@@ -0,0 +1,31 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define KYBER_K 4 /* Change this for different security strengths */
+
+/* Don't change parameters below this line */
+
+#define KYBER_N 256
+#define KYBER_Q 3329
+
+#define KYBER_ETA 2
+
+#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
+#define KYBER_SSBYTES  32   /* size in bytes of shared key */
+
+#define KYBER_POLYBYTES              384  /* = KYBER_N * 12 / 8; presumably 12 bits per coefficient */
+#define KYBER_POLYVECBYTES           (KYBER_K * KYBER_POLYBYTES)
+
+#define KYBER_POLYCOMPRESSEDBYTES    160  /* = KYBER_N * 5 / 8; presumably 5 bits per coefficient */
+#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)  /* 352 = KYBER_N * 11 / 8; presumably 11 bits per coefficient */
+
+#define KYBER_INDCPA_MSGBYTES       KYBER_SYMBYTES
+#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
+#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
+#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
+
+#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
+#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES +  KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 2*KYBER_SYMBYTES (64) extra bytes: H(pk) plus, presumably, a second 32-byte value -- confirm in kem.c */
+#define KYBER_CIPHERTEXTBYTES  KYBER_INDCPA_BYTES
+
+#endif
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/poly.c b/crypto_kem/ml-kem-1024/m4fspeed/poly.c
new file mode 120000
index 0000000..b432b8a
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/poly.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/poly.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/poly.h b/crypto_kem/ml-kem-1024/m4fspeed/poly.h
new file mode 120000
index 0000000..6003dc3
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/poly.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/poly.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S b/crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S
new file mode 120000
index 0000000..c4bda05
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/poly_asm.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/poly_asm.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/polyvec.c b/crypto_kem/ml-kem-1024/m4fspeed/polyvec.c
new file mode 120000
index 0000000..c3f7d0a
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/polyvec.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/polyvec.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/polyvec.h b/crypto_kem/ml-kem-1024/m4fspeed/polyvec.h
new file mode 120000
index 0000000..47cf6c3
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/polyvec.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/polyvec.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/reduce.S b/crypto_kem/ml-kem-1024/m4fspeed/reduce.S
new file mode 120000
index 0000000..2edf10c
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/reduce.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/reduce.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/symmetric-fips202.c b/crypto_kem/ml-kem-1024/m4fspeed/symmetric-fips202.c
new file mode 120000
index 0000000..5adc9ae
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/symmetric-fips202.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/symmetric-fips202.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/symmetric.h b/crypto_kem/ml-kem-1024/m4fspeed/symmetric.h
new file mode 120000
index 0000000..698a10d
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/symmetric.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/symmetric.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/verify.c b/crypto_kem/ml-kem-1024/m4fspeed/verify.c
new file mode 120000
index 0000000..85d7f50
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/verify.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/verify.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fspeed/verify.h b/crypto_kem/ml-kem-1024/m4fspeed/verify.h
new file mode 120000
index 0000000..e19a301
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fspeed/verify.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/verify.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/api.h b/crypto_kem/ml-kem-1024/m4fstack/api.h
new file mode 120000
index 0000000..cf75db9
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/api.h
@@ -0,0 +1 @@
+../m4fspeed/api.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/cbd.c b/crypto_kem/ml-kem-1024/m4fstack/cbd.c
new file mode 120000
index 0000000..903fa59
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/cbd.c
@@ -0,0 +1 @@
+../m4fspeed/cbd.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/cbd.h b/crypto_kem/ml-kem-1024/m4fstack/cbd.h
new file mode 120000
index 0000000..d264c36
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/cbd.h
@@ -0,0 +1 @@
+../m4fspeed/cbd.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S b/crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S
new file mode 120000
index 0000000..bdef6f4
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/cmov_int16.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/cmov_int16.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S b/crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S
new file mode 120000
index 0000000..d1317f7
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/fastaddsub.S
@@ -0,0 +1 @@
+../m4fspeed/fastaddsub.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S b/crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S
new file mode 120000
index 0000000..531385d
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/fastbasemul.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/fastbasemul.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S b/crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S
new file mode 120000
index 0000000..ede60d7
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/fastinvntt.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastinvntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/fastntt.S b/crypto_kem/ml-kem-1024/m4fstack/fastntt.S
new file mode 120000
index 0000000..d34524f
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/fastntt.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/indcpa.c b/crypto_kem/ml-kem-1024/m4fstack/indcpa.c
new file mode 120000
index 0000000..a4103b1
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/indcpa.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/indcpa.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/indcpa.h b/crypto_kem/ml-kem-1024/m4fstack/indcpa.h
new file mode 120000
index 0000000..9e56c80
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/indcpa.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/indcpa.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/kem.c b/crypto_kem/ml-kem-1024/m4fstack/kem.c
new file mode 120000
index 0000000..302153d
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/kem.c
@@ -0,0 +1 @@
+../m4fspeed/kem.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/macros.i b/crypto_kem/ml-kem-1024/m4fstack/macros.i
new file mode 120000
index 0000000..6e83891
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/macros.i
@@ -0,0 +1 @@
+../m4fspeed/macros.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/matacc.c b/crypto_kem/ml-kem-1024/m4fstack/matacc.c
new file mode 120000
index 0000000..5558ec8
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/matacc.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/matacc.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/matacc.h b/crypto_kem/ml-kem-1024/m4fstack/matacc.h
new file mode 120000
index 0000000..4eb7706
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/matacc.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/matacc.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/matacc.i b/crypto_kem/ml-kem-1024/m4fstack/matacc.i
new file mode 120000
index 0000000..0d39b07
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/matacc.i
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/matacc.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S b/crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S
new file mode 120000
index 0000000..0079bb5
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/matacc_asm.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/matacc_asm.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/ntt.c b/crypto_kem/ml-kem-1024/m4fstack/ntt.c
new file mode 120000
index 0000000..c9d6e8a
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/ntt.c
@@ -0,0 +1 @@
+../m4fspeed/ntt.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/ntt.h b/crypto_kem/ml-kem-1024/m4fstack/ntt.h
new file mode 120000
index 0000000..5fd83c0
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/ntt.h
@@ -0,0 +1 @@
+../m4fspeed/ntt.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/params.h b/crypto_kem/ml-kem-1024/m4fstack/params.h
new file mode 120000
index 0000000..59dd7f1
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/params.h
@@ -0,0 +1 @@
+../m4fspeed/params.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/poly.c b/crypto_kem/ml-kem-1024/m4fstack/poly.c
new file mode 120000
index 0000000..df6f119
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/poly.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/poly.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/poly.h b/crypto_kem/ml-kem-1024/m4fstack/poly.h
new file mode 120000
index 0000000..ad89400
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/poly.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/poly.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/poly_asm.S b/crypto_kem/ml-kem-1024/m4fstack/poly_asm.S
new file mode 120000
index 0000000..167ee5e
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/poly_asm.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/poly_asm.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/polyvec.c b/crypto_kem/ml-kem-1024/m4fstack/polyvec.c
new file mode 120000
index 0000000..f398d76
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/polyvec.c
@@ -0,0 +1 @@
+../m4fspeed/polyvec.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/polyvec.h b/crypto_kem/ml-kem-1024/m4fstack/polyvec.h
new file mode 120000
index 0000000..3113837
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/polyvec.h
@@ -0,0 +1 @@
+../m4fspeed/polyvec.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/reduce.S b/crypto_kem/ml-kem-1024/m4fstack/reduce.S
new file mode 120000
index 0000000..29ae453
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/reduce.S
@@ -0,0 +1 @@
+../m4fspeed/reduce.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/symmetric-fips202.c b/crypto_kem/ml-kem-1024/m4fstack/symmetric-fips202.c
new file mode 120000
index 0000000..5adc9ae
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/symmetric-fips202.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/symmetric-fips202.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/symmetric.h b/crypto_kem/ml-kem-1024/m4fstack/symmetric.h
new file mode 120000
index 0000000..28c6fac
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/symmetric.h
@@ -0,0 +1 @@
+../m4fspeed/symmetric.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/verify.c b/crypto_kem/ml-kem-1024/m4fstack/verify.c
new file mode 120000
index 0000000..a7a9856
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/verify.c
@@ -0,0 +1 @@
+../m4fspeed/verify.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-1024/m4fstack/verify.h b/crypto_kem/ml-kem-1024/m4fstack/verify.h
new file mode 120000
index 0000000..cb2da4b
--- /dev/null
+++ b/crypto_kem/ml-kem-1024/m4fstack/verify.h
@@ -0,0 +1 @@
+../m4fspeed/verify.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/api.h b/crypto_kem/ml-kem-512/m4fspeed/api.h
new file mode 100644
index 0000000..3b9244a
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/api.h
@@ -0,0 +1,20 @@
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+/* Byte lengths of the KEM objects, forwarded from the KYBER_* values in params.h. */
+#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
+#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
+#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
+#define CRYPTO_BYTES           KYBER_SSBYTES   /* shared-secret length */
+
+#define CRYPTO_ALGNAME "Kyber512"
+/* KEM entry points; only declared here -- presumably defined in kem.c (not verified from this header). */
+int crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
+/* ct and ss are writable, pk is read-only. */
+int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);
+/* ss is writable, ct and sk are read-only. */
+int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
+
+
+#endif
diff --git a/crypto_kem/ml-kem-512/m4fspeed/cbd.c b/crypto_kem/ml-kem-512/m4fspeed/cbd.c
new file mode 100644
index 0000000..f8911fc
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/cbd.c
@@ -0,0 +1,112 @@
+#include "cbd.h"
+#include "params.h"
+
+#include <stdint.h>
+
+/*************************************************
+* Name:        load32_littleendian
+*
+* Description: load bytes into a 32-bit integer
+*              in little-endian order
+*
+* Arguments:   - const unsigned char *x: pointer to input byte array
+*
+* Returns 32-bit unsigned integer loaded from x
+**************************************************/
+static uint32_t load32_littleendian(const unsigned char *x) {
+    /* x[0] is the least significant byte, x[3] the most significant one. */
+    const uint32_t b0 = x[0];
+    const uint32_t b1 = x[1];
+    const uint32_t b2 = x[2];
+    const uint32_t b3 = x[3];
+    return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+}
+
+/*************************************************
+* Name:        load24_littleendian
+*
+* Description: load 3 bytes into a 32-bit integer
+*              in little-endian order
+*              This function is only needed for Kyber-512
+*
+* Arguments:   - const uint8_t *x: pointer to input byte array
+*
+* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
+**************************************************/
+static uint32_t load24_littleendian(const uint8_t x[3])
+{
+  /* Bits 24..31 of the result stay zero: only three bytes are read. */
+  const uint32_t lo  = x[0];
+  const uint32_t mid = x[1];
+  const uint32_t hi  = x[2];
+  return lo | (mid << 8) | (hi << 16);
+}
+
+
+
+/*************************************************
+* Name:        cbd_eta1
+*
+* Description: Given an array of uniformly random bytes, compute
+*              polynomial with coefficients distributed according to
+*              a centered binomial distribution with parameter KYBER_ETA1
+*              specialized for KYBER_ETA1=3
+*
+* Arguments:   - poly *r:                  pointer to output polynomial
+*              - const unsigned char *buf: pointer to input byte array
+*              - int add:                  boolean to indicate to accumulate into r
+**************************************************/
+void cbd_eta1(poly *r, const unsigned char *buf, int add) {
+  /* Each group of 3 input bytes (24 bits) yields 4 noise values in [-3, 3]. */
+  const uint32_t lane_mask = 0x00249249; /* lowest bit of every 3-bit lane */
+  unsigned int grp, lane;
+
+  for (grp = 0; grp < KYBER_N / 4; grp++) {
+    const uint32_t bits = load24_littleendian(buf + 3 * grp);
+    /* lane-wise popcount: every 3-bit lane ends up holding a value in 0..3 */
+    const uint32_t sums = (bits & lane_mask) + ((bits >> 1) & lane_mask)
+                        + ((bits >> 2) & lane_mask);
+
+    for (lane = 0; lane < 4; lane++) {
+      const unsigned int idx = 4 * grp + lane;
+      const int16_t pos = (sums >> (6 * lane + 0)) & 0x7;
+      const int16_t neg = (sums >> (6 * lane + 3)) & 0x7;
+      /* add == 0 starts from zero, otherwise accumulate onto r */
+      r->coeffs[idx] = (add ? r->coeffs[idx] : 0) + (pos - neg);
+    }
+  }
+}
+
+/*************************************************
+* Name:        cbd_eta2
+*
+* Description: Given an array of uniformly random bytes, compute
+*              polynomial with coefficients distributed according to
+*              a centered binomial distribution with parameter KYBER_ETA2
+*              specialized for KYBER_ETA2=2
+*
+* Arguments:   - poly *r:                  pointer to output polynomial
+*              - const unsigned char *buf: pointer to input byte array
+*              - int add:                  boolean to indicate to accumulate into r
+**************************************************/
+void cbd_eta2(poly *r, const unsigned char *buf, int add) {
+    /* Each 32-bit input word yields 8 noise values in [-2, 2]. */
+    const uint32_t pair_mask = 0x55555555; /* lowest bit of every 2-bit lane */
+    int word, lane;
+
+    for (word = 0; word < KYBER_N / 8; word++) {
+        const uint32_t bits = load32_littleendian(buf + 4 * word);
+        /* lane-wise popcount: every 2-bit lane ends up holding 0, 1 or 2 */
+        const uint32_t sums = (bits & pair_mask) + ((bits >> 1) & pair_mask);
+
+        for (lane = 0; lane < 8; lane++) {
+            const int idx = 8 * word + lane;
+            const int16_t pos = (sums >> (4 * lane)) & 0x3;
+            const int16_t neg = (sums >> (4 * lane + 2)) & 0x3;
+            /* add == 0 starts from zero, otherwise accumulate onto r */
+            r->coeffs[idx] = (add ? r->coeffs[idx] : 0) + (pos - neg);
+        }
+    }
+}
+
+
diff --git a/crypto_kem/ml-kem-512/m4fspeed/cbd.h b/crypto_kem/ml-kem-512/m4fspeed/cbd.h
new file mode 100644
index 0000000..47f1d24
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/cbd.h
@@ -0,0 +1,9 @@
+#ifndef CBD_H
+#define CBD_H
+
+#include "poly.h"
+/* Centered-binomial samplers: add == 0 overwrites r, any other value accumulates into r. */
+void cbd_eta1(poly *r, const unsigned char *buf, int add);
+void cbd_eta2(poly *r, const unsigned char *buf, int add);
+
+#endif
diff --git a/crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S b/crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S
new file mode 120000
index 0000000..bdef6f4
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/cmov_int16.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/cmov_int16.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S b/crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S
new file mode 120000
index 0000000..aa55564
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/fastaddsub.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastaddsub.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S b/crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S
new file mode 120000
index 0000000..4384e1d
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/fastbasemul.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastbasemul.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S b/crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S
new file mode 120000
index 0000000..ede60d7
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/fastinvntt.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastinvntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/fastntt.S b/crypto_kem/ml-kem-512/m4fspeed/fastntt.S
new file mode 120000
index 0000000..d34524f
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/fastntt.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/fastntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/indcpa.c b/crypto_kem/ml-kem-512/m4fspeed/indcpa.c
new file mode 100644
index 0000000..99f5b3c
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/indcpa.c
@@ -0,0 +1,246 @@
+#include "indcpa.h"
+#include "ntt.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "matacc.h"
+
+#include <string.h>
+#include <stdint.h>
+
+
+/*************************************************
+* Name:        indcpa_keypair_derand
+*
+* Description: Generates public and private key for the CPA-secure
+*              public-key encryption scheme underlying Kyber
+*
+* Arguments:   - uint8_t *pk: pointer to output public key
+*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key
+*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
+*              - const uint8_t *coins: pointer to input randomness
+*                             (of length KYBER_SYMBYTES bytes)
+**************************************************/
+void indcpa_keypair_derand(unsigned char *pk,
+                    unsigned char *sk, 
+                    const unsigned char *coins){
+    polyvec skpv, skpv_prime;
+    poly pkp;
+    unsigned char buf[2 * KYBER_SYMBYTES];
+    unsigned char *publicseed = buf;                 // first half of buf
+    unsigned char *noiseseed = buf + KYBER_SYMBYTES; // second half of buf
+    int i;
+    unsigned char nonce = 0;
+    // Append KYBER_K to the input coins and hash in place; buf then backs both seeds.
+    memcpy(buf, coins, KYBER_SYMBYTES);
+    buf[KYBER_SYMBYTES] = KYBER_K;
+    hash_g(buf, buf, KYBER_SYMBYTES + 1);
+    // Sample the KYBER_K secret polynomials, one nonce value each.
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise_eta1(skpv.vec + i, noiseseed, nonce++);
+
+    polyvec_ntt(&skpv);
+    // Row 0 goes through matacc_cache32, the remaining rows through matacc_opt32 (both take skpv_prime).
+    // i = 0
+    matacc_cache32(&pkp, &skpv, &skpv_prime, 0, publicseed, 0);
+    poly_invntt(&pkp);
+    // Noise is added between the inverse and the forward transform.
+    poly_addnoise_eta1(&pkp, noiseseed, nonce++);
+    poly_ntt(&pkp);
+
+    poly_tobytes(pk, &pkp);
+    for (i = 1; i < KYBER_K; i++) {
+        matacc_opt32(&pkp, &skpv, &skpv_prime, i, publicseed, 0);
+        poly_invntt(&pkp);
+
+        poly_addnoise_eta1(&pkp, noiseseed, nonce++);
+        poly_ntt(&pkp);
+        // Each row is serialized right away, so a single poly (pkp) serves all rows of the public key.
+        poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp);
+    }
+
+    polyvec_tobytes(sk, &skpv);
+    memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key
+}
+
+/*************************************************
+* Name:        indcpa_enc
+*
+* Description: Encryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - unsigned char *c:          pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
+*              - const unsigned char *m:    pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const unsigned char *pk:   pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                                           to deterministically generate all randomness
+**************************************************/
+void indcpa_enc(unsigned char *c,
+               const unsigned char *m,
+               const unsigned char *pk,
+               const unsigned char *coins) {
+    polyvec sp, sp_prime;
+    poly bp;
+    poly *pkp = &bp;      // pkp and k both alias bp: they are used one after the other
+    poly *k = &bp;
+    poly *v = &sp.vec[0]; // v aliases the first entry of sp
+    const unsigned char *seed = pk+KYBER_POLYVECBYTES; // bytes stored after the serialized polyvec in pk
+    int i;
+    unsigned char nonce = 0;
+    // Sample KYBER_K polynomials from coins (one nonce value each), then transform the vector.
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise_eta1(sp.vec + i, coins, nonce++);
+
+    polyvec_ntt(&sp);
+    // First ciphertext part: each row is built in bp and packed into c before the next one starts.
+    // i = 0
+    matacc_cache32(&bp, &sp, &sp_prime, 0, seed, 1);
+    poly_invntt(&bp);
+    poly_addnoise_eta2(&bp, coins, nonce++);
+    poly_reduce(&bp);
+    poly_packcompress(c, &bp, 0);
+    for (i = 1; i < KYBER_K; i++) {
+        matacc_opt32(&bp, &sp, &sp_prime, i, seed, 1);
+        poly_invntt(&bp);
+
+        poly_addnoise_eta2(&bp, coins, nonce++);
+        poly_reduce(&bp);
+
+        poly_packcompress(c, &bp, i);
+    }
+    // Second ciphertext part: products of sp with the pk polynomials, accumulated in 32-bit v_tmp.
+    poly_frombytes(pkp, pk);
+    int32_t v_tmp[KYBER_N];
+    // sp_prime was handed to the matacc_* calls above -- presumably a cache filled there; confirm in matacc.c.
+    poly_basemul_opt_16_32(v_tmp, &sp.vec[0], pkp, &sp_prime.vec[0]);
+    for (i = 1; i < KYBER_K - 1; i++) {
+        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
+        poly_basemul_acc_opt_32_32(v_tmp, &sp.vec[i], pkp, &sp_prime.vec[i]);
+    }
+    poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);  // i == KYBER_K - 1 here
+    poly_basemul_acc_opt_32_16(v, &sp.vec[i], pkp, &sp_prime.vec[i], v_tmp);
+
+    poly_invntt(v);
+
+    poly_addnoise_eta2(v, coins, nonce++);
+    // k aliases bp, which was last used as pkp above.
+    poly_frommsg(k, m);
+    poly_add(v, v, k);
+    poly_reduce(v);
+
+    poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v);
+}
+
+/*************************************************
+* Name:        indcpa_enc_cmp
+*
+* Description: Re-encryption function.
+*              Compares the re-encrypted ciphertext with the original ciphertext byte by byte.
+*              The comparison is performed in a constant time manner.
+*
+*
+* Arguments:   - const unsigned char *c:    pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes)
+*              - const unsigned char *m:    pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const unsigned char *pk:   pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                                           to deterministically generate all randomness
+* Returns:     - boolean byte indicating that re-encrypted ciphertext is NOT equal to the original ciphertext
+**************************************************/
+unsigned char indcpa_enc_cmp(const unsigned char *c,
+                             const unsigned char *m,
+                             const unsigned char *pk,
+                             const unsigned char *coins) {
+    uint64_t rc = 0;      // OR of all comparison results; stays 0 only if nothing differed
+    polyvec sp, sp_prime;
+    poly bp;
+    poly *pkp = &bp;      // pkp and k both alias bp: they are used one after the other
+    poly *k = &bp;
+    poly *v = &sp.vec[0]; // v aliases the first entry of sp
+    const unsigned char *seed = pk+KYBER_POLYVECBYTES; // bytes stored after the serialized polyvec in pk
+    int i;
+    unsigned char nonce = 0;
+    // Same computation as indcpa_enc, but every packing step is replaced by a cmp_* call against c.
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise_eta1(sp.vec + i, coins, nonce++);
+
+    polyvec_ntt(&sp);
+    // i = 0
+    matacc_cache32(&bp, &sp, &sp_prime, 0, seed, 1);
+    poly_invntt(&bp);
+    poly_addnoise_eta2(&bp, coins, nonce++);
+    poly_reduce(&bp);
+    rc |= cmp_poly_packcompress(c, &bp, 0);
+    for (i = 1; i < KYBER_K; i++) {
+        matacc_opt32(&bp, &sp, &sp_prime, i, seed, 1);
+        poly_invntt(&bp);
+
+        poly_addnoise_eta2(&bp, coins, nonce++);
+        poly_reduce(&bp);
+
+        rc |= cmp_poly_packcompress(c, &bp, i);
+    }
+    // Second ciphertext part: products of sp with the pk polynomials, accumulated in 32-bit v_tmp.
+    poly_frombytes(pkp, pk);
+    int32_t v_tmp[KYBER_N];
+    
+    poly_basemul_opt_16_32(v_tmp, &sp.vec[0], pkp, &sp_prime.vec[0]);
+    for (i = 1; i < KYBER_K - 1; i++) {
+        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
+        poly_basemul_acc_opt_32_32(v_tmp, &sp.vec[i], pkp, &sp_prime.vec[i]);
+    }
+    poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);  // i == KYBER_K - 1 here
+    poly_basemul_acc_opt_32_16(v, &sp.vec[i], pkp, &sp_prime.vec[i], v_tmp);
+
+    poly_invntt(v);
+
+    poly_addnoise_eta2(v, coins, nonce++);
+    poly_frommsg(k, m);
+    poly_add(v, v, k);
+    poly_reduce(v);
+
+    rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v);
+    // Branch-free fold to 0/1: -rc has its top bit set for any nonzero rc below 2^63.
+    rc = ~rc + 1;
+    rc >>= 63;
+    return (unsigned char)rc;
+}
+
+/*************************************************
+* Name:        indcpa_dec
+*
+* Description: Decryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - unsigned char *m:        pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
+*              - const unsigned char *c:  pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
+*              - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
+**************************************************/
+void __attribute__ ((noinline)) indcpa_dec(unsigned char *m,
+                                           const unsigned char *c,
+                                           const unsigned char *sk) {
+    poly mp, bp;
+    poly *v = &bp;            // v aliases bp; it is written only after the last use of bp below
+    int32_t r_tmp[KYBER_N];   // 32-bit buffer threaded through the *_mul_* calls (presumably the running sum)
+    int i;
+    // Polynomial 0 of the ciphertext vector is combined with the first KYBER_POLYBYTES of sk.
+    poly_unpackdecompress(&mp, c, 0);
+    poly_ntt(&mp);
+    poly_frombytes_mul_16_32(r_tmp, &mp, sk);
+    for(i = 1; i < KYBER_K - 1; i++) {
+        poly_unpackdecompress(&bp, c, i);
+        poly_ntt(&bp);
+        poly_frombytes_mul_32_32(r_tmp, &bp, sk + i*KYBER_POLYBYTES);
+    }
+    poly_unpackdecompress(&bp, c, i);   // i == KYBER_K - 1 here
+    poly_ntt(&bp);
+    poly_frombytes_mul_32_16(&mp, &bp, sk + i*KYBER_POLYBYTES, r_tmp);
+
+    poly_invntt(&mp);
+    poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
+    poly_sub(&mp, v, &mp);    // presumably mp = v - mp (result first, as with poly_add) -- confirm in poly.h
+    poly_reduce(&mp);
+
+    poly_tomsg(m, &mp);
+}
diff --git a/crypto_kem/ml-kem-512/m4fspeed/indcpa.h b/crypto_kem/ml-kem-512/m4fspeed/indcpa.h
new file mode 120000
index 0000000..e6f3662
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/indcpa.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/indcpa.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/kem.c b/crypto_kem/ml-kem-512/m4fspeed/kem.c
new file mode 120000
index 0000000..489b6f9
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/kem.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/kem.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/macros.i b/crypto_kem/ml-kem-512/m4fspeed/macros.i
new file mode 120000
index 0000000..a7d8e74
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/macros.i
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/macros.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/matacc.c b/crypto_kem/ml-kem-512/m4fspeed/matacc.c
new file mode 120000
index 0000000..71d7234
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/matacc.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/matacc.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/matacc.h b/crypto_kem/ml-kem-512/m4fspeed/matacc.h
new file mode 120000
index 0000000..19b6772
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/matacc.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/matacc.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/matacc.i b/crypto_kem/ml-kem-512/m4fspeed/matacc.i
new file mode 120000
index 0000000..39b6e23
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/matacc.i
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/matacc.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S b/crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S
new file mode 120000
index 0000000..3c7d05e
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/matacc_asm.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/matacc_asm.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/ntt.c b/crypto_kem/ml-kem-512/m4fspeed/ntt.c
new file mode 120000
index 0000000..971c6b0
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/ntt.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/ntt.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/ntt.h b/crypto_kem/ml-kem-512/m4fspeed/ntt.h
new file mode 120000
index 0000000..11e111d
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/ntt.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/ntt.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/params.h b/crypto_kem/ml-kem-512/m4fspeed/params.h
new file mode 100644
index 0000000..be9ec45
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/params.h
@@ -0,0 +1,32 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define KYBER_K 2 /* Change this for different security strengths */
+
+/* Don't change parameters below this line */
+
+#define KYBER_N 256   /* number of coefficients per polynomial */
+#define KYBER_Q 3329  /* coefficient modulus */
+
+#define KYBER_ETA1 3  /* centered-binomial parameter used by cbd_eta1 */
+#define KYBER_ETA2 2  /* centered-binomial parameter used by cbd_eta2 */
+
+#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
+#define KYBER_SSBYTES  32   /* size in bytes of shared key */
+
+#define KYBER_POLYBYTES              384 /* KYBER_N coefficients at 12 bits each */
+#define KYBER_POLYVECBYTES           (KYBER_K * KYBER_POLYBYTES)
+
+#define KYBER_POLYCOMPRESSEDBYTES    128 /* KYBER_N coefficients at 4 bits each */
+#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320) /* 320 bytes = KYBER_N coefficients at 10 bits each */
+
+#define KYBER_INDCPA_MSGBYTES       KYBER_SYMBYTES
+#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
+#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
+#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
+
+#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
+#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES +  KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 2*KYBER_SYMBYTES = 64 additional bytes: 32 to save H(pk), the other 32 presumably for the rejection value z -- confirm in kem.c */
+#define KYBER_CIPHERTEXTBYTES  KYBER_INDCPA_BYTES
+
+#endif
diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly.c b/crypto_kem/ml-kem-512/m4fspeed/poly.c
new file mode 100644
index 0000000..401b26b
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/poly.c
@@ -0,0 +1,672 @@
+#include "poly.h"
+
+#include "cbd.h"
+#include "ntt.h"
+#include "params.h"
+#include "symmetric.h"
+
+#include <stdint.h>
+
+
+/*************************************************
+* Name:        poly_compress
+*
+* Description: Serialization of a polynomial and subsequent compression of a polynomial;
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES)
+*              - const poly *a:    pointer to input polynomial to be serialized
+*************************************************/
+void poly_compress(unsigned char *r, const poly *a)
+{
+  unsigned int i,j;
+  int16_t u;
+  uint32_t d0;
+  uint8_t t[8];
+  /* Coefficients are handled in groups of 8; t[] holds the compressed values of one group. */
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;  // adds KYBER_Q only when u is negative (relies on an arithmetic right shift)
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635;               // 80635 = floor(2^28 / KYBER_Q): multiply-and-shift in place of the division above
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
+    }
+    /* 4 bits per coefficient: two coefficients per output byte */
+    r[0] = t[0] | (t[1] << 4);
+    r[1] = t[2] | (t[3] << 4);
+    r[2] = t[4] | (t[5] << 4);
+    r[3] = t[6] | (t[7] << 4);
+    r += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;  // adds KYBER_Q only when u is negative (relies on an arithmetic right shift)
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318;               // 40318 is 2^27 / KYBER_Q rounded to the nearest integer
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+    /* 5 bits per coefficient: eight coefficients are spread over five output bytes */
+    r[0] = (t[0] >> 0) | (t[1] << 5);
+    r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
+    r[2] = (t[3] >> 1) | (t[4] << 4);
+    r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
+    r[4] = (t[6] >> 2) | (t[7] << 3);
+    r += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_decompress
+*
+* Description: De-serialization and subsequent decompression of a polynomial;
+*              approximate inverse of poly_compress
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes)
+**************************************************/
+void poly_decompress(poly *r, const unsigned char *a)
+{
+  int i;
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    r->coeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4;
+    a += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    r->coeffs[i+0] =  (((a[0] & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] &  1) << 4)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] &  7) << 2)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+7] =  (((a[4] >> 3) * KYBER_Q) + 16) >> 5;
+    a += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_packcompress
+*
+* Description: Serialization and subsequent compression of a polynomial of a polyvec,
+*              writes to a byte string representation of the whole polyvec.
+*              Used to compress a polyvec one poly at a time in a loop.
+*
+* Arguments:   - unsigned char *r:  pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
+*              - poly *a:           pointer to input polynomial
+*              - int i:             index of the polynomial to be serialized within the serialized polyvec
+**************************************************/
+void poly_packcompress(unsigned char *r, poly *a, int i) {
+    int j, k;
+    uint64_t d0;  /* 64-bit scratch value for the multiply-and-shift below */
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+  /* 11 bits per coefficient: 8 coefficients are packed into 11 bytes; polynomial i starts at r + 352*i */
+  for(j=0;j<KYBER_N/8;j++) {
+      for(k=0;k<8;k++) {
+        t[k]  = a->coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;  /* adds KYBER_Q only when the coefficient is negative */
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;  /* 645084 is 2^31 / KYBER_Q rounded to the nearest integer */
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+      }
+      
+
+    r[352*i+11*j+ 0] =  t[0] & 0xff;
+    r[352*i+11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
+    r[352*i+11*j+ 2] = (t[1] >>  5) | ((t[2] & 0x03) << 6);
+    r[352*i+11*j+ 3] = (t[2] >>  2) & 0xff;
+    r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
+    r[352*i+11*j+ 5] = (t[3] >>  7) | ((t[4] & 0x0f) << 4);
+    r[352*i+11*j+ 6] = (t[4] >>  4) | ((t[5] & 0x01) << 7);
+    r[352*i+11*j+ 7] = (t[5] >>  1) & 0xff;
+    r[352*i+11*j+ 8] = (t[5] >>  9) | ((t[6] & 0x3f) << 2);
+    r[352*i+11*j+ 9] = (t[6] >>  6) | ((t[7] & 0x07) << 5);
+    r[352*i+11*j+10] = (t[7] >>  3);
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+    /* 10 bits per coefficient: 4 coefficients are packed into 5 bytes; polynomial i starts at r + 320*i */
+    for (j = 0; j < KYBER_N / 4; j++) {
+        for(k=0;k<4;k++) {
+            t[k]  = a->coeffs[4*j+k];
+            t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;  /* adds KYBER_Q only when the coefficient is negative */
+            /*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+            d0 = t[k];
+            d0 <<= 10;
+            d0 += 1665;
+            d0 *= 1290167;  /* 1290167 = floor(2^32 / KYBER_Q) */
+            d0 >>= 32;
+            t[k] = d0 & 0x3ff;
+        }
+        r[320*i+5*j+0] =   t[0] & 0xff;
+        r[320*i+5*j+1] =  (t[0] >>  8) | ((t[1] & 0x3f) << 2);
+        r[320*i+5*j+2] = ((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff;
+        r[320*i+5*j+3] = ((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff;
+        r[320*i+5*j+4] =  (t[3] >>  2) & 0xff;
+    }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})"
+#endif
+}
+
+/*************************************************
+* Name:        poly_unpackdecompress
+*
+* Description: Deserialization and subsequent decompression of one polynomial of a polyvec.
+*              Used to decompress a polyvec one poly at a time in a loop.
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
+*              - int i:                  index of poly in polyvec to decompress
+**************************************************/
+void poly_unpackdecompress(poly *r, const unsigned char *a, int i) {
+  int j;  /* index of the current group of coefficients */
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+    for(j=0;j<KYBER_N/8;j++)  /* 11-bit values: 8 coefficients are read from 11 bytes; polynomial i starts at a + 352*i */
+    {  /* each 11-bit value x becomes (x * KYBER_Q + 1024) >> 11 */
+      r->coeffs[8*j+0] =  (((a[352*i+11*j+ 0]       | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) <<  9)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11;
+    }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    for(j=0;j<KYBER_N/4;j++)  /* 10-bit values: 4 coefficients are read from 5 bytes; polynomial i starts at a + 320*i */
+    {  /* each 10-bit value x becomes (x * KYBER_Q + 512) >> 10 */
+      r->coeffs[4*j+0] =  (((a[320*i+5*j+ 0]       | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10;
+    }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+
+/*************************************************
+* Name:        cmp_poly_compress
+*
+* Description: Compresses and serializes a polynomial and compares the result, byte by byte, to a serialized polynomial
+*
+* Arguments:   - const unsigned char *r:    pointer to serialized polynomial to compare with
+*              - poly *a:                   pointer to input polynomial to serialize and compare
+* Returns:                                  0 if every compared byte matches, non-zero otherwise
+**************************************************/
+int cmp_poly_compress(const unsigned char *r, poly *a) {
+    unsigned char rc = 0; /* ORs together the XOR of each expected/computed byte pair (truncated to 8 bits) */
+    int16_t u;
+    uint32_t d0;
+    uint8_t t[8];
+    int i, j, k = 0;
+
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635; /* multiply-and-shift used in place of the division by KYBER_Q in the formula above */
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
+    }
+        rc |= r[k]      ^ (t[0] | (t[1] << 4)); /* two 4-bit values per byte */
+        rc |= r[k + 1]  ^ (t[2] | (t[3] << 4));
+        rc |= r[k + 2]  ^ (t[4] | (t[5] << 4));
+        rc |= r[k + 3]  ^ (t[6] | (t[7] << 4));
+        k += 4;
+    }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318; /* multiply-and-shift used in place of the division by KYBER_Q in the formula above */
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+
+      /* eight 5-bit values per 5 bytes; bits above bit 7 are dropped when stored into rc */
+      rc |= r[k]   ^ (t[0]       | (t[1] << 5));
+      rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+      rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4));
+      rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+      rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3));
+      k += 5;
+    }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+    return rc;
+}
+
+/*************************************************
+* Name:        cmp_poly_packcompress
+*
+* Description: Compresses and serializes one poly of a polyvec and compares it, byte by byte, to a serialized polyvec.
+*              Should be called in a loop over all poly's of a polyvec.
+*
+* Arguments:   - const unsigned char *r:    pointer to serialized polyvec to compare with
+*              - poly *a:                   pointer to input polynomial of polyvec to serialize and compare
+*              - int i:                     index of poly in polyvec to compare with
+* Returns:                                  0 if every compared byte of poly i matches, non-zero otherwise
+**************************************************/
+int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) {
+    unsigned char rc = 0; /* ORs together the XOR of each expected/computed byte pair (truncated to 8 bits) */
+    int j, k;
+    uint64_t d0;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+    for(j=0;j<KYBER_N/8;j++) /* eight 11-bit values are compared against 11 bytes of r per iteration */
+    {
+      for(k=0;k<8;k++) {
+        t[k]  = a->coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; /* adds KYBER_Q when the coefficient is negative */
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084; /* multiply-and-shift used in place of the division by KYBER_Q in the formula above */
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+      }
+
+      rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff);
+      rc |= r[352*i+11*j+ 1] ^ ((t[0] >>  8) | ((t[1] & 0x1f) << 3));
+      rc |= r[352*i+11*j+ 2] ^ ((t[1] >>  5) | ((t[2] & 0x03) << 6));
+      rc |= r[352*i+11*j+ 3] ^ ((t[2] >>  2) & 0xff);
+      rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1));
+      rc |= r[352*i+11*j+ 5] ^ ((t[3] >>  7) | ((t[4] & 0x0f) << 4));
+      rc |= r[352*i+11*j+ 6] ^ ((t[4] >>  4) | ((t[5] & 0x01) << 7));
+      rc |= r[352*i+11*j+ 7] ^ ((t[5] >>  1) & 0xff);
+      rc |= r[352*i+11*j+ 8] ^ ((t[5] >>  9) | ((t[6] & 0x3f) << 2));
+      rc |= r[352*i+11*j+ 9] ^ ((t[6] >>  6) | ((t[7] & 0x07) << 5));
+      rc |= r[352*i+11*j+10] ^ ((t[7] >>  3));
+    }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+        for (j = 0; j < KYBER_N / 4; j++) { /* four 10-bit values are compared against 5 bytes of r per iteration */
+        for(k=0;k<4;k++) {
+            t[k]  = a->coeffs[4*j+k];
+            t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; /* adds KYBER_Q when the coefficient is negative */
+            /*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+            d0 = t[k];
+            d0 <<= 10;
+            d0 += 1665;
+            d0 *= 1290167; /* multiply-and-shift used in place of the division by KYBER_Q in the formula above */
+            d0 >>= 32;
+            t[k] = d0 & 0x3ff;
+        }
+
+            rc |= r[320*i+5*j+0] ^ (t[0] & 0xff);
+            rc |= r[320*i+5*j+1] ^ ((t[0] >>  8) | ((t[1] & 0x3f) << 2));
+            rc |= r[320*i+5*j+2] ^ (((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff);
+            rc |= r[320*i+5*j+3] ^ (((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff);
+            rc |= r[320*i+5*j+4] ^ ((t[3] >>  2) & 0xff);
+        }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+    return rc;
+}
+
+/*************************************************
+* Name:        poly_tobytes
+*
+* Description: Reduces a polynomial in place (poly_reduce) and serializes it, three output bytes per pair of coefficients
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes)
+*              - poly *a:          pointer to input polynomial (modified: it is reduced first)
+**************************************************/
+void poly_tobytes(unsigned char *r, poly *a) {
+    unsigned char *out = r; /* advances by 3 bytes per coefficient pair */
+    uint16_t lo, hi;
+    int pair;
+    poly_reduce(a);
+
+    for (pair = 0; pair < KYBER_N / 2; pair++, out += 3) {
+        lo = a->coeffs[2 * pair];
+        hi = a->coeffs[2 * pair + 1];
+        out[0] = lo & 0xff;
+        out[1] = (lo >> 8) | ((hi & 0xf) << 4);
+        out[2] = (hi >> 4) & 0xff;
+    }
+}
+
+/*************************************************
+* Name:        poly_frombytes
+*
+* Description: De-serialization of a polynomial (inverse of poly_tobytes);
+*              every three input bytes are unpacked into two coefficients
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+void poly_frombytes(poly *r, const unsigned char *a) {
+    const unsigned char *in = a; /* advances by 3 bytes per coefficient pair */
+    int pair;
+    for (pair = 0; pair < KYBER_N / 2; pair++, in += 3) {
+        r->coeffs[2 * pair]     = in[0] | (((uint16_t)in[1] & 0x0f) << 8);
+        r->coeffs[2 * pair + 1] = (in[1] >> 4) | (((uint16_t)in[2] & 0xff) << 4);
+    }
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul_16_32
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial
+*              Using strategy of better accumulation. Thin wrapper: forwards to frombytes_mul_asm_16_32 with the zetas table.
+* Arguments:   - int32_t *r_tmp:         array for accumulating unreduced results
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+extern void frombytes_mul_asm_16_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]);
+void poly_frombytes_mul_16_32(int32_t *r_tmp, const poly *b, const unsigned char *a) {
+    frombytes_mul_asm_16_32(r_tmp, b->coeffs, a, zetas); /* zetas is declared outside this function */
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul_32_32
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial
+*              Using strategy of better accumulation. Thin wrapper: forwards to frombytes_mul_asm_acc_32_32 with the zetas table.
+* Arguments:   - int32_t *r_tmp:         array for accumulating unreduced results (NOTE(review): the "acc" in the asm routine name suggests it adds onto existing contents - confirm)
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+extern void frombytes_mul_asm_acc_32_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]);
+void poly_frombytes_mul_32_32(int32_t *r_tmp, const poly *b, const unsigned char *a) {
+    frombytes_mul_asm_acc_32_32(r_tmp, b->coeffs, a, zetas); /* zetas is declared outside this function */
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul_32_16
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial
+*              Using strategy of better accumulation. Thin wrapper: forwards to frombytes_mul_asm_acc_32_16 with the zetas table.
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+*              - const int32_t *r_tmp:   array containing unreduced results
+**************************************************/
+extern void frombytes_mul_asm_acc_32_16(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64], const int32_t *r_tmp);
+void poly_frombytes_mul_32_16(poly *r, const poly* b, const unsigned char *a, const int32_t *r_tmp) {
+    frombytes_mul_asm_acc_32_16(r->coeffs, b->coeffs, a, zetas, r_tmp); /* note: the asm routine takes r_tmp last, after zetas */
+}
+
+/*************************************************
+* Name:        poly_noise_eta1
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA1
+*
+* Arguments:   - poly *r:                   pointer to output polynomial
+*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
+*              - unsigned char nonce:       one-byte input nonce
+*              - int add:                   boolean to indicate to accumulate into r (poly.h: poly_getnoise_eta1 passes 0, poly_addnoise_eta1 passes 1)
+**************************************************/
+void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add) {
+    unsigned char buf[KYBER_ETA1 * KYBER_N / 4];
+
+    prf(buf, KYBER_ETA1 * KYBER_N / 4, seed, nonce); /* fills buf from (seed, nonce); prf is defined outside this file */
+    cbd_eta1(r, buf, add); /* add is forwarded unchanged */
+}
+
+/*************************************************
+* Name:        poly_noise_eta2
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA2
+*
+* Arguments:   - poly *r:                   pointer to output polynomial
+*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
+*              - unsigned char nonce:       one-byte input nonce
+*              - int add:                   boolean to indicate to accumulate into r (poly.h: poly_getnoise_eta2 passes 0, poly_addnoise_eta2 passes 1)
+**************************************************/
+void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add) {
+    unsigned char buf[KYBER_ETA2 * KYBER_N / 4];
+
+    prf(buf, KYBER_ETA2 * KYBER_N / 4, seed, nonce); /* fills buf from (seed, nonce); prf is defined outside this file */
+    cbd_eta2(r, buf, add); /* add is forwarded unchanged */
+}
+
+/*************************************************
+* Name:        poly_basemul_opt_16_32
+*
+* Description: Multiplication of two polynomials using asymmetric multiplication.
+*              Cached values are generated during matrix-vector product.
+*              Using strategy of better accumulation (initial step).
+* Arguments:   - int32_t *r_tmp:      array for accumulating unreduced results
+*              - const poly *a:       pointer to input polynomial
+*              - const poly *b:       pointer to input polynomial
+*              - const poly *a_prime: pointer to a pre-multiplied by zetas
+**************************************************/
+extern void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *);
+void poly_basemul_opt_16_32(int32_t *r_tmp, const poly *a, const poly *b, const poly *a_prime) {
+    basemul_asm_opt_16_32(r_tmp, a->coeffs, b->coeffs, a_prime->coeffs); /* thin wrapper: arguments forwarded in the same order */
+}
+
+/*************************************************
+* Name:        poly_basemul_acc_opt_32_32
+*
+* Description: Multiplication of two polynomials using asymmetric multiplication.
+*              Cached values are generated during matrix-vector product.
+*              Using strategy of better accumulation.
+* Arguments:   - int32_t *r:          array for accumulating unreduced results
+*              - const poly *a:       pointer to input polynomial
+*              - const poly *b:       pointer to input polynomial
+*              - const poly *a_prime: pointer to a pre-multiplied by zetas
+**************************************************/
+extern void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *);
+void poly_basemul_acc_opt_32_32(int32_t *r, const poly *a, const poly *b, const poly *a_prime) {
+    basemul_asm_acc_opt_32_32(r, a->coeffs, b->coeffs, a_prime->coeffs); /* thin wrapper: arguments forwarded in the same order */
+}
+
+/*************************************************
+* Name:        poly_basemul_acc_opt_32_16
+*
+* Description: Multiplication of two polynomials using asymmetric multiplication.
+*              Cached values are generated during matrix-vector product.
+*              Using strategy of better accumulation (final step).
+* Arguments:   - poly *r:              pointer to output polynomial
+*              - const poly *a:        pointer to input polynomial
+*              - const poly *b:        pointer to input polynomial
+*              - const poly *a_prime:  pointer to a pre-multiplied by zetas
+*              - const int32_t *r_tmp: array containing unreduced results
+**************************************************/
+extern void basemul_asm_acc_opt_32_16(int16_t *, const int16_t *, const int16_t *, const int16_t *, const int32_t *);
+void poly_basemul_acc_opt_32_16(poly *r, const poly *a, const poly *b, const poly *a_prime, const int32_t * r_tmp) {
+    basemul_asm_acc_opt_32_16(r->coeffs, a->coeffs, b->coeffs, a_prime->coeffs, r_tmp); /* thin wrapper: arguments forwarded in the same order */
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Computes negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in normal order, output in bitreversed order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_ntt(poly *r) {
+    ntt(r->coeffs); /* ntt() operates directly on the coefficient array */
+}
+
+/*************************************************
+* Name:        poly_invntt
+*
+* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in bitreversed order, output in normal order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_invntt(poly *r) {
+    invntt(r->coeffs); /* invntt() operates directly on the coefficient array */
+}
+
+extern void asm_fromplant(int16_t *r);
+/*************************************************
+* Name:        poly_fromplant
+*
+* Description: Inplace conversion of all coefficients of a polynomial
+*              from Montgomery domain to normal domain (NOTE(review): the name says "plant", not Montgomery - confirm which domain asm_fromplant expects)
+*
+* Arguments:   - poly *r:       pointer to input/output polynomial
+**************************************************/
+void poly_fromplant(poly *r) {
+  asm_fromplant(r->coeffs);
+}
+
+extern void asm_barrett_reduce(int16_t *r); /* defined outside this file */
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Applies Barrett reduction to all coefficients of a polynomial
+*              for details of the Barrett reduction see comments in reduce.c (NOTE(review): this directory ships reduce.S - confirm the file name)
+*
+* Arguments:   - poly *r:       pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *r) {
+  asm_barrett_reduce(r->coeffs); /* operates in place on the coefficient array */
+}
+
+extern void pointwise_add(int16_t *, const int16_t *, const int16_t *); /* defined outside this file */
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add two polynomials (thin wrapper around pointwise_add)
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_add(poly *r, const poly *a, const poly *b) {
+    pointwise_add(r->coeffs,a->coeffs,b->coeffs); /* callers in indcpa.c pass r == a, so aliasing must be supported */
+}
+
+
+extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *); /* defined outside this file */
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract two polynomials (thin wrapper around pointwise_sub)
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial (minuend, per the argument order)
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_sub(poly *r, const poly *a, const poly *b) {
+    pointwise_sub(r->coeffs,a->coeffs,b->coeffs); /* callers in indcpa.c pass r == b, so aliasing must be supported */
+}
+
+void cmov_int16(int16_t *r, int16_t v, uint16_t b); /* defined outside this file; presumably a constant-time conditional move - confirm */
+
+/*************************************************
+* Name:        poly_frommsg
+*
+* Description: Convert a KYBER_INDCPA_MSGBYTES-byte message to a polynomial, one coefficient per message bit
+*
+* Arguments:   - poly *r:                                  pointer to output polynomial
+*              - const uint8_t msg[KYBER_INDCPA_MSGBYTES]: pointer to input message
+**************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
+{
+  unsigned int i,j;
+
+#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
+#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
+#endif
+
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      r->coeffs[8*i+j] = 0; /* default for a 0 bit */
+      cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1); /* bit j of byte i selects (KYBER_Q+1)/2; no branch on the message bit here */
+    }
+  }
+}
+
+/*************************************************
+* Name:        poly_tomsg
+*
+* Description: Convert polynomial to a KYBER_SYMBYTES-byte message, one output bit per coefficient
+*
+* Arguments:   - unsigned char *msg: pointer to output message
+*              - poly *a:            pointer to input polynomial (only read here)
+**************************************************/
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) {
+    uint32_t t;
+    int i, j;
+
+    for (i = 0; i < KYBER_SYMBYTES; i++) {
+        msg[i] = 0;
+        for (j = 0; j < 8; j++) {
+            t  = a->coeffs[8*i+j]; /* int16_t converted to uint32_t; arithmetic below wraps modulo 2^32 */
+            t <<= 1;
+            t += 1665;
+            t *= 80635; /* same multiply-and-shift constants as cmp_poly_compress; presumably rounds 2*t/KYBER_Q - confirm */
+            t >>= 28;
+            t &= 1;
+            msg[i] |= t << j; /* coefficient 8*i+j becomes bit j of byte i */
+        }
+    }
+}
+
+/*************************************************
+* Name:        poly_zeroize
+*
+* Description: Sets all KYBER_N coefficients of a polynomial to zero
+*
+* Arguments:   - poly *p: pointer to polynomial to clear
+**************************************************/
+void poly_zeroize(poly *p) {
+  int idx = 0;
+  while (idx < KYBER_N)
+   p->coeffs[idx++] = 0;
+}
diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly.h b/crypto_kem/ml-kem-512/m4fspeed/poly.h
new file mode 100644
index 0000000..4994d87
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/poly.h
@@ -0,0 +1,56 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "params.h"
+
+#include <stdint.h>
+/* getnoise variants call poly_noise_* with add = 0, addnoise variants with add = 1 (accumulate into p) */
+#define poly_getnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 0)
+#define poly_getnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 0)
+#define poly_addnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 1)
+#define poly_addnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 1)
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct {
+    int16_t coeffs[KYBER_N];
+} poly;
+
+void poly_compress(unsigned char *r, const poly *a);
+void poly_decompress(poly *r, const unsigned char *a);
+
+void poly_packcompress(unsigned char *r, poly *a, int i);
+void poly_unpackdecompress(poly *r, const unsigned char *a, int i);
+
+int cmp_poly_compress(const unsigned char *r, poly *a);
+int cmp_poly_packcompress(const unsigned char *r, poly *a, int i);
+
+void poly_tobytes(unsigned char *r, poly *a);
+void poly_frombytes(poly *r, const unsigned char *a);
+void poly_frombytes_mul_16_32(int32_t *r_tmp, const poly *b, const unsigned char *a);
+void poly_frombytes_mul_32_32(int32_t *r_tmp, const poly *b, const unsigned char *a);
+void poly_frombytes_mul_32_16(poly *r, const poly* b, const unsigned char *a, const int32_t *r_tmp);
+
+void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]);
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a);
+
+void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add);
+void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add);
+
+void poly_ntt(poly *r);
+void poly_invntt(poly *r);
+void poly_basemul_opt_16_32(int32_t *r, const poly *a, const poly *b, const poly *a_prime);
+void poly_basemul_acc_opt_32_32(int32_t *r, const poly *a, const poly *b, const poly *a_prime);
+void poly_basemul_acc_opt_32_16(poly *r, const poly *a, const poly *b, const poly *a_prime, const int32_t * r_tmp);
+void poly_fromplant(poly *r);
+
+void poly_reduce(poly *r);
+
+void poly_add(poly *r, const poly *a, const poly *b);
+void poly_sub(poly *r, const poly *a, const poly *b);
+
+void poly_zeroize(poly *p);
+
+#endif
diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly_asm.S b/crypto_kem/ml-kem-512/m4fspeed/poly_asm.S
new file mode 120000
index 0000000..c4bda05
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/poly_asm.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/poly_asm.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/polyvec.c b/crypto_kem/ml-kem-512/m4fspeed/polyvec.c
new file mode 120000
index 0000000..c3f7d0a
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/polyvec.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/polyvec.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/polyvec.h b/crypto_kem/ml-kem-512/m4fspeed/polyvec.h
new file mode 120000
index 0000000..47cf6c3
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/polyvec.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/polyvec.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/reduce.S b/crypto_kem/ml-kem-512/m4fspeed/reduce.S
new file mode 120000
index 0000000..2edf10c
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/reduce.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/reduce.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202. b/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.
new file mode 120000
index 0000000..e49ba06
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/symmetric-fips202.
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.c b/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.c
new file mode 120000
index 0000000..5adc9ae
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/symmetric-fips202.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/symmetric-fips202.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/symmetric.h b/crypto_kem/ml-kem-512/m4fspeed/symmetric.h
new file mode 120000
index 0000000..698a10d
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/symmetric.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/symmetric.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/verify.c b/crypto_kem/ml-kem-512/m4fspeed/verify.c
new file mode 120000
index 0000000..85d7f50
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/verify.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/verify.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fspeed/verify.h b/crypto_kem/ml-kem-512/m4fspeed/verify.h
new file mode 120000
index 0000000..e19a301
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fspeed/verify.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/verify.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/api.h b/crypto_kem/ml-kem-512/m4fstack/api.h
new file mode 120000
index 0000000..cf75db9
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/api.h
@@ -0,0 +1 @@
+../m4fspeed/api.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/cbd.c b/crypto_kem/ml-kem-512/m4fstack/cbd.c
new file mode 120000
index 0000000..903fa59
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/cbd.c
@@ -0,0 +1 @@
+../m4fspeed/cbd.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/cbd.h b/crypto_kem/ml-kem-512/m4fstack/cbd.h
new file mode 120000
index 0000000..d264c36
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/cbd.h
@@ -0,0 +1 @@
+../m4fspeed/cbd.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/cmov_int16.S b/crypto_kem/ml-kem-512/m4fstack/cmov_int16.S
new file mode 120000
index 0000000..bdef6f4
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/cmov_int16.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fspeed/cmov_int16.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/fastaddsub.S b/crypto_kem/ml-kem-512/m4fstack/fastaddsub.S
new file mode 120000
index 0000000..d1317f7
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/fastaddsub.S
@@ -0,0 +1 @@
+../m4fspeed/fastaddsub.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/fastbasemul.S b/crypto_kem/ml-kem-512/m4fstack/fastbasemul.S
new file mode 120000
index 0000000..531385d
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/fastbasemul.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/fastbasemul.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/fastinvntt.S b/crypto_kem/ml-kem-512/m4fstack/fastinvntt.S
new file mode 120000
index 0000000..1ad2d31
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/fastinvntt.S
@@ -0,0 +1 @@
+../m4fspeed/fastinvntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/fastntt.S b/crypto_kem/ml-kem-512/m4fstack/fastntt.S
new file mode 120000
index 0000000..208c11d
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/fastntt.S
@@ -0,0 +1 @@
+../m4fspeed/fastntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/indcpa.c b/crypto_kem/ml-kem-512/m4fstack/indcpa.c
new file mode 100644
index 0000000..94d6a57
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/indcpa.c
@@ -0,0 +1,211 @@
+#include "indcpa.h"
+#include "ntt.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "matacc.h"
+
+#include <string.h>
+#include <stdint.h>
+
+/*************************************************
+* Name:        indcpa_keypair_derand
+*
+* Description: Generates public and private key for the CPA-secure public-key encryption scheme underlying Kyber, deterministically from coins
+*
+* Arguments:   - unsigned char *pk:          pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - unsigned char *sk:          pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input randomness (KYBER_SYMBYTES bytes are read)
+**************************************************/
+void indcpa_keypair_derand(unsigned char *pk,
+                    unsigned char *sk, 
+                    const unsigned char *coins){
+    polyvec skpv;
+    poly pkp; /* holds one public-key polynomial at a time */
+    unsigned char buf[2 * KYBER_SYMBYTES];
+    unsigned char *publicseed = buf; /* first half of buf */
+    unsigned char *noiseseed = buf + KYBER_SYMBYTES; /* second half of buf */
+    int i;
+    unsigned char nonce = 0; /* incremented after every noise polynomial */
+
+    memcpy(buf, coins, KYBER_SYMBYTES);
+    buf[KYBER_SYMBYTES] = KYBER_K; /* KYBER_K is appended as one extra hash input byte */
+    hash_g(buf, buf, KYBER_SYMBYTES + 1); /* in place: output overwrites buf, which is then read as publicseed || noiseseed */
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise_eta1(skpv.vec + i, noiseseed, nonce++);
+
+    polyvec_ntt(&skpv);
+
+    for (i = 0; i < KYBER_K; i++) {
+        matacc(&pkp, &skpv, i, publicseed, 0); /* NOTE(review): presumably row i of the matrix generated from publicseed times skpv - confirm in matacc.c */
+        poly_invntt(&pkp);
+
+        poly_addnoise_eta1(&pkp, noiseseed, nonce++);
+        poly_ntt(&pkp);
+
+        poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp); /* each polynomial is serialized as soon as it is computed */
+    }
+
+    polyvec_tobytes(sk, &skpv);
+    memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key
+}
+
+/*************************************************
+* Name:        indcpa_enc
+*
+* Description: Encryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - unsigned char *c:           pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
+*              - const unsigned char *m:     pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const unsigned char *pk:    pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                                            to deterministically generate all randomness
+**************************************************/
+void indcpa_enc(unsigned char *c,
+               const unsigned char *m,
+               const unsigned char *pk,
+               const unsigned char *coins) {
+    polyvec sp;
+    poly bp;
+    poly *pkp = &bp; /* pkp and k both alias bp: the one scratch polynomial is reused in turn */
+    poly *k = &bp;
+    poly *v = &sp.vec[0]; /* v aliases sp.vec[0], which is overwritten by the first poly_basemul below */
+    const unsigned char *seed = pk+KYBER_POLYVECBYTES; /* the public seed is stored after the serialized polyvec in pk */
+    int i;
+    unsigned char nonce = 0; /* incremented after every noise polynomial */
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise_eta1(sp.vec + i, coins, nonce++);
+
+    polyvec_ntt(&sp);
+
+    for (i = 0; i < KYBER_K; i++) {
+        matacc(&bp, &sp, i, seed, 1); /* NOTE(review): last argument presumably selects the transposed matrix - confirm in matacc.c */
+        poly_invntt(&bp);
+
+        poly_addnoise_eta2(&bp, coins, nonce++);
+        poly_reduce(&bp);
+
+        poly_packcompress(c, &bp, i); /* writes poly i of the compressed polyvec into c */
+    }
+
+    poly_frombytes(pkp, pk);
+    poly_basemul(v, pkp, &sp.vec[0]);
+    for (i = 1; i < KYBER_K; i++) {
+        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
+        poly_basemul_acc(v, pkp, &sp.vec[i]);
+    }
+
+    poly_invntt(v);
+
+    poly_addnoise_eta2(v, coins, nonce++);
+
+    poly_frommsg(k, m); /* overwrites bp, which is no longer needed as pkp */
+    poly_add(v, v, k);
+    poly_reduce(v);
+
+    poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); /* second ciphertext part follows the compressed polyvec */
+}
+
+/*************************************************
+* Name:        indcpa_enc_cmp
+*
+* Description: Re-encryption function.
+*              Compares the re-encrypted ciphertext with the original ciphertext byte per byte.
+*              The comparison is performed in a constant time manner.
+*
+*
+* Arguments:   - const unsigned char *c:     pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes)
+*              - const unsigned char *m:     pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const unsigned char *pk:    pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                                            to deterministically generate all randomness
+* Returns:     - 1 if the re-encrypted ciphertext is NOT equal to the original ciphertext, 0 if it is equal
+**************************************************/
+unsigned char indcpa_enc_cmp(const unsigned char *c,
+                             const unsigned char *m,
+                             const unsigned char *pk,
+                             const unsigned char *coins) {
+    uint64_t rc = 0; /* ORs together the results of all partial comparisons */
+    polyvec sp;
+    poly bp;
+    poly *pkp = &bp; /* pkp and k both alias bp: the one scratch polynomial is reused in turn */
+    poly *k = &bp;
+    poly *v = &sp.vec[0]; /* v aliases sp.vec[0], which is overwritten by the first poly_basemul below */
+    const unsigned char *seed = pk+KYBER_POLYVECBYTES;
+    int i;
+    unsigned char nonce = 0;
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise_eta1(sp.vec + i, coins, nonce++);
+
+    polyvec_ntt(&sp);
+
+    for (i = 0; i < KYBER_K; i++) {
+        matacc(&bp, &sp, i, seed, 1);
+        poly_invntt(&bp);
+
+        poly_addnoise_eta2(&bp, coins, nonce++);
+        poly_reduce(&bp);
+
+        rc |= cmp_poly_packcompress(c, &bp, i); /* compares instead of writing; no early exit on mismatch */
+    }
+
+    poly_frombytes(pkp, pk);
+    poly_basemul(v, pkp, &sp.vec[0]);
+    for (i = 1; i < KYBER_K; i++) {
+        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
+        poly_basemul_acc(v, pkp, &sp.vec[i]);
+    }
+
+    poly_invntt(v);
+
+    poly_addnoise_eta2(v, coins, nonce++);
+    poly_frommsg(k, m);
+    poly_add(v, v, k);
+    poly_reduce(v);
+
+    rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v);
+
+    rc = ~rc + 1; /* two's-complement negation: the top bit becomes 1 for any non-zero rc */
+    rc >>= 63; /* branch-free collapse to 0 or 1 */
+    return (unsigned char)rc;
+}
+
+/*************************************************
+* Name:        indcpa_dec
+*
+* Description: Decryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - unsigned char *m:        pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
+*              - const unsigned char *c:  pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
+*              - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
+**************************************************/
+void __attribute__ ((noinline)) indcpa_dec(unsigned char *m,
+                                           const unsigned char *c,
+                                           const unsigned char *sk) {
+    poly mp, bp;
+    poly *v = &bp; /* v aliases bp, which is free again once the loop below has finished */
+    int i;
+
+    poly_unpackdecompress(&mp, c, 0); /* poly 0 of the compressed polyvec in c */
+    poly_ntt(&mp);
+    
+    poly_frombytes_mul(&mp, &mp, sk); /* in place: mp is both input and output */
+    for(i = 1; i < KYBER_K; i++) {
+        poly_unpackdecompress(&bp, c, i);
+        poly_ntt(&bp);
+        poly_frombytes_mul_acc(&mp, &bp, sk + i*KYBER_POLYBYTES); /* secret-key poly i is read straight from its serialized form */
+    }
+
+    poly_invntt(&mp);
+    poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES); /* second ciphertext part follows the compressed polyvec */
+    poly_sub(&mp, v, &mp); /* mp = v - mp, per the argument order */
+    poly_reduce(&mp);
+
+    poly_tomsg(m, &mp);
+}
diff --git a/crypto_kem/ml-kem-512/m4fstack/indcpa.h b/crypto_kem/ml-kem-512/m4fstack/indcpa.h
new file mode 120000
index 0000000..5893b12
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/indcpa.h
@@ -0,0 +1 @@
+../m4fspeed/indcpa.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/kem.c b/crypto_kem/ml-kem-512/m4fstack/kem.c
new file mode 120000
index 0000000..302153d
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/kem.c
@@ -0,0 +1 @@
+../m4fspeed/kem.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/macros.i b/crypto_kem/ml-kem-512/m4fstack/macros.i
new file mode 120000
index 0000000..6e83891
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/macros.i
@@ -0,0 +1 @@
+../m4fspeed/macros.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/matacc.c b/crypto_kem/ml-kem-512/m4fstack/matacc.c
new file mode 120000
index 0000000..5558ec8
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/matacc.c
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/matacc.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/matacc.h b/crypto_kem/ml-kem-512/m4fstack/matacc.h
new file mode 120000
index 0000000..4eb7706
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/matacc.h
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/matacc.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/matacc.i b/crypto_kem/ml-kem-512/m4fstack/matacc.i
new file mode 120000
index 0000000..0d39b07
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/matacc.i
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/matacc.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/matacc_asm.S b/crypto_kem/ml-kem-512/m4fstack/matacc_asm.S
new file mode 120000
index 0000000..0079bb5
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/matacc_asm.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/matacc_asm.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/ntt.c b/crypto_kem/ml-kem-512/m4fstack/ntt.c
new file mode 120000
index 0000000..c9d6e8a
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/ntt.c
@@ -0,0 +1 @@
+../m4fspeed/ntt.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/ntt.h b/crypto_kem/ml-kem-512/m4fstack/ntt.h
new file mode 120000
index 0000000..5fd83c0
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/ntt.h
@@ -0,0 +1 @@
+../m4fspeed/ntt.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/params.h b/crypto_kem/ml-kem-512/m4fstack/params.h
new file mode 120000
index 0000000..59dd7f1
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/params.h
@@ -0,0 +1 @@
+../m4fspeed/params.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/poly.c b/crypto_kem/ml-kem-512/m4fstack/poly.c
new file mode 100644
index 0000000..443fdba
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/poly.c
@@ -0,0 +1,637 @@
+#include "poly.h"
+
+#include "cbd.h"
+#include "ntt.h"
+#include "params.h"
+#include "symmetric.h"
+
+#include <stdint.h>
+
+
+/*************************************************
+* Name:        poly_compress
+*
+* Description: Compression of a polynomial and subsequent serialization of the compressed coefficients;
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES)
+*              - const poly *a:    pointer to input polynomial to be compressed and serialized
+*************************************************/
+void poly_compress(unsigned char *r, const poly *a)
+{
+  unsigned int i,j;
+  int16_t u;
+  uint32_t d0;
+  uint8_t t[8];  /* compressed values of the current group of 8 coefficients */
+
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;  /* adds KYBER_Q only when u is negative (sign mask, no branch) */
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15;   <- division form replaced by the multiply-shift below */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635;  /* 80635 = floor(2^28 / 3329), assuming KYBER_Q == 3329 */
+      d0 >>= 28;  /* presumably done this way to avoid a variable-time division; confirm */
+      t[j] = d0 & 0xf;
+    }
+
+    r[0] = t[0] | (t[1] << 4);  /* two 4-bit values per output byte */
+    r[1] = t[2] | (t[3] << 4);
+    r[2] = t[4] | (t[5] << 4);
+    r[3] = t[6] | (t[7] << 4);
+    r += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;  /* adds KYBER_Q only when u is negative (sign mask, no branch) */
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31;   <- division form replaced by the multiply-shift below */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318;  /* 40318 ~= 2^27 / 3329, assuming KYBER_Q == 3329 */
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+
+    r[0] = (t[0] >> 0) | (t[1] << 5);  /* eight 5-bit values packed into five bytes */
+    r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
+    r[2] = (t[3] >> 1) | (t[4] << 4);
+    r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
+    r[4] = (t[6] >> 2) | (t[7] << 3);
+    r += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_decompress
+*
+* Description: De-serialization and subsequent decompression of a polynomial;
+*              approximate inverse of poly_compress
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes)
+**************************************************/
+void poly_decompress(poly *r, const unsigned char *a)
+{
+  int i;
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    r->coeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4;  /* low nibble; "+ 8, >> 4" rounds x*KYBER_Q/16 to nearest */
+    r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4;  /* high nibble */
+    r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4;
+    a += 4;  /* 8 coefficients consume 4 input bytes */
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N;i+=8)
+  {
+    r->coeffs[i+0] =  (((a[0] & 31) * KYBER_Q) + 16) >> 5;  /* "+ 16, >> 5" rounds x*KYBER_Q/32 to nearest */
+    r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5;  /* 5-bit value straddling bytes 0 and 1 */
+    r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] &  1) << 4)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] &  7) << 2)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+7] =  (((a[4] >> 3) * KYBER_Q) + 16) >> 5;
+    a += 5;  /* 8 coefficients consume 5 input bytes */
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {96, 128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_packcompress
+*
+* Description: Compression and subsequent serialization of one polynomial of a polyvec,
+*              writes to a byte string representation of the whole polyvec.
+*              Used to compress a polyvec one poly at a time in a loop.
+*
+* Arguments:   - unsigned char *r:  pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
+*              - poly *a:           pointer to input polynomial (only read by this function)
+*              - int i:             index of the polynomial to be serialized within the serialized polyvec
+**************************************************/
+void poly_packcompress(unsigned char *r, poly *a, int i) {
+    int j, k;
+    uint64_t d0;  /* 64-bit so the multiply-shift below cannot overflow */
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];  /* 11-bit compressed values of the current group of 8 coefficients */
+
+  for(j=0;j<KYBER_N/8;j++) {
+      for(k=0;k<8;k++) {
+        t[k]  = a->coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;  /* adds KYBER_Q only when the coefficient is negative */
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff;   <- division form replaced by the multiply-shift below */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;  /* 645084 ~= 2^31 / 3329, assuming KYBER_Q == 3329 */
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+      }
+      
+
+    r[352*i+11*j+ 0] =  t[0] & 0xff;  /* polynomial i starts at byte 352*i; 8 values -> 11 bytes */
+    r[352*i+11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
+    r[352*i+11*j+ 2] = (t[1] >>  5) | ((t[2] & 0x03) << 6);
+    r[352*i+11*j+ 3] = (t[2] >>  2) & 0xff;
+    r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
+    r[352*i+11*j+ 5] = (t[3] >>  7) | ((t[4] & 0x0f) << 4);
+    r[352*i+11*j+ 6] = (t[4] >>  4) | ((t[5] & 0x01) << 7);
+    r[352*i+11*j+ 7] = (t[5] >>  1) & 0xff;
+    r[352*i+11*j+ 8] = (t[5] >>  9) | ((t[6] & 0x3f) << 2);
+    r[352*i+11*j+ 9] = (t[6] >>  6) | ((t[7] & 0x07) << 5);
+    r[352*i+11*j+10] = (t[7] >>  3);
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];  /* 10-bit compressed values of the current group of 4 coefficients */
+
+    for (j = 0; j < KYBER_N / 4; j++) {
+        for(k=0;k<4;k++) {
+            t[k]  = a->coeffs[4*j+k];
+            t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;  /* adds KYBER_Q only when the coefficient is negative */
+            /*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff;   <- division form replaced below */
+            d0 = t[k];
+            d0 <<= 10;
+            d0 += 1665;
+            d0 *= 1290167;  /* 1290167 = floor(2^32 / 3329), assuming KYBER_Q == 3329 */
+            d0 >>= 32;
+            t[k] = d0 & 0x3ff;
+        }
+        r[320*i+5*j+0] =   t[0] & 0xff;  /* polynomial i starts at byte 320*i; 4 values -> 5 bytes */
+        r[320*i+5*j+1] =  (t[0] >>  8) | ((t[1] & 0x3f) << 2);
+        r[320*i+5*j+2] = ((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff;
+        r[320*i+5*j+3] = ((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff;
+        r[320*i+5*j+4] =  (t[3] >>  2) & 0xff;
+    }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})"
+#endif
+}
+
+/*************************************************
+* Name:        poly_unpackdecompress
+*
+* Description: Deserialization and subsequent decompression of one polynomial of a polyvec.
+*              Used to uncompress a polyvec one poly at a time in a loop.
+*
+* Arguments:   - poly *r:                 pointer to output polynomial
+*              - const unsigned char *a:  pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
+*              - int i:                   index of poly in polyvec to decompress
+**************************************************/
+void poly_unpackdecompress(poly *r, const unsigned char *a, int i) {
+  int j;
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+    for(j=0;j<KYBER_N/8;j++)  /* 11 input bytes -> eight 11-bit values; "+ 1024, >> 11" rounds x*KYBER_Q/2048 */
+    {
+      r->coeffs[8*j+0] =  (((a[352*i+11*j+ 0]       | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) <<  9)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11;
+    }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    for(j=0;j<KYBER_N/4;j++)  /* 5 input bytes -> four 10-bit values; "+ 512, >> 10" rounds x*KYBER_Q/1024 */
+    {
+      r->coeffs[4*j+0] =  (((a[320*i+5*j+ 0]       | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10;
+    }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+
+/*************************************************
+* Name:        cmp_poly_compress
+*
+* Description: Compresses and serializes a polynomial on the fly and compares it to a serialized polynomial
+*
+* Arguments:   - const unsigned char *r:    pointer to serialized polynomial to compare with
+*              - poly *a:                   pointer to input polynomial to serialize and compare (only read)
+* Returns:                                  0 if all bytes match, non-zero otherwise (OR of the byte-wise XORs)
+**************************************************/
+int cmp_poly_compress(const unsigned char *r, poly *a) {
+    unsigned char rc = 0;  /* accumulates differences; no early exit on mismatch */
+    int16_t u;
+    uint32_t d0;
+    uint8_t t[8];
+    int i, j, k = 0;  /* k: byte offset into r */
+
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15;   <- division form replaced by the multiply-shift below */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635;  /* same constants as the 128-byte branch of poly_compress */
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
+    }
+        rc |= r[k]      ^ (t[0] | (t[1] << 4));  /* XOR is zero only when the expected byte matches */
+        rc |= r[k + 1]  ^ (t[2] | (t[3] << 4));
+        rc |= r[k + 2]  ^ (t[4] | (t[5] << 4));
+        rc |= r[k + 3]  ^ (t[6] | (t[7] << 4));
+        k += 4;
+    }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31;   <- division form replaced by the multiply-shift below */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318;  /* same constants as the 160-byte branch of poly_compress */
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+
+
+      rc |= r[k]   ^ (t[0]       | (t[1] << 5));  /* bits above bit 7 are dropped when stored into unsigned char rc */
+      rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+      rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4));
+      rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+      rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3));
+      k += 5;
+    }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+    return rc;
+}
+
+/*************************************************
+* Name:        cmp_poly_packcompress
+*
+* Description: Compresses and serializes one poly of a polyvec on the fly and compares it to a serialized polyvec.
+*              Should be called in a loop over all poly's of a polyvec.
+*
+* Arguments:   - const unsigned char *r:    pointer to serialized polyvec to compare with
+*              - poly *a:                   pointer to input polynomial of polyvec to serialize and compare (only read)
+*              - int i:                     index of poly in polyvec to compare with
+* Returns:                                  0 if all bytes of poly i match, non-zero otherwise (OR of the byte-wise XORs)
+**************************************************/
+int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) {
+    unsigned char rc = 0;  /* accumulates differences; no early exit on mismatch */
+    int j, k;
+    uint64_t d0;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+    for(j=0;j<KYBER_N/8;j++)
+    {
+      for(k=0;k<8;k++) {
+        t[k]  = a->coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;  /* adds KYBER_Q only when the coefficient is negative */
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff;   <- division form replaced by the multiply-shift below */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;  /* same constants as the 352-byte branch of poly_packcompress */
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+      }
+
+      rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff);  /* bits above bit 7 are dropped when stored into unsigned char rc */
+      rc |= r[352*i+11*j+ 1] ^ ((t[0] >>  8) | ((t[1] & 0x1f) << 3));
+      rc |= r[352*i+11*j+ 2] ^ ((t[1] >>  5) | ((t[2] & 0x03) << 6));
+      rc |= r[352*i+11*j+ 3] ^ ((t[2] >>  2) & 0xff);
+      rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1));
+      rc |= r[352*i+11*j+ 5] ^ ((t[3] >>  7) | ((t[4] & 0x0f) << 4));
+      rc |= r[352*i+11*j+ 6] ^ ((t[4] >>  4) | ((t[5] & 0x01) << 7));
+      rc |= r[352*i+11*j+ 7] ^ ((t[5] >>  1) & 0xff);
+      rc |= r[352*i+11*j+ 8] ^ ((t[5] >>  9) | ((t[6] & 0x3f) << 2));
+      rc |= r[352*i+11*j+ 9] ^ ((t[6] >>  6) | ((t[7] & 0x07) << 5));
+      rc |= r[352*i+11*j+10] ^ ((t[7] >>  3));
+    }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+        for (j = 0; j < KYBER_N / 4; j++) {
+        for(k=0;k<4;k++) {
+            t[k]  = a->coeffs[4*j+k];
+            t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;  /* adds KYBER_Q only when the coefficient is negative */
+            /*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff;   <- division form replaced below */
+            d0 = t[k];
+            d0 <<= 10;
+            d0 += 1665;
+            d0 *= 1290167;  /* same constants as the 320-byte branch of poly_packcompress */
+            d0 >>= 32;
+            t[k] = d0 & 0x3ff;
+        }
+
+            rc |= r[320*i+5*j+0] ^ (t[0] & 0xff);  /* XOR is zero only when the expected byte matches */
+            rc |= r[320*i+5*j+1] ^ ((t[0] >>  8) | ((t[1] & 0x3f) << 2));
+            rc |= r[320*i+5*j+2] ^ (((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff);
+            rc |= r[320*i+5*j+3] ^ (((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff);
+            rc |= r[320*i+5*j+4] ^ ((t[3] >>  2) & 0xff);
+        }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+    return rc;
+}
+
+/*************************************************
+* Name:        poly_tobytes
+*
+* Description: Serialization of a polynomial; the input is reduced in place (poly_reduce) first
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes)
+*              - poly *a:          pointer to input polynomial (modified: reduced in place)
+**************************************************/
+void poly_tobytes(unsigned char *r, poly *a) {
+    int i;
+    uint16_t t0, t1;
+
+    poly_reduce(a);  /* side effect on the caller's polynomial */
+
+    for (i = 0; i < KYBER_N / 2; i++) {
+        t0 = a->coeffs[2 * i];
+        t1 = a->coeffs[2 * i + 1];
+        r[3 * i] = t0 & 0xff;  /* two coefficients -> three bytes, 12 bits each */
+        r[3 * i + 1] = (t0 >> 8) | ((t1 & 0xf) << 4);  /* assumes t0 fits in 12 bits after reduction; confirm */
+        r[3 * i + 2] = (t1 >> 4) & 0xff;
+    }
+}
+
+/*************************************************
+* Name:        poly_frombytes
+*
+* Description: De-serialization of a polynomial;
+*              inverse of poly_tobytes (three input bytes yield two 12-bit coefficients)
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+void poly_frombytes(poly *r, const unsigned char *a) {
+    int i;
+
+    for (i = 0; i < KYBER_N / 2; i++) {
+        r->coeffs[2 * i]     = a[3 * i]          | ((uint16_t)a[3 * i + 1] & 0x0f) << 8;  /* byte 0 + low nibble of byte 1 */
+        r->coeffs[2 * i + 1] = a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4;  /* high nibble of byte 1 + byte 2 */
+    }
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+extern void frombytes_mul_asm(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]);
+void poly_frombytes_mul(poly *r, const poly *b, const unsigned char *a) {
+    frombytes_mul_asm(r->coeffs, b->coeffs, a, zetas);  /* thin wrapper; zetas table is declared outside this file */
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul_acc
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial
+*              Accumulation in r.
+*
+* Arguments:   - poly *r:                pointer to in/output polynomial (accumulator)
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+extern void frombytes_mul_asm_acc(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]);
+void poly_frombytes_mul_acc(poly *r, const poly *b, const unsigned char *a) {
+    frombytes_mul_asm_acc(r->coeffs, b->coeffs, a, zetas);  /* thin wrapper; zetas table is declared outside this file */
+}
+
+/*************************************************
+* Name:        poly_noise_eta1
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA1
+*
+* Arguments:   - poly *r:                   pointer to output polynomial
+*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
+*              - unsigned char nonce:       one-byte input nonce
+*              - int add:                   boolean to indicate to accumulate into r (forwarded to cbd_eta1)
+**************************************************/
+void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add) {
+    unsigned char buf[KYBER_ETA1 * KYBER_N / 4];  /* PRF output consumed by cbd_eta1 */
+
+    prf(buf, KYBER_ETA1 * KYBER_N / 4, seed, nonce);
+    cbd_eta1(r, buf, add);
+}
+
+/*************************************************
+* Name:        poly_noise_eta2
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA2
+*
+* Arguments:   - poly *r:                   pointer to output polynomial
+*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
+*              - unsigned char nonce:       one-byte input nonce
+*              - int add:                   boolean to indicate to accumulate into r (forwarded to cbd_eta2)
+**************************************************/
+void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add) {
+    unsigned char buf[KYBER_ETA2 * KYBER_N / 4];  /* PRF output consumed by cbd_eta2 */
+
+    prf(buf, KYBER_ETA2 * KYBER_N / 4, seed, nonce);
+    cbd_eta2(r, buf, add);
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Computes negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in normal order, output in bitreversed order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_ntt(poly *r) {
+    ntt(r->coeffs);  /* thin wrapper over ntt(), which works on the raw coefficient array */
+}
+
+/*************************************************
+* Name:        poly_invntt
+*
+* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in bitreversed order, output in normal order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_invntt(poly *r) {
+    invntt(r->coeffs);  /* thin wrapper over invntt(), which works on the raw coefficient array */
+}
+
+extern void basemul_asm(int16_t *, const int16_t *, const int16_t *, const int32_t *);  /* implemented outside this file */
+/*************************************************
+* Name:        poly_basemul
+*
+* Description: Multiplication of two polynomials in NTT domain
+*
+* Arguments:   - poly *r:       pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_basemul(poly *r, const poly *a, const poly *b) {
+    basemul_asm(r->coeffs, a->coeffs, b->coeffs, zetas);  /* thin wrapper; zetas table is declared outside this file */
+}
+
+extern void basemul_asm_acc(int16_t *, const int16_t *, const int16_t *, const int32_t *);  /* implemented outside this file */
+/*************************************************
+* Name:        poly_basemul_acc
+*
+* Description: Multiplication of two polynomials in NTT domain, accumulating
+*
+* Arguments:   - poly *r:       pointer to in/output polynomial (accumulator)
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_basemul_acc(poly *r, const poly *a, const poly *b) {
+    basemul_asm_acc(r->coeffs, a->coeffs, b->coeffs, zetas);  /* thin wrapper; zetas table is declared outside this file */
+}
+
+extern void asm_fromplant(int16_t *r);  /* implemented outside this file */
+/*************************************************
+* Name:        poly_fromplant
+*
+* Description: Inplace conversion of all coefficients of a polynomial via asm_fromplant.
+*              NOTE(review): text said "from Montgomery domain to normal domain"; helper name suggests Plantard - confirm
+*
+* Arguments:   - poly *r:       pointer to input/output polynomial
+**************************************************/
+void poly_fromplant(poly *r) {
+  asm_fromplant(r->coeffs);
+}
+
+extern void asm_barrett_reduce(int16_t *r);  /* implemented outside this file */
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Applies Barrett reduction to all coefficients of a polynomial, in place
+*              NOTE(review): text pointed to comments in reduce.c; this directory ships reduce.S - confirm
+*
+* Arguments:   - poly *r:       pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *r) {
+  asm_barrett_reduce(r->coeffs);
+}
+
+extern void pointwise_add(int16_t *, const int16_t *, const int16_t *);  /* implemented outside this file */
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add two polynomials (thin wrapper around pointwise_add)
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_add(poly *r, const poly *a, const poly *b) {
+    pointwise_add(r->coeffs,a->coeffs,b->coeffs);  /* callers in this patch pass r == a, so aliasing is relied upon */
+}
+
+
+extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *);  /* implemented outside this file */
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract two polynomials (thin wrapper around pointwise_sub)
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_sub(poly *r, const poly *a, const poly *b) {
+    pointwise_sub(r->coeffs,a->coeffs,b->coeffs);  /* indcpa_dec passes r == b, so aliasing is relied upon */
+}
+
+void cmov_int16(int16_t *r, int16_t v, uint16_t b);  /* defined elsewhere; presumably sets *r = v when b is 1, without branching */
+
+/*************************************************
+* Name:        poly_frommsg
+*
+* Description: Convert 32-byte message to polynomial (one coefficient per message bit)
+*
+* Arguments:   - poly *r:                  pointer to output polynomial
+*              - const unsigned char *msg: pointer to input message
+**************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
+{
+  unsigned int i,j;
+
+#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
+#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
+#endif
+
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      r->coeffs[8*i+j] = 0;  /* value kept when message bit j of byte i is 0 */
+      cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1);  /* bit j selects (KYBER_Q+1)/2 */
+    }
+  }
+}
+
+/*************************************************
+* Name:        poly_tomsg
+*
+* Description: Convert polynomial to 32-byte message (one message bit per coefficient)
+*
+* Arguments:   - unsigned char *msg: pointer to output message
+*              - poly *a:            pointer to input polynomial (only read; not reduced here)
+**************************************************/
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) {
+    uint32_t t;
+    int i, j;
+
+    for (i = 0; i < KYBER_SYMBYTES; i++) {
+        msg[i] = 0;
+        for (j = 0; j < 8; j++) {
+            t  = a->coeffs[8*i+j];  /* assumes a non-negative coefficient; indcpa_dec calls poly_reduce first - confirm range */
+            t <<= 1;
+            t += 1665;
+            t *= 80635;  /* same multiply-shift constants as the 128-byte branch of poly_compress */
+            t >>= 28;
+            t &= 1;  /* keep a single bit per coefficient */
+            msg[i] |= t << j;
+        }
+    }
+}
+
+/*************************************************
+* Name:        poly_zeroize
+*
+* Description: Sets all KYBER_N coefficients of a polynomial to zero
+*
+* Arguments:   - poly *p: pointer to polynomial
+**************************************************/
+void poly_zeroize(poly *p) {
+  int i;
+  for(i = 0; i < KYBER_N; i++)
+   p->coeffs[i] = 0;  /* plain stores, no volatile or barrier */
+}
diff --git a/crypto_kem/ml-kem-512/m4fstack/poly.h b/crypto_kem/ml-kem-512/m4fstack/poly.h
new file mode 100644
index 0000000..d62e966
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/poly.h
@@ -0,0 +1,54 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "params.h"
+
+#include <stdint.h>
+
+#define poly_getnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 0)
+#define poly_getnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 0)
+#define poly_addnoise_eta1(p, seed, nonce) poly_noise_eta1(p, seed, nonce, 1)
+#define poly_addnoise_eta2(p, seed, nonce) poly_noise_eta2(p, seed, nonce, 1)
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct {
+    int16_t coeffs[KYBER_N];  /* signed 16-bit coefficients; index is the power of X */
+} poly;
+
+void poly_compress(unsigned char *r, const poly *a);
+void poly_decompress(poly *r, const unsigned char *a);
+
+void poly_packcompress(unsigned char *r, poly *a, int i);
+void poly_unpackdecompress(poly *r, const unsigned char *a, int i);
+
+int cmp_poly_compress(const unsigned char *r, poly *a);
+int cmp_poly_packcompress(const unsigned char *r, poly *a, int i);
+
+void poly_tobytes(unsigned char *r, poly *a);
+void poly_frombytes(poly *r, const unsigned char *a);
+void poly_frombytes_mul(poly *r, const poly *b, const unsigned char *a);
+void poly_frombytes_mul_acc(poly *r, const poly *b, const unsigned char *a);
+
+void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]);
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a);
+
+void poly_noise_eta1(poly *r, const unsigned char *seed, unsigned char nonce, int add);
+void poly_noise_eta2(poly *r, const unsigned char *seed, unsigned char nonce, int add);
+
+void poly_ntt(poly *r);
+void poly_invntt(poly *r);
+void poly_basemul(poly *r, const poly *a, const poly *b);
+void poly_basemul_acc(poly *r, const poly *a, const poly *b);
+void poly_fromplant(poly *r);
+
+void poly_reduce(poly *r);
+
+void poly_add(poly *r, const poly *a, const poly *b);
+void poly_sub(poly *r, const poly *a, const poly *b);
+
+void poly_zeroize(poly *p);
+
+#endif
diff --git a/crypto_kem/ml-kem-512/m4fstack/poly_asm.S b/crypto_kem/ml-kem-512/m4fstack/poly_asm.S
new file mode 120000
index 0000000..167ee5e
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/poly_asm.S
@@ -0,0 +1 @@
+../../ml-kem-768/m4fstack/poly_asm.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/polyvec.c b/crypto_kem/ml-kem-512/m4fstack/polyvec.c
new file mode 120000
index 0000000..f398d76
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/polyvec.c
@@ -0,0 +1 @@
+../m4fspeed/polyvec.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/polyvec.h b/crypto_kem/ml-kem-512/m4fstack/polyvec.h
new file mode 120000
index 0000000..3113837
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/polyvec.h
@@ -0,0 +1 @@
+../m4fspeed/polyvec.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/reduce.S b/crypto_kem/ml-kem-512/m4fstack/reduce.S
new file mode 120000
index 0000000..29ae453
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/reduce.S
@@ -0,0 +1 @@
+../m4fspeed/reduce.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/symmetric-fips202.c b/crypto_kem/ml-kem-512/m4fstack/symmetric-fips202.c
new file mode 120000
index 0000000..fa4ba9a
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/symmetric-fips202.c
@@ -0,0 +1 @@
+../m4fspeed/symmetric-fips202.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/symmetric.h b/crypto_kem/ml-kem-512/m4fstack/symmetric.h
new file mode 120000
index 0000000..28c6fac
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/symmetric.h
@@ -0,0 +1 @@
+../m4fspeed/symmetric.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/verify.c b/crypto_kem/ml-kem-512/m4fstack/verify.c
new file mode 120000
index 0000000..a7a9856
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/verify.c
@@ -0,0 +1 @@
+../m4fspeed/verify.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-512/m4fstack/verify.h b/crypto_kem/ml-kem-512/m4fstack/verify.h
new file mode 120000
index 0000000..cb2da4b
--- /dev/null
+++ b/crypto_kem/ml-kem-512/m4fstack/verify.h
@@ -0,0 +1 @@
+../m4fspeed/verify.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/api.h b/crypto_kem/ml-kem-768/m4fspeed/api.h
new file mode 100644
index 0000000..bdb694f
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/api.h
@@ -0,0 +1,20 @@
+#ifndef API_H
+#define API_H
+/* Public KEM interface of this implementation: generic CRYPTO_* names plus the three KEM entry points. */
+#include "params.h"
+/* Size macros forward to the KYBER_* constants (expected to come from params.h). */
+#define CRYPTO_SECRETKEYBYTES  KYBER_SECRETKEYBYTES
+#define CRYPTO_PUBLICKEYBYTES  KYBER_PUBLICKEYBYTES
+#define CRYPTO_CIPHERTEXTBYTES KYBER_CIPHERTEXTBYTES
+#define CRYPTO_BYTES           KYBER_SSBYTES
+
+#define CRYPTO_ALGNAME "Kyber768"
+/* Key generation; pk and sk are non-const, i.e. presumably output buffers. */
+int crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
+/* Encapsulation; pk is read-only, ct and ss are presumably outputs. */
+int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);
+/* Decapsulation; ct and sk are read-only, ss is presumably the output. */
+int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
+
+/* NOTE(review): the int return convention is not visible in this header; confirm against the implementation. */
+#endif
diff --git a/crypto_kem/ml-kem-768/m4fspeed/cbd.c b/crypto_kem/ml-kem-768/m4fspeed/cbd.c
new file mode 100644
index 0000000..c7b5dee
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/cbd.c
@@ -0,0 +1,55 @@
+#include "cbd.h"
+#include "params.h"
+
+#include <stdint.h>
+
+/*************************************************
+* Name:        load32_littleendian
+*
+* Description: assemble four consecutive bytes into one 32-bit word,
+*              with x[0] as the least significant byte
+*
+* Arguments:   - const unsigned char *x: pointer to input byte array (4 bytes are read)
+*
+* Returns the 32-bit unsigned value x[0] | x[1]<<8 | x[2]<<16 | x[3]<<24
+**************************************************/
+static uint32_t load32_littleendian(const unsigned char *x) {
+    const uint32_t b0 = x[0];
+    const uint32_t b1 = x[1];
+    const uint32_t b2 = x[2];
+    const uint32_t b3 = x[3];
+
+    return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+}
+
+/*************************************************
+* Name:        cbd
+*
+* Description: Turn KYBER_N/2 uniformly random bytes into a polynomial
+*              whose coefficients follow a centered binomial
+*              distribution; hard-wired to KYBER_ETA=2, i.e. every
+*              coefficient is (sum of 2 bits) - (sum of 2 bits)
+*
+* Arguments:   - poly *r:                  pointer to output polynomial
+*              - const unsigned char *buf: pointer to input byte array
+*              - int add:                  zero: overwrite r; nonzero: add the sample to r
+**************************************************/
+void cbd(poly *r, const unsigned char *buf, int add) {
+    int blk, k;
+
+    for (blk = 0; blk < KYBER_N / 8; blk++) {
+        /* One 32-bit word feeds 8 coefficients (4 bits each). */
+        const uint32_t w = load32_littleendian(buf + 4 * blk);
+        /* Pairwise bit sums: each 2-bit field of s is a count in 0..2. */
+        uint32_t s = (w & 0x55555555) + ((w >> 1) & 0x55555555);
+
+        for (k = 0; k < 8; k++, s >>= 4) {
+            const int16_t lo = s & 0x3;
+            const int16_t hi = (s >> 2) & 0x3;
+            const int idx = 8 * blk + k;
+            if (!add)
+                r->coeffs[idx] = 0;
+            r->coeffs[idx] += lo - hi;
+        }
+    }
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/cbd.h b/crypto_kem/ml-kem-768/m4fspeed/cbd.h
new file mode 100644
index 0000000..4aa5a8a
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/cbd.h
@@ -0,0 +1,8 @@
+#ifndef CBD_H
+#define CBD_H
+
+#include "poly.h"
+/* Fills r from the bytes in buf; with add == 0 r is overwritten, otherwise the sample is added to r. */
+void cbd(poly *r, const unsigned char *buf, int add);
+
+#endif
diff --git a/crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S b/crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S
new file mode 100644
index 0000000..4f7dcc6
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/cmov_int16.S
@@ -0,0 +1,15 @@
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+// void cmov_int16(int16_t *r, int16_t v, uint16_t b): *r = (b != 0) ? v : *r, done without a branch
+.global cmov_int16
+.type cmov_int16, %function
+.align 2
+cmov_int16:
+    cmp.w r2, #0 // set the flags from b
+    ldrsh.w r3, [r0] // r3 = current *r, loaded regardless of b
+    it ne
+    movne.w r3, r1 // predicated move: r3 = v only when b != 0
+    strh.w r3, [r0] // the store always executes, so the memory accesses do not depend on b
+    bx lr
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S b/crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S
new file mode 100644
index 0000000..0d4ae50
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/fastaddsub.S
@@ -0,0 +1,60 @@
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.align 2
+.global pointwise_sub
+.type pointwise_sub, %function
+pointwise_sub: // r0 = output, r1 = first operand, r2 = second operand; output = first - second
+  push {r4-r11, lr}
+  // Lane-wise 16-bit subtraction over 128 words (256 halfwords): 25 x 5 words plus a 3-word tail. No modular reduction is done here.
+  movw r14, #25 // r14 (lr) serves as the loop counter; the return address was pushed above
+  1:
+    ldm r1!, {r3-r7}
+    ldm r2!, {r8-r12}
+    usub16 r3, r3, r8
+    usub16 r4, r4, r9
+    usub16 r5, r5, r10
+    usub16 r6, r6, r11
+    usub16 r7, r7, r12
+    stm r0!, {r3-r7}
+
+    subs.w r14, #1
+  bne.w 1b
+  // tail: the remaining 3 words (6 halfwords)
+  ldm r1!, {r3-r5}
+  ldm r2!, {r8-r10}
+  usub16 r3, r3, r8
+  usub16 r4, r4, r9
+  usub16 r5, r5, r10
+  stm r0!, {r3-r5}
+  pop {r4-r11, pc}
+
+
+.align 2
+.global pointwise_add
+.type pointwise_add, %function
+pointwise_add: // r0 = output, r1 = first operand, r2 = second operand; output = first + second
+  push {r4-r11, lr}
+  // Lane-wise 16-bit addition over 128 words (256 halfwords): 25 x 5 words plus a 3-word tail. No modular reduction is done here.
+  movw r14, #25 // r14 (lr) serves as the loop counter; the return address was pushed above
+  1:
+    ldm r1!, {r3-r7}
+    ldm r2!, {r8-r12}
+    uadd16 r3, r3, r8
+    uadd16 r4, r4, r9
+    uadd16 r5, r5, r10
+    uadd16 r6, r6, r11
+    uadd16 r7, r7, r12
+    stm r0!, {r3-r7}
+
+    subs.w r14, #1
+  bne.w 1b
+  // tail: the remaining 3 words (6 halfwords)
+  ldm r1!, {r3-r5}
+  ldm r2!, {r8-r10}
+  uadd16 r3, r3, r8
+  uadd16 r4, r4, r9
+  uadd16 r5, r5, r10
+  stm r0!, {r3-r5}
+  pop {r4-r11, pc}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S b/crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S
new file mode 100644
index 0000000..ffee442
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/fastbasemul.S
@@ -0,0 +1,193 @@
+#include "macros.i"
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+// void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *)
+.global basemul_asm_opt_16_32
+.type basemul_asm_opt_16_32, %function
+.align 2
+basemul_asm_opt_16_32:
+  push {r4-r11, lr}
+  // Writes 256 unreduced 32-bit sums to r0, computed from r1 (a), r2 (b) and r3 (aprime); 4 results per loop pass.
+  rptr_tmp  .req r0
+  aptr      .req r1
+  bptr      .req r2
+  aprimeptr .req r3
+  poly0     .req r4
+  poly1     .req r6
+  poly2     .req r5
+  poly3     .req r7
+  q         .req r8
+  qa        .req r9
+  qinv      .req r10
+  tmp       .req r11
+  tmp2      .req r12
+  loop      .req r14
+  // q, qa and qinv are not used in this routine: nothing is reduced here, so the constant setup stays commented out.
+  //movw qa, #26632
+	//movt  q, #3329
+	### qinv=0x6ba8f301
+	//movw qinv, #62209
+	//movt qinv, #27560
+
+  movw loop, #64
+  1:
+    ldr poly0, [aptr], #4
+    ldr poly1, [bptr], #4
+    ldr poly2, [aptr], #4
+    ldr poly3, [bptr], #4
+    ldr.w tmp, [aprimeptr, #4]
+    ldr tmp2, [aprimeptr], #8
+    // tmp2 and tmp now hold the aprime words that go with poly0 and poly2
+    // (poly0_t * zeta) * poly1_t + poly0_b * poly1_b
+    smuad tmp2, tmp2, poly1
+    str tmp2, [rptr_tmp], #4
+
+    // poly1_t * poly0_b + poly1_b * poly0_t
+    smuadx tmp2, poly0, poly1
+    str tmp2, [rptr_tmp], #4
+    // the same two sums for the second coefficient pair (poly2, poly3)
+    smuad tmp2, tmp, poly3
+    str tmp2, [rptr_tmp], #4
+
+    smuadx tmp2, poly2, poly3
+    str tmp2, [rptr_tmp], #4
+
+    subs.w loop, #1
+  bne.w 1b
+
+  pop {r4-r11, pc}
+
+// void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *)
+.global basemul_asm_acc_opt_32_32
+.type basemul_asm_acc_opt_32_32, %function
+.align 2
+basemul_asm_acc_opt_32_32:
+  push {r4-r11, lr}
+  // Adds the products of r1 (a), r2 (b) and r3 (aprime) onto the 256 32-bit accumulators at r0, in place and unreduced.
+  rptr_tmp  .req r0
+  aptr      .req r1
+  bptr      .req r2
+  aprimeptr .req r3
+  poly0     .req r4
+  poly1     .req r6
+  res0      .req r5
+  res1      .req r7
+  q         .req r8
+  qa        .req r9
+  qinv      .req r10
+  tmp       .req r11
+  tmp2      .req r12
+  loop      .req r14
+  // q, qa and qinv are not used in this routine: nothing is reduced here, so the constant setup stays commented out.
+  //movw qa, #26632
+	//movt  q, #3329
+	### qinv=0x6ba8f301
+	//movw qinv, #62209
+	//movt qinv, #27560
+
+  movw loop, #64
+  1:
+    ldr poly0, [aptr], #4
+    ldr poly1, [bptr], #4
+    ldr.w res0, [rptr_tmp]
+    ldr tmp2, [aprimeptr], #4
+    ldr.w res1, [rptr_tmp, #4]
+
+    // (poly0_t * zeta) * poly1_t + poly0_b * poly1_b + res
+    smlad tmp2, tmp2, poly1, res0
+    str tmp2, [rptr_tmp], #4
+
+    // poly1_t * poly0_b + poly1_b * poly0_t + res
+    smladx tmp, poly0, poly1, res1
+    str tmp, [rptr_tmp], #4
+    // second coefficient pair of this pass: same sequence as above
+    ldr poly0, [aptr], #4
+    ldr poly1, [bptr], #4
+    ldr.w res0, [rptr_tmp]
+    ldr tmp2, [aprimeptr], #4
+    ldr.w res1, [rptr_tmp, #4]
+    
+    smlad tmp2, tmp2, poly1, res0
+    str tmp2, [rptr_tmp], #4
+
+    smladx tmp, poly0, poly1, res1
+    str tmp, [rptr_tmp], #4
+
+    subs.w loop, #1
+  bne.w 1b
+
+  pop {r4-r11, pc}
+
+.unreq rptr_tmp
+
+
+// void basemul_asm_acc_opt_32_16(int16_t *, const int16_t *, const int16_t *, const int16_t *, const int32_t *)
+.global basemul_asm_acc_opt_32_16
+.type basemul_asm_acc_opt_32_16, %function
+.align 2
+basemul_asm_acc_opt_32_16:
+  push {r4-r11, lr}
+  // Final accumulation step: product + 32-bit accumulator (5th argument) is reduced with plant_red and stored to r0 as packed 16-bit pairs.
+  rptr      .req r0
+  aptr      .req r1
+  bptr      .req r2
+  aprimeptr .req r3
+  poly0     .req r4
+  poly1     .req r6
+  res0      .req r5
+  res1      .req r7
+  q         .req r8
+  qa        .req r9
+  qinv      .req r10
+  //tmp       .req r11
+  tmp2      .req r12
+  rptr_tmp  .req r11
+  loop      .req r14
+  // Constants handed to plant_red (macro from macros.i, not shown here): qa = 26632 = 8 * 3329, q = 3329 in the top halfword only.
+  movw qa, #26632
+	movt  q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+
+  ldr rptr_tmp, [sp, #9*4] // 5th argument: first stack slot above the 9 registers pushed on entry
+  movw loop, #64
+  1:
+    ldr poly0, [aptr], #4
+    ldr poly1, [bptr], #4
+    ldr.w res0, [rptr_tmp], #4
+    ldr tmp2, [aprimeptr], #4
+    ldr.w res1, [rptr_tmp], #4
+
+    // (poly0_t * zeta) * poly1_t + poly0_b * poly1_b + res
+    smlad res0, tmp2, poly1, res0
+    plant_red q, qa, qinv, res0
+
+    // poly1_t * poly0_b + poly1_b * poly0_t + res
+    smladx res1, poly0, poly1, res1
+    plant_red q, qa, qinv, res1
+    // pack the top halfwords of res0 (low lane) and res1 (high lane) into one output word
+    pkhtb res0, res1, res0, asr#16
+    str res0, [rptr], #4
+    // second coefficient pair of this pass: same sequence as above
+    ldr poly0, [aptr], #4
+    ldr poly1, [bptr], #4
+    ldr.w res0, [rptr_tmp], #4
+    ldr tmp2, [aprimeptr], #4     
+    ldr.w res1, [rptr_tmp], #4
+    
+    smlad res0, tmp2, poly1, res0
+    plant_red q, qa, qinv, res0
+    
+    smladx res1, poly0, poly1, res1
+    plant_red q, qa, qinv, res1
+
+    pkhtb res0, res1, res0, asr#16
+    str res0, [rptr], #4
+
+    subs.w loop, #1
+  bne.w 1b
+
+  pop {r4-r11, pc}
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S b/crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S
new file mode 100644
index 0000000..606fe1f
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/fastinvntt.S
@@ -0,0 +1,356 @@
+/******************************************************************************
+* Integrating the improved Plantard arithmetic into Kyber.
+*
+* Efficient Plantard arithmetic enables a faster Kyber implementation with the 
+* same stack usage.
+*
+* See the paper at https://eprint.iacr.org/2022/956.pdf for more details.
+*
+* @author   Junhao Huang, BNU-HKBU United International College, Zhuhai, China
+*           jhhuang_nuaa@126.com
+*
+* @date     September 2022
+******************************************************************************/
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.macro mul_twiddle_plant a, twiddle, tmp, q, qa // a <- both 16-bit lanes of a multiplied by the 32-bit twiddle and reduced (Plantard, see paper in the file header)
+	smulwb \tmp, \twiddle, \a // tmp = (twiddle * bottom lane of a) >> 16
+	smulwt \a,   \twiddle, \a // a   = (twiddle * top lane of a) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = low16(tmp) * top16(q) + qa
+	smlabt \a, \a, \q, \qa // a   = low16(a) * top16(q) + qa
+	pkhtb \a, \a, \tmp, asr#16 // repack: top half of a stays on top, top half of tmp becomes the bottom lane
+.endm
+
+.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa // per 16-bit lane: (a0, a1) <- (a0 + t, a0 - t), t = reduced a1 * twiddle
+	smulwb \tmp, \twiddle, \a1 // tmp = (twiddle * bottom lane of a1) >> 16
+	smulwt \a1, \twiddle, \a1 // a1  = (twiddle * top lane of a1) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = low16(tmp) * top16(q) + qa
+	smlabt \a1, \a1, \q, \qa // a1  = low16(a1) * top16(q) + qa
+	pkhtb \tmp, \a1, \tmp, asr#16 // tmp = both reduced products, packed from the top halfwords
+	usub16 \a1, \a0, \tmp // difference first: a0 must still hold its input value here
+	uadd16 \a0, \a0, \tmp
+.endm
+
+.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa // two butterflies sharing tmp, q and qa
+	doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa // pair (a0, a1) with twiddle0
+	doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa // pair (a2, a3) with twiddle1
+.endm
+
+.macro fullplant a0, a1, a2, a3, a4, a5, a6, a7, tmp, q, qa, plantconst // applies doubleplant (macros.i, not shown here) to eight registers
+	movw \plantconst, #44984 // plantconst = (19 << 16) | 44984 = 0x0013AFB8, the constant handed to doubleplant
+	movt \plantconst, #19
+	doubleplant \a0, \tmp, \q, \qa, \plantconst
+	doubleplant \a1, \tmp, \q, \qa, \plantconst
+	doubleplant \a2, \tmp, \q, \qa, \plantconst
+	doubleplant \a3, \tmp, \q, \qa, \plantconst
+	doubleplant \a4, \tmp, \q, \qa, \plantconst
+	doubleplant \a5, \tmp, \q, \qa, \plantconst
+	doubleplant \a6, \tmp, \q, \qa, \plantconst
+	doubleplant \a7, \tmp, \q, \qa, \plantconst
+.endm
+
+.macro halfplant a0, a1, a2, a3, tmp, q, qa, plantconst // four-register variant of fullplant
+	movw \plantconst, #44984 // plantconst = (19 << 16) | 44984 = 0x0013AFB8, the constant handed to doubleplant
+	movt \plantconst, #19
+	doubleplant \a0, \tmp, \q, \qa, \plantconst
+	doubleplant \a1, \tmp, \q, \qa, \plantconst
+	doubleplant \a2, \tmp, \q, \qa, \plantconst
+	doubleplant \a3, \tmp, \q, \qa, \plantconst
+.endm
+
+
+// input: 0.5/1q
+.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp
+	// Three butterfly layers over c0..c7. Butterflies that need no twiddle are plain sadd16/ssub16; xi* are FPU registers. Clobbers twiddle1, tmp2 and tmp.
+	// layer 1  
+	sadd16.w \tmp, \c0, \c1 // c0, c1
+	ssub16.w \c1, \c0, \c1
+	sadd16.w \tmp2, \c2, \c3 // c2, c3
+	ssub16.w \c3, \c2, \c3
+	// tmp, c1, tmp2, c3: 1q maximum
+	sadd16.w \c0, \c4, \c5 // c4, c5
+	ssub16.w \c5, \c4, \c5
+	sadd16.w \c2, \c6, \c7 // c6, c7
+	ssub16.w \c7, \c6, \c7
+	// c4, c6 are free at this point
+	// c0,c5,c2,c7 1q maximum
+
+	// layer 2
+	sadd16.w \c6, \tmp, \tmp2 // c0, c2
+	ssub16.w \tmp2, \tmp, \tmp2
+	sadd16.w \c4, \c0, \c2 // c4, c6
+	ssub16.w \c2, \c0, \c2
+	// c6, tmp2, c4, c2: 2q maximum
+	// xi2 is moved from its FPU register once and used for both butterflies below
+	vmov.w \twiddle1, \xi2
+	doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa
+	doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa 
+	// c1, c3, c7, c5: 1.5q maximum;
+
+	// tmp and c0 are free at this point
+	// layer 3
+	sadd16.w \c0, \c6, \c4 // c0, c4
+	ssub16.w \c4, \c6, \c4
+	// c0, c4: 4q
+	// c6 is free at this point
+	vmov.w \twiddle1, \xi4
+	doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa
+	// c1, c5: 2q maximum
+
+	vmov.w \twiddle1, \xi5
+	// this block is one doublebutterfly, written out because its first input lives in tmp2 and the outputs go to c6/c2
+	smulwb \tmp, \twiddle1, \c2  // c2, c6
+	smulwt \c2,  \twiddle1, \c2
+	smlabt \tmp, \tmp, \q, \qa
+	smlabt \c2, \c2, \q, \qa
+	pkhtb \tmp, \c2, \tmp, asr#16
+	ssub16.w \c6, \tmp2, \tmp 
+	sadd16.w \c2, \tmp2, \tmp
+	//c6, c2: 4.5q
+	vmov.w \twiddle1, \xi6
+	doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa
+	//c3, c7: 2.5q maximum
+.endm
+.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	// layer 3: pairs (c0,c1) (c2,c3) (c4,c5) (c6,c7), all with one shared twiddle
+	ldr.w \twiddle1, [\twiddle_ptr], #4
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+	// In total 7 twiddle words (28 bytes) are read through twiddle_ptr, which is post-incremented.
+	// layer 2: pairs (c0,c2) (c4,c6) use twiddle1, pairs (c1,c3) (c5,c7) use twiddle2
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	// layer 1: pairs (c0,c4) (c1,c5) (c2,c6) (c3,c7), each with its own twiddle
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // 8 twiddle words in total, one per register c0..c7; twiddle_ptr advances by 32 bytes
+	mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa
+.endm
+# input coefficients < 0.5q
+.global invntt_fast
+.type invntt_fast, %function
+.align 2
+invntt_fast: // r0 = poly (512 bytes of 16-bit coefficients, updated in place), r1 = twiddle table
+	push {r4-r11, r14}
+	vpush.w {s16-s23} // s16-s23 are overwritten below and restored by the vpop at the end
+	poly         .req r0
+	twiddle_ptr  .req r1
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	q            .req r12 
+	// q = 3329 lives in the top halfword of r12 (see movt below)
+	qa           .req r0
+	// qa = 2^a * q with a = 3 (26632). NOTE: qa is aliased to r0, so it shares a register with poly; poly is parked in an FPU register while qa is live
+	tmp          .req r14
+
+	movt q, #3329
+
+	### LAYER 7+6+5+4
+	.equ distance, 16
+	.equ offset, 32
+	.equ strincr, 64
+
+	// pre-load 15 twiddle words into s8-s22 (s8 is overwritten with the loop end address just below)
+	vldm twiddle_ptr!, {s8-s22}
+
+	add.w tmp, poly, #8*strincr
+	vmov s8, tmp
+	1:
+		vmov s23, poly // park poly: r0 is reused as qa below
+		// load a1, a3, ..., a15
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+
+		movw qa, #26632
+
+		// three inverse butterfly layers on a1, a3, ..., a15
+		// twiddle2 is used as tmp2
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// multiply coeffs by layer 4 twiddles for later use
+		// vmov twiddle1, s15 
+		vmov twiddle2, s16
+		// mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+		// park the eight results in s0-s7 while the other half of the block is processed
+		vmov s0, poly0 // a1
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+		// 0.5q
+		// ----------
+
+		vmov poly, s23
+		// load a0, a2, ..., a14
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #26632
+		// three inverse butterfly layers on a0, a2, ..., a14
+		// twiddle2 is used as tmp2
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+		// 1,3,5,7: <5q; 0,2,4,6:<1q
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23
+		vmov twiddle2, s1 // load a3
+		uadd16 tmp, poly1, twiddle2
+		usub16 poly1, poly1, twiddle2
+		str.w tmp, [poly, #1*distance/4]
+		str.w poly1, [poly, #1*distance/4+offset]
+
+		vmov twiddle2, s3 // load a7
+		uadd16 tmp, poly3, twiddle2
+		usub16 poly3, poly3, twiddle2
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle2, s5 // load a11
+		uadd16 tmp, poly5, twiddle2
+		usub16 poly5, poly5, twiddle2
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle2, s7 // load a15
+		uadd16 tmp, poly7, twiddle2
+		usub16 poly7, poly7, twiddle2
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		//1,3,5,7: < 5.5q
+
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle2, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle2, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle2, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle2, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle2, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle2, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle2, poly0, poly1
+		str.w twiddle2, [poly, #offset]    
+		str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
+		//0,2,4,6: < 1.5q
+	vmov tmp, s8
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr // rewind poly to the start of the polynomial
+
+	### LAYER 3+2+1
+
+	.equ distance, distance*16
+	.equ strincr, 4
+
+	// ITER 0
+	vmov s6, poly
+	load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+	load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+	vldm twiddle_ptr!, {s0-s5}
+	movw qa, #26632
+	fullplant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7 tmp, q, qa, twiddle1 // NOTE(review): poly7 and tmp are separated by a space only; add a comma for consistency if the assembler treats both alike
+	// twiddle2 is used as tmp2
+	_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp
+
+	// twisting
+	_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	
+	vmov poly, s6
+	store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+	str.w poly1, [poly, #distance/4]
+	str.w poly2, [poly, #2*distance/4]
+	str.w poly3, [poly, #3*distance/4]
+	str.w poly0, [poly], #4
+
+	// ITER 1-15
+	add.w tmp, poly, #strincr*3*(5) // end address = poly + 60 bytes, i.e. 15 more passes of 4 bytes each
+	vmov s14, tmp
+	2:
+		vmov s6, poly // park poly: r0 is reused as qa below
+		// polys upto 5.5q
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #26632
+		_3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		// twisting
+		_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		vmov poly, s6
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #4
+
+	vmov tmp, s14
+	cmp.w poly, tmp
+	bne.w 2b
+
+	vpop.w {s16-s23}
+	pop {r4-r11, pc}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/fastntt.S b/crypto_kem/ml-kem-768/m4fspeed/fastntt.S
new file mode 100644
index 0000000..ddc1906
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/fastntt.S
@@ -0,0 +1,265 @@
+/******************************************************************************
+* Integrating the improved Plantard arithmetic into Kyber.
+*
+* Efficient Plantard arithmetic enables a faster Kyber implementation with the 
+* same stack usage.
+*
+* See the paper at https://eprint.iacr.org/2022/956.pdf for more details.
+*
+* @author   Junhao Huang, BNU-HKBU United International College, Zhuhai, China
+*           jhhuang_nuaa@126.com
+*
+* @date     September 2022
+******************************************************************************/
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.macro mul_twiddle_plant a, twiddle, tmp, q, qa // a <- both 16-bit lanes of a multiplied by the 32-bit twiddle and reduced (Plantard, see paper in the file header)
+	smulwb \tmp, \twiddle, \a // tmp = (twiddle * bottom lane of a) >> 16
+	smulwt \a,   \twiddle, \a // a   = (twiddle * top lane of a) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = low16(tmp) * top16(q) + qa
+	smlabt \a, \a, \q, \qa // a   = low16(a) * top16(q) + qa
+	pkhtb \a, \a, \tmp, asr#16 // repack: top half of a stays on top, top half of tmp becomes the bottom lane
+.endm
+
+.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa // per 16-bit lane: (a0, a1) <- (a0 + t, a0 - t), t = reduced a1 * twiddle
+	smulwb \tmp, \twiddle, \a1 // tmp = (twiddle * bottom lane of a1) >> 16
+	smulwt \a1, \twiddle, \a1 // a1  = (twiddle * top lane of a1) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = low16(tmp) * top16(q) + qa
+	smlabt \a1, \a1, \q, \qa // a1  = low16(a1) * top16(q) + qa
+	pkhtb \tmp, \a1, \tmp, asr#16 // tmp = both reduced products, packed from the top halfwords
+	usub16 \a1, \a0, \tmp // difference first: a0 must still hold its input value here
+	uadd16 \a0, \a0, \tmp
+.endm
+
+.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa // two butterflies sharing tmp, q and qa
+	doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa // pair (a0, a1) with twiddle0
+	doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa // pair (a2, a3) with twiddle1
+.endm
+
+.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	// layer 3: pairs (c0,c4) (c1,c5) (c2,c6) (c3,c7), all with one shared twiddle
+	ldr.w \twiddle1, [\twiddle_ptr], #4
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+	// In total 7 twiddle words (28 bytes) are read through twiddle_ptr, which is post-incremented.
+	// layer 2: pairs (c0,c2) (c1,c3) use twiddle1, pairs (c4,c6) (c5,c7) use twiddle2
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa
+
+	// layer 1: pairs (c0,c1) (c2,c3) (c4,c5) (c6,c7), each with its own twiddle
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp
+	// layer 3: pairs (c0,c4) (c1,c5) (c2,c6) (c3,c7), all with xi0; the xi* twiddles are moved in from FPU registers with vmov
+	vmov \twiddle1, \xi0
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	// layer 2: pairs (c0,c2) (c1,c3) use xi1, pairs (c4,c6) (c5,c7) use xi2
+	vmov \twiddle1, \xi1
+	vmov \twiddle2, \xi2
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa
+
+	// layer 1: pairs (c0,c1) (c2,c3) (c4,c5) (c6,c7) with xi3, xi4, xi5, xi6 respectively
+	vmov \twiddle1, \xi3
+	vmov \twiddle2, \xi4
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	vmov \twiddle1, \xi5
+	vmov \twiddle2, \xi6
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.global ntt_fast
+.type ntt_fast, %function
+.align 2
+ntt_fast: // r0 = poly (512 bytes of 16-bit coefficients, updated in place), r1 = twiddle table
+	push {r4-r11, r14}
+	vpush.w {s16-s24} // s16-s24 are overwritten below and restored by the vpop at the end
+	poly         .req r0
+	twiddle_ptr  .req r1
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	###  qinv        .req r11 ### q^-1 mod 2^2n; n=16
+	q           .req r12 
+	### at the top of r12
+	qa          .req r0
+	### qa=2^a q;a=3; at the bottom of r12
+	tmp         .req r14
+	// NOTE: qa is aliased to r0 (shared with poly), not to the bottom of r12 as the remarks above suggest; poly is parked in s23 while qa is live.
+	// movw qa, #26632
+	// Only the top halfword of r12 is written here: q = 3329 is read from the top half (smlabt in the macros above).
+	movt q, #3329
+
+	### LAYER 7+6+5+4
+	.equ distance, 256
+	.equ offset, 32
+	.equ strincr, 4
+	// pre-load 15 twiddle factors to 15 FPU registers
+	// s0-s7 are used to temporarily hold eight packed words (16 coefficients).
+	vldm twiddle_ptr!, {s8-s22}
+ 
+	add tmp, poly, #strincr*8
+	// s23: poly addr
+	// s24: loop end address (poly + 32 bytes, i.e. 8 passes of 4 bytes)
+	vmov s24, tmp  
+	1:
+		// load a1, a3, ..., a15
+		vmov s23, poly
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+		
+		movw qa, #26632
+
+		// 8-NTT on a1, a3, ..., a15
+		_3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// s15, s16, s17, s18, s19, s20, s21, s22 left
+		// multiply coeffs by layer 8 twiddles for later use (NOTE(review): the add/sub step below is labelled layer 4; confirm the layer number)
+		vmov twiddle1, s15 
+		vmov twiddle2, s16 
+		mul_twiddle_plant poly0, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18 
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20 
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22 
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+		// park the eight results in s0-s7 while the other half of the block is processed
+		vmov s0, poly0 // a1
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+
+		vmov poly, s23
+	
+		// load a0, a2, ..., a14
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #26632
+		// 8-NTT on a0, a2, ..., a14
+		_3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// from here on twiddle1 is only a scratch register for the parked values
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23
+		vmov twiddle1, s1 // load a3
+		uadd16 tmp, poly1, twiddle1
+		usub16 poly1, poly1, twiddle1
+		str.w tmp, [poly, #1*distance/4]
+		str.w poly1, [poly, #1*distance/4+offset]
+
+		vmov twiddle1, s3 // load a7
+		uadd16 tmp, poly3, twiddle1
+		usub16 poly3, poly3, twiddle1
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle1, s5 // load a11
+		uadd16 tmp, poly5, twiddle1
+		usub16 poly5, poly5, twiddle1
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle1, s7 // load a15
+		uadd16 tmp, poly7, twiddle1
+		usub16 poly7, poly7, twiddle1
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle1, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle1, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle1, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle1, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle1, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle1, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle1, poly0, poly1
+		str.w twiddle1, [poly, #offset]
+		str.w tmp, [poly], #4 // advance poly by one word (the literal 4 equals strincr in this stage)
+
+	vmov tmp, s24
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr // rewind poly to the start of the polynomial
+
+	### LAYER 3+2+1
+
+	.equ distance, distance/16
+	.equ strincr, 32
+	// 16 passes of 32 bytes each; the end address is kept in s13
+	add.w tmp, poly, #strincr*16
+	vmov s13, tmp
+	2:
+		vmov s23, poly // park poly: r0 is reused as qa below
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #26632
+		_3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+		
+		vmov poly, s23
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #strincr
+
+	vmov tmp, s13
+	cmp.w poly, tmp
+	bne.w 2b
+	vpop.w {s16-s24}
+	pop {r4-r11, pc}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/indcpa.c b/crypto_kem/ml-kem-768/m4fspeed/indcpa.c
new file mode 100644
index 0000000..1aceabe
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/indcpa.c
@@ -0,0 +1,244 @@
+#include "indcpa.h"
+#include "ntt.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "matacc.h"
+
+#include <string.h>
+#include <stdint.h>
+/*************************************************
+* Name:        indcpa_keypair_derand
+*
+* Description: Generates public and private key for the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*              All randomness is derived from coins; randombytes is not called.
+* Arguments:   - unsigned char *pk: pointer to output public key
+*                             (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - unsigned char *sk: pointer to output private key
+*                             (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input randomness
+*                             (of length KYBER_SYMBYTES bytes)
+**************************************************/
+void indcpa_keypair_derand(unsigned char *pk,
+                    unsigned char *sk, 
+                    const unsigned char *coins){
+    polyvec skpv, skpv_prime;
+    poly pkp;
+    unsigned char buf[2 * KYBER_SYMBYTES];
+    unsigned char *publicseed = buf;
+    unsigned char *noiseseed = buf + KYBER_SYMBYTES;
+    int i;
+    unsigned char nonce = 0;
+    // Hash coins || KYBER_K in place; publicseed and noiseseed point at the two halves of buf
+    memcpy(buf, coins, KYBER_SYMBYTES);
+    buf[KYBER_SYMBYTES] = KYBER_K; // one extra input byte appended after the coins
+    hash_g(buf, buf, KYBER_SYMBYTES + 1); // presumably fills all 2*KYBER_SYMBYTES bytes of buf - confirm against hash_g
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise(skpv.vec + i, noiseseed, nonce++); // nonce values 0 .. KYBER_K-1
+
+    polyvec_ntt(&skpv);
+    // Row 0 uses matacc_cache32, which also writes skpv_prime; rows >= 1 pass skpv_prime to matacc_opt32
+    // i = 0
+    matacc_cache32(&pkp, &skpv, &skpv_prime, 0, publicseed, 0); // last argument 0: transposed flag off
+    poly_invntt(&pkp);
+
+    poly_addnoise(&pkp, noiseseed, nonce++); // nonce keeps counting after the secret-vector samples
+    poly_ntt(&pkp);
+
+    poly_tobytes(pk, &pkp);
+    for (i = 1; i < KYBER_K; i++) {
+        matacc_opt32(&pkp, &skpv, &skpv_prime, i, publicseed, 0);
+        poly_invntt(&pkp);
+
+        poly_addnoise(&pkp, noiseseed, nonce++);
+        poly_ntt(&pkp);
+
+        poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp); // polynomial i of the public key
+    }
+    polyvec_tobytes(sk, &skpv);
+    memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key
+}
+
+/*************************************************
+* Name:        indcpa_enc
+*
+* Description: Encryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - unsigned char *c:          pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
+*              - const unsigned char *m:    pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const unsigned char *pk:   pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                                           to deterministically generate all randomness
+**************************************************/
+void indcpa_enc(unsigned char *c,
+               const unsigned char *m,
+               const unsigned char *pk,
+               const unsigned char *coins) {
+    polyvec sp, sp_prime;
+    poly bp;
+    poly *pkp = &bp; // bp is reused for the unpacked public-key polynomials ...
+    poly *k = &bp; // ... and afterwards for the message polynomial
+    poly *v = &sp.vec[0]; // v overwrites sp.vec[0]; it is written only by the final basemul call
+    const unsigned char *seed = pk+KYBER_POLYVECBYTES; // public seed stored after the packed polynomials
+    int i;
+    unsigned char nonce = 0;
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise(sp.vec + i, coins, nonce++);
+
+    polyvec_ntt(&sp);
+    // Row 0 uses matacc_cache32, which also writes sp_prime; later rows and the basemuls below read it
+    // i = 0
+    matacc_cache32(&bp, &sp, &sp_prime, 0, seed, 1); // last argument 1: transposed flag on
+    poly_invntt(&bp);
+    poly_addnoise(&bp, coins, nonce++);
+    poly_reduce(&bp);
+    poly_packcompress(c, &bp, 0);
+    for (i = 1; i < KYBER_K; i++) {
+        matacc_opt32(&bp, &sp, &sp_prime, i, seed, 1);
+        poly_invntt(&bp);
+
+        poly_addnoise(&bp, coins, nonce++);
+        poly_reduce(&bp);
+
+        poly_packcompress(c, &bp, i);
+    }
+    // Products of sp.vec[i] with the public-key polynomials go through the 32-bit buffer v_tmp
+    poly_frombytes(pkp, pk);
+    int32_t v_tmp[KYBER_N];
+    
+    poly_basemul_opt_16_32(v_tmp, &sp.vec[0], pkp, &sp_prime.vec[0]);
+    for (i = 1; i < KYBER_K - 1; i++) {
+        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
+        poly_basemul_acc_opt_32_32(v_tmp, &sp.vec[i], pkp, &sp_prime.vec[i]);
+    }
+    poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); // i == KYBER_K - 1 after the loop
+    poly_basemul_acc_opt_32_16(v, &sp.vec[i], pkp, &sp_prime.vec[i], v_tmp); // writes v, i.e. sp.vec[0]
+
+    poly_invntt(v);
+
+    poly_addnoise(v, coins, nonce++);
+
+    poly_frommsg(k, m); // bp no longer holds a public-key polynomial from here on
+    poly_add(v, v, k);
+    poly_reduce(v);
+
+    poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v); // second ciphertext part, after the packed vector
+}
+
+/*************************************************
+* Name:        indcpa_enc_cmp
+*
+* Description: Re-encryption function.
+*              Compares the re-encrypted ciphertext with the original ciphertext byte per byte.
+*              The comparison is performed in a constant time manner.
+*              The re-encrypted ciphertext is never stored; each part is compared as it is produced.
+*
+* Arguments:   - const unsigned char *c:    pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes)
+*              - const unsigned char *m:    pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const unsigned char *pk:   pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                                           to deterministically generate all randomness
+* Returns:     - boolean byte indicating that re-encrypted ciphertext is NOT equal to the original ciphertext
+**************************************************/
+unsigned char indcpa_enc_cmp(const unsigned char *c,
+                             const unsigned char *m,
+                             const unsigned char *pk,
+                             const unsigned char *coins) {
+    uint64_t rc = 0; // OR of all partial comparison results; stays 0 only if every part matched
+    polyvec sp, sp_prime;
+    poly bp;
+    poly *pkp = &bp; // bp is reused for the unpacked public-key polynomials ...
+    poly *k = &bp; // ... and afterwards for the message polynomial
+    poly *v = &sp.vec[0]; // v overwrites sp.vec[0]; it is written only by the final basemul call
+    const unsigned char *seed = pk+KYBER_POLYVECBYTES;
+    int i;
+    unsigned char nonce = 0;
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise(sp.vec + i, coins, nonce++);
+
+    polyvec_ntt(&sp);
+    // Same computation as indcpa_enc, with each pack/compress call replaced by its cmp_ counterpart
+    // i = 0
+    matacc_cache32(&bp, &sp, &sp_prime, 0, seed, 1);
+    poly_invntt(&bp);
+    poly_addnoise(&bp, coins, nonce++);
+    poly_reduce(&bp);
+    rc |= cmp_poly_packcompress(c, &bp, 0);
+    for (i = 1; i < KYBER_K; i++) {
+        matacc_opt32(&bp, &sp, &sp_prime, i, seed, 1);
+        poly_invntt(&bp);
+
+        poly_addnoise(&bp, coins, nonce++);
+        poly_reduce(&bp);
+
+        rc |= cmp_poly_packcompress(c, &bp, i); // no early exit: all parts are always processed
+    }
+
+    poly_frombytes(pkp, pk);
+    int32_t v_tmp[KYBER_N];
+    
+    poly_basemul_opt_16_32(v_tmp, &sp.vec[0], pkp, &sp_prime.vec[0]);
+    for (i = 1; i < KYBER_K - 1; i++) {
+        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
+        poly_basemul_acc_opt_32_32(v_tmp, &sp.vec[i], pkp, &sp_prime.vec[i]);
+    }
+    poly_frombytes(pkp, pk + i*KYBER_POLYBYTES); // i == KYBER_K - 1 after the loop
+    poly_basemul_acc_opt_32_16(v, &sp.vec[i], pkp, &sp_prime.vec[i], v_tmp);
+
+    poly_invntt(v);
+
+    poly_addnoise(v, coins, nonce++);
+    poly_frommsg(k, m);
+    poly_add(v, v, k);
+    poly_reduce(v);
+
+    rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v);
+    // Branch-free collapse to 0/1: negating any rc in [1, 2^63] sets bit 63, while 0 stays 0
+    rc = ~rc + 1;
+    rc >>= 63;
+    return (unsigned char)rc;
+}
+
+/*************************************************
+* Name:        indcpa_dec
+*
+* Description: Decryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - unsigned char *m:        pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
+*              - const unsigned char *c:  pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
+*              - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
+**************************************************/
+void __attribute__ ((noinline)) indcpa_dec(unsigned char *m,
+                                           const unsigned char *c,
+                                           const unsigned char *sk) {
+    poly mp, bp;
+    poly *v = &bp; // bp is reused for v once the last ciphertext polynomial has been consumed
+    int32_t r_tmp[KYBER_N]; // 32-bit buffer shared by the three poly_frombytes_mul_* calls
+    int i;
+    // Ciphertext polynomial i is paired with the secret-key bytes at sk + i*KYBER_POLYBYTES
+    poly_unpackdecompress(&mp, c, 0);
+    poly_ntt(&mp);
+    poly_frombytes_mul_16_32(r_tmp, &mp, sk);
+    for(i = 1; i < KYBER_K - 1; i++) {
+        poly_unpackdecompress(&bp, c, i);
+        poly_ntt(&bp);
+        poly_frombytes_mul_32_32(r_tmp, &bp, sk + i*KYBER_POLYBYTES);
+    }
+    poly_unpackdecompress(&bp, c, i); // i == KYBER_K - 1 after the loop
+    poly_ntt(&bp);
+    poly_frombytes_mul_32_16(&mp, &bp, sk + i*KYBER_POLYBYTES, r_tmp); // result is written to mp
+
+    poly_invntt(&mp);
+    poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES); // second ciphertext part, after the packed vector
+    poly_sub(&mp, v, &mp); // argument order (v, mp) - presumably mp = v - mp; confirm against poly_sub
+    poly_reduce(&mp);
+
+    poly_tomsg(m, &mp);
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/indcpa.h b/crypto_kem/ml-kem-768/m4fspeed/indcpa.h
new file mode 100644
index 0000000..6d5588a
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/indcpa.h
@@ -0,0 +1,22 @@
+#ifndef INDCPA_H
+#define INDCPA_H
+/* Key generation from KYBER_SYMBYTES bytes of caller-supplied coins. */
+void indcpa_keypair_derand(unsigned char *pk,
+                    unsigned char *sk, 
+                    const unsigned char *coins);
+/* Encrypts m under pk into c, using coins as the randomness seed. */
+void indcpa_enc(unsigned char *c,
+                const unsigned char *m,
+                const unsigned char *pk,
+                const unsigned char *coins);
+/* Re-encrypts m and compares against ct; returns 0 on a match and 1 otherwise. */
+unsigned char indcpa_enc_cmp(const unsigned char *ct,
+                             const unsigned char *m,
+                             const unsigned char *pk,
+                             const unsigned char *coins);
+/* Decrypts ciphertext c with secret key sk into message m. */
+void indcpa_dec(unsigned char *m,
+                const unsigned char *c,
+                const unsigned char *sk);
+
+#endif /* INDCPA_H */
diff --git a/crypto_kem/ml-kem-768/m4fspeed/kem.c b/crypto_kem/ml-kem-768/m4fspeed/kem.c
new file mode 100644
index 0000000..5cfa62b
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/kem.c
@@ -0,0 +1,159 @@
+#include "api.h"
+#include "indcpa.h"
+#include "params.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "verify.h"
+
+#include <stdint.h>
+
+#include <stdlib.h>
+
+#include <string.h>
+
+
+/*************************************************
+* Name:        crypto_kem_keypair_derand
+*
+* Description: Deterministic key generation for the CCA-secure Kyber
+*              key encapsulation mechanism. The secret key is laid out as
+*              IND-CPA secret key, copy of pk, hash_h(pk), rejection value z.
+* Arguments:   - uint8_t *pk: pointer to output public key
+*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key
+*                (an already allocated array of KYBER_SECRETKEYBYTES bytes)
+*              - const uint8_t *coins: pointer to input randomness
+*                (2*KYBER_SYMBYTES bytes: first half seeds the IND-CPA
+*                key pair, second half is stored as z)
+* Returns 0 (success)
+**************************************************/
+static int crypto_kem_keypair_derand(uint8_t *pk, uint8_t *sk, const uint8_t *coins) {
+    uint8_t *const sk_pk = sk + KYBER_INDCPA_SECRETKEYBYTES;                /* copy of the public key */
+    uint8_t *const sk_hpk = sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES; /* hash of the public key */
+    uint8_t *const sk_z = sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES;       /* value z for pseudo-random output on reject */
+    indcpa_keypair_derand(pk, sk, coins);
+    memcpy(sk_pk, pk, KYBER_PUBLICKEYBYTES);
+    hash_h(sk_hpk, pk, KYBER_PUBLICKEYBYTES);
+    memcpy(sk_z, coins + KYBER_SYMBYTES, KYBER_SYMBYTES);
+    return 0;
+}
+
+/*************************************************
+* Name:        crypto_kem_keypair
+*
+* Description: Randomized key generation for the CCA-secure Kyber key
+*              encapsulation mechanism: draws 2*KYBER_SYMBYTES random bytes
+*              and passes them to crypto_kem_keypair_derand.
+* Arguments:   - unsigned char *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
+*              - unsigned char *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
+    uint8_t seed[2 * KYBER_SYMBYTES]; /* IND-CPA seed followed by the rejection value */
+    randombytes(seed, sizeof seed);
+    crypto_kem_keypair_derand(pk, sk, seed);
+    return 0;
+}
+
+
+/*************************************************
+* Name:        crypto_kem_enc_derand
+*
+* Description: Deterministic encapsulation: derives the shared secret
+*              and the encryption coins from the supplied coins and the
+*              hash of the public key, then encrypts with indcpa_enc.
+* Arguments:   - uint8_t *ct: pointer to output cipher text
+*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
+*              - uint8_t *ss: pointer to output shared secret
+*                (an already allocated array of KYBER_SSBYTES bytes)
+*              - const uint8_t *pk: pointer to input public key
+*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
+*              - const uint8_t *coins: pointer to input randomness
+*                (an already allocated array filled with KYBER_SYMBYTES random bytes)
+*
+* Returns 0 (success)
+**************************************************/
+static int crypto_kem_enc_derand(uint8_t *ct, uint8_t *ss,
+                                 const uint8_t *pk, const uint8_t *coins) {
+    uint8_t m_hpk[2 * KYBER_SYMBYTES];     /* message || hash_h(pk) */
+    uint8_t key_coins[2 * KYBER_SYMBYTES]; /* shared key || encryption coins */
+    uint8_t *const hpk = m_hpk + KYBER_SYMBYTES;
+    const uint8_t *const enc_coins = key_coins + KYBER_SYMBYTES;
+
+    /* The caller-supplied coins serve as the message passed to indcpa_enc */
+    memcpy(m_hpk, coins, KYBER_SYMBYTES);
+
+    /* Multitarget countermeasure for coins + contributory KEM */
+    hash_h(hpk, pk, KYBER_PUBLICKEYBYTES);
+    hash_g(key_coins, m_hpk, 2 * KYBER_SYMBYTES);
+
+    /* Encrypt under the coins taken from the second half of the hash_g output */
+    indcpa_enc(ct, m_hpk, pk, enc_coins);
+    /* The first half of the hash_g output is the shared secret */
+    memcpy(ss, key_coins, KYBER_SYMBYTES);
+    return 0;
+}
+
+/*************************************************
+* Name:        crypto_kem_enc
+*
+* Description: Randomized encapsulation: draws KYBER_SYMBYTES random
+*              bytes and passes them to crypto_kem_enc_derand.
+*
+* Arguments:   - uint8_t *ct: pointer to output cipher text
+*                (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
+*              - uint8_t *ss: pointer to output shared secret
+*                (an already allocated array of KYBER_SSBYTES bytes)
+*              - const uint8_t *pk: pointer to input public key
+*                (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+    uint8_t m[KYBER_SYMBYTES];
+
+    /* Sample the randomness, then defer to the deterministic variant */
+    randombytes(m, sizeof m);
+    crypto_kem_enc_derand(ct, ss, pk, m);
+    return 0;
+}
+
+/*************************************************
+* Name:        crypto_kem_dec
+*
+* Description: Generates shared secret for given
+*              cipher text and private key. sk is read at offset 0 (IND-CPA key),
+*              at KYBER_INDCPA_SECRETKEYBYTES (pk) and in its last 2*KYBER_SYMBYTES bytes.
+* Arguments:   - unsigned char *ss:       pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes)
+*              - const unsigned char *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
+*              - const unsigned char *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0.
+*
+* On failure, ss will contain a pseudo-random value.
+**************************************************/
+int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) {
+    const uint8_t *const pk = sk + KYBER_INDCPA_SECRETKEYBYTES;
+    const uint8_t *const hpk = sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES;
+    const uint8_t *const z = sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES;
+    uint8_t m_hpk[2 * KYBER_SYMBYTES];     /* decrypted message || stored hash of pk */
+    uint8_t key_coins[2 * KYBER_SYMBYTES]; /* candidate key || re-encryption coins */
+    int mismatch;
+
+    indcpa_dec(m_hpk, ct, sk);
+
+    /* Multitarget countermeasure for coins + contributory KEM */
+    memcpy(m_hpk + KYBER_SYMBYTES, hpk, KYBER_SYMBYTES);
+    hash_g(key_coins, m_hpk, 2 * KYBER_SYMBYTES);
+
+    /* Re-encrypt with the derived coins; non-zero means the result differs from ct */
+    mismatch = indcpa_enc_cmp(ct, m_hpk, pk, key_coins + KYBER_SYMBYTES);
+
+    /* Start from the rejection key computed from z and ct ... */
+    rkprf(ss, z, ct);
+
+    /* ... and move the true key over it only when mismatch is 0 */
+    cmov(ss, key_coins, KYBER_SYMBYTES, (uint8_t) (1 - mismatch));
+    return 0;
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/macros.i b/crypto_kem/ml-kem-768/m4fspeed/macros.i
new file mode 100644
index 0000000..ebe5743
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/macros.i
@@ -0,0 +1,60 @@
+/******************************************************************************
+ * Integrating the improved Plantard arithmetic into Kyber.
+ *
+ * Efficient Plantard arithmetic enables a faster Kyber implementation with the
+ * same stack usage.
+ *
+ * See the paper at https://eprint.iacr.org/2022/956.pdf for more details.
+ *
+ * @author   Junhao Huang, BNU-HKBU United International College, Zhuhai, China
+ *           jhhuang_nuaa@126.com
+ *
+ * @date     September 2022
+ ******************************************************************************/
+#ifndef MACROS_I
+#define MACROS_I
+// load: reads four 32-bit words from base register a at offsets mem0..mem3 into a0..a3.
+.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
+  ldr.w \a0, [\a, \mem0]
+  ldr.w \a1, [\a, \mem1]
+  ldr.w \a2, [\a, \mem2]
+  ldr.w \a3, [\a, \mem3]
+.endm
+// store: writes the four 32-bit registers a0..a3 to base register a at offsets mem0..mem3.
+.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
+  str.w \a0, [\a, \mem0]
+  str.w \a1, [\a, \mem1]
+  str.w \a2, [\a, \mem2]
+  str.w \a3, [\a, \mem3]
+.endm
+// doubleplant: applies the Plantard step with plantconst to both 16-bit halves of a; q is read from the top half of its register.
+.macro doubleplant a, tmp, q, qa, plantconst
+	smulwb \tmp, \plantconst, \a // tmp = (plantconst * bottom half of a) >> 16
+	smulwt \a, \plantconst, \a // a = (plantconst * top half of a) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = tmp_b * q_t + qa
+	smlabt \a, \a, \q, \qa // a = a_b * q_t + qa
+	pkhtb \a, \a, \tmp, asr#16 // pack the top halves of a and tmp: a = (a_t || tmp_t)
+.endm
+// doublebarrett: Barrett-style step on both 16-bit halves of a; q and barrettconst are read from the bottom half of their registers.
+.macro doublebarrett a, tmp, tmp2, q, barrettconst
+  smulbb \tmp, \a, \barrettconst // tmp = a_b * barrettconst_b
+  smultb \tmp2, \a, \barrettconst // tmp2 = a_t * barrettconst_b
+  asr \tmp, \tmp, #26 // quotient estimate for the bottom lane
+  asr \tmp2, \tmp2, #26 // quotient estimate for the top lane
+  smulbb \tmp, \tmp, \q // tmp = estimate * q (bottom lane)
+  smulbb \tmp2, \tmp2, \q // tmp2 = estimate * q (top lane)
+  pkhbt \tmp, \tmp, \tmp2, lsl#16 // tmp = (tmp2_b || tmp_b)
+  usub16 \a, \a, \tmp // subtract per 16-bit lane
+.endm
+// plant_red: Plantard step on the 32-bit value in tmp; the outcome ends up in the top half of tmp.
+// q is located in the top half of its register (smlatt reads the top halfword)
+.macro plant_red q, qa, qinv, tmp
+	mul \tmp, \tmp, \qinv     
+	//tmp*qinv mod 2^2n/ 2^n; in high half
+	smlatt \tmp, \tmp, \q, \qa
+	// result in high half
+.endm
+
+
+
+#endif /* MACROS_I */
diff --git a/crypto_kem/ml-kem-768/m4fspeed/matacc.c b/crypto_kem/ml-kem-768/m4fspeed/matacc.c
new file mode 100644
index 0000000..736c5ae
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/matacc.c
@@ -0,0 +1,121 @@
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "ntt.h"
+#include "matacc.h"
+
+
+/*************************************************
+* Name:        matacc_cache32
+*
+* Description: Multiplies one row of A or A^T, generated on the fly from
+*              the seed, with the vector b. Partial sums are kept in 32 bit;
+*              the multiples of b by the zetas are stored in b_prime for re-use.
+*
+* Arguments:   - poly *r:                    pointer to output polynomial; written by the final assembly call
+*              - const polyvec *b:           pointer to input vector of polynomials to multiply with
+*              - polyvec *b_prime:           pointer to output vector of polynomials receiving b multiplied by zetas
+*              - unsigned char i:            byte to indicate the index < KYBER_K of the row of A or A^T
+*              - const unsigned char *seed:  pointer to the public seed used to generate A
+*              - int transposed:             boolean indicating whether A or A^T is generated
+**************************************************/
+void matacc_cache32(poly* r, const polyvec *b, polyvec *b_prime, unsigned char i, const unsigned char *seed, int transposed) {
+  unsigned char xof_block[XOF_BLOCKBYTES+2];
+  xof_state state;
+  int16_t cand[4];
+  int32_t acc[KYBER_N]; // 32-bit partial sums carried across the KYBER_K products
+  int col = 0;
+
+  // First product (16 -> 32 bit): initialises acc
+
+  if (!transposed)
+    xof_absorb(&state, seed, col, i);
+  else
+    xof_absorb(&state, seed, i, col);
+
+  xof_squeezeblocks(xof_block, 1, &state);
+
+  matacc_asm_cache_16_32(acc, b->vec[col].coeffs, cand, xof_block, zetas, &state, b_prime->vec[col].coeffs);
+
+  // Middle products (32 -> 32 bit), KYBER_K - 2 of them
+  for (col = 1; col < KYBER_K - 1; col++) {
+
+    if (!transposed)
+      xof_absorb(&state, seed, col, i);
+    else
+      xof_absorb(&state, seed, i, col);
+
+    xof_squeezeblocks(xof_block, 1, &state);
+
+    matacc_asm_cache_32_32(acc, b->vec[col].coeffs, cand, xof_block, zetas, &state, b_prime->vec[col].coeffs);
+  }
+
+  // Last product (32 -> 16 bit): col == KYBER_K - 1, result goes to r
+
+  if (!transposed)
+    xof_absorb(&state, seed, col, i);
+  else
+    xof_absorb(&state, seed, i, col);
+
+  xof_squeezeblocks(xof_block, 1, &state);
+
+  matacc_asm_cache_32_16(r->coeffs, b->vec[col].coeffs, cand, xof_block, zetas, &state, b_prime->vec[col].coeffs, acc);
+}
+
+/*************************************************
+* Name:        matacc_opt32
+*
+* Description: Multiplies one row of A or A^T, generated on the fly from
+*              the seed, with the vector b. Partial sums are kept in 32 bit;
+*              the multiples of b by the zetas are read from b_prime.
+*
+* Arguments:   - poly *r:                    pointer to output polynomial; written by the final assembly call
+*              - const polyvec *b:           pointer to input vector of polynomials to multiply with
+*              - const polyvec *b_prime:     pointer to input vector of polynomials holding b multiplied by zetas
+*              - unsigned char i:            byte to indicate the index < KYBER_K of the row of A or A^T
+*              - const unsigned char *seed:  pointer to the public seed used to generate A
+*              - int transposed:             boolean indicating whether A or A^T is generated
+**************************************************/
+void matacc_opt32(poly* r, const polyvec *b, const polyvec *b_prime, unsigned char i, const unsigned char *seed, int transposed) {
+  unsigned char xof_block[XOF_BLOCKBYTES+2];
+  xof_state state;
+  int16_t cand[4];
+  int32_t acc[KYBER_N]; // 32-bit partial sums carried across the KYBER_K products
+  int col = 0;
+
+  // First product (16 -> 32 bit): initialises acc
+
+  if (!transposed)
+    xof_absorb(&state, seed, col, i);
+  else
+    xof_absorb(&state, seed, i, col);
+
+  xof_squeezeblocks(xof_block, 1, &state);
+
+  matacc_asm_opt_16_32(acc, b->vec[col].coeffs, cand, xof_block, &state, b_prime->vec[col].coeffs);
+  
+  // Middle products (32 -> 32 bit), KYBER_K - 2 of them
+  for (col = 1; col < KYBER_K - 1; col++) {
+
+    if (!transposed)
+      xof_absorb(&state, seed, col, i);
+    else
+      xof_absorb(&state, seed, i, col);
+
+    xof_squeezeblocks(xof_block, 1, &state);
+
+    matacc_asm_opt_32_32(acc, b->vec[col].coeffs, cand, xof_block, &state, b_prime->vec[col].coeffs);
+  }
+
+  // Last product (32 -> 16 bit): col == KYBER_K - 1, result goes to r
+
+  if (!transposed)
+    xof_absorb(&state, seed, col, i);
+  else
+    xof_absorb(&state, seed, i, col);
+
+  xof_squeezeblocks(xof_block, 1, &state);
+
+  matacc_asm_opt_32_16(r->coeffs, b->vec[col].coeffs, cand, xof_block, &state, b_prime->vec[col].coeffs, acc);
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/matacc.h b/crypto_kem/ml-kem-768/m4fspeed/matacc.h
new file mode 100644
index 0000000..39c0c79
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/matacc.h
@@ -0,0 +1,63 @@
+#ifndef MATACC_H
+#define MATACC_H
+#include "poly.h"
+#include "polyvec.h"
+#include "symmetric.h"
+// matacc_asm_cache_16_32: assembly routine plus a C wrapper; the #define below makes every later use of the name call the wrapper.
+extern void matacc_asm_cache_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr);
+static inline void _matacc_asm_cache_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t _zetas[64], xof_state *state, int16_t *aprimeptr)
+{
+  // empty asm: declares the listed FPU registers as clobbered (presumably the assembly overwrites them without saving - confirm)
+  asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29");
+  matacc_asm_cache_16_32(r_tmp, b, c, buf, _zetas, state, aprimeptr);
+}
+#define matacc_asm_cache_16_32 _matacc_asm_cache_16_32
+// matacc_asm_cache_32_32: assembly routine plus a C wrapper; the #define below makes every later use of the name call the wrapper.
+extern void matacc_asm_cache_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr);
+static inline void _matacc_asm_cache_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t _zetas[64], xof_state *state, int16_t *aprimeptr)
+{
+  // empty asm: declares the listed FPU registers as clobbered (presumably the assembly overwrites them without saving - confirm)
+  asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29");
+  matacc_asm_cache_32_32(r_tmp, b, c, buf, _zetas, state, aprimeptr);
+}
+#define matacc_asm_cache_32_32 _matacc_asm_cache_32_32
+// matacc_asm_cache_32_16: assembly routine plus a C wrapper; the #define below makes every later use of the name call the wrapper.
+extern void matacc_asm_cache_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr, const int32_t *r_tmp);
+static inline void _matacc_asm_cache_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t _zetas[64], xof_state *state, int16_t *aprimeptr, const int32_t *r_tmp)
+{
+  // empty asm: declares the listed FPU registers as clobbered (presumably the assembly overwrites them without saving - confirm)
+  asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29");
+  matacc_asm_cache_32_16(r, b, c, buf, _zetas, state, aprimeptr, r_tmp);
+}
+#define matacc_asm_cache_32_16 _matacc_asm_cache_32_16
+// matacc_asm_opt_16_32: assembly routine plus a C wrapper; the #define below makes every later use of the name call the wrapper.
+extern void matacc_asm_opt_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr);
+static inline void _matacc_asm_opt_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr)
+{
+  // empty asm: declares the listed FPU registers as clobbered (presumably the assembly overwrites them without saving - confirm)
+  asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29");
+  matacc_asm_opt_16_32(r_tmp, b, c, buf, state, aprimeptr);
+}
+#define matacc_asm_opt_16_32 _matacc_asm_opt_16_32
+// matacc_asm_opt_32_32: assembly routine plus a C wrapper; the #define below makes every later use of the name call the wrapper.
+extern void matacc_asm_opt_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr);
+static inline void _matacc_asm_opt_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr)
+{
+  // empty asm: declares the listed FPU registers as clobbered (presumably the assembly overwrites them without saving - confirm)
+  asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29");
+  matacc_asm_opt_32_32(r_tmp, b, c, buf, state, aprimeptr);
+}
+#define matacc_asm_opt_32_32 _matacc_asm_opt_32_32
+// matacc_asm_opt_32_16: assembly routine plus a C wrapper; the #define below makes every later use of the name call the wrapper.
+extern void matacc_asm_opt_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr, const int32_t *r_tmp);
+static inline void _matacc_asm_opt_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr, const int32_t *r_tmp)
+{
+  // empty asm: declares the listed FPU registers as clobbered (presumably the assembly overwrites them without saving - confirm)
+  asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26", "s27", "s28", "s29");
+  matacc_asm_opt_32_16(r, b, c, buf, state, aprimeptr, r_tmp);
+}
+#define matacc_asm_opt_32_16 _matacc_asm_opt_32_16
+// C drivers defined in matacc.c: matacc_cache32 takes b_prime as a writable output, matacc_opt32 takes it as a const input.
+void matacc_opt32(poly* r, const polyvec *b, const polyvec *b_prime, unsigned char i, const unsigned char *seed, int transposed);
+void matacc_cache32(poly* r, const polyvec *b, polyvec *b_prime, unsigned char i, const unsigned char *seed, int transposed);
+#endif
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/matacc.i b/crypto_kem/ml-kem-768/m4fspeed/matacc.i
new file mode 100644
index 0000000..d0da46a
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/matacc.i
@@ -0,0 +1,301 @@
+// plant_red_b: Plantard step on the 32-bit value in tmp; the outcome ends up in the top half of tmp.
+// q is located in the bottom half of its register (smlatb reads the bottom halfword)
+.macro plant_red_b q, qa, qinv, tmp
+	mul \tmp, \tmp, \qinv     
+	//tmp*qinv mod 2^2n/ 2^n; in high half
+	smlatb \tmp, \tmp, \q, \qa
+	// result in high half
+.endm
+// first_if: accepts val0 if it is below q and appends it to the buffer at cptr.
+// Once four values have been collected, func is invoked on them, ctr is incremented and k is reset to 0.
+.macro first_if func, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+  // if (val0 < KYBER_Q)
+  cmp.w \val0, \q
+  bhs.w 2f // unsigned val0 >= q: rejected
+    strh \val0, [\cptr], #2 // append as a halfword, advance cptr
+    add \k, #1 // k counts the accepted values currently buffered
+    cmp.w \k, #4
+    bne.w 2f // fewer than four buffered: nothing more to do
+        sub \cptr, #4*2 // rewind cptr to the first of the four halfwords
+        vmov s18, \bufptr // bufptr, ctr and val1 are handed to func as operands,
+        vmov s19, \ctr // so their values are parked in s18-s20
+        vmov s20, \val1 // and restored after func
+        \func \rptr, \bptr, \cptr, \zetaptr, \bufptr, \k, \val0, \val1, \q, \qa, \qinv, \tmp, \tmp2, \ctr
+        vmov \bufptr, s18
+        vmov \ctr, s19
+        vmov \val1, s20
+        // one more group of four coefficients has been processed
+        add \ctr, #1
+        // k was also handed to func; start collecting the next group from zero
+        movw \k, #0
+    2:
+.endm
+// second_if: same as first_if for val1, with the extra condition that ctr is still below 256/4; val1 is not saved around func.
+// Checks if val1 is suitable and multiplies with values from bptr using func
+.macro second_if func, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+// if (val1 < KYBER_Q && ctr < KYBER_N/4)
+  cmp.w \val1, \q
+  bhs.w 2f // unsigned val1 >= q: rejected
+    cmp.w \ctr, #256/4
+    bge.w 2f // all groups already processed
+      strh \val1, [\cptr], #2 // append as a halfword, advance cptr
+      add \k, #1 // k counts the accepted values currently buffered
+      cmp.w \k, #4
+      bne.w 2f // fewer than four buffered: nothing more to do
+        sub \cptr, #4*2 // rewind cptr to the first of the four halfwords
+        vmov s18, \bufptr // bufptr and ctr are handed to func as operands,
+        vmov s19, \ctr // so their values are parked in s18/s19 and restored after func
+        \func \rptr, \bptr, \cptr, \zetaptr, \bufptr, \k, \val0, \val1, \q, \qa, \qinv, \tmp, \tmp2, \ctr
+        vmov \bufptr, s18
+        vmov \ctr, s19
+        // one more group of four coefficients has been processed
+        add \ctr, #1
+        // k was also handed to func; start collecting the next group from zero
+        movw \k, #0
+    2:
+.endm
+// Two degree-1 products with zeta and -zeta: writes four 32-bit results to rptr_tmp and caches two (a_t*zeta || a_b) words via the pointer kept in s27.
+.macro doublebasemul_asm_cache_16_32 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, tmp, tmp2, q, qa, qinv, res, aprimeptr, zeta
+  vmov \aprimeptr, s27
+  ldr \poly0, [\aptr], #4
+  ldr \poly1, [\bptr]
+  ldr \zeta, [\zetaptr], #4
+  // tmp_t = poly0_t * zeta after the Plantard step (q in the bottom half)
+  smulwt \tmp, \zeta, \poly0
+  smlabb \tmp, \tmp, \q, \qa
+  pkhbt \tmp, \poly0, \tmp
+  str \tmp, [\aprimeptr], #4 // store (poly0_t*zeta || poly0_b) for later re-use
+  smultt \tmp2, \tmp, \poly1
+  smlabb \tmp2, \poly0, \poly1, \tmp2
+  // tmp2 = (poly0_t*zeta) * poly1_t + poly0_b * poly1_b
+  smuadx \tmp, \poly0, \poly1
+  // tmp = poly0_b * poly1_t + poly0_t * poly1_b
+  str.w \tmp, [\rptr_tmp, #4]
+  str \tmp2, [\rptr_tmp], #8
+  // second product uses the negated zeta
+  neg \zeta, \zeta
+
+  ldr \poly0, [\aptr], #4
+  ldr.w \poly1, [\bptr, #4]
+
+  smulwt \tmp, \zeta, \poly0
+  smlabb \tmp, \tmp, \q, \qa
+  pkhbt \tmp, \poly0, \tmp
+  str \tmp, [\aprimeptr], #4 // store (poly2_t*zeta || poly2_b) for later re-use
+  smultt \tmp2, \tmp, \poly1
+  smlabb \tmp2, \poly0, \poly1, \tmp2
+
+  smuadx \tmp, \poly0, \poly1
+  str.w \tmp, [\rptr_tmp, #4]
+  str \tmp2, [\rptr_tmp], #8
+  vmov s27, \aprimeptr
+.endm 
+// As doublebasemul_asm_cache_16_32, but each result is added to the 32-bit word already stored at rptr_tmp.
+.macro doublebasemul_asm_acc_cache_32_32 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, tmp, tmp2, q, qa, qinv, res, aprimeptr, zeta
+  vmov \aprimeptr, s27
+  ldr \poly0, [\aptr], #4
+  ldr \poly1, [\bptr]
+  // res = running sum for the first result word
+  ldr \res, [\rptr_tmp]
+  ldr \zeta, [\zetaptr], #4
+  // tmp_t = poly0_t * zeta after the Plantard step (q in the bottom half)
+  smulwt \tmp, \zeta, \poly0
+  smlabb \tmp, \tmp, \q, \qa
+  pkhbt \tmp, \poly0, \tmp
+  str \tmp, [\aprimeptr], #4 // store (poly0_t*zeta || poly0_b) for later re-use
+  smlatt \tmp, \tmp, \poly1, \res
+  smlabb \res, \poly0, \poly1, \tmp
+  str \res, [\rptr_tmp], #4
+  // second result word: cross terms added to the stored sum
+  ldr.w \res, [\rptr_tmp]
+  smladx \res, \poly0, \poly1, \res
+  
+  str.w \res, [\rptr_tmp], #4
+  
+  neg \zeta, \zeta
+  // second product uses the negated zeta
+  ldr \poly0, [\aptr], #4
+  ldr.w \poly1, [\bptr, #4]
+  ldr \res, [\rptr_tmp]
+  smulwt \tmp, \zeta, \poly0
+  smlabb \tmp, \tmp, \q, \qa
+  pkhbt \tmp, \poly0, \tmp
+  str \tmp, [\aprimeptr], #4 // store (poly2_t*zeta || poly2_b) for later re-use
+  smlatt \tmp, \tmp, \poly1, \res
+  smlabb \res, \poly0, \poly1, \tmp
+  str.w \res, [\rptr_tmp], #4
+
+  ldr.w \res, [\rptr_tmp]
+  smladx \res, \poly0, \poly1, \res
+
+  str \res, [\rptr_tmp], #4
+  vmov s27, \aprimeptr
+.endm
+// Final accumulation step: adds the products to the sums read from rptr_tmp, applies plant_red_b and stores packed 16-bit pairs via the pointer kept in s28.
+.macro doublebasemul_asm_acc_cache_32_16 rptr_tmp, aptr, bptr, zetaptr, poly0, poly1, tmp, tmp2, q, qa, qinv, res, aprimeptr, zeta
+  vmov \aprimeptr, s27
+  ldr \poly0, [\aptr], #4
+  ldr \poly1, [\bptr]
+  // rptr_tmp is only read here; it advances by four words per invocation
+  ldr \res, [\rptr_tmp], #4
+  ldr \zeta, [\zetaptr], #4
+  // tmp_t = poly0_t * zeta after the Plantard step (q in the bottom half)
+  smulwt \tmp, \zeta, \poly0
+  smlabb \tmp, \tmp, \q, \qa
+  pkhbt \tmp, \poly0, \tmp
+  str \tmp, [\aprimeptr], #4 // store (poly0_t*zeta || poly0_b) for later re-use
+  smlatt \tmp, \tmp, \poly1, \res
+  smlabb \tmp2, \poly0, \poly1, \tmp
+
+  plant_red_b \q, \qa, \qinv, \tmp2
+  ldr.w \tmp, [\rptr_tmp], #4
+  smladx \tmp, \poly0, \poly1, \tmp
+
+  plant_red_b \q, \qa, \qinv, \tmp
+  // pack the two reduced top halves: res = (tmp_t || tmp2_t)
+  pkhtb \res, \tmp, \tmp2, asr#16
+  vmov \tmp2, s28
+  str \res, [\tmp2], #4
+  // tmp2 now holds the output pointer; the second product uses the negated zeta
+  neg \zeta, \zeta
+
+  ldr \poly0, [\aptr], #4
+  ldr.w \poly1, [\bptr, #4]
+
+  smulwt \tmp, \zeta, \poly0
+  smlabb \tmp, \tmp, \q, \qa
+  pkhbt \tmp, \poly0, \tmp
+  ldr \res, [\rptr_tmp], #4
+  str \tmp, [\aprimeptr], #4 // store (poly2_t*zeta || poly2_b) for later re-use
+  smlatt \tmp, \tmp, \poly1, \res
+  smlabb \tmp, \poly0, \poly1, \tmp
+  
+  plant_red_b \q, \qa, \qinv, \tmp
+
+  ldr \res, [\rptr_tmp], #4
+  smladx \res, \poly0, \poly1, \res
+
+  plant_red_b \q, \qa, \qinv, \res
+
+  pkhtb \res, \res, \tmp, asr#16
+
+  str \res, [\tmp2], #4
+  vmov s28, \tmp2
+  vmov s27, \aprimeptr
+.endm
+// load_vals: consumes three bytes from bufptr and splits them into two 12-bit values val0 and val1.
+.macro load_vals val0, val1, bufptr, tmp
+  ldrh \val0, [\bufptr], #2 // bytes 0 and 1
+  ldrb \val1, [\bufptr], #1 // byte 2
+  ubfx \tmp, \val0, #12, #4 // upper nibble of byte 1
+  orr \val1, \tmp, \val1, lsl #4 // val1 = upper nibble of byte 1 | byte 2 << 4
+  ubfx \val0, \val0, #0, #12 // val0 = byte 0 | lower nibble of byte 1 << 8
+  ubfx \val1, \val1, #0, #12 // keep the low 12 bits
+.endm
+// Two degree-1 products using the cached (a_t*zeta || a_b) words read via the pointer kept in s27; writes four 32-bit results to rptr_tmp.
+.macro doublebasemul_asm_opt_16_32 rptr_tmp, aptr, bptr, tmp3, poly0, poly1, poly2, poly3, q, qa, qinv, tmp, aprimeptr, tmp2
+  vmov \aprimeptr, s27
+  ldr \poly0, [\aptr], #4
+  ldr \poly1, [\bptr]
+  ldr \poly2, [\aptr], #4
+  ldr.w \poly3, [\bptr, #4]
+
+  ldr.w \tmp2, [\aprimeptr], #4 // load cached value
+  
+  // (poly0_t * zeta) * poly1_t + poly0_b * poly1_b
+  smuad \tmp, \tmp2, \poly1
+
+  // poly1_t * poly0_b + poly1_b * poly0_t
+  smuadx \tmp3, \poly0, \poly1
+    
+  str \tmp, [\rptr_tmp], #4
+  str \tmp3, [\rptr_tmp], #4
+
+  ldr \tmp, [\aprimeptr], #4 // load cached value
+  // (poly2_t * zeta) * poly3_t + poly2_b * poly3_b
+  smuad \tmp2, \tmp, \poly3
+  // poly3_t * poly2_b + poly3_b * poly2_t
+  smuadx \tmp3, \poly2, \poly3
+
+  str.w \tmp2, [\rptr_tmp], #4
+  str.w \tmp3, [\rptr_tmp], #4
+  vmov s27, \aprimeptr
+.endm 
+// As doublebasemul_asm_opt_16_32, but each result is added to the 32-bit word already stored at rptr_tmp.
+.macro doublebasemul_asm_acc_opt_32_32 rptr_tmp, aptr, bptr, tmp2, poly0, poly1, poly2, poly3, q, qa, qinv, res, aprimeptr, tmp
+  vmov \aprimeptr, s27
+  ldr.w \poly0, [\aptr], #4
+  ldr.w \poly1, [\bptr]
+  ldr.w \poly2, [\aptr], #4
+  ldr.w \poly3, [\bptr, #4]
+
+  ldr.w \res, [\rptr_tmp]
+  ldr.w \tmp, [\rptr_tmp, #4]
+
+  ldr \tmp2, [\aprimeptr], #4 // load cached value
+
+  // (poly0_t * zeta) * poly1_t + poly0_b * poly1_b + res
+  smlad \res, \tmp2, \poly1, \res
+
+  // poly1_t * poly0_b + poly1_b * poly0_t + tmp
+  smladx \tmp, \poly0, \poly1, \tmp
+
+  str.w \tmp, [\rptr_tmp, #4]
+  str.w \res, [\rptr_tmp], #8
+
+  ldr \tmp2, [\aprimeptr], #4 // load cached value
+  ldr \res, [\rptr_tmp]
+  ldr \tmp, [\rptr_tmp, #4] 
+  // (poly2_t * zeta) * poly3_t + poly2_b * poly3_b + res
+  smlad \res, \tmp2, \poly3, \res
+  // poly3_t * poly2_b + poly3_b * poly2_t + tmp
+  smladx \tmp, \poly2, \poly3, \tmp
+
+  str.w \tmp, [\rptr_tmp, #4]
+  str \res, [\rptr_tmp], #8
+  
+  vmov s27, \aprimeptr
+.endm 
+// Final accumulation step with cached values: adds to the sums read from rptr_tmp, applies plant_red_b and stores packed 16-bit pairs via the pointer kept in s28.
+.macro doublebasemul_asm_acc_opt_32_16 rptr_tmp, aptr, bptr, tmp2, poly0, poly1, poly2, poly3, q, qa, qinv, res, aprimeptr, tmp
+  vmov \aprimeptr, s27
+
+  ldr \poly0, [\aptr], #4
+  ldr \poly1, [\bptr]
+  ldr \poly2, [\aptr], #4
+  ldr.w \poly3, [\bptr, #4]
+
+  ldr.w \tmp, [\rptr_tmp, #4]
+  ldr \res, [\rptr_tmp], #8
+
+  ldr \tmp2, [\aprimeptr], #4 // load cached value
+
+  // (poly0_t * zeta) * poly1_t + poly0_b * poly1_b + res
+  smlad \res, \tmp2, \poly1, \res
+  plant_red_b \q, \qa, \qinv, \res
+
+  // poly1_t * poly0_b + poly1_b * poly0_t + tmp
+  smladx \tmp, \poly0, \poly1, \tmp
+  plant_red_b \q, \qa, \qinv, \tmp
+
+  pkhtb \res, \tmp, \res, asr#16
+  vmov \poly0, s28
+  str \res, [\poly0], #4
+  // poly0 now holds the output pointer; the second product uses poly2 and poly3
+  ldr \tmp2, [\aprimeptr], #4 // load cached value
+  ldr.w \tmp, [\rptr_tmp, #4]
+  ldr \res, [\rptr_tmp], #8
+
+  smlad \res, \tmp2, \poly3, \res
+
+  plant_red_b \q, \qa, \qinv, \res
+
+  smladx \tmp, \poly2, \poly3, \tmp
+
+  plant_red_b \q, \qa, \qinv, \tmp
+
+  pkhtb \res, \tmp, \res, asr#16
+  str \res, [\poly0], #4
+  vmov s28, \poly0
+  vmov s27, \aprimeptr
+.endm 
diff --git a/crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S b/crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S
new file mode 100644
index 0000000..f77ae60
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/matacc_asm.S
@@ -0,0 +1,377 @@
+#include "matacc.i"
+.extern shake128_squeezeblocks
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+// shake128_squeezeblocks into buffer if all bytes have been used
+.macro third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+// if (pos + 3 > buflen && ctr < KYBER_N/4)
+  vmov \tmp, s17 // s17 = buffer start, stored by the calling function
+  add \tmp, #168 // XOF_BLOCKBYTES=168
+  add \tmp2, \bufptr, #3
+  cmp.w \tmp2, \tmp  // pos + 3 > buflen
+  ble.w 2f // no refill needed; NOTE(review): signed condition on an address comparison
+    cmp.w \ctr, #256/4
+    bge.w 2f // ctr already reached 64: no refill
+      vmov \bufptr, s17
+      // park live values in FP registers across the call (r4-r11 are not parked here)
+      vmov s16, r12
+      vmov s18, \rptr
+      vmov s19, \bptr
+      vmov s20, \cptr
+      vmov s21, \ctr
+      // call arguments: (buffer start, 1, state); relies on rptr/bptr/cptr being bound to r0/r1/r2 by the invoking function
+      mov \rptr, \bufptr
+      movw \bptr, #1
+      vmov \cptr, s26 // load state
+
+      bl shake128_squeezeblocks
+
+      vmov r12, s16
+      vmov \rptr, s18
+      vmov \bptr, s19
+      vmov \cptr, s20
+      vmov \ctr, s21
+      vmov \bufptr, s17 // continue reading from the buffer start
+    2:
+.endm
+
+// void matacc_asm_cache_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr)
+.global matacc_asm_cache_16_32
+.type matacc_asm_cache_16_32, %function
+.align 2
+matacc_asm_cache_16_32:
+  push {r0-r11, r14} // 13 words, so stack-passed arguments start at sp + 13*4
+  rptr    .req r0
+  bptr    .req r1
+  cptr    .req r2
+  bufptr  .req r3
+  zetaptr .req r4
+  val0    .req r5
+  val1    .req r6
+  tmp     .req r7
+  tmp2    .req r8
+  k       .req r9
+	q       .req r10
+	qa      .req r11
+	qinv    .req r12
+	ctr     .req r14
+  // constants forwarded to first_if/second_if (macros not defined in this file)
+  movw qa, #26632 // 26632 = 8 * 3329
+	movw q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+  movw k, #0
+  // NOTE(review): s17, s26, s27 (and s16, s18-s21 inside third_if) are overwritten without being saved; AAPCS lists s16-s31 as callee-saved - confirm callers tolerate this
+  ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack
+  ldr.w tmp, [sp, #14*4] // load state from stack
+  vmov s26, tmp
+
+  ldr.w tmp, [sp, #15*4] // load aprimeptr from stack
+  vmov s27, tmp
+  // NOTE(review): with 13 words pushed, sp is 4 mod 8 at the bl inside third_if if it was 8-byte aligned on entry
+  // outer while loop
+  movw ctr, #0
+  vmov s17, bufptr // save bufptr to check later
+  1:
+
+    load_vals val0, val1, bufptr, tmp
+
+    first_if doublebasemul_asm_cache_16_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+    
+    second_if doublebasemul_asm_cache_16_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+
+    third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+
+    cmp ctr, #256/4
+    blt.w 1b // repeat while ctr < 64
+
+  pop {r0-r11, pc}
+.size matacc_asm_cache_16_32, . - matacc_asm_cache_16_32
+
+// void matacc_asm_cache_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr)
+.global matacc_asm_cache_32_32
+.type matacc_asm_cache_32_32, %function
+.align 2
+matacc_asm_cache_32_32:
+  push {r0-r11, r14} // 13 words, so stack-passed arguments start at sp + 13*4
+  rptr    .req r0
+  bptr    .req r1
+  cptr    .req r2
+  bufptr  .req r3
+  zetaptr .req r4
+  val0    .req r5
+  val1    .req r6
+  tmp     .req r7
+  tmp2    .req r8
+  k       .req r9
+	q       .req r10
+	qa      .req r11
+	qinv    .req r12
+	ctr     .req r14
+  // constants forwarded to first_if/second_if (macros not defined in this file)
+  movw qa, #26632 // 26632 = 8 * 3329
+	movw q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+  movw k, #0
+  // NOTE(review): s17, s26, s27 (and s16, s18-s21 inside third_if) are overwritten without being saved; AAPCS lists s16-s31 as callee-saved - confirm callers tolerate this
+  ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack
+  ldr.w tmp, [sp, #14*4] // load state from stack
+  vmov s26, tmp
+
+  ldr.w tmp, [sp, #15*4] // load aprimeptr from stack
+  vmov s27, tmp
+  // NOTE(review): with 13 words pushed, sp is 4 mod 8 at the bl inside third_if if it was 8-byte aligned on entry
+  // outer while loop
+  movw ctr, #0
+  vmov s17, bufptr // save bufptr to check later
+  1:
+
+    load_vals val0, val1, bufptr, tmp
+
+    first_if doublebasemul_asm_acc_cache_32_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+    
+    second_if doublebasemul_asm_acc_cache_32_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+
+    third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+
+    cmp ctr, #256/4
+    blt.w 1b // repeat while ctr < 64
+
+  pop {r0-r11, pc}
+.size matacc_asm_cache_32_32, . - matacc_asm_cache_32_32
+
+// void matacc_asm_cache_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state, int16_t *aprimeptr, const int32_t *r_tmp)
+.global matacc_asm_cache_32_16
+.type matacc_asm_cache_32_16, %function
+.align 2
+matacc_asm_cache_32_16:
+  push {r0-r11, r14} // 13 words, so stack-passed arguments start at sp + 13*4
+  rptr    .req r0
+  bptr    .req r1
+  cptr    .req r2
+  bufptr  .req r3
+  zetaptr .req r4
+  val0    .req r5
+  val1    .req r6
+  tmp     .req r7
+  tmp2    .req r8
+  k       .req r9
+	q       .req r10
+	qa      .req r11
+	qinv    .req r12
+	ctr     .req r14
+  // constants forwarded to first_if/second_if (macros not defined in this file)
+  movw qa, #26632 // 26632 = 8 * 3329
+	movw q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+  movw k, #0
+  // NOTE(review): s17, s26-s29 (and s16, s18-s21 inside third_if) are overwritten without being saved; AAPCS lists s16-s31 as callee-saved - confirm callers tolerate this
+  ldr.w zetaptr, [sp, #13*4] // load zetaptr from stack
+
+  ldr.w tmp, [sp, #14*4] // load state from stack
+  vmov s26, tmp
+
+  ldr.w tmp, [sp, #15*4] // load aprimeptr from stack
+  vmov s27, tmp
+
+  vmov s28, rptr // store "real" destination in FP
+  vmov s29, rptr // backup
+  ldr.w rptr, [sp, #16*4] // rptr now holds r_tmp, the 8th argument
+  // NOTE(review): with 13 words pushed, sp is 4 mod 8 at the bl inside third_if if it was 8-byte aligned on entry
+  // outer while loop
+  movw ctr, #0
+  vmov s17, bufptr // save bufptr to check later
+  1:
+    load_vals val0, val1, bufptr, tmp
+
+    first_if doublebasemul_asm_acc_cache_32_16, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+    
+    second_if doublebasemul_asm_acc_cache_32_16, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+
+    third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+
+    cmp ctr, #256/4
+    blt.w 1b // repeat while ctr < 64
+
+  vmov rptr, s29 // r0 is reloaded from the stack by the pop below, so this value does not survive
+
+  pop {r0-r11, pc}
+.size matacc_asm_cache_32_16, . - matacc_asm_cache_32_16
+
+.unreq zetaptr
+
+// void matacc_asm_opt_16_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr)
+.global matacc_asm_opt_16_32
+.type matacc_asm_opt_16_32, %function
+.align 2
+matacc_asm_opt_16_32:
+  push {r0-r11, r14} // 13 words, so stack-passed arguments start at sp + 13*4
+  rptr   .req r0 
+  bptr   .req r1
+  cptr   .req r2
+  bufptr .req r3
+  tmp3   .req r4 // handed to first_if/second_if in the slot the cache variants use for zetaptr
+  val0   .req r5
+  val1   .req r6
+  tmp    .req r7
+  tmp2   .req r8
+  k      .req r9
+  q      .req r10
+  qa     .req r11
+  qinv   .req r12
+  ctr    .req r14
+  // constants forwarded to first_if/second_if (macros not defined in this file)
+  movw qa, #26632 // 26632 = 8 * 3329
+	movw q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+  movw k, #0
+  // NOTE(review): s17, s26, s27 (and s16, s18-s21 inside third_if) are overwritten without being saved; AAPCS lists s16-s31 as callee-saved - confirm callers tolerate this
+  ldr.w tmp, [sp, #13*4] // load state from stack
+  vmov s26, tmp
+
+  ldr.w tmp, [sp, #14*4] // load aprimeptr from stack
+  vmov s27, tmp
+  // NOTE(review): with 13 words pushed, sp is 4 mod 8 at the bl inside third_if if it was 8-byte aligned on entry
+  // outer while loop
+  movw ctr, #0
+  vmov s17, bufptr // save bufptr to check later
+  1:
+
+    load_vals val0, val1, bufptr, tmp
+
+    first_if doublebasemul_asm_opt_16_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr
+    
+    second_if doublebasemul_asm_opt_16_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr
+
+    third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+
+    cmp ctr, #256/4
+    blt.w 1b // repeat while ctr < 64
+
+  pop {r0-r11, pc}
+.size matacc_asm_opt_16_32, . - matacc_asm_opt_16_32
+
+// void matacc_asm_opt_32_32(int32_t *r_tmp, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr)
+.global matacc_asm_opt_32_32
+.type matacc_asm_opt_32_32, %function
+.align 2
+matacc_asm_opt_32_32:
+  push {r0-r11, r14} // 13 words, so stack-passed arguments start at sp + 13*4
+  rptr   .req r0 
+  bptr   .req r1
+  cptr   .req r2
+  bufptr .req r3
+  tmp3   .req r4 // handed to first_if/second_if in the slot the cache variants use for zetaptr
+  val0   .req r5
+  val1   .req r6
+  tmp    .req r7
+  tmp2   .req r8
+  k      .req r9
+  q      .req r10
+  qa     .req r11
+  qinv   .req r12
+  ctr    .req r14
+  // constants forwarded to first_if/second_if (macros not defined in this file)
+  movw qa, #26632 // 26632 = 8 * 3329
+	movw q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+  movw k, #0
+  // NOTE(review): s17, s26, s27 (and s16, s18-s21 inside third_if) are overwritten without being saved; AAPCS lists s16-s31 as callee-saved - confirm callers tolerate this
+  ldr.w tmp, [sp, #13*4] // load state from stack
+  vmov s26, tmp
+
+  ldr.w tmp, [sp, #14*4] // load aprimeptr from stack
+  vmov s27, tmp
+  // NOTE(review): with 13 words pushed, sp is 4 mod 8 at the bl inside third_if if it was 8-byte aligned on entry
+  // outer while loop
+  movw ctr, #0
+  vmov s17, bufptr // save bufptr to check later
+  1:
+
+    load_vals val0, val1, bufptr, tmp
+
+    first_if doublebasemul_asm_acc_opt_32_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr
+    
+    second_if doublebasemul_asm_acc_opt_32_32, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr
+
+    third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+
+    cmp ctr, #256/4
+    blt.w 1b // repeat while ctr < 64
+
+  pop {r0-r11, pc}
+.size matacc_asm_opt_32_32, . - matacc_asm_opt_32_32
+
+.unreq tmp3
+
+
+// void matacc_asm_opt_32_16(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], xof_state *state, const int16_t *aprimeptr, const int32_t *r_tmp)
+.global matacc_asm_opt_32_16
+.type matacc_asm_opt_32_16, %function
+.align 2
+matacc_asm_opt_32_16:
+  push {r0-r11, r14} // 13 words, so stack-passed arguments start at sp + 13*4
+  rptr    .req r0
+  bptr    .req r1
+  cptr    .req r2
+  bufptr  .req r3
+  tmp3    .req r4
+  val0    .req r5
+  val1    .req r6
+  tmp     .req r7
+  tmp2    .req r8
+  k       .req r9
+  q       .req r10
+  qa      .req r11
+  qinv    .req r12
+  ctr     .req r14
+  // constants forwarded to first_if/second_if (macros not defined in this file)
+  movw qa, #26632 // 26632 = 8 * 3329
+	movw q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+  movw k, #0
+  // NOTE(review): s17, s26-s29 (and s16, s18-s21 inside third_if) are overwritten without being saved; AAPCS lists s16-s31 as callee-saved - confirm callers tolerate this
+  ldr.w tmp, [sp, #13*4] // load state from stack
+  vmov s26, tmp
+
+  ldr.w tmp, [sp, #14*4] // load aprimeptr from stack
+  vmov s27, tmp
+
+  vmov s28, rptr // store "real" destination in FP
+  vmov s29, rptr // backup
+  ldr.w rptr, [sp, #15*4] // rptr now holds r_tmp, the 7th argument
+  // NOTE(review): with 13 words pushed, sp is 4 mod 8 at the bl inside third_if if it was 8-byte aligned on entry
+  // outer while loop
+  movw ctr, #0
+  vmov s17, bufptr // save bufptr to check later
+  1:
+
+    load_vals val0, val1, bufptr, tmp
+
+    first_if doublebasemul_asm_acc_opt_32_16, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr
+    
+    second_if doublebasemul_asm_acc_opt_32_16, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, tmp3, k, q, qa, qinv, ctr
+
+    third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+
+    cmp ctr, #256/4
+    blt.w 1b // repeat while ctr < 64
+
+  vmov rptr, s29 // r0 is reloaded from the stack by the pop below, so this value does not survive
+
+  pop {r0-r11, pc}
+.size matacc_asm_opt_32_16, . - matacc_asm_opt_32_16
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/ntt.c b/crypto_kem/ml-kem-768/m4fspeed/ntt.c
new file mode 100644
index 0000000..7fd1208
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/ntt.c
@@ -0,0 +1,106 @@
+#include "ntt.h"
+
+#include "params.h"
+
+#include <stdint.h>
+
+// for basemul not changed
+
+const int32_t zetas[64] = {21932846, 3562152210, 752167598, 3417653460, 2112004045, 932791035, 2951903026, 1419184148, 1817845876, 3434425636, 4233039261, 300609006, 975366560, 2781600929, 3889854731, 3935010590, 2197155094, 2130066389, 3598276897, 2308109491, 2382939200, 1228239371, 1884934581, 3466679822, 1211467195, 2977706375, 3144137970, 3080919767, 945692709, 3015121229, 345764865, 826997308, 2043625172, 2964804700, 2628071007, 4154339049, 483812778, 3288636719, 2696449880, 2122325384, 1371447954, 411563403, 3577634219, 976656727, 2708061387, 723783916, 3181552825, 3346694253, 3617629408, 1408862808, 519937465, 1323711759, 1474661346, 2773859924, 3580214553, 1143088323, 2221668274, 1563682897, 2417773720, 1327582262, 2722253228, 3786641338, 1141798155, 2779020594};
+
+const int32_t zetas_asm[128] = {
+    2230699446, 3328631909, 4243360600, 3408622288, 812805467, 2447447570, 1094061961, 1370157786, 2475831253, 249002310, 1028263423, 3594406395, 4205945745, 734105255, 2252632292, 381889553, 372858381, 427045412, 21932846, 3562152210, 752167598, 3417653460, 3157039644, 4196914574, 2265533966, 2112004045, 932791035, 2951903026, 1419184148, 1727534158, 1544330386, 2972545705, 1817845876, 3434425636, 4233039261, 300609006, 1904287092, 2937711185, 2651294021, 975366560, 2781600929, 3889854731, 3935010590, 3929849920, 838608815, 2550660963, 2197155094, 2130066389, 3598276897, 2308109491, 72249375, 3242190693, 815385801, 2382939200, 1228239371, 1884934581, 3466679822, 2889974991, 3696329620, 42575525, 1211467195, 2977706375, 3144137970, 3080919767, 1719793153, 1703020977, 2470670584, 945692709, 3015121229, 345764865, 826997308, 1839778722, 2991898216, 1851390229, 2043625172, 2964804700, 2628071007, 4154339049, 2701610550, 1041165097, 583155668, 483812778, 3288636719, 2696449880, 2122325384, 690239563, 1855260731, 3700200122, 1371447954, 411563403, 3577634219, 976656727, 3718262466, 1979116802, 3098982111, 2708061387, 723783916, 3181552825, 3346694253, 3087370604, 3415073125, 3376368103, 3617629408, 1408862808, 519937465, 1323711759, 3714391964, 1910737929, 836028480, 1474661346, 2773859924, 3580214553, 1143088323, 2546790461, 3191874164, 4012420634, 2221668274, 1563682897, 2417773720, 1327582262, 1059227441, 1583035408, 1174052340, 2722253228, 3786641338, 1141798155, 2779020594, 0};
+
+const int32_t zetas_inv_CT_asm[256] = {
+    // LAYER 7+6+5+4
+    1290168, 1290168, 2064267851, 1290168, 51606697, 2064267851, 966335388, 1290168, 3200905336, 51606697, 3482161830, 2064267851, 1847519727, 966335388, 886345009,
+    // removed first "2285" + LAYER 3+2+1 - 1 - butterfly
+    1290168, 2064267851, 1290168, 51606697, 2064267851, 966335388,
+    // LAYER 3+2+1 - 1 - twist
+    2435836064, 290287667, 2944162022, 3021572066, 1802363867, 603798347, 3375077936, 2677097369,
+    // LAYER 3+2+1 - 2 - butterfly
+    2042335005, 3235739856, 1748176836, 3120914957, 282546663, 2711931889, 1103093133,
+    // LAYER 3+2+1 - 2 - twist
+    1659155285, 1785591691, 1941701947, 2704190884, 358666539, 793452955, 1461759672, 1673347127,
+    // LAYER 3+2+1 - 3 - butterfly
+    3200905336, 2042335005, 3560862042, 3235739856, 580575333, 1748176836, 1207596693,
+    // LAYER 3+2+1 - 3 - twist
+    3887274396, 2126195886, 872153167, 3443456808, 526388302, 299318839, 3875662889, 3382818940,
+    // LAYER 3+2+1 - 4 - butterfly
+    3266703874, 2575174144, 1404992306, 1824296713, 4252391772, 2591946320, 598637677,
+    // LAYER 3+2+1 - 4 - twist
+    1997179146, 2904166832, 2577754479, 202556283, 30964018, 3807284017, 1238560711, 1967505295,
+    // LAYER 3+2+1 - 5 - butterfly
+    51606697, 3200905336, 1847519727, 2042335005, 89021552, 3560862042, 700560902,
+    // LAYER 3+2+1 - 5 - twist
+    1633351937, 2191994424, 909568022, 1780431021, 2022982494, 2497764099, 3609888404, 1126316146,
+    // LAYER 3+2+1 - 6 - butterfly
+    89021552, 576704831, 3604727734, 1195985186, 594767175, 2315850495, 2439706566,
+    // LAYER 3+2+1 - 6 - twist
+    3633111417, 2908037335, 3590535893, 357376372, 1887514916, 1410152976, 2486152593, 571544162,
+    // LAYER 3+2+1 - 7 - butterfly
+    3482161830, 3266703874, 4045964987, 2575174144, 4222717922, 1404992306, 365117377,
+    // LAYER 3+2+1 - 7 - twist
+    4003389463, 2444867236, 1221788534, 3305408896, 1626901100, 3367336931, 651534541, 1549491056,
+    // LAYER 3+2+1 - 8 - butterfly
+    1819136044, 2390680205, 2567433139, 1643673276, 1322421592, 1357256112, 2750636911,
+    // LAYER 3+2+1 - 8 - twist
+    993428903, 3680847611, 1082450454, 1205016358, 348345200, 956014049, 1048906102, 3880823559,
+    // LAYER 3+2+1 - 9 - butterfly
+    2064267851, 51606697, 966335388, 3200905336, 3482161830, 1847519727, 886345009,
+    // LAYER 3+2+1 - 9 - twist
+    3342823751, 4258842609, 568963827, 2849979801, 1283716570, 2330042337, 4104022520, 3007380225,
+    // LAYER 3+2+1 - 10 - butterfly
+    3560862042, 580575333, 1207596693, 3458938817, 918599194, 2384229368, 879894172,
+    // LAYER 3+2+1 - 10 - twist
+    2217797772, 503165289, 2812564947, 2946742357, 833448145, 1905577260, 3273154711, 3208646340,
+    // LAYER 3+2+1 - 11 - butterfly
+    1847519727, 89021552, 700560902, 576704831, 1593356747, 3604727734, 2455188575,
+    // LAYER 3+2+1 - 11 - twist
+    3162200314, 2808694444, 1933960943, 678628056, 49026362, 1375318456, 1961054458, 3473130659,
+    // LAYER 3+2+1 - 12 - butterfly
+    4045964987, 4222717922, 365117377, 3479581496, 1744306334, 1052776604, 3456358482,
+    // LAYER 3+2+1 - 12 - twist
+    438656919, 1681088131, 366407544, 2819015784, 1771399850, 1091481626, 2136517226, 709592074,
+    // LAYER 3+2+1 - 13 - butterfly
+    966335388, 3482161830, 886345009, 3266703874, 1819136044, 4045964987, 2924809511,
+    // LAYER 3+2+1 - 13 - twist
+    25803349, 3888564563, 1032133926, 923759864, 2630651342, 2590656153, 2146838565, 547030981,
+    // LAYER 3+2+1 - 14 - butterfly
+    700560902, 1593356747, 2455188575, 3711811629, 2443577068, 3253802200, 1303069081,
+    // LAYER 3+2+1 - 14 - twist
+    254162980, 3513125848, 1576584571, 3086080437, 2933840683, 3184133160, 1389510297, 2811274779,
+    // LAYER 3+2+1 - 15 - butterfly
+    886345009, 1819136044, 2924809511, 2390680205, 1137927653, 2567433139, 3913077744,
+    // LAYER 3+2+1 - 15 - twist
+    2288756980, 459299597, 1355965945, 1192114684, 2699030215, 439947086, 587026170, 418014240,
+    // LAYER 3+2+1 - 16 - butterfly
+    2924809511, 1137927653, 3913077744, 2029433331, 3867921885, 98052723, 3922108916, 639923034,
+    // LAYER 3+2+1 - 16 - twist
+    2806114109, 4122084864, 575414664, 1674637294, 1541750051, 2560982302, 1540459884, 0};
+
+extern void ntt_fast(int16_t *, const int32_t *); /* defined outside this file */
+/*************************************************
+* Name:        ntt
+*
+* Description: Inplace number-theoretic transform (NTT) in Rq
+*              input is in standard order, output is in bitreversed order
+*              Thin wrapper: passes poly and the zetas_asm table to ntt_fast.
+* Arguments:   - int16_t *poly: pointer to input/output vector of 256 elements of Zq
+**************************************************/
+void ntt(int16_t *poly) {
+    ntt_fast(poly, zetas_asm);
+}
+
+extern void invntt_fast(int16_t *, const int32_t *); /* defined outside this file */
+/*************************************************
+* Name:        invntt
+*
+* Description: Inplace inverse number-theoretic transform in Rq
+*              input is in bitreversed order, output is in standard order
+*              Thin wrapper: passes poly and the zetas_inv_CT_asm table to invntt_fast.
+* Arguments:   - int16_t *poly: pointer to input/output vector of 256 elements of Zq
+**************************************************/
+void invntt(int16_t *poly) {
+    invntt_fast(poly, zetas_inv_CT_asm);
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/ntt.h b/crypto_kem/ml-kem-768/m4fspeed/ntt.h
new file mode 100644
index 0000000..a161be5
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/ntt.h
@@ -0,0 +1,11 @@
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+
+extern const int32_t zetas[64];
+
+void ntt(int16_t *poly);
+void invntt(int16_t *poly);
+
+#endif
diff --git a/crypto_kem/ml-kem-768/m4fspeed/params.h b/crypto_kem/ml-kem-768/m4fspeed/params.h
new file mode 100644
index 0000000..bd1dfe1
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/params.h
@@ -0,0 +1,31 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define KYBER_K 3
+
+/* Don't change parameters below this line */
+
+#define KYBER_N 256
+#define KYBER_Q 3329
+
+#define KYBER_ETA 2
+
+#define KYBER_SYMBYTES 32   /* size in bytes of hashes, and seeds */
+#define KYBER_SSBYTES  32   /* size in bytes of shared key */
+
+#define KYBER_POLYBYTES              384
+#define KYBER_POLYVECBYTES           (KYBER_K * KYBER_POLYBYTES)
+
+#define KYBER_POLYCOMPRESSEDBYTES    128
+#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 320)
+
+#define KYBER_INDCPA_MSGBYTES       KYBER_SYMBYTES
+#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
+#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
+#define KYBER_INDCPA_BYTES          (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)
+
+#define KYBER_PUBLICKEYBYTES  (KYBER_INDCPA_PUBLICKEYBYTES)
+#define KYBER_SECRETKEYBYTES  (KYBER_INDCPA_SECRETKEYBYTES +  KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */
+#define KYBER_CIPHERTEXTBYTES  KYBER_INDCPA_BYTES
+
+#endif
diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly.c b/crypto_kem/ml-kem-768/m4fspeed/poly.c
new file mode 100644
index 0000000..b52060f
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/poly.c
@@ -0,0 +1,654 @@
+#include "poly.h"
+
+#include "cbd.h"
+#include "ntt.h"
+#include "params.h"
+#include "symmetric.h"
+
+#include <stdint.h>
+
+
+/*************************************************
+* Name:        poly_compress
+*
+* Description: Compression and subsequent serialization of a polynomial.
+*              Only the branch matching KYBER_POLYCOMPRESSEDBYTES (128 or 160) is compiled.
+* Arguments:   - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES)
+*              - const poly *a:    pointer to input polynomial to be serialized
+*************************************************/
+void poly_compress(unsigned char *r, const poly *a)
+{
+  unsigned int i,j;
+  int16_t u;
+  uint32_t d0;
+  uint8_t t[8];
+
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q; // mask is all ones for negative u (arithmetic shift assumed), so KYBER_Q is added only then
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;   // rounding offset
+      d0 *= 80635;  // 80635 is approximately 2^28 / KYBER_Q
+      d0 >>= 28;    // multiply + shift stands in for the division in the formula above
+      t[j] = d0 & 0xf;
+    }
+
+    r[0] = t[0] | (t[1] << 4); // two 4-bit values per output byte
+    r[1] = t[2] | (t[3] << 4);
+    r[2] = t[4] | (t[5] << 4);
+    r[3] = t[6] | (t[7] << 4);
+    r += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;   // rounding offset (KYBER_Q/2)
+      d0 *= 40318;  // 40318 is approximately 2^27 / KYBER_Q
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+
+    r[0] = (t[0] >> 0) | (t[1] << 5); // eight 5-bit values packed into five bytes
+    r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
+    r[2] = (t[3] >> 1) | (t[4] << 4);
+    r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
+    r[4] = (t[6] >> 2) | (t[7] << 3);
+    r += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_decompress
+*
+* Description: De-serialization and subsequent decompression of a polynomial;
+*              approximate inverse of poly_compress
+*              Only the branch matching KYBER_POLYCOMPRESSEDBYTES (128 or 160) is compiled.
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes)
+**************************************************/
+void poly_decompress(poly *r, const unsigned char *a)
+{
+  int i;
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N;i+=8) // 4 input bytes -> 8 coefficients per iteration
+  {
+    r->coeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4; // (x * KYBER_Q + 8) >> 4 for each 4-bit value x
+    r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4;
+    a += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N;i+=8) // 5 input bytes -> 8 coefficients per iteration
+  {
+    r->coeffs[i+0] =  (((a[0] & 31) * KYBER_Q) + 16) >> 5; // (x * KYBER_Q + 16) >> 5 for each 5-bit value x
+    r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] &  1) << 4)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] &  7) << 2)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+7] =  (((a[4] >> 3) * KYBER_Q) + 16) >> 5;
+    a += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_packcompress
+*
+* Description: Compression and subsequent serialization of one polynomial of a polyvec,
+*              writes to a byte string representation of the whole polyvec.
+*              Used to compress a polyvec one poly at a time in a loop.
+*
+* Arguments:   - unsigned char *r:  pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
+*              - poly *a:           pointer to input polynomial (only read in this function)
+*              - int i:             index of to be serialized polynomial in serialized polyvec
+**************************************************/
+void poly_packcompress(unsigned char *r, poly *a, int i) {
+    int j, k;
+    uint64_t d0;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+
+  for(j=0;j<KYBER_N/8;j++) {
+      for(k=0;k<8;k++) {
+        t[k]  = a->coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; // add KYBER_Q when the coefficient is negative
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;    // rounding offset (KYBER_Q/2)
+        d0 *= 645084;  // 645084 is approximately 2^31 / KYBER_Q
+        d0 >>= 31;     // multiply + shift stands in for the division in the formula above
+        t[k] = d0 & 0x7ff;
+      }
+      // eight 11-bit values -> 11 bytes, written at byte offset 352*i + 11*j
+
+    r[352*i+11*j+ 0] =  t[0] & 0xff;
+    r[352*i+11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
+    r[352*i+11*j+ 2] = (t[1] >>  5) | ((t[2] & 0x03) << 6);
+    r[352*i+11*j+ 3] = (t[2] >>  2) & 0xff;
+    r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
+    r[352*i+11*j+ 5] = (t[3] >>  7) | ((t[4] & 0x0f) << 4);
+    r[352*i+11*j+ 6] = (t[4] >>  4) | ((t[5] & 0x01) << 7);
+    r[352*i+11*j+ 7] = (t[5] >>  1) & 0xff;
+    r[352*i+11*j+ 8] = (t[5] >>  9) | ((t[6] & 0x3f) << 2);
+    r[352*i+11*j+ 9] = (t[6] >>  6) | ((t[7] & 0x07) << 5);
+    r[352*i+11*j+10] = (t[7] >>  3);
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+
+    for (j = 0; j < KYBER_N / 4; j++) {
+        for(k=0;k<4;k++) {
+            t[k]  = a->coeffs[4*j+k];
+            t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; // add KYBER_Q when the coefficient is negative
+            /*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+            d0 = t[k];
+            d0 <<= 10;
+            d0 += 1665;     // rounding offset
+            d0 *= 1290167;  // 1290167 is approximately 2^32 / KYBER_Q
+            d0 >>= 32;      // needs the 64-bit d0: the product exceeds 32 bits
+            t[k] = d0 & 0x3ff;
+        }
+        r[320*i+5*j+0] =   t[0] & 0xff; // four 10-bit values -> 5 bytes at byte offset 320*i + 5*j
+        r[320*i+5*j+1] =  (t[0] >>  8) | ((t[1] & 0x3f) << 2);
+        r[320*i+5*j+2] = ((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff;
+        r[320*i+5*j+3] = ((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff;
+        r[320*i+5*j+4] =  (t[3] >>  2) & 0xff;
+    }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})"
+#endif
+}
+
+/*************************************************
+* Name:        poly_unpackdecompress
+*
+* Description: Deserialization and subsequent decompression of a polynomial of a polyvec,
+*              Used to uncompress a polyvec one poly at a time in a loop.
+*
+* Arguments:   - poly *r:                 pointer to output polynomial
+*              - const unsigned char *a:  pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
+*              - int i:                   index of poly in polyvec to decompress
+**************************************************/
+void poly_unpackdecompress(poly *r, const unsigned char *a, int i) {
+  int j;
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+    for(j=0;j<KYBER_N/8;j++) // 11 input bytes at offset 352*i + 11*j -> 8 coefficients, each (x * KYBER_Q + 1024) >> 11
+    {
+      r->coeffs[8*j+0] =  (((a[352*i+11*j+ 0]       | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) <<  9)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11;
+    }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    for(j=0;j<KYBER_N/4;j++) // 5 input bytes at offset 320*i + 5*j -> 4 coefficients, each (x * KYBER_Q + 512) >> 10
+    {
+      r->coeffs[4*j+0] =  (((a[320*i+5*j+ 0]       | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10;
+    }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+
+/*************************************************
+* Name:        cmp_poly_compress
+*
+* Description: Compresses and serializes a polynomial byte by byte and compares the result to a serialized polynomial
+*              (same arithmetic as poly_compress, but nothing is written out)
+* Arguments:   - const unsigned char *r:    pointer to serialized polynomial to compare with
+*              - poly *a:                   pointer to input polynomial to serialize and compare
+* Returns:                                  0 if every byte matches, a non-zero value otherwise
+**************************************************/
+int cmp_poly_compress(const unsigned char *r, poly *a) {
+    unsigned char rc = 0;
+    int16_t u;
+    uint32_t d0;
+    uint8_t t[8];
+    int i, j, k = 0;
+
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;   // rounding offset
+      d0 *= 80635;  // 80635 is approximately 2^28 / KYBER_Q
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
+    }
+        rc |= r[k]      ^ (t[0] | (t[1] << 4)); // differences are OR-ed into rc; the loop never exits early
+        rc |= r[k + 1]  ^ (t[2] | (t[3] << 4));
+        rc |= r[k + 2]  ^ (t[4] | (t[5] << 4));
+        rc |= r[k + 3]  ^ (t[6] | (t[7] << 4));
+        k += 4;
+    }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;   // rounding offset (KYBER_Q/2)
+      d0 *= 40318;  // 40318 is approximately 2^27 / KYBER_Q
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+
+      // bits above the low byte of each XOR are dropped when OR-ed into the unsigned char rc
+      rc |= r[k]   ^ (t[0]       | (t[1] << 5));
+      rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+      rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4));
+      rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+      rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3));
+      k += 5;
+    }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+    return rc;
+}
+
+/*************************************************
+* Name:        cmp_poly_packcompress
+*
+* Description: Compresses and serializes one poly of a polyvec and compares it to a serialized polyvec
+*              Should be called in a loop over all poly's of a polyvec.
+*
+* Arguments:   - const unsigned char *r:    pointer to serialized polyvec to compare with
+*              - poly *a:                   pointer to input polynomial of polyvec to serialize and compare
+*              - int i:                     index of poly in polyvec to compare with
+* Returns:                                  0 if every compared byte matches, a non-zero value otherwise
+**************************************************/
+int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) {
+    unsigned char rc = 0;
+    int j, k;
+    uint64_t d0;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+    for(j=0;j<KYBER_N/8;j++)
+    {
+      for(k=0;k<8;k++) {
+        t[k]  = a->coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; // add KYBER_Q when the coefficient is negative
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;    // rounding offset (KYBER_Q/2)
+        d0 *= 645084;  // 645084 is approximately 2^31 / KYBER_Q
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+      }
+      // differences are OR-ed into rc; the loop never exits early
+      rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff);
+      rc |= r[352*i+11*j+ 1] ^ ((t[0] >>  8) | ((t[1] & 0x1f) << 3));
+      rc |= r[352*i+11*j+ 2] ^ ((t[1] >>  5) | ((t[2] & 0x03) << 6));
+      rc |= r[352*i+11*j+ 3] ^ ((t[2] >>  2) & 0xff);
+      rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1));
+      rc |= r[352*i+11*j+ 5] ^ ((t[3] >>  7) | ((t[4] & 0x0f) << 4));
+      rc |= r[352*i+11*j+ 6] ^ ((t[4] >>  4) | ((t[5] & 0x01) << 7));
+      rc |= r[352*i+11*j+ 7] ^ ((t[5] >>  1) & 0xff);
+      rc |= r[352*i+11*j+ 8] ^ ((t[5] >>  9) | ((t[6] & 0x3f) << 2));
+      rc |= r[352*i+11*j+ 9] ^ ((t[6] >>  6) | ((t[7] & 0x07) << 5));
+      rc |= r[352*i+11*j+10] ^ ((t[7] >>  3));
+    }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+        for (j = 0; j < KYBER_N / 4; j++) {
+        for(k=0;k<4;k++) {
+            t[k]  = a->coeffs[4*j+k];
+            t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; // add KYBER_Q when the coefficient is negative
+            /*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+            d0 = t[k];
+            d0 <<= 10;
+            d0 += 1665;     // rounding offset
+            d0 *= 1290167;  // 1290167 is approximately 2^32 / KYBER_Q
+            d0 >>= 32;      // needs the 64-bit d0: the product exceeds 32 bits
+            t[k] = d0 & 0x3ff;
+        }
+            // differences are OR-ed into rc; the loop never exits early
+            rc |= r[320*i+5*j+0] ^ (t[0] & 0xff);
+            rc |= r[320*i+5*j+1] ^ ((t[0] >>  8) | ((t[1] & 0x3f) << 2));
+            rc |= r[320*i+5*j+2] ^ (((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff);
+            rc |= r[320*i+5*j+3] ^ (((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff);
+            rc |= r[320*i+5*j+4] ^ ((t[3] >>  2) & 0xff);
+        }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+    return rc;
+}
+
+/*************************************************
+* Name:        poly_tobytes
+*
+* Description: Serialization of a polynomial; every pair of coefficients is packed into 3 bytes
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes)
+*              - poly *a:          pointer to input polynomial (passed through poly_reduce first, so it is modified)
+**************************************************/
+void poly_tobytes(unsigned char *r, poly *a) {
+    const int16_t *src = a->coeffs;
+    unsigned char *dst = r;
+    uint16_t lo, hi;
+
+    poly_reduce(a);
+
+    for (; src < a->coeffs + KYBER_N; src += 2, dst += 3) {
+        lo = src[0];
+        hi = src[1];
+        dst[0] = lo & 0xff;
+        dst[1] = (lo >> 8) | ((hi & 0xf) << 4);
+        dst[2] = (hi >> 4) & 0xff;
+    }
+}
+
+/*************************************************
+* Name:        poly_frombytes
+*
+* Description: De-serialization of a polynomial (3 input bytes yield two 12-bit coefficients);
+*              inverse of poly_tobytes
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+void poly_frombytes(poly *r, const unsigned char *a) {
+    int16_t *dst = r->coeffs;
+
+    for (; dst < r->coeffs + KYBER_N; dst += 2, a += 3) {
+        dst[0] = a[0] | ((a[1] & 0x0f) << 8);
+        dst[1] = (a[1] >> 4) | (a[2] << 4);
+    }
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul_16_32
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial
+*              Using strategy of better accumulation (initial step: r_tmp is written, not read).
+* Arguments:   - int32_t *r_tmp:         array receiving the unreduced 32-bit results
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+extern void frombytes_mul_asm_16_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]);
+void poly_frombytes_mul_16_32(int32_t *r_tmp, const poly *b, const unsigned char *a) {
+    frombytes_mul_asm_16_32(r_tmp, b->coeffs, a, zetas);
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul_32_32
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial
+*              Using strategy of better accumulation (products are added onto the values in r_tmp).
+* Arguments:   - int32_t *r_tmp:         array for accumulating unreduced results
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+extern void frombytes_mul_asm_acc_32_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64]);
+void poly_frombytes_mul_32_32(int32_t *r_tmp, const poly *b, const unsigned char *a) {
+    frombytes_mul_asm_acc_32_32(r_tmp, b->coeffs, a, zetas);
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul_32_16
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial
+*              Using strategy of better accumulation (final step: r_tmp is added, 16-bit results go to r).
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+*              - const int32_t *r_tmp:   array containing unreduced results
+**************************************************/
+extern void frombytes_mul_asm_acc_32_16(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64], const int32_t *r_tmp);
+void poly_frombytes_mul_32_16(poly *r, const poly* b, const unsigned char *a, const int32_t *r_tmp) {
+    frombytes_mul_asm_acc_32_16(r->coeffs, b->coeffs, a, zetas, r_tmp);
+}
+
+/*************************************************
+* Name:        poly_noise
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA (PRF output is handed to cbd together with add)
+*
+* Arguments:   - poly *r:                   pointer to output polynomial
+*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
+*              - unsigned char nonce:       one-byte input nonce
+*              - int add:                   boolean to indicate to accumulate into r
+**************************************************/
+void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add) {
+    unsigned char prf_out[KYBER_ETA * KYBER_N / 4];
+
+    prf(prf_out, sizeof(prf_out), seed, nonce);
+    cbd(r, prf_out, add);
+}
+
+/*************************************************
+* Name:        poly_basemul_opt_16_32
+*
+* Description: Multiplication of two polynomials using asymmetric multiplication.
+*              Cached values are generated during matrix-vector product.
+*              Using strategy of better accumulation (initial step).
+* Arguments:   - int32_t *r_tmp:      array for accumulating unreduced results
+*              - const poly *a:       pointer to input polynomial
+*              - const poly *b:       pointer to input polynomial
+*              - const poly *a_prime: pointer to a pre-multiplied by zetas
+**************************************************/
+extern void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *);
+void poly_basemul_opt_16_32(int32_t *r_tmp, const poly *a, const poly *b, const poly *a_prime) {
+    basemul_asm_opt_16_32(r_tmp, a->coeffs, b->coeffs, a_prime->coeffs);
+}
+
+/*************************************************
+* Name:        poly_basemul_acc_opt_32_32
+*
+* Description: Multiplication of two polynomials using asymmetric multiplication.
+*              Cached values are generated during matrix-vector product.
+*              Using strategy of better accumulation (intermediate step).
+* Arguments:   - int32_t *r_tmp:      array for accumulating unreduced results
+*              - const poly *a:       pointer to input polynomial
+*              - const poly *b:       pointer to input polynomial
+*              - const poly *a_prime: pointer to a pre-multiplied by zetas
+**************************************************/
+extern void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *);
+void poly_basemul_acc_opt_32_32(int32_t *r_tmp, const poly *a, const poly *b, const poly *a_prime) {
+    basemul_asm_acc_opt_32_32(r_tmp, a->coeffs, b->coeffs, a_prime->coeffs);
+}
+
+/*************************************************
+* Name:        poly_basemul_acc_opt_32_16
+*
+* Description: Multiplication of two polynomials using asymmetric multiplication.
+*              Cached values are generated during matrix-vector product.
+*              Using strategy of better accumulation (final step).
+* Arguments:   - poly *r:              pointer to output polynomial
+*              - const poly *a:        pointer to input polynomial
+*              - const poly *b:        pointer to input polynomial
+*              - const poly *a_prime:  pointer to a pre-multiplied by zetas
+*              - const int32_t *r_tmp: array containing the accumulated unreduced results
+**************************************************/
+extern void basemul_asm_acc_opt_32_16(int16_t *, const int16_t *, const int16_t *, const int16_t *, const int32_t *);
+void poly_basemul_acc_opt_32_16(poly *r, const poly *a, const poly *b, const poly *a_prime, const int32_t * r_tmp) {
+    basemul_asm_acc_opt_32_16(r->coeffs, a->coeffs, b->coeffs, a_prime->coeffs, r_tmp);
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Computes negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in normal order, output in bitreversed order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_ntt(poly *r) {
+    ntt(r->coeffs);
+}
+
+/*************************************************
+* Name:        poly_invntt
+*
+* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place;
+*              inputs assumed to be in bitreversed order, output in normal order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_invntt(poly *r) {
+    invntt(r->coeffs);
+}
+
+extern void asm_fromplant(int16_t *r); /* assembly routine exported from reduce.S */
+/*************************************************
+* Name:        poly_fromplant
+*
+* Description: Inplace conversion of all coefficients of a polynomial
+*              from Plantard domain to normal domain (done by asm_fromplant)
+*
+* Arguments:   - poly *r:       pointer to input/output polynomial
+**************************************************/
+void poly_fromplant(poly *r) {
+  asm_fromplant(r->coeffs);
+}
+
+extern void asm_barrett_reduce(int16_t *r); /* assembly routine exported from reduce.S */
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Applies Barrett reduction to all coefficients of a polynomial, in place;
+*              the reduction itself is implemented by asm_barrett_reduce in reduce.S
+*
+* Arguments:   - poly *r:       pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *r) {
+  asm_barrett_reduce(r->coeffs);
+}
+
+extern void pointwise_add(int16_t *, const int16_t *, const int16_t *); /* defined outside this file */
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add two polynomials (coefficient arrays are forwarded to pointwise_add)
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_add(poly *r, const poly *a, const poly *b) {
+    pointwise_add(r->coeffs,a->coeffs,b->coeffs);
+}
+
+
+extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *); /* defined outside this file */
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract two polynomials (coefficient arrays are forwarded to pointwise_sub)
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_sub(poly *r, const poly *a, const poly *b) {
+    pointwise_sub(r->coeffs,a->coeffs,b->coeffs);
+}
+
+
+void cmov_int16(int16_t *r, int16_t v, uint16_t b);
+
+/*************************************************
+* Name:        poly_frommsg
+*
+* Description: Convert 32-byte message to polynomial, one coefficient per message bit
+*
+* Arguments:   - poly *r:            pointer to output polynomial
+*              - const uint8_t *msg: pointer to input message (KYBER_INDCPA_MSGBYTES bytes)
+**************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
+{
+  unsigned int i,j;
+
+#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
+#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
+#endif
+
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      r->coeffs[8*i+j] = 0; /* default value; cmov_int16 below may replace it */
+      cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1); /* bit j of msg[i] is the condition; presumably branch-free - confirm in cmov_int16 */
+    }
+  }
+}
+
+/*************************************************
+* Name:        poly_tomsg
+*
+* Description: Convert polynomial to 32-byte message, one message bit per coefficient
+*
+* Arguments:   - unsigned char *msg: pointer to output message
+*              - poly *a:            pointer to input polynomial (not modified here)
+**************************************************/
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) {
+    uint32_t t;
+    int i, j;
+
+    for (i = 0; i < KYBER_SYMBYTES; i++) {
+        msg[i] = 0;
+        for (j = 0; j < 8; j++) {
+            t  = a->coeffs[8*i+j]; /* NOTE(review): no negative-coefficient fixup here; presumably callers pass reduced, nonnegative values - confirm */
+            t <<= 1;
+            t += 1665;
+            t *= 80635; /* multiply-and-shift instead of a division; 80635 is about 2^28/3329 */
+            t >>= 28;
+            t &= 1;
+            msg[i] |= t << j; /* bit j of byte i comes from coefficient 8*i+j */
+        }
+    }
+}
+
+/*************************************************
+* Name:        poly_zeroize
+*
+* Description: Sets all KYBER_N coefficients of a polynomial to zero
+*
+* Arguments:   - poly *p: pointer to polynomial to clear
+**************************************************/
+void poly_zeroize(poly *p) {
+  int16_t *c = p->coeffs;
+  while(c != p->coeffs + KYBER_N)
+    *c++ = 0;
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly.h b/crypto_kem/ml-kem-768/m4fspeed/poly.h
new file mode 100644
index 0000000..fc61dd5
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/poly.h
@@ -0,0 +1,53 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "params.h"
+
+#include <stdint.h>
+
+#define poly_getnoise(p, seed, nonce) poly_noise(p, seed, nonce, 0)
+#define poly_addnoise(p, seed, nonce) poly_noise(p, seed, nonce, 1)
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct {
+    int16_t coeffs[KYBER_N];
+} poly;
+
+void poly_compress(unsigned char *r, const poly *a);
+void poly_decompress(poly *r, const unsigned char *a);
+
+void poly_packcompress(unsigned char *r, poly *a, int i);
+void poly_unpackdecompress(poly *r, const unsigned char *a, int i);
+
+int cmp_poly_compress(const unsigned char *r, poly *a);
+int cmp_poly_packcompress(const unsigned char *r, poly *a, int i);
+
+void poly_tobytes(unsigned char *r, poly *a);
+void poly_frombytes(poly *r, const unsigned char *a);
+void poly_frombytes_mul_16_32(int32_t *r_tmp, const poly *b, const unsigned char *a);
+void poly_frombytes_mul_32_32(int32_t *r_tmp, const poly *b, const unsigned char *a);
+void poly_frombytes_mul_32_16(poly *r, const poly* b, const unsigned char *a, const int32_t *r_tmp);
+
+void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]);
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a);
+
+void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add);
+
+void poly_ntt(poly *r);
+void poly_invntt(poly *r);
+void poly_basemul_opt_16_32(int32_t *r, const poly *a, const poly *b, const poly *a_prime);
+void poly_basemul_acc_opt_32_32(int32_t *r_tmp, const poly *a, const poly *b, const poly *a_prime);
+void poly_basemul_acc_opt_32_16(poly *r, const poly *a, const poly *b, const poly *a_prime, const int32_t * r_tmp);
+void poly_fromplant(poly *r);
+
+void poly_reduce(poly *r);
+
+void poly_add(poly *r, const poly *a, const poly *b);
+void poly_sub(poly *r, const poly *a, const poly *b);
+
+void poly_zeroize(poly *p);
+
+#endif
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly_asm.S b/crypto_kem/ml-kem-768/m4fspeed/poly_asm.S
new file mode 100644
index 0000000..e58896a
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/poly_asm.S
@@ -0,0 +1,246 @@
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.macro doublebasemul_frombytes_asm_16_32 rptr_tmp, bptr, zeta, poly0, poly2, poly1, poly3, tmp, q, qa, qinv // two base multiplications; stores 4 unreduced words to rptr_tmp (qinv is unused)
+  ldr \poly0, [\bptr], #4 // two packed 16-bit coefficients of b
+  ldr \poly2, [\bptr], #4 // next two packed coefficients of b
+
+  smulwt \tmp, \zeta, \poly1 // tmp = (zeta * top(poly1)) >> 16
+	smlabt \tmp, \tmp, \q, \qa  // tmp = bottom(tmp) * top(q) + qa
+	smultt \tmp, \poly0, \tmp  // tmp = top(poly0) * top(tmp)
+	smlabb \tmp, \poly0, \poly1, \tmp // tmp += bottom(poly0) * bottom(poly1)
+  str \tmp, [\rptr_tmp], #4
+
+  smuadx \tmp, \poly0, \poly1 // tmp = bottom(poly0)*top(poly1) + top(poly0)*bottom(poly1)
+  str \tmp, [\rptr_tmp], #4
+
+  neg \zeta, \zeta // the second pair uses -zeta
+
+  smulwt \tmp, \zeta, \poly3 // same sequence for (poly2, poly3)
+	smlabt \tmp, \tmp, \q, \qa  
+	smultt \tmp, \poly2, \tmp  
+	smlabb \tmp, \poly2, \poly3, \tmp 
+  str \tmp, [\rptr_tmp], #4
+
+  smuadx \tmp, \poly2, \poly3
+  str \tmp, [\rptr_tmp], #4
+.endm
+
+.macro doublebasemul_frombytes_asm_acc_32_32 rptr_tmp, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv // like the 16_32 variant, but adds each product onto the word already stored at rptr_tmp
+  ldr \poly0, [\bptr], #4 // two packed 16-bit coefficients of b
+  ldr \res0, [\rptr_tmp] // previously accumulated word (pointer not advanced yet)
+  
+  smulwt \tmp, \zeta, \poly1 // tmp = (zeta * top(poly1)) >> 16
+	smlabt \tmp, \tmp, \q, \qa  // tmp = bottom(tmp) * top(q) + qa
+	smlatt \tmp, \poly0, \tmp, \res0 // tmp = top(poly0) * top(tmp) + res0
+	smlabb \tmp, \poly0, \poly1, \tmp // tmp += bottom(poly0) * bottom(poly1)
+
+  str \tmp, [\rptr_tmp], #4 // write back and advance
+
+  ldr \res0, [\rptr_tmp]
+  smladx \tmp, \poly0, \poly1, \res0 // tmp = bottom(poly0)*top(poly1) + top(poly0)*bottom(poly1) + res0
+  str \tmp, [\rptr_tmp], #4
+
+  neg \zeta, \zeta // the second pair uses -zeta
+
+  ldr \poly0, [\bptr], #4 // poly0 is reused for the next two coefficients of b
+  ldr \res0, [\rptr_tmp]
+  
+  smulwt \tmp, \zeta, \poly3 // same sequence for (poly0, poly3)
+	smlabt \tmp, \tmp, \q, \qa  
+	smlatt \tmp, \poly0, \tmp, \res0
+	smlabb \tmp, \poly0, \poly3, \tmp 
+
+  str \tmp, [\rptr_tmp], #4
+
+  ldr \res0, [\rptr_tmp]
+  smladx \tmp, \poly0, \poly3, \res0
+  str \tmp, [\rptr_tmp], #4
+.endm
+
+.macro doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, q, qa, qinv // final step: adds the words read from rptr_tmp and stores packed 16-bit results to rptr
+  ldr \poly0, [\bptr], #4 // two packed 16-bit coefficients of b
+  ldr \res0, [\rptr_tmp], #4 // accumulated word; rptr_tmp is only read in this macro
+  
+  smulwt \tmp, \zeta, \poly1 // tmp = (zeta * top(poly1)) >> 16
+	smlabt \tmp, \tmp, \q, \qa  // tmp = bottom(tmp) * top(q) + qa
+	smlatt \tmp, \poly0, \tmp, \res0 // tmp = top(poly0) * top(tmp) + res0
+	smlabb \tmp, \poly0, \poly1, \tmp // tmp += bottom(poly0) * bottom(poly1)
+  plant_red \q, \qa, \qinv, \tmp // macro from macros.i; presumably leaves the reduced value in the top half of tmp - confirm
+
+  ldr \res0, [\rptr_tmp], #4
+  smladx \res0, \poly0, \poly1, \res0 // res0 += bottom(poly0)*top(poly1) + top(poly0)*bottom(poly1)
+  plant_red \q, \qa, \qinv, \res0
+
+  pkhtb \res0, \res0, \tmp, asr#16 // pack: top half from res0, bottom half from tmp >> 16
+  str \res0, [\rptr], #4 // two 16-bit output coefficients
+
+  neg \zeta, \zeta // the second pair uses -zeta
+
+  ldr \poly0, [\bptr], #4 // poly0 is reused for the next two coefficients of b
+  ldr \res0, [\rptr_tmp], #4
+  
+  smulwt \tmp, \zeta, \poly3 // same sequence for (poly0, poly3)
+	smlabt \tmp, \tmp, \q, \qa  
+	smlatt \tmp, \poly0, \tmp, \res0
+	smlabb \tmp, \poly0, \poly3, \tmp 
+  plant_red \q, \qa, \qinv, \tmp
+
+  ldr \res0, [\rptr_tmp], #4
+  smladx \res0, \poly0, \poly3, \res0
+  plant_red \q, \qa, \qinv, \res0
+
+  pkhtb \res0, \res0, \tmp, asr#16
+  str \res0, [\rptr], #4
+.endm 
+
+// unpack 6 bytes at aptr into four 12-bit values: t0 = c0 | c1 << 16, t1 = c2 | c3 << 16; aptr advances by 6
+.macro deserialize aptr, tmp, tmp2, tmp3, t0, t1
+	ldrb.w \tmp, [\aptr, #2] // byte 2
+	ldrh.w \tmp2, [\aptr, #3] // bytes 3..4 (halfword load at an odd offset)
+	ldrb.w \tmp3, [\aptr, #5] // byte 5
+	ldrh.w \t0, [\aptr], #6 // bytes 0..1, then aptr += 6
+
+	ubfx.w \t1, \t0, #12, #4 // high nibble of byte 1 = low 4 bits of c1
+	ubfx.w \t0, \t0, #0, #12 // c0
+	orr \t1, \t1, \tmp, lsl #4 // c1 = nibble | byte 2 << 4
+	orr \t0, \t0, \t1, lsl #16 // t0 = c0 | c1 << 16
+	//tmp is free now
+	ubfx.w \t1, \tmp2, #12, #4 // high nibble of byte 4 = low 4 bits of c3
+	ubfx.w \tmp, \tmp2, #0, #12 // c2
+	orr \t1, \t1, \tmp3, lsl #4 // c3 = nibble | byte 5 << 4
+	orr \t1, \tmp, \t1, lsl #16 // t1 = c2 | c3 << 16
+.endm
+
+// void frombytes_mul_asm_16_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64])
+.global frombytes_mul_asm_16_32
+.type frombytes_mul_asm_16_32, %function
+.align 2
+frombytes_mul_asm_16_32:
+  push {r4-r11, r14}
+
+  rptr_tmp .req r0
+  bptr     .req r1
+  aptr     .req r2
+  zetaptr  .req r3
+  t0       .req r4
+	t1       .req r5
+	tmp      .req r6
+	tmp2     .req r7
+	tmp3     .req r8
+	q        .req r9
+	qa       .req r10
+	qinv     .req r11
+	zeta     .req r12
+	ctr      .req r14
+
+  movw qa, #26632 // 26632 = 8 * 3329
+	movt  q, #3329  // only the top half of q is set; the macros read q through smlabt (top half)
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+
+  add ctr, rptr_tmp, #64*4*4 // end pointer: 64 iterations, each storing 4 words
+  1:
+    ldr.w zeta, [zetaptr], #4 // one zeta per iteration
+    deserialize aptr, tmp, tmp2, tmp3, t0, t1 // t0, t1 = four 12-bit coefficients unpacked from 6 bytes
+
+    doublebasemul_frombytes_asm_16_32 rptr_tmp, bptr, zeta, tmp, tmp2, t0, t1, tmp3, q, qa, qinv
+
+    cmp.w rptr_tmp, ctr
+    bne.w 1b
+
+pop {r4-r11, pc}
+.size frombytes_mul_asm_16_32, . -frombytes_mul_asm_16_32
+
+// void frombytes_mul_asm_acc_32_32(int32_t *r_tmp, const int16_t *b, const unsigned char *c, const int32_t zetas[64])
+.global frombytes_mul_asm_acc_32_32
+.type frombytes_mul_asm_acc_32_32, %function
+.align 2
+frombytes_mul_asm_acc_32_32:
+  push {r4-r11, r14}
+
+  rptr_tmp .req r0
+  bptr     .req r1
+  aptr     .req r2
+  zetaptr  .req r3
+  t0       .req r4
+	t1       .req r5
+	tmp      .req r6
+	tmp2     .req r7
+	tmp3     .req r8
+	q        .req r9
+	qa       .req r10
+	qinv     .req r11
+	zeta     .req r12
+	ctr      .req r14
+
+  movw qa, #26632 // 26632 = 8 * 3329
+	movt  q, #3329  // only the top half of q is set; the macros read q through smlabt (top half)
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+
+  add ctr, rptr_tmp, #64*4*4 // end pointer: 64 iterations, each updating 4 words
+  1:
+    ldr.w zeta, [zetaptr], #4 // one zeta per iteration
+    deserialize aptr, tmp, tmp2, tmp3, t0, t1 // t0, t1 = four 12-bit coefficients unpacked from 6 bytes
+
+    doublebasemul_frombytes_asm_acc_32_32 rptr_tmp, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
+    cmp.w rptr_tmp, ctr
+    bne.w 1b
+
+pop {r4-r11, pc}
+.size frombytes_mul_asm_acc_32_32, . - frombytes_mul_asm_acc_32_32
+
+.unreq rptr_tmp
+
+// void frombytes_mul_asm_acc_32_16(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64], const int32_t *r_tmp)
+.global frombytes_mul_asm_acc_32_16
+.type frombytes_mul_asm_acc_32_16, %function
+.align 2
+frombytes_mul_asm_acc_32_16:
+  push {r4-r11, r14}
+
+  rptr     .req r0
+  bptr     .req r1
+  aptr     .req r2
+  zetaptr  .req r3
+  t0       .req r4
+	t1       .req r5
+	tmp      .req r6
+	tmp2     .req r7
+	tmp3     .req r8
+	q        .req r9
+	qa       .req r10
+	qinv     .req r11
+	zeta     .req r12
+	ctr      .req r14
+  rptr_tmp .req r3 // shares r3 with zetaptr; the two values are swapped through s1/s2 below
+
+  movw qa, #26632 // 26632 = 8 * 3329
+	movt  q, #3329  // only the top half of q is set; the macros read q through smlabt (top half)
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+
+  ldr.w tmp, [sp, #9*4] // load rptr_tmp from stack
+  vmov s1, tmp // keep the r_tmp pointer in s1 while r3 holds zetaptr
+  
+  add ctr, tmp, #64*4*4 // end pointer for r_tmp: 64 iterations, each reading 4 words
+  1:
+    ldr.w zeta, [zetaptr], #4 // one zeta per iteration
+    deserialize aptr, tmp, tmp2, tmp3, t0, t1 // t0, t1 = four 12-bit coefficients unpacked from 6 bytes
+    vmov s2, zetaptr // save zetaptr, r3 is about to become rptr_tmp
+    vmov rptr_tmp, s1
+    doublebasemul_frombytes_asm_acc_32_16 rptr_tmp, rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
+    vmov s1, rptr_tmp // save the advanced r_tmp pointer
+    cmp.w rptr_tmp, ctr
+    vmov zetaptr, s2 // restore zetaptr into r3 (vmov leaves the flags from cmp untouched)
+    bne.w 1b
+
+pop {r4-r11, pc}
+.size frombytes_mul_asm_acc_32_16, . - frombytes_mul_asm_acc_32_16
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/polyvec.c b/crypto_kem/ml-kem-768/m4fspeed/polyvec.c
new file mode 100644
index 0000000..a405e91
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/polyvec.c
@@ -0,0 +1,212 @@
+#include <stdint.h>
+#include "polyvec.h"
+#include "poly.h"
+
+/*************************************************
+* Name:        polyvec_compress
+*
+* Description: Compress and serialize vector of polynomials
+*
+* Arguments:   - unsigned char *r: pointer to output byte array
+*                            (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
+*              - const polyvec *a: pointer to input vector of polynomials
+**************************************************/
+void polyvec_compress(unsigned char *r, const polyvec *a)
+{
+  unsigned int i,j,k;
+  uint64_t d0;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+  for(i=0;i<KYBER_K;i++) {
+    for(j=0;j<KYBER_N/8;j++) {
+      for(k=0;k<8;k++) {
+        t[k]  = a->vec[i].coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; /* add KYBER_Q when the coefficient is negative */
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k]; /* the next lines replace the division above by a multiply and a shift */
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+      }
+
+      r[ 0] = (t[0] >>  0); /* 8 values of 11 bits are packed into 11 bytes */
+      r[ 1] = (t[0] >>  8) | (t[1] << 3);
+      r[ 2] = (t[1] >>  5) | (t[2] << 6);
+      r[ 3] = (t[2] >>  2);
+      r[ 4] = (t[2] >> 10) | (t[3] << 1);
+      r[ 5] = (t[3] >>  7) | (t[4] << 4);
+      r[ 6] = (t[4] >>  4) | (t[5] << 7);
+      r[ 7] = (t[5] >>  1);
+      r[ 8] = (t[5] >>  9) | (t[6] << 2);
+      r[ 9] = (t[6] >>  6) | (t[7] << 5);
+      r[10] = (t[7] >>  3);
+      r += 11;
+    }
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+  uint16_t t[4];
+  for(i=0;i<KYBER_K;i++) {
+    for(j=0;j<KYBER_N/4;j++) {
+      for(k=0;k<4;k++) {
+        t[k]  = a->vec[i].coeffs[4*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; /* add KYBER_Q when the coefficient is negative */
+/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+        d0 = t[k]; /* the next lines replace the division above by a multiply and a shift */
+        d0 <<= 10;
+        d0 += 1665;
+        d0 *= 1290167;
+        d0 >>= 32;
+        t[k] = d0 & 0x3ff;
+      }
+
+      r[0] = (t[0] >> 0); /* 4 values of 10 bits are packed into 5 bytes */
+      r[1] = (t[0] >> 8) | (t[1] << 2);
+      r[2] = (t[1] >> 6) | (t[2] << 4);
+      r[3] = (t[2] >> 4) | (t[3] << 6);
+      r[4] = (t[3] >> 2);
+      r += 5;
+    }
+  }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+/*************************************************
+* Name:        polyvec_decompress
+*
+* Description: De-serialize and decompress vector of polynomials;
+*              approximate inverse of polyvec_compress
+*
+* Arguments:   - polyvec *r:             pointer to output vector of polynomials
+*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES)
+**************************************************/
+void polyvec_decompress(polyvec *r, const unsigned char *a)
+{
+  int i,j;
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  for(i=0;i<KYBER_K;i++)
+  {
+    for(j=0;j<KYBER_N/8;j++)
+    {
+      r->vec[i].coeffs[8*j+0] =  (((a[11*j+ 0]       | (((uint32_t)a[11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+1] = ((((a[11*j+ 1] >> 3) | (((uint32_t)a[11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+2] = ((((a[11*j+ 2] >> 6) | (((uint32_t)a[11*j+ 3] & 0xff) << 2) | (((uint32_t)a[11*j+ 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+3] = ((((a[11*j+ 4] >> 1) | (((uint32_t)a[11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+4] = ((((a[11*j+ 5] >> 4) | (((uint32_t)a[11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+5] = ((((a[11*j+ 6] >> 7) | (((uint32_t)a[11*j+ 7] & 0xff) << 1) | (((uint32_t)a[11*j+ 8] & 0x03) <<  9)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+6] = ((((a[11*j+ 8] >> 2) | (((uint32_t)a[11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11;
+      r->vec[i].coeffs[8*j+7] = ((((a[11*j+ 9] >> 5) | (((uint32_t)a[11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11;
+    }
+    a += 352; /* advance to the bytes of the next polynomial */
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+  for(i=0;i<KYBER_K;i++)
+  {
+    for(j=0;j<KYBER_N/4;j++)
+    {
+      r->vec[i].coeffs[4*j+0] =  (((a[5*j+ 0]       | (((uint32_t)a[5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10;
+      r->vec[i].coeffs[4*j+1] = ((((a[5*j+ 1] >> 2) | (((uint32_t)a[5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10;
+      r->vec[i].coeffs[4*j+2] = ((((a[5*j+ 2] >> 4) | (((uint32_t)a[5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10;
+      r->vec[i].coeffs[4*j+3] = ((((a[5*j+ 3] >> 6) | (((uint32_t)a[5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10;
+    }
+    a += 320; /* advance to the bytes of the next polynomial */
+  }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+/*************************************************
+* Name:        polyvec_tobytes
+*
+* Description: Serialize vector of polynomials, KYBER_POLYBYTES output bytes per polynomial
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES)
+*              - polyvec *a:       pointer to input vector of polynomials (handed to poly_tobytes, non-const)
+**************************************************/
+void polyvec_tobytes(unsigned char *r, polyvec *a)
+{
+  poly *p = a->vec;
+  for(; p != a->vec + KYBER_K; p++, r += KYBER_POLYBYTES)
+    poly_tobytes(r, p);
+}
+
+/*************************************************
+* Name:        polyvec_frombytes
+*
+* Description: De-serialize vector of polynomials;
+*              inverse of polyvec_tobytes
+*
+* Arguments:   - polyvec *r:             pointer to output vector of polynomials
+*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYVECBYTES)
+**************************************************/
+void polyvec_frombytes(polyvec *r, const unsigned char *a)
+{
+  poly *p = r->vec;
+  for(; p != r->vec + KYBER_K; p++, a += KYBER_POLYBYTES)
+    poly_frombytes(p, a);
+}
+
+/*************************************************
+* Name:        polyvec_ntt
+*
+* Description: Calls poly_ntt on each of the KYBER_K polynomials of a vector, in index order
+*
+* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_ntt(polyvec *r)
+{
+  poly *p = r->vec;
+  for(; p != r->vec + KYBER_K; p++)
+    poly_ntt(p);
+}
+
+/*************************************************
+* Name:        polyvec_invntt
+*
+* Description: Calls poly_invntt on each of the KYBER_K polynomials of a vector, in index order
+*
+* Arguments:   - polyvec *r: pointer to in/output vector of polynomials
+**************************************************/
+void polyvec_invntt(polyvec *r)
+{
+  unsigned int k = 0;
+  while(k < KYBER_K)
+    poly_invntt(&r->vec[k++]);
+}
+
+/*************************************************
+* Name:        polyvec_reduce
+*
+* Description: Applies Barrett reduction to each coefficient
+*              of each element of a vector of polynomials
+*              by calling poly_reduce on every polynomial of the vector
+*
+* Arguments:   - polyvec *r:    pointer to input/output vector of polynomials
+**************************************************/
+void polyvec_reduce(polyvec *r)
+{
+  poly *p = r->vec;
+  for(; p != r->vec + KYBER_K; p++)
+    poly_reduce(p);
+}
+
+/*************************************************
+* Name:        polyvec_add
+*
+* Description: Add vectors of polynomials element by element via poly_add
+*
+* Arguments: - polyvec *r:       pointer to output vector of polynomials
+*            - const polyvec *a: pointer to first input vector of polynomials
+*            - const polyvec *b: pointer to second input vector of polynomials
+**************************************************/
+void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b)
+{
+  unsigned int k;
+  for(k = 0; k != KYBER_K; ++k)
+    poly_add(r->vec + k, a->vec + k, b->vec + k);
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/polyvec.h b/crypto_kem/ml-kem-768/m4fspeed/polyvec.h
new file mode 100644
index 0000000..0be7873
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/polyvec.h
@@ -0,0 +1,24 @@
+#ifndef POLYVEC_H
+#define POLYVEC_H
+
+#include "params.h"
+#include "poly.h"
+
+typedef struct {
+    poly vec[KYBER_K];
+} polyvec;
+
+void polyvec_compress(unsigned char *r, const polyvec *a);
+void polyvec_decompress(polyvec *r, const unsigned char *a);
+
+void polyvec_tobytes(unsigned char *r, polyvec *a);
+void polyvec_frombytes(polyvec *r, const unsigned char *a);
+
+void polyvec_ntt(polyvec *r);
+void polyvec_invntt(polyvec *r);
+
+void polyvec_reduce(polyvec *r);
+
+void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);
+
+#endif
diff --git a/crypto_kem/ml-kem-768/m4fspeed/reduce.S b/crypto_kem/ml-kem-768/m4fspeed/reduce.S
new file mode 100644
index 0000000..bfc53f6
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/reduce.S
@@ -0,0 +1,140 @@
+/******************************************************************************
+* Integrating the improved Plantard arithmetic into Kyber.
+*
+* Efficient Plantard arithmetic enables a faster Kyber implementation with the 
+* same stack usage.
+*
+* See the paper at https://eprint.iacr.org/2022/956.pdf for more details.
+*
+* @author   Junhao Huang, BNU-HKBU United International College, Zhuhai, China
+*           jhhuang_nuaa@126.com
+*
+* @date     September 2022
+******************************************************************************/
+
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.global asm_barrett_reduce
+.type asm_barrett_reduce,%function
+.align 2
+asm_barrett_reduce:
+	push    {r4-r11, r14}
+	// In: r0 = poly. Rewrites 16 * 8 = 128 words (512 bytes) in place, each word passed through doublebarrett.
+	poly        .req r0
+	poly0       .req r1
+	poly1       .req r2
+	poly2       .req r3
+	poly3       .req r4
+	poly4       .req r5
+	poly5       .req r6
+	poly6       .req r7
+	poly7       .req r8
+	loop        .req r9
+	barrettconst .req r10
+	q           .req r11
+	tmp         .req r12
+	tmp2        .req r14
+
+	movw barrettconst, #20159 // 20159 = round(2^26 / 3329)
+	movw q, #3329 // modulus, held in the low halfword of q
+
+	movw loop, #16 // 16 iterations of 8 words each
+	1:
+		ldm poly, {poly0-poly7} // load 8 words, base register not advanced yet
+
+		// doublebarrett comes from macros.i (not visible here); presumably it reduces both 16-bit halves of a word - TODO confirm
+		doublebarrett poly0, tmp, tmp2, q, barrettconst
+		doublebarrett poly1, tmp, tmp2, q, barrettconst
+		doublebarrett poly2, tmp, tmp2, q, barrettconst
+		doublebarrett poly3, tmp, tmp2, q, barrettconst
+		doublebarrett poly4, tmp, tmp2, q, barrettconst
+		doublebarrett poly5, tmp, tmp2, q, barrettconst
+		doublebarrett poly6, tmp, tmp2, q, barrettconst
+		doublebarrett poly7, tmp, tmp2, q, barrettconst
+
+		stm poly!, {poly0-poly7} // write the 8 words back and advance poly by 32 bytes
+
+	subs.w loop, #1
+	bne.w 1b // repeat until the counter reaches zero
+
+	.unreq poly        
+	.unreq poly0       
+	.unreq poly1       
+	.unreq poly2       
+	.unreq poly3       
+	.unreq poly4       
+	.unreq poly5       
+	.unreq poly6       
+	.unreq poly7       
+	.unreq loop        
+	.unreq barrettconst
+	.unreq q           
+	.unreq tmp         
+	.unreq tmp2        
+
+	pop     {r4-r11, pc} // restore the registers saved on entry and return
+
+.global asm_fromplant
+.type asm_fromplant,%function
+.align 2
+asm_fromplant:
+	push    {r4-r11, r14}
+	// In: r0 = poly. Rewrites 16 * 8 = 128 words (512 bytes) in place, each word passed through doubleplant.
+	poly        .req r0
+	poly0       .req r1
+	poly1       .req r2
+	poly2       .req r3
+	poly3       .req r4
+	poly4       .req r5
+	poly5       .req r6
+	poly6       .req r7
+	poly7       .req r8
+	loop        .req r9
+	plantconst  .req r10
+	q           .req r11
+	qa          .req r12
+	tmp         .req r14
+	
+	movw qa, #26632 // 26632 = 8 * 3329
+	movt q, #3329 // only the TOP halfword of q is written; the bottom halfword keeps its previous content
+	
+	### movt qinv, #3327
+	### plant_constant=(Plant_const^2%M)*(p^-1) % 2^32
+	movw plantconst, #20396 // low halfword of the 32-bit constant
+	movt plantconst, #38900 // high halfword; plantconst = (38900 << 16) | 20396
+	movw loop, #16 // 16 iterations of 8 words each
+	1:
+		ldm poly, {poly0-poly7} // load 8 words, base register not advanced yet
+
+		// doubleplant comes from macros.i (not visible here); presumably it multiplies both 16-bit halves by plantconst with Plantard reduction - TODO confirm
+		doubleplant poly0, tmp, q, qa, plantconst
+		doubleplant poly1, tmp, q, qa, plantconst
+		doubleplant poly2, tmp, q, qa, plantconst
+		doubleplant poly3, tmp, q, qa, plantconst
+		doubleplant poly4, tmp, q, qa, plantconst
+		doubleplant poly5, tmp, q, qa, plantconst
+		doubleplant poly6, tmp, q, qa, plantconst
+		doubleplant poly7, tmp, q, qa, plantconst
+	
+		stm poly!, {poly0-poly7} // write the 8 words back and advance poly by 32 bytes
+
+	subs.w loop, #1
+	bne.w 1b // repeat until the counter reaches zero
+
+	.unreq poly        
+	.unreq poly0       
+	.unreq poly1       
+	.unreq poly2       
+	.unreq poly3       
+	.unreq poly4       
+	.unreq poly5       
+	.unreq poly6       
+	.unreq poly7       
+	.unreq loop        
+	.unreq plantconst  
+	.unreq q           
+	.unreq qa          
+	.unreq tmp         
+	pop     {r4-r11, pc} // restore the registers saved on entry and return
diff --git a/crypto_kem/ml-kem-768/m4fspeed/symmetric-fips202.c b/crypto_kem/ml-kem-768/m4fspeed/symmetric-fips202.c
new file mode 100644
index 0000000..4ee0723
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/symmetric-fips202.c
@@ -0,0 +1,71 @@
+#include "fips202.h"
+#include "params.h"
+#include "symmetric.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/*************************************************
+* Name:        kyber_shake128_absorb
+*
+* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
+*
+* Arguments:   - xof_state *state: pointer to (uninitialized) output Keccak state
+*              - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
+*              - uint8_t x: additional byte of input
+*              - uint8_t y: additional byte of input
+**************************************************/
+void kyber_shake128_absorb(xof_state *state,
+        const uint8_t seed[KYBER_SYMBYTES],
+        uint8_t x,
+        uint8_t y) {
+    uint8_t seed_xy[KYBER_SYMBYTES + 2]; /* seed || x || y, absorbed in one call */
+    uint8_t *const tail = seed_xy + KYBER_SYMBYTES;
+
+    tail[0] = x;
+    tail[1] = y;
+    memcpy(seed_xy, seed, KYBER_SYMBYTES);
+    shake128_absorb(state, seed_xy, KYBER_SYMBYTES + 2);
+}
+
+/*************************************************
+* Name:        kyber_shake256_prf
+*
+* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
+*              and then generates outlen bytes of SHAKE256 output
+*
+* Arguments:   - uint8_t *out: pointer to output
+*              - size_t outlen: number of requested output bytes
+*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
+*              - uint8_t nonce: single-byte nonce (public PRF input)
+**************************************************/
+void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) {
+    uint8_t key_nonce[KYBER_SYMBYTES + 1]; /* key || nonce */
+
+    key_nonce[KYBER_SYMBYTES] = nonce;
+    memcpy(key_nonce, key, KYBER_SYMBYTES);
+
+    shake256(out, outlen, key_nonce, KYBER_SYMBYTES + 1);
+}
+
+/*************************************************
+* Name:        kyber_shake256_rkprf
+*
+* Description: Usage of SHAKE256 as a PRF: absorbs the key followed by the
+*              input and then squeezes KYBER_SSBYTES bytes of SHAKE256 output
+*
+* Arguments:   - uint8_t *out: pointer to output (of length KYBER_SSBYTES)
+*              - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
+*              - const uint8_t *input: pointer to the input
+*                (of length KYBER_CIPHERTEXTBYTES)
+**************************************************/
+void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]) {
+    shake256incctx ctx; /* incremental SHAKE256 state, released before returning */
+
+    shake256_inc_init(&ctx);
+    shake256_inc_absorb(&ctx, key, KYBER_SYMBYTES);          /* key first ... */
+    shake256_inc_absorb(&ctx, input, KYBER_CIPHERTEXTBYTES); /* ... then the input */
+    shake256_inc_finalize(&ctx);
+    shake256_inc_squeeze(out, KYBER_SSBYTES, &ctx);
+    shake256_inc_ctx_release(&ctx);
+}
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fspeed/symmetric.h b/crypto_kem/ml-kem-768/m4fspeed/symmetric.h
new file mode 100644
index 0000000..8441c83
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/symmetric.h
@@ -0,0 +1,29 @@
+#ifndef SYMMETRIC_H
+#define SYMMETRIC_H
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef shake128ctx xof_state; /* the XOF state is the SHAKE128 context type */
+
+void kyber_shake128_absorb(xof_state *s,
+        const uint8_t seed[KYBER_SYMBYTES],
+        uint8_t x,
+        uint8_t y);
+
+void kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);
+
+void kyber_shake256_rkprf(uint8_t out[KYBER_SSBYTES], const uint8_t key[KYBER_SYMBYTES], const uint8_t input[KYBER_CIPHERTEXTBYTES]);
+
+#define XOF_BLOCKBYTES SHAKE128_RATE
+/* Generic names used by callers, mapped onto the SHA-3 / SHAKE functions and the wrappers declared above */
+#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
+#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
+#define xof_absorb(STATE, SEED, X, Y) kyber_shake128_absorb(STATE, SEED, X, Y)
+#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
+#define xof_ctx_release(STATE) shake128_ctx_release(STATE)
+#define prf(OUT, OUTBYTES, KEY, NONCE) kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
+#define rkprf(OUT, KEY, INPUT) kyber_shake256_rkprf(OUT, KEY, INPUT)
+
+#endif /* SYMMETRIC_H */
diff --git a/crypto_kem/ml-kem-768/m4fspeed/verify.c b/crypto_kem/ml-kem-768/m4fspeed/verify.c
new file mode 100644
index 0000000..679ec89
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/verify.c
@@ -0,0 +1,51 @@
+#include "verify.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/*************************************************
+* Name:        verify
+*
+* Description: Compare two arrays for equality in constant time.
+*
+* Arguments:   const unsigned char *a: pointer to first byte array
+*              const unsigned char *b: pointer to second byte array
+*              size_t len:             length of the byte arrays
+*
+* Returns 0 if the byte arrays are equal, 1 otherwise
+**************************************************/
+unsigned char verify(const unsigned char *a, const unsigned char *b, size_t len) {
+    uint64_t diff = 0;
+    size_t pos = 0;
+
+    /* OR together the XOR of every byte pair; no early exit, no data-dependent branch */
+    while (pos < len) {
+        diff |= (uint64_t)(a[pos] ^ b[pos]);
+        pos++;
+    }
+    /* 0 - diff (mod 2^64) has its top bit set exactly when diff != 0, since diff < 2^8 */
+    diff = (uint64_t)0 - diff;
+    return (unsigned char)(diff >> 63);
+}
+
+/*************************************************
+* Name:        cmov
+*
+* Description: Copy len bytes from x to r if b is 1;
+*              don't modify r if b is 0. Requires b to be in {0,1};
+*              assumes two's complement representation of negative integers.
+*              Runs in constant time.
+*
+* Arguments:   unsigned char *r:       pointer to output byte array
+*              const unsigned char *x: pointer to input byte array
+*              size_t len:             Amount of bytes to be copied
+*              unsigned char b:        Condition bit; has to be in {0,1}
+**************************************************/
+void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
+    /* 0x00 when b == 0, 0xFF when b == 1: selects which bits of x replace those of r */
+    const unsigned char mask = (unsigned char)(0u - b);
+    size_t k;
+
+    for (k = 0; k != len; k++)
+        r[k] = (unsigned char)(r[k] ^ (mask & (x[k] ^ r[k])));
+}
diff --git a/crypto_kem/ml-kem-768/m4fspeed/verify.h b/crypto_kem/ml-kem-768/m4fspeed/verify.h
new file mode 100644
index 0000000..8777a14
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fspeed/verify.h
@@ -0,0 +1,10 @@
+#ifndef VERIFY_H
+#define VERIFY_H
+
+#include <stdio.h> /* pulled in for size_t; NOTE(review): <stddef.h> would be the minimal header - confirm no includer relies on stdio */
+/* Returns 0 if the len-byte arrays a and b are equal, 1 otherwise */
+unsigned char verify(const unsigned char *a, const unsigned char *b, size_t len);
+/* Copies len bytes from x to r if b is 1; leaves r unchanged if b is 0 */
+void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);
+
+#endif
diff --git a/crypto_kem/ml-kem-768/m4fstack/api.h b/crypto_kem/ml-kem-768/m4fstack/api.h
new file mode 120000
index 0000000..cf75db9
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/api.h
@@ -0,0 +1 @@
+../m4fspeed/api.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/cbd.c b/crypto_kem/ml-kem-768/m4fstack/cbd.c
new file mode 120000
index 0000000..903fa59
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/cbd.c
@@ -0,0 +1 @@
+../m4fspeed/cbd.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/cbd.h b/crypto_kem/ml-kem-768/m4fstack/cbd.h
new file mode 120000
index 0000000..d264c36
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/cbd.h
@@ -0,0 +1 @@
+../m4fspeed/cbd.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/cmov_int16.S b/crypto_kem/ml-kem-768/m4fstack/cmov_int16.S
new file mode 120000
index 0000000..9055f6a
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/cmov_int16.S
@@ -0,0 +1 @@
+../m4fspeed/cmov_int16.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/fastaddsub.S b/crypto_kem/ml-kem-768/m4fstack/fastaddsub.S
new file mode 120000
index 0000000..d1317f7
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/fastaddsub.S
@@ -0,0 +1 @@
+../m4fspeed/fastaddsub.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/fastbasemul.S b/crypto_kem/ml-kem-768/m4fstack/fastbasemul.S
new file mode 100644
index 0000000..c6e4e49
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/fastbasemul.S
@@ -0,0 +1,207 @@
+/******************************************************************************
+* Integrating the improved Plantard arithmetic into Kyber.
+*
+* Efficient Plantard arithmetic enables a faster Kyber implementation with the 
+* same stack usage.
+*
+* See the paper at https://eprint.iacr.org/2022/956.pdf for more details.
+*
+* @author   Junhao Huang, BNU-HKBU United International College, Zhuhai, China
+*           jhhuang_nuaa@126.com
+*
+* @date     September 2022
+******************************************************************************/
+#include "macros.i"
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.global basemul_asm
+.type basemul_asm, %function
+.align 2
+basemul_asm:
+	push {r4-r11, lr}
+	// In: r0 = rptr (output), r1 = aptr, r2 = bptr, r3 = zetaptr. 64 iterations; each reads 2 words of a, 2 words of b, 1 zeta word and writes 2 words of r.
+	rptr    .req r0
+	aptr    .req r1
+	bptr    .req r2
+	zetaptr .req r3
+	poly0   .req r4
+	poly1   .req r6
+	poly2   .req r5
+	poly3   .req r7
+	q       .req r8
+	qa      .req r14
+	qinv    .req r9
+	tmp     .req r10
+	tmp2    .req r11
+	zeta    .req r12
+	loop    .req r14
+	// qa and loop are both r14: the counter is parked in s0 while r14 holds qa inside the loop body
+	//movw qa, #26632
+	movt  q, #3329 // only the top halfword of q is written
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+
+	movw loop, #64
+	1:
+	vmov.w s0,loop // save the counter, r14 is about to be reused as qa
+	movw qa, #26632 // 26632 = 8 * 3329
+			
+	ldrd poly0, poly2, [aptr], #8 // two consecutive words of a, post-increment by 8 bytes
+	ldrd poly1, poly3, [bptr], #8 // two consecutive words of b, post-increment by 8 bytes
+	// ldr poly0, [aptr], #4
+	// ldr poly1, [bptr], #4
+	// ldr poly2, [aptr], #4
+	// ldr poly3, [bptr], #4
+
+	ldr.w zeta, [zetaptr], #4 // one 32-bit zeta per iteration
+
+	// basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
+	smulwt tmp, zeta, poly1 // tmp = (zeta * top(poly1)) >> 16
+	smlabt tmp, tmp, q, qa  // tmp = bottom(tmp) * top(q) + qa
+	smultt tmp, poly0, tmp  // tmp = top(poly0) * top(tmp)
+	smlabb tmp, poly0, poly1, tmp // tmp += bottom(poly0) * bottom(poly1)
+	plant_red q, qa, qinv, tmp
+	// r[0] in upper half of tmp
+	
+	smuadx tmp2, poly0, poly1 // tmp2 = bottom(poly0)*top(poly1) + top(poly0)*bottom(poly1)
+	plant_red q, qa, qinv, tmp2
+	// r[1] in upper half of tmp2
+	pkhtb tmp, tmp2, tmp, asr#16 // pack: top = top(tmp2), bottom = top(tmp)
+	str tmp, [rptr], #4
+
+	neg zeta, zeta // second pair uses the negated zeta
+
+	// basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
+	smulwt tmp, zeta, poly3 
+	smlabt tmp, tmp, q, qa  
+	smultt tmp, poly2, tmp  
+	smlabb tmp, poly2, poly3, tmp 
+	plant_red q, qa, qinv, tmp
+	// r[0] in upper half of tmp
+	
+	smuadx tmp2, poly2, poly3 
+	plant_red q, qa, qinv, tmp2
+	// r[1] in upper half of tmp2
+	pkhtb tmp, tmp2, tmp, asr#16
+	str tmp, [rptr], #4
+		
+	vmov.w loop,s0 // bring the counter back into r14 (qa is reloaded at the top of the loop)
+	subs.w loop, #1
+	bne.w 1b
+
+	.unreq rptr   
+	.unreq aptr   
+	.unreq bptr   
+	.unreq zetaptr
+	.unreq poly0  
+	.unreq poly1  
+	.unreq poly2  
+	.unreq poly3  
+	.unreq q      
+	.unreq qa     
+	.unreq qinv   
+	.unreq tmp    
+	.unreq tmp2   
+	.unreq zeta   
+	.unreq loop   
+
+	pop {r4-r11, pc}
+//-0.5p~0.5p
+.global basemul_asm_acc
+.type basemul_asm_acc, %function
+.align 2
+basemul_asm_acc:
+	push {r4-r11, lr}
+	// Same computation as basemul_asm, but each result word is added (uadd16, per halfword) to the word already stored at rptr.
+	rptr    .req r0
+	aptr    .req r1
+	bptr    .req r2
+	zetaptr .req r3
+	poly0   .req r4
+	poly1   .req r6
+	poly2   .req r5
+	poly3   .req r7
+	q       .req r8
+	qa      .req r14
+	qinv    .req r9
+	tmp     .req r10
+	tmp2    .req r11
+	zeta    .req r12
+	loop    .req r14
+	// qa and loop are both r14: the counter is parked in s0 while r14 holds qa inside the loop body
+	
+	movt  q, #3329 // only the top halfword of q is written
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+
+	movw loop, #64
+	1:
+		vmov.w s0,loop // save the counter, r14 is about to be reused as qa
+		movw qa, #26632 // 26632 = 8 * 3329
+
+	ldrd poly0, poly2, [aptr], #8 // two consecutive words of a, post-increment by 8 bytes
+	ldrd poly1, poly3, [bptr], #8 // two consecutive words of b, post-increment by 8 bytes
+
+	ldr.w zeta, [zetaptr], #4 // one 32-bit zeta per iteration
+
+	//basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
+	smulwt tmp, zeta, poly1 // tmp = (zeta * top(poly1)) >> 16
+	smlabt tmp, tmp, q, qa  // tmp = bottom(tmp) * top(q) + qa
+	smultt tmp, poly0, tmp  // tmp = top(poly0) * top(tmp)
+	smlabb tmp, poly0, poly1, tmp // tmp += bottom(poly0) * bottom(poly1)
+	plant_red q, qa, qinv, tmp
+	// r[0] in upper half of tmp
+	
+	smuadx tmp2, poly0, poly1 // tmp2 = bottom(poly0)*top(poly1) + top(poly0)*bottom(poly1)
+	plant_red q, qa, qinv, tmp2
+	// r[1] in upper half of tmp2
+	pkhtb tmp, tmp2, tmp, asr#16 // pack: top = top(tmp2), bottom = top(tmp)
+	
+	ldr.w tmp2, [rptr] // current accumulator word
+	uadd16 tmp, tmp, tmp2 // halfword-wise add of the new product pair
+	str.w tmp, [rptr], #4
+
+	neg zeta, zeta // second pair uses the negated zeta
+
+	// basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
+	smulwt tmp, zeta, poly3 
+	smlabt tmp, tmp, q, qa  
+	smultt tmp, poly2, tmp  
+	smlabb tmp, poly2, poly3, tmp 
+	plant_red q, qa, qinv, tmp
+	// r[0] in upper half of tmp
+	
+	smuadx tmp2, poly2, poly3 
+	plant_red q, qa, qinv, tmp2
+	// r[1] in upper half of tmp2
+	pkhtb tmp, tmp2, tmp, asr#16
+	
+	ldr.w tmp2, [rptr]
+	uadd16 tmp, tmp, tmp2
+	str.w tmp, [rptr], #4
+
+	vmov.w loop, s0 // bring the counter back into r14 (qa is reloaded at the top of the loop)
+	subs.w loop, #1
+	bne.w 1b
+
+	.unreq rptr    
+	.unreq aptr    
+	.unreq bptr    
+	.unreq zetaptr 
+	.unreq poly0   
+	.unreq poly1   
+	.unreq poly2   
+	.unreq poly3   
+	.unreq q       
+	.unreq qa      
+	.unreq qinv    
+	.unreq tmp     
+	.unreq tmp2    
+	.unreq zeta    
+	.unreq loop    
+
+	pop {r4-r11, pc}
diff --git a/crypto_kem/ml-kem-768/m4fstack/fastinvntt.S b/crypto_kem/ml-kem-768/m4fstack/fastinvntt.S
new file mode 100644
index 0000000..0fe208d
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/fastinvntt.S
@@ -0,0 +1,360 @@
+/******************************************************************************
+* Integrating the improved Plantard arithmetic into Kyber.
+*
+* Efficient Plantard arithmetic enables a faster Kyber implementation with the 
+* same stack usage.
+*
+* See the paper at https://eprint.iacr.org/2022/956.pdf for more details.
+*
+* @author   Junhao Huang, BNU-HKBU United International College, Zhuhai, China
+*           jhhuang_nuaa@126.com
+*
+* @date     September 2022
+******************************************************************************/
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.macro mul_twiddle_plant a, twiddle, tmp, q, qa // multiplies both 16-bit halves of a by the 32-bit twiddle, result packed back into a; clobbers tmp
+	smulwb \tmp, \twiddle, \a // tmp = (twiddle * bottom(a)) >> 16
+	smulwt \a,   \twiddle, \a // a   = (twiddle * top(a)) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = bottom(tmp) * top(q) + qa
+	smlabt \a, \a, \q, \qa // a   = bottom(a) * top(q) + qa
+	pkhtb \a, \a, \tmp, asr#16 // a = top(a) : top(tmp)
+.endm
+
+.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa // packed butterfly: t = twiddle * a1 (as in mul_twiddle_plant); a1 = a0 - t; a0 = a0 + t
+	smulwb \tmp, \twiddle, \a1 // tmp = (twiddle * bottom(a1)) >> 16
+	smulwt \a1, \twiddle, \a1 // a1  = (twiddle * top(a1)) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = bottom(tmp) * top(q) + qa
+	smlabt \a1, \a1, \q, \qa // a1  = bottom(a1) * top(q) + qa
+	pkhtb \tmp, \a1, \tmp, asr#16 // t = top(a1) : top(tmp)
+	usub16 \a1, \a0, \tmp // halfword-wise a0 - t
+	uadd16 \a0, \a0, \tmp // halfword-wise a0 + t
+.endm
+
+.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa // two independent packed butterflies sharing tmp
+	doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa // pair (a0, a1) with twiddle0
+	doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa // pair (a2, a3) with twiddle1
+.endm
+
+.macro fullplant a0, a1, a2, a3, a4, a5, a6, a7, tmp, q, qa, plantconst // runs doubleplant (macros.i) on eight registers with one constant
+	movw \plantconst, #44984 // low halfword of the constant
+	movt \plantconst, #19 // plantconst = (19 << 16) | 44984; the register passed as plantconst is overwritten
+	doubleplant \a0, \tmp, \q, \qa, \plantconst
+	doubleplant \a1, \tmp, \q, \qa, \plantconst
+	doubleplant \a2, \tmp, \q, \qa, \plantconst
+	doubleplant \a3, \tmp, \q, \qa, \plantconst
+	doubleplant \a4, \tmp, \q, \qa, \plantconst
+	doubleplant \a5, \tmp, \q, \qa, \plantconst
+	doubleplant \a6, \tmp, \q, \qa, \plantconst
+	doubleplant \a7, \tmp, \q, \qa, \plantconst
+.endm
+
+.macro halfplant a0, a1, a2, a3, tmp, q, qa, plantconst // four-register variant of fullplant; not invoked anywhere in this file
+	movw \plantconst, #44984 // low halfword of the constant
+	movt \plantconst, #19 // plantconst = (19 << 16) | 44984; the register passed as plantconst is overwritten
+	doubleplant \a0, \tmp, \q, \qa, \plantconst
+	doubleplant \a1, \tmp, \q, \qa, \plantconst
+	doubleplant \a2, \tmp, \q, \qa, \plantconst
+	doubleplant \a3, \tmp, \q, \qa, \plantconst
+.endm
+
+// twiddle2 is used as tmp2
+// c0, c2, c4, c6: output 6.5q maximum; c1 c3 c5 c7: output 4q maximum.
+.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp
+	// Three butterfly layers on eight packed registers. xi2, xi4, xi5, xi6 are FPU registers; each is moved into twiddle1 right before use.
+	// layer 1  
+	sadd16.w \tmp, \c0, \c1 // c0, c1
+	ssub16.w \c1, \c0, \c1
+	sadd16.w \tmp2, \c2, \c3 // c2, c3
+	ssub16.w \c3, \c2, \c3
+	// tmp, c1, tmp2, c3: 4q maximum
+	sadd16.w \c0, \c4, \c5 // c4, c5
+	ssub16.w \c5, \c4, \c5
+	sadd16.w \c2, \c6, \c7 // c6, c7
+	ssub16.w \c7, \c6, \c7
+	// c4, c6 are free at this point
+	// c0,c5,c2,c7 4q maximum
+
+	// layer 2
+	sadd16.w \c6, \tmp, \tmp2 // c0, c2
+	ssub16.w \tmp2, \tmp, \tmp2
+	sadd16.w \c4, \c0, \c2 // c4, c6
+	ssub16.w \c2, \c0, \c2
+	// c6, tmp2, c4, c2: 6q maximum
+
+	vmov.w \twiddle1, \xi2
+	doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa
+	doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa 
+	// c1, c3, c7, c5: 3.5q maximum;
+
+	// tmp and c0 are free at this point
+	//reduction c6, tmp2, c4, c2: 0.5q
+	movw \twiddle1, #44984
+	movt \twiddle1, #19
+	doubleplant \c6, \tmp, \q, \qa, \twiddle1 // NOTE(review): only c6 goes through doubleplant here, although the comment above lists four registers
+
+	// layer 3
+	sadd16.w \c0, \c6, \c4 // c0, c4
+	ssub16.w \c4, \c6, \c4
+	// c0, c4: 6.5q
+	// c6 are free at this point
+	vmov.w \twiddle1, \xi4
+	doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa
+	// c1, c5: 4q maximum
+
+	vmov.w \twiddle1, \xi5
+	// this block is one doublebutterfly
+	smulwb \tmp, \twiddle1, \c2  // c2, c6
+	smulwt \c2,  \twiddle1, \c2
+	smlabt \tmp, \tmp, \q, \qa
+	smlabt \c2, \c2, \q, \qa
+	pkhtb \tmp, \c2, \tmp, asr#16
+	ssub16.w \c6, \tmp2, \tmp 
+	sadd16.w \c2, \tmp2, \tmp
+	//c6, c2: 6.5q
+	vmov.w \twiddle1, \xi6
+	doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa
+	//c3, c7: 4q
+.endm
+
+.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	// layer 3
+	ldr.w \twiddle1, [\twiddle_ptr], #4 // one twiddle shared by all four butterflies of this layer
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	// layer 2
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // two twiddles, reused for both halves below
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+	// same two twiddles applied to the upper four registers
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	// layer 1
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for pairs (c0, c4) and (c1, c5)
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa
+	// in total this macro consumes 4 + 8 + 8 + 8 = 28 bytes from twiddle_ptr
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for pairs (c2, c6) and (c3, c7)
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp // multiplies c0..c7 by eight consecutive twiddles (32 bytes read from twiddle_ptr)
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c0, c1
+	mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c2, c3
+	mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c4, c5
+	mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c6, c7
+	mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa
+.endm
+
+.global invntt_fast
+.type invntt_fast, %function
+.align 2
+invntt_fast:
+	push {r4-r11, r14}
+	vpush.w {s16-s23} // s16-s23 are used below for twiddles and the saved poly pointer, so they are preserved here
+	poly         .req r0
+	twiddle_ptr  .req r1
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	q            .req r12 
+	// q is written with movt, i.e. it sits in the top halfword of r12
+	qa           .req r0
+	// qa=2^a q;a=3; it shares r0 with poly, so poly is parked in an FPU register (s23 / s6) whenever qa is live
+	tmp          .req r14
+
+	movt q, #3329
+
+	### LAYER 7+6+5+4
+	.equ distance, 16
+	.equ offset, 32
+	.equ strincr, 64
+
+	// pre-load twiddle factors to FPU registers
+	vldm twiddle_ptr!, {s8-s22} // 15 words; s8 is overwritten just below with the loop end address
+
+	add.w tmp, poly, #8*strincr // end address: 8 iterations of 64 bytes
+	vmov s8, tmp
+	1:
+		vmov s23, poly // save the block pointer, r0 is reused as qa below
+		// load a1, a3, ..., a15
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+
+		movw qa, #26632 // 26632 = 8 * 3329; this clobbers poly (same register)
+
+		// NTT on a1, a3, ..., a15   
+		// twiddle2 is used as tmp2
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// multiply coeffs by layer 4 twiddles for later use
+		vmov twiddle1, s15 
+		vmov twiddle2, s16
+		mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+
+		vmov s0, poly0 // a1
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+		// 0.5q
+
+		vmov poly, s23 // restore the block pointer for the second set of loads
+		// load a0, a2, ..., a14
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #26632 // reload qa; poly is clobbered again
+		// NTT on a0, a2, ..., a14
+		// twiddle2 is used as tmp2
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23 // qa is no longer needed in this iteration; r0 is the block pointer again
+		vmov twiddle2, s1 // load a3
+		uadd16 tmp, poly1, twiddle2
+		usub16 poly1, poly1, twiddle2
+		str.w tmp, [poly, #1*distance/4]
+		str.w poly1, [poly, #1*distance/4+offset]
+
+		vmov twiddle2, s3 // load a7
+		uadd16 tmp, poly3, twiddle2
+		usub16 poly3, poly3, twiddle2
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle2, s5 // load a11
+		uadd16 tmp, poly5, twiddle2
+		usub16 poly5, poly5, twiddle2
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle2, s7 // load a15
+		uadd16 tmp, poly7, twiddle2
+		usub16 poly7, poly7, twiddle2
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		//1,3,5,7: upto 4.5q
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle2, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle2, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle2, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle2, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle2, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle2, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle2, poly0, poly1
+		str.w twiddle2, [poly, #offset]    
+		str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
+		//0,2,4,6: upto 7q
+		vmov tmp, s8 // end address computed before the loop
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr  
+
+	### LAYER 3+2+1
+
+
+	.equ distance, distance*16
+	.equ strincr, 4
+
+	// ITER 0
+	vmov s6, poly // save the pointer, r0 is reused as qa below
+	load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+	load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+	vldm twiddle_ptr!, {s0-s5} // 6 twiddle words for the first iteration; s1, s3, s4, s5 are the ones used below
+	movw qa, #26632
+	fullplant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7 tmp, q, qa, twiddle1 // note: poly7 and tmp are separated by a blank only, which gas accepts as a macro argument separator
+	// twiddle2 is used as tmp2
+	_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp
+
+	// twisting
+	_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	
+	vmov poly, s6 // restore the pointer before storing
+	store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+	str.w poly1, [poly, #distance/4]
+	str.w poly2, [poly, #2*distance/4]
+	str.w poly3, [poly, #3*distance/4]
+	str.w poly0, [poly], #4 // last store also advances poly by one word
+
+	// ITER 1-15
+	add.w tmp, poly, #strincr*3*(5) // 4*15 = 60 bytes ahead: 15 more iterations of one word each
+	vmov s14, tmp
+	2:
+		vmov s6, poly // save the pointer, r0 is reused as qa below
+		// polys upto 7q
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #26632
+		_3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		// twisting
+		_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		vmov poly, s6 // restore the pointer before storing
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #4
+
+		vmov tmp, s14 // end address computed before the loop
+		cmp.w poly, tmp
+	bne.w 2b
+	vpop.w {s16-s23}
+	pop {r4-r11, pc}
diff --git a/crypto_kem/ml-kem-768/m4fstack/fastntt.S b/crypto_kem/ml-kem-768/m4fstack/fastntt.S
new file mode 120000
index 0000000..208c11d
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/fastntt.S
@@ -0,0 +1 @@
+../m4fspeed/fastntt.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/indcpa.c b/crypto_kem/ml-kem-768/m4fstack/indcpa.c
new file mode 100644
index 0000000..3869797
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/indcpa.c
@@ -0,0 +1,211 @@
+#include "indcpa.h"
+#include "ntt.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "matacc.h"
+
+#include <string.h>
+#include <stdint.h>
+
+/*************************************************
+* Name:        indcpa_keypair_derand
+*
+* Description: Deterministically generates public and private key for the
+*              CPA-secure public-key encryption scheme underlying Kyber
+* Arguments:   - unsigned char *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - unsigned char *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input seed; KYBER_SYMBYTES bytes are read
+**************************************************/
+void indcpa_keypair_derand(unsigned char *pk,
+                    unsigned char *sk, 
+                    const unsigned char *coins){
+    polyvec skpv;
+    poly pkp; // single poly: the public key is produced and serialized one polynomial at a time
+    unsigned char buf[2 * KYBER_SYMBYTES];
+    unsigned char *publicseed = buf;
+    unsigned char *noiseseed = buf + KYBER_SYMBYTES;
+    int i;
+    unsigned char nonce = 0;
+
+    memcpy(buf, coins, KYBER_SYMBYTES);
+    buf[KYBER_SYMBYTES] = KYBER_K; // KYBER_K is appended as one extra input byte for the hash
+    hash_g(buf, buf, KYBER_SYMBYTES + 1); // in place; presumably fills all of buf, which publicseed/noiseseed point into
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise(skpv.vec + i, noiseseed, nonce++);
+
+    polyvec_ntt(&skpv);
+
+    for (i = 0; i < KYBER_K; i++) {
+        matacc(&pkp, &skpv, i, publicseed, 0); // transposed = 0
+        
+        poly_invntt(&pkp);
+
+        poly_addnoise(&pkp, noiseseed, nonce++); // nonce keeps counting on from the secret-vector samples
+        poly_ntt(&pkp);
+
+        poly_tobytes(pk+i*KYBER_POLYBYTES, &pkp);
+    }
+    polyvec_tobytes(sk, &skpv);
+    memcpy(pk + KYBER_POLYVECBYTES, publicseed, KYBER_SYMBYTES); // Pack the public seed in the public key
+}
+
+/*************************************************
+* Name:        indcpa_enc
+*
+* Description: Encryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - unsigned char *c:          pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes)
+*              - const unsigned char *m:    pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const unsigned char *pk:   pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                                           to deterministically generate all randomness
+**************************************************/
+void indcpa_enc(unsigned char *c,
+               const unsigned char *m,
+               const unsigned char *pk,
+               const unsigned char *coins) {
+    polyvec sp;
+    poly bp;
+    poly *pkp = &bp; // pkp and k both alias bp, which is free once the first loop is done
+    poly *k = &bp;
+    poly *v = &sp.vec[0]; // v reuses the storage of sp.vec[0]
+    const unsigned char *seed = pk+KYBER_POLYVECBYTES; // public seed is stored after the serialized polyvec
+    int i;
+    unsigned char nonce = 0;
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise(sp.vec + i, coins, nonce++);
+
+    polyvec_ntt(&sp);
+
+    for (i = 0; i < KYBER_K; i++) {
+        matacc(&bp, &sp, i, seed, 1); // transposed = 1
+        poly_invntt(&bp);
+
+        poly_addnoise(&bp, coins, nonce++);
+        poly_reduce(&bp);
+
+        poly_packcompress(c, &bp, i); // writes only the i-th poly of the compressed polyvec
+    }
+
+    poly_frombytes(pkp, pk);
+    poly_basemul(v, pkp, &sp.vec[0]); // output v aliases the sp.vec[0] input
+    for (i = 1; i < KYBER_K; i++) {
+        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
+        poly_basemul_acc(v, pkp, &sp.vec[i]);
+    }
+
+    poly_invntt(v);
+
+    poly_addnoise(v, coins, nonce++);
+
+    poly_frommsg(k, m); // k aliases bp; pkp is no longer needed here
+    poly_add(v, v, k);
+    poly_reduce(v);
+
+    poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v);
+}
+
+/*************************************************
+* Name:        indcpa_enc_cmp
+*
+* Description: Re-encryption function.
+*              Compares the re-encrypted ciphertext with the original ciphertext byte per byte.
+*              The comparison is performed in a constant time manner.
+*
+*
+* Arguments:   - const unsigned char *c:    pointer to input ciphertext to compare the new ciphertext with (of length KYBER_INDCPA_BYTES bytes)
+*              - const unsigned char *m:    pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes)
+*              - const unsigned char *pk:   pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes)
+*              - const unsigned char *coins: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes)
+*                                           to deterministically generate all randomness
+* Returns:     - boolean byte indicating that re-encrypted ciphertext is NOT equal to the original ciphertext
+**************************************************/
+unsigned char indcpa_enc_cmp(const unsigned char *c,
+                             const unsigned char *m,
+                             const unsigned char *pk,
+                             const unsigned char *coins) {
+    uint64_t rc = 0; // accumulates the OR of all per-poly comparison results
+    polyvec sp;
+    poly bp;
+    poly *pkp = &bp; // pkp and k both alias bp, which is free once the first loop is done
+    poly *k = &bp;
+    poly *v = &sp.vec[0]; // v reuses the storage of sp.vec[0]
+    const unsigned char *seed = pk+KYBER_POLYVECBYTES;
+    int i;
+    unsigned char nonce = 0;
+
+    for (i = 0; i < KYBER_K; i++)
+        poly_getnoise(sp.vec + i, coins, nonce++);
+
+    polyvec_ntt(&sp);
+
+    for (i = 0; i < KYBER_K; i++) {
+        matacc(&bp, &sp, i, seed, 1);
+        poly_invntt(&bp);
+
+        poly_addnoise(&bp, coins, nonce++);
+        poly_reduce(&bp);
+
+        rc |= cmp_poly_packcompress(c, &bp, i); // compares instead of writing; no early exit on mismatch
+    }
+
+    poly_frombytes(pkp, pk);
+    poly_basemul(v, pkp, &sp.vec[0]); // output v aliases the sp.vec[0] input
+    for (i = 1; i < KYBER_K; i++) {
+        poly_frombytes(pkp, pk + i*KYBER_POLYBYTES);
+        poly_basemul_acc(v, pkp, &sp.vec[i]);
+    }
+
+    poly_invntt(v);
+
+    poly_addnoise(v, coins, nonce++);
+    poly_frommsg(k, m);
+    poly_add(v, v, k);
+    poly_reduce(v);
+
+    rc |= cmp_poly_compress(c + KYBER_POLYVECCOMPRESSEDBYTES, v);
+
+    rc = ~rc + 1; // two's-complement negation: bit 63 becomes set iff rc was non-zero (rc < 2^63 here)
+    rc >>= 63; // collapse to 1 (some byte differed) or 0 (all bytes matched)
+    return (unsigned char)rc;
+}
+
+/*************************************************
+* Name:        indcpa_dec
+*
+* Description: Decryption function of the CPA-secure
+*              public-key encryption scheme underlying Kyber.
+*
+* Arguments:   - unsigned char *m:        pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES)
+*              - const unsigned char *c:  pointer to input ciphertext (of length KYBER_INDCPA_BYTES)
+*              - const unsigned char *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES)
+**************************************************/
+void __attribute__ ((noinline)) indcpa_dec(unsigned char *m,
+                                           const unsigned char *c,
+                                           const unsigned char *sk) {
+    poly mp, bp; // only two polys are live: the ciphertext vector is unpacked one poly at a time
+    poly *v = &bp; // v aliases bp, which is free after the loop below
+    int i;
+
+    poly_unpackdecompress(&mp, c, 0);
+    poly_ntt(&mp);
+    
+    poly_frombytes_mul(&mp, &mp, sk); // in place; presumably multiplies by the first secret-key poly read from sk - confirm
+    for(i = 1; i < KYBER_K; i++) {
+        poly_unpackdecompress(&bp, c, i);
+        poly_ntt(&bp);
+        poly_frombytes_mul_acc(&mp, &bp, sk + i*KYBER_POLYBYTES);
+    }
+
+    poly_invntt(&mp);
+    poly_decompress(v, c+KYBER_POLYVECCOMPRESSEDBYTES);
+    poly_sub(&mp, v, &mp); // result is written back into mp
+    poly_reduce(&mp);
+
+    poly_tomsg(m, &mp);
+}
diff --git a/crypto_kem/ml-kem-768/m4fstack/indcpa.h b/crypto_kem/ml-kem-768/m4fstack/indcpa.h
new file mode 120000
index 0000000..5893b12
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/indcpa.h
@@ -0,0 +1 @@
+../m4fspeed/indcpa.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/kem.c b/crypto_kem/ml-kem-768/m4fstack/kem.c
new file mode 120000
index 0000000..302153d
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/kem.c
@@ -0,0 +1 @@
+../m4fspeed/kem.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/macros.i b/crypto_kem/ml-kem-768/m4fstack/macros.i
new file mode 120000
index 0000000..6e83891
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/macros.i
@@ -0,0 +1 @@
+../m4fspeed/macros.i
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/matacc.c b/crypto_kem/ml-kem-768/m4fstack/matacc.c
new file mode 100644
index 0000000..9aaec0f
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/matacc.c
@@ -0,0 +1,43 @@
+#include "ntt.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "symmetric.h"
+#include "matacc.h"
+
+/*************************************************
+* Name:        matacc
+*
+* Description: Multiplies a row of A or A^T, generated on-the-fly, with a vector of
+*              polynomials; r is overwritten by the j = 0 product and then accumulated into.
+*
+* Arguments:   - poly *r:                    pointer to output polynomial to accumulate in
+*              - polyvec *b:                 pointer to input vector of polynomials to multiply with
+*              - unsigned char i:            byte to indicate the index < KYBER_K of the row of A or A^T
+*              - const unsigned char *seed:  pointer to the public seed used to generate A
+*              - int transposed:             boolean indicating whether A or A^T is generated
+**************************************************/
+void matacc(poly* r, const polyvec *b, unsigned char i, const unsigned char *seed, int transposed) {
+  unsigned char buf[XOF_BLOCKBYTES+2];
+  xof_state state;
+  int16_t c[4]; // scratch: the assembly collects four accepted coefficients here before multiplying
+  int j = 0;
+  
+  if (transposed)
+    xof_absorb(&state, seed, i, j);
+  else
+    xof_absorb(&state, seed, j, i);
+
+  xof_squeezeblocks(buf, 1, &state); // first XOF block; the assembly squeezes further blocks itself
+
+  matacc_asm(r->coeffs, b->vec[j].coeffs, c, buf, zetas, &state); // j = 0: stores the product into r
+  for(j=1;j<KYBER_K;j++) {
+
+    if (transposed)
+      xof_absorb(&state, seed, i, j);
+    else
+      xof_absorb(&state, seed, j, i);
+
+    xof_squeezeblocks(buf, 1, &state);
+    matacc_asm_acc(r->coeffs, b->vec[j].coeffs, c, buf, zetas, &state); // j >= 1: adds the product onto r
+  }
+}
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/matacc.h b/crypto_kem/ml-kem-768/m4fstack/matacc.h
new file mode 100644
index 0000000..92a3b38
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/matacc.h
@@ -0,0 +1,26 @@
+#ifndef MATACC_H
+#define MATACC_H
+#include "poly.h"
+#include "polyvec.h"
+#include "symmetric.h"
+
+extern void matacc_asm(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state);
+static inline void _matacc_asm(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES + 2], const int32_t _zetas[64], xof_state *state)
+{
+	// empty asm statement: its clobber list tells the compiler which floating point registers the assembly function overwrites
+	asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26");
+	matacc_asm(r, b, c, buf, _zetas, state);
+}
+#define matacc_asm _matacc_asm // from here on, calls written as matacc_asm() go through the wrapper
+
+extern void matacc_asm_acc(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state);
+static inline void _matacc_asm_acc(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES + 2], const int32_t _zetas[64], xof_state *state)
+{
+	// empty asm statement: its clobber list tells the compiler which floating point registers the assembly function overwrites
+	asm volatile("" : : : "s16", "s17", "s18", "s19", "s20", "s21", "s26");
+	matacc_asm_acc(r, b, c, buf, _zetas, state);
+}
+#define matacc_asm_acc _matacc_asm_acc // from here on, calls written as matacc_asm_acc() go through the wrapper
+
+void matacc(poly* r, const polyvec *b, unsigned char i, const unsigned char *seed, int transposed);
+#endif
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/matacc.i b/crypto_kem/ml-kem-768/m4fstack/matacc.i
new file mode 100644
index 0000000..237ee46
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/matacc.i
@@ -0,0 +1,197 @@
+/******************************************************************************
+ * Integrating the improved Plantard arithmetic into Kyber.
+ *
+ * Efficient Plantard arithmetic enables a faster Kyber implementation with the
+ * same stack usage.
+ *
+ * See the paper at https://eprint.iacr.org/2022/956.pdf for more details.
+ *
+ * @author   Junhao Huang, BNU-HKBU United International College, Zhuhai, China
+ *           jhhuang_nuaa@126.com
+ *
+ * @date     September 2022
+ ******************************************************************************/
+// Plantard reduction of tmp, in place; q is read from the bottom half of its register
+.macro plant_red_b q, qa, qinv, tmp
+	mul \tmp, \tmp, \qinv     
+	// low 32 bits of tmp*qinv; the high half of that word is used next
+	smlatb \tmp, \tmp, \q, \qa
+	// top(tmp) * bottom(q) + qa; reduced result is in the high half of tmp
+.endm
+
+.macro load_vals val0, val1, bufptr, tmp // reads 3 bytes at bufptr and splits them into two 12-bit values
+  ldrh \val0, [\bufptr], #2 // bytes 0-1, bufptr += 2
+  ldrb \val1, [\bufptr], #1 // byte 2, bufptr += 1
+  ubfx \tmp, \val0, #12, #4 // top 4 bits of the halfword belong to val1
+  orr \val1, \tmp, \val1, lsl #4
+  ubfx \val0, \val0, #0, #12 // val0 = low 12 bits of the 24-bit group
+  ubfx \val1, \val1, #0, #12 // val1 = high 12 bits of the 24-bit group
+.endm
+
+// FP registers shared by these macros: s17 = start of buf, s26 = xof state pointer
+// If val0 < q: stores val0 to cptr; once four values are collected (k == 4), runs func on them and increments ctr
+.macro first_if func, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+// if (val0 < KYBER_Q)
+    cmp.w \val0, \q
+    bhs.w 2f
+        strh \val0, [\cptr], #2
+        add \k, #1
+        cmp.w \k, #4
+        bne.w 2f
+            sub \cptr, #4*2 // rewind cptr to the start of the four collected halfwords
+            vmov s18, \bufptr // func is handed bufptr, k, val0, val1 and ctr as scratch registers,
+            vmov s19, \ctr // so the values still needed afterwards are parked in s18-s20
+            vmov s20, \val1 // val1 is still to be examined by second_if
+            \func \rptr, \bptr, \cptr, \zetaptr, \bufptr, \k, \val0, \val1, \q, \qa, \qinv, \tmp, \tmp2, \ctr
+            vmov \bufptr, s18
+            vmov \ctr, s19
+            vmov \val1, s20
+
+            add \ctr, #1
+            
+            movw \k, #0 // start collecting the next group of four
+    2:
+.endm
+
+// If val1 < q and ctr < 256/4: stores val1 to cptr; once k == 4, runs func on the four values and increments ctr
+.macro second_if func, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+// if (val1 < KYBER_Q && ctr < KYBER_N/4)
+    cmp.w \val1, \q
+    bhs.w 2f
+        cmp.w \ctr, #256/4
+        bge.w 2f
+            strh \val1, [\cptr], #2
+            add \k, #1
+            cmp.w \k, #4
+            bne.w 2f
+                sub \cptr, #4*2 // rewind cptr to the start of the four collected halfwords
+                vmov s18, \bufptr // func uses bufptr and ctr as scratch, so park them
+                vmov s19, \ctr // val0 and val1 are not saved: neither is read again after this point
+                \func \rptr, \bptr, \cptr, \zetaptr, \bufptr, \k, \val0, \val1, \q, \qa, \qinv, \tmp, \tmp2, \ctr
+                vmov \bufptr, s18
+                vmov \ctr, s19
+
+                add \ctr, #1
+                
+                movw \k, #0 // start collecting the next group of four
+    2:
+.endm
+
+.macro third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+// if (pos + 3 > buflen && ctr < KYBER_N/4): squeeze one more XOF block into buf and rewind bufptr
+  vmov \tmp, s17
+  add \tmp, #168 // XOF_BLOCKBYTES=168
+  add \tmp2, \bufptr, #3
+  cmp.w \tmp2, \tmp  // pos + 3 > buflen
+  ble.w 2f // NOTE(review): ble is a signed compare applied to addresses - confirm bls was not intended
+    cmp.w \ctr, #256/4
+    bge.w 2f
+      vmov \bufptr, s17
+
+      vmov s16, r12 // park r12 and the pointer/counter registers in FP registers across the call
+      vmov s18, \rptr
+      vmov s19, \bptr
+      vmov s20, \cptr
+      vmov s21, \ctr
+
+      mov \rptr, \bufptr // call arguments (start of buf, 1 block, state), given rptr/bptr/cptr are r0/r1/r2
+      movw \bptr, #1
+      vmov \cptr, s26 // load state
+      #ifndef nohash
+      bl shake128_squeezeblocks
+      #endif
+      
+      vmov r12, s16
+      vmov \rptr, s18
+      vmov \bptr, s19
+      vmov \cptr, s20
+      vmov \ctr, s21
+      vmov \bufptr, s17 // continue parsing from the start of the refilled buffer
+    2:
+.endm
+
+.macro doublebasemul_asm rptr, aptr, bptr, zetaptr, poly0, poly1, poly2, poly3, q, qa, qinv, tmp, tmp2, zeta
+    ldr.w \poly0, [\aptr], #4 // a[0..1], aptr advances
+    ldr.w \poly1, [\bptr] // b[0..1], bptr is not advanced
+    ldr.w \poly2, [\aptr], #4 // a[2..3]
+    ldr.w \poly3, [\bptr, #4] // b[2..3]
+    ldr.w \zeta, [\zetaptr], #4 // one zeta serves both basemuls (+zeta, then -zeta)
+
+    //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
+    smulwt \tmp, \zeta, \poly1 
+    // b_1*zeta*qinv*plant_const; in low half
+    smlabb \tmp, \tmp, \q, \qa  
+    // b_1*zeta
+    smultt \tmp, \poly0, \tmp  
+    //a_1*b_1*zeta <2^32
+    smlabb \tmp, \poly0, \poly1, \tmp 
+    // a1*b1*zeta+a0*b0
+    plant_red_b \q, \qa, \qinv, \tmp
+    // r[0] in upper half of tmp
+    smuadx \tmp2, \poly0, \poly1 
+    plant_red_b \q, \qa, \qinv, \tmp2
+    // r[1] in upper half of tmp2
+    pkhtb \tmp, \tmp2, \tmp, asr#16
+    str \tmp, [\rptr], #4 // store r[0..1] (overwrites, no accumulation), rptr advances
+
+    neg \zeta, \zeta
+
+    //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
+    smulwt \tmp, \zeta, \poly3 
+    smlabb \tmp, \tmp, \q, \qa  
+    smultt \tmp, \poly2, \tmp  
+    smlabb \tmp, \poly2, \poly3, \tmp 
+    plant_red_b \q, \qa, \qinv, \tmp
+    // r[0] in upper half of tmp
+    
+    smuadx \tmp2, \poly2, \poly3 
+    plant_red_b \q, \qa, \qinv, \tmp2
+    // r[1] in upper half of tmp2
+    pkhtb \tmp, \tmp2, \tmp, asr#16
+    str \tmp, [\rptr], #4 // store r[2..3]
+.endm
+// Accumulating variant: the register slot used for poly2 in doublebasemul_asm holds the running result (res) here
+.macro doublebasemul_asm_acc rptr, aptr, bptr, zetaptr, poly0, poly1, res, poly3, q, qa, qinv, tmp, tmp2, zeta
+    ldr.w \poly0, [\aptr], #4 // a[0..1], aptr advances
+    ldr.w \poly1, [\bptr] // b[0..1], bptr is not advanced
+    ldr.w \poly3, [\bptr, #4] // b[2..3]
+    ldr.w \res, [\rptr] // current r[0..1] to add onto
+    ldr.w \zeta, [\zetaptr], #4
+
+    //basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
+    smulwt \tmp, \zeta, \poly1 
+    // b_1*zeta*qinv*plant_const; in low half
+    smlabb \tmp, \tmp, \q, \qa  
+    // b_1*zeta
+    smultt \tmp, \poly0, \tmp  
+    //a_1*b_1*zeta <2^32
+    smlabb \tmp, \poly0, \poly1, \tmp 
+    // a1*b1*zeta+a0*b0
+    plant_red_b \q, \qa, \qinv, \tmp
+    // r[0] in upper half of tmp
+    smuadx \tmp2, \poly0, \poly1 
+    plant_red_b \q, \qa, \qinv, \tmp2
+    // r[1] in upper half of tmp2
+    pkhtb \tmp, \tmp2, \tmp, asr#16
+    uadd16 \res, \res, \tmp // packed 16-bit add of the new product onto the loaded r[0..1]
+    str \res, [\rptr], #4
+
+    neg \zeta, \zeta
+
+    ldr.w \res, [\rptr] // current r[2..3]
+    ldr \poly0, [\aptr], #4 // a[2..3]; poly0 is reused because res occupies the poly2 slot
+    //basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
+    smulwt \tmp, \zeta, \poly3 
+    smlabb \tmp, \tmp, \q, \qa  
+    smultt \tmp, \poly0, \tmp  
+    smlabb \tmp, \poly0, \poly3, \tmp 
+    plant_red_b \q, \qa, \qinv, \tmp
+    // r[0] in upper half of tmp
+    
+    smuadx \tmp2, \poly0, \poly3 
+    plant_red_b \q, \qa, \qinv, \tmp2
+    // r[1] in upper half of tmp2
+    pkhtb \tmp, \tmp2, \tmp, asr#16
+    uadd16 \res, \res, \tmp
+    str \res, [\rptr], #4
+.endm
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/matacc_asm.S b/crypto_kem/ml-kem-768/m4fstack/matacc_asm.S
new file mode 100644
index 0000000..2a5a307
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/matacc_asm.S
@@ -0,0 +1,118 @@
+#include "matacc.i"
+.extern shake128_squeezeblocks
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+// void matacc_asm(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state)
+.global matacc_asm
+.type matacc_asm, %function
+.align 2
+matacc_asm:
+	push {r0-r11, r14} // 13 words; NOTE(review): sp is then not 8-byte aligned at the bl inside third_if - confirm this is acceptable
+	rptr    .req r0
+	bptr    .req r1
+	cptr    .req r2
+	bufptr  .req r3
+	zetaptr .req r4
+	val0    .req r5
+	val1    .req r6
+	tmp     .req r7
+	tmp2    .req r8
+	k       .req r9
+	q       .req r10
+	qa      .req r11
+	qinv    .req r12
+	ctr     .req r14
+	
+	ldr.w zetaptr, [sp, #13*4] // 5th argument (zetas): first stack argument, just above the 13 pushed words
+	ldr.w tmp, [sp, #14*4] // 6th argument (state)
+	vmov s26, tmp // third_if reads the state pointer back from s26
+	
+	movw qa, #26632 // = 8 * 3329
+	movw q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+	movw k, #0 // k: number of accepted values currently held in c
+
+	// outer while loop
+	movw ctr, #0 // ctr: completed groups of four coefficients; the loop runs until 256/4
+	vmov s17, bufptr // save bufptr to check later
+	1:
+
+		ldrh val0, [bufptr], #2 // read 3 bytes and split them into two 12-bit candidates val0, val1
+		ldrb val1, [bufptr], #1
+		ubfx tmp, val0, #12, #4
+		orr val1, tmp, val1, lsl #4
+		ubfx val0, val0, #0, #12
+		ubfx val1, val1, #0, #12
+
+		first_if doublebasemul_asm, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+		
+		second_if doublebasemul_asm, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+
+		third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+
+	cmp ctr, #256/4
+	blt.w 1b
+	
+	pop {r0-r11, pc}
+.size matacc_asm, . - matacc_asm
+
+// void matacc_asm_acc(int16_t *r, const int16_t *b, int16_t c[4], unsigned char buf[XOF_BLOCKBYTES+2], const int32_t zetas[64], xof_state *state)
+.global matacc_asm_acc
+.type matacc_asm_acc, %function
+.align 2
+matacc_asm_acc:
+	push {r0-r11, r14} // 13 words; NOTE(review): sp is then not 8-byte aligned at the bl inside third_if - confirm this is acceptable
+	rptr    .req r0
+	bptr    .req r1
+	cptr    .req r2
+	bufptr  .req r3
+	zetaptr .req r4
+	val0    .req r5
+	val1    .req r6
+	tmp     .req r7
+	tmp2    .req r8
+	k       .req r9
+	q       .req r10
+	qa      .req r11
+	qinv    .req r12
+	ctr     .req r14
+	
+	ldr.w zetaptr, [sp, #13*4] // 5th argument (zetas): first stack argument, just above the 13 pushed words
+	ldr.w tmp, [sp, #14*4] // 6th argument (state)
+	vmov s26, tmp // third_if reads the state pointer back from s26
+
+	movw qa, #26632 // = 8 * 3329
+	movw q, #3329
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+	movw k, #0 // k: number of accepted values currently held in c
+
+	// outer while loop
+	movw ctr, #0 // ctr: completed groups of four coefficients; the loop runs until 256/4
+	vmov s17, bufptr // save bufptr to check later
+	1:
+
+		ldrh val0, [bufptr], #2 // read 3 bytes and split them into two 12-bit candidates val0, val1
+		ldrb val1, [bufptr], #1
+		ubfx tmp, val0, #12, #4
+		orr val1, tmp, val1, lsl #4
+		ubfx val0, val0, #0, #12
+		ubfx val1, val1, #0, #12
+
+		first_if doublebasemul_asm_acc, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+		
+		second_if doublebasemul_asm_acc, tmp, tmp2, val0, val1, rptr, bptr, cptr, bufptr, zetaptr, k, q, qa, qinv, ctr
+
+		third_if tmp, tmp2, rptr, bptr, cptr, bufptr, ctr
+
+	cmp ctr, #256/4
+	blt.w 1b
+
+	pop {r0-r11, pc}
+.size matacc_asm_acc, . - matacc_asm_acc
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/ntt.c b/crypto_kem/ml-kem-768/m4fstack/ntt.c
new file mode 120000
index 0000000..c9d6e8a
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/ntt.c
@@ -0,0 +1 @@
+../m4fspeed/ntt.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/ntt.h b/crypto_kem/ml-kem-768/m4fstack/ntt.h
new file mode 120000
index 0000000..5fd83c0
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/ntt.h
@@ -0,0 +1 @@
+../m4fspeed/ntt.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/params.h b/crypto_kem/ml-kem-768/m4fstack/params.h
new file mode 120000
index 0000000..59dd7f1
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/params.h
@@ -0,0 +1 @@
+../m4fspeed/params.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/poly.c b/crypto_kem/ml-kem-768/m4fstack/poly.c
new file mode 100644
index 0000000..35475ad
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/poly.c
@@ -0,0 +1,618 @@
+#include "poly.h"
+
+#include "cbd.h"
+#include "ntt.h"
+#include "params.h"
+#include "symmetric.h"
+
+#include <stdint.h>
+
+
+/*************************************************
+* Name:        poly_compress
+*
+* Description: Compression and subsequent serialization of a polynomial
+*
+* Arguments:   - unsigned char *r: pointer to output byte array (of length KYBER_POLYCOMPRESSEDBYTES)
+*              - const poly *a:    pointer to input polynomial to be serialized
+*************************************************/
+void poly_compress(unsigned char *r, const poly *a)
+{
+  unsigned int i,j;
+  int16_t u;
+  uint32_t d0;
+  uint8_t t[8];
+
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q; // adds KYBER_Q only when u is negative, without branching
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635; // multiply-and-shift stands in for the division by KYBER_Q shown above
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
+    }
+
+    r[0] = t[0] | (t[1] << 4); // two 4-bit values per output byte
+    r[1] = t[2] | (t[3] << 4);
+    r[2] = t[4] | (t[5] << 4);
+    r[3] = t[6] | (t[7] << 4);
+    r += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q; // adds KYBER_Q only when u is negative, without branching
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318; // multiply-and-shift stands in for the division by KYBER_Q shown above
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+
+    r[0] = (t[0] >> 0) | (t[1] << 5); // eight 5-bit values packed into five output bytes
+    r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
+    r[2] = (t[3] >> 1) | (t[4] << 4);
+    r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
+    r[4] = (t[6] >> 2) | (t[7] << 3);
+    r += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_decompress
+*
+* Description: De-serialization and subsequent decompression of a polynomial;
+*              approximate inverse of poly_compress
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes)
+**************************************************/
+void poly_decompress(poly *r, const unsigned char *a)
+{
+  int i;
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N;i+=8) // 4 input bytes -> 8 coefficients per iteration
+  {
+    r->coeffs[i+0] = (((a[0] & 15) * KYBER_Q) + 8) >> 4; // x * KYBER_Q / 16, rounded to nearest
+    r->coeffs[i+1] = (((a[0] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+2] = (((a[1] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+3] = (((a[1] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+4] = (((a[2] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+5] = (((a[2] >> 4) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+6] = (((a[3] & 15) * KYBER_Q) + 8) >> 4;
+    r->coeffs[i+7] = (((a[3] >> 4) * KYBER_Q) + 8) >> 4;
+    a += 4;
+  }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N;i+=8) // 5 input bytes -> 8 coefficients per iteration
+  {
+    r->coeffs[i+0] =  (((a[0] & 31) * KYBER_Q) + 16) >> 5; // x * KYBER_Q / 32, rounded to nearest
+    r->coeffs[i+1] = ((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+2] = ((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+3] = ((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+4] = ((((a[2] >> 4) | ((a[3] &  1) << 4)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+5] = ((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+6] = ((((a[3] >> 6) | ((a[4] &  7) << 2)) * KYBER_Q) + 16) >> 5;
+    r->coeffs[i+7] =  (((a[4] >> 3) * KYBER_Q) + 16) >> 5;
+    a += 5;
+  }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+}
+
+/*************************************************
+* Name:        poly_packcompress
+*
+* Description: Compression and subsequent serialization of one polynomial of a polyvec,
+*              writes to a byte string representation of the whole polyvec.
+*              Used to compress a polyvec one poly at a time in a loop.
+*
+* Arguments:   - unsigned char *r:  pointer to output byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
+*              - poly *a:           pointer to input polynomial (only read)
+*              - int i:             index of to be serialized polynomial in serialized polyvec
+**************************************************/
+void poly_packcompress(unsigned char *r, poly *a, int i) {
+    int j, k;
+    uint64_t d0;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+
+  for(j=0;j<KYBER_N/8;j++) {
+      for(k=0;k<8;k++) {
+        t[k]  = a->coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; // adds KYBER_Q only when the coefficient is negative
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084; // multiply-and-shift stands in for the division by KYBER_Q shown above
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+      }
+      
+
+    r[352*i+11*j+ 0] =  t[0] & 0xff; // eight 11-bit values -> 11 bytes, at offset 352*i for poly i
+    r[352*i+11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
+    r[352*i+11*j+ 2] = (t[1] >>  5) | ((t[2] & 0x03) << 6);
+    r[352*i+11*j+ 3] = (t[2] >>  2) & 0xff;
+    r[352*i+11*j+ 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
+    r[352*i+11*j+ 5] = (t[3] >>  7) | ((t[4] & 0x0f) << 4);
+    r[352*i+11*j+ 6] = (t[4] >>  4) | ((t[5] & 0x01) << 7);
+    r[352*i+11*j+ 7] = (t[5] >>  1) & 0xff;
+    r[352*i+11*j+ 8] = (t[5] >>  9) | ((t[6] & 0x3f) << 2);
+    r[352*i+11*j+ 9] = (t[6] >>  6) | ((t[7] & 0x07) << 5);
+    r[352*i+11*j+10] = (t[7] >>  3);
+  }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+
+    for (j = 0; j < KYBER_N / 4; j++) {
+        for(k=0;k<4;k++) {
+            t[k]  = a->coeffs[4*j+k];
+            t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; // adds KYBER_Q only when the coefficient is negative
+            /*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+            d0 = t[k];
+            d0 <<= 10;
+            d0 += 1665;
+            d0 *= 1290167; // multiply-and-shift stands in for the division by KYBER_Q shown above
+            d0 >>= 32;
+            t[k] = d0 & 0x3ff;
+        }
+        r[320*i+5*j+0] =   t[0] & 0xff; // four 10-bit values -> 5 bytes, at offset 320*i for poly i
+        r[320*i+5*j+1] =  (t[0] >>  8) | ((t[1] & 0x3f) << 2);
+        r[320*i+5*j+2] = ((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff;
+        r[320*i+5*j+3] = ((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff;
+        r[320*i+5*j+4] =  (t[3] >>  2) & 0xff;
+    }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to in (KYBER_K * {352, 320})"
+#endif
+}
+
+/*************************************************
+* Name:        poly_unpackdecompress
+*
+* Description: Deserialization and subsequent decompression of a polynomial of a polyvec,
+*              Used to uncompress a polyvec one poly at a time in a loop.
+*
+* Arguments:   - poly *r:                 pointer to output polynomial
+*              - const unsigned char *a:  pointer to input byte string representation of a polyvec (of length KYBER_POLYVECCOMPRESSEDBYTES)
+*              - int i:                   index of poly in polyvec to decompress
+**************************************************/
+void poly_unpackdecompress(poly *r, const unsigned char *a, int i) {
+  int j;
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+    for(j=0;j<KYBER_N/8;j++) // 11 input bytes -> eight 11-bit values; each becomes x * KYBER_Q / 2048, rounded to nearest
+    {
+      r->coeffs[8*j+0] =  (((a[352*i+11*j+ 0]       | (((uint32_t)a[352*i+11*j+ 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+1] = ((((a[352*i+11*j+ 1] >> 3) | (((uint32_t)a[352*i+11*j+ 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+2] = ((((a[352*i+11*j+ 2] >> 6) | (((uint32_t)a[352*i+11*j+ 3] & 0xff) << 2) | (((uint32_t)a[352*i+11*j+4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+3] = ((((a[352*i+11*j+ 4] >> 1) | (((uint32_t)a[352*i+11*j+ 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+4] = ((((a[352*i+11*j+ 5] >> 4) | (((uint32_t)a[352*i+11*j+ 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+5] = ((((a[352*i+11*j+ 6] >> 7) | (((uint32_t)a[352*i+11*j+ 7] & 0xff) << 1) | (((uint32_t)a[352*i+11*j+8] & 0x03) <<  9)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+6] = ((((a[352*i+11*j+ 8] >> 2) | (((uint32_t)a[352*i+11*j+ 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11;
+      r->coeffs[8*j+7] = ((((a[352*i+11*j+ 9] >> 5) | (((uint32_t)a[352*i+11*j+10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11;
+    }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    for(j=0;j<KYBER_N/4;j++) // 5 input bytes -> four 10-bit values; each becomes x * KYBER_Q / 1024, rounded to nearest
+    {
+      r->coeffs[4*j+0] =  (((a[320*i+5*j+ 0]       | (((uint32_t)a[320*i+5*j+ 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+1] = ((((a[320*i+5*j+ 1] >> 2) | (((uint32_t)a[320*i+5*j+ 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+2] = ((((a[320*i+5*j+ 2] >> 4) | (((uint32_t)a[320*i+5*j+ 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10;
+      r->coeffs[4*j+3] = ((((a[320*i+5*j+ 3] >> 6) | (((uint32_t)a[320*i+5*j+ 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10;
+    }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+}
+
+
+/*************************************************
+* Name:        cmp_poly_compress
+*
+* Description: Compresses and serializes a polynomial and compares the bytes to a serialized polynomial
+*
+* Arguments:   - const unsigned char *r:    pointer to serialized polynomial to compare with
+*              - poly *a:                   pointer to input polynomial to serialize and compare
+* Returns:                                  0 if all serialized bytes match, non-zero otherwise
+**************************************************/
+int cmp_poly_compress(const unsigned char *r, poly *a) {
+    unsigned char rc = 0; // OR of the XOR of every expected/actual byte pair
+    int16_t u;
+    uint32_t d0;
+    uint8_t t[8];
+    int i, j, k = 0;
+
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635; // multiply-and-shift stands in for the division by KYBER_Q shown above
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
+    }
+        rc |= r[k]      ^ (t[0] | (t[1] << 4)); // same packing as poly_compress, compared instead of stored
+        rc |= r[k + 1]  ^ (t[2] | (t[3] << 4));
+        rc |= r[k + 2]  ^ (t[4] | (t[5] << 4));
+        rc |= r[k + 3]  ^ (t[6] | (t[7] << 4));
+        k += 4;
+    }
+#elif (KYBER_POLYCOMPRESSEDBYTES == 160)
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      // map to positive standard representatives
+      u  = a->coeffs[8*i+j];
+      u += (u >> 15) & KYBER_Q;
+/*      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318; // multiply-and-shift stands in for the division by KYBER_Q shown above
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
+    }
+
+
+      rc |= r[k]   ^ (t[0]       | (t[1] << 5)); // bits above bit 7 are dropped when stored into the unsigned char rc
+      rc |= r[k+1] ^ ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
+      rc |= r[k+2] ^ ((t[3] >> 1) | (t[4] << 4));
+      rc |= r[k+3] ^ ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
+      rc |= r[k+4] ^ ((t[6] >> 2) | (t[7] << 3));
+      k += 5;
+    }
+#else
+#error "KYBER_POLYCOMPRESSEDBYTES needs to be in {128, 160}"
+#endif
+    return rc;
+}
+
+/*************************************************
+* Name:        cmp_poly_packcompress
+*
+* Description: Compresses one poly of a polyvec and compares it with its slot in a serialized polyvec.
+*              Should be called in a loop over all poly's of a polyvec.
+*
+* Arguments:   - const unsigned char *r:    pointer to serialized polyvec to compare with
+*              - poly *a:                   pointer to input polynomial of polyvec to serialize and compare
+*              - int i:                     index of poly in polyvec to compare with
+* Returns:                                  0 if all compared bytes match, non-zero otherwise
+**************************************************/
+int cmp_poly_packcompress(const unsigned char *r, poly *a, int i) {
+    unsigned char rc = 0; // OR of all byte differences; only the low 8 bits of each XOR are kept
+    int j, k;
+    uint64_t d0;
+
+#if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
+  uint16_t t[8];
+    for(j=0;j<KYBER_N/8;j++)
+    {
+      for(k=0;k<8;k++) {
+        t[k]  = a->coeffs[8*j+k];
+        t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; // adds KYBER_Q only when the coefficient is negative (arithmetic shift assumed)
+/*      division-free replacement for: t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084; // multiply-and-shift in place of the division by KYBER_Q
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+      }
+
+      // 11 bits per coefficient: eight coefficients per 11 compared bytes, poly i starts at byte 352*i
+      rc |= r[352*i+11*j+ 0] ^ (t[0] & 0xff);
+      rc |= r[352*i+11*j+ 1] ^ ((t[0] >>  8) | ((t[1] & 0x1f) << 3));
+      rc |= r[352*i+11*j+ 2] ^ ((t[1] >>  5) | ((t[2] & 0x03) << 6));
+      rc |= r[352*i+11*j+ 3] ^ ((t[2] >>  2) & 0xff);
+      rc |= r[352*i+11*j+ 4] ^ ((t[2] >> 10) | ((t[3] & 0x7f) << 1));
+      rc |= r[352*i+11*j+ 5] ^ ((t[3] >>  7) | ((t[4] & 0x0f) << 4));
+      rc |= r[352*i+11*j+ 6] ^ ((t[4] >>  4) | ((t[5] & 0x01) << 7));
+      rc |= r[352*i+11*j+ 7] ^ ((t[5] >>  1) & 0xff);
+      rc |= r[352*i+11*j+ 8] ^ ((t[5] >>  9) | ((t[6] & 0x3f) << 2));
+      rc |= r[352*i+11*j+ 9] ^ ((t[6] >>  6) | ((t[7] & 0x07) << 5));
+      rc |= r[352*i+11*j+10] ^ ((t[7] >>  3));
+    }
+#elif (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 320))
+    uint16_t t[4];
+        for (j = 0; j < KYBER_N / 4; j++) {
+        for(k=0;k<4;k++) {
+            t[k]  = a->coeffs[4*j+k];
+            t[k] += ((int16_t)t[k] >> 15) & KYBER_Q; // adds KYBER_Q only when the coefficient is negative (arithmetic shift assumed)
+            /* division-free replacement for: t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+            d0 = t[k];
+            d0 <<= 10;
+            d0 += 1665;
+            d0 *= 1290167; // multiply-and-shift in place of the division by KYBER_Q
+            d0 >>= 32;
+            t[k] = d0 & 0x3ff;
+        }
+
+            // 10 bits per coefficient: four coefficients per five compared bytes, poly i starts at byte 320*i
+            rc |= r[320*i+5*j+0] ^ (t[0] & 0xff);
+            rc |= r[320*i+5*j+1] ^ ((t[0] >>  8) | ((t[1] & 0x3f) << 2));
+            rc |= r[320*i+5*j+2] ^ (((t[1] >>  6) | ((t[2] & 0x0f) << 4)) & 0xff);
+            rc |= r[320*i+5*j+3] ^ (((t[2] >>  4) | ((t[3] & 0x03) << 6)) & 0xff);
+            rc |= r[320*i+5*j+4] ^ ((t[3] >>  2) & 0xff);
+        }
+#else
+#error "KYBER_POLYVECCOMPRESSEDBYTES needs to be in {320*KYBER_K, 352*KYBER_K}"
+#endif
+    return rc;
+}
+
+/*************************************************
+* Name:        poly_tobytes
+*
+* Description: Packs a polynomial into bytes, 12 bits per coefficient (two coefficients per 3 bytes);
+*              a is first passed through poly_reduce, so the input polynomial is modified
+* Arguments:   - unsigned char *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes)
+*              - poly *a:          pointer to input polynomial (reduced in place)
+**************************************************/
+void poly_tobytes(unsigned char *r, poly *a) {
+    unsigned char *out = r;
+    const int16_t *in;
+
+    poly_reduce(a);
+
+    /* walk the coefficients pairwise, emitting three bytes per pair */
+    for (in = a->coeffs; in != a->coeffs + KYBER_N; in += 2, out += 3) {
+        const uint16_t lo = in[0];
+        const uint16_t hi = in[1];
+        out[0] = lo & 0xff;
+        out[1] = (lo >> 8) | ((hi & 0xf) << 4);
+        out[2] = (hi >> 4) & 0xff;
+    }
+}
+
+/*************************************************
+* Name:        poly_frombytes
+*
+* Description: Unpacks bytes into a polynomial, 12 bits per coefficient
+*              (three input bytes yield two coefficients); inverse of poly_tobytes
+*
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+void poly_frombytes(poly *r, const unsigned char *a) {
+    int16_t *dst = r->coeffs;
+    const unsigned char *src;
+
+    for (src = a; src != a + 3 * (KYBER_N / 2); src += 3, dst += 2) {
+        dst[0] = src[0] | (((uint16_t)src[1] & 0x0f) << 8);
+        dst[1] = (src[1] >> 4) | (((uint16_t)src[2] & 0xff) << 4);
+    }
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial;
+*              thin wrapper that hands the coefficient arrays and the zetas table to frombytes_mul_asm
+* Arguments:   - poly *r:                pointer to output polynomial
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+extern void frombytes_mul_asm(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]);
+void poly_frombytes_mul(poly *r, const poly *b, const unsigned char *a) {
+    frombytes_mul_asm(r->coeffs, b->coeffs, a, zetas); // the serialized operand a is passed as the asm routine's parameter c
+}
+
+/*************************************************
+* Name:        poly_frombytes_mul_acc
+*
+* Description: Multiplication of a polynomial with a de-serialization of another polynomial.
+*              The product is added to the values already stored in r (done inside frombytes_mul_asm_acc).
+*
+* Arguments:   - poly *r:                pointer to input/output polynomial (accumulator)
+*              - const poly *b:          pointer to input polynomial
+*              - const unsigned char *a: pointer to input byte array (of KYBER_POLYBYTES bytes)
+**************************************************/
+extern void frombytes_mul_asm_acc(int16_t *r, const int16_t *b, const unsigned char *c, const int32_t zetas[64]);
+void poly_frombytes_mul_acc(poly *r, const poly *b, const unsigned char *a) {
+    frombytes_mul_asm_acc(r->coeffs, b->coeffs, a, zetas); // the serialized operand a is passed as the asm routine's parameter c
+}
+
+/*************************************************
+* Name:        poly_noise
+*
+* Description: Sample a polynomial deterministically from a seed and a nonce,
+*              with output polynomial close to centered binomial distribution
+*              with parameter KYBER_ETA; poly.h wraps this as poly_getnoise (add=0) and poly_addnoise (add=1)
+*
+* Arguments:   - poly *r:                   pointer to output polynomial
+*              - const unsigned char *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes)
+*              - unsigned char nonce:       one-byte input nonce
+*              - int add:                   boolean to indicate to accumulate into r
+**************************************************/
+void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add) {
+    unsigned char buf[KYBER_ETA * KYBER_N / 4]; // PRF output that cbd() consumes
+
+    prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce);
+    cbd(r, buf, add); // add is forwarded unchanged; presumably selects overwrite vs. accumulate inside cbd
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Computes negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place (forwards r->coeffs to ntt());
+*              inputs assumed to be in normal order, output in bitreversed order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_ntt(poly *r) {
+    ntt(r->coeffs);
+}
+
+/*************************************************
+* Name:        poly_invntt
+*
+* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of
+*              a polynomial in place (forwards r->coeffs to invntt());
+*              inputs assumed to be in bitreversed order, output in normal order
+*
+* Arguments:   - poly *r: pointer to in/output polynomial
+**************************************************/
+void poly_invntt(poly *r) {
+    invntt(r->coeffs);
+}
+
+extern void basemul_asm(int16_t *, const int16_t *, const int16_t *, const int32_t *);
+/*************************************************
+* Name:        poly_basemul
+*
+* Description: Multiplication of two polynomials in NTT domain
+*              (forwards the coefficient arrays and the zetas table to basemul_asm)
+* Arguments:   - poly *r:       pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_basemul(poly *r, const poly *a, const poly *b) {
+    basemul_asm(r->coeffs, a->coeffs, b->coeffs, zetas);
+}
+
+extern void basemul_asm_acc(int16_t *, const int16_t *, const int16_t *, const int32_t *);
+/*************************************************
+* Name:        poly_basemul_acc
+*
+* Description: Multiplication of two polynomials in NTT domain, accumulating
+*              (presumably into r; the accumulation happens inside basemul_asm_acc - TODO confirm)
+* Arguments:   - poly *r:       pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_basemul_acc(poly *r, const poly *a, const poly *b) {
+    basemul_asm_acc(r->coeffs, a->coeffs, b->coeffs, zetas);
+}
+
+extern void asm_fromplant(int16_t *r);
+/*************************************************
+* Name:        poly_fromplant
+*
+* Description: Inplace conversion of all coefficients of a polynomial to normal domain via asm_fromplant.
+*              NOTE(review): text used to say "from Montgomery domain"; the name suggests Plantard domain - confirm
+*
+* Arguments:   - poly *r:       pointer to input/output polynomial
+**************************************************/
+void poly_fromplant(poly *r) {
+  asm_fromplant(r->coeffs);
+}
+
+extern void asm_barrett_reduce(int16_t *r);
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Applies Barrett reduction to all coefficients of a polynomial (via asm_barrett_reduce).
+*              NOTE(review): this variant carries reduce.S rather than reduce.c; presumably the details live there
+*
+* Arguments:   - poly *r:       pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *r) {
+  asm_barrett_reduce(r->coeffs);
+}
+
+extern void pointwise_add(int16_t *, const int16_t *, const int16_t *);
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add two polynomials; forwards the three coefficient arrays to pointwise_add
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_add(poly *r, const poly *a, const poly *b) {
+    pointwise_add(r->coeffs,a->coeffs,b->coeffs);
+}
+
+
+extern void pointwise_sub(int16_t *, const int16_t *, const int16_t *);
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract two polynomials; forwards the three coefficient arrays to pointwise_sub
+*
+* Arguments: - poly *r:       pointer to output polynomial
+*            - const poly *a: pointer to first input polynomial (presumably the minuend - confirm in pointwise_sub)
+*            - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_sub(poly *r, const poly *a, const poly *b) {
+    pointwise_sub(r->coeffs,a->coeffs,b->coeffs);
+}
+
+void cmov_int16(int16_t *r, int16_t v, uint16_t b);
+
+/*************************************************
+* Name:        poly_frommsg
+*
+* Description: Convert message of KYBER_INDCPA_MSGBYTES bytes to polynomial;
+*              bit j of msg[i] drives coefficient 8*i+j
+* Arguments:   - poly *r:                  pointer to output polynomial
+*              - const uint8_t *msg:       pointer to input message
+**************************************************/
+void poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES])
+{
+  unsigned int i,j;
+
+#if (KYBER_INDCPA_MSGBYTES != KYBER_N/8)
+#error "KYBER_INDCPA_MSGBYTES must be equal to KYBER_N/8 bytes!"
+#endif
+
+  for(i=0;i<KYBER_N/8;i++) {
+    for(j=0;j<8;j++) {
+      r->coeffs[8*i+j] = 0; // default value; kept when the message bit is 0 (assuming cmov_int16 then leaves it alone)
+      cmov_int16(r->coeffs+8*i+j, ((KYBER_Q+1)/2), (msg[i] >> j)&1); // presumably a branch-free conditional move of (KYBER_Q+1)/2 - confirm in cmov_int16
+    }
+  }
+}
+
+/*************************************************
+* Name:        poly_tomsg
+*
+* Description: Convert polynomial to a message of KYBER_SYMBYTES bytes, one bit per coefficient
+*              (bit j of msg[i] is derived from coeffs[8*i+j])
+* Arguments:   - unsigned char *msg: pointer to output message
+*              - poly *a:            pointer to input polynomial (only read here)
+**************************************************/
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a) {
+    uint32_t t;
+    int i, j;
+
+    for (i = 0; i < KYBER_SYMBYTES; i++) {
+        msg[i] = 0;
+        for (j = 0; j < 8; j++) {
+            t  = a->coeffs[8*i+j];
+            t <<= 1;
+            t += 1665;
+            t *= 80635; // 80635 = floor(2^28 / 3329): multiply-and-shift in place of a division by KYBER_Q
+            t >>= 28;
+            t &= 1;     // keep only the decoded message bit
+            msg[i] |= t << j;
+        }
+    }
+}
+
+/*************************************************
+* Name:        poly_zeroize
+*
+* Description: Sets every coefficient of a polynomial to zero
+*
+* Arguments:   - poly *p: pointer to polynomial to clear
+**************************************************/
+void poly_zeroize(poly *p) {
+  int16_t *c = p->coeffs, *const end = p->coeffs + KYBER_N;
+  while (c != end)
+    *c++ = 0;
+}
diff --git a/crypto_kem/ml-kem-768/m4fstack/poly.h b/crypto_kem/ml-kem-768/m4fstack/poly.h
new file mode 100644
index 0000000..635abe9
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/poly.h
@@ -0,0 +1,51 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "params.h"
+
+#include <stdint.h>
+
+#define poly_getnoise(p, seed, nonce) poly_noise(p, seed, nonce, 0)
+#define poly_addnoise(p, seed, nonce) poly_noise(p, seed, nonce, 1)
+
+/*
+ * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
+ * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
+ */
+typedef struct {
+    int16_t coeffs[KYBER_N];
+} poly;
+
+void poly_compress(unsigned char *r, const poly *a);
+void poly_decompress(poly *r, const unsigned char *a);
+
+void poly_packcompress(unsigned char *r, poly *a, int i);
+void poly_unpackdecompress(poly *r, const unsigned char *a, int i);
+
+int cmp_poly_compress(const unsigned char *r, poly *a);
+int cmp_poly_packcompress(const unsigned char *r, poly *a, int i);
+
+void poly_tobytes(unsigned char *r, poly *a);
+void poly_frombytes(poly *r, const unsigned char *a);
+void poly_frombytes_mul(poly *r, const poly *b, const unsigned char *a);
+void poly_frombytes_mul_acc(poly *r, const poly *b, const unsigned char *a);
+
+void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]);
+void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], poly *a);
+
+void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add);
+
+void poly_ntt(poly *r);
+void poly_invntt(poly *r);
+void poly_basemul(poly *r, const poly *a, const poly *b);
+void poly_basemul_acc(poly *r, const poly *a, const poly *b);
+void poly_fromplant(poly *r);
+
+void poly_reduce(poly *r);
+
+void poly_add(poly *r, const poly *a, const poly *b);
+void poly_sub(poly *r, const poly *a, const poly *b);
+
+void poly_zeroize(poly *p);
+
+#endif
diff --git a/crypto_kem/ml-kem-768/m4fstack/poly_asm.S b/crypto_kem/ml-kem-768/m4fstack/poly_asm.S
new file mode 100644
index 0000000..0b5aa6a
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/poly_asm.S
@@ -0,0 +1,198 @@
+/******************************************************************************
+* Integrating the improved Plantard arithmetic into Kyber.
+*
+* Efficient Plantard arithmetic enables a faster Kyber implementation with the 
+* same stack usage.
+*
+* See the paper at https://eprint.iacr.org/2022/956.pdf for more details.
+*
+* @author   Junhao Huang, BNU-HKBU United International College, Zhuhai, China
+*           jhhuang_nuaa@126.com
+*
+* @date     September 2022
+******************************************************************************/
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.macro doublebasemul_frombytes_asm rptr, bptr, zeta, poly0, poly1, poly3, tmp, tmp2, q, qa, qinv
+	// Two base multiplications: (word at \bptr) x \poly1 using \zeta, then (next word) x \poly3 using -\zeta; one result word each is stored at \rptr
+	ldr.w \poly0, [\bptr], #4 // two packed 16-bit coefficients of b; \bptr advances by 4
+	smulwt \tmp, \zeta, \poly1 
+	smlabt \tmp, \tmp, \q, \qa  
+	smultt \tmp, \poly0, \tmp  
+	smlabb \tmp, \poly0, \poly1, \tmp 
+	// a1*b1*zeta+a0*b0
+	plant_red \q, \qa, \qinv, \tmp
+	// r[0] in upper half of tmp
+	
+	smuadx \tmp2, \poly0, \poly1 // cross products: bottom*top + top*bottom
+	plant_red \q, \qa, \qinv, \tmp2
+
+	// r[1] in upper half of tmp2
+	pkhtb \tmp, \tmp2, \tmp, asr#16 // pack: top halfword = r[1], bottom halfword = r[0]
+	str \tmp, [\rptr], #4 // store through the macro argument, not through a caller-side alias named rptr
+
+	neg \zeta, \zeta // the second product uses the negated twiddle; \zeta is left negated on exit
+
+	ldr.w \poly0, [\bptr], #4
+	//basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
+	smulwt \tmp, \zeta, \poly3 
+	smlabt \tmp, \tmp, \q, \qa  
+	smultt \tmp, \poly0, \tmp  
+	smlabb \tmp, \poly0, \poly3, \tmp 
+	plant_red \q, \qa, \qinv, \tmp
+	// r[0] in upper half of tmp
+	
+	smuadx \tmp2, \poly0, \poly3 
+	plant_red \q, \qa, \qinv, \tmp2
+	// r[1] in upper half of tmp2
+	pkhtb \tmp, \tmp2, \tmp, asr#16
+	str \tmp, [\rptr], #4 // store through the macro argument, not through a caller-side alias named rptr
+.endm
+
+.macro doublebasemul_frombytes_asm_acc rptr, bptr, zeta, poly0, poly1, poly3, res0, tmp, tmp2, q, qa, qinv
+	// Accumulating variant: each packed product word is added (uadd16) to the word already stored at \rptr; \res0 is scratch
+	ldr \poly0, [\bptr], #4
+	
+	ldr \res0, [\rptr] // current accumulator word (two 16-bit lanes)
+	smulwt \tmp, \zeta, \poly1 
+	// b_1*zeta*qinv*plant_const; in low half
+	smlabt \tmp, \tmp, \q, \qa  
+	// b_1*zeta
+	smultt \tmp, \poly0, \tmp  
+	//a_1*b_1*zeta <2^32
+	smlabb \tmp, \poly0, \poly1, \tmp 
+	// a1*b1*zeta+a0*b0
+	plant_red \q, \qa, \qinv, \tmp
+	// r[0] in upper half of tmp
+	
+	smuadx \tmp2, \poly0, \poly1 
+	plant_red \q, \qa, \qinv, \tmp2
+
+	// r[1] in upper half of tmp2
+	pkhtb \tmp, \tmp2, \tmp, asr#16 // pack: top halfword = r[1], bottom halfword = r[0]
+	uadd16 \res0, \res0, \tmp // lane-wise 16-bit add into the accumulator
+	str \res0, [\rptr], #4
+
+	neg \zeta, \zeta // the second product uses the negated twiddle
+	
+	ldr \poly0, [\bptr], #4
+	ldr \res0, [\rptr]
+
+	smulwt \tmp, \zeta, \poly3 
+	smlabt \tmp, \tmp, \q, \qa  
+	smultt \tmp, \poly0, \tmp  
+	smlabb \tmp, \poly0, \poly3, \tmp 
+	plant_red \q, \qa, \qinv, \tmp
+	// r[0] in upper half of tmp
+	
+	smuadx \tmp2, \poly0, \poly3 
+	plant_red \q, \qa, \qinv, \tmp2
+	// r[1] in upper half of tmp2
+	pkhtb \tmp, \tmp2, \tmp, asr#16
+	uadd16 \res0, \res0, \tmp
+	str \res0, [\rptr], #4
+.endm
+
+// unpack 6 bytes at aptr into four 12-bit values: t0 = v0 | v1<<16, t1 = v2 | v3<<16; aptr advances by 6
+.macro deserialize aptr, tmp, tmp2, tmp3, t0, t1
+	ldrb.w \tmp, [\aptr, #2] // byte 2
+	ldrh.w \tmp2, [\aptr, #3] // bytes 3..4 (halfword load at an odd offset)
+	ldrb.w \tmp3, [\aptr, #5] // byte 5
+	ldrh.w \t0, [\aptr], #6 // bytes 0..1, then post-increment aptr
+
+	ubfx.w \t1, \t0, #12, #4 // low nibble of v1
+	ubfx.w \t0, \t0, #0, #12 // v0
+	orr \t1, \t1, \tmp, lsl #4 // v1 = nibble | byte2 << 4
+	orr \t0, \t0, \t1, lsl #16
+	//tmp is free now
+	ubfx.w \t1, \tmp2, #12, #4 // low nibble of v3
+	ubfx.w \tmp, \tmp2, #0, #12 // v2
+	orr \t1, \t1, \tmp3, lsl #4 // v3 = nibble | byte5 << 4
+	orr \t1, \tmp, \t1, lsl #16
+.endm
+
+// void frombytes_mul_asm(int16_t *r, const int16_t *b, const unsigned char *a, const int32_t zetas[64])
+.global frombytes_mul_asm
+.type frombytes_mul_asm, %function
+.align 2
+frombytes_mul_asm:
+	push {r4-r11, r14}
+	// per loop pass: one zeta, 6 bytes of a, 8 bytes of b consumed; 8 bytes of r written
+	rptr    .req r0
+	bptr    .req r1
+	aptr    .req r2
+	zetaptr .req r3
+	t0      .req r4
+	t1      .req r5
+	tmp     .req r6
+	tmp2    .req r7
+	tmp3    .req r8
+	q       .req r9
+	qa      .req r10
+	qinv    .req r11
+	zeta    .req r12
+	ctr     .req r14
+
+	movw qa, #26632
+	movt  q, #3329  
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+
+	add ctr, rptr, #64*4*2 // end pointer: 512 bytes past r
+	1:
+		ldr.w zeta, [zetaptr], #4
+		deserialize aptr, tmp, tmp2, tmp3, t0, t1
+
+		doublebasemul_frombytes_asm rptr, bptr, zeta, tmp3, t0, t1, tmp, tmp2, q, qa, qinv
+
+	cmp.w rptr, ctr // 64 passes of 8 output bytes each
+	bne.w 1b
+
+	pop {r4-r11, pc}
+.size frombytes_mul_asm, . -frombytes_mul_asm
+
+// void frombytes_mul_asm_acc(int16_t *r, const int16_t *b, const unsigned char *a, const int32_t zetas[64])
+.global frombytes_mul_asm_acc
+.type frombytes_mul_asm_acc, %function
+.align 2
+frombytes_mul_asm_acc:
+	push {r4-r11, r14}
+	// same loop as frombytes_mul_asm, but each product word is added to the word already stored in r
+	rptr    .req r0
+	bptr    .req r1
+	aptr    .req r2
+	zetaptr .req r3
+	t0      .req r4
+	t1      .req r5
+	tmp     .req r6
+	tmp2    .req r7
+	tmp3    .req r8
+	q       .req r9
+	qa      .req r10
+	qinv    .req r11
+	zeta    .req r12
+	ctr     .req r14
+
+	movw qa, #26632
+	movt  q, #3329  
+	### qinv=0x6ba8f301
+	movw qinv, #62209
+	movt qinv, #27560
+
+	add ctr, rptr, #64*4*2 // end pointer: 512 bytes past r
+	1:
+		ldr.w zeta, [zetaptr], #4
+		deserialize aptr, tmp, tmp2, tmp3, t0, t1
+		vmov s0, ctr // park the end pointer in s0: ctr (r14) is handed to the macro below as its res0 scratch
+		doublebasemul_frombytes_asm_acc rptr, bptr, zeta, tmp3, t0, t1, ctr, tmp, tmp2, q, qa, qinv
+		vmov ctr, s0 // restore the end pointer for the loop test
+	cmp.w rptr, ctr
+	bne.w 1b
+
+	pop {r4-r11, pc}
+.size frombytes_mul_asm_acc, . - frombytes_mul_asm_acc 
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/polyvec.c b/crypto_kem/ml-kem-768/m4fstack/polyvec.c
new file mode 120000
index 0000000..f398d76
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/polyvec.c
@@ -0,0 +1 @@
+../m4fspeed/polyvec.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/polyvec.h b/crypto_kem/ml-kem-768/m4fstack/polyvec.h
new file mode 120000
index 0000000..3113837
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/polyvec.h
@@ -0,0 +1 @@
+../m4fspeed/polyvec.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/reduce.S b/crypto_kem/ml-kem-768/m4fstack/reduce.S
new file mode 120000
index 0000000..29ae453
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/reduce.S
@@ -0,0 +1 @@
+../m4fspeed/reduce.S
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/symmetric-fips202.c b/crypto_kem/ml-kem-768/m4fstack/symmetric-fips202.c
new file mode 120000
index 0000000..fa4ba9a
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/symmetric-fips202.c
@@ -0,0 +1 @@
+../m4fspeed/symmetric-fips202.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/symmetric.h b/crypto_kem/ml-kem-768/m4fstack/symmetric.h
new file mode 120000
index 0000000..28c6fac
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/symmetric.h
@@ -0,0 +1 @@
+../m4fspeed/symmetric.h
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/verify.c b/crypto_kem/ml-kem-768/m4fstack/verify.c
new file mode 120000
index 0000000..a7a9856
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/verify.c
@@ -0,0 +1 @@
+../m4fspeed/verify.c
\ No newline at end of file
diff --git a/crypto_kem/ml-kem-768/m4fstack/verify.h b/crypto_kem/ml-kem-768/m4fstack/verify.h
new file mode 120000
index 0000000..cb2da4b
--- /dev/null
+++ b/crypto_kem/ml-kem-768/m4fstack/verify.h
@@ -0,0 +1 @@
+../m4fspeed/verify.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4f/api.h b/crypto_sign/dilithium2/m4f/api.h
new file mode 100644
index 0000000..a289632
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/api.h
@@ -0,0 +1,26 @@
+#ifndef API_H
+#define API_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "params.h"
+
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+                          const uint8_t *m, size_t mlen,
+                          const uint8_t *sk);
+
+int crypto_sign(uint8_t *sm, size_t *smlen,
+                const uint8_t *m, size_t mlen,
+                const uint8_t *sk);
+
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+                       const uint8_t *m, size_t mlen,
+                       const uint8_t *pk);
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+                     const uint8_t *sm, size_t smlen,
+                     const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/basemul_257.S b/crypto_sign/dilithium2/m4f/basemul_257.S
new file mode 100644
index 0000000..da647d8
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/basemul_257.S
@@ -0,0 +1,91 @@
+#include "macros_fnt.i"
+
+.syntax unified
+.cpu cortex-m4
+
+// Per input word from r1: keep the low halfword, replace the high halfword by barrett_32(high * twiddle); twiddle sign alternates
+.align 2
+.global __asm_point_mul_257_16
+.type __asm_point_mul_257_16, %function
+__asm_point_mul_257_16:
+    push.w {r4-r11, lr}
+    ldr.w r14, [sp, #36] // word just above the 9 pushed registers; presumably the stack-passed fifth argument (twiddle table)
+
+    .equ width, 4
+
+    add.w r12, r14, #64*width // loop ends after 64 twiddle words
+    _point_mul_16_loop:
+
+    ldr.w r7, [r1, #2*width]
+    ldr.w r8, [r1, #3*width]
+    ldr.w r9, [r14, #1*width]
+    ldr.w r5, [r1, #1*width]
+    ldr.w r4, [r1], #4*width // four input words per pass
+    ldr.w r6, [r14], #2*width // two twiddle words per pass
+
+    smultb r10, r4, r6 // top halfword of r4 times bottom halfword of r6
+    barrett_32 r10, r2, r3, r11 // r2/r3 are forwarded untouched; presumably reduction constant and modulus
+    pkhbt r4, r4, r10, lsl #16 // bottom halfword unchanged, top halfword = low 16 bits of r10
+
+    neg.w r6, r6
+
+    smultb r10, r5, r6
+    barrett_32 r10, r2, r3, r11
+    pkhbt r5, r5, r10, lsl #16
+
+    str.w r5, [r0, #1*width]
+    str.w r4, [r0], #2*width
+
+    smultb r10, r7, r9
+    barrett_32 r10, r2, r3, r11
+    pkhbt r7, r7, r10, lsl #16
+
+    neg.w r9, r9
+
+    smultb r10, r8, r9
+    barrett_32 r10, r2, r3, r11
+    pkhbt r8, r8, r10, lsl #16
+
+    str.w r8, [r0, #1*width]
+    str.w r7, [r0], #2*width
+
+    cmp.w r14, r12
+    bne.w _point_mul_16_loop
+
+    pop.w {r4-r11, pc}
+
+
+// Writes 256 words to r0; per word from r1: even output = smuad with the r3 word, odd output = smuadx with the r2 word (unreduced)
+.align 2
+.global __asm_asymmetric_mul_257_16
+.type __asm_asymmetric_mul_257_16, %function
+__asm_asymmetric_mul_257_16:
+    push.w {r4-r11, lr}
+    .equ width, 4
+
+    add.w r12, r0, #256*width // end of the output buffer
+    _asymmetric_mul_16_loop:
+
+    ldr.w r7, [r1, #width]
+    ldr.w r4, [r1], #2*width
+    ldr.w r8, [r2, #width]
+    ldr.w r5, [r2], #2*width
+    ldr.w r9, [r3, #width]
+    ldr.w r6, [r3], #2*width
+
+    smuad r10, r4, r6 // bottom*bottom + top*top
+    smuadx r11, r4, r5 // bottom*top + top*bottom
+
+    str.w r11, [r0, #width]
+    str.w r10, [r0], #2*width
+
+    smuad r10, r7, r9
+    smuadx r11, r7, r8
+
+    str.w r11, [r0, #width]
+    str.w r10, [r0], #2*width
+
+    cmp.w r0, r12
+    bne.w _asymmetric_mul_16_loop
+
+    pop.w {r4-r11, pc}
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4f/config.h b/crypto_sign/dilithium2/m4f/config.h
new file mode 100644
index 0000000..298a707
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/config.h
@@ -0,0 +1,7 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+
+#define DILITHIUM_MODE 2
+// #define SIGN_STACKSTRATEGY 2
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/fnt_257.S b/crypto_sign/dilithium2/m4f/fnt_257.S
new file mode 100644
index 0000000..545883b
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/fnt_257.S
@@ -0,0 +1,145 @@
+#include "macros_fnt.i"
+.macro final_butterfly c0, c1f, twiddle, c0out, c1, qprime, q, tmp
+    // c0out = c0 + c1f*twiddle, c1 = c0 - c1f*twiddle, both then passed through barrett_32; c1f and twiddle come from FPU registers
+    vmov.w \c1, \c1f
+    vmov.w \tmp, \twiddle
+    mla \c0out, \c1, \tmp, \c0
+    mls \c1, \c1, \tmp, \c0
+
+    barrett_32 \c0out, \qprime, \q, \tmp
+    barrett_32 \c1, \qprime, \q, \tmp
+.endm
+
+
+.syntax unified
+.cpu cortex-m4
+
+// Three passes over the 256-word buffer at r0: layers 0-2, layers 3-6 (twiddles streamed from the r1 table), then in-place narrowing to 16 bit
+.align 2
+.global __asm_fnt_257
+.type __asm_fnt_257, %function
+__asm_fnt_257:
+    push.w {r4-r11, lr}
+    vpush.w {s16-s27}
+    vmov.w s27, r1 // keep the twiddle-table pointer in s27; r1 is used as scratch below
+
+    .equ width, 4
+
+    add.w r12, r0, #32*width
+    _fnt_0_1_2:
+
+    ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(32*0*width), #(32*1*width), #(32*2*width), #(32*3*width), #(32*4*width), #(32*5*width), #(32*6*width), #(32*7*width)
+
+    FNT_CT_butterfly  r4,  r8, 4
+    FNT_CT_butterfly  r5,  r9, 4
+    FNT_CT_butterfly  r6, r10, 4
+    FNT_CT_butterfly  r7, r11, 4
+
+    FNT_CT_butterfly  r4,  r6, 2
+    FNT_CT_butterfly  r5,  r7, 2
+    FNT_CT_butterfly  r8, r10, 6
+    FNT_CT_butterfly  r9, r11, 6
+
+    FNT_CT_butterfly  r4, r5, 1
+    FNT_CT_butterfly  r6, r7, 5
+    FNT_CT_butterfly  r8, r9, 3
+    FNT_CT_butterfly  r10, r11, 7
+
+    ldrstrvecjump str.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(32*1*width), #(32*2*width), #(32*3*width), #(32*4*width), #(32*5*width), #(32*6*width), #(32*7*width), #width
+
+    cmp.w r0, r12 // presumably r0 was advanced by one word by ldrstrvecjump (macro not shown here)
+    bne.w _fnt_0_1_2
+
+    sub.w r0, r0, #32*width // rewind to the start of the buffer
+
+    add.w r12, r0, #256*width
+    vmov.w s25, r12 // end pointer lives in s25 because r12 is clobbered inside the loop
+    _fnt_3_4_5_6:
+
+        vmov r1, s27
+        vldm.w r1!, {s2-s16} // 15 twiddle words for this 32-word group
+        vmov s27, r1
+
+        .rept 2
+            ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(4*0*width+2*width), #(4*1*width+2*width), #(4*2*width+2*width), #(4*3*width+2*width), #(4*4*width+2*width), #(4*5*width+2*width), #(4*6*width+2*width), #(4*7*width+2*width)
+
+            _3_layer_CT_32_FNT r4, r5, r6, r7, r8, r9, r10, r11, s2, s3, s4, s5, s6, s7, s8, r14, r2, r3, r1, r12
+
+            vmov.w s17, s18, r4, r5 // a1, a3
+            vmov.w s19, s20, r6, r7 // a5, a7
+            vmov.w s21, s22, r8, r9 // a9, a11
+            vmov.w s23, s24, r10, r11 // a13, a15
+
+            ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(4*0*width), #(4*1*width), #(4*2*width), #(4*3*width), #(4*4*width), #(4*5*width), #(4*6*width), #(4*7*width)
+
+            _3_layer_CT_32_FNT r4, r5, r6, r7, r8, r9, r10, r11, s2, s3, s4, s5, s6, s7, s8, r14, r2, r3, r1, r12
+
+            final_butterfly r5, s18, s10, r1, r12, r2, r3, r14
+            str.w r12, [r0, #(4*1*width+2*width)]
+            str.w r1, [r0, #(4*1*width)]
+
+            final_butterfly r6, s19, s11, r1, r12, r2, r3, r14
+            str.w r12, [r0, #(4*2*width+2*width)]
+            str.w r1, [r0, #(4*2*width)]
+
+            final_butterfly r7, s20, s12, r1, r12, r2, r3, r14
+            str.w r12, [r0, #(4*3*width+2*width)]
+            str.w r1, [r0, #(4*3*width)]
+
+            final_butterfly r8, s21, s13, r1, r12, r2, r3, r14
+            str.w r12, [r0, #(4*4*width+2*width)]
+            str.w r1, [r0, #(4*4*width)]
+
+            final_butterfly r9, s22, s14, r1, r12, r2, r3, r14
+            str.w r12, [r0, #(4*5*width+2*width)]
+            str.w r1, [r0, #(4*5*width)]
+
+            final_butterfly r10, s23, s15, r1, r12, r2, r3, r14
+            str.w r12, [r0, #(4*6*width+2*width)]
+            str.w r1, [r0, #(4*6*width)]
+
+            final_butterfly r11, s24, s16, r1, r12, r2, r3, r14
+            str.w r12, [r0, #(4*7*width+2*width)]
+            str.w r1, [r0, #(4*7*width)]
+
+            final_butterfly r4, s17, s9, r1, r12, r2, r3, r14
+            str.w r12, [r0, #(4*0*width+2*width)]
+            str.w r1, [r0], #width
+        .endr
+        add.w r0, #((32-2)*width) // with the two single-word steps above: 32 words per outer pass
+
+    vmov.w r12, s25
+    cmp.w r0, r12
+    bne.w _fnt_3_4_5_6
+
+    # switch to 16-bit representation
+    sub.w r0, r0, #256*width
+    mov.w r1, r0 // r1 = 16-bit write cursor, r0 = 32-bit read cursor over the same buffer
+    _fnt_to_16_bit:
+        ldr.w r3, [r0, #1*width]
+        ldr.w r4, [r0, #2*width]
+        ldr.w r5, [r0, #3*width]
+        ldr.w r6, [r0, #4*width]
+        ldr.w r7, [r0, #5*width]
+        ldr.w r8, [r0, #6*width]
+        ldr.w r9, [r0, #7*width]
+        ldr.w r2, [r0], #8*width
+        strh.w r3, [r1, #1*2]
+        strh.w r4, [r1, #2*2]
+        strh.w r5, [r1, #3*2]
+        strh.w r6, [r1, #4*2]
+        strh.w r7, [r1, #5*2]
+        strh.w r8, [r1, #6*2]
+        strh.w r9, [r1, #7*2]
+        strh.w r2, [r1], #8*2
+        cmp.w r0, r12 // r12 still holds the end pointer reloaded from s25
+        bne.w _fnt_to_16_bit
+
+    vpop.w {s16-s27}
+    pop.w {r4-r11, pc}
+
+
+
+
+
+
diff --git a/crypto_sign/dilithium2/m4f/ifnt_257.S b/crypto_sign/dilithium2/m4f/ifnt_257.S
new file mode 100644
index 0000000..1c51165
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/ifnt_257.S
@@ -0,0 +1,306 @@
+#include "macros_fnt.i"
+.macro final_butterfly c0, c1, c1f, twiddle // c0 <- c0 + c1f ; c1 <- (c0 - c1f) * twiddle (callers pass an s-register as c1f)
+    vmov.w \c1, \c1f // fetch the second operand from the register named by c1f
+    add.w \c0, \c1 // c0 = a + b
+    sub.w \c1, \c0, \c1, lsl#1 // c1 = (a + b) - 2*b = a - b
+    mul.w \c1, \twiddle // scale the difference; no modular reduction is done here
+.endm
+
+.macro final_butterfly2 c0, c0out, c1, c1f, twiddle, twiddle2 // c0out <- c0 + twiddle2*c1f ; c1 <- (c0 - twiddle2*c1f) * twiddle
+    vmov.w \c1, \c1f // fetch the second operand from the register named by c1f
+    mla.w \c0out, \twiddle2, \c1, \c0 // sum output goes to a separate register, c0 is left intact
+    mls.w \c1, \twiddle2, \c1, \c0 // difference output overwrites c1
+    mul.w \c1, \twiddle // scale the difference; no modular reduction is done here
+.endm
+
+.syntax unified
+.cpu cortex-m4
+.align 2
+.global __asm_ifnt_257
+.type __asm_ifnt_257, %function
+__asm_ifnt_257: // in: r0 = 256 words, transformed in place; r1 = twiddle table; r2/r3 = Qbar/Q operands handed to barrett_32
+    push.w {r4-r11, lr}
+    vpush.w {s16-s24} // s17-s24 are used below to park intermediate values
+
+    .equ width, 4 // bytes per coefficient while inside this routine
+
+    add.w r12, r0, #256*width // r12 = address one past the last word
+    vmov.w s1, r12 // keep the end address in s1 for the loop test
+    _ifnt_7_6_5_4: // each pass covers 32 consecutive words as two interleaved groups of 16 (stride 2)
+
+        vldm.w r1!, {s2-s16} // fetch the 15 twiddles of this pass; r1 advances to the next set
+
+// ======== first group: the 16 words at offsets 2*k*width, k = 0..15 ========
+
+            ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(2*8*width), #(2*9*width), #(2*10*width), #(2*11*width), #(2*12*width), #(2*13*width), #(2*14*width), #(2*15*width) // upper half, k = 8..15
+
+            addSub4 r4, r5, r6, r7, r8, r9, r10, r11 // sum/difference of the pairs (r4,r5) (r6,r7) (r8,r9) (r10,r11)
+            vmov.w r14, s6
+            mul.w r5, r5, r14 // first difference scaled by s6
+            vmov.w r14, s8
+            mul.w r9, r9, r14 // third difference scaled by s8
+            addSub2 r4, r6, r8, r10 // next layer on the sums: pairs (r4,r6) and (r8,r10)
+            vmov.w r14, s7
+            mla.w r12, r7, r14, r5 // r12 = r5 + r7*s7 (r12 takes over the role of r5)
+            mls.w r7, r7, r14, r5 // r7  = r5 - r7*s7
+            vmov.w r14, s9
+            mla.w r5, r11, r14, r9 // r5  = r9 + r11*s9 (r5 takes over the role of r9)
+            mls.w r11, r11, r14, r9 // r11 = r9 - r11*s9
+
+            // r4, r12, r6, r7, r8, r5, r10, r11
+
+            vmov.w r14, s12
+            mul.w r6, r6, r14
+            mul.w r7, r7, r14
+            vmov.w r14, s13
+            mul.w r10, r10, r14
+            mul.w r11, r11, r14
+
+    barrett_32 r4, r2, r3, r14 // reduce all eight values with barrett_32; r14 is scratch
+    barrett_32 r12, r2, r3, r14
+    barrett_32 r6, r2, r3, r14
+    barrett_32 r7, r2, r3, r14
+    barrett_32 r8, r2, r3, r14
+    barrett_32 r5, r2, r3, r14
+    barrett_32 r10, r2, r3, r14
+    barrett_32 r11, r2, r3, r14
+
+            addSub4 r4, r8, r6, r10, r12, r5, r7, r11 // third layer: first four values against the last four
+
+            vmov.w s17, s18, r4, r12 // park the eight upper-half results in s17..s24
+            vmov.w s19, s20, r6, r7
+            vmov.w s21, s22, r8, r5
+            vmov.w s23, s24, r10, r11
+
+            ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(2*0*width), #(2*1*width), #(2*2*width), #(2*3*width), #(2*4*width), #(2*5*width), #(2*6*width), #(2*7*width) // lower half, k = 0..7
+
+            addSub4 r4, r5, r6, r7, r8, r9, r10, r11 // same three layers as above, with twiddles s2-s5, s10, s11
+            vmov.w r14, s2
+            mul.w r5, r5, r14
+            vmov.w r14, s4
+            mul.w r9, r9, r14
+            addSub2 r4, r6, r8, r10
+            vmov.w r14, s3
+            mla.w r12, r7, r14, r5
+            mls.w r7, r7, r14, r5
+            vmov.w r14, s5
+            mla.w r5, r11, r14, r9
+            mls.w r11, r11, r14, r9
+
+            // r4, r12, r6, r7, r8, r5, r10, r11
+
+            vmov.w r14, s10
+            mul.w r6, r6, r14
+            mul.w r7, r7, r14
+            vmov.w r14, s11
+            mul.w r10, r10, r14
+            mul.w r11, r11, r14
+
+    barrett_32 r4, r2, r3, r14
+    barrett_32 r12, r2, r3, r14
+    barrett_32 r6, r2, r3, r14
+    barrett_32 r7, r2, r3, r14
+    barrett_32 r8, r2, r3, r14
+    barrett_32 r5, r2, r3, r14
+    barrett_32 r10, r2, r3, r14
+    barrett_32 r11, r2, r3, r14
+
+            addSub4 r4, r8, r6, r10, r12, r5, r7, r11
+            vmov.w r14, s14
+            mul.w r8, r8, r14 // the four difference outputs are scaled by s14 before the last layer
+            mul.w r5, r5, r14
+            mul.w r10, r10, r14
+            mul.w r11, r11, r14
+            vmov.w r14, s16 // s16 scales every difference output of the last layer
+            final_butterfly r12, r9, s18, r14 // last layer: lower-half value against its parked upper-half partner
+            str.w r12, [r0, #(2*1*width)]
+            str.w r9, [r0, #(2*9*width)]
+            final_butterfly r6, r9, s19, r14
+            str.w r6, [r0, #(2*2*width)]
+            str.w r9, [r0, #(2*10*width)]
+            final_butterfly r7, r9, s20, r14
+            str.w r7, [r0, #(2*3*width)]
+            str.w r9, [r0, #(2*11*width)]
+            vmov.w r12, s15 // s15 pre-scales the parked partners s21..s24
+            final_butterfly2 r8, r6, r9, s21, r14, r12
+            str.w r6, [r0, #(2*4*width)]
+            str.w r9, [r0, #(2*12*width)]
+            final_butterfly2 r5, r6, r9, s22, r14, r12
+            str.w r6, [r0, #(2*5*width)]
+            str.w r9, [r0, #(2*13*width)]
+            final_butterfly2 r10, r6, r9, s23, r14, r12
+            str.w r6, [r0, #(2*6*width)]
+            str.w r9, [r0, #(2*14*width)]
+            final_butterfly2 r11, r6, r9, s24, r14, r12
+            str.w r6, [r0, #(2*7*width)]
+            str.w r9, [r0, #(2*15*width)]
+            final_butterfly r4, r9, s17, r14
+            str.w r9, [r0, #(2*8*width)]
+            str.w r4, [r0], #width // store k = 0, then step one word to reach the second group
+
+// ======== second group: same code on the words one position further ========
+
+            ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(2*8*width), #(2*9*width), #(2*10*width), #(2*11*width), #(2*12*width), #(2*13*width), #(2*14*width), #(2*15*width)
+
+            addSub4 r4, r5, r6, r7, r8, r9, r10, r11
+            vmov.w r14, s6
+            mul.w r5, r5, r14
+            vmov.w r14, s8
+            mul.w r9, r9, r14
+            addSub2 r4, r6, r8, r10
+            vmov.w r14, s7
+            mla.w r12, r7, r14, r5
+            mls.w r7, r7, r14, r5
+            vmov.w r14, s9
+            mla.w r5, r11, r14, r9
+            mls.w r11, r11, r14, r9
+
+            // r4, r12, r6, r7, r8, r5, r10, r11
+
+            vmov.w r14, s12
+            mul.w r6, r6, r14
+            mul.w r7, r7, r14
+            vmov.w r14, s13
+            mul.w r10, r10, r14
+            mul.w r11, r11, r14
+
+    barrett_32 r4, r2, r3, r14
+    barrett_32 r12, r2, r3, r14
+    barrett_32 r6, r2, r3, r14
+    barrett_32 r7, r2, r3, r14
+    barrett_32 r8, r2, r3, r14
+    barrett_32 r5, r2, r3, r14
+    barrett_32 r10, r2, r3, r14
+    barrett_32 r11, r2, r3, r14
+
+            addSub4 r4, r8, r6, r10, r12, r5, r7, r11
+
+            vmov.w s17, s18, r4, r12
+            vmov.w s19, s20, r6, r7
+            vmov.w s21, s22, r8, r5
+            vmov.w s23, s24, r10, r11
+
+            ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(2*0*width), #(2*1*width), #(2*2*width), #(2*3*width), #(2*4*width), #(2*5*width), #(2*6*width), #(2*7*width)
+
+            addSub4 r4, r5, r6, r7, r8, r9, r10, r11
+            vmov.w r14, s2
+            mul.w r5, r5, r14
+            vmov.w r14, s4
+            mul.w r9, r9, r14
+            addSub2 r4, r6, r8, r10
+            vmov.w r14, s3
+            mla.w r12, r7, r14, r5
+            mls.w r7, r7, r14, r5
+            vmov.w r14, s5
+            mla.w r5, r11, r14, r9
+            mls.w r11, r11, r14, r9
+
+            // r4, r12, r6, r7, r8, r5, r10, r11
+
+            vmov.w r14, s10
+            mul.w r6, r6, r14
+            mul.w r7, r7, r14
+            vmov.w r14, s11
+            mul.w r10, r10, r14
+            mul.w r11, r11, r14
+
+    barrett_32 r4, r2, r3, r14
+    barrett_32 r12, r2, r3, r14
+    barrett_32 r6, r2, r3, r14
+    barrett_32 r7, r2, r3, r14
+    barrett_32 r8, r2, r3, r14
+    barrett_32 r5, r2, r3, r14
+    barrett_32 r10, r2, r3, r14
+    barrett_32 r11, r2, r3, r14
+
+            addSub4 r4, r8, r6, r10, r12, r5, r7, r11
+            vmov.w r14, s14
+            mul.w r8, r8, r14
+            mul.w r5, r5, r14
+            mul.w r10, r10, r14
+            mul.w r11, r11, r14
+            vmov.w r14, s16
+
+            final_butterfly r12, r9, s18, r14
+            str.w r12, [r0, #(2*1*width)]
+            str.w r9, [r0, #(2*9*width)]
+            final_butterfly r6, r9, s19, r14
+            str.w r6, [r0, #(2*2*width)]
+            str.w r9, [r0, #(2*10*width)]
+            final_butterfly r7, r9, s20, r14
+            str.w r7, [r0, #(2*3*width)]
+            str.w r9, [r0, #(2*11*width)]
+            vmov.w r12, s15
+            final_butterfly2 r8, r6, r9, s21, r14, r12
+            str.w r6, [r0, #(2*4*width)]
+            str.w r9, [r0, #(2*12*width)]
+            final_butterfly2 r5, r6, r9, s22, r14, r12
+            str.w r6, [r0, #(2*5*width)]
+            str.w r9, [r0, #(2*13*width)]
+            final_butterfly2 r10, r6, r9, s23, r14, r12
+            str.w r6, [r0, #(2*6*width)]
+            str.w r9, [r0, #(2*14*width)]
+            final_butterfly2 r11, r6, r9, s24, r14, r12
+            str.w r6, [r0, #(2*7*width)]
+            str.w r9, [r0, #(2*15*width)]
+            final_butterfly r4, r9, s17, r14
+            str.w r9, [r0, #(2*8*width)]
+            str.w r4, [r0], #31*width // 1 + 31 words: r0 now points at the next 32-word chunk
+
+// ======== loop test: stop once r0 reaches the end address saved in s1 ========
+
+    vmov.w r12, s1
+    cmp.w r0, r12
+    bne.w _ifnt_7_6_5_4
+
+    sub.w r0, r0, #256*width // rewind to the start of the array
+
+    mov.w r14, #0 // constant zero, used below to form -2*r4
+
+    add.w r1, r0, #32*width // end address for the last loop: 32 columns, two per pass
+    _ifnt_0_1_2: // remaining three layers; twiddles are applied as left shifts
+
+.rept 2
+
+    ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(32*0*width), #(32*1*width), #(32*2*width), #(32*3*width), #(32*4*width), #(32*5*width), #(32*6*width), #(32*7*width) // eight words at a stride of 32 words
+
+    addSub4 r4, r5, r6, r7, r8, r9, r10, r11 // first of the three layers: plain sum/difference
+
+    addSub2 r4, r6, r8, r10 // second layer, sums
+    FNT_CT_ibutterfly r5, r7, 4 // second layer, differences: partner shifted left by 4
+    FNT_CT_ibutterfly r9, r11, 4
+
+    addSub1 r4, r8 // third layer
+    barrett_32 r9, r2, r3, r12 // reduce r9 before it is shifted by 6 below
+    FNT_CT_ibutterfly r5, r9, 6
+    FNT_CT_ibutterfly r6, r10, 4
+    FNT_CT_ibutterfly r7, r11, 2
+
+    barrett_32 r6, r2, r3, r12 // reduce r6 and r7 before the large shifts below
+    barrett_32 r7, r2, r3, r12
+    sub.w r4, r14, r4, lsl #1 // r4 = 0 - 2*r4
+    neg.w r5, r5 // r5 = -r5
+    lsl.w r6, r6, #7 // final per-output scaling by powers of two
+    lsl.w r7, r7, #6
+    lsl.w r8, r8, #5
+    lsl.w r9, r9, #4
+    lsl.w r10, r10, #3
+    lsl.w r11, r11, #2
+
+    barrett_32 r4, r2, r3, r12 // final reduction of all eight outputs; r12 is scratch
+    barrett_32 r5, r2, r3, r12
+    barrett_32 r6, r2, r3, r12
+    barrett_32 r7, r2, r3, r12
+    barrett_32 r8, r2, r3, r12
+    barrett_32 r9, r2, r3, r12
+    barrett_32 r10, r2, r3, r12
+    barrett_32 r11, r2, r3, r12
+
+    ldrstrvecjump str.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #(32*1*width), #(32*2*width), #(32*3*width), #(32*4*width), #(32*5*width), #(32*6*width), #(32*7*width), #width // store all eight; r4 goes to offset 0 and r0 advances one word
+
+.endr
+
+    cmp.w r0, r1
+    bne.w _ifnt_0_1_2
+    vpop.w {s16-s24}
+    pop.w {r4-r11, pc}
diff --git a/crypto_sign/dilithium2/m4f/macros.i b/crypto_sign/dilithium2/m4f/macros.i
new file mode 100644
index 0000000..25d98c2
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/macros.i
@@ -0,0 +1,191 @@
+#ifndef MACROS_I
+#define MACROS_I
+// 3
+.macro montgomery_mul_32 a, b, Qprime, Q, tmp, tmp2 // a <- high word of (a*b + ((a*b mod 2^32) * Qprime mod 2^32) * Q); clobbers tmp, tmp2
+    smull \tmp, \a, \a, \b // signed 64-bit product: tmp = low word, a = high word
+    mul \tmp2, \tmp, \Qprime // tmp2 = low word * Qprime, truncated to 32 bits
+    smlal \tmp, \a, \tmp2, \Q // accumulate tmp2*Q into tmp:a; the result is the high word left in a
+.endm
+
+// 2
+.macro addSub1 c0, c1 // (c0, c1) <- (c0 + c1, c0 - c1), no reduction
+    add.w \c0, \c1 // c0 = a + b
+    sub.w \c1, \c0, \c1, lsl #1 // c1 = (a + b) - 2*b = a - b
+.endm
+
+// 3
+.macro addSub2 c0, c1, c2, c3 // sum/difference on the pairs (c0,c1) and (c2,c3), no reduction
+    add \c0, \c1 // both sums first ...
+    add \c2, \c3
+    sub.w \c1, \c0, \c1, lsl #1 // ... then each difference as sum - 2*b
+    sub.w \c3, \c2, \c3, lsl #1
+.endm
+
+// 6
+.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7 // sum/difference on the pairs (c0,c1) (c2,c3) (c4,c5) (c6,c7), no reduction
+    add \c0, \c1 // all four sums first ...
+    add \c2, \c3
+    add \c4, \c5
+    add \c6, \c7
+    sub.w \c1, \c0, \c1, lsl #1 // ... then each difference as sum - 2*b
+    sub.w \c3, \c2, \c3, lsl #1
+    sub.w \c5, \c4, \c5, lsl #1
+    sub.w \c7, \c6, \c7, lsl #1
+.endm
+
+.macro _2_layer_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2 // two butterfly layers on four values, multiply before add/sub; clobbers tmp, tmp2
+    montgomery_mul_32 \c2, \zeta0, \Qprime, \Q, \tmp, \tmp2 // layer 1: c2 and c3 are scaled by zeta0
+    montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c2, \c1, \c3 // pairs (c0,c2) and (c1,c3)
+
+    montgomery_mul_32 \c1, \zeta1, \Qprime, \Q, \tmp, \tmp2 // layer 2: c1 scaled by zeta1, c3 by zeta2
+    montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c1, \c2, \c3 // pairs (c0,c1) and (c2,c3)
+.endm
+
+.macro _2_layer_inv_CT_32 c0, c1, c2, c3, zeta0, zeta1, zeta2, Qprime, Q, tmp, tmp2 // two butterfly layers on four values, adjacent pairs first; clobbers tmp, tmp2
+    montgomery_mul_32 \c1, \zeta0, \Qprime, \Q, \tmp, \tmp2 // layer 1: c1 and c3 are scaled by zeta0
+    montgomery_mul_32 \c3, \zeta0, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c1, \c2, \c3 // pairs (c0,c1) and (c2,c3)
+
+    montgomery_mul_32 \c2, \zeta1, \Qprime, \Q, \tmp, \tmp2 // layer 2: c2 scaled by zeta1, c3 by zeta2
+    montgomery_mul_32 \c3, \zeta2, \Qprime, \Q, \tmp, \tmp2
+    addSub2 \c0, \c2, \c1, \c3 // pairs (c0,c2) and (c1,c3)
+.endm
+
+.macro _3_layer_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 // three butterfly layers on eight values; xi0-xi6 are moved into twiddle with vmov
+    vmov.w \twiddle, \xi0 // layer 1: c4..c7 are scaled by xi0
+    montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 // pairs at distance 4
+
+    vmov.w \twiddle, \xi1 // layer 2: c2, c3 scaled by xi1
+    montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi2 // layer 2: c6, c7 scaled by xi2
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7 // pairs at distance 2
+
+    vmov.w \twiddle, \xi3 // layer 3: c1, c3, c5, c7 scaled by xi3, xi4, xi5, xi6 respectively
+    montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi4
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi5
+    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi6
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 // pairs at distance 1
+.endm
+
+.macro _3_layer_inv_CT_32 c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 // three butterfly layers on eight values, adjacent pairs first; xi0-xi6 are moved into twiddle with vmov
+    vmov.w \twiddle, \xi0 // layer 1: c1, c3, c5, c7 are scaled by xi0
+    montgomery_mul_32 \c1, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 // pairs at distance 1
+
+    vmov.w \twiddle, \xi1 // layer 2: c2, c6 scaled by xi1
+    montgomery_mul_32 \c2, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi2 // layer 2: c3, c7 scaled by xi2
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c2, \c1, \c3, \c4, \c6, \c5, \c7 // pairs at distance 2
+
+    vmov.w \twiddle, \xi3 // layer 3: c4, c5, c6, c7 scaled by xi3, xi4, xi5, xi6 respectively
+    montgomery_mul_32 \c4, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi4
+    montgomery_mul_32 \c5, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi5
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    vmov.w \twiddle, \xi6
+    montgomery_mul_32 \c7, \twiddle, \Qprime, \Q, \tmp, \tmp2
+    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 // pairs at distance 4
+.endm
+
+/************************************************************
+* Name:         _3_layer_inv_butterfly_light_fast_first
+*
+* Description:  upper half of 3-layer inverse butterfly
+*               defined over X^8 - 1
+*
+* Input:        (c4, c1, c6, c3) = coefficients on the upper half;
+*               (xi0, xi1, xi2, xi3, xi4, xi5, xi6) =
+*               (  1,  1,  w_4,   1, w_8, w_4, w_8^3) in
+*               Montgomery domain (only xi4 and xi6 are read here)
+*
+* Symbols:      R = 2^32
+*
+* Constants:    Qprime = -MOD^{-1} mod^{+-} R, Q = MOD
+*
+* Output:       (c0, c2, twiddle, tmp, tmp2 are clobbered)
+*               c4 =  c4 + c1        + (c6 + c3)
+*               c5 = (c4 - c1) xi4   + (c6 - c3) xi6
+*               c6 =  c4 + c1        - (c6 + c3)
+*               c7 = (c4 - c1) xi6   + (c6 - c3) xi4
+************************************************************/
+// 15
+.macro _3_layer_inv_butterfly_light_fast_first c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
+    addSub2 \c4, \c1, \c6, \c3 // c4, c6 become the pair sums; c1, c3 become the pair differences
+    addSub1 \c4, \c6 // c4 = sum of both pair sums, c6 = their difference (no multiplication needed)
+
+    vmov.w \tmp, \xi4 // both twiddles are kept in core registers for the two dot products below
+    vmov.w \tmp2, \xi6
+
+    smull.w \c0, \c5, \c1, \tmp // c0:c5 = c1*xi4 (c0 is only the low-word scratch)
+    smlal.w \c0, \c5, \c3, \tmp2 // ... + c3*xi6, accumulated in 64 bits
+    mul.w \twiddle, \c0, \Qprime // one Montgomery reduction for the whole sum
+    smlal.w \c0, \c5, \twiddle, \Q // c5 = reduced high word
+
+    smull.w \c2, \c7, \c1, \tmp2 // c2:c7 = c1*xi6 (c2 is only the low-word scratch)
+    smlal.w \c2, \c7, \c3, \tmp // ... + c3*xi4
+    mul.w \twiddle, \c2, \Qprime
+    smlal.w \c2, \c7, \twiddle, \Q // c7 = reduced high word
+.endm
+
+/************************************************************
+* Name:         _3_layer_inv_butterfly_light_fast_second
+*
+* Description:  lower half of 3-layer inverse butterfly
+*               defined over X^8 - 1, and the 2nd
+*               layer of butterflies
+*
+* Input:
+*               (c4, c5, c6, c7) = results of the upper half;
+*               (c0, c1, c2, c3) = coefficients on the lower half;
+*               (xi0, xi1, xi2, xi3, xi4, xi5, xi6) =
+*               (  1,  1,  w_4,   1, w_8, w_4, w_8^3) in
+*               Montgomery domain
+*
+* Symbols:      R = 2^32
+*
+* Constants:    Qprime = -MOD^{-1} mod^{+-} R, Q = MOD
+*
+* Output:       (normal order)
+*               c0 =   c0 + c1     + (c2 + c3)         + (  c4 + c5     + (c6 + c7)       )
+*               c1 =  (c0 - c1) w3 + (c2 - c3)  w4     + ( (c4 - c5) w5 + (c6 - c7) w6    )
+*               c2 = ( c0 + c1     - (c2 + c3)) w1     + (( c4 + c5     - (c6 + c7)   ) w2)
+*               c3 = ((c0 - c1) w3 - (c2 - c3)  w4) w1 + (((c4 - c5) w5 - (c6 - c7) w6) w2)
+*               c4 =   c0 + c1     - (c2 + c3)         - (  c4 + c5     + (c6 + c7)       ) w0
+*               c5 =  (c0 - c1) w3 + (c2 - c3)  w4     - ( (c4 - c5) w5 + (c6 - c7) w6    ) w0
+*               c6 = ( c0 + c1     - (c2 + c3)) w1     - (( c4 + c5     - (c6 + c7)   ) w2) w0
+*               c7 = ((c0 - c1) w3 - (c2 - c3)  w4) w1 - (((c4 - c5) w5 - (c6 - c7) w6) w2) w0
+************************************************************/
+// 19
+.macro _3_layer_inv_butterfly_light_fast_second c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2
+    addSub2 \c0, \c1, \c2, \c3 // lower half, layer 1: pairs (c0,c1) and (c2,c3), no multiplication
+
+    vmov.w \twiddle, \xi2 // xi2 is the only twiddle this macro reads
+    montgomery_mul_32 \c3, \twiddle, \Qprime, \Q, \tmp, \tmp2 // lower half, layer 2: only c3 is multiplied
+    addSub2 \c0, \c2, \c1, \c3 // pairs (c0,c2) and (c1,c3)
+
+    montgomery_mul_32 \c6, \twiddle, \Qprime, \Q, \tmp, \tmp2 // upper-half value c6 is scaled by xi2 before the last layer
+
+    addSub4 \c0, \c4, \c1, \c5, \c2, \c6, \c3, \c7 // last layer: each lower value against its upper partner
+.endm
+
+#endif /* MACROS_I */
diff --git a/crypto_sign/dilithium2/m4f/macros_fnt.i b/crypto_sign/dilithium2/m4f/macros_fnt.i
new file mode 100644
index 0000000..25903e4
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/macros_fnt.i
@@ -0,0 +1,158 @@
+// 2
+.macro ldrstr2 ldrstr, target, c0, c1, mem0, mem1 // apply the load/store mnemonic ldrstr to two registers at [target, memN]
+    \ldrstr \c0, [\target, \mem0] // target itself is not modified
+    \ldrstr \c1, [\target, \mem1]
+.endm
+
+// 2
+.macro ldrstr2jump ldrstr, target, c0, c1, mem1, jump // as ldrstr2, but c0 uses offset 0 and target is post-incremented by jump
+    \ldrstr \c1, [\target, \mem1] // offset access first, while target is still unchanged
+    \ldrstr \c0, [\target], \jump // access at [target], then target += jump
+.endm
+
+// 4
+.macro ldrstr4 ldrstr, target, c0, c1, c2, c3, mem0, mem1, mem2, mem3 // apply the load/store mnemonic ldrstr to four registers at [target, memN]
+    \ldrstr \c0, [\target, \mem0] // target itself is not modified
+    \ldrstr \c1, [\target, \mem1]
+    \ldrstr \c2, [\target, \mem2]
+    \ldrstr \c3, [\target, \mem3]
+.endm
+
+// 4
+.macro ldrstr4jump ldrstr, target, c0, c1, c2, c3, mem1, mem2, mem3, jump // as ldrstr4, but c0 uses offset 0 and target is post-incremented by jump
+    \ldrstr \c1, [\target, \mem1] // offset accesses first, while target is still unchanged
+    \ldrstr \c2, [\target, \mem2]
+    \ldrstr \c3, [\target, \mem3]
+    \ldrstr \c0, [\target], \jump // access at [target], then target += jump
+.endm
+
+// 8
+.macro ldrstrvec ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem0, mem1, mem2, mem3, mem4, mem5, mem6, mem7 // load or store eight registers at eight offsets from target
+    ldrstr4 \ldrstr, \target, \c0, \c1, \c2, \c3, \mem0, \mem1, \mem2, \mem3 // registers c0..c3
+    ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7 // registers c4..c7
+.endm
+
+// 8
+.macro ldrstrvecjump ldrstr, target, c0, c1, c2, c3, c4, c5, c6, c7, mem1, mem2, mem3, mem4, mem5, mem6, mem7, jump // as ldrstrvec, but c0 uses offset 0 and target is post-incremented by jump
+    ldrstr4 \ldrstr, \target, \c4, \c5, \c6, \c7, \mem4, \mem5, \mem6, \mem7 // c4..c7 first, target still unchanged
+    ldrstr4jump \ldrstr, \target, \c0, \c1, \c2, \c3, \mem1, \mem2, \mem3, \jump // c1..c3, then c0 with the post-increment
+.endm
+
+
+
+.macro addSub1 c0, c1 // (c0, c1) <- (c0 + c1, c0 - c1), no reduction
+    add.w \c0, \c1 // c0 = a + b
+    sub.w \c1, \c0, \c1, lsl #1 // c1 = (a + b) - 2*b = a - b
+.endm
+
+.macro addSub2 c0, c1, c2, c3 // sum/difference on the pairs (c0,c1) and (c2,c3), no reduction
+    add \c0, \c1 // both sums first ...
+    add \c2, \c3
+    sub.w \c1, \c0, \c1, lsl #1 // ... then each difference as sum - 2*b
+    sub.w \c3, \c2, \c3, lsl #1
+.endm
+
+.macro addSub4 c0, c1, c2, c3, c4, c5, c6, c7 // sum/difference on the pairs (c0,c1) (c2,c3) (c4,c5) (c6,c7), no reduction
+    add \c0, \c1 // all four sums first ...
+    add \c2, \c3
+    add \c4, \c5
+    add \c6, \c7
+    sub.w \c1, \c0, \c1, lsl #1 // ... then each difference as sum - 2*b
+    sub.w \c3, \c2, \c3, lsl #1
+    sub.w \c5, \c4, \c5, lsl #1
+    sub.w \c7, \c6, \c7, lsl #1
+.endm
+
+// 2
+.macro barrett_32 a, Qbar, Q, tmp // a <- a - round(a*Qbar / 2^32) * Q; clobbers tmp
+    smmulr.w \tmp, \a, \Qbar // tmp = rounded top 32 bits of the signed product a*Qbar
+    mls.w \a, \tmp, \Q, \a // subtract that multiple of Q from a
+.endm
+
+.macro FNT_CT_butterfly c0, c1, logW // (c0, c1) <- (c0 + (c1 << logW), c0 - (c1 << logW)); the twiddle is the power of two 2^logW
+    add.w \c0, \c0, \c1, lsl #\logW // c0 = a + (b << logW)
+    sub.w \c1, \c0, \c1, lsl #(\logW+1) // subtracting twice the shifted b from the sum gives a - (b << logW)
+.endm
+
+.macro shift_subAdd c0, c1, shlv // (c0, c1) <- (c0 - (c1 << shlv), c0 + (c1 << shlv))
+    sub.w \c0, \c0, \c1, lsl #(\shlv) // c0 = a - (b << shlv)
+    add.w \c1, \c0, \c1, lsl #(\shlv+1) // adding twice the shifted b to the difference gives a + (b << shlv)
+.endm
+
+.macro FNT_CT_ibutterfly c0, c1, shlv // alias of shift_subAdd: difference lands in c0, sum in c1
+    shift_subAdd \c0, \c1, \shlv
+.endm
+
+// 46
+.macro _3_layer_CT_32_FNT c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle, Qprime, Q, tmp, tmp2 // three butterfly layers on eight values using mla/mls; Qprime is handed to barrett_32 as its Qbar operand
+    vmov.w \twiddle, \xi0
+
+    // layer 1 (distance 4): the sums rotate into tmp, c0, c1, c2
+    // 0,4
+    mla \tmp, \c4, \twiddle, \c0
+    mls \c4, \c4, \twiddle, \c0
+
+    // 1,5
+    mla \c0, \c5, \twiddle, \c1
+    mls \c5, \c5, \twiddle, \c1
+
+    // 2,6
+    mla \c1, \c6, \twiddle, \c2
+    mls \c6, \c6, \twiddle, \c2
+
+    // 3,7
+    mla \c2, \c7, \twiddle, \c3
+    mls \c7, \c7, \twiddle, \c3
+
+    // tmp, c0, c1, c2, c4, c5, c6, c7
+
+    barrett_32 \tmp, \Qprime, \Q, \c3 // c3 is free after layer 1 and serves as barrett scratch
+    barrett_32 \c0, \Qprime, \Q, \c3
+    barrett_32 \c1, \Qprime, \Q, \c3
+    barrett_32 \c2, \Qprime, \Q, \c3
+    barrett_32 \c4, \Qprime, \Q, \c3
+    barrett_32 \c5, \Qprime, \Q, \c3
+    barrett_32 \c6, \Qprime, \Q, \c3
+    barrett_32 \c7, \Qprime, \Q, \c3
+
+    vmov.w \twiddle, \xi1 // layer 2 (distance 2), first half uses xi1
+    // 0,2
+    mla \tmp2, \c1, \twiddle, \tmp
+    mls \c3, \c1, \twiddle, \tmp
+
+    // 1,3
+    mla \tmp, \c2, \twiddle, \c0
+    mls \c0, \c2, \twiddle, \c0
+
+    vmov.w \twiddle, \xi2 // layer 2, second half uses xi2
+
+    // 4,6
+    mla \c2, \c6, \twiddle, \c4
+    mls \c1, \c6, \twiddle, \c4
+
+    // 5,7
+    mla \c6, \c7, \twiddle, \c5
+    mls \c7, \c7, \twiddle, \c5
+
+    // tmp2, tmp, c3, c0 | c2, c6, c1, c7
+
+    // 4,5
+    vmov.w \twiddle, \xi5 // layer 3 (distance 1); results land back in c0..c7 in order
+    mla \c4, \c6, \twiddle, \c2
+    mls \c5, \c6, \twiddle, \c2
+
+    // 6,7
+    vmov.w \twiddle, \xi6
+    mla \c6, \c7, \twiddle, \c1
+    mls \c7, \c7, \twiddle, \c1
+
+    // 2,3
+    vmov.w \twiddle, \xi4
+    mla \c2, \c0, \twiddle, \c3
+    mls \c3, \c0, \twiddle, \c3
+
+    // 0,1
+    vmov.w \twiddle, \xi3
+    mla \c0, \tmp, \twiddle, \tmp2
+    mls \c1, \tmp, \twiddle, \tmp2
+.endm
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4f/ntt.S b/crypto_sign/dilithium2/m4f/ntt.S
new file mode 100644
index 0000000..bfd5f7a
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/ntt.S
@@ -0,0 +1,402 @@
+// based on code by: Markus Krausz (18.03.18)
+// date 23.07.21: Now licensed under CC0 with permission of the authors.
+
+.syntax unified
+#include "macros.i"
+
+// This code uses SMULL/SMLAL - which are constant time on the M4, but not on the M3
+// Make sure that this code is never used on an M3 (the SMLAD below is a DSP instruction, presumably placed here so an M3 build fails)
+smlad r0,r0,r0,r0
+
+// ##############################
+// ##########   NTT    ##########
+// ##############################
+
+//void pqcrystals_dilithium_ntt(int32_t p[N]);
+.global pqcrystals_dilithium_ntt
+.type pqcrystals_dilithium_ntt,%function
+.align 2
+pqcrystals_dilithium_ntt: // in-place transform of the 256 words at R0, done as 3 + 3 + 2 butterfly layers
+  //bind aliases
+  ptr_p     .req R0
+  ptr_zeta  .req R1
+  zeta      .req R1 // shares R1 with ptr_zeta, which is therefore parked in s0 between uses
+  qinv      .req R2
+  q         .req R3
+  cntr      .req R4
+  pol4      .req R4 // shares R4 with cntr
+  pol0      .req R5
+  pol1      .req R6
+  pol2      .req R7
+  pol3      .req R8
+  temp_h    .req R9
+  temp_l    .req R10
+  zeta0     .req R11
+  zeta1     .req R12
+  zeta2     .req R14
+  pol5     .req R11 // pol5..pol7 share registers with zeta0..zeta2
+  pol6     .req R12
+  pol7     .req R14
+
+  //preserve registers
+  push {R4-R11, R14}
+  
+  //load constants, ptr
+  ldr.w qinv, inv_ntt_asm_smull_qinv  //-qinv_signed
+  ldr.w q, inv_ntt_asm_smull_q
+
+  //stage 1 - 3
+  .equ distance, 512 // offsets below are k*distance/4 = k*128 bytes, i.e. 32 words apart
+  .equ strincr, 4
+  
+  ldr ptr_zeta, =#zetas_new332
+  vldm ptr_zeta!, {s2-s8} 
+  vmov s0, ptr_zeta // save the advanced table pointer; R1 is reused as zeta inside the macro
+  
+  add.w temp_l, ptr_p, #32*strincr // 32 iterations
+  vmov s9, temp_l // end address kept in s9 because temp_l is macro scratch
+  1:
+    .rept 2
+    ldr.w pol0, [ptr_p]
+    ldr.w pol1, [ptr_p, #1*distance/4]
+    ldr.w pol2, [ptr_p, #2*distance/4]
+    ldr.w pol3, [ptr_p, #3*distance/4]
+    ldr.w pol4, [ptr_p, #4*distance/4]
+    ldr.w pol5, [ptr_p, #5*distance/4]
+    ldr.w pol6, [ptr_p, #6*distance/4]
+    ldr.w pol7, [ptr_p, #7*distance/4]
+
+    _3_layer_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+
+    str.w pol1, [ptr_p, #1*distance/4]
+    str.w pol2, [ptr_p, #2*distance/4]
+    str.w pol3, [ptr_p, #3*distance/4]
+    str.w pol4, [ptr_p, #4*distance/4]
+    str.w pol5, [ptr_p, #5*distance/4]
+    str.w pol6, [ptr_p, #6*distance/4]
+    str.w pol7, [ptr_p, #7*distance/4]
+    str.w pol0, [ptr_p], #strincr // store pol0 last and step to the next column
+    .endr
+    vmov temp_l, s9
+    cmp.w ptr_p, temp_l
+    bne 1b
+  
+  sub ptr_p, #32*4 // rewind to the start of the array
+
+  // stage 4 - 6  
+  .equ distance, 64 // offsets below are k*16 bytes, i.e. 4 words apart
+  add.w temp_l, ptr_p, #8*112+8*4*4 // 8 iterations
+  vmov s9, temp_l // outer end address
+  1:
+    add.w temp_l, ptr_p, #4*strincr // 4 iterations
+    vmov s10, temp_l // inner end address
+    vmov ptr_zeta, s0
+    vldm ptr_zeta!, {s2-s8} // a fresh set of seven twiddles for each outer iteration
+    vmov s0, ptr_zeta
+    2:
+      .rept 2
+      ldr.w pol0, [ptr_p]
+      ldr.w pol1, [ptr_p, #1*distance/4]
+      ldr.w pol2, [ptr_p, #2*distance/4]
+      ldr.w pol3, [ptr_p, #3*distance/4]
+      ldr.w pol4, [ptr_p, #4*distance/4]
+      ldr.w pol5, [ptr_p, #5*distance/4]
+      ldr.w pol6, [ptr_p, #6*distance/4]
+      ldr.w pol7, [ptr_p, #7*distance/4]
+
+      _3_layer_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+      
+      str.w pol1, [ptr_p, #1*distance/4]
+      str.w pol2, [ptr_p, #2*distance/4]
+      str.w pol3, [ptr_p, #3*distance/4]
+      str.w pol4, [ptr_p, #4*distance/4]
+      str.w pol5, [ptr_p, #5*distance/4]
+      str.w pol6, [ptr_p, #6*distance/4]
+      str.w pol7, [ptr_p, #7*distance/4]
+      str.w pol0, [ptr_p], #4
+      .endr
+      vmov temp_l, s10
+      cmp.w ptr_p, temp_l
+      bne 2b
+
+    add.w ptr_p, #112 // 16 bytes already stepped + 112 = 128 bytes: start of the next 32-word block
+    vmov temp_l, s9
+    cmp.w ptr_p, temp_l
+    bne 1b
+  
+    sub ptr_p, #4*4*8+112*8 // rewind to the start of the array
+    vmov ptr_zeta, s0
+    //stage 7 and 8
+    add cntr, ptr_p, #1024 // 64 iterations
+    1:
+      ldr.w zeta1, [ptr_zeta, #4]  //z128,..., z254
+      ldr.w zeta2, [ptr_zeta, #8]  //z129,..., z255
+      ldr zeta0, [ptr_zeta], #12  //z64, ..., z127
+      ldr.w pol0, [ptr_p]  //1*4
+      ldr.w pol1, [ptr_p, #4]
+      ldr.w pol2, [ptr_p, #8]
+      ldr.w pol3, [ptr_p, #12] 
+
+      _2_layer_CT_32 pol0, pol1, pol2, pol3, zeta0, zeta1, zeta2, qinv, q, temp_h, temp_l
+
+      str.w pol1, [ptr_p, #4]
+      str.w pol2, [ptr_p, #8]
+      str.w pol3, [ptr_p, #12]
+      str pol0, [ptr_p], #16 // four adjacent words per iteration
+
+      cmp.w cntr, ptr_p
+      bne.w 1b
+
+    //restore registers
+    pop {R4-R11, PC}
+
+    //unbind aliases
+    .unreq ptr_p
+    .unreq ptr_zeta
+    .unreq qinv
+    .unreq q
+    .unreq cntr
+    .unreq pol0
+    .unreq pol1
+    .unreq pol2
+    .unreq pol3
+    .unreq temp_h
+    .unreq temp_l
+    .unreq zeta0
+    .unreq zeta1
+    .unreq zeta2
+
+.ltorg
+// ##############################
+// ##########  NTT^-1  ##########
+// ##############################
+
+//void pqcrystals_dilithium_invntt_tomont(int32_t p[N]);
+.global pqcrystals_dilithium_invntt_tomont
+.type pqcrystals_dilithium_invntt_tomont,%function
+.align 2
+pqcrystals_dilithium_invntt_tomont: // in-place transform of the 256 words at R0, done as 3 + 3 + 2 butterfly layers plus a final per-word multiplication
+  //bind aliases
+  ptr_p     .req R0
+  ptr_zeta  .req R1
+  zeta      .req R1 // shares R1 with ptr_zeta, which is therefore parked in s0 between uses
+  qinv      .req R2
+  q         .req R3
+  cntr      .req R4
+  pol4      .req R4 // shares R4 with cntr
+  pol0      .req R5
+  pol1      .req R6
+  pol2      .req R7
+  pol3      .req R8
+  temp_h    .req R9
+  temp_l    .req R10
+  zeta0     .req R11
+  zeta1     .req R12
+  zeta2     .req R14
+  pol5     .req R11 // pol5..pol7 share registers with zeta0..zeta2
+  pol6     .req R12
+  pol7     .req R14
+
+  //preserve registers
+  push {R4-R11, R14}
+    
+  //load constants, ptr
+  ldr.w qinv, inv_ntt_asm_smull_qinv  //-qinv_signed
+  ldr.w q, inv_ntt_asm_smull_q
+
+  //stage 1 - 3
+  .equ distance, 16 // offsets below are k*distance/4 = k*4 bytes: eight adjacent words
+  .equ strincr, 32 // eight words per iteration
+
+  ldr ptr_zeta, =#zetas_new332inv
+  vldm ptr_zeta!, {s2-s8} 
+  vmov s0, ptr_zeta // save the advanced table pointer; R1 is reused as zeta inside the macros
+  
+  add.w temp_l, ptr_p, #32*strincr // 32 iterations
+  vmov s9, temp_l // end address kept in s9 because temp_l is macro scratch
+  1:
+    ldr.w pol4, [ptr_p, #4*distance/4] // words 4..7 go into pol4, pol1, pol6, pol3 as the first macro expects
+    ldr.w pol1, [ptr_p, #5*distance/4]
+    ldr.w pol6, [ptr_p, #6*distance/4]
+    ldr.w pol3, [ptr_p, #7*distance/4]
+    _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+    
+    ldr.w pol0, [ptr_p] // words 0..3 are loaded only now, after pol0..pol3 served as scratch
+    ldr.w pol1, [ptr_p, #1*distance/4]
+    ldr.w pol2, [ptr_p, #2*distance/4]
+    ldr.w pol3, [ptr_p, #3*distance/4]
+    _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+    
+    str.w pol1, [ptr_p, #1*distance/4]
+    str.w pol2, [ptr_p, #2*distance/4]
+    str.w pol3, [ptr_p, #3*distance/4]
+    str.w pol4, [ptr_p, #4*distance/4]
+    str.w pol5, [ptr_p, #5*distance/4]
+    str.w pol6, [ptr_p, #6*distance/4]
+    str.w pol7, [ptr_p, #7*distance/4]
+    str.w pol0, [ptr_p], #strincr
+    vmov temp_l, s9
+    cmp.w ptr_p, temp_l
+  bne.w 1b
+  
+  sub ptr_p, #32*strincr // rewind to the start of the array
+
+  // stage 4 - 6  
+  .equ distance, 128 // offsets below are k*32 bytes, i.e. 8 words apart
+  .equ strincr, 256 // 64 words between the four blocks of one column
+  
+  // iteration 0
+  // (a dead "movw temp_l, #4" stood here; the add below overwrites temp_l, so it was dropped)
+  add.w temp_l, ptr_p, #4*256 // 4 iterations
+  vmov s10, temp_l
+	
+  vmov ptr_zeta, s0
+  vldm ptr_zeta!, {s2-s8}
+  vmov s0, ptr_zeta
+
+  2:
+    ldr.w pol4, [ptr_p, #4*distance/4]
+    ldr.w pol1, [ptr_p, #5*distance/4]
+    ldr.w pol6, [ptr_p, #6*distance/4]
+    ldr.w pol3, [ptr_p, #7*distance/4]
+    _3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+    
+    ldr.w pol0, [ptr_p]
+    ldr.w pol1, [ptr_p, #1*distance/4]
+    ldr.w pol2, [ptr_p, #2*distance/4]
+    ldr.w pol3, [ptr_p, #3*distance/4]
+    _3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+
+    str.w pol1, [ptr_p, #1*distance/4]
+    str.w pol2, [ptr_p, #2*distance/4]
+    str.w pol3, [ptr_p, #3*distance/4]
+    str.w pol4, [ptr_p, #4*distance/4]
+    str.w pol5, [ptr_p, #5*distance/4]
+    str.w pol6, [ptr_p, #6*distance/4]
+    str.w pol7, [ptr_p, #7*distance/4]
+    str.w pol0, [ptr_p]
+    add.w ptr_p, #strincr
+
+    vmov temp_l, s10
+    cmp.w temp_l, ptr_p
+  bne.w 2b
+
+  sub.w ptr_p, #4*256-4 // back to the top, one word further: column 1
+
+  // iteration 1-7
+  add.w temp_l, ptr_p, #7*4 // 7 iterations
+  vmov s9, temp_l
+  1:
+    add.w temp_l, ptr_p, #4*strincr // 4 iterations
+    vmov s10, temp_l
+
+	  vmov ptr_zeta, s0
+    vldm ptr_zeta!, {s2-s8} // a fresh set of seven twiddles for each column
+    vmov s0, ptr_zeta
+    2:     
+	    ldr.w pol0, [ptr_p]
+	    ldr.w pol1, [ptr_p, #1*distance/4]
+	    ldr.w pol2, [ptr_p, #2*distance/4]
+	    ldr.w pol3, [ptr_p, #3*distance/4]
+	    ldr.w pol4, [ptr_p, #4*distance/4]
+	    ldr.w pol5, [ptr_p, #5*distance/4]
+	    ldr.w pol6, [ptr_p, #6*distance/4]
+	    ldr.w pol7, [ptr_p, #7*distance/4]
+
+	    _3_layer_inv_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l
+
+	    str.w pol1, [ptr_p, #1*distance/4]
+	    str.w pol2, [ptr_p, #2*distance/4]
+	    str.w pol3, [ptr_p, #3*distance/4]
+	    str.w pol4, [ptr_p, #4*distance/4]
+	    str.w pol5, [ptr_p, #5*distance/4]
+	    str.w pol6, [ptr_p, #6*distance/4]
+	    str.w pol7, [ptr_p, #7*distance/4]
+	    str.w pol0, [ptr_p]
+	    add.w ptr_p, #strincr
+
+      vmov temp_l, s10
+      cmp.w ptr_p, temp_l
+    bne 2b
+    sub.w ptr_p, #4*strincr-4 // back to the top, one word further: next column
+
+    vmov temp_l, s9
+    cmp.w temp_l, ptr_p
+  bne 1b
+  
+  sub ptr_p, #8*4 // rewind the eight column steps: back to the start of the array
+  vmov ptr_zeta, s0
+  
+  //stage 7 and 8
+  .equ strincr, 4
+
+  add.w cntr, ptr_p, #64*strincr // 64 iterations 
+  vmov s9, cntr // end address kept in s9 because cntr (R4) is reused as a twiddle below
+  1:
+    ldr.w zeta1, [ptr_zeta, #4]
+    ldr.w zeta2, [ptr_zeta, #8]
+    ldr zeta0, [ptr_zeta], #12
+    ldr.w pol0, [ptr_p] // four words at a stride of 256 bytes (64 words)
+    ldr.w pol1, [ptr_p, #256]
+    ldr.w pol2, [ptr_p, #512]
+    ldr.w pol3, [ptr_p, #768]
+
+    _2_layer_inv_CT_32 pol0, pol1, pol2, pol3, zeta0, zeta1, zeta2, qinv, q, temp_h, temp_l
+
+    ldr.w zeta1, [ptr_zeta, #4] // four more table words: one final multiplier per output
+    ldr.w zeta2, [ptr_zeta, #8]
+    ldr.w zeta0, [ptr_zeta, #12]
+    ldr.w cntr, [ptr_zeta], #16
+    montgomery_mul_32 pol0, cntr, qinv, q, temp_h, temp_l
+    montgomery_mul_32 pol1, zeta1, qinv, q, temp_h, temp_l
+    montgomery_mul_32 pol2, zeta2, qinv, q, temp_h, temp_l
+    montgomery_mul_32 pol3, zeta0, qinv, q, temp_h, temp_l
+
+    str.w pol1, [ptr_p, #256]
+    str.w pol2, [ptr_p, #512]
+    str.w pol3, [ptr_p, #768]
+    str pol0, [ptr_p], #strincr
+
+    vmov cntr, s9
+    cmp.w cntr, ptr_p
+    bne.w 1b
+
+    //restore registers
+    pop {R4-R11, PC}
+
+    //unbind aliases
+    .unreq ptr_p
+    .unreq ptr_zeta
+    .unreq qinv
+    .unreq q
+    .unreq cntr
+    .unreq pol0
+    .unreq pol1
+    .unreq pol2
+    .unreq pol3
+    .unreq temp_h
+    .unreq temp_l
+    .unreq zeta0
+    .unreq zeta1
+    .unreq zeta2
+
+.align 2
+inv_ntt_asm_smull_qinv:
+.word 0xfc7fdfff
+.align 2
+inv_ntt_asm_smull_q:
+.word 8380417
+
+.section .rodata
+
+.type zetas_new332, %object
+.align 2
+zetas_new332:
+.word 25847, -2608894, -518909, 237124, -777960, -876248, 466468, 1826347, 2725464, 1024112, 2706023, 95776, 3077325, 3530437, 2353451, -1079900, 3585928, -1661693, -3592148, -2537516, 3915439, -359251, -549488, -1119584, -3861115, -3043716, 3574422, -2867647, -2091905, 2619752, -2108549, 3539968, -300467, 2348700, -539299, 3119733, -2118186, -3859737, -1699267, -1643818, 3505694, -3821735, -2884855, -1399561, -3277672, 3507263, -2140649, -1600420, 3699596, 3111497, 1757237, -19422, 811944, 531354, 954230, 3881043, 2680103, 4010497, 280005, 3900724, -2556880, 2071892, -2797779, -3930395, 2091667, 3407706, -1528703, 2316500, 3817976, -3677745, -3342478, 2244091, -3041255, -2446433, -3562462, -1452451, 266997, 2434439, 3475950, -1235728, 3513181, 2176455, -3520352, -3759364, -1585221, -1197226, -3193378, -1257611, 900702, 1859098, 1939314, 909542, 819034, -4083598, 495491, -1613174, -1000202, -43260, -522500, -3190144, -655327, -3122442, -3157330, 2031748, 3207046, -3632928, -3556995, -525098, 126922, -768622, -3595838, 3412210, 342297, 286988, -983419, -2437823, 4108315, 2147896, 3437287, -3342277, 2715295, 1735879, 203044, -2967645, 2842341, 2691481, -3693493, -2590150, 1265009, -411027, 4055324, 1247620, -2477047, 2486353, 1595974, -671102, -3767016, 1250494, -1228525, 2635921, -3548272, -22981, -2994039, 1869119, -1308169, 1903435, -1050970, -381987, -1333058, 1237275, 1349076, -3318210, -1430225, 1852771, -451100, 1312455, -1430430, 3306115, -1962642, -3343383, -1279661, 1917081, 264944, -2546312, -1374803, 508951, 1500165, 777191, 3097992, 2235880, 3406031, 44288, -542412, -2831860, -1100098, -1671176, -1846953, 904516, -2584293, -3724270, 3958618, 594136, -3776993, -3724342, -2013608, 2432395, -8578, 2454455, -164721, 1653064, 1957272, 3369112, -3249728, 185531, -1207385, 2389356, -3183426, 162844, -210977, 1616392, 3014001, 759969, 810149, 1652634, -1316856, -3694233, -1799107, 189548, -3038916, 3523897, -3553272, 3866901, 269760, 3159746, 2213111, -975884, 
-1851402, 1717735, 472078, -2409325, -426683, 1723600, -177440, -1803090, 1910376, 1315589, -1667432, -1104333, 1341330, -260646, -3833893, 1285669, -2939036, -2235985, -1584928, -420899, -2286327, -812732, 183443, -976891, -1439742, 1612842, -3545687, -3019102, -554416, 3919660, -3881060, -48306, -1362209, -3628969, 3937738, 1400424, 3839961, -846154, 1976782
+.size zetas_new332,.-zetas_new332
+
+.type zetas_new332inv, %object
+.align 2
+zetas_new332inv:
+.word 4193792, 4193792, -25847, 4193792, 518909, -25847, 2608894, 4193792, 4193792, -25847, 4193792, 518909, -25847, 2608894, -466468, -2680103, -3111497, -280005, 19422, -4010497, -1757237, 518909, -466468, 876248, -2680103, 2884855, -3111497, -3119733, 777960, 2091905, 359251, 2108549, 1119584, -2619752, 549488, -25847, 518909, 2608894, -466468, 777960, 876248, -237124, 876248, 2884855, -3119733, 3277672, 3859737, 1399561, 2118186, 2608894, 777960, -237124, 2091905, -2353451, 359251, -1826347, -237124, -2353451, -1826347, -3585928, -1024112, 1079900, -2725464, 4193792, 4193792, -25847, 41978, 3024400, 3975713, -1225192, 2797779, -3839961, 3628969, -1711436, 3835778, 485110, -3954267, -280005, 2797779, -2071892, -2831100, -2698859, -908040, -2292170, 539299, 1430430, -1852771, -3658785, 3512212, 1859141, -1607594, -2680103, -280005, -4010497, 715005, 1483994, -1045894, -980943, -3699596, 1316856, -759969, -955715, 3677139, 3933849, 2719610, 2108549, 539299, -2348700, 1658328, -1403403, 1775852, -2460465, -3915439, -126922, 3632928, 1067023, 3847594, 4179270, 1652689, -466468, -2680103, -3111497, -2953811, -284642, 2507426, -324139, -3881043, -1341330, -1315589, 3990128, -2137097, -4109898, 4092021, 3277672, -3699596, 1600420, 1541634, 3493410, 3487504, 2497815, 2867647, 2477047, 411027, 1654972, 1326223, -2608226, -2752209, 2091905, 2108549, -2619752, 1836700, 2945615, -1908953, 729864, 3821735, -3958618, -904516, 2080615, 1555380, -3471815, -1978758, -3585928, -3915439, 2537516, -892788, -553664, -3095038, 658596, -3530437, 1585221, -2176455, 3355482, -1783485, 2780552, -3623330, 518909, -466468, 876248, -442683, 2523147, -2847660, -3683140, 2556880, 1439742, 812732, 774207, -3168108, 1877157, 3406477, 19422, -3881043, -954230, -214686, -1182619, 2453526, -2201920, 300467, 1308169, 22981, 3614022, 2136260, 1459487, -2233803, 2884855, 3277672, 1399561, 394072, -3933227, 4136064, 156486, 2140649, 3249728, -1653064, 1596950, 633578, 2722529, -554462, 1119584, 
2867647, -3574422, 1004840, 191586, 3969463, 1161373, 3592148, 1000202, 4083598, 3189243, 3561667, -3650125, 3490511, 777960, 2091905, 359251, -1829156, -3707725, -661807, 1144558, -531354, 1851402, -3159746, 1543095, -2903948, 1505516, -1500460, 3859737, 3821735, -3505694, -2413330, 3908886, -1203856, 3570263, 3043716, -2715295, -2147896, 758741, 3917553, -2414897, -1613811, -2353451, -3585928, 1079900, 990020, -719638, 2718792, 2260310, 1643818, -3097992, -508951, -783456, -2089539, 2616547, 4060031, -1024112, -3530437, -3077325, -1821861, 1920615, 3988525, 2048419, -95776, 3041255, 3677745, -971504, 2190617, 2311312, -1170082, -25847, 518909, 2608894, 1261528, -2073537, -959585, 3948120, -2071892, 3881060, 3019102, -1342633, -1115066, 3589694, -1929116, -4010497, 2556880, -3900724, 3360006, 1758630, -2306989, -1841637, -2348700, -1349076, 381987, -1699982, 3189673, 3531558, -1210546, -3111497, 19422, -1757237, 2977353, 2612035, -2718155, -1544829, 1600420, 210977, -2389356, 2052582, -2737802, 2383976, -450259, -2619752, 300467, -3539968, 1698289, -4065084, -644023, -1114140, 2537516, 3157330, 3190144, -993399, -2220524, 2920588, 252737, 876248, 2884855, -3119733, 1490985, -34731, -1212610, -3183745, -954230, 177440, 2409325, -3302554, -2390327, -2749545, 653128, 1399561, 2140649, -3507263, -3745105, -1942293, -3367121, 2734884, -3574422, 3693493, 2967645, 1393803, -2467905, 1786029, -1633410, 359251, 1119584, 549488, -2824548, -1325638, -2207625, -2601586, -3505694, 1100098, -44288, 3478676, -2457992, -1617107, 2551364, 1079900, 3592148, 1661693, 1593929, 318899, -3366475, 3118416, -3077325, -3475950, 1452451, 3772814, 1424805, -3391376, 632820, 2608894, 777960, -237124, 2062597, 4064335, 2197148, -1127864, -3900724, 1584928, -1285669, 2525341, -896437, -1915773, 1792087, -1757237, -531354, -811944, 938441, -674578, 2876837, 3959371, -3539968, 1228525, 671102, 1219592, -3853560, 2630979, -2134676, -3119733, 3859737, 2118186, -2432637, 2746655, 718593, -2353280, 
-3507263, 8578, 3724342, -34852, 1387945, 358956, 1604944, 549488, 3043716, 3861115, 1290746, 3208584, 2538711, -1442830, 1661693, -1939314, 1257611, -367371, -1308058, 264382, 2614173, -237124, -2353451, -1826347, 2050674, 592050, -138487, 2310528, -811944, 3553272, -189548, -2728561, -4168358, -79, 3844932, 2118186, 1643818, 1699267, 500408, 743398, 879633, -3105206, 3861115, 983419, -3412210, 712597, -23479, 3729381, -1010481, -1826347, -1024112, -2725464, -2361217, -1864453, 3850522, 2337144, 1699267, -264944, 3343383, 3842267, 4181974, -4032642, 3983585, -2725464, -95776, -2706023, 260345, 2526550, 2000777, 987079, -2706023, 1528703, 3930395, -3030761, -3082055, -2374824, 1836319
+.size zetas_new332inv,.-zetas_new332inv
diff --git a/crypto_sign/dilithium2/m4f/ntt.h b/crypto_sign/dilithium2/m4f/ntt.h
new file mode 100644
index 0000000..731132d
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/ntt.h
@@ -0,0 +1,13 @@
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define ntt DILITHIUM_NAMESPACE(ntt)
+void ntt(int32_t a[N]);
+
+#define invntt_tomont DILITHIUM_NAMESPACE(invntt_tomont)
+void invntt_tomont(int32_t a[N]);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/packing.c b/crypto_sign/dilithium2/m4f/packing.c
new file mode 100644
index 0000000..eb9d9a3
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/packing.c
@@ -0,0 +1,390 @@
+#include "params.h"
+#include "packing.h"
+#include "polyvec.h"
+#include "poly.h"
+#include <stddef.h>
+
+/*************************************************
+* Name:        pack_pk
+*
+* Description: Bit-pack public key pk = (rho, t1).
+*
+* Arguments:   - uint8_t pk[]: output byte array (CRYPTO_PUBLICKEYBYTES bytes)
+*              - const uint8_t rho[]: byte array containing rho (SEEDBYTES bytes)
+*              - const polyveck *t1: pointer to vector t1 (K polynomials)
+**************************************************/
+void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const polyveck *t1)
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    pk[i] = rho[i];
+  pk += SEEDBYTES; /* packed t1 starts right after rho */
+
+  for(i = 0; i < K; ++i) /* one POLYT1_PACKEDBYTES slot per polynomial */
+    polyt1_pack(pk + i*POLYT1_PACKEDBYTES, &t1->vec[i]);
+}
+
+/*************************************************
+* Name:        unpack_pk
+*
+* Description: Unpack public key pk = (rho, t1).
+*
+* Arguments:   - uint8_t rho[]: output byte array for rho (SEEDBYTES bytes)
+*              - polyveck *t1: pointer to output vector t1
+*              - const uint8_t pk[]: byte array containing bit-packed pk
+**************************************************/
+void unpack_pk(uint8_t rho[SEEDBYTES],
+               polyveck *t1,
+               const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = pk[i];
+  pk += SEEDBYTES; /* packed t1 starts right after rho */
+
+  for(i = 0; i < K; ++i) /* one POLYT1_PACKEDBYTES slot per polynomial */
+    polyt1_unpack(&t1->vec[i], pk + i*POLYT1_PACKEDBYTES);
+}
+
+/*************************************************
+* Name:        unpack_pk_t1
+*
+* Description: Unpack a single polynomial of t1 from public key pk = (rho, t1).
+*
+* Arguments:   - poly *t1: pointer to output polynomial
+*              - size_t idx: index of the polynomial of t1 to unpack
+*              - const unsigned char pk[]: byte array containing bit-packed pk
+**************************************************/
+void unpack_pk_t1(poly *t1, size_t idx, const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) {
+    pk += SEEDBYTES; /* skip rho */
+    polyt1_unpack(t1, pk + idx * POLYT1_PACKEDBYTES); /* idx is not range-checked here */
+}
+
+
+/*************************************************
+* Name:        pack_sk
+*
+* Description: Bit-pack secret key; byte layout is sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - uint8_t sk[]: output byte array
+*              - const uint8_t rho[]: byte array containing rho
+*              - const uint8_t tr[]: byte array containing tr
+*              - const uint8_t key[]: byte array containing key
+*              - const polyveck *t0: pointer to vector t0
+*              - const polyvecl *s1: pointer to vector s1
+*              - const polyveck *s2: pointer to vector s2
+**************************************************/
+void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const uint8_t tr[TRBYTES],
+             const uint8_t key[SEEDBYTES],
+             const polyveck *t0,
+             const polyvecl *s1,
+             const polyveck *s2)
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    sk[i] = rho[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < SEEDBYTES; ++i) /* key is stored before tr, unlike the argument order */
+    sk[i] = key[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < TRBYTES; ++i)
+    sk[i] = tr[i];
+  sk += TRBYTES;
+
+  for(i = 0; i < L; ++i) /* s1: L polynomials */
+    polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s1->vec[i]);
+  sk += L*POLYETA_PACKEDBYTES;
+
+  for(i = 0; i < K; ++i) /* s2: K polynomials */
+    polyeta_pack(sk + i*POLYETA_PACKEDBYTES, &s2->vec[i]);
+  sk += K*POLYETA_PACKEDBYTES;
+
+  for(i = 0; i < K; ++i) /* t0 comes last: K polynomials */
+    polyt0_pack(sk + i*POLYT0_PACKEDBYTES, &t0->vec[i]);
+}
+
+/*************************************************
+* Name:        unpack_sk
+*
+* Description: Unpack secret key; byte layout is sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - uint8_t rho[]: output byte array for rho
+*              - uint8_t tr[]: output byte array for tr
+*              - uint8_t key[]: output byte array for key
+*              - polyveck *t0: pointer to output vector t0
+*              - smallpoly s1[]: output array of L polynomials for s1
+*              - smallpoly s2[]: output array of K polynomials for s2
+*              - const uint8_t sk[]: byte array containing bit-packed sk
+**************************************************/
+void unpack_sk(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               polyveck *t0,
+               smallpoly s1[L],
+               smallpoly s2[K],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  unsigned int i;
+
+  for(i = 0; i < SEEDBYTES; ++i)
+    rho[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < SEEDBYTES; ++i) /* key is stored before tr, unlike the argument order */
+    key[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for(i = 0; i < TRBYTES; ++i)
+    tr[i] = sk[i];
+  sk += TRBYTES;
+
+  for(i=0; i < L; ++i) /* s1: L polynomials */
+    small_polyeta_unpack(&s1[i], sk + i*POLYETA_PACKEDBYTES);
+  sk += L*POLYETA_PACKEDBYTES;
+
+  for(i=0; i < K; ++i) /* s2: K polynomials */
+    small_polyeta_unpack(&s2[i], sk + i*POLYETA_PACKEDBYTES);
+  sk += K*POLYETA_PACKEDBYTES;
+
+  for(i=0; i < K; ++i) /* t0 comes last: K polynomials */
+    polyt0_unpack(&t0->vec[i], sk + i*POLYT0_PACKEDBYTES);
+}
+
+
+/*************************************************
+* Name:        pack_sig
+*
+* Description: Bit-pack signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t sig[]: output byte array
+*              - const uint8_t *c: pointer to challenge hash of length CTILDEBYTES
+*              - const polyvecl *z: pointer to vector z
+*              - const polyveck *h: pointer to hint vector h
+**************************************************/
+void pack_sig(uint8_t sig[CRYPTO_BYTES],
+              const uint8_t c[CTILDEBYTES],
+              const polyvecl *z,
+              const polyveck *h)
+{
+  unsigned int i, j, k;
+
+  for(i=0; i < CTILDEBYTES; ++i)
+    sig[i] = c[i];
+  sig += CTILDEBYTES;
+
+  for(i = 0; i < L; ++i)
+    polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]);
+  sig += L*POLYZ_PACKEDBYTES;
+
+  /* Encode h: OMEGA position bytes followed by K running-count bytes */
+  for(i = 0; i < OMEGA + K; ++i)
+    sig[i] = 0;
+
+  k = 0; /* number of hint positions written so far */
+  for(i = 0; i < K; ++i) {
+    for(j = 0; j < N; ++j)
+      if(h->vec[i].coeffs[j] != 0)
+        sig[k++] = j; /* NOTE(review): no check that k stays below OMEGA; presumably guaranteed by the caller */
+
+    sig[OMEGA + i] = k; /* running total of hints up to and including polynomial i */
+  }
+}
+
+void pack_sig_c(uint8_t sig[CRYPTO_BYTES],
+              const uint8_t c[CTILDEBYTES])
+{
+  unsigned int i;
+
+  /* Write only c: it occupies the first CTILDEBYTES bytes of sig = (c, z, h) */
+  for(i=0; i < CTILDEBYTES; ++i)
+    sig[i] = c[i];
+}
+
+void pack_sig_z(uint8_t sig[CRYPTO_BYTES], /* signature buffer; only the z part is written */
+              const polyvecl *z)
+{
+  unsigned int i;
+  sig += CTILDEBYTES; /* skip the challenge hash c; packed z follows it */
+  for(i = 0; i < L; ++i) /* one POLYZ_PACKEDBYTES slot per polynomial */
+    polyz_pack(sig + i*POLYZ_PACKEDBYTES, &z->vec[i]);
+}
+
+
+void pack_sig_h(unsigned char sig[CRYPTO_BYTES], /* signature buffer; only the h part is written */
+                const poly *h_elem,              /* hint polynomial number idx */
+                const unsigned int idx,          /* selects the count byte sig_h[OMEGA + idx] */
+                unsigned int *hints_written)     /* in/out: hint positions written so far */
+{
+  sig += CTILDEBYTES;
+  sig += L*POLYZ_PACKEDBYTES; /* sig now points at the hint section */
+
+  // Encode h: append the index of every nonzero coefficient, then store the running total
+  for (unsigned int j = 0; j < N; j++) {
+      if (h_elem->coeffs[j] != 0) {
+          sig[*hints_written] = (uint8_t)j; /* NOTE(review): *hints_written is not checked against OMEGA here; presumably the caller bounds it */
+          (*hints_written)++;
+      }
+  }
+  sig[OMEGA + idx] = (uint8_t)*hints_written;
+}
+
+void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES], /* signature buffer */
+                unsigned int *hints_written) { /* in/out: advanced to OMEGA on return if it was smaller */
+    sig += CTILDEBYTES;
+    sig += L * POLYZ_PACKEDBYTES; /* sig now points at the hint section */
+    while (*hints_written < OMEGA) { /* zero the position bytes that were not used */
+        sig[*hints_written] = 0;
+        (*hints_written)++;
+    }
+}
+
+/*************************************************
+* Name:        unpack_sig
+*
+* Description: Unpack signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t *c: pointer to output challenge hash (CTILDEBYTES bytes)
+*              - polyvecl *z: pointer to output vector z
+*              - polyveck *h: pointer to output hint vector h
+*              - const uint8_t sig[]: byte array containing
+*                bit-packed signature
+*
+* Returns 1 on a malformed hint encoding (outputs may be partially written); otherwise 0.
+**************************************************/
+int unpack_sig(uint8_t c[CTILDEBYTES],
+               polyvecl *z,
+               polyveck *h,
+               const uint8_t sig[CRYPTO_BYTES])
+{
+  unsigned int i, j, k;
+
+  for(i = 0; i < CTILDEBYTES; ++i)
+    c[i] = sig[i];
+  sig += CTILDEBYTES;
+
+  for(i = 0; i < L; ++i)
+    polyz_unpack(&z->vec[i], sig + i*POLYZ_PACKEDBYTES);
+  sig += L*POLYZ_PACKEDBYTES;
+
+  /* Decode h: sig[0..OMEGA-1] are hint positions, sig[OMEGA+i] is the running count after polynomial i */
+  k = 0;
+  for(i = 0; i < K; ++i) {
+    for(j = 0; j < N; ++j)
+      h->vec[i].coeffs[j] = 0;
+
+    if(sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) /* counts must not decrease nor exceed OMEGA */
+      return 1;
+
+    for(j = k; j < sig[OMEGA + i]; ++j) {
+      /* Coefficients are ordered for strong unforgeability */
+      if(j > k && sig[j] <= sig[j-1]) return 1;
+      h->vec[i].coeffs[sig[j]] = 1;
+    }
+
+    k = sig[OMEGA + i];
+  }
+
+  /* Extra indices are zero for strong unforgeability */
+  for(j = k; j < OMEGA; ++j)
+    if(sig[j])
+      return 1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        unpack_sig_c
+*
+* Description: Unpack only c from signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t c[]: output challenge hash (CTILDEBYTES bytes)
+*              - const unsigned char sig[]: byte array containing
+*                bit-packed signature
+*
+* Returns 0 always; this plain byte copy performs no validity check.
+**************************************************/
+int unpack_sig_c(uint8_t c[CTILDEBYTES], const unsigned char sig[CRYPTO_BYTES]) {
+  /* c occupies the first CTILDEBYTES bytes of the signature */
+  for(size_t i = 0; i < CTILDEBYTES; ++i)
+    c[i] = sig[i];
+  return 0;
+}
+
+/*************************************************
+* Name:        unpack_sig_z
+*
+* Description: Unpack only z from signature sig = (c, z, h).
+*
+* Arguments:   - polyvecl *z: pointer to output vector z
+*              - const unsigned char sig[]: byte array containing
+*                bit-packed signature
+*
+* Returns 0 always; no validity check is performed in this function.
+**************************************************/
+int unpack_sig_z(polyvecl *z, const unsigned char sig[CRYPTO_BYTES]) {
+    sig += CTILDEBYTES; /* skip the challenge hash c; packed z follows it */
+    for (size_t i = 0; i < L; ++i) {
+        polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES);
+    }
+    return 0;
+}
+
+/*************************************************
+* Name:        unpack_sig_h
+*
+* Description: Unpack one hint polynomial from signature sig = (c, z, h).
+*
+* Arguments:   - poly *h: output polynomial, receives element idx of hint vector h
+*              - size_t idx: index of the hint polynomial to extract
+*              - const unsigned char sig[]: bit-packed signature
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int unpack_sig_h(poly *h, size_t idx, const unsigned char sig[CRYPTO_BYTES]) {
+    sig += CTILDEBYTES;
+    sig += L * POLYZ_PACKEDBYTES; /* sig now points at the hint section */
+
+    /* Decode h: the whole encoding is validated, only polynomial idx is written */
+    size_t k = 0;
+    for (size_t i = 0; i < K; ++i) {
+        if (i == idx) { /* clear the output once, when the requested polynomial is reached */
+            for (size_t j = 0; j < N; ++j) {
+                h->coeffs[j] = 0;
+            }
+        }
+
+        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { /* counts must not decrease nor exceed OMEGA */
+            return 1;
+        }
+
+        for (size_t j = k; j < sig[OMEGA + i]; ++j) {
+            /* Coefficients are ordered for strong unforgeability */
+            if (j > k && sig[j] <= sig[j - 1]) {
+                return 1;
+            }
+            if (i == idx) {
+                h->coeffs[sig[j]] = 1;
+            }
+        }
+
+        k = sig[OMEGA + i];
+    }
+
+    /* Extra indices are zero for strong unforgeability */
+    for (size_t j = k; j < OMEGA; ++j) {
+        if (sig[j]) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
diff --git a/crypto_sign/dilithium2/m4f/packing.h b/crypto_sign/dilithium2/m4f/packing.h
new file mode 100644
index 0000000..78ef2c2
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/packing.h
@@ -0,0 +1,68 @@
+#ifndef PACKING_H
+#define PACKING_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "params.h"
+#include "polyvec.h"
+#include "smallpoly.h"
+
+#define pack_pk DILITHIUM_NAMESPACE(pack_pk)
+void pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);
+
+#define pack_sk DILITHIUM_NAMESPACE(pack_sk)
+void pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
+             const uint8_t rho[SEEDBYTES],
+             const uint8_t tr[TRBYTES],
+             const uint8_t key[SEEDBYTES],
+             const polyveck *t0,
+             const polyvecl *s1,
+             const polyveck *s2);
+
+#define pack_sig DILITHIUM_NAMESPACE(pack_sig)
+void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const polyvecl *z, const polyveck *h);
+
+#define unpack_pk DILITHIUM_NAMESPACE(unpack_pk)
+void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]);
+
+#define unpack_pk_t1 DILITHIUM_NAMESPACE(unpack_pk_t1)
+void unpack_pk_t1(poly *t1, size_t idx, const unsigned char pk[CRYPTO_PUBLICKEYBYTES]);
+
+#define unpack_sk DILITHIUM_NAMESPACE(unpack_sk)
+void unpack_sk(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               polyveck *t0,
+               smallpoly s1[L],
+               smallpoly s2[K],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
+
+#define unpack_sig DILITHIUM_NAMESPACE(unpack_sig)
+int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]);
+
+
+#define unpack_sig_z DILITHIUM_NAMESPACE(unpack_sig_z)
+int unpack_sig_z(polyvecl *z, const unsigned char sig[CRYPTO_BYTES]);
+#define unpack_sig_h DILITHIUM_NAMESPACE(unpack_sig_h)
+int unpack_sig_h(poly *h, size_t idx, const unsigned char sig[CRYPTO_BYTES]);
+#define unpack_sig_c DILITHIUM_NAMESPACE(unpack_sig_c)
+int unpack_sig_c(uint8_t c[CTILDEBYTES], const unsigned char sig[CRYPTO_BYTES]);
+
+
+#define pack_sig_c DILITHIUM_NAMESPACE(pack_sig_c)
+void pack_sig_c(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES]);
+
+#define pack_sig_z DILITHIUM_NAMESPACE(pack_sig_z)
+void pack_sig_z(uint8_t sig[CRYPTO_BYTES], const polyvecl *z);
+
+#define pack_sig_h DILITHIUM_NAMESPACE(pack_sig_h)
+void pack_sig_h(unsigned char sig[CRYPTO_BYTES],
+                const poly *h_elem,
+                const unsigned int idx,
+                unsigned int *hints_written);
+
+#define pack_sig_h_zero DILITHIUM_NAMESPACE(pack_sig_h_zero)
+void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES],
+                unsigned int *hints_written);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/params.h b/crypto_sign/dilithium2/m4f/params.h
new file mode 100644
index 0000000..507de46
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/params.h
@@ -0,0 +1,86 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#include "config.h"
+
+#define DILITHIUM_NAMESPACE(s) pqcrystals_dilithium_##s
+
+
+#define SEEDBYTES 32
+#define CRHBYTES 64
+#define TRBYTES 64
+#define RNDBYTES 32
+#define N 256
+#define Q 8380417
+#define D 13
+#define ROOT_OF_UNITY 1753
+
+/* Parameter set, selected by DILITHIUM_MODE; any other value is rejected below. */
+#if DILITHIUM_MODE == 2
+#define K 4
+#define L 4
+#define ETA 2
+#define TAU 39
+#define BETA 78
+#define GAMMA1 (1 << 17)
+#define GAMMA2 ((Q-1)/88)
+#define OMEGA 80
+#define CTILDEBYTES 32
+
+#elif DILITHIUM_MODE == 3
+#define K 6
+#define L 5
+#define ETA 4
+#define TAU 49
+#define BETA 196
+#define GAMMA1 (1 << 19)
+#define GAMMA2 ((Q-1)/32)
+#define OMEGA 55
+#define CTILDEBYTES 48
+
+#elif DILITHIUM_MODE == 5
+#define K 8
+#define L 7
+#define ETA 2
+#define TAU 60
+#define BETA 120
+#define GAMMA1 (1 << 19)
+#define GAMMA2 ((Q-1)/32)
+#define OMEGA 75
+#define CTILDEBYTES 64
+
+#else
+#error "DILITHIUM_MODE must be 2, 3 or 5"
+#endif
+
+#define POLYT1_PACKEDBYTES  320
+#define POLYT0_PACKEDBYTES  416
+#define POLYVECH_PACKEDBYTES (OMEGA + K)
+
+#if GAMMA1 == (1 << 17)
+#define POLYZ_PACKEDBYTES   576
+#elif GAMMA1 == (1 << 19)
+#define POLYZ_PACKEDBYTES   640
+#endif
+
+#if GAMMA2 == (Q-1)/88
+#define POLYW1_PACKEDBYTES  192
+#elif GAMMA2 == (Q-1)/32
+#define POLYW1_PACKEDBYTES  128
+#endif
+
+#if ETA == 2
+#define POLYETA_PACKEDBYTES  96
+#elif ETA == 4
+#define POLYETA_PACKEDBYTES 128
+#endif
+
+#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
+#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES \
+                               + TRBYTES \
+                               + L*POLYETA_PACKEDBYTES \
+                               + K*POLYETA_PACKEDBYTES \
+                               + K*POLYT0_PACKEDBYTES)
+#define CRYPTO_BYTES (CTILDEBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4f/pointwise_mont.h b/crypto_sign/dilithium2/m4f/pointwise_mont.h
new file mode 100644
index 0000000..2647a11
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/pointwise_mont.h
@@ -0,0 +1,13 @@
+#ifndef POINTWISE_MONT_H
+#define POINTWISE_MONT_H
+
+#include <stdint.h>
+#include "params.h"
+
+
+#define asm_pointwise_montgomery DILITHIUM_NAMESPACE(asm_pointwise_montgomery)
+void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
+#define asm_pointwise_acc_montgomery DILITHIUM_NAMESPACE(asm_pointwise_acc_montgomery)
+void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/pointwise_mont.s b/crypto_sign/dilithium2/m4f/pointwise_mont.s
new file mode 100644
index 0000000..e21125d
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/pointwise_mont.s
@@ -0,0 +1,128 @@
+.syntax unified
+.thumb
+
+.macro montgomery_multiplication res, pa, pb, q, qinv
+    smull \pa, \res, \pa, \pb      // 64-bit signed product: low word in pa, high word in res
+    mul \pb, \pa, \qinv            // t = low word times qinv, truncated to 32 bits (overwrites pb)
+    smlal \pa, \res, \pb, \q       // add t*q to the 64-bit value; the result is the high word left in res
+.endm
+
+
+// void asm_pointwise_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);  stores montgomery_multiplication(a[i], b[i]) into c[i] for 256 coefficients
+.global pqcrystals_dilithium_asm_pointwise_montgomery
+.type pqcrystals_dilithium_asm_pointwise_montgomery,%function
+.align 2
+pqcrystals_dilithium_asm_pointwise_montgomery:
+    push.w {r4-r11, r14}
+    c_ptr .req r0
+    a_ptr .req r1
+    b_ptr .req r2
+    qinv  .req r3
+    q     .req r4
+    pa0   .req r5
+    pa1   .req r6
+    pa2   .req r7
+    pb0   .req r8
+    pb1   .req r9
+    pb2   .req r10
+    tmp0  .req r11
+    ctr   .req r12
+    res   .req r14
+
+    movw qinv, #:lower16:0xfc7fdfff
+    movt qinv, #:upper16:0xfc7fdfff    // qinv = 0xfc7fdfff, used as the Montgomery factor by the macro
+    movw q, #0xE001
+    movt q, #0x7F                      // q = 0x7FE001 = 8380417
+
+
+    // 85x3 = 255 coefficients, three per loop iteration
+    movw ctr, #85
+    1:
+        ldr.w pa1, [a_ptr, #4]
+        ldr.w pa2, [a_ptr, #8]
+        ldr pa0, [a_ptr], #12          // post-increment: a_ptr advances by three words
+        ldr.w pb1, [b_ptr, #4]
+        ldr.w pb2, [b_ptr, #8]
+        ldr pb0, [b_ptr], #12          // post-increment: b_ptr advances by three words
+
+        montgomery_multiplication res, pa0, pb0, q, qinv
+        str res, [c_ptr], #4           // store and advance c_ptr by one word
+        montgomery_multiplication res, pa1, pb1, q, qinv
+        str res, [c_ptr], #4
+        montgomery_multiplication res, pa2, pb2, q, qinv
+        str res, [c_ptr], #4
+    subs ctr, #1
+    bne.w 1b
+
+    // final (256th) coefficient, not covered by the 3-way unrolled loop
+    ldr.w pa0, [a_ptr]
+    ldr.w pb0, [b_ptr]
+    montgomery_multiplication res, pa0, pb0, q, qinv
+    str.w res, [c_ptr]
+
+    pop.w {r4-r11, pc}                 // restore saved registers and return
+.size pqcrystals_dilithium_asm_pointwise_montgomery, .-pqcrystals_dilithium_asm_pointwise_montgomery
+
+// void asm_pointwise_acc_montgomery(int32_t c[N], const int32_t a[N], const int32_t b[N]);  adds montgomery_multiplication(a[i], b[i]) to c[i] for 256 coefficients
+.global pqcrystals_dilithium_asm_pointwise_acc_montgomery
+.type pqcrystals_dilithium_asm_pointwise_acc_montgomery,%function
+.align 2
+pqcrystals_dilithium_asm_pointwise_acc_montgomery:
+    push.w {r4-r11, r14}
+    c_ptr .req r0
+    a_ptr .req r1
+    b_ptr .req r2
+    qinv  .req r3
+    q     .req r4
+    pa0   .req r5
+    pa1   .req r6
+    pa2   .req r7
+    pb0   .req r8
+    pb1   .req r9
+    pb2   .req r10
+    tmp0  .req r11
+    ctr   .req r12
+    res   .req r14
+
+    movw qinv, #:lower16:0xfc7fdfff
+    movt qinv, #:upper16:0xfc7fdfff    // qinv = 0xfc7fdfff, used as the Montgomery factor by the macro
+    movw q, #0xE001
+    movt q, #0x7F                      // q = 0x7FE001 = 8380417
+
+
+    // 85x3 = 255 coefficients, three per loop iteration
+    movw ctr, #85
+    1:
+        ldr.w pa1, [a_ptr, #4]
+        ldr.w pa2, [a_ptr, #8]
+        ldr pa0, [a_ptr], #12          // post-increment: a_ptr advances by three words
+        ldr.w pb1, [b_ptr, #4]
+        ldr.w pb2, [b_ptr, #8]
+        ldr pb0, [b_ptr], #12          // post-increment: b_ptr advances by three words
+
+        montgomery_multiplication res, pa0, pb0, q, qinv
+        montgomery_multiplication pa0, pa1, pb1, q, qinv
+        montgomery_multiplication pa1, pa2, pb2, q, qinv
+
+        ldr.w pb0, [c_ptr]             // the three products are now in res, pa0, pa1; load the accumulator words
+        ldr.w pb1, [c_ptr, #4]
+        ldr.w pb2, [c_ptr, #8]
+        add.w res, res, pb0            // plain 32-bit add, no modular reduction here
+        str res, [c_ptr], #12          // c_ptr advances by three words; the next two stores use negative offsets
+        add.w pa0, pa0, pb1
+        str pa0, [c_ptr, #-8]
+        add.w pa1, pa1, pb2
+        str pa1, [c_ptr, #-4]
+    subs ctr, #1
+    bne.w 1b
+
+    // final (256th) coefficient, not covered by the 3-way unrolled loop
+    ldr.w pa0, [a_ptr]
+    ldr.w pb0, [b_ptr]
+    ldr.w pa1, [c_ptr]
+    montgomery_multiplication res, pa0, pb0, q, qinv
+    add.w res, res, pa1
+    str.w res, [c_ptr]
+
+    pop.w {r4-r11, pc}                 // restore saved registers and return
+.size pqcrystals_dilithium_asm_pointwise_acc_montgomery, .-pqcrystals_dilithium_asm_pointwise_acc_montgomery
diff --git a/crypto_sign/dilithium2/m4f/poly.c b/crypto_sign/dilithium2/m4f/poly.c
new file mode 100644
index 0000000..654f4f2
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/poly.c
@@ -0,0 +1,863 @@
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+#include "vector.h"
+#include "ntt.h"
+#include "pointwise_mont.h"
+#include "rounding.h"
+#include "symmetric.h"
+
+#include <stdio.h>
+#include "hal.h"
+
+#ifdef DBENCH /* optional cycle-count instrumentation */
+#include "test/cpucycles.h"
+extern const uint64_t timing_overhead; /* subtracted from every measurement in DBENCH_STOP */
+extern uint64_t *tred, *tadd, *tmul, *tround, *tsample, *tpack; /* per-category cycle accumulators, defined elsewhere */
+#define DBENCH_START() uint64_t time = cpucycles() /* declares a local variable named time */
+#define DBENCH_STOP(t) t += cpucycles() - time - timing_overhead /* needs a prior DBENCH_START in the same scope */
+#else /* DBENCH not defined: both macros expand to nothing */
+#define DBENCH_START()
+#define DBENCH_STOP(t)
+#endif
+
+/*************************************************
+* Name:        poly_reduce
+*
+* Description: Inplace reduction of all coefficients of polynomial to
+*              representative in [-6283009,6283007].
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_reduce(poly *a) {
+  asm_reduce32(&a->coeffs[0]); /* the assembly routine works directly on the coefficient array */
+}
+
+/*************************************************
+* Name:        poly_caddq
+*
+* Description: For all coefficients of in/out polynomial add Q if
+*              coefficient is negative.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_caddq(poly *a) {
+  asm_caddq(&a->coeffs[0]); /* the conditional addition itself is done in assembly */
+}
+
+/*************************************************
+* Name:        poly_csubq
+*
+* Description: For all coefficients of input polynomial subtract Q if
+*              coefficient is bigger than Q; add Q if coefficient is negative.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_csubq(poly *a) {
+  asm_caddq(&a->coeffs[0]); /* NOTE(review): delegates to asm_caddq, exactly like poly_caddq - confirm this is intended */
+}
+
+#if 0
+/*************************************************
+* Name:        poly_freeze
+*
+* Description: Inplace reduction of all coefficients of polynomial to
+*              standard representatives.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_freeze(poly *a) {
+  asm_freeze(&a->coeffs[0]); /* currently excluded from the build by the enclosing #if 0 */
+}
+#endif
+
+/*************************************************
+* Name:        poly_add
+*
+* Description: Add polynomials. No modular reduction is performed.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first summand
+*              - const poly *b: pointer to second summand
+**************************************************/
+void poly_add(poly *c, const poly *a, const poly *b) {
+  unsigned int k;
+  DBENCH_START();
+
+  /* Coefficient-wise sum a + b; nothing is reduced modulo Q here. */
+  for(k = 0; k != N; k++) { c->coeffs[k] = a->coeffs[k] + b->coeffs[k]; }
+
+  DBENCH_STOP(*tadd);
+}
+
+/*************************************************
+* Name:        poly_sub
+*
+* Description: Subtract polynomials. No modular reduction is
+*              performed.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial to be
+*                               subtracted from first input polynomial
+**************************************************/
+void poly_sub(poly *c, const poly *a, const poly *b) {
+  unsigned int k;
+  DBENCH_START();
+
+  /* Coefficient-wise difference a - b; nothing is reduced modulo Q here. */
+  for(k = 0; k != N; k++) { c->coeffs[k] = a->coeffs[k] - b->coeffs[k]; }
+
+  DBENCH_STOP(*tadd);
+}
+
+/*************************************************
+* Name:        poly_shiftl
+*
+* Description: Multiply polynomial by 2^D without modular reduction. Assumes
+*              input coefficients to be less than 2^{31-D} in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_shiftl(poly *a) {
+  unsigned int i;
+  DBENCH_START();
+
+  /* Shift in uint32_t: left-shifting a negative int32_t is undefined behaviour in C. */
+  for(i = 0; i < N; ++i) a->coeffs[i] = (int32_t)((uint32_t)a->coeffs[i] << D);
+
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_ntt
+*
+* Description: Inplace forward NTT. Coefficients can grow by
+*              8*Q in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_ntt(poly *a) {
+  int32_t *const coeffs = a->coeffs;
+  DBENCH_START();
+
+  ntt(coeffs); /* operates in place; cycles are booked under tmul */
+  DBENCH_STOP(*tmul);
+}
+
+
+/*************************************************
+* Name:        poly_invntt_tomont
+*
+* Description: Inplace inverse NTT and multiplication by 2^{32}.
+*              Input coefficients need to be less than Q in absolute
+*              value and output coefficients are again bounded by Q.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void poly_invntt_tomont(poly *a) {
+  int32_t *const coeffs = a->coeffs;
+  DBENCH_START();
+
+  invntt_tomont(coeffs); /* operates in place; cycles are booked under tmul */
+  DBENCH_STOP(*tmul);
+}
+
+
+/*************************************************
+* Name:        poly_pointwise_montgomery
+*
+* Description: Pointwise multiplication of polynomials in NTT domain
+*              representation and multiplication of resulting polynomial
+*              by 2^{-32}.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) {
+  const int32_t *lhs = a->coeffs, *rhs = b->coeffs;
+  DBENCH_START();
+
+  asm_pointwise_montgomery(c->coeffs, lhs, rhs); /* the product is computed by the assembly routine */
+  DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        poly_pointwise_acc_montgomery
+*
+* Description: Pointwise multiplication of polynomials in NTT domain
+*              representation, multiplication of resulting polynomial
+*              by 2^{-32} and accumulate.
+*
+* Arguments:   - poly *c: pointer to output (accumulating) polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial
+**************************************************/
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b) {
+  const int32_t *lhs = a->coeffs, *rhs = b->coeffs;
+  DBENCH_START();
+
+  asm_pointwise_acc_montgomery(c->coeffs, lhs, rhs); /* c is both read and written by the assembly routine */
+  DBENCH_STOP(*tmul);
+}
+
+
+/*************************************************
+* Name:        poly_power2round
+*
+* Description: For all coefficients c of the input polynomial,
+*              compute c0, c1 such that c mod Q = c1*2^D + c0
+*              with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
+*              standard representatives.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients c0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_power2round(poly *a1, poly *a0, const poly *a) {
+  unsigned int k = 0;
+  DBENCH_START();
+
+  /* The return value lands in a1; power2round() also gets a pointer to the matching a0 slot. */
+  for(; k < N; k++) { a1->coeffs[k] = power2round(&a0->coeffs[k], a->coeffs[k]); }
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_decompose
+*
+* Description: For all coefficients c of the input polynomial,
+*              compute high and low bits c1, c0 such that c mod Q = c1*ALPHA + c0
+*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
+*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients c0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void poly_decompose(poly *a1, poly *a0, const poly *a) {
+  unsigned int k = 0;
+  DBENCH_START();
+
+  /* The return value lands in a1; decompose() also gets a pointer to the matching a0 slot. */
+  for(; k < N; k++) { a1->coeffs[k] = decompose(&a0->coeffs[k], a->coeffs[k]); }
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_make_hint
+*
+* Description: Compute hint polynomial. The coefficients of which indicate
+*              whether the low bits of the corresponding coefficient of
+*              the input polynomial overflow into the high bits.
+*
+* Arguments:   - poly *h: pointer to output hint polynomial
+*              - const poly *a0: pointer to low part of input polynomial
+*              - const poly *a1: pointer to high part of input polynomial
+*
+* Returns number of 1 bits.
+**************************************************/
+unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1) {
+  unsigned int k, ones = 0;
+  DBENCH_START();
+
+  /* Build the hint coefficient by coefficient while summing the produced values. */
+  for(k = 0; k < N; ++k) {
+    ones += (h->coeffs[k] = make_hint(a0->coeffs[k], a1->coeffs[k]));
+  }
+
+  DBENCH_STOP(*tround);
+  return ones;
+}
+
+/*************************************************
+* Name:        poly_use_hint
+*
+* Description: Use hint polynomial to correct the high bits of a polynomial.
+*
+* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
+*              - const poly *a: pointer to input polynomial
+*              - const poly *h: pointer to input hint polynomial
+**************************************************/
+void poly_use_hint(poly *b, const poly *a, const poly *h) {
+  unsigned int k = 0;
+  DBENCH_START();
+
+  /* Each output coefficient is use_hint() of the input coefficient and its hint. */
+  for(; k < N; k++) { b->coeffs[k] = use_hint(a->coeffs[k], h->coeffs[k]); }
+
+  DBENCH_STOP(*tround);
+}
+
+/*************************************************
+* Name:        poly_chknorm
+*
+* Description: Check infinity norm of polynomial against given bound.
+*              Assumes input coefficients were reduced by reduce32().
+*
+* Arguments:   - const poly *a: pointer to polynomial
+*              - int32_t B: norm bound
+*
+* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise.
+**************************************************/
+int poly_chknorm(const poly *a, int32_t B) {
+  unsigned int k;
+  int32_t sign, mag;
+  int exceeded = 0;
+  DBENCH_START();
+
+  if(B > (Q-1)/8)
+    return 1;
+
+  /* Stopping at the first offending coefficient is acceptable because which
+     index fails is independent of secret data. The sign of the centralized
+     representative must not leak, so the absolute value avoids branching. */
+  for(k = 0; k < N; ++k) {
+    /* sign is the coefficient shifted right by 31 and serves as a mask */
+    sign = a->coeffs[k] >> 31;
+    mag = a->coeffs[k] - (sign & 2*a->coeffs[k]);
+    if(mag >= B) {
+      exceeded = 1;
+      break;
+    }
+  }
+
+  DBENCH_STOP(*tsample);
+  return exceeded;
+}
+
+/*************************************************
+* Name:        poly_uniform
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [0,Q-1] by performing rejection sampling on the
+*              output stream of stream128, i.e. SHAKE128(seed|nonce).
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) /* blocks covering at least 768 bytes, rounded up */
+void poly_uniform(poly *a,
+                  const uint8_t seed[SEEDBYTES],
+                  uint16_t nonce)
+{
+  unsigned int i, ctr, off;
+  unsigned int buflen = POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES;
+  uint8_t buf[POLY_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES + 2]; /* +2: room for up to two carried-over bytes (off <= 2) */
+  stream128_state state;
+  /* First pass: squeeze the initial blocks and run the rejection sampler over them. */
+  stream128_init(&state, seed, nonce);
+  stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
+
+  ctr = asm_rej_uniform(a->coeffs, N, buf, buflen);
+  /* Refill one block at a time until all N coefficients have been accepted. */
+  while(ctr < N) {
+    off = buflen % 3; /* presumably asm_rej_uniform consumes 3-byte groups; off is the unused tail - confirm */
+    for(i = 0; i < off; ++i)
+      buf[i] = buf[buflen - off + i];
+    /* The carried-over tail now sits at the front; append one fresh block behind it. */
+    stream128_squeezeblocks(buf + off, 1, &state);
+    buflen = STREAM128_BLOCKBYTES + off;
+    ctr += asm_rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen);
+  }
+}
+
+/*************************************************
+* Name:        rej_eta
+*
+* Description: Sample uniformly random coefficients in [-ETA, ETA] by
+*              performing rejection sampling on array of random bytes.
+*
+* Arguments:   - int32_t *a: pointer to output array (allocated)
+*              - unsigned int len: number of coefficients to be sampled
+*              - const uint8_t *buf: array of random bytes
+*              - unsigned int buflen: length of array of random bytes
+*
+* Returns number of sampled coefficients. Can be smaller than len if not enough
+* random bytes were given.
+**************************************************/
+static unsigned int rej_eta(int32_t *a,
+                            unsigned int len,
+                            const uint8_t *buf,
+                            unsigned int buflen)
+{
+  unsigned int accepted = 0, idx = 0;
+  uint32_t lo, hi;
+  DBENCH_START();
+
+  /* Every input byte yields two 4-bit candidates; the low nibble is tried first. */
+  for(; accepted < len && idx < buflen; ++idx) {
+    lo = buf[idx] & 0x0F;
+    hi = buf[idx] >> 4;
+
+#if ETA == 2
+    if(lo < 15) {
+      lo -= (205*lo >> 10)*5; /* equals lo mod 5 for lo < 15 */
+      a[accepted++] = 2 - lo;
+    }
+    if(hi < 15 && accepted < len) {
+      hi -= (205*hi >> 10)*5; /* equals hi mod 5 for hi < 15 */
+      a[accepted++] = 2 - hi;
+    }
+#elif ETA == 4
+    if(lo < 9)
+      a[accepted++] = 4 - lo;
+    if(hi < 9 && accepted < len)
+      a[accepted++] = 4 - hi;
+#endif
+  }
+
+  DBENCH_STOP(*tsample);
+  return accepted;
+}
+
+/*************************************************
+* Name:        poly_uniform_eta
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [-ETA,ETA] by performing rejection sampling on the
+*              output stream from SHAKE256(seed|nonce).
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#if ETA == 2
+#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+#elif ETA == 4
+#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+#endif
+void poly_uniform_eta(poly *a,
+                      const uint8_t seed[CRHBYTES],
+                      uint16_t nonce) {
+  unsigned int filled;
+  uint8_t stream[POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES];
+  stream256_state xof;
+
+  stream256_init(&xof, seed, nonce);
+  stream256_squeezeblocks(stream, POLY_UNIFORM_ETA_NBLOCKS, &xof);
+  /* Initial pass over every block squeezed above. */
+  filled = rej_eta(a->coeffs, N, stream, sizeof stream);
+
+  /* Top up from one further block at a time until N coefficients exist. */
+  while(filled < N) {
+    stream256_squeezeblocks(stream, 1, &xof);
+    filled += rej_eta(a->coeffs + filled, N - filled, stream, STREAM256_BLOCKBYTES);
+  }
+}
+
+/*************************************************
+* Name:        poly_uniform_gamma1
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream
+*              of SHAKE256(seed|nonce).
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 16-bit nonce
+**************************************************/
+#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+void poly_uniform_gamma1(poly *a,
+                         const uint8_t seed[CRHBYTES],
+                         uint16_t nonce)
+{
+  stream256_state xof;
+  uint8_t packed[POLY_UNIFORM_GAMMA1_NBLOCKS*STREAM256_BLOCKBYTES];
+  /* Whole blocks are squeezed, so packed may hold more than POLYZ_PACKEDBYTES bytes. */
+  stream256_init(&xof, seed, nonce);
+  stream256_squeezeblocks(packed, POLY_UNIFORM_GAMMA1_NBLOCKS, &xof);
+  polyz_unpack(a, packed); /* no rejection step: the stream is decoded directly */
+}
+
+/*************************************************
+* Name:        poly_challenge
+*
+* Description: Implementation of H. Samples polynomial with TAU nonzero
+*              coefficients in {-1,1} using the output stream of
+*              SHAKE256(seed).
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const uint8_t seed[]: byte array containing seed of length SEEDBYTES
+**************************************************/
+void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) {
+  unsigned int k, j, used;
+  uint64_t signbits = 0;
+  uint8_t block[SHAKE256_RATE];
+  shake256incctx xof;
+
+  shake256_inc_init(&xof);
+  shake256_inc_absorb(&xof, seed, SEEDBYTES);
+  shake256_inc_finalize(&xof);
+  shake256_inc_squeezeblocks(block, 1, &xof);
+
+  /* The first 8 output bytes, read little-endian, supply the sign bits. */
+  for(k = 0; k < 8; ++k)
+    signbits |= (uint64_t)block[k] << 8*k;
+  used = 8;
+
+  for(k = 0; k < N; ++k)
+    c->coeffs[k] = 0;
+  for(k = N-TAU; k < N; ++k) {
+    do {
+      if(used >= SHAKE256_RATE) {
+        shake256_inc_squeezeblocks(block, 1, &xof);
+        used = 0;
+      }
+      /* Rejection-sample a position j in [0, k] from the next stream byte. */
+      j = block[used++];
+    } while(j > k);
+
+    c->coeffs[k] = c->coeffs[j];
+    c->coeffs[j] = 1 - 2*(signbits & 1);
+    signbits >>= 1;
+  }
+}
+
+/*************************************************
+* Name:        polyeta_pack
+*
+* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYETA_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyeta_pack(uint8_t *r, const poly *a) {
+  unsigned int i, j;
+  uint8_t t[8];
+  DBENCH_START();
+
+#if ETA == 2
+  /* Eight coefficients per iteration: each is mapped to ETA - x and the
+     resulting values are laid out as consecutive 3-bit fields across
+     three output bytes. */
+  for(i = 0; i < N/8; ++i) {
+    for(j = 0; j < 8; ++j)
+      t[j] = ETA - a->coeffs[8*i+j];
+
+    r[3*i+0]  = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6);
+    r[3*i+1]  = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
+    r[3*i+2]  = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
+  }
+#elif ETA == 4
+  /* Two coefficients per iteration: each is mapped to ETA - x and the
+     two values share one output byte, first coefficient in the low
+     nibble. */
+  for(i = 0; i < N/2; ++i) {
+    for(j = 0; j < 2; ++j)
+      t[j] = ETA - a->coeffs[2*i+j];
+    r[i] = t[0] | (t[1] << 4);
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+
+/*************************************************
+* Name:        polyt1_pack
+*
+* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits.
+*              Input coefficients are assumed to be standard representatives.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYT1_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyt1_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  const int32_t *in = a->coeffs;
+  DBENCH_START();
+  /* Four 10-bit coefficients are squeezed into five output bytes per iteration. */
+  for(i = 0; i < N/4; ++i, in += 4, r += 5) {
+    r[0] = (in[0] >> 0);
+    r[1] = (in[0] >> 8) | (in[1] << 2);
+    r[2] = (in[1] >> 6) | (in[2] << 4);
+    r[3] = (in[2] >> 4) | (in[3] << 6);
+    r[4] = (in[3] >> 2);
+  }
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt1_unpack
+*
+* Description: Unpack polynomial t1 with 10-bit coefficients.
+*              Output coefficients are standard representatives.
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyt1_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  int32_t *out = r->coeffs;
+  DBENCH_START();
+  /* Five input bytes expand to four coefficients, each masked to 10 bits. */
+  for(i = 0; i < N/4; ++i, out += 4, a += 5) {
+    out[0] = ((a[0] >> 0) | ((uint32_t)a[1] << 8)) & 0x3FF;
+    out[1] = ((a[1] >> 2) | ((uint32_t)a[2] << 6)) & 0x3FF;
+    out[2] = ((a[2] >> 4) | ((uint32_t)a[3] << 4)) & 0x3FF;
+    out[3] = ((a[3] >> 6) | ((uint32_t)a[4] << 2)) & 0x3FF;
+  }
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt0_pack
+*
+* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYT0_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyt0_pack(uint8_t *r, const poly *a) {
+  unsigned int i, j;
+  uint32_t t[8];
+  uint8_t *out;
+  DBENCH_START();
+
+  /* Eight coefficients per iteration. Each one is first mapped to
+     (1 << (D-1)) - x, then the eight values are written as thirteen
+     bytes, i.e. 13 bits per coefficient, with the least significant
+     bits of each value stored first. */
+  for(i = 0; i < N/8; ++i) {
+    out = r + 13*i;
+    for(j = 0; j < 8; ++j)
+      t[j] = (1 << (D-1)) - a->coeffs[8*i+j];
+
+    out[ 0]  =  t[0];
+    out[ 1]  =  t[0] >>  8;
+    out[ 1] |=  t[1] <<  5;
+    out[ 2]  =  t[1] >>  3;
+    out[ 3]  =  t[1] >> 11;
+    out[ 3] |=  t[2] <<  2;
+    out[ 4]  =  t[2] >>  6;
+    out[ 4] |=  t[3] <<  7;
+    out[ 5]  =  t[3] >>  1;
+    out[ 6]  =  t[3] >>  9;
+    out[ 6] |=  t[4] <<  4;
+    out[ 7]  =  t[4] >>  4;
+    out[ 8]  =  t[4] >> 12;
+    out[ 8] |=  t[5] <<  1;
+    out[ 9]  =  t[5] >>  7;
+    out[ 9] |=  t[6] <<  6;
+    out[10]  =  t[6] >>  2;
+    out[11]  =  t[6] >> 10;
+    out[11] |=  t[7] <<  3;
+    out[12]  =  t[7] >>  5;
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyt0_unpack
+*
+* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyt0_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+  /* Per iteration, thirteen input bytes are decoded into eight fields masked with 0x1FFF (13 bits). */
+  for(i = 0; i < N/8; ++i) {
+    r->coeffs[8*i+0]  = a[13*i+0];
+    r->coeffs[8*i+0] |= (uint32_t)a[13*i+1] << 8;
+    r->coeffs[8*i+0] &= 0x1FFF;
+
+    r->coeffs[8*i+1]  = a[13*i+1] >> 5;
+    r->coeffs[8*i+1] |= (uint32_t)a[13*i+2] << 3;
+    r->coeffs[8*i+1] |= (uint32_t)a[13*i+3] << 11;
+    r->coeffs[8*i+1] &= 0x1FFF;
+
+    r->coeffs[8*i+2]  = a[13*i+3] >> 2;
+    r->coeffs[8*i+2] |= (uint32_t)a[13*i+4] << 6;
+    r->coeffs[8*i+2] &= 0x1FFF;
+
+    r->coeffs[8*i+3]  = a[13*i+4] >> 7;
+    r->coeffs[8*i+3] |= (uint32_t)a[13*i+5] << 1;
+    r->coeffs[8*i+3] |= (uint32_t)a[13*i+6] << 9;
+    r->coeffs[8*i+3] &= 0x1FFF;
+
+    r->coeffs[8*i+4]  = a[13*i+6] >> 4;
+    r->coeffs[8*i+4] |= (uint32_t)a[13*i+7] << 4;
+    r->coeffs[8*i+4] |= (uint32_t)a[13*i+8] << 12;
+    r->coeffs[8*i+4] &= 0x1FFF;
+
+    r->coeffs[8*i+5]  = a[13*i+8] >> 1;
+    r->coeffs[8*i+5] |= (uint32_t)a[13*i+9] << 7;
+    r->coeffs[8*i+5] &= 0x1FFF;
+
+    r->coeffs[8*i+6]  = a[13*i+9] >> 6;
+    r->coeffs[8*i+6] |= (uint32_t)a[13*i+10] << 2;
+    r->coeffs[8*i+6] |= (uint32_t)a[13*i+11] << 10;
+    r->coeffs[8*i+6] &= 0x1FFF;
+
+    r->coeffs[8*i+7]  = a[13*i+11] >> 3;
+    r->coeffs[8*i+7] |= (uint32_t)a[13*i+12] << 5;
+    r->coeffs[8*i+7] &= 0x1FFF;
+    /* Map every decoded field v to (1 << (D-1)) - v, the same mapping polyt0_pack applies before packing. */
+    r->coeffs[8*i+0] = (1 << (D-1)) - r->coeffs[8*i+0];
+    r->coeffs[8*i+1] = (1 << (D-1)) - r->coeffs[8*i+1];
+    r->coeffs[8*i+2] = (1 << (D-1)) - r->coeffs[8*i+2];
+    r->coeffs[8*i+3] = (1 << (D-1)) - r->coeffs[8*i+3];
+    r->coeffs[8*i+4] = (1 << (D-1)) - r->coeffs[8*i+4];
+    r->coeffs[8*i+5] = (1 << (D-1)) - r->coeffs[8*i+5];
+    r->coeffs[8*i+6] = (1 << (D-1)) - r->coeffs[8*i+6];
+    r->coeffs[8*i+7] = (1 << (D-1)) - r->coeffs[8*i+7];
+  }
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyz_pack
+*
+* Description: Bit-pack polynomial with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1].
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYZ_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyz_pack(uint8_t *r, const poly *a) {
+  unsigned int i, j;
+  uint32_t t[4];
+  DBENCH_START();
+
+#if GAMMA1 == (1 << 17)
+  /* Four coefficients, mapped to GAMMA1 - x, fill nine bytes (18 bits each). */
+  for(i = 0; i < N/4; ++i) {
+    for(j = 0; j < 4; ++j)
+      t[j] = GAMMA1 - a->coeffs[4*i+j];
+
+    r[9*i+0]  = t[0];
+    r[9*i+1]  = t[0] >> 8;
+    r[9*i+2]  = t[0] >> 16;
+    r[9*i+2] |= t[1] << 2;
+    r[9*i+3]  = t[1] >> 6;
+    r[9*i+4]  = t[1] >> 14;
+    r[9*i+4] |= t[2] << 4;
+    r[9*i+5]  = t[2] >> 4;
+    r[9*i+6]  = t[2] >> 12;
+    r[9*i+6] |= t[3] << 6;
+    r[9*i+7]  = t[3] >> 2;
+    r[9*i+8]  = t[3] >> 10;
+  }
+#elif GAMMA1 == (1 << 19)
+  /* Two coefficients, mapped to GAMMA1 - x, fill five bytes (20 bits each). */
+  for(i = 0; i < N/2; ++i) {
+    for(j = 0; j < 2; ++j)
+      t[j] = GAMMA1 - a->coeffs[2*i+j];
+
+    r[5*i+0]  = t[0];
+    r[5*i+1]  = t[0] >> 8;
+    r[5*i+2]  = t[0] >> 16;
+    r[5*i+2] |= t[1] << 4;
+    r[5*i+3]  = t[1] >> 4;
+    r[5*i+4]  = t[1] >> 12;
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyz_unpack
+*
+* Description: Unpack polynomial z with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1].
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void polyz_unpack(poly *r, const uint8_t *a) {
+  unsigned int i;
+  DBENCH_START();
+  /* Inverse of polyz_pack: read fixed-width fields, then map each field v to GAMMA1 - v. */
+#if GAMMA1 == (1 << 17)
+  for(i = 0; i < N/4; ++i) {
+    r->coeffs[4*i+0]  = a[9*i+0];
+    r->coeffs[4*i+0] |= (uint32_t)a[9*i+1] << 8;
+    r->coeffs[4*i+0] |= (uint32_t)a[9*i+2] << 16;
+    r->coeffs[4*i+0] &= 0x3FFFF;
+
+    r->coeffs[4*i+1]  = a[9*i+2] >> 2;
+    r->coeffs[4*i+1] |= (uint32_t)a[9*i+3] << 6;
+    r->coeffs[4*i+1] |= (uint32_t)a[9*i+4] << 14;
+    r->coeffs[4*i+1] &= 0x3FFFF;
+
+    r->coeffs[4*i+2]  = a[9*i+4] >> 4;
+    r->coeffs[4*i+2] |= (uint32_t)a[9*i+5] << 4;
+    r->coeffs[4*i+2] |= (uint32_t)a[9*i+6] << 12;
+    r->coeffs[4*i+2] &= 0x3FFFF;
+
+    r->coeffs[4*i+3]  = a[9*i+6] >> 6;
+    r->coeffs[4*i+3] |= (uint32_t)a[9*i+7] << 2;
+    r->coeffs[4*i+3] |= (uint32_t)a[9*i+8] << 10;
+    r->coeffs[4*i+3] &= 0x3FFFF;
+
+    r->coeffs[4*i+0] = GAMMA1 - r->coeffs[4*i+0];
+    r->coeffs[4*i+1] = GAMMA1 - r->coeffs[4*i+1];
+    r->coeffs[4*i+2] = GAMMA1 - r->coeffs[4*i+2];
+    r->coeffs[4*i+3] = GAMMA1 - r->coeffs[4*i+3];
+  }
+#elif GAMMA1 == (1 << 19)
+  for(i = 0; i < N/2; ++i) {
+    r->coeffs[2*i+0]  = a[5*i+0];
+    r->coeffs[2*i+0] |= (uint32_t)a[5*i+1] << 8;
+    r->coeffs[2*i+0] |= (uint32_t)a[5*i+2] << 16;
+    r->coeffs[2*i+0] &= 0xFFFFF;
+
+    r->coeffs[2*i+1]  = a[5*i+2] >> 4;
+    r->coeffs[2*i+1] |= (uint32_t)a[5*i+3] << 4;
+    r->coeffs[2*i+1] |= (uint32_t)a[5*i+4] << 12;
+    r->coeffs[2*i+1] &= 0xFFFFF; /* mask the coefficient just assembled (it is already at most 20 bits wide) */
+
+    r->coeffs[2*i+0] = GAMMA1 - r->coeffs[2*i+0];
+    r->coeffs[2*i+1] = GAMMA1 - r->coeffs[2*i+1];
+  }
+#endif
+
+  DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        polyw1_pack
+*
+* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43].
+*              Input coefficients are assumed to be standard representatives.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYW1_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void polyw1_pack(uint8_t *r, const poly *a) {
+  unsigned int i;
+  const int32_t *in = a->coeffs;
+  DBENCH_START();
+#if GAMMA2 == (Q-1)/88
+  for(i = 0; i < N/4; ++i, in += 4, r += 3) { /* 4 coefficients x 6 bits -> 3 bytes */
+    r[0]  = in[0];
+    r[0] |= in[1] << 6;
+    r[1]  = in[1] >> 2;
+    r[1] |= in[2] << 4;
+    r[2]  = in[2] >> 4;
+    r[2] |= in[3] << 2;
+  }
+#elif GAMMA2 == (Q-1)/32
+  for(i = 0; i < N/2; ++i, in += 2) /* 2 coefficients x 4 bits -> 1 byte */
+    r[i] = in[0] | (in[1] << 4);
+#endif
+
+  DBENCH_STOP(*tpack);
+}
diff --git a/crypto_sign/dilithium2/m4f/poly.h b/crypto_sign/dilithium2/m4f/poly.h
new file mode 100644
index 0000000..af9e7a5
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/poly.h
@@ -0,0 +1,84 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include <stdint.h>
+#include "params.h"
+
+typedef struct {
+  int32_t coeffs[N]; /* N signed 32-bit coefficients; N is not defined in this header (presumably params.h) */
+} poly;
+
+#define poly_reduce DILITHIUM_NAMESPACE(poly_reduce) /* every public name below is routed through DILITHIUM_NAMESPACE */
+void poly_reduce(poly *a);
+#define poly_caddq DILITHIUM_NAMESPACE(poly_caddq)
+void poly_caddq(poly *a);
+#define poly_csubq DILITHIUM_NAMESPACE(poly_csubq)
+void poly_csubq(poly *a);
+#define poly_freeze DILITHIUM_NAMESPACE(poly_freeze)
+void poly_freeze(poly *a); /* NOTE(review): the poly.c definition sits inside #if 0 - confirm one exists elsewhere or drop this */
+/* Coefficient-wise arithmetic without modular reduction. */
+#define poly_add DILITHIUM_NAMESPACE(poly_add)
+void poly_add(poly *c, const poly *a, const poly *b);
+#define poly_sub DILITHIUM_NAMESPACE(poly_sub)
+void poly_sub(poly *c, const poly *a, const poly *b);
+#define poly_shiftl DILITHIUM_NAMESPACE(poly_shiftl)
+void poly_shiftl(poly *a);
+/* In-place forward NTT. */
+#define poly_ntt DILITHIUM_NAMESPACE(poly_ntt)
+void poly_ntt(poly *a);
+/* In-place inverse NTT and pointwise products in the NTT domain. */
+#define poly_invntt_tomont DILITHIUM_NAMESPACE(poly_invntt_tomont)
+void poly_invntt_tomont(poly *a);
+#define poly_pointwise_montgomery DILITHIUM_NAMESPACE(poly_pointwise_montgomery)
+void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);
+#define poly_pointwise_acc_montgomery DILITHIUM_NAMESPACE(poly_pointwise_acc_montgomery)
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b);
+/* Rounding, decomposition and hints. */
+#define poly_power2round DILITHIUM_NAMESPACE(poly_power2round)
+void poly_power2round(poly *a1, poly *a0, const poly *a);
+#define poly_decompose DILITHIUM_NAMESPACE(poly_decompose)
+void poly_decompose(poly *a1, poly *a0, const poly *a);
+#define poly_make_hint DILITHIUM_NAMESPACE(poly_make_hint)
+unsigned int poly_make_hint(poly *h, const poly *a0, const poly *a1);
+#define poly_use_hint DILITHIUM_NAMESPACE(poly_use_hint)
+void poly_use_hint(poly *b, const poly *a, const poly *h);
+/* Norm check and sampling from a seed and a 16-bit nonce. */
+#define poly_chknorm DILITHIUM_NAMESPACE(poly_chknorm)
+int poly_chknorm(const poly *a, int32_t B);
+#define poly_uniform DILITHIUM_NAMESPACE(poly_uniform)
+void poly_uniform(poly *a,
+                  const uint8_t seed[SEEDBYTES],
+                  uint16_t nonce);
+#define poly_uniform_eta DILITHIUM_NAMESPACE(poly_uniform_eta)
+void poly_uniform_eta(poly *a,
+                      const uint8_t seed[CRHBYTES],
+                      uint16_t nonce);
+#define poly_uniform_gamma1 DILITHIUM_NAMESPACE(poly_uniform_gamma1)
+void poly_uniform_gamma1(poly *a,
+                         const uint8_t seed[CRHBYTES],
+                         uint16_t nonce);
+#define poly_challenge DILITHIUM_NAMESPACE(poly_challenge)
+void poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);
+/* Bit-packing between polynomials and byte arrays. */
+#define polyeta_pack DILITHIUM_NAMESPACE(polyeta_pack)
+void polyeta_pack(uint8_t *r, const poly *a);
+
+#define polyt1_pack DILITHIUM_NAMESPACE(polyt1_pack)
+void polyt1_pack(uint8_t *r, const poly *a);
+#define polyt1_unpack DILITHIUM_NAMESPACE(polyt1_unpack)
+void polyt1_unpack(poly *r, const uint8_t *a);
+
+#define polyt0_pack DILITHIUM_NAMESPACE(polyt0_pack)
+void polyt0_pack(uint8_t *r, const poly *a);
+#define polyt0_unpack DILITHIUM_NAMESPACE(polyt0_unpack)
+void polyt0_unpack(poly *r, const uint8_t *a);
+
+#define polyz_pack DILITHIUM_NAMESPACE(polyz_pack)
+void polyz_pack(uint8_t *r, const poly *a);
+#define polyz_unpack DILITHIUM_NAMESPACE(polyz_unpack)
+void polyz_unpack(poly *r, const uint8_t *a);
+
+#define polyw1_pack DILITHIUM_NAMESPACE(polyw1_pack)
+void polyw1_pack(uint8_t *r, const poly *a);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/polyvec.c b/crypto_sign/dilithium2/m4f/polyvec.c
new file mode 100644
index 0000000..e20749c
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/polyvec.c
@@ -0,0 +1,429 @@
+#include <stdint.h>
+#include "params.h"
+#include "polyvec.h"
+#include "poly.h"
+
+#include <stdio.h>
+#include "hal.h"
+
+/*************************************************
+* Name:        polyvec_matrix_expand
+*
+* Description: Implementation of ExpandA. Generates matrix A with uniformly
+*              random coefficients a_{i,j} by performing rejection
+*              sampling on the output stream of SHAKE128(rho|j|i).
+*
+* Arguments:   - polyvecl mat[K]: output matrix
+*              - const uint8_t rho[]: byte array containing seed rho
+**************************************************/
+void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
+  unsigned int row, col;
+  /* Entry (row, col) is sampled under the 16-bit nonce 256*row + col. */
+  for(row = 0; row < K; ++row)
+    for(col = 0; col < L; ++col)
+      poly_uniform(&mat[row].vec[col], rho, (uint16_t)((row << 8) + col));
+}
+
+void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
+  unsigned int row = 0;
+  /* One accumulated row-times-vector product per matrix row, stored in t[row]. */
+  for(; row < K; ++row)
+    polyvecl_pointwise_acc_montgomery(t->vec + row, mat + row, v);
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length L **************/
+/**************************************************************/
+
+void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  unsigned int idx;
+  /* Entry idx is sampled with nonce + idx; the nonce is a 16-bit counter. */
+  for(idx = 0; idx < L; ++idx, ++nonce)
+    poly_uniform_eta(v->vec + idx, seed, nonce);
+}
+
+void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  unsigned int idx;
+  /* Each call uses the L consecutive per-polynomial nonces starting at L*nonce. */
+  for(idx = 0; idx < L; ++idx)
+    poly_uniform_gamma1(v->vec + idx, seed, (uint16_t)(L*nonce + idx));
+}
+
+void polyvecl_reduce(polyvecl *v) {
+  poly *cur, *const end = v->vec + L;
+  /* Apply poly_reduce() in place to every entry of v. */
+  for(cur = v->vec; cur != end; ++cur)
+    poly_reduce(cur);
+}
+
+#if 0 /* compiled out: the definition below is not built */
+/*************************************************
+* Name:        polyvecl_freeze
+*
+* Description: Reduce coefficients of polynomials in vector of length L
+*              to standard representatives via poly_freeze().
+* NOTE(review): still declared in polyvec.h while disabled here -- confirm unused.
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void polyvecl_freeze(polyvecl *v) {
+  unsigned int i;
+
+  for(i = 0; i < L; ++i)
+    poly_freeze(&v->vec[i]);
+}
+#endif
+
+/*************************************************
+* Name:        polyvecl_add
+*
+* Description: Add vectors of polynomials of length L.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyvecl *w: pointer to output vector
+*              - const polyvecl *u: pointer to first summand
+*              - const polyvecl *v: pointer to second summand
+**************************************************/
+void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
+  unsigned int k;
+  /* w[k] = u[k] + v[k] for every k < L, delegated to poly_add(). */
+  for(k = 0; k != L; k++)
+    poly_add(w->vec + k, u->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyvecl_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length L. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void polyvecl_ntt(polyvecl *v) {
+  unsigned int idx = 0;
+  /* In-place forward NTT of every entry, in index order. */
+  while(idx < L)
+    poly_ntt(&v->vec[idx++]);
+}
+
+void polyvecl_invntt_tomont(polyvecl *v) {
+  poly *cur = v->vec;
+  /* Run poly_invntt_tomont() in place on each of the L entries. */
+  for(; cur < v->vec + L; cur++)
+    poly_invntt_tomont(cur);
+}
+
+void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
+  unsigned int col;
+  /* The single polynomial a is multiplied pointwise into every entry of v. */
+  for(col = 0; col < L; col++)
+    poly_pointwise_montgomery(r->vec + col, a, v->vec + col);
+}
+
+
+
+/*************************************************
+* Name:        polyvecl_pointwise_acc_montgomery
+*
+* Description: Pointwise multiply vectors of polynomials of length L, multiply
+*              resulting vector by 2^{-32} and add (accumulate) polynomials
+*              in it. Input/output vectors are in NTT domain representation.
+*
+* Arguments:   - poly *w: output polynomial
+*              - const polyvecl *u: pointer to first input vector
+*              - const polyvecl *v: pointer to second input vector
+**************************************************/
+void polyvecl_pointwise_acc_montgomery(poly *w,
+                                       const polyvecl *u,
+                                       const polyvecl *v)
+{
+  unsigned int term = 1;
+
+  /* The first product initialises w; the remaining ones are accumulated into it. */
+  poly_pointwise_montgomery(w, u->vec, v->vec);
+  for(; term < L; term++)
+    poly_pointwise_acc_montgomery(w, u->vec + term, v->vec + term);
+}
+
+/*************************************************
+* Name:        polyvecl_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length L.
+*              Assumes input polyvecl to be reduced by polyvecl_reduce().
+*
+* Arguments:   - const polyvecl *v: pointer to vector
+*              - int32_t bound: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8
+* and 1 otherwise.
+**************************************************/
+int polyvecl_chknorm(const polyvecl *v, int32_t bound)  {
+  const poly *cur = v->vec;
+  const poly *const end = v->vec + L;
+  /* Stop at the first polynomial for which poly_chknorm() reports a violation. */
+  for(; cur != end; ++cur)
+    if(poly_chknorm(cur, bound) != 0)
+      return 1;
+  return 0;
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length K **************/
+/**************************************************************/
+
+void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+  unsigned int idx;
+  /* Entry idx is sampled with nonce + idx; the nonce is a 16-bit counter. */
+  for(idx = 0; idx < K; ++idx, ++nonce)
+    poly_uniform_eta(v->vec + idx, seed, nonce);
+}
+
+/*************************************************
+* Name:        polyveck_reduce
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+*              to representatives in [-6283009,6283007].
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_reduce(polyveck *v) {
+  poly *cur, *const end = v->vec + K;
+  /* Apply poly_reduce() in place to every entry of v. */
+  for(cur = v->vec; cur != end; ++cur)
+    poly_reduce(cur);
+}
+
+/*************************************************
+* Name:        polyveck_caddq
+*
+* Description: For all coefficients of polynomials in vector of length K
+*              add Q if coefficient is negative.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_caddq(polyveck *v) {
+  unsigned int idx = 0;
+  /* Apply poly_caddq() in place to each of the K entries. */
+  while(idx < K)
+    poly_caddq(&v->vec[idx++]);
+}
+
+#if 0 /* compiled out: the definition below is not built */
+/*************************************************
+* Name:        polyveck_freeze
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+*              to standard representatives via poly_freeze().
+* NOTE(review): still declared in polyvec.h while disabled here -- confirm unused.
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_freeze(polyveck *v)  {
+  unsigned int i;
+
+  for(i = 0; i < K; ++i)
+    poly_freeze(&v->vec[i]);
+}
+#endif
+
+/*************************************************
+* Name:        polyveck_add
+*
+* Description: Add vectors of polynomials of length K.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first summand
+*              - const polyveck *v: pointer to second summand
+**************************************************/
+void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
+  unsigned int k;
+  /* w[k] = u[k] + v[k] for every k < K, delegated to poly_add(). */
+  for(k = 0; k != K; k++)
+    poly_add(w->vec + k, u->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyveck_sub
+*
+* Description: Subtract vectors of polynomials of length K.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first input vector
+*              - const polyveck *v: pointer to second input vector to be
+*                                   subtracted from first input vector
+**************************************************/
+void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
+  unsigned int k;
+  /* w[k] = u[k] - v[k] for every k < K, delegated to poly_sub(). */
+  for(k = 0; k != K; k++)
+    poly_sub(w->vec + k, u->vec + k, v->vec + k);
+}
+
+/*************************************************
+* Name:        polyveck_shiftl
+*
+* Description: Multiply vector of polynomials of Length K by 2^D without modular
+*              reduction. Assumes input coefficients to be less than 2^{31-D}.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_shiftl(polyveck *v) {
+  poly *cur = v->vec;
+  /* Apply poly_shiftl() in place to each of the K entries. */
+  for(; cur < v->vec + K; cur++)
+    poly_shiftl(cur);
+}
+
+/*************************************************
+* Name:        polyveck_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length K. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_ntt(polyveck *v) {
+  unsigned int idx = 0;
+  /* In-place forward NTT of every entry, in index order. */
+  while(idx < K)
+    poly_ntt(&v->vec[idx++]);
+}
+
+
+
+/*************************************************
+* Name:        polyveck_invntt_tomont
+*
+* Description: Inverse NTT and multiplication by 2^{32} of polynomials
+*              in vector of length K. Input coefficients need to be less
+*              than 2*Q.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void polyveck_invntt_tomont(polyveck *v) {
+  poly *cur, *const end = v->vec + K;
+  /* Run poly_invntt_tomont() in place on each of the K entries. */
+  for(cur = v->vec; cur != end; ++cur)
+    poly_invntt_tomont(cur);
+}
+
+
+void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
+  unsigned int row;
+  /* The single polynomial a is multiplied pointwise into every entry of v. */
+  for(row = 0; row < K; row++)
+    poly_pointwise_montgomery(r->vec + row, a, v->vec + row);
+}
+
+
+/*************************************************
+* Name:        polyveck_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length K.
+*              Assumes input polyveck to be reduced by polyveck_reduce().
+*
+* Arguments:   - const polyveck *v: pointer to vector
+*              - int32_t bound: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8
+* and 1 otherwise.
+**************************************************/
+int polyveck_chknorm(const polyveck *v, int32_t bound) {
+  const poly *cur = v->vec;
+  const poly *const end = v->vec + K;
+  /* Stop at the first polynomial for which poly_chknorm() reports a violation. */
+  for(; cur != end; ++cur)
+    if(poly_chknorm(cur, bound) != 0)
+      return 1;
+  return 0;
+}
+
+/*************************************************
+* Name:        polyveck_power2round
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
+*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
+*              standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
+  unsigned int row;
+  /* Split each entry of v into its high (v1) and low (v0) parts. */
+  for(row = 0; row != K; row++)
+    poly_power2round(v1->vec + row, v0->vec + row, v->vec + row);
+}
+
+/*************************************************
+* Name:        polyveck_decompose
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0
+*              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
+*              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
+  unsigned int row = 0;
+  /* Entry-wise poly_decompose(): high bits go to v1, low bits to v0. */
+  for(; row < K; ++row)
+    poly_decompose(v1->vec + row, v0->vec + row, v->vec + row);
+}
+
+/*************************************************
+* Name:        polyveck_make_hint
+*
+* Description: Compute hint vector.
+*
+* Arguments:   - polyveck *h: pointer to output vector
+*              - const polyveck *v0: pointer to low part of input vector
+*              - const polyveck *v1: pointer to high part of input vector
+*
+* Returns number of 1 bits.
+**************************************************/
+unsigned int polyveck_make_hint(polyveck *h,
+                                const polyveck *v0,
+                                const polyveck *v1)
+{
+  unsigned int row;
+  unsigned int ones = 0;
+  /* Sum the per-polynomial counts returned by poly_make_hint(). */
+  for(row = 0; row < K; row++)
+    ones += poly_make_hint(h->vec + row, v0->vec + row, v1->vec + row);
+  return ones;
+}
+
+/*************************************************
+* Name:        polyveck_use_hint
+*
+* Description: Use hint vector to correct the high bits of input vector.
+*
+* Arguments:   - polyveck *w: pointer to output vector of polynomials with
+*                             corrected high bits
+*              - const polyveck *u: pointer to input vector
+*              - const polyveck *h: pointer to input hint vector
+**************************************************/
+void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
+  unsigned int row;
+  /* Entry-wise poly_use_hint(): w[row] is derived from u[row] and h[row]. */
+  for(row = 0; row != K; row++)
+    poly_use_hint(w->vec + row, u->vec + row, h->vec + row);
+}
+
+void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1) {
+  unsigned int row;
+  /* Packed polynomials are laid out back to back, POLYW1_PACKEDBYTES each. */
+  for(row = 0; row < K; ++row, r += POLYW1_PACKEDBYTES)
+    polyw1_pack(r, &w1->vec[row]);
+}
diff --git a/crypto_sign/dilithium2/m4f/polyvec.h b/crypto_sign/dilithium2/m4f/polyvec.h
new file mode 100644
index 0000000..d92cd75
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/polyvec.h
@@ -0,0 +1,99 @@
+#ifndef POLYVEC_H
+#define POLYVEC_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+/* Vector of L polynomials; also the row type of the K-row matrix filled by polyvec_matrix_expand(). */
+typedef struct {
+  poly vec[L];
+} polyvecl;
+
+#define polyvecl_uniform_eta DILITHIUM_NAMESPACE(polyvecl_uniform_eta)
+void polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);
+
+#define polyvecl_uniform_gamma1 DILITHIUM_NAMESPACE(polyvecl_uniform_gamma1)
+void polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce);
+
+#define polyvecl_reduce DILITHIUM_NAMESPACE(polyvecl_reduce)
+void polyvecl_reduce(polyvecl *v);
+
+#define polyvecl_freeze DILITHIUM_NAMESPACE(polyvecl_freeze)
+void polyvecl_freeze(polyvecl *v);
+
+#define polyvecl_add DILITHIUM_NAMESPACE(polyvecl_add)
+void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);
+
+#define polyvecl_ntt DILITHIUM_NAMESPACE(polyvecl_ntt)
+void polyvecl_ntt(polyvecl *v);
+#define polyvecl_invntt_tomont DILITHIUM_NAMESPACE(polyvecl_invntt_tomont)
+void polyvecl_invntt_tomont(polyvecl *v);
+#define polyvecl_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyvecl_pointwise_poly_montgomery)
+void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
+#define polyvecl_pointwise_acc_montgomery \
+        DILITHIUM_NAMESPACE(polyvecl_pointwise_acc_montgomery)
+void polyvecl_pointwise_acc_montgomery(poly *w,
+                                       const polyvecl *u,
+                                       const polyvecl *v);
+
+
+#define polyvecl_chknorm DILITHIUM_NAMESPACE(polyvecl_chknorm)
+int polyvecl_chknorm(const polyvecl *v, int32_t B);
+
+
+
+/* Vector of K polynomials, e.g. the output of polyvec_matrix_pointwise_montgomery(). */
+typedef struct {
+  poly vec[K];
+} polyveck;
+
+#define polyveck_uniform_eta DILITHIUM_NAMESPACE(polyveck_uniform_eta)
+void polyveck_uniform_eta(polyveck *v, const uint8_t seed[CRHBYTES], uint16_t nonce);
+
+#define polyveck_reduce DILITHIUM_NAMESPACE(polyveck_reduce)
+void polyveck_reduce(polyveck *v);
+#define polyveck_caddq DILITHIUM_NAMESPACE(polyveck_caddq)
+void polyveck_caddq(polyveck *v);
+#define polyveck_freeze DILITHIUM_NAMESPACE(polyveck_freeze)
+void polyveck_freeze(polyveck *v);
+
+#define polyveck_add DILITHIUM_NAMESPACE(polyveck_add)
+void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
+#define polyveck_sub DILITHIUM_NAMESPACE(polyveck_sub)
+void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
+#define polyveck_shiftl DILITHIUM_NAMESPACE(polyveck_shiftl)
+void polyveck_shiftl(polyveck *v);
+
+#define polyveck_ntt DILITHIUM_NAMESPACE(polyveck_ntt)
+void polyveck_ntt(polyveck *v);
+#define polyveck_invntt_tomont DILITHIUM_NAMESPACE(polyveck_invntt_tomont)
+void polyveck_invntt_tomont(polyveck *v);
+#define polyveck_pointwise_poly_montgomery DILITHIUM_NAMESPACE(polyveck_pointwise_poly_montgomery)
+void polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);
+
+
+#define polyveck_chknorm DILITHIUM_NAMESPACE(polyveck_chknorm)
+int polyveck_chknorm(const polyveck *v, int32_t B);
+
+#define polyveck_power2round DILITHIUM_NAMESPACE(polyveck_power2round)
+void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
+#define polyveck_decompose DILITHIUM_NAMESPACE(polyveck_decompose)
+void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
+#define polyveck_make_hint DILITHIUM_NAMESPACE(polyveck_make_hint)
+unsigned int polyveck_make_hint(polyveck *h,
+                                const polyveck *v0,
+                                const polyveck *v1);
+#define polyveck_use_hint DILITHIUM_NAMESPACE(polyveck_use_hint)
+void polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);
+
+#define polyveck_pack_w1 DILITHIUM_NAMESPACE(polyveck_pack_w1)
+void polyveck_pack_w1(uint8_t r[K*POLYW1_PACKEDBYTES], const polyveck *w1);
+
+#define polyvec_matrix_expand DILITHIUM_NAMESPACE(polyvec_matrix_expand)
+void polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
+
+#define polyvec_matrix_pointwise_montgomery DILITHIUM_NAMESPACE(polyvec_matrix_pointwise_montgomery)
+void polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/reduce.h b/crypto_sign/dilithium2/m4f/reduce.h
new file mode 100644
index 0000000..02df550
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/reduce.h
@@ -0,0 +1,29 @@
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define MONT -4186625 // 2^32 % Q
+#define QINV 58728449 // q^(-1) mod 2^32
+
+#define montgomery_reduce DILITHIUM_NAMESPACE(montgomery_reduce)
+/*************************************************
+* Name:        montgomery_reduce
+*
+* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31,
+*              compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q.
+*
+* Arguments:   - int64_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static inline int32_t montgomery_reduce(int64_t a) {
+  /* With QINV = Q^-1 mod 2^32, m*Q matches a in the low 32 bits, so they cancel. */
+  const int32_t m = (int32_t)((int64_t)(int32_t)a * QINV);
+  const int64_t diff = a - (int64_t)m * Q;
+
+  return (int32_t)(diff >> 32);
+}
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/rounding.c b/crypto_sign/dilithium2/m4f/rounding.c
new file mode 100644
index 0000000..889f0a2
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/rounding.c
@@ -0,0 +1,102 @@
+#include <stdint.h>
+#include "params.h"
+#include "rounding.h"
+
+/*************************************************
+* Name:        power2round
+*
+* Description: For finite field element a, compute a0, a1 such that
+*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
+*              Assumes a to be standard representative.
+*
+* Arguments:   - int32_t a: input element
+*              - int32_t *a0: pointer to output element a0
+*
+* Returns a1.
+**************************************************/
+int32_t power2round(int32_t *a0, int32_t a)  {
+  /* Bias by 2^(D-1) - 1 before shifting so that a0 lands in (-2^(D-1), 2^(D-1)]. */
+  const int32_t half_minus_one = (1 << (D-1)) - 1;
+  const int32_t high = (a + half_minus_one) >> D;
+  *a0 = a - (high << D);
+  return high;
+}
+
+/*************************************************
+* Name:        decompose
+*
+* Description: For finite field element a, compute high and low bits a0, a1 such
+*              that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
+*              if a1 = (Q-1)/ALPHA where we set a1 = 0 and
+*              -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard
+*              representative.
+*
+* Arguments:   - int32_t a: input element
+*              - int32_t *a0: pointer to output element a0
+*
+* Returns a1.
+**************************************************/
+int32_t decompose(int32_t *a0, int32_t a) {
+  int32_t a1;
+  /* First stage: a1 = ceil(a / 128) for non-negative a. */
+  a1  = (a + 127) >> 7;
+#if GAMMA2 == (Q-1)/32
+  a1  = (a1*1025 + (1 << 21)) >> 22;  /* rounded multiply-and-shift; presumably a division by 2*GAMMA2/128 -- TODO confirm */
+  a1 &= 15;                           /* keep the low 4 bits, so a value of 16 wraps to 0 */
+#elif GAMMA2 == (Q-1)/88
+  a1  = (a1*11275 + (1 << 23)) >> 24; /* rounded multiply-and-shift; presumably a division by 2*GAMMA2/128 -- TODO confirm */
+  a1 ^= ((43 - a1) >> 31) & a1;       /* branch-free: a1 > 43 becomes 0 (relies on arithmetic right shift) */
+#endif
+  /* Low part; the mask below is all-ones exactly when a0 > (Q-1)/2, in which case Q is subtracted. */
+  *a0  = a - a1*2*GAMMA2;
+  *a0 -= (((Q-1)/2 - *a0) >> 31) & Q;
+  return a1;
+}
+
+/*************************************************
+* Name:        make_hint
+*
+* Description: Compute hint bit indicating whether the low bits of the
+*              input element overflow into the high bits.
+*
+* Arguments:   - int32_t a0: low bits of input element
+*              - int32_t a1: high bits of input element
+*
+* Returns 1 if overflow.
+**************************************************/
+unsigned int make_hint(int32_t a0, int32_t a1) {
+  /* No hint while a0 is in (-GAMMA2, GAMMA2], or equals -GAMMA2 with a1 == 0. */
+  const int in_range = (a0 <= GAMMA2) && (a0 > -GAMMA2);
+  const int low_edge = (a0 == -GAMMA2) && (a1 == 0);
+  return (in_range || low_edge) ? 0u : 1u;
+}
+
+/*************************************************
+* Name:        use_hint
+*
+* Description: Correct high bits according to hint.
+*
+* Arguments:   - int32_t a: input element
+*              - unsigned int hint: hint bit
+*
+* Returns corrected high bits.
+**************************************************/
+int32_t use_hint(int32_t a, unsigned int hint) {
+  int32_t a0, a1;
+
+  a1 = decompose(&a0, a);
+  if(hint == 0)
+    return a1;
+
+#if GAMMA2 == (Q-1)/32
+  if(a0 > 0)
+    return (a1 + 1) & 15;
+  else
+    return (a1 - 1) & 15;
+#elif GAMMA2 == (Q-1)/88
+  if(a0 > 0)
+    return (a1 == 43) ?  0 : a1 + 1;
+  else
+    return (a1 ==  0) ? 43 : a1 - 1;
+#endif
+}
diff --git a/crypto_sign/dilithium2/m4f/rounding.h b/crypto_sign/dilithium2/m4f/rounding.h
new file mode 100644
index 0000000..b72e8e8
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/rounding.h
@@ -0,0 +1,19 @@
+#ifndef ROUNDING_H
+#define ROUNDING_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define power2round DILITHIUM_NAMESPACE(power2round)
+int32_t power2round(int32_t *a0, int32_t a);
+
+#define decompose DILITHIUM_NAMESPACE(decompose)
+int32_t decompose(int32_t *a0, int32_t a);
+
+#define make_hint DILITHIUM_NAMESPACE(make_hint)
+unsigned int make_hint(int32_t a0, int32_t a1);
+
+#define use_hint DILITHIUM_NAMESPACE(use_hint)
+int32_t use_hint(int32_t a, unsigned int hint);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/sign.c b/crypto_sign/dilithium2/m4f/sign.c
new file mode 100644
index 0000000..d1c5222
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/sign.c
@@ -0,0 +1,391 @@
+#include <stdint.h>
+#include "params.h"
+#include "sign.h"
+#include "packing.h"
+#include "polyvec.h"
+#include "poly.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "smallpoly.h"
+
+/*************************************************
+* Name:        crypto_sign_keypair
+*
+* Description: Generates public and private key.
+*
+* Arguments:   - uint8_t *pk: pointer to output public key (allocated
+*                             array of CRYPTO_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key (allocated
+*                             array of CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+  uint8_t seedbuf[2*SEEDBYTES + CRHBYTES];  /* layout: rho | rhoprime | key */
+  uint8_t tr[TRBYTES];
+  const uint8_t *rho, *rhoprime, *key;
+  polyvecl mat[K];
+  polyvecl s1, s1hat;
+  polyveck s2, t1, t0;
+
+  /* Draw SEEDBYTES of randomness, then expand it in place into rho, rhoprime and key */
+  randombytes(seedbuf, SEEDBYTES);
+  shake256(seedbuf, 2*SEEDBYTES + CRHBYTES, seedbuf, SEEDBYTES);
+  rho = seedbuf;
+  rhoprime = rho + SEEDBYTES;
+  key = rhoprime + CRHBYTES;
+
+  /* Expand the full K x L matrix from rho */
+  polyvec_matrix_expand(mat, rho);
+
+  /* Sample short vectors s1 and s2; s2's nonces start at L, right after the L used for s1 */
+  polyvecl_uniform_eta(&s1, rhoprime, 0);
+  polyveck_uniform_eta(&s2, rhoprime, L);
+
+  /* Matrix-vector multiplication on an NTT copy, so s1 itself stays available for pack_sk */
+  s1hat = s1;
+  polyvecl_ntt(&s1hat);
+  polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
+  polyveck_reduce(&t1);
+  polyveck_invntt_tomont(&t1);
+
+  /* Add error vector s2 */
+  polyveck_add(&t1, &t1, &s2);
+
+  /* Split in place: t1 keeps the high part (public key), t0 receives the low part */
+  polyveck_caddq(&t1);
+  polyveck_power2round(&t1, &t0, &t1);
+  pack_pk(pk, rho, &t1);
+
+  /* tr = SHAKE256 over the packed public key; then write the secret key */
+  shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
+
+  return 0;
+}
+
+
+/*************************************************
+* Name:        crypto_sign_signature
+*
+* Description: Computes signature.
+*
+* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
+*              - size_t *siglen: pointer to output length of signature
+*              - uint8_t *m:     pointer to message to be signed
+*              - size_t mlen:    length of message
+*              - uint8_t *sk:    pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_signature(uint8_t *sig,
+                          size_t *siglen,
+                          const uint8_t *m,
+                          size_t mlen,
+                          const uint8_t *sk)
+{
+  uint8_t seedbuf[2 * SEEDBYTES + TRBYTES + RNDBYTES + 2 * CRHBYTES];  /* rho | tr | key | rnd | mu | rhoprime */
+  uint8_t *rho, *tr, *key, *mu, *rhoprime, *rnd;
+  uint16_t nonce = 0;  /* incremented once per signing attempt */
+  unsigned int n;
+  polyvecl mat[K], y, z;
+  polyveck t0, w1, w0;
+  poly cp;
+  shake256incctx state;
+
+  smallpoly s1_prime[L];
+  smallpoly s2_prime[K];
+  smallpoly cp_small;
+  smallhalfpoly cp_small_prime;
+
+  rho = seedbuf;
+  tr = rho + SEEDBYTES;
+  key = tr + TRBYTES;
+  rnd = key + SEEDBYTES;
+  mu = rnd + RNDBYTES;
+  rhoprime = mu + CRHBYTES;
+  unpack_sk(rho, tr, key, &t0, s1_prime, s2_prime, sk);
+
+  /* Compute mu = CRH(tr, msg) */
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, tr, TRBYTES);
+  shake256_inc_absorb(&state, m, mlen);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(mu, CRHBYTES, &state);
+  /* rnd is set to all zeros, so rhoprime below depends only on key and mu */
+  for (n = 0; n < RNDBYTES; n++) {
+     rnd[n] = 0;
+  }
+  shake256(rhoprime, CRHBYTES, key, SEEDBYTES + RNDBYTES + CRHBYTES);  /* input is the contiguous key | rnd | mu */
+
+  /* Expand matrix and transform vectors (done once, outside the rejection loop) */
+  polyvec_matrix_expand(mat, rho);
+  polyvecl_small_ntt(s1_prime);
+  polyveck_small_ntt(s2_prime);
+
+  polyveck_ntt(&t0);
+
+rej:  /* every rejected attempt jumps back here and retries with the next nonce */
+  /* Sample intermediate vector y */
+  polyvecl_uniform_gamma1(&y, rhoprime, nonce++);
+
+  /* Matrix-vector multiplication on a copy, so y stays in the normal domain */
+  z = y;
+  polyvecl_ntt(&z);
+  polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+  polyveck_reduce(&w1);
+  polyveck_invntt_tomont(&w1);
+
+  /* Decompose w and call the random oracle; sig is used as scratch for the packed w1 */
+  polyveck_caddq(&w1);
+  polyveck_decompose(&w1, &w0, &w1);
+  polyveck_pack_w1(sig, &w1);
+
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, CRHBYTES);
+  shake256_inc_absorb(&state, sig, K*POLYW1_PACKEDBYTES);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(sig, CTILDEBYTES, &state);  /* challenge hash overwrites the start of sig */
+  poly_challenge(&cp, sig);
+  /* Derive the small-NTT form of the challenge before cp itself is transformed */
+  poly_small_ntt_precomp(&cp_small, &cp_small_prime, &cp);
+  poly_ntt(&cp);
+
+  /* Compute z, reject if it reveals secret */
+  polyvecl_small_basemul_invntt(&z, &cp_small, &cp_small_prime, s1_prime);
+
+  polyvecl_add(&z, &z, &y);
+  polyvecl_reduce(&z);
+  if(polyvecl_chknorm(&z, GAMMA1 - BETA))
+    goto rej;
+
+
+  /* Write signature; once z is packed, z.vec[0] is reused below as scratch */
+  pack_sig_z(sig, &z);
+  unsigned int hint_n = 0;         /* re-initialised on every attempt that reaches this point */
+  unsigned int hints_written = 0;
+  /* Check that subtracting cs2 does not change high bits of w and low bits
+   * do not reveal secret information */
+  for(unsigned int i = 0; i < K; ++i) {
+    poly *tmp = &z.vec[0];
+    poly_small_basemul_invntt(tmp, &cp_small, &cp_small_prime, &s2_prime[i]);
+
+    poly_sub(&w0.vec[i], &w0.vec[i], tmp);
+    poly_reduce(&w0.vec[i]);
+    if(poly_chknorm(&w0.vec[i], GAMMA2 - BETA))
+      goto rej;
+
+    /* Compute hints for w1: tmp = c * t0[i] */
+    poly_pointwise_montgomery(tmp, &cp, &t0.vec[i]);
+
+    poly_invntt_tomont(tmp);
+    poly_reduce(tmp);
+
+    if(poly_chknorm(tmp, GAMMA2))
+      goto rej;
+    poly_add(&w0.vec[i], &w0.vec[i], tmp);
+    hint_n += poly_make_hint(tmp, &w0.vec[i], &w1.vec[i]);  /* tmp now holds the hint polynomial */
+    if (hint_n > OMEGA) {
+      goto rej;
+    }
+    pack_sig_h(sig, tmp, i, &hints_written);
+  }
+  pack_sig_h_zero(sig, &hints_written);
+  *siglen = CRYPTO_BYTES;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign
+*
+* Description: Compute signed message.
+*
+* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
+*                             array with CRYPTO_BYTES + mlen bytes),
+*                             can be equal to m
+*              - size_t *smlen: pointer to output length of signed
+*                               message
+*              - const uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign(uint8_t *sm,
+                size_t *smlen,
+                const uint8_t *m,
+                size_t mlen,
+                const uint8_t *sk)
+{
+  size_t pos = mlen;
+  /* Copy back to front so that m may equal sm without being clobbered. */
+  while(pos-- > 0)
+    sm[CRYPTO_BYTES + pos] = m[pos];
+  (void)crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk);
+  *smlen += mlen;
+  return 0;
+}
+/*************************************************
+ * Name:        expand_mat_elem
+ *
+ * Description: Implementation of ExpandA. Generates matrix A with uniformly
+ *              random coefficients a_{i,j} by performing rejection
+ *              sampling on the output stream of SHAKE128(rho|i|j).
+ *
+ * Arguments:   - poly mat_elem: output matrix element
+ *              - const unsigned char rho[]: byte array containing seed rho
+ *              - k_idx: matrix row index
+ *              - l_idx: matrix col index
+ **************************************************/
+static void expand_mat_elem(poly *mat_elem, const unsigned char rho[SEEDBYTES], size_t k_idx, size_t l_idx) {
+  const uint16_t nonce = (uint16_t)((k_idx << 8) + l_idx);  /* 256*row + column, truncated to 16 bits */
+  poly_uniform(mat_elem, rho, nonce);
+}
+
+/*************************************************
+ * Name:        crypto_sign_verify
+ *
+ * Description: Verifies signature.
+ *
+ * Arguments:   - const uint8_t *sig: pointer to input signature
+ *              - size_t siglen: length of signature
+ *              - const uint8_t *m: pointer to message
+ *              - size_t mlen: length of message
+ *              - const uint8_t *pk: pointer to bit-packed public key
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ **************************************************/
+int crypto_sign_verify(const uint8_t *sig,
+                       size_t siglen,
+                       const uint8_t *m,
+                       size_t mlen,
+                       const uint8_t *pk)
+{
+  unsigned int i;
+  const uint8_t *rho = pk;  /* rho is taken from the start of the packed public key */
+  uint8_t mu[(TRBYTES > CRHBYTES) ? TRBYTES : CRHBYTES];  /* holds tr first, then mu */
+  uint8_t c[CTILDEBYTES];
+  uint8_t c2[CTILDEBYTES];
+  poly cp;
+  polyvecl z;
+  shake256incctx state;
+
+  poly tmp_elem, w1_elem;
+
+  if (siglen != CRYPTO_BYTES)
+    return -1;
+
+  if (unpack_sig_z(&z, sig) != 0) {
+    return -1;
+  }
+  if (polyvecl_chknorm(&z, GAMMA1 - BETA))
+    return -1;
+
+  /* tr = H(pk) with the TRBYTES length used by keygen and signing, then mu = CRH(tr, msg) */
+  shake256(mu, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, TRBYTES);
+  shake256_inc_absorb(&state, m, mlen);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(mu, CRHBYTES, &state);
+
+  // Hash [mu || w1'] to get c; the packed rows of w1' are absorbed one by one below.
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, mu, CRHBYTES);
+
+  /* Matrix-vector multiplication; compute Az - c2^dt1 */
+  if (unpack_sig_c(c, sig) != 0) {
+    return -1;
+  }
+  poly_challenge(&cp, c);
+  poly_ntt(&cp);
+  polyvecl_ntt(&z);
+
+  /* One output row at a time; matrix entries are regenerated on the fly, never stored. */
+  for (size_t k_idx = 0; k_idx < K; k_idx++)
+  {
+    // Sample the current element from A; the first product initialises w1_elem.
+    expand_mat_elem(&tmp_elem, rho, k_idx, 0);
+    poly_pointwise_montgomery(&w1_elem, &tmp_elem, &z.vec[0]);
+
+    for (size_t l_idx = 1; l_idx < L; l_idx++)
+    {
+      // Sample the element from A and accumulate its product into w1_elem.
+      expand_mat_elem(&tmp_elem, rho, k_idx, l_idx);
+      poly_pointwise_acc_montgomery(&w1_elem, &tmp_elem, &z.vec[l_idx]);
+    }
+
+    // Subtract c*(t1_{k_idx} * 2^d)
+    unpack_pk_t1(&tmp_elem, k_idx, pk);
+    poly_shiftl(&tmp_elem);
+    poly_ntt(&tmp_elem);
+    poly_pointwise_montgomery(&tmp_elem, &cp, &tmp_elem);
+    poly_sub(&w1_elem, &w1_elem, &tmp_elem);
+    poly_reduce(&w1_elem);
+    poly_invntt_tomont(&w1_elem);
+
+    // Reconstruct w1 from the hint row stored in the signature
+    poly_csubq(&w1_elem);
+    if (unpack_sig_h(&tmp_elem, k_idx, sig) != 0) {
+      return -1;
+    }
+    poly_use_hint(&w1_elem, &w1_elem, &tmp_elem);
+    uint8_t w1_packed[POLYW1_PACKEDBYTES];
+    polyw1_pack(w1_packed, &w1_elem);
+    shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES);
+  }
+
+
+  /* Call random oracle and verify challenge: recomputed c2 must equal c from the signature */
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(c2, CTILDEBYTES, &state);
+  for (i = 0; i < CTILDEBYTES; ++i)
+    if (c[i] != c2[i])
+      return -1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_open
+*
+* Description: Verify signed message.
+*
+* Arguments:   - uint8_t *m: pointer to output message (allocated
+*                            array with smlen bytes), can be equal to sm
+*              - size_t *mlen: pointer to output length of message
+*              - const uint8_t *sm: pointer to signed message
+*              - size_t smlen: length of signed message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signed message could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_open(uint8_t *m,
+                     size_t *mlen,
+                     const uint8_t *sm,
+                     size_t smlen,
+                     const uint8_t *pk)
+{
+  size_t pos;
+  int ok = 0;
+
+  /* A signed message is a CRYPTO_BYTES signature followed by the message. */
+  if(smlen >= CRYPTO_BYTES) {
+    *mlen = smlen - CRYPTO_BYTES;
+    ok = (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk) == 0);
+  }
+
+  if(ok) {
+    /* Valid: copy the message to the front of the output buffer. */
+    for(pos = 0; pos != *mlen; ++pos)
+      m[pos] = sm[CRYPTO_BYTES + pos];
+    return 0;
+  }
+
+  /* Invalid: report length -1 and clear all smlen output bytes. */
+  *mlen = -1;
+  pos = 0;
+  while(pos < smlen)
+    m[pos++] = 0;
+  return -1;
+}
diff --git a/crypto_sign/dilithium2/m4f/sign.h b/crypto_sign/dilithium2/m4f/sign.h
new file mode 100644
index 0000000..42240b3
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/sign.h
@@ -0,0 +1,37 @@
+#ifndef SIGN_H
+#define SIGN_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "params.h"
+#include "api.h"
+#include "polyvec.h"
+#include "poly.h"
+
+#define challenge DILITHIUM_NAMESPACE(challenge)
+void challenge(poly *c, const uint8_t seed[SEEDBYTES]);
+
+// #define crypto_sign_keypair DILITHIUM_NAMESPACE(crypto_sign_keypair)
+// int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+// #define crypto_sign_signature DILITHIUM_NAMESPACE(signature)
+// int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+//                           const uint8_t *m, size_t mlen,
+//                           const uint8_t *sk);
+
+// #define crypto_sign DILITHIUM_NAMESPACE(crypto_sign)
+// int crypto_sign(uint8_t *sm, size_t *smlen,
+//                 const uint8_t *m, size_t mlen,
+//                 const uint8_t *sk);
+
+// #define crypto_sign_verify DILITHIUM_NAMESPACE(verify)
+// int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+//                        const uint8_t *m, size_t mlen,
+//                        const uint8_t *pk);
+
+// #define crypto_sign_open DILITHIUM_NAMESPACE(crypto_sign_open)
+// int crypto_sign_open(uint8_t *m, size_t *mlen,
+//                      const uint8_t *sm, size_t smlen,
+//                      const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/smallntt.h b/crypto_sign/dilithium2/m4f/smallntt.h
new file mode 100644
index 0000000..f39a9a9
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/smallntt.h
@@ -0,0 +1,31 @@
+#ifndef SMALLNTT_H
+#define SMALLNTT_H
+
+#include <stdint.h>
+#include "params.h"
+
+
+#define SMALL_Q 257
+#define SMALL_Q_PRIME (16711935) // -q^-1 mod 2**32
+
+static const int32_t twiddles_ntt_257_streamlined[]  = {-60, -35, -46, -42, 99, 89, -118, 27, -82, 108, -71, 54, 93, -41, 115, 68, 117, 73, -84, -59, -79, 21, -78, 37, -55, -109, 101, 74, -110, 39, 17, -70, -92, -50, -29, 57, -116, 83, 43, 75, -85, -91, 86, -107, 87, 15, -23, -111, -100, -58, 114, 25, -97, -10, 126, -40, 63, -20, -5, -80, -120, 44, -67, -72, -124, -31, 18, -106, 103, 90, -102, 45, -51, -77, 53, -121, -81, -11, 113, 9, -62, 36, -65, -12, -3, -48, 127, -24, -6, -96, 34, 88, 123, -49, -13, 61, -52, 112, -7, -66, -28, -33, -14, 125, -56, 30, 95, -22, -98, -26, 122, -104, -38, -94, 105, -119, -76, 69, -47, 19};
+static const int32_t twiddles_intt_257_streamlined[] = { -19, 47, -69, 76, 119, -105, 94, 38, 104, -122, 26, 98, 22, -95, -30, 56, -125, 14, 33, 28, 66, 7, -112, 52, -61, 13, 49, -123, -88, -34, 96, 6, 24, -127, 48, 3, 12, 65, -36, 62, -9, -113, 11, 81, 121, -53, 77, 51, -45, 102, -90, -103, 106, -18, 31, 124, 72, 67, -44, 120, 80, 5, 20, -63, 40, -126, 10, 97, -25, -114, 58, 100, 111, 23, -15, -87, 107, -86, 91, 85, -75, -43, -83, 116, -57, 29, 50, 92, 70, -17, -39, 110, -74, -101, 109, 55, -37, 78, -21, 79, 59, 84, -73, -117, -68, -115, 41, -93, -54, 71, -108, 82, -27, 118, -89, -99, 42, 46, 35, 60};
+static const int32_t twiddles_basemul_257[] = {27, -82, 108, -71, 54, 93, -41, 115, -78, 37, -55, -109, 101, 74, -110, 39, 83, 43, 75, -85, -91, 86, -107, 87, -97, -10, 126, -40, 63, -20, -5, -80, -106, 103, 90, -102, 45, -51, -77, 53, -65, -12, -3, -48, 127, -24, -6, -96, 112, -7, -66, -28, -33, -14, 125, -56, -38, -94, 105, -119, -76, 69, -47, 19};
+
+
+// inputs in [-2, 2]; outputs in [-128, +128]
+void __asm_fnt_257(int32_t *p, const int32_t twiddles[112], int32_t qprime, int32_t q);
+
+void __asm_point_mul_257_16(int16_t p_prime[128], const int32_t p[256], int32_t qprime, int32_t q, const int32_t twiddles[64]);
+void __asm_asymmetric_mul_257_16(int32_t c[256], const int32_t a[256], const int32_t b[256], const int16_t b_prime[128]);
+
+// inputs in [-32768, 32768] outputs in [-128, +128]
+void __asm_ifnt_257(int32_t *p, const int32_t twiddles[112], int32_t qprime, int32_t q);
+
+#define small_ntt(a) __asm_fnt_257(a, twiddles_ntt_257_streamlined, SMALL_Q_PRIME, SMALL_Q)
+#define small_invntt_tomont(a) __asm_ifnt_257(a, twiddles_intt_257_streamlined, SMALL_Q_PRIME, SMALL_Q)
+// No trailing ';' inside the macro bodies: the caller supplies it, which keeps the macros usable in if/else.
+#define small_point_mul(b_prime, b) __asm_point_mul_257_16(b_prime, b, SMALL_Q_PRIME, SMALL_Q, twiddles_basemul_257)
+#define small_asymmetric_mul(c, a, b, b_prime) __asm_asymmetric_mul_257_16(c, a, b, b_prime)
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/smallpoly.c b/crypto_sign/dilithium2/m4f/smallpoly.c
new file mode 100644
index 0000000..9e1f6c8
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/smallpoly.c
@@ -0,0 +1,84 @@
+#include "smallpoly.h"
+#include "smallntt.h"
+
+void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in) {
+  /* Copy the coefficients of in to out, apply small_ntt, then derive out2 via small_point_mul. */
+  for (int k = 0; k < N; k++) {
+    out->coeffs[k] = in->coeffs[k];
+  }
+  small_ntt(out->coeffs);
+  small_point_mul(out2->coeffs, out->coeffs);
+}
+
+
+void polyvecl_small_ntt(smallpoly v[L]) {
+  /* Apply small_ntt in place to each of the L polynomials of v. */
+  smallpoly *cur;
+  for(cur = v; cur != v + L; ++cur)
+    small_ntt(cur->coeffs);
+}
+
+
+void polyveck_small_ntt(smallpoly v[K]) {
+  /* Apply small_ntt in place to each of the K polynomials of v. */
+  smallpoly *cur;
+  for(cur = v; cur != v + K; ++cur)
+    small_ntt(cur->coeffs);
+}
+
+
+
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b){
+    // re-use the output buffer: the product is computed directly in the memory of r
+    smallpoly *tmp = (smallpoly *)r;
+    small_asymmetric_mul(tmp->coeffs, b->coeffs, a->coeffs, aprime->coeffs);
+    small_invntt_tomont(tmp->coeffs);
+    // with 16-bit small coefficients the result is still stored as smallpoly and must be copied into r's layout
+    #ifdef SMALL_POLY_16_BIT
+    int j;
+    // tmp and r share the same buffer, so we need to be careful: copy from the highest index downwards
+    for(j=N-1;j>=0;j--){
+        r->coeffs[j] = tmp->coeffs[j];
+    }
+    #endif
+}
+
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]){
+    /* r->vec[k] = poly_small_basemul_invntt of (a, aprime) with b[k], for every k < L */
+    unsigned int k;
+    for(k = 0; k < L; ++k)
+        poly_small_basemul_invntt(r->vec + k, a, aprime, b + k);
+}
+
+void small_polyeta_unpack(smallpoly *r, const uint8_t *a) {
+  unsigned int i;
+  // Unpack the N coefficients of r from the bit-packed bytes at a; a stored value t becomes ETA - t.
+#if ETA == 2
+  for(i = 0; i < N/8; ++i) { // every 3 input bytes hold 8 values of 3 bits each
+    r->coeffs[8*i+0] =  (a[3*i+0] >> 0) & 7;
+    r->coeffs[8*i+1] =  (a[3*i+0] >> 3) & 7;
+    r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7; // value straddles bytes 0 and 1
+    r->coeffs[8*i+3] =  (a[3*i+1] >> 1) & 7;
+    r->coeffs[8*i+4] =  (a[3*i+1] >> 4) & 7;
+    r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7; // value straddles bytes 1 and 2
+    r->coeffs[8*i+6] =  (a[3*i+2] >> 2) & 7;
+    r->coeffs[8*i+7] =  (a[3*i+2] >> 5) & 7;
+    // map each stored 3-bit value t to the coefficient ETA - t
+    r->coeffs[8*i+0] = ETA - r->coeffs[8*i+0];
+    r->coeffs[8*i+1] = ETA - r->coeffs[8*i+1];
+    r->coeffs[8*i+2] = ETA - r->coeffs[8*i+2];
+    r->coeffs[8*i+3] = ETA - r->coeffs[8*i+3];
+    r->coeffs[8*i+4] = ETA - r->coeffs[8*i+4];
+    r->coeffs[8*i+5] = ETA - r->coeffs[8*i+5];
+    r->coeffs[8*i+6] = ETA - r->coeffs[8*i+6];
+    r->coeffs[8*i+7] = ETA - r->coeffs[8*i+7];
+  }
+#elif ETA == 4
+  for(i = 0; i < N/2; ++i) { // every input byte holds two 4-bit values (low nibble first)
+    r->coeffs[2*i+0] = a[i] & 0x0F;
+    r->coeffs[2*i+1] = a[i] >> 4;
+    r->coeffs[2*i+0] = ETA - r->coeffs[2*i+0];
+    r->coeffs[2*i+1] = ETA - r->coeffs[2*i+1];
+  }
+#endif
+}
diff --git a/crypto_sign/dilithium2/m4f/smallpoly.h b/crypto_sign/dilithium2/m4f/smallpoly.h
new file mode 100644
index 0000000..caa2626
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/smallpoly.h
@@ -0,0 +1,39 @@
+#ifndef SMALLPOLY_H
+#define SMALLPOLY_H
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+
+
+
+#if DILITHIUM_MODE == 3 // use q=769
+#define SMALL_POLY_16_BIT
+typedef struct {
+    int16_t coeffs[N];
+} smallpoly;
+
+typedef smallpoly smallhalfpoly;
+
+#else // use q=257
+#define SMALL_POLY_32_BIT
+typedef struct {
+    int32_t coeffs[N];
+} smallpoly;
+
+typedef struct {
+    int16_t coeffs[N];
+} smallhalfpoly;
+#endif
+
+
+void poly_small_ntt_precomp(smallpoly *out, smallhalfpoly *out2, poly *in);
+void polyvecl_small_ntt(smallpoly v[L]);
+void polyveck_small_ntt(smallpoly v[K]);
+
+
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly b[L]);
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallhalfpoly *aprime, const smallpoly *b);
+
+void small_polyeta_unpack(smallpoly *r, const uint8_t *a);
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4f/symmetric-shake.c b/crypto_sign/dilithium2/m4f/symmetric-shake.c
new file mode 100644
index 0000000..963f649
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/symmetric-shake.c
@@ -0,0 +1,28 @@
+#include <stdint.h>
+#include "params.h"
+#include "symmetric.h"
+#include "fips202.h"
+
+void dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce)
+{
+  /* The nonce is absorbed as two bytes, least-significant byte first */
+  uint8_t nonce_le[2] = { (uint8_t)nonce, (uint8_t)(nonce >> 8) };
+
+  /* Absorb seed followed by the nonce bytes, then finalize the state */
+  shake128_inc_init(state);
+  shake128_inc_absorb(state, seed, SEEDBYTES);
+  shake128_inc_absorb(state, nonce_le, sizeof nonce_le);
+  shake128_inc_finalize(state);
+}
+
+void dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce)
+{
+  /* The nonce is absorbed as two bytes, least-significant byte first */
+  uint8_t nonce_le[2] = { (uint8_t)nonce, (uint8_t)(nonce >> 8) };
+
+  /* Absorb seed followed by the nonce bytes, then finalize the state */
+  shake256_inc_init(state);
+  shake256_inc_absorb(state, seed, CRHBYTES);
+  shake256_inc_absorb(state, nonce_le, sizeof nonce_le);
+  shake256_inc_finalize(state);
+}
diff --git a/crypto_sign/dilithium2/m4f/symmetric.h b/crypto_sign/dilithium2/m4f/symmetric.h
new file mode 100644
index 0000000..4703737
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/symmetric.h
@@ -0,0 +1,65 @@
+#ifndef SYMMETRIC_H
+#define SYMMETRIC_H
+
+#include <stdint.h>
+#include "params.h"
+
+#ifdef DILITHIUM_USE_AES
+
+#include "aes256ctr.h"
+#include "fips202.h"
+
+typedef aes256ctr_ctx stream128_state;
+typedef aes256ctr_ctx stream256_state;
+
+#define dilithium_aes256ctr_init DILITHIUM_NAMESPACE(dilithium_aes256ctr_init)
+void dilithium_aes256ctr_init(aes256ctr_ctx *state,
+                              const uint8_t key[32],
+                              uint16_t nonce);
+
+#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES
+#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES
+
+#define stream128_init(STATE, SEED, NONCE) \
+        dilithium_aes256ctr_init(STATE, SEED, NONCE)
+#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
+#define stream256_init(STATE, SEED, NONCE) \
+        dilithium_aes256ctr_init(STATE, SEED, NONCE)
+#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
+
+#else
+
+#include "fips202.h"
+typedef shake128incctx stream128_state;
+typedef shake256incctx stream256_state;
+// Squeeze OUTBLOCKS whole SHAKE256 blocks; OUTBLOCKS is parenthesized so that expressions such as a+b work.
+#define shake256_inc_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        shake256_inc_squeeze(OUT, (OUTBLOCKS)*SHAKE256_RATE, STATE)
+
+#define dilithium_shake128_stream_init DILITHIUM_NAMESPACE(dilithium_shake128_stream_init)
+void dilithium_shake128_stream_init(stream128_state *state,
+                                    const uint8_t seed[SEEDBYTES],
+                                    uint16_t nonce);
+
+#define dilithium_shake256_stream_init DILITHIUM_NAMESPACE(dilithium_shake256_stream_init)
+void dilithium_shake256_stream_init(stream256_state *state,
+                                    const uint8_t seed[CRHBYTES],
+                                    uint16_t nonce);
+
+#define STREAM128_BLOCKBYTES SHAKE128_RATE
+#define STREAM256_BLOCKBYTES SHAKE256_RATE
+// Generic stream interface mapped onto incremental SHAKE; block counts are converted to byte lengths.
+#define stream128_init(STATE, SEED, NONCE) \
+        dilithium_shake128_stream_init(STATE, SEED, NONCE)
+#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        shake128_inc_squeeze(OUT, (OUTBLOCKS)*SHAKE128_RATE, STATE)
+#define stream256_init(STATE, SEED, NONCE) \
+        dilithium_shake256_stream_init(STATE, SEED, NONCE)
+#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+        shake256_inc_squeeze(OUT, (OUTBLOCKS)*SHAKE256_RATE, STATE)
+
+#endif
+
+#endif
diff --git a/crypto_sign/dilithium2/m4f/vector.h b/crypto_sign/dilithium2/m4f/vector.h
new file mode 100644
index 0000000..183ddc8
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/vector.h
@@ -0,0 +1,22 @@
+#ifndef VECTOR_H
+#define VECTOR_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define asm_reduce32 DILITHIUM_NAMESPACE(asm_reduce32)
+void asm_reduce32(int32_t a[N]);
+#define small_asm_reduce32_central DILITHIUM_NAMESPACE(small_asm_reduce32_central)
+void small_asm_reduce32_central(int32_t a[N]);
+#define asm_caddq DILITHIUM_NAMESPACE(asm_caddq)
+void asm_caddq(int32_t a[N]);
+#define asm_csubq DILITHIUM_NAMESPACE(asm_csubq)
+void asm_csubq(int32_t a[N]);
+#define asm_freeze DILITHIUM_NAMESPACE(asm_freeze)
+void asm_freeze(int32_t a[N]);
+#define asm_rej_uniform DILITHIUM_NAMESPACE(asm_rej_uniform)
+unsigned int asm_rej_uniform(int32_t *a,
+                         unsigned int len,
+                         const unsigned char *buf,
+                         unsigned int buflen);
+#endif
diff --git a/crypto_sign/dilithium2/m4f/vector.s b/crypto_sign/dilithium2/m4f/vector.s
new file mode 100644
index 0000000..a393c91
--- /dev/null
+++ b/crypto_sign/dilithium2/m4f/vector.s
@@ -0,0 +1,263 @@
+.syntax unified
+.thumb
+.macro redq a, tmp, q
+    add     \tmp, \a,  #4194304
+    asrs    \tmp, \tmp, #23
+    mls     \a, \tmp, \q, \a
+.endm
+// redq (above): a <- a - ((a + 2^22) >> 23) * q, with tmp as scratch register
+// void asm_reduce32(int32_t a[N]);
+.global pqcrystals_dilithium_asm_reduce32
+.type pqcrystals_dilithium_asm_reduce32, %function
+.align 2
+pqcrystals_dilithium_asm_reduce32:
+    push {r4-r10}
+    // r12 = 8380417 (the q operand of redq), r10 = loop counter: 32 iterations of 8 words = 256 words
+    movw r12,#:lower16:8380417
+    movt r12,#:upper16:8380417
+    movw r10, #32
+    1:
+        ldr.w r1, [r0]
+        ldr.w r2, [r0, #1*4]
+        ldr.w r3, [r0, #2*4]
+        ldr.w r4, [r0, #3*4]
+        ldr.w r5, [r0, #4*4]
+        ldr.w r6, [r0, #5*4]
+        ldr.w r7, [r0, #6*4]
+        ldr.w r8, [r0, #7*4]
+        // reduce the eight loaded words in registers; r9 is the scratch register of redq
+        redq r1, r9, r12
+        redq r2, r9, r12
+        redq r3, r9, r12
+        redq r4, r9, r12
+        redq r5, r9, r12
+        redq r6, r9, r12
+        redq r7, r9, r12
+        redq r8, r9, r12
+        // write the words back; the last store post-increments r0 by 8 words
+        str.w r2, [r0, #1*4]
+        str.w r3, [r0, #2*4]
+        str.w r4, [r0, #3*4]
+        str.w r5, [r0, #4*4]
+        str.w r6, [r0, #5*4]
+        str.w r7, [r0, #6*4]
+        str.w r8, [r0, #7*4]
+        str r1, [r0], #8*4
+        subs r10, #1
+        bne.w 1b
+    // restore the saved registers and return
+    pop {r4-r10}
+    bx lr
+.size pqcrystals_dilithium_asm_reduce32, .-pqcrystals_dilithium_asm_reduce32
+
+.macro barrett_32 a, Qbar, Q, tmp
+    smmulr.w \tmp, \a, \Qbar
+    mls.w \a, \tmp, \Q, \a
+.endm
+// barrett_32 (above): tmp = rounded most-significant word of a*Qbar, then a <- a - tmp*Q
+// INPUT: target (signed), Q (signed modulus)
+// OUTPUT: target adjusted to be between -Q/2 and Q/2 (NOTE(review): 'hi' is an unsigned condition - confirm behaviour for negative targets)
+.macro central_reduce target, Q
+  cmp \target, \Q, lsr #1
+  it hi
+  subhi \target, \Q
+  cmn \target, \Q, lsr #1
+  it lt
+  addlt \target, \Q
+.endm
+// NOTE: central_reduce is not invoked anywhere in this file.
+// void small_asm_reduce32_central(int32_t a[N]);
+.global pqcrystals_dilithium_small_asm_reduce32_central
+.type pqcrystals_dilithium_small_asm_reduce32_central, %function
+.align 2
+pqcrystals_dilithium_small_asm_reduce32_central:
+    push {r4-r12, lr}
+
+    // r9 = 5585133 (Qbar operand), r10 = 769 (Q operand), r12 = loop counter: 32 iterations of 8 words
+    movw r9, #:lower16:5585133
+    movt r9, #:upper16:5585133
+    mov.w r10,#769
+
+    movw r12, #32
+    1:
+        ldr.w r1, [r0]
+        ldr.w r2, [r0, #1*4]
+        ldr.w r3, [r0, #2*4]
+        ldr.w r4, [r0, #3*4]
+        ldr.w r5, [r0, #4*4]
+        ldr.w r6, [r0, #5*4]
+        ldr.w r7, [r0, #6*4]
+        ldr.w r8, [r0, #7*4]
+        // reduce the eight loaded words with barrett_32; r11 is the scratch register
+        barrett_32 r1, r9, r10, r11
+        barrett_32 r2, r9, r10, r11
+        barrett_32 r3, r9, r10, r11
+        barrett_32 r4, r9, r10, r11
+        barrett_32 r5, r9, r10, r11
+        barrett_32 r6, r9, r10, r11
+        barrett_32 r7, r9, r10, r11
+        barrett_32 r8, r9, r10, r11
+
+        // write the words back; the last store post-increments r0 by 8 words
+        str.w r2, [r0, #1*4]
+        str.w r3, [r0, #2*4]
+        str.w r4, [r0, #3*4]
+        str.w r5, [r0, #4*4]
+        str.w r6, [r0, #5*4]
+        str.w r7, [r0, #6*4]
+        str.w r8, [r0, #7*4]
+        str r1, [r0], #8*4
+        subs r12, #1
+        bne.w 1b
+
+    pop {r4-r12, pc}
+
+.size pqcrystals_dilithium_small_asm_reduce32_central, .-pqcrystals_dilithium_small_asm_reduce32_central
+
+.macro caddq a, tmp, q
+    and     \tmp, \q, \a, asr #31
+    add     \a, \a, \tmp
+.endm
+// caddq (above): a <- a + (q & (a asr 31)), i.e. q is added only when a is negative
+.macro freezeq a, tmp, q
+    redq \a, \tmp, \q
+    caddq \a, \tmp, \q
+.endm
+// freezeq (above): redq followed by caddq; not invoked in this file. NOTE(review): vector.h declares asm_freeze, which is not defined in this file.
+// void asm_caddq(int32_t a[N]);
+.global pqcrystals_dilithium_asm_caddq
+.type pqcrystals_dilithium_asm_caddq, %function
+.align 2
+pqcrystals_dilithium_asm_caddq:
+    push {r4-r10}
+    // r12 = 8380417 (the q operand of caddq), r10 = loop counter: 32 iterations of 8 words
+    movw r12,#:lower16:8380417
+    movt r12,#:upper16:8380417
+
+    movw r10, #32
+    1:
+        ldr.w r1, [r0]
+        ldr.w r2, [r0, #1*4]
+        ldr.w r3, [r0, #2*4]
+        ldr.w r4, [r0, #3*4]
+        ldr.w r5, [r0, #4*4]
+        ldr.w r6, [r0, #5*4]
+        ldr.w r7, [r0, #6*4]
+        ldr.w r8, [r0, #7*4]
+        // conditionally add q to each of the eight loaded words; r9 is the scratch register
+        caddq r1, r9, r12
+        caddq r2, r9, r12
+        caddq r3, r9, r12
+        caddq r4, r9, r12
+        caddq r5, r9, r12
+        caddq r6, r9, r12
+        caddq r7, r9, r12
+        caddq r8, r9, r12
+        // write the words back; the last store post-increments r0 by 8 words
+        str.w r2, [r0, #1*4]
+        str.w r3, [r0, #2*4]
+        str.w r4, [r0, #3*4]
+        str.w r5, [r0, #4*4]
+        str.w r6, [r0, #5*4]
+        str.w r7, [r0, #6*4]
+        str.w r8, [r0, #7*4]
+        str r1, [r0], #8*4
+        subs r10, #1
+        bne.w 1b
+
+    pop {r4-r10}
+    bx lr
+.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq
+
+.macro csubq a, tmp, q
+    cmp.n \a, \q
+    it ge
+    subge.w \a, \a, \q
+    cmp \a, #0
+    it mi
+    addmi.w \a, \a, \q
+.endm
+// csubq (above): subtract q when a >= q, then add q when the result is negative; the tmp operand is unused
+// void asm_csubq(int32_t a[N]);
+.global pqcrystals_dilithium_asm_csubq
+.type pqcrystals_dilithium_asm_csubq, %function
+.align 2
+pqcrystals_dilithium_asm_csubq:
+    push {r4-r10}
+    // r12 = 8380417 (the q operand of csubq), r10 = loop counter: 32 iterations of 8 words
+    movw r12,#:lower16:8380417
+    movt r12,#:upper16:8380417
+
+    movw r10, #32
+    1:
+        ldr.w r1, [r0]
+        ldr.w r2, [r0, #1*4]
+        ldr.w r3, [r0, #2*4]
+        ldr.w r4, [r0, #3*4]
+        ldr.w r5, [r0, #4*4]
+        ldr.w r6, [r0, #5*4]
+        ldr.w r7, [r0, #6*4]
+        ldr.w r8, [r0, #7*4]
+        // apply csubq to each of the eight loaded words
+        csubq r1, r9, r12
+        csubq r2, r9, r12
+        csubq r3, r9, r12
+        csubq r4, r9, r12
+        csubq r5, r9, r12
+        csubq r6, r9, r12
+        csubq r7, r9, r12
+        csubq r8, r9, r12
+        // write the words back; the last store post-increments r0 by 8 words
+        str.w r2, [r0, #1*4]
+        str.w r3, [r0, #2*4]
+        str.w r4, [r0, #3*4]
+        str.w r5, [r0, #4*4]
+        str.w r6, [r0, #5*4]
+        str.w r7, [r0, #6*4]
+        str.w r8, [r0, #7*4]
+        str r1, [r0], #8*4
+        subs r10, #1
+        bne.w 1b
+
+    pop {r4-r10}
+    bx lr
+.size pqcrystals_dilithium_asm_csubq, .-pqcrystals_dilithium_asm_csubq
+
+// unsigned int asm_rej_uniform(int32_t *a, unsigned int len, const unsigned char *buf, unsigned int buflen);
+.global pqcrystals_dilithium_asm_rej_uniform
+.type pqcrystals_dilithium_asm_rej_uniform, %function
+.align 2
+pqcrystals_dilithium_asm_rej_uniform:
+    push.w {r4-r6}
+    push.w {r1}
+    // Store Q-1 in r12.
+    movw r12,#:lower16:8380416
+    movt r12,#:upper16:8380416
+    // r6 = a + 4*len (end of the output), r3 = buf + buflen - 2 (first position with fewer than 3 bytes left)
+    add.w r6, r0, r1, lsl #2
+    add.w r3, r2, r3
+    sub.w r3, r3, #2
+
+1:
+    // If there are fewer than 3 bytes available, return.
+    cmp.w r3, r2
+    ble.w end
+    // load a 4-byte word but advance by 3 bytes; NOTE(review): reads 1 byte past buf when exactly 3 bytes remain
+    ldr r5, [r2], #3
+    ubfx r5, r5, #0, #23
+    // keep the low 23 bits as candidate; store it (and advance r0) only if it is <= Q-1
+    cmp.n r5, r12
+    it le
+    strle r5, [r0], #4
+    // loop until r0 reaches r6; NOTE(review): the check follows the store, so len == 0 is not handled - confirm callers
+    cmp.n r0, r6
+    bne.n 1b
+
+end:
+    pop.w {r5}
+    // r5 = len (the pushed r1); return len - (unwritten output bytes / 4) = number of coefficients written
+    sub.w r0, r6, r0
+    sub.w r0, r5, r0, lsr #2
+    pop.w {r4-r6}
+    bx lr
+.size pqcrystals_dilithium_asm_rej_uniform, .-pqcrystals_dilithium_asm_rej_uniform
diff --git a/crypto_sign/dilithium2/m4fstack/api.h b/crypto_sign/dilithium2/m4fstack/api.h
new file mode 120000
index 0000000..d29362d
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/api.h
@@ -0,0 +1 @@
+../m4f/api.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/config.h b/crypto_sign/dilithium2/m4fstack/config.h
new file mode 120000
index 0000000..f3892d9
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/config.h
@@ -0,0 +1 @@
+../m4f/config.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/macros.i b/crypto_sign/dilithium2/m4fstack/macros.i
new file mode 120000
index 0000000..d615b85
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/macros.i
@@ -0,0 +1 @@
+../m4f/macros.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/macros_smallntt.i b/crypto_sign/dilithium2/m4fstack/macros_smallntt.i
new file mode 100644
index 0000000..7c9a387
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/macros_smallntt.i
@@ -0,0 +1,91 @@
+/* 
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * NTT and inverse NTT code from: 
+ * Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
+ * IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24.
+ * DOI:https://doi.org/10.46586/tches.v2024.i2.1-24.
+ * https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S
+*/
+
+#ifndef MACROS_SMALLNTT_I
+#define MACROS_SMALLNTT_I
+
+// general macros; load: read a0..a3 from [a + mem0] .. [a + mem3]
+.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
+  ldr.w \a0, [\a, \mem0]
+  ldr.w \a1, [\a, \mem1]
+  ldr.w \a2, [\a, \mem2]
+  ldr.w \a3, [\a, \mem3]
+.endm
+// store: counterpart of load, write a0..a3 to [a + mem0] .. [a + mem3]
+.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3
+  str.w \a0, [\a, \mem0]
+  str.w \a1, [\a, \mem1]
+  str.w \a2, [\a, \mem2]
+  str.w \a3, [\a, \mem3]
+.endm
+// doubleplant: per halfword h of a: t = (plantconst*h) >> 16, then low16(t)*top16(q) + qa; the top halves of both results are packed into a
+.macro doubleplant a, tmp, q, qa, plantconst
+  smulwb \tmp, \plantconst, \a
+  smulwt \a, \plantconst, \a
+  smlabt \tmp, \tmp, \q, \qa
+  smlabt \a, \a, \q, \qa
+  pkhtb \a, \a, \tmp, asr#16
+.endm
+// doublebarrett: per halfword h of a: t = (h*low16(barrettconst)) >> 26, then h <- h - low16(t)*low16(q) (halfword-wise usub16)
+.macro doublebarrett a, tmp, tmp2, q, barrettconst
+  smulbb \tmp, \a, \barrettconst
+  smultb \tmp2, \a, \barrettconst
+  asr \tmp, \tmp, #26
+  asr \tmp2, \tmp2, #26
+  smulbb \tmp, \tmp, \q
+  smulbb \tmp2, \tmp2, \q
+  pkhbt \tmp, \tmp, \tmp2, lsl#16
+  usub16 \a, \a, \tmp
+.endm
+
+// plant_red: q is located in the top half of its register
+.macro plant_red q, qa, qinv, tmp
+  mul \tmp, \tmp, \qinv     
+  //tmp*qinv mod 2^2n/ 2^n; in high half
+  smlatt \tmp, \tmp, \q, \qa
+  // result in high half
+.endm
+// mul_twiddle_plant: same instruction sequence as doubleplant, with the twiddle as the multiplier
+.macro mul_twiddle_plant a, twiddle, tmp, q, qa
+	smulwb \tmp, \twiddle, \a
+	smulwt \a,   \twiddle, \a
+	smlabt \tmp, \tmp, \q, \qa
+	smlabt \a, \a, \q, \qa
+	pkhtb \a, \a, \tmp, asr#16
+.endm
+// doublebutterfly_plant: tmp = packed product of a1 with the twiddle, then a1 <- a0 - tmp and a0 <- a0 + tmp (halfword-wise)
+.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa
+	smulwb \tmp, \twiddle, \a1
+	smulwt \a1, \twiddle, \a1
+	smlabt \tmp, \tmp, \q, \qa
+	smlabt \a1, \a1, \q, \qa
+	pkhtb \tmp, \a1, \tmp, asr#16
+	usub16 \a1, \a0, \tmp
+	uadd16 \a0, \a0, \tmp
+.endm
+// two_doublebutterfly_plant: doublebutterfly_plant on (a0, a1, twiddle0) and then on (a2, a3, twiddle1)
+.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa
+	doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa
+	doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa
+.endm
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/ntt.S b/crypto_sign/dilithium2/m4fstack/ntt.S
new file mode 120000
index 0000000..40cd5d4
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/ntt.S
@@ -0,0 +1 @@
+../m4f/ntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/ntt.h b/crypto_sign/dilithium2/m4fstack/ntt.h
new file mode 120000
index 0000000..8e99cae
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/ntt.h
@@ -0,0 +1 @@
+../m4f/ntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/packing.c b/crypto_sign/dilithium2/m4fstack/packing.c
new file mode 120000
index 0000000..1052fe2
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/packing.c
@@ -0,0 +1 @@
+../m4f/packing.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/packing.h b/crypto_sign/dilithium2/m4fstack/packing.h
new file mode 120000
index 0000000..643cc32
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/packing.h
@@ -0,0 +1 @@
+../m4f/packing.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/params.h b/crypto_sign/dilithium2/m4fstack/params.h
new file mode 120000
index 0000000..1f91a36
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/params.h
@@ -0,0 +1 @@
+../m4f/params.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/pointwise_mont.h b/crypto_sign/dilithium2/m4fstack/pointwise_mont.h
new file mode 120000
index 0000000..3255885
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/pointwise_mont.h
@@ -0,0 +1 @@
+../m4f/pointwise_mont.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/pointwise_mont.s b/crypto_sign/dilithium2/m4fstack/pointwise_mont.s
new file mode 120000
index 0000000..3597ffd
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/pointwise_mont.s
@@ -0,0 +1 @@
+../m4f/pointwise_mont.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/poly.c b/crypto_sign/dilithium2/m4fstack/poly.c
new file mode 120000
index 0000000..2544e75
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/poly.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/poly.h b/crypto_sign/dilithium2/m4fstack/poly.h
new file mode 120000
index 0000000..7ef70e5
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/poly.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/polyvec.c b/crypto_sign/dilithium2/m4fstack/polyvec.c
new file mode 120000
index 0000000..569a9a1
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/polyvec.c
@@ -0,0 +1 @@
+../m4f/polyvec.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/polyvec.h b/crypto_sign/dilithium2/m4fstack/polyvec.h
new file mode 120000
index 0000000..d02c99c
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/polyvec.h
@@ -0,0 +1 @@
+../m4f/polyvec.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/reduce.h b/crypto_sign/dilithium2/m4fstack/reduce.h
new file mode 100644
index 0000000..5990918
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/reduce.h
@@ -0,0 +1,79 @@
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define MONT (-4186625) // 2^32 % Q (centered representative); parenthesized so it is safe inside expressions
+#define QINV 58728449 // q^(-1) mod 2^32
+
+#define montgomery_reduce DILITHIUM_NAMESPACE(montgomery_reduce)
+/*************************************************
+* Name:        montgomery_reduce
+*
+* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31,
+*              compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q.
+*
+* Arguments:   - int64_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static inline int32_t montgomery_reduce(int64_t a) {
+  /* m = a*QINV mod 2^32, which makes a - m*Q a multiple of 2^32 */
+  const int32_t m = (int64_t)(int32_t)a*QINV;
+  const int64_t diff = a - (int64_t)m*Q;
+  /* the upper 32 bits of the difference are the result */
+  return (int32_t)(diff >> 32);
+}
+
+/*************************************************
+* Name:        reduce32
+*
+* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1,
+*              compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static int32_t reduce32(int32_t a) {
+  /* quotient estimate: (a + 2^22) shifted right by 23 bits */
+  const int32_t quot = (a + (1 << 22)) >> 23;
+
+  /* remove quot multiples of Q from a */
+  return a - quot*Q;
+}
+
+/*************************************************
+* Name:        caddq
+*
+* Description: Add Q if input coefficient is negative.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static int32_t caddq(int32_t a) {
+  /* (a >> 31) serves as the mask that selects Q for negative a */
+  return a + ((a >> 31) & Q);
+}
+
+/*************************************************
+* Name:        freeze
+*
+* Description: For finite field element a, compute standard
+*              representative r = a mod^+ Q.
+*
+* Arguments:   - int32_t: finite field element a
+*
+* Returns r.
+**************************************************/
+static int32_t freeze(int32_t a) {
+  /* reduce32 first, then let caddq add Q to a negative result */
+  const int32_t reduced = reduce32(a);
+  return caddq(reduced);
+}
+
+
+
+#endif
diff --git a/crypto_sign/dilithium2/m4fstack/rounding.c b/crypto_sign/dilithium2/m4fstack/rounding.c
new file mode 120000
index 0000000..ec78068
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/rounding.c
@@ -0,0 +1 @@
+../m4f/rounding.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/rounding.h b/crypto_sign/dilithium2/m4fstack/rounding.h
new file mode 120000
index 0000000..e64114b
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/rounding.h
@@ -0,0 +1 @@
+../m4f/rounding.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/sign.c b/crypto_sign/dilithium2/m4fstack/sign.c
new file mode 100644
index 0000000..bf0b939
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/sign.c
@@ -0,0 +1,484 @@
+#include <stdint.h>
+#include "params.h"
+#include "sign.h"
+#include "packing.h"
+#include "polyvec.h"
+#include "poly.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "smallpoly.h"
+#include "stack.h"
+
+#include "smallntt.h"
+
+/*************************************************
+* Name:        crypto_sign_keypair
+*
+* Description: Generates public and private key, one matrix row at a time.
+*
+* Arguments:   - uint8_t *pk: pointer to output public key (allocated
+*                             array of CRYPTO_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key (allocated
+*                             array of CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+  unsigned int i, j;
+  uint8_t seedbuf[2*SEEDBYTES + CRHBYTES];
+  const uint8_t *rho, *rhoprime, *key;
+
+  poly tA, tB;
+
+  union { /* members are used at different times and share one buffer */
+    uint8_t tr[TRBYTES];
+    shake256incctx s256;
+    poly tC;
+  } data;
+
+  shake256incctx *s256 = &data.s256; /* used only for the seed expansion below */
+  uint8_t *tr          = data.tr;    /* used only after the row loop */
+  poly *tC             = &data.tC;   /* used only inside the row loop */
+
+  /* Get randomness for rho, rhoprime and key */
+  randombytes(seedbuf, SEEDBYTES);
+  shake256_inc_init(s256);
+  shake256_inc_absorb(s256, seedbuf, SEEDBYTES);
+  shake256_inc_finalize(s256);
+  shake256_inc_squeeze(seedbuf, 2*SEEDBYTES + CRHBYTES, s256); /* overwrites the seed */
+
+  rho = seedbuf;
+  rhoprime = rho + SEEDBYTES;
+  key = rhoprime + CRHBYTES;
+
+  pack_sk_rho(sk, rho);
+  pack_sk_key(sk, key);
+  pack_pk_rho(pk, rho);
+
+  /* Matrix-vector multiplication, accumulated into tA one row i at a time */
+  for (i = 0; i < K; i++)
+  {
+    /* Expand s1[0]; s1 is re-sampled from rhoprime for every row, not stored */
+    poly_uniform_eta(tC, rhoprime, 0);
+    if (i == 0) /* s1 is written to sk during the first row only */
+    {
+      pack_sk_s1(sk, tC, 0);
+    }
+    poly_ntt(tC);
+    /* expand part of the matrix: nonce (i << 8) + j for row i, column j */
+    poly_uniform(&tB, rho, (i << 8) + 0);
+    /* partial matrix-vector multiplication: the first product initialises tA */
+    poly_pointwise_montgomery(&tA, &tB, tC);
+    for(j = 1; j < L; j++)
+    {
+      /* Expand s1[j] (again re-sampled for every row) */
+      poly_uniform_eta(tC, rhoprime, j);
+      if (i == 0)
+      {
+        pack_sk_s1(sk, tC, j);
+      }
+      poly_ntt(tC);
+      poly_uniform(&tB, rho, (i << 8) + j);
+      poly_pointwise_acc_montgomery(&tA, &tB, tC); /* accumulate into tA */
+    }
+
+    poly_reduce(&tA);
+    poly_invntt_tomont(&tA);
+
+    /* Add error vector s2 */
+    /* Sample short vector s2; nonces L..L+K-1 follow the L nonces used for s1 */
+    poly_uniform_eta(&tB, rhoprime, L + i);
+    pack_sk_s2(sk, &tB, i);
+    poly_add(&tA, &tA, &tB);
+
+    /* Compute t{0,1} */
+    poly_caddq(&tA);
+    poly_power2round(tC, &tB, &tA); /* tC is packed as t1, tB as t0 below */
+    pack_sk_t0(sk, &tB, i);
+    pack_pk_t1(pk, tC, i);
+
+  }
+
+  /* Compute H(rho, t1) and write secret key */
+  shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); /* tr reuses the union; tC is dead here */
+  pack_sk_tr(sk, tr);
+
+  return 0;
+}
+
+
+/*************************************************
+* Name:        crypto_sign_signature
+*
+* Description: Computes signature; restarts from label rej on every rejection.
+*
+* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
+*              - size_t *siglen: pointer to output length of signature
+*              - const uint8_t *m:  pointer to message to be signed
+*              - size_t mlen:    length of message
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_signature(uint8_t *sig,
+                          size_t *siglen,
+                          const uint8_t *m,
+                          size_t mlen,
+                          const uint8_t *sk)
+{
+  uint8_t buf[2 * CRHBYTES]; /* first half: mu; second half: rnd, later rhoprime */
+  uint8_t *mu, *rhoprime, *rnd;
+  const uint8_t *rho, *tr, *key;
+  uint16_t nonce = 0;
+  unsigned int n;
+  uint8_t wcomp[K][768]; /* one 768-byte packed polynomial per row (polyw_pack/polyw_unpack) */
+  uint8_t ccomp[68];     /* challenge as written by poly_challenge_compress */
+
+  union { /* the two hash contexts are used at different times and share storage */
+    shake128incctx s128;
+    shake256incctx s256;
+  } state;
+
+  union { /* one full poly overlaid with two smallpoly buffers */
+    poly full;
+    struct {
+      smallpoly stmp0;
+      smallpoly stmp1;
+    } small;
+  } polybuffer;
+
+  poly      *tmp0  = &polybuffer.full; /* NOTE(review): overlaps stmp0 and scp - helpers given both must tolerate that */
+  smallpoly *stmp0 = &polybuffer.small.stmp0;
+  smallpoly *scp   = &polybuffer.small.stmp1;
+
+  rho = sk;                 /* offsets used here: rho at 0, */
+  tr = sk + SEEDBYTES*2;    /* tr at 2*SEEDBYTES, */
+  key = sk + SEEDBYTES;     /* key at SEEDBYTES */
+  /* NOTE(review): unpack_sk_stack below receives const-stripped pointers into sk itself - confirm it never writes */
+  mu = buf;
+  rnd = mu + CRHBYTES;
+  rhoprime = mu + CRHBYTES; /* same address as rnd */
+  unpack_sk_stack((uint8_t*)rho, (uint8_t*)tr, (uint8_t*)key, sk);
+
+  /* Compute mu = CRH(tr, msg) */
+  shake256_inc_init(&state.s256);
+  shake256_inc_absorb(&state.s256, tr, TRBYTES);
+  shake256_inc_absorb(&state.s256, m, mlen);
+  shake256_inc_finalize(&state.s256);
+  shake256_inc_squeeze(mu, CRHBYTES, &state.s256);
+
+  // Note: RNDBYTES < CRHBYTES, so buffer has proper size; rnd is all zero (no randombytes call in this function)
+  for (n = 0; n < RNDBYTES; n++) {
+     rnd[n] = 0;
+  }
+
+  shake256_inc_init(&state.s256);
+  shake256_inc_absorb(&state.s256, key, SEEDBYTES);
+  shake256_inc_absorb(&state.s256, rnd, RNDBYTES);
+  shake256_inc_absorb(&state.s256, mu, CRHBYTES);
+  shake256_inc_finalize(&state.s256);
+  // rnd can be overwritten here: it has been absorbed and rhoprime shares its storage
+  shake256_inc_squeeze(rhoprime, CRHBYTES, &state.s256);
+
+rej:  /* every rejection below jumps back here */
+    for (size_t k_idx = 0; k_idx < K; k_idx++) {
+      for(size_t i=0;i<768;i++){
+        wcomp[k_idx][i] = 0; /* the accumulators must start from zero on every attempt */
+      }
+    }
+
+      for (size_t l_idx = 0; l_idx < L; l_idx++) {
+        /* Sample intermediate vector y */
+        poly_uniform_gamma1_stack(tmp0, rhoprime, L*nonce + l_idx, &state.s256);
+        poly_ntt(tmp0);
+
+        /* Matrix-vector multiplication */
+        for (size_t k_idx = 0; k_idx < K; k_idx++) {
+          // sampling of the matrix entry and packing into wcomp inlined into the basemul
+          poly_uniform_pointwise_montgomery_polywadd_stack(wcomp[k_idx], tmp0, rho, (k_idx << 8) + l_idx, &state.s128);
+        }
+      }
+      nonce++; /* the z computation below uses nonce-1 to re-derive the same y */
+      for (size_t k_idx = 0; k_idx < K; k_idx++) {
+        polyw_unpack(tmp0, wcomp[k_idx]);
+        poly_invntt_tomont(tmp0);
+        poly_caddq(tmp0);
+        /* keep w in packed form for the hint stage; sig temporarily holds packed w1 */
+        polyw_pack(wcomp[k_idx], tmp0);
+        poly_highbits(tmp0, tmp0);
+        polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], tmp0);
+      }
+
+  shake256_inc_init(&state.s256);
+  shake256_inc_absorb(&state.s256, mu, CRHBYTES);
+  shake256_inc_absorb(&state.s256, sig, K*POLYW1_PACKEDBYTES);
+  shake256_inc_finalize(&state.s256);
+  shake256_inc_squeeze(sig, CTILDEBYTES, &state.s256); /* overwrites the start of the w1 bytes just hashed */
+  poly_challenge(tmp0, sig);
+
+  poly_challenge_compress(ccomp, tmp0); /* tmp0 is reused below, so keep c in ccomp */
+  /* z is built from the small-NTT product of c with s1 and the re-sampled y */
+  /* Compute z, reject if it reveals secret */
+    for(size_t l_idx=0;l_idx < L; l_idx++){
+    if(l_idx != 0){ /* on the first pass tmp0 still holds c from poly_challenge */
+      poly_challenge_decompress(tmp0, ccomp);
+    }
+      poly_small_ntt_copy(scp, tmp0);
+      unpack_sk_s1(stmp0, sk, l_idx);
+      small_ntt(stmp0->coeffs);
+      poly_small_basemul_invntt(tmp0, scp, stmp0);
+      /* presumably adds y[l_idx], re-sampled with the nonce in use before nonce++ */
+      poly_uniform_gamma1_add_stack(tmp0, tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256);
+
+      poly_reduce(tmp0);
+
+      if(poly_chknorm(tmp0, GAMMA1 - BETA))
+        goto rej;
+
+      polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, tmp0);
+  }
+
+
+  /* Write signature */
+  unsigned int hint_n = 0;        /* re-initialised whenever control passes here, */
+  unsigned int hints_written = 0; /* i.e. also after each restart */
+  /* Check that subtracting cs2 does not change high bits of w and low bits
+   * do not reveal secret information */
+  /* Same small-NTT product as above, now with s2 and one row at a time */
+  for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
+    poly_challenge_decompress(tmp0, ccomp);
+    poly_small_ntt_copy(scp, tmp0);
+
+    unpack_sk_s2(stmp0, sk, k_idx);
+    small_ntt(stmp0->coeffs);
+    poly_small_basemul_invntt(tmp0, scp, stmp0);
+
+    polyw_sub(tmp0, wcomp[k_idx], tmp0);
+    poly_reduce(tmp0);
+    /* wcomp[k_idx] now holds the reduced difference, consumed by the hint computation */
+    polyw_pack(wcomp[k_idx], tmp0);
+
+    poly_lowbits(tmp0, tmp0);
+    poly_reduce(tmp0);
+    if(poly_chknorm(tmp0, GAMMA2 - BETA)){
+      goto rej;
+    }
+    /* the offset skips rho, key, tr, s1 and s2 to reach the packed t0 of row k_idx */
+    poly_schoolbook(tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
+      L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES);
+
+    /* Compute hints for w1 */
+
+    if(poly_chknorm(tmp0, GAMMA2)) {
+      goto rej;
+    }
+
+    hint_n += poly_make_hint_stack(tmp0, tmp0, wcomp[k_idx]);
+
+    if (hint_n > OMEGA) { /* too many hints in total: reject */
+      goto rej;
+    }
+    pack_sig_h(sig, tmp0, k_idx, &hints_written);
+  }
+  pack_sig_h_zero(sig, &hints_written);
+  *siglen = CRYPTO_BYTES;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign
+*
+* Description: Compute signed message (signature first, then the message).
+*
+* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
+*                             array with CRYPTO_BYTES + mlen bytes),
+*                             can be equal to m
+*              - size_t *smlen: pointer to output length of signed
+*                               message
+*              - const uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success); the result of crypto_sign_signature is not checked
+**************************************************/
+int crypto_sign(uint8_t *sm,
+                size_t *smlen,
+                const uint8_t *m,
+                size_t mlen,
+                const uint8_t *sk)
+{
+  /* Copy back to front so that the move is safe when sm == m. */
+  for(size_t pos = mlen; pos-- > 0; )
+    sm[CRYPTO_BYTES + pos] = m[pos];
+
+  crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, sk);
+  *smlen = *smlen + mlen;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_verify
+*
+* Description: Verifies signature, recomputing w1 one row at a time.
+*
+* Arguments:   - const uint8_t *sig: pointer to input signature
+*              - size_t siglen: length of signature
+*              - const uint8_t *m: pointer to message
+*              - size_t mlen: length of message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signature could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_verify(const uint8_t *sig,
+                       size_t siglen,
+                       const uint8_t *m,
+                       size_t mlen,
+                       const uint8_t *pk)
+{
+  unsigned int i;
+  /* The unions below overlay buffers that are used at different times */
+  poly p;
+
+  union {
+    uint8_t w1_packed[POLYW1_PACKEDBYTES];
+    uint8_t wcomp[768];
+  } w1_packed_comp;
+  uint8_t *w1_packed = w1_packed_comp.w1_packed;
+  uint8_t *wcomp  = w1_packed_comp.wcomp;
+
+  union { /* mu is absorbed into s256 before ccomp is written */
+    uint8_t ccomp[68];
+    uint8_t mu[CRHBYTES];
+  } ccomp_mu;
+  uint8_t *ccomp = ccomp_mu.ccomp;
+  uint8_t *mu  = ccomp_mu.mu;
+
+  shake256incctx s256; /* stays open across the row loop to absorb each packed w1 */
+
+  union {
+    uint8_t hint_ones[OMEGA];
+    shake128incctx s128;
+    uint8_t c2[CTILDEBYTES];
+  } shake_hint;
+
+  uint8_t *hint_ones   = shake_hint.hint_ones;
+  shake128incctx *s128 = &shake_hint.s128;
+  uint8_t *c2          = shake_hint.c2;
+
+  if(siglen != CRYPTO_BYTES)
+    return -1;
+
+  /* Compute CRH(h(rho, t1), msg); the first hash of pk goes into mu as well */
+  shake256_inc_init(&s256);
+  shake256_inc_absorb(&s256, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256_inc_finalize(&s256);
+  shake256_inc_squeeze(mu, CRHBYTES, &s256);
+  /* NOTE(review): TRBYTES bytes are read from mu below but CRHBYTES were squeezed - confirm TRBYTES <= CRHBYTES */
+  shake256_inc_init(&s256);
+  shake256_inc_absorb(&s256, mu, TRBYTES);
+  shake256_inc_absorb(&s256, m, mlen);
+  shake256_inc_finalize(&s256);
+  shake256_inc_squeeze(mu, CRHBYTES, &s256);
+
+  shake256_inc_init(&s256);
+  shake256_inc_absorb(&s256, mu, CRHBYTES); /* last read of mu; its storage becomes ccomp */
+
+  /* Matrix-vector multiplication; compute Az - c2^dt1 */
+  poly_challenge_stack(&p, sig);
+  poly_challenge_compress(ccomp, &p);
+
+  for (size_t k_idx = 0; k_idx < K; k_idx++) {
+    for(size_t widx=0;widx<768;widx++){
+        wcomp[widx] = 0; /* fresh accumulator for this row */
+    }
+    /* z is unpacked and norm-checked again for every row instead of being kept */
+    polyz_unpack(&p, sig + CTILDEBYTES);
+    if(poly_chknorm(&p, GAMMA1 - BETA))
+      return -1;
+    poly_ntt(&p);
+    /* pk is passed as the seed pointer: the call sites rely on rho being at its start */
+    poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, pk, (k_idx << 8) + 0, s128);
+
+    for (size_t l_idx = 1; l_idx < L; l_idx++) {
+      polyz_unpack(&p, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
+      if(poly_chknorm(&p, GAMMA1 - BETA))
+        return -1;
+      poly_ntt(&p);
+      poly_uniform_pointwise_montgomery_polywadd_stack(wcomp, &p, pk, (k_idx << 8) + l_idx, s128);
+    }
+    polyw_unpack(&p, wcomp);
+    poly_reduce(&p);
+    poly_invntt_tomont(&p);
+    polyw_pack(wcomp, &p);
+    /* p is reused for the product of the challenge with the packed t1 of row k_idx */
+    poly_schoolbook_t1(&p, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES);
+
+    polyw_sub(&p, wcomp, &p);
+    poly_reduce(&p);
+
+    /* Reconstruct w1 */
+    poly_caddq(&p);
+    /* hint_ones overlays s128, which is not needed again in this iteration */
+    if (unpack_sig_h_indices(hint_ones, &i, k_idx, sig) != 0)
+    {
+      return -1;
+    }
+    poly_use_hint_stack(&p, &p, hint_ones, i); /* i was written by unpack_sig_h_indices */
+
+    polyw1_pack(w1_packed, &p); /* overlays wcomp, which is no longer needed for this row */
+
+    shake256_inc_absorb(&s256, w1_packed, POLYW1_PACKEDBYTES);
+  }
+  /* Call random oracle and verify challenge */
+  shake256_inc_finalize(&s256);
+  shake256_inc_squeeze(c2, CTILDEBYTES, &s256);
+  for(i = 0; i < CTILDEBYTES; ++i) /* i is reused as a byte index; exits at the first mismatch */
+    if(sig[i] != c2[i])
+      return -1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_open
+*
+* Description: Verify signed message and, on success, extract the message.
+*
+* Arguments:   - uint8_t *m: pointer to output message (allocated
+*                            array with smlen bytes), can be equal to sm
+*              - size_t *mlen: pointer to output length of message
+*              - const uint8_t *sm: pointer to signed message
+*              - size_t smlen: length of signed message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 on success; otherwise -1 with *mlen = -1 and m zeroed (smlen bytes)
+**************************************************/
+int crypto_sign_open(uint8_t *m,
+                     size_t *mlen,
+                     const uint8_t *sm,
+                     size_t smlen,
+                     const uint8_t *pk)
+{
+  size_t i;
+  int ok = 0;
+
+  /* A signed message is at least one signature long */
+  if(smlen >= CRYPTO_BYTES) {
+    *mlen = smlen - CRYPTO_BYTES;
+    ok = (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, pk) == 0);
+  }
+
+  if(ok) {
+    /* Valid signature: move the message to the front of the output */
+    for(i = 0; i < *mlen; ++i)
+      m[i] = sm[CRYPTO_BYTES + i];
+    return 0;
+  }
+
+  /* Too short or invalid: report failure and wipe the output buffer */
+  *mlen = -1;
+  for(i = 0; i < smlen; ++i)
+    m[i] = 0;
+
+  return -1;
+}
diff --git a/crypto_sign/dilithium2/m4fstack/sign.h b/crypto_sign/dilithium2/m4fstack/sign.h
new file mode 120000
index 0000000..551f979
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/sign.h
@@ -0,0 +1 @@
+../m4f/sign.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/smallntt.h b/crypto_sign/dilithium2/m4fstack/smallntt.h
new file mode 100644
index 0000000..244fad2
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallntt.h
@@ -0,0 +1,47 @@
+/**
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SMALLNTT_H
+#define SMALLNTT_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define SMALL_Q 769
+
+static const int32_t zetas_769[64] = { // handed to small_basemul_asm_769 by small_basemul(); entries above INT32_MAX rely on implementation-defined conversion to int32_t
+	3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838};
+
+static const int32_t zetas_asm_769[128] = { // read in order by small_ntt_asm_769: 15 words preloaded, then 7 per iteration of its second loop; the final 0 looks like padding
+	346278248, 223405321, 966228013, 759578091, -150798592, 318352582, -1736976371, 1697880440, -2105595150, -804259156, 1675539907, -1016494210, 1401868389, -2005062756, 240160720, 474736307, -1200803600, -1435379187, -1156122536, 1334846793, 999738811, 1854264164, -631120032, -787503756, -1580592646, 1681125040, 1150537403, -1474475119, -1223144132, 1809583100, -100532394, -1938041160, 726067293, 2066499219, -1022079344, 1055590142, 525002504, 273671518, -212235055, -39095931, 1871019563, -1563837247, 1826338499, 139628326, 27925665, 1731391238, 513832238, 1792827701, -921546949, -1301335995, 67021596, 1117026605, 536172770, 1161707669, -988568545, -776333490, -888036151, 1290165729, -497076839, -753992958, 1586177779, -441225509, -977398279, -469151174, -1614103444, 1591762912, -94947261, 971813146, 122872927, 217820188, 619949766, -1709050706, 1010909077, -1748146637, -541757903, 770748357, -195479656, 765163224, 1413038655, 1781657435, -1206388733, -664630830, 1742561504, -815429422, 982983412, 357448514, 44681064, -1524741316, -1485645385, -1915700627, 703726761, 681386229, 686971362, 1787242568, -860110486, -184309390, 1457719719, 1217558999, -1820753366, -502661972, -1921285760, 1139367137, 1195218467, 1089100940, 564098435, 614364633, -1100271206, 457980908, -1669954774, -659045697, 2088839751, -592024101, 1949211426, 1368357591, 698141628, 335107981, -1725806105, 374203913, -312767449, 2083254618, -1061175275, -2139105948, 519417371, 1513571050, -647875431, 413299844, -145213459, 0};
+
+// Twiddle factors for the inverse NTT (CT butterflies); passed to small_invntt_asm_769 by small_invntt_tomont()
+static const int32_t zetas_inv_asm_769[256] = {
+	5585134, 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 5585134, 1736976371, -966228013, 150798592, -346278248, -318352582, -223405321, -759578091,
+	// removed first "2285" + LAYER 3+2+1 - 1 - butterfly
+	5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, -1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, 
-536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0};
+
+// Assembly routines working modulo Q1 = 769 (SMALL_Q); each takes its twiddle table as a pointer
+void small_ntt_asm_769(int16_t a[N], const int32_t * zetas);     // forward NTT, in place on a
+void small_invntt_asm_769(int16_t a[N], const int32_t * zetas);  // inverse NTT, in place on a
+void small_basemul_asm_769(int16_t *c, const int16_t *a, const int16_t *b, const int32_t *zetas);
+
+// Wrappers binding each routine to its table; sign.c uses small_ntt on the unpacked s1 and s2 polynomials
+#define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769)
+#define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769)
+#define small_basemul(r,a,b) small_basemul_asm_769(r, a, b, zetas_769)
+
+#endif
diff --git a/crypto_sign/dilithium2/m4fstack/smallntt_769.S b/crypto_sign/dilithium2/m4fstack/smallntt_769.S
new file mode 100644
index 0000000..1c3c9a8
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallntt_769.S
@@ -0,0 +1,691 @@
+/* 
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * NTT and inverse NTT code from: 
+ * Huang, J. et al. 2024. Revisiting Keccak and Dilithium Implementations on ARMv7-M. 
+ * IACR Transactions on Cryptographic Hardware and Embedded Systems. 2024, 2 (Mar. 2024), 1–24.
+ * DOI:https://doi.org/10.46586/tches.v2024.i2.1-24.
+ * https://github.com/UIC-ESLAS/Dilithium-Multi-Moduli/blob/332a32cc02d407020e48a4f9b3a0dc78d4c8b0bc/M4/crypto_sign/dilithium3/m4plant/smallntt_769.S
+*/
+
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+#include "macros_smallntt.i"
+// #######
+// #######
+// # NTT #
+// #######
+// #######
+
+.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	// layer 3: one twiddle word, used for all butterflies of this layer
+	ldr.w \twiddle1, [\twiddle_ptr], #4
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	// layer 2: two twiddle words, one for c0..c3 and one for c4..c7
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa
+
+	// layer 1: four twiddle words in two loads; twiddle_ptr has advanced 7 words (28 bytes) in total at the end
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp
+	// layer 3: same butterfly sequence as _3_layer_double_CT_16_plant, but twiddles come from FP registers xi0..xi6
+	vmov \twiddle1, \xi0
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	// layer 2: xi1 for c0..c3, xi2 for c4..c7
+	vmov \twiddle1, \xi1
+	vmov \twiddle2, \xi2
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa
+
+	// layer 1: xi3/xi4 for c0..c3, xi5/xi6 for c4..c7
+	vmov \twiddle1, \xi3
+	vmov \twiddle2, \xi4
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	vmov \twiddle1, \xi5
+	vmov \twiddle2, \xi6
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.global small_ntt_asm_769
+.type small_ntt_asm_769, %function
+.align 2
+small_ntt_asm_769:
+	push {r4-r11, r14}
+	vpush.w {s16-s24} // s16-s22 hold twiddles, s23/s24 hold the saved pointer and loop bound
+	poly         .req r0
+	twiddle_ptr  .req r1
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	###  qinv        .req r11 ### q^-1 mod 2^2n; n=16
+	q           .req r12 
+	### 769 sits in the top halfword of r12 (see movt below)
+	qa          .req r0 // same register as poly: poly is parked in s23 while qa is live
+	### qa = 2^a * q with a = 5 (24608 = 32 * 769); held in r0, not in r12
+	tmp         .req r14
+
+	// qa is loaded inside the loops (movw qa, #24608) because r0 is shared with poly
+	// Why movt? Because we initially placed qa at the bottom of the same register as q; only the top halfword is written here
+	movt q, #769
+
+	### LAYER 7+6+5+4
+	.equ distance, 256
+	.equ offset, 32
+	.equ strincr, 4
+	// pre-load 15 twiddle factors to 15 FPU registers (writeback advances twiddle_ptr by 15 words)
+	// s0-s7 used to temporary store 16 16-bit polys.
+	vldm twiddle_ptr!, {s8-s22}
+ 
+	add tmp, poly, #strincr*8 // loop bound: poly advances by 4 bytes per iteration, so 8 iterations
+	// s23: poly addr
+	// s24: tmp  
+	vmov s24, tmp  
+	1:
+		// load a1, a3, ..., a15
+		vmov s23, poly // save the pointer: r0 is overwritten by qa below
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+		
+		movw qa, #24608 // clobbers poly (r0); 24608 = 32 * 769
+
+		// 8-NTT on a1, a3, ..., a15
+		_3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// s15, s16, s17, s18, s19, s20, s21, s22 left
+		// multiply coeffs by the eight remaining twiddles (s15-s22) so the last layer below is add/sub only
+		vmov twiddle1, s15 
+		vmov twiddle2, s16 
+		mul_twiddle_plant poly0, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18 
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20 
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22 
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+
+		vmov s0, poly0 // a1
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+
+		vmov poly, s23 // restore the pointer for the second batch of loads
+	
+		// load a0, a2, ..., a14
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608 // r0 held poly again, so qa must be reloaded
+		// 8-NTT on a0, a2, ..., a14
+		_3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23
+		vmov twiddle1, s1 // load a3
+		uadd16 tmp, poly1, twiddle1 // per-halfword sum goes to the lower address,
+		usub16 poly1, poly1, twiddle1 // per-halfword difference to that address + offset
+		str.w tmp, [poly, #1*distance/4]
+		str.w poly1, [poly, #1*distance/4+offset]
+
+		vmov twiddle1, s3 // load a7
+		uadd16 tmp, poly3, twiddle1
+		usub16 poly3, poly3, twiddle1
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle1, s5 // load a11
+		uadd16 tmp, poly5, twiddle1
+		usub16 poly5, poly5, twiddle1
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle1, s7 // load a15
+		uadd16 tmp, poly7, twiddle1
+		usub16 poly7, poly7, twiddle1
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle1, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle1, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle1, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle1, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle1, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle1, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle1, poly0, poly1
+		str.w twiddle1, [poly, #offset]
+		str.w tmp, [poly], #4 // post-increment: move on to the next word
+
+	vmov tmp, s24
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr // rewind the 8 * 4 bytes advanced by the loop above
+
+	### LAYER 3+2+1
+
+	.equ distance, distance/16
+	.equ strincr, 32
+
+	add.w tmp, poly, #strincr*16 // loop bound: poly + 512 bytes, i.e. 16 iterations of 32 bytes
+	vmov s13, tmp // s13 is free again: the preloaded twiddles are only read in the first loop
+	2:
+		vmov s23, poly
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608
+		_3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+		
+		vmov poly, s23 // restore the pointer clobbered by qa
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #strincr
+
+	vmov tmp, s13
+	cmp.w poly, tmp
+	bne.w 2b
+	vpop.w {s16-s24}
+	pop {r4-r11, pc}
+
+.unreq poly
+.unreq twiddle_ptr
+.unreq poly0
+.unreq poly1
+.unreq poly2
+.unreq poly3
+.unreq poly4
+.unreq poly5
+.unreq poly6
+.unreq poly7
+.unreq twiddle1
+.unreq twiddle2
+.unreq q
+.unreq qa
+.unreq tmp
+
+
+// ########
+// ########
+// # INTT #
+// ########
+// ########
+
+// Light 3-layer inverse step: plain sadd16/ssub16 wherever no twiddle is applied. input: 0.5/1q
+.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp
+
+	// layer 1: add/sub only, no twiddle multiplication
+	sadd16.w \tmp, \c0, \c1 // c0, c1
+	ssub16.w \c1, \c0, \c1
+	sadd16.w \tmp2, \c2, \c3 // c2, c3
+	ssub16.w \c3, \c2, \c3
+	// tmp, c1, tmp2, c3: 1q maximum
+	sadd16.w \c0, \c4, \c5 // c4, c5
+	ssub16.w \c5, \c4, \c5
+	sadd16.w \c2, \c6, \c7 // c6, c7
+	ssub16.w \c7, \c6, \c7
+	// c4, c6 are free at this point
+	// c0,c5,c2,c7 1q maximum
+
+	// layer 2: the sums use add/sub, the differences use twiddle xi2
+	sadd16.w \c6, \tmp, \tmp2 // c0, c2
+	ssub16.w \tmp2, \tmp, \tmp2
+	sadd16.w \c4, \c0, \c2 // c4, c6
+	ssub16.w \c2, \c0, \c2
+	// c6, tmp2, c4, c2: 2q maximum
+
+	vmov.w \twiddle1, \xi2
+	doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa
+	doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa 
+	// c1, c3, c7, c5: 1.5q maximum;
+
+	// tmp and c0 are free at this point
+	// layer 3: c0/c4 by add/sub, the other three pairs with xi4, xi5, xi6
+	sadd16.w \c0, \c6, \c4 // c0, c4
+	ssub16.w \c4, \c6, \c4
+	// c0, c4: 4q
+	// c6 are free at this point
+	vmov.w \twiddle1, \xi4
+	doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa
+	// c1, c5: 2q maximum
+
+	vmov.w \twiddle1, \xi5
+	// this block is one doublebutterfly, written out because its first input is in tmp2
+	smulwb \tmp, \twiddle1, \c2  // c2, c6
+	smulwt \c2,  \twiddle1, \c2
+	smlabt \tmp, \tmp, \q, \qa
+	smlabt \c2, \c2, \q, \qa
+	pkhtb \tmp, \c2, \tmp, asr#16 // repack the two 16-bit products into one word
+	ssub16.w \c6, \tmp2, \tmp 
+	sadd16.w \c2, \tmp2, \tmp
+	//c6, c2: 4.5q
+	vmov.w \twiddle1, \xi6
+	doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa
+	//c3, c7: 2.5q maximum
+.endm
+.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	// layer 3: one twiddle word for all four butterflies on adjacent registers
+	ldr.w \twiddle1, [\twiddle_ptr], #4
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	// layer 2: two twiddle words, the same pair for c0..c3 and for c4..c7
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	// layer 1: four twiddle words in two loads; twiddle_ptr has advanced 7 words (28 bytes) in total at the end
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twist step: each of c0..c7 is multiplied by its own twiddle; 8 words are read from twiddle_ptr in total (post-incremented)
+	mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c2, c3
+	mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c4, c5
+	mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c6, c7
+	mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa
+.endm
+# input coefficients < 0.5q; in-place inverse NTT for q=769: r0 = polynomial (read and written through poly), r1 = twiddle table (consumed sequentially)
+.global small_invntt_asm_769
+.type small_invntt_asm_769, %function
+.align 2
+small_invntt_asm_769:
+	push {r4-r11, r14}
+	vpush.w {s16-s23} // s16-s23 are overwritten below (twiddles in s16-s22, saved poly pointer in s23)
+	poly         .req r0
+	twiddle_ptr  .req r1
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	q            .req r12 
+	// at the top of r12
+	qa           .req r0 // NOTE: same register as poly; poly is parked in an FPU register (s23 or s6) whenever qa is live
+	// qa = 2^a * q; the constant loaded below is 24608 = 32*769, i.e. a=5; it is kept in r0, not in r12
+	tmp          .req r14
+
+	movt q, #769 // writes only the top halfword of q; the bottom halfword is left as it was
+
+	### LAYER 7+6+5+4
+	.equ distance, 16
+	.equ offset, 32
+	.equ strincr, 64
+
+	// pre-load twiddle factors to FPU registers (15 words into s8-s22)
+	vldm twiddle_ptr!, {s8-s22}
+
+	add.w tmp, poly, #8*strincr // end pointer for loop 1: 8 iterations of strincr bytes each
+	vmov s8, tmp // s8 is reused for the end pointer; the word loaded into it above is not read afterwards in this function
+	1:
+		vmov s23, poly // park poly: r0 is about to be reused as qa
+		// load a1, a3, ..., a15 (the eight words at byte offsets offset .. offset+28)
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+
+		movw qa, #24608 // overwrites r0 (poly); poly is restored from s23 further down
+
+		// NTT on a1, a3, ..., a15   
+		// twiddle2 is used as tmp2
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// multiply coeffs by layer 4 twiddles for later use (twiddles s16..s22 for poly1..poly7; poly0 is left unscaled)
+		// vmov twiddle1, s15 
+		vmov twiddle2, s16
+		// mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+
+		vmov s0, poly0 // a1 (stash the first group in s0-s7 so poly0-poly7 can hold the second group)
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+		// 0.5q
+		// ----------
+
+		vmov poly, s23 // restore poly (r0 held qa until here)
+		// load a0, a2, ..., a14 (the eight words at byte offsets 0 .. 28)
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608 // r0 becomes qa again
+		// NTT on a0, a2, ..., a14
+		// twiddle2 is used as tmp2
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+		// 1,3,5,7: <5q; 0,2,4,6:<1q
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23 // restore poly for the stores; qa is not needed for the add/sub below
+		vmov twiddle2, s1 // load a3
+		uadd16 tmp, poly1, twiddle2
+		usub16 poly1, poly1, twiddle2
+		str.w tmp, [poly, #1*distance/4] // sum goes to the lower group, difference to the same slot + offset
+		str.w poly1, [poly, #1*distance/4+offset]
+
+		vmov twiddle2, s3 // load a7
+		uadd16 tmp, poly3, twiddle2
+		usub16 poly3, poly3, twiddle2
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle2, s5 // load a11
+		uadd16 tmp, poly5, twiddle2
+		usub16 poly5, poly5, twiddle2
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle2, s7 // load a15
+		uadd16 tmp, poly7, twiddle2
+		usub16 poly7, poly7, twiddle2
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		//1,3,5,7: < 5.5q
+
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5 (poly3, poly5, poly7, poly1 are free again and serve as scratch here)
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle2, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle2, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle2, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle2, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle2, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle2, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle2, poly0, poly1
+		str.w twiddle2, [poly, #offset]    
+		str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
+		//0,2,4,6: < 1.5q
+	vmov tmp, s8 // end pointer computed before the loop
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr  // rewind poly to the start of the polynomial (uses the old strincr = 64)
+
+	### LAYER 3+2+1
+
+	.equ distance, distance*16 // distance becomes 256 bytes
+	.equ strincr, 4
+
+	// ITER 0: uses the light macro with twiddles from s1, s3, s4, s5
+	vmov s6, poly // park poly while r0 serves as qa
+	load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+	load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+	vldm twiddle_ptr!, {s0-s5} // 6 more twiddle words
+	movw qa, #24608
+	// twiddle2 is used as tmp2
+	_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp
+
+	// twisting (8 twiddle words read through twiddle_ptr)
+	_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	
+	vmov poly, s6 // restore poly for the stores
+	store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+	str.w poly1, [poly, #distance/4]
+	str.w poly2, [poly, #2*distance/4]
+	str.w poly3, [poly, #3*distance/4]
+	str.w poly0, [poly], #4 // post-increment: advance to the next word column
+
+	// ITER 1-15
+	add.w tmp, poly, #strincr*3*(5) // end pointer: 60 bytes ahead, i.e. 15 more word columns
+	vmov s14, tmp
+	2:
+		vmov s6, poly // park poly while r0 serves as qa
+		// polys upto 5.5q
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608
+		_3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		// twisting (together with the macro above this consumes 7 + 8 twiddle words per iteration)
+		_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		vmov poly, s6 // restore poly for the stores
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #4
+
+	vmov tmp, s14 // end pointer computed before the loop
+	cmp.w poly, tmp
+	bne.w 2b
+
+	vpop.w {s16-s23}
+	pop {r4-r11, pc}
+
+.unreq poly
+.unreq twiddle_ptr
+.unreq poly0
+.unreq poly1
+.unreq poly2
+.unreq poly3
+.unreq poly4
+.unreq poly5
+.unreq poly6
+.unreq poly7
+.unreq twiddle1
+.unreq twiddle2
+.unreq q
+.unreq qa
+.unreq tmp
+
+// BASEMUL
+
+/* 
+* Basemul code (adapted to q=769) from: 
+* Huang, J. et al. 2022. Improved Plantard Arithmetic for Lattice-based Cryptography.
+* IACR Transactions on Cryptographic Hardware and Embedded Systems. 2022, 4 (Aug. 2022), 614–636.
+* DOI:https://doi.org/10.46586/tches.v2022.i4.614-636.
+* https://github.com/UIC-ESLAS/ImprovedPlantardArithmetic/blob/f3482cfd09dda8f1f55b95e13616147e3b6dd008/crypto_kem/kyber768/m4fstack/fastbasemul.S
+*/
+
+.global small_basemul_asm_769
+.type small_basemul_asm_769, %function
+.align 2
+small_basemul_asm_769:
+	push {r4-r11, lr}
+
+	rptr    .req r0
+	aptr    .req r1
+	bptr    .req r2
+	zetaptr .req r3
+	poly0   .req r4
+	poly1   .req r6
+	poly2   .req r5
+	poly3   .req r7
+	q       .req r8
+	qa      .req r14
+	qinv    .req r9
+	tmp     .req r10
+	tmp2    .req r11
+	zeta    .req r12
+	loop    .req r14
+	// NOTE: qa and loop share r14; the counter is parked in s0 while r14 holds qa
+	movt  q, #769
+	movw qinv, #64769
+	movt qinv, #58632
+
+	movw loop, #64 // 64 iterations; each one consumes two words of a and of b and one zeta, and stores two result words
+	1:
+	vmov.w s0,loop
+	movw qa, #24608 // 24608 = 32*769; overwrites the loop counter in r14
+			
+	ldrd poly0, poly2, [aptr], #8 // poly0, poly2: two consecutive words of a
+	ldrd poly1, poly3, [bptr], #8 // poly1, poly3: two consecutive words of b
+	// ldr poly0, [aptr], #4
+	// ldr poly1, [bptr], #4
+	// ldr poly2, [aptr], #4
+	// ldr poly3, [bptr], #4
+
+	ldr.w zeta, [zetaptr], #4
+
+	// basemul(r->coeffs + 4 * i, a->coeffs + 4 * i, b->coeffs + 4 * i, zetas[64 + i]);
+	smulwt tmp, zeta, poly1 // tmp = (zeta * top lane of poly1) >> 16
+	smlabt tmp, tmp, q, qa  // tmp = tmp[15:0] * q[31:16] + qa; the reduced product is in the top halfword
+	smultt tmp, poly0, tmp  // tmp = top lane of poly0 * that reduced product
+	smlabb tmp, poly0, poly1, tmp // tmp += bottom lane of poly0 * bottom lane of poly1
+	plant_red q, qa, qinv, tmp
+	// r[0] in upper half of tmp
+	
+	smuadx tmp2, poly0, poly1 // tmp2 = poly0.bottom * poly1.top + poly0.top * poly1.bottom
+	plant_red q, qa, qinv, tmp2
+	// r[1] in upper half of tmp2
+	pkhtb tmp, tmp2, tmp, asr#16 // pack: top lane = r[1] (tmp2[31:16]), bottom lane = r[0] (tmp[31:16])
+	str tmp, [rptr], #4
+
+	neg zeta, zeta // the second pair of lanes uses the negated twiddle
+
+	// basemul(r->coeffs + 4 * i + 2, a->coeffs + 4 * i + 2, b->coeffs + 4 * i + 2, - zetas[64 + i]);
+	smulwt tmp, zeta, poly3 
+	smlabt tmp, tmp, q, qa  
+	smultt tmp, poly2, tmp  
+	smlabb tmp, poly2, poly3, tmp 
+	plant_red q, qa, qinv, tmp
+	// r[0] in upper half of tmp
+	
+	smuadx tmp2, poly2, poly3 
+	plant_red q, qa, qinv, tmp2
+	// r[1] in upper half of tmp2
+	pkhtb tmp, tmp2, tmp, asr#16
+	str tmp, [rptr], #4
+		
+	vmov.w loop,s0 // bring the counter back into r14 (qa is dead until the next iteration reloads it)
+	subs.w loop, #1
+	bne.w 1b
+
+	.unreq rptr   
+	.unreq aptr   
+	.unreq bptr   
+	.unreq zetaptr
+	.unreq poly0  
+	.unreq poly1  
+	.unreq poly2  
+	.unreq poly3  
+	.unreq q      
+	.unreq qa     
+	.unreq qinv   
+	.unreq tmp    
+	.unreq tmp2   
+	.unreq zeta   
+	.unreq loop   
+
+	pop {r4-r11, pc}
+//-0.5p~0.5p
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/smallpoly.c b/crypto_sign/dilithium2/m4fstack/smallpoly.c
new file mode 100644
index 0000000..433d98a
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallpoly.c
@@ -0,0 +1,83 @@
+#include "smallpoly.h"
+#include "smallntt.h"
+
+void poly_small_ntt_copy(smallpoly *out, poly *in) {
+  /* Narrow every coefficient of in to the 16-bit smallpoly, walking from
+   * index N-1 down to 0, then run the small NTT on the copy in place. */
+  int idx = N;
+  while (idx-- > 0)
+    out->coeffs[idx] = in->coeffs[idx];
+  small_ntt(out->coeffs);
+}
+
+
+void polyvecl_small_ntt(smallpoly v[L]) {
+  /* Forward small NTT, in place, on each of the L entries of v. */
+  smallpoly *const end = v + L;
+  for(smallpoly *p = v; p != end; ++p)
+    small_ntt(p->coeffs);
+}
+
+
+void polyveck_small_ntt(smallpoly v[K]) {
+  /* Forward small NTT, in place, on each of the K entries of v. */
+  smallpoly *const end = v + K;
+  for(smallpoly *p = v; p != end; ++p)
+    small_ntt(p->coeffs);
+}
+
+
+
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b){
+    // re-use the buffer: the 16-bit product is computed directly inside the storage of r
+    smallpoly *tmp = (smallpoly *)r;
+    small_basemul(tmp->coeffs, a->coeffs, b->coeffs);   // tmp = basemul(a, b)
+    small_invntt_tomont(tmp->coeffs);                   // inverse transform, in place
+
+    #ifdef SMALL_POLY_16_BIT
+    int j;
+    // buffer is the same, so we need to be careful: widen from the top index down, so that a store to r->coeffs[j] cannot overwrite a tmp->coeffs[k] with k < j that is still to be read (assumes poly coefficients are wider than 16 bit -- TODO confirm against poly.h)
+    for(j=N-1;j>=0;j--){
+        r->coeffs[j] = tmp->coeffs[j];
+    }
+    #endif
+}
+
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]){
+    /* r->vec[k] = poly_small_basemul_invntt(a, b[k]) for each of the L entries; a is shared */
+    unsigned int k = 0;
+    for(; k != L; ++k)
+        poly_small_basemul_invntt(r->vec + k, a, b + k);
+}
+
+void small_polyeta_unpack(smallpoly *r, const uint8_t *a) {
+  /* Unpack a bit-packed polynomial into 16-bit coefficients:
+   * every stored field f is turned into the coefficient ETA - f. */
+  unsigned int g;
+
+#if ETA == 2
+  /* 3-bit fields, eight coefficients per three input bytes */
+  for(g = 0; g < N/8; ++g) {
+    const uint8_t *src = a + 3*g;
+    int16_t *dst = r->coeffs + 8*g;
+
+    dst[0] = ETA - ((src[0] >> 0) & 7);
+    dst[1] = ETA - ((src[0] >> 3) & 7);
+    dst[2] = ETA - (((src[0] >> 6) | (src[1] << 2)) & 7);
+    dst[3] = ETA - ((src[1] >> 1) & 7);
+    dst[4] = ETA - ((src[1] >> 4) & 7);
+    dst[5] = ETA - (((src[1] >> 7) | (src[2] << 1)) & 7);
+    dst[6] = ETA - ((src[2] >> 2) & 7);
+    dst[7] = ETA - ((src[2] >> 5) & 7);
+  }
+#elif ETA == 4
+  /* 4-bit fields, two coefficients per input byte */
+  for(g = 0; g < N/2; ++g) {
+    const uint8_t byte = a[g];
+    int16_t *dst = r->coeffs + 2*g;
+
+    /* low nibble first */
+    dst[0] = ETA - (byte & 0x0F);
+    dst[1] = ETA - (byte >> 4);
+  }
+#endif
+}
diff --git a/crypto_sign/dilithium2/m4fstack/smallpoly.h b/crypto_sign/dilithium2/m4fstack/smallpoly.h
new file mode 100644
index 0000000..1aac98f
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/smallpoly.h
@@ -0,0 +1,27 @@
+#ifndef SMALLPOLY_H
+#define SMALLPOLY_H
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+
+
+/* When defined, poly_small_basemul_invntt widens its 16-bit result into the poly in a separate pass. */
+#define SMALL_POLY_16_BIT
+typedef struct {
+    int16_t coeffs[N];   /* N coefficients of 16 bit each */
+} smallpoly;
+
+typedef smallpoly smallhalfpoly;   /* alias of smallpoly */
+/* Copy in into out coefficient by coefficient, then apply small_ntt to out. */
+void poly_small_ntt_copy(smallpoly*, poly*);
+/* Apply small_ntt in place to each of the L (resp. K) entries of v. */
+void polyvecl_small_ntt(smallpoly v[L]);
+void polyveck_small_ntt(smallpoly v[K]);
+
+/* r = small_invntt_tomont(small_basemul(a, b)); the vector form applies this to a and each b[i], i < L. */
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]);
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b);
+/* Unpack a bit-packed polynomial; each stored field f becomes the coefficient ETA - f. */
+void small_polyeta_unpack(smallpoly *r, const uint8_t *a);
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/stack.c b/crypto_sign/dilithium2/m4fstack/stack.c
new file mode 100644
index 0000000..b45f702
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/stack.c
@@ -0,0 +1,715 @@
+#include "stack.h"
+#include "fips202.h"
+#include "symmetric.h"
+#include "vector.h"
+#include "reduce.h"
+#include "rounding.h"
+
+void poly_challenge_compress(uint8_t c[68], const poly *cp){
+  /* Layout of c: indices of the nonzero coefficients of cp in ascending order
+   * from c[0] on, and a 64-bit little-endian sign word in c[60..67] whose
+   * bit n is set when the n-th nonzero coefficient equals -1. */
+  unsigned int k, count = 0;
+  uint64_t signword = 0;
+  uint64_t bit = 1;
+
+  for(k = 0; k < 68; k++)
+    c[k] = 0;
+
+  for(k = 0; k < N; ++k){
+    if(cp->coeffs[k] == 0)
+      continue;
+    c[count++] = k;
+    if(cp->coeffs[k] == -1)
+      signword |= bit;
+    bit <<= 1;
+  }
+
+  for(k = 0; k < 8; ++k)
+    c[60+k] = (unsigned char) (signword >> 8 * k);
+}
+
+void poly_challenge_decompress(poly *cp, const uint8_t c[68]){
+  /* Rebuild cp from its compressed form: c[0..TAU-1] are coefficient indices,
+   * c[60..67] is a little-endian 64-bit word of sign bits (1 means -1). */
+  unsigned int k;
+  uint64_t signword = 0;
+
+  for(k = 0; k < N; k++)
+    cp->coeffs[k] = 0;
+
+  for(k = 0; k < 8; k++)
+    signword |= ((uint64_t)c[60+k]) << (8*k);
+
+  /* bit k of the sign word belongs to the k-th listed index */
+  for(k = 0; k < TAU; k++){
+    cp->coeffs[c[k]] = (signword & 1) ? -1 : 1;
+    signword >>= 1;
+  }
+  /* positions that are never listed in c keep the value 0 */
+}
+
+
+// TODO: buffer at most 8 coeffs at once
+/* Extract coefficient number idx from a bit-packed t0 polynomial.
+ * Every group of 8 coefficients occupies 13 bytes (13 bits per field); a
+ * stored field f is returned as 2^(D-1) - f. */
+static inline int32_t polyt0_unpack_idx(const uint8_t *t0, unsigned idx){
+    const uint8_t *g = t0 + 13*(idx >> 3);   /* start of the 13-byte group */
+    int32_t field;
+
+    switch(idx % 8){
+    case 0:
+        field = (int32_t)(g[0] | ((uint32_t)g[1] << 8));
+        break;
+    case 1:
+        field = (int32_t)((g[1] >> 5) | ((uint32_t)g[2] << 3) | ((uint32_t)g[3] << 11));
+        break;
+    case 2:
+        field = (int32_t)((g[3] >> 2) | ((uint32_t)g[4] << 6));
+        break;
+    case 3:
+        field = (int32_t)((g[4] >> 7) | ((uint32_t)g[5] << 1) | ((uint32_t)g[6] << 9));
+        break;
+    case 4:
+        field = (int32_t)((g[6] >> 4) | ((uint32_t)g[7] << 4) | ((uint32_t)g[8] << 12));
+        break;
+    case 5:
+        field = (int32_t)((g[8] >> 1) | ((uint32_t)g[9] << 7));
+        break;
+    case 6:
+        field = (int32_t)((g[9] >> 6) | ((uint32_t)g[10] << 2) | ((uint32_t)g[11] << 10));
+        break;
+    default: /* idx % 8 == 7 */
+        field = (int32_t)((g[11] >> 3) | ((uint32_t)g[12] << 5));
+        break;
+    }
+
+    /* keep the 13 field bits only, then map the field to the coefficient */
+    return (1 << (D-1)) - (field & 0x1FFF);
+}
+
+static inline int32_t polyt1_unpack_idx(const uint8_t *t1, unsigned idx){
+    /*
+     * Extract coefficient number idx from a bit-packed t1 polynomial in
+     * which every 4 coefficients (10 bits each) occupy 5 bytes.
+     *
+     * Coefficient k = idx % 4 of a group starts at bit 10*k, i.e. at bit
+     * 2*k of byte k, and continues into byte k+1:
+     *   k = 0: t1[0] >> 0 | t1[1] << 8
+     *   k = 1: t1[1] >> 2 | t1[2] << 6
+     *   k = 2: t1[2] >> 4 | t1[3] << 4
+     *   k = 3: t1[3] >> 6 | t1[4] << 2
+     */
+    const unsigned k = idx % 4;
+    const uint8_t *grp = t1 + 5*(idx >> 2);
+    int32_t field;
+
+    field  = grp[k] >> (2*k);
+    field |= (uint32_t)grp[k+1] << (8 - 2*k);
+
+    return field & 0x3FF;
+}
+
+void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){
+  /*
+   * c = challenge * t0 modulo X^N + 1, schoolbook style.
+   * ccomp is the compressed challenge: ccomp[0..TAU-1] are the positions
+   * of the nonzero coefficients, ccomp[60..67] a little-endian word of
+   * sign bits (bit set: coefficient -1, bit clear: coefficient +1).
+   * t0 stays bit-packed; it is unpacked one coefficient at a time.
+   */
+  unsigned shift, src, dst, term;
+  uint64_t signword = 0;
+
+  for(dst = 0; dst < N; dst++)
+    c->coeffs[dst] = 0;
+  for(src = 0; src < 8; src++)
+    signword |= ((uint64_t)ccomp[60+src]) << (8*src);
+
+  for(term = 0; term < TAU; term++, signword >>= 1){
+    const int negative = (int)(signword & 1);
+    shift = ccomp[term];
+    for(src = 0; src < N; src++){
+      /* X^shift * X^src wraps around with a sign flip once shift+src >= N */
+      const int32_t v = polyt0_unpack_idx(t0, src);
+      const int wrapped = (shift + src >= N);
+      dst = wrapped ? shift + src - N : shift + src;
+      if(negative != wrapped) c->coeffs[dst] -= v;
+      else                    c->coeffs[dst] += v;
+    }
+  }
+}
+
+void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1){
+  /*
+   * c = challenge * (t1 << D) modulo X^N + 1, schoolbook style.
+   * ccomp is the compressed challenge: ccomp[0..TAU-1] are the positions
+   * of the nonzero coefficients, ccomp[60..67] a little-endian word of
+   * sign bits (bit set: coefficient -1, bit clear: coefficient +1).
+   * t1 stays bit-packed; it is unpacked one coefficient at a time.
+   */
+  unsigned shift, src, dst, term;
+  uint64_t signword = 0;
+
+  for(dst = 0; dst < N; dst++)
+    c->coeffs[dst] = 0;
+  for(src = 0; src < 8; src++)
+    signword |= ((uint64_t)ccomp[60+src]) << (8*src);
+
+  for(term = 0; term < TAU; term++, signword >>= 1){
+    const int negative = (int)(signword & 1);
+    shift = ccomp[term];
+    for(src = 0; src < N; src++){
+      /* X^shift * X^src wraps around with a sign flip once shift+src >= N */
+      const int32_t v = (polyt1_unpack_idx(t1, src) << D);
+      const int wrapped = (shift + src >= N);
+      dst = wrapped ? shift + src - N : shift + src;
+      if(negative != wrapped) c->coeffs[dst] -= v;
+      else                    c->coeffs[dst] += v;
+    }
+  }
+}
+
+
+void polyw_pack(uint8_t buf[3*256], poly *w){
+  /* normalise w in place, then emit 3 little-endian bytes per coefficient */
+  poly_reduce(w);
+  poly_caddq(w);
+  for(unsigned int k = 0; k < N; k++){
+    buf[3*k + 0] = w->coeffs[k];
+    buf[3*k + 1] = w->coeffs[k] >> 8;
+    buf[3*k + 2] = w->coeffs[k] >> 16;
+  }
+}
+
+void polyw_unpack(poly *w, const uint8_t buf[3*256]) {
+  /* each coefficient is rebuilt from 3 little-endian bytes (24-bit value) */
+  for(unsigned int k = 0; k < N; k++){
+    const uint8_t *in = buf + 3*k;
+    w->coeffs[k] = in[0] | ((int32_t)in[1] << 8)
+                         | ((int32_t)in[2] << 16);
+  }
+}
+
+
+static void polyw_add_idx(uint8_t buf[3*256], int32_t a, size_t i){
+  /* Read-modify-write of packed coefficient i: unpack its 3 little-endian
+   * bytes, add a, pass the sum through freeze(), and pack the result again. */
+  uint8_t *slot = buf + i*3;
+  int32_t v;
+  v  = slot[0];
+  v |= (int32_t)slot[1] << 8;
+  v |= (int32_t)slot[2] << 16;
+  v  = freeze(v + a);
+
+  slot[0] = v;
+  slot[1] = v >> 8;
+  slot[2] = v >> 16;
+}
+
+void polyw_add(uint8_t buf[3*256], poly *p){
+  /* buf (packed, 3 bytes per coefficient) += p, coefficient by coefficient */
+  size_t k;
+  for(k = 0; k < N; k++)
+    polyw_add_idx(buf, p->coeffs[k], k);
+}
+void polyw_sub(poly* c, uint8_t buf[3*256], poly *a){
+  /* c = unpack(buf) - a; no reduction is applied to the difference */
+  const uint8_t *in = buf;
+
+  for(size_t k = 0; k < N; k++, in += 3){
+    int32_t v = in[0];
+    v |= (int32_t)in[1] << 8;
+    v |= (int32_t)in[2] << 16;
+
+    c->coeffs[k] = v - a->coeffs[k];
+  }
+}
+
+static int32_t highbits(int32_t a){ /* returns the high part a1 of a; the rounding constants are selected by GAMMA2 */
+  int32_t a1;
+
+  a1  = (a + 127) >> 7;                     /* ceil(a / 128) for non-negative a */
+#if GAMMA2 == (Q-1)/32
+  a1  = (a1*1025 + (1 << 21)) >> 22;        /* rounded multiplication by 1025/2^22 (presumably approximates division by 2*GAMMA2/128 -- TODO confirm against params.h) */
+  a1 &= 15;                                 /* keep 4 bits: a result of 16 wraps to 0 */
+#elif GAMMA2 == (Q-1)/88
+  a1  = (a1*11275 + (1 << 23)) >> 24;       /* rounded multiplication by 11275/2^24 */
+  a1 ^= ((43 - a1) >> 31) & a1;             /* branch-free: a1 becomes 0 when a1 > 43 (relies on an arithmetic right shift) */
+#endif
+
+  return a1;
+}
+
+void poly_highbits(poly *a1, const poly *a) {
+  /* a1[k] = highbits(a[k]) for every coefficient index k */
+  unsigned int k = 0;
+  for(; k < N; k++)
+    a1->coeffs[k] = highbits(a->coeffs[k]);
+}
+
+static int32_t lowbits(int32_t a){
+  /*
+   * Low part a0 of a: a0 = a - a1*2*GAMMA2, where a1 = highbits(a).
+   *
+   * a1 is obtained from highbits() so that the GAMMA2-specific rounding
+   * exists in one place only and the two helpers cannot drift apart.
+   */
+  int32_t a1;
+  int32_t a0;
+
+  a1  = highbits(a);
+
+  a0  = a - a1*2*GAMMA2;
+  /* branch-free: the shift gives an all-ones mask iff a0 > (Q-1)/2, and then Q is subtracted */
+  a0 -= (((Q-1)/2 - a0) >> 31) & Q;
+  return a0;
+}
+
+void poly_lowbits(poly *a0, const poly *a){
+  /* a0[k] = lowbits(a[k]) for every coefficient index k */
+  unsigned int k = 0;
+  for(; k < N; k++)
+    a0->coeffs[k] = lowbits(a->coeffs[k]);
+}
+
+void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx) {
+  small_polyeta_unpack(a, &sk[2*SEEDBYTES + TRBYTES + idx*POLYETA_PACKEDBYTES]); /* entry idx of the s1 region, which starts 2*SEEDBYTES + TRBYTES bytes into sk */
+}
+void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx) {
+  small_polyeta_unpack(a, &sk[2*SEEDBYTES + TRBYTES + (L + idx)*POLYETA_PACKEDBYTES]); /* entry idx of the s2 region, which follows the L packed entries of s1 */
+}
+
+
+// TODO: in the end increase this buffer size as far as possible
+#define POLY_UNIFORM_BUFFERSIZE 3
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t seed[SEEDBYTES], uint16_t nonce, shake128incctx *state){
+  /* Streams a uniform polynomial from (seed, nonce) by rejection sampling
+   * 23-bit candidates. Accepted candidate number k is multiplied with b[k]
+   * (montgomery_reduce) and added at once to packed coefficient k of wcomp,
+   * so the sampled polynomial itself is never stored. */
+  uint8_t chunk[POLY_UNIFORM_BUFFERSIZE*3];
+  size_t accepted = 0;
+
+  stream128_init(state, seed, nonce);
+  do {
+    shake128_inc_squeeze(chunk, sizeof chunk, state);
+
+    for(size_t off = 0; off < sizeof chunk && accepted < N; off += 3){
+      int32_t cand = chunk[off];
+      cand |= (uint32_t)chunk[off+1] << 8;
+      cand |= (uint32_t)chunk[off+2] << 16;
+      cand &= 0x7FFFFF;
+      if(cand >= Q)
+        continue;   /* rejected: not below Q */
+
+      cand = montgomery_reduce((int64_t)cand * b->coeffs[accepted]);
+      polyw_add_idx(wcomp, cand, accepted);
+      accepted++;
+    }
+  } while(accepted < N);
+}
+
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE 1 /* number of packed groups handled per squeeze */
+#if GAMMA1 == (1 << 17)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE*4) /* 4 coefficients per group ... */
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES  (POLY_UNIFORM_GAMMA1_BUFFERSIZE*9) /* ... packed into 9 bytes (18 bits each) */
+#elif GAMMA1 == (1 << 19)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE*2) /* 2 coefficients per group ... */
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES  (POLY_UNIFORM_GAMMA1_BUFFERSIZE*5) /* ... packed into 5 bytes (20 bits each) */
+#endif
+
+static void polyz_unpack_inplace(int32_t *r){ /* expands the packed bytes at the start of r into int32 coefficients, inside the same buffer */
+  uint8_t *a = (uint8_t *)r; /* byte view of the same storage: input and output overlap */
+
+  unsigned int i,j;
+  #if GAMMA1 == (1 << 17)
+  for(j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j) {
+    i = POLY_UNIFORM_GAMMA1_BUFFERSIZE-1-j; /* groups are processed from the last one down to the first */
+    int32_t t0;
+    /* Statement order matters: r[4*i+k] occupies bytes 16*i+4*k .. 16*i+4*k+3, the group input bytes 9*i .. 9*i+8. */
+    /* Outputs are therefore written from the highest index down, and each input byte is read before the store that overwrites it. */
+    r[4*i+3]  = a[9*i+6] >> 6;
+    r[4*i+3] |= (uint32_t)a[9*i+7] << 2;
+    r[4*i+3] |= (uint32_t)a[9*i+8] << 10;
+    r[4*i+3] &= 0x3FFFF;
+
+    r[4*i+2]   = a[9*i+4] >> 4;
+    r[4*i+2]  |= (uint32_t)a[9*i+5] << 4;
+    r[4*i+2]  |= (uint32_t)a[9*i+6] << 12;
+    r[4*i+2]  &= 0x3FFFF;
+
+    /* for i == 0 the store to r[1] covers byte a[4], so a[4] is the operand of the first assignment */
+    r[4*i+1] = (uint32_t)a[9*i+4] << 14;
+    r[4*i+1] |= a[9*i+2] >> 2;
+    r[4*i+1] |= (uint32_t)a[9*i+3] << 6;
+    r[4*i+1] &= 0x3FFFF;
+    /* for i == 0, r[0] covers a[0..3]: gather the field in a temporary before storing */
+    t0  = a[9*i+0];
+    t0 |= (uint32_t)a[9*i+1] << 8;
+    t0 |= (uint32_t)a[9*i+2] << 16;
+    t0 &= 0x3FFFF;
+    /* map every 18-bit field f to the coefficient GAMMA1 - f */
+    r[4*i+0] = GAMMA1 - t0;
+    r[4*i+1] = GAMMA1 - r[4*i+1];
+    r[4*i+2] = GAMMA1 - r[4*i+2];
+    r[4*i+3] = GAMMA1 - r[4*i+3];
+
+  }
+#elif GAMMA1 == (1 << 19)
+  for(j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j) {
+    i = POLY_UNIFORM_GAMMA1_BUFFERSIZE-1-j; /* groups are processed from the last one down to the first */
+    int32_t tmp0, tmp1;
+    /* both 20-bit fields are gathered into temporaries before anything is stored, because r[2*i], r[2*i+1] can overlap the input bytes */
+    tmp0  = a[5*i+2] >> 4;
+    tmp0 |= (uint32_t)a[5*i+3] << 4;
+    tmp0 |= (uint32_t)a[5*i+4] << 12;
+    tmp0 &= 0xFFFFF;
+
+    tmp1  = a[5*i+0];
+    tmp1 |= (uint32_t)a[5*i+1] << 8;
+    tmp1 |= (uint32_t)a[5*i+2] << 16;
+    tmp1 &= 0xFFFFF;
+    /* map every 20-bit field f to the coefficient GAMMA1 - f */
+    r[2*i+0] = GAMMA1 - tmp1;
+    r[2*i+1] = GAMMA1 - tmp0;
+  }
+#endif
+}
+
+void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){
+  /* Sample a from the stream (seed, nonce) one small chunk at a time: squeeze
+   * the packed bytes into the chunk buffer, expand them in place, copy out. */
+  int32_t chunk[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
+
+  stream256_init(state, seed, nonce);
+  for(size_t blk = 0; blk < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; blk++){
+    shake256_inc_squeeze((uint8_t *)chunk, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);
+    polyz_unpack_inplace(chunk);
+    for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++)
+      a->coeffs[blk*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j] = chunk[j];
+  }
+}
+
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state){
+  /* Same chunked sampling as poly_uniform_gamma1_stack, but every sampled
+   * coefficient is added to the matching coefficient of b: a = sample + b. */
+  int32_t chunk[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
+
+  stream256_init(state, seed, nonce);
+  for(size_t blk = 0; blk < N/POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; blk++){
+    shake256_inc_squeeze((uint8_t *)chunk, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);
+    polyz_unpack_inplace(chunk);
+    for(size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++)
+      a->coeffs[blk*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j] = chunk[j] + b->coeffs[blk*POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS + j];
+  }
+}
+
+
+static inline int32_t make_hint_stack(int32_t z, int32_t r){
+  /*
+   * Hint bit for one coefficient: 1 when adding z to r changes the high
+   * bits, 0 when highbits(r) and highbits(r+z) agree.
+   */
+  const int32_t before = highbits(r);
+  const int32_t after  = highbits(r+z);
+  return (before != after) ? 1 : 0;
+}
+
+size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]){
+  /*
+   * For every coefficient k: r = unpack(w)[k] + t[k], and the hint is
+   * a[k] = make_hint_stack(-t[k], r). Returns how many hints equal 1.
+   * Per the original comments, w holds w - cs2 and t holds c*t0.
+   */
+  size_t ones = 0;
+  const uint8_t *in = w;
+
+  for(size_t k = 0; k < N; k++, in += 3){
+    /* 3-byte little-endian packed coefficient of w */
+    int32_t r = in[0] | ((int32_t)in[1] << 8) | ((int32_t)in[2] << 16);
+    int32_t hint;
+    r += t->coeffs[k];
+    hint = make_hint_stack(-t->coeffs[k], r);
+    a->coeffs[k] = hint;
+    ones += (hint == 1);
+  }
+  return ones;
+}
+
+void unpack_sk_stack(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  /* sk starts with rho | key | tr; only these three leading fields are copied out */
+  const uint8_t *rho_src = sk;
+  const uint8_t *key_src = rho_src + SEEDBYTES;
+  const uint8_t *tr_src  = key_src + SEEDBYTES;
+  unsigned int n;
+  for(n = 0; n < SEEDBYTES; ++n)
+    rho[n] = rho_src[n];
+
+  for(n = 0; n < SEEDBYTES; ++n)
+    key[n] = key_src[n];
+
+  for(n = 0; n < TRBYTES; ++n)
+    tr[n] = tr_src[n];
+}
+
+/*************************************************
+* Name:        unpack_sig_h_indices
+*
+* Description: Unpack only the hint positions of row idx of h from signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t h_i[]: output, positions of the hints of row idx
+*              - unsigned int *number_of_hints: output, number of entries written to h_i
+*              - unsigned int idx: row of h to decode; const unsigned char sig[]: byte array containing bit-packed signature
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]) {
+    sig += L * POLYZ_PACKEDBYTES;
+    sig += CTILDEBYTES; /* sig now points at the encoded h: OMEGA position bytes followed by one end offset per row */
+    /* Decode h */
+    unsigned int k = 0;
+    unsigned int hidx = 0;
+
+    if (idx > 0)
+    {
+        k = sig[OMEGA + (idx - 1)]; /* row idx starts where the previous row ended */
+    }
+
+    if (sig[OMEGA + idx] < k || sig[OMEGA + idx] > OMEGA) { /* end offset must not decrease and must not exceed OMEGA */
+        return 1; /* *number_of_hints has not been written on this path */
+    }
+
+    for (unsigned int j = k; j < sig[OMEGA + idx]; ++j) {
+        /* Coefficients are ordered for strong unforgeability */
+        if (j > k && sig[j] <= sig[j - 1]) {
+            return 1; /* *number_of_hints has not been written on this path */
+        }
+        h_i[hidx++] = sig[j];
+    }
+
+    *number_of_hints = hidx;
+
+    /* TODO: extract this check, redundant here */
+    k = sig[OMEGA + (K - 1)]; /* end offset of the last row */
+    /* Extra indices are zero for strong unforgeability */
+    for (unsigned int j = k; j < OMEGA; ++j) {
+        if (sig[j]) {
+            return 1; /* h_i and *number_of_hints have already been written on this path */
+        }
+    }
+    return 0;
+}
+
+/*************************************************
+* Name:        poly_use_hint_stack
+*
+* Description: Use a list of hint positions to correct the high bits of a
+*              polynomial. Positions that appear in the list are treated as
+*              hint 1, all other positions as hint 0.
+*
+* Arguments:   - poly *b: pointer to output polynomial with corrected high bits
+*              - const poly *a: pointer to input polynomial
+*              - uint8_t h_i[]: positions (coefficient indices) whose hint is 1
+*              - unsigned int number_of_hints: number of valid entries in h_i
+**************************************************/
+void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints) {
+  unsigned int pos;
+  size_t scan;
+
+  for(pos = 0; pos < N; ++pos)
+  {
+    /* linear search: is pos one of the listed hint positions? */
+    unsigned int hinted = 0;
+
+    for (scan = 0; scan < number_of_hints; scan++)
+    {
+      if (pos != h_i[scan])
+      {
+        continue;
+      }
+      hinted = 1;
+      break;
+    }
+
+    /* hinted is exactly 0 or 1, the two hint values passed to use_hint() */
+    b->coeffs[pos] = use_hint(a->coeffs[pos], hinted);
+  }
+}
+
+/*************************************************
+* Name:        pack_pk_rho
+*
+* Description: Write only rho into the public key pk = (rho, t1).
+*
+* Arguments:   - unsigned char pk[]: output byte array
+*              - const unsigned char rho[]: byte array containing rho (SEEDBYTES bytes)
+**************************************************/
+void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]) {
+    unsigned int n; /* rho occupies the first SEEDBYTES bytes of pk */
+    for (n = 0; n != SEEDBYTES; n++)
+        pk[n] = rho[n];
+}
+
+/*************************************************
+* Name:        pack_pk_t1
+*
+* Description: Bit-pack only the t1 element at idx into public key pk = (rho, t1).
+*
+* Arguments:   - unsigned char pk[]: output byte array
+*              - const poly *t1: pointer to the element of t1 to pack
+*              - const unsigned int idx: index of that element within t1
+**************************************************/
+void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+             const poly *t1,
+             const unsigned int idx) {
+    /* packed t1 elements are stored back to back right after the SEEDBYTES of rho */
+    polyt1_pack(&pk[SEEDBYTES + idx * POLYT1_PACKEDBYTES], t1);
+}
+
+/*************************************************
+* Name:        pack_sk_s1
+*
+* Description: Bit-pack only one element of s1 into secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *s1_elem: pointer to vector element idx in s1
+*              - const unsigned int idx: index of the element of s1 that should be packed
+**************************************************/
+void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s1_elem,
+                const unsigned int idx) {
+    /* the s1 region starts after rho, key (SEEDBYTES each) and tr (TRBYTES) */
+    polyeta_pack(&sk[2 * SEEDBYTES + TRBYTES + idx * POLYETA_PACKEDBYTES], s1_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_s2
+*
+* Description: Bit-pack only one element of s2 into secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *s2_elem: pointer to vector element idx in s2
+*              - const unsigned int idx: index of the element of s2 that should be packed
+**************************************************/
+void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s2_elem,
+                const unsigned int idx) {
+    /* the s2 region follows the L packed elements of s1 */
+    polyeta_pack(&sk[2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES + idx * POLYETA_PACKEDBYTES], s2_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_t0
+*
+* Description: Bit-pack only one element of t0 into secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const poly *t0_elem: pointer to vector element idx in t0
+*              - const unsigned int idx: index of the element of t0 that should be packed
+**************************************************/
+void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *t0_elem,
+                const unsigned int idx) {
+    /* the t0 region follows the L packed elements of s1 and the K packed elements of s2 */
+    polyt0_pack(&sk[2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES + K * POLYETA_PACKEDBYTES + idx * POLYT0_PACKEDBYTES], t0_elem);
+}
+
+/*************************************************
+* Name:        pack_sk_rho
+*
+* Description: Write only rho into secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const unsigned char rho[]: byte array containing rho (SEEDBYTES bytes)
+**************************************************/
+void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]) {
+  unsigned int n; /* rho occupies the first SEEDBYTES bytes of sk */
+  for (n = 0; n != SEEDBYTES; n++)
+    sk[n] = rho[n];
+}
+
+/*************************************************
+* Name:        pack_sk_key
+*
+* Description: Write only key into secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const unsigned char key[]: byte array containing key (SEEDBYTES bytes)
+**************************************************/
+void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char key[SEEDBYTES]) {
+    unsigned char *dst = sk + SEEDBYTES; /* key follows the SEEDBYTES of rho */
+    unsigned int n;
+    for (n = 0; n != SEEDBYTES; n++)
+      dst[n] = key[n];
+}
+
+/*************************************************
+* Name:        pack_sk_tr
+*
+* Description: Write only tr into secret key sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - unsigned char sk[]: output byte array
+*              - const unsigned char tr[]: byte array containing tr (TRBYTES bytes)
+**************************************************/
+void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const unsigned char tr[TRBYTES]) {
+    unsigned char *dst = sk + 2*SEEDBYTES; /* tr follows rho and key */
+    unsigned int n;
+    for (n = 0; n != TRBYTES; n++)
+        dst[n] = tr[n];
+}
+
+/*************************************************
+* Name:        poly_challenge_stack
+*
+* Description: Implementation of H. Samples polynomial with TAU nonzero
+*              coefficients in {-1,1} using the output stream of
+*              SHAKE256(seed). Stack optimized: squeezes 8 bytes at a time.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const uint8_t seed[]: byte array containing seed of length SEEDBYTES
+**************************************************/
+#define CHALLENGE_STACK_BUF_SIZE 8
+void poly_challenge_stack(poly *c, const uint8_t seed[SEEDBYTES]) {
+  unsigned int i, b, pos;
+  uint64_t signs;
+  uint8_t buf[CHALLENGE_STACK_BUF_SIZE];
+  shake256incctx state;
+
+  shake256_inc_init(&state);
+  shake256_inc_absorb(&state, seed, SEEDBYTES);
+  shake256_inc_finalize(&state);
+  shake256_inc_squeeze(buf, CHALLENGE_STACK_BUF_SIZE, &state); /* the first 8 output bytes are the sign bits */
+  signs = 0;
+  for(i = 0; i < 8; ++i)
+  {
+    signs |= (uint64_t)buf[i] << 8*i; /* little-endian */
+  }
+  pos = 8; /* the buffer is fully consumed: forces a fresh squeeze before the first position byte */
+
+  for(i = 0; i < N; ++i)
+    c->coeffs[i] = 0;
+  for(i = N-TAU; i < N; ++i) {
+    do { /* rejection sampling: draw bytes until one is <= i */
+      if(pos >= CHALLENGE_STACK_BUF_SIZE) {
+        shake256_inc_squeeze(buf, CHALLENGE_STACK_BUF_SIZE, &state);
+        pos = 0;
+      }
+
+      b = buf[pos++];
+    } while(b > i);
+
+    c->coeffs[i] = c->coeffs[b]; /* move whatever is at position b to position i ... */
+    c->coeffs[b] = 1 - 2*(signs & 1); /* ... and put the new +1 / -1 at b (sign bit 1 gives -1) */
+    signs >>= 1;
+  }
+}
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/stack.h b/crypto_sign/dilithium2/m4fstack/stack.h
new file mode 100644
index 0000000..06c8c57
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/stack.h
@@ -0,0 +1,69 @@
+#ifndef STACK_H
+#define STACK_H
+/* Declarations of the stack-optimized helpers of the dilithium2/m4fstack implementation. */
+#include "poly.h"
+#include "smallpoly.h"
+#include <stdint.h>
+#include <stddef.h>
+#include "fips202.h"
+/* Challenge polynomial <-> 68-byte buffer. */
+void poly_challenge_compress(uint8_t c[68], const poly *cp);
+void poly_challenge_decompress(poly *cp, const uint8_t c[68]);
+
+/* ccomp[68] has the size of the compressed challenge above (presumably the same buffer - confirm); t0/t1 are raw byte pointers. */
+void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0);
+void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1);
+void polyw_pack(uint8_t buf[3*256], poly *w);
+void polyw_unpack(poly *w, const uint8_t buf[3*256]);
+
+void polyw_add(uint8_t buf[3*256], poly *p);
+void polyw_sub(poly* c, uint8_t buf[3*256], poly *a);
+
+void poly_highbits(poly *a1, const poly *a);
+void poly_lowbits(poly *a0, const poly *a);
+
+void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx);
+void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
+/* The three poly_uniform_* functions take a caller-provided incremental SHAKE context as their last argument. */
+void poly_uniform_pointwise_montgomery_polywadd_stack(uint8_t wcomp[3*N], poly *b, const uint8_t  seed[SEEDBYTES], uint16_t nonce, shake128incctx *state);
+void poly_uniform_gamma1_stack(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
+void poly_uniform_gamma1_add_stack(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, shake256incctx *state);
+void poly_challenge_stack(poly *c, const uint8_t seed[SEEDBYTES]);
+
+size_t poly_make_hint_stack(poly *a, poly *t, uint8_t w[768]);
+int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);
+void poly_use_hint_stack(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints);
+/* NOTE(review): argument order here is (rho, tr, key) while stack.c documents the sk layout as (rho, key, tr, s1, s2, t0). */
+void unpack_sk_stack(uint8_t rho[SEEDBYTES],
+               uint8_t tr[TRBYTES],
+               uint8_t key[SEEDBYTES],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES]);
+/* The pack_* functions below each write one component into pk or sk; idx presumably selects the vector element - confirm in stack.c. */
+void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]);
+
+void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+             const poly *t1,
+             const unsigned int idx);
+
+void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s1_elem,
+                const unsigned int idx);
+
+void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s2_elem,
+                const unsigned int idx);
+
+void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *t0_elem,
+                const unsigned int idx);
+
+void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]);
+
+void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char key[SEEDBYTES]);
+/* pack_sk_tr copies TRBYTES bytes of tr to sk + 2*SEEDBYTES (see stack.c). */
+void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const unsigned char tr[TRBYTES]);
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/symmetric-shake.c b/crypto_sign/dilithium2/m4fstack/symmetric-shake.c
new file mode 120000
index 0000000..b95855b
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/symmetric-shake.c
@@ -0,0 +1 @@
+../m4f/symmetric-shake.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/symmetric.h b/crypto_sign/dilithium2/m4fstack/symmetric.h
new file mode 120000
index 0000000..e89ae95
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/symmetric.h
@@ -0,0 +1 @@
+../m4f/symmetric.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/vector.h b/crypto_sign/dilithium2/m4fstack/vector.h
new file mode 120000
index 0000000..0793594
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/vector.h
@@ -0,0 +1 @@
+../m4f/vector.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium2/m4fstack/vector.s b/crypto_sign/dilithium2/m4fstack/vector.s
new file mode 120000
index 0000000..1a49605
--- /dev/null
+++ b/crypto_sign/dilithium2/m4fstack/vector.s
@@ -0,0 +1 @@
+../m4f/vector.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/api.h b/crypto_sign/dilithium3/m4f/api.h
new file mode 120000
index 0000000..9d1668d
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/api.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/api.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/config.h b/crypto_sign/dilithium3/m4f/config.h
new file mode 100644
index 0000000..5572407
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/config.h
@@ -0,0 +1,7 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+/* Build-time configuration of the dilithium3/m4f implementation. */
+#define DILITHIUM_MODE 3 /* presumably selects the parameter set in params.h - confirm */
+// #define SIGN_STACKSTRATEGY 2
+/* SIGN_STACKSTRATEGY is intentionally left undefined here (the line above is commented out). */
+#endif
diff --git a/crypto_sign/dilithium3/m4f/macros.i b/crypto_sign/dilithium3/m4f/macros.i
new file mode 120000
index 0000000..e3f2469
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/macros.i
@@ -0,0 +1 @@
+../../dilithium2/m4f/macros.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/macros_smallntt.i b/crypto_sign/dilithium3/m4f/macros_smallntt.i
new file mode 100644
index 0000000..61b6324
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/macros_smallntt.i
@@ -0,0 +1,98 @@
+/**
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MACROS_SMALLNTT_I
+#define MACROS_SMALLNTT_I
+
+// general macros
+.macro load a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 // load four 32-bit words: a0..a3 = [a + mem0..mem3]
+  ldr.w \a0, [\a, \mem0]
+  ldr.w \a1, [\a, \mem1]
+  ldr.w \a2, [\a, \mem2]
+  ldr.w \a3, [\a, \mem3]
+.endm
+
+.macro store a, a0, a1, a2, a3, mem0, mem1, mem2, mem3 // store four 32-bit words: [a + mem0..mem3] = a0..a3
+  str.w \a0, [\a, \mem0]
+  str.w \a1, [\a, \mem1]
+  str.w \a2, [\a, \mem2]
+  str.w \a3, [\a, \mem3]
+.endm
+
+.macro doubleplant a, tmp, q, qa, plantconst // multiply both 16-bit halves of a by plantconst and reduce (presumably Plantard reduction, per the name); result packed in a, clobbers tmp
+  smulwb \tmp, \plantconst, \a // tmp = (plantconst * bottom halfword of a) >> 16
+  smulwt \a, \plantconst, \a // a = (plantconst * top halfword of a) >> 16
+  smlabt \tmp, \tmp, \q, \qa // tmp = bottom(tmp) * top(q) + qa
+  smlabt \a, \a, \q, \qa // a = bottom(a) * top(q) + qa
+  pkhtb \a, \a, \tmp, asr#16 // repack: top half stays from a, bottom half = top half of tmp
+.endm
+
+.macro doublebarrett a, tmp, tmp2, q, barrettconst // subtract a multiple of q from each 16-bit half of a (Barrett style, per the name); clobbers tmp, tmp2
+  smulbb \tmp, \a, \barrettconst // tmp = bottom(a) * bottom(barrettconst)
+  smultb \tmp2, \a, \barrettconst // tmp2 = top(a) * bottom(barrettconst)
+  asr \tmp, \tmp, #26 // arithmetic shift: keep bits 26 and up of each product
+  asr \tmp2, \tmp2, #26
+  smulbb \tmp, \tmp, \q // bottom(tmp) * bottom(q): q is read from the BOTTOM half here, unlike the plant macros
+  smulbb \tmp2, \tmp2, \q
+  pkhbt \tmp, \tmp, \tmp2, lsl#16 // pack: bottom half from tmp, top half from bottom half of tmp2
+  usub16 \a, \a, \tmp // a -= packed products, per halfword
+.endm
+
+// q is located in the top half of the register; tmp is both the input and the output
+.macro plant_red q, qa, qinv, tmp
+  mul \tmp, \tmp, \qinv     // tmp = low 32 bits of tmp * qinv
+  //tmp*qinv mod 2^2n/ 2^n; in high half
+  smlatt \tmp, \tmp, \q, \qa // tmp = top(tmp) * top(q) + qa
+  // result in high half
+.endm
+
+.macro mul_twiddle_plant a, twiddle, tmp, q, qa // multiply both 16-bit halves of a by twiddle and reduce; same instruction sequence as doubleplant; clobbers tmp
+	smulwb \tmp, \twiddle, \a // tmp = (twiddle * bottom halfword of a) >> 16
+	smulwt \a,   \twiddle, \a // a = (twiddle * top halfword of a) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = bottom(tmp) * top(q) + qa
+	smlabt \a, \a, \q, \qa // a = bottom(a) * top(q) + qa
+	pkhtb \a, \a, \tmp, asr#16 // repack: top half stays from a, bottom half = top half of tmp
+.endm
+
+.macro doublebutterfly_plant a0, a1, twiddle, tmp, q, qa // butterfly on two packed 16-bit lanes: t = reduce(twiddle * a1); a1 = a0 - t; a0 = a0 + t; clobbers tmp
+	smulwb \tmp, \twiddle, \a1 // tmp = (twiddle * bottom halfword of a1) >> 16
+	smulwt \a1, \twiddle, \a1 // a1 = (twiddle * top halfword of a1) >> 16
+	smlabt \tmp, \tmp, \q, \qa // tmp = bottom(tmp) * top(q) + qa
+	smlabt \a1, \a1, \q, \qa // a1 = bottom(a1) * top(q) + qa
+	pkhtb \tmp, \a1, \tmp, asr#16 // tmp = packed products: top half from a1, bottom half = top half of tmp
+	usub16 \a1, \a0, \tmp // per-halfword difference
+	uadd16 \a0, \a0, \tmp // per-halfword sum
+.endm
+
+.macro two_doublebutterfly_plant a0, a1, a2, a3, twiddle0, twiddle1, tmp, q, qa // two butterflies sharing tmp: (a0, a1) with twiddle0, then (a2, a3) with twiddle1
+	doublebutterfly_plant \a0, \a1, \twiddle0, \tmp, \q, \qa
+	doublebutterfly_plant \a2, \a3, \twiddle1, \tmp, \q, \qa
+.endm
+
+//For 3329. NOTE(review): the small NTT in this directory uses q = 769 - confirm whether this macro is used here at all.
+.macro fullplant a0, a1, a2, a3, a4, a5, a6, a7, tmp, q, qa, plantconst
+	movw \plantconst, #44984 // plantconst = (19 << 16) | 44984 = 1290168, built from two 16-bit immediates
+	movt \plantconst, #19
+	doubleplant \a0, \tmp, \q, \qa, \plantconst // apply doubleplant to each of the eight packed registers in turn
+	doubleplant \a1, \tmp, \q, \qa, \plantconst
+	doubleplant \a2, \tmp, \q, \qa, \plantconst
+	doubleplant \a3, \tmp, \q, \qa, \plantconst
+	doubleplant \a4, \tmp, \q, \qa, \plantconst
+	doubleplant \a5, \tmp, \q, \qa, \plantconst
+	doubleplant \a6, \tmp, \q, \qa, \plantconst
+	doubleplant \a7, \tmp, \q, \qa, \plantconst
+.endm
+
+#endif
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/ntt.S b/crypto_sign/dilithium3/m4f/ntt.S
new file mode 120000
index 0000000..6fbceff
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/ntt.S
@@ -0,0 +1 @@
+../../dilithium2/m4f/ntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/ntt.h b/crypto_sign/dilithium3/m4f/ntt.h
new file mode 120000
index 0000000..43729fe
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/ntt.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/ntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/packing.c b/crypto_sign/dilithium3/m4f/packing.c
new file mode 120000
index 0000000..b41782c
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/packing.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/packing.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/packing.h b/crypto_sign/dilithium3/m4f/packing.h
new file mode 120000
index 0000000..ba1a6b3
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/packing.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/packing.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/params.h b/crypto_sign/dilithium3/m4f/params.h
new file mode 120000
index 0000000..a6a4d8b
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/params.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/params.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/pointwise_mont.h b/crypto_sign/dilithium3/m4f/pointwise_mont.h
new file mode 120000
index 0000000..0a6f8b9
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/pointwise_mont.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/pointwise_mont.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/pointwise_mont.s b/crypto_sign/dilithium3/m4f/pointwise_mont.s
new file mode 120000
index 0000000..c4ddb96
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/pointwise_mont.s
@@ -0,0 +1 @@
+../../dilithium2/m4f/pointwise_mont.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/poly.c b/crypto_sign/dilithium3/m4f/poly.c
new file mode 120000
index 0000000..2544e75
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/poly.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/poly.h b/crypto_sign/dilithium3/m4f/poly.h
new file mode 120000
index 0000000..7ef70e5
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/poly.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/polyvec.c b/crypto_sign/dilithium3/m4f/polyvec.c
new file mode 120000
index 0000000..a8edd0d
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/polyvec.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/polyvec.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/polyvec.h b/crypto_sign/dilithium3/m4f/polyvec.h
new file mode 120000
index 0000000..cabd6a9
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/polyvec.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/polyvec.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/reduce.h b/crypto_sign/dilithium3/m4f/reduce.h
new file mode 120000
index 0000000..6c13df5
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/reduce.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/reduce.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/rounding.c b/crypto_sign/dilithium3/m4f/rounding.c
new file mode 120000
index 0000000..80b8dce
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/rounding.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/rounding.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/rounding.h b/crypto_sign/dilithium3/m4f/rounding.h
new file mode 120000
index 0000000..74c40c5
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/rounding.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/rounding.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/sign.c b/crypto_sign/dilithium3/m4f/sign.c
new file mode 120000
index 0000000..b7ccdf0
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/sign.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/sign.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/sign.h b/crypto_sign/dilithium3/m4f/sign.h
new file mode 120000
index 0000000..b7f1e89
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/sign.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/sign.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/smallntt.h b/crypto_sign/dilithium3/m4f/smallntt.h
new file mode 100644
index 0000000..2927ff4
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/smallntt.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SMALLNTT_H
+#define SMALLNTT_H
+
+#include <stdint.h>
+#include "params.h"
+
+#define SMALL_Q 769 
+
+static const int32_t zetas_769[64] = {
+	3138844760, 1334846793, 999738812, 1854264165, 1681125041, 1150537404, 2820492178, 3071823164, 726067294, 2066499220, 3272887953, 1055590142, 4255871365, 1871019564, 2731130050, 1826338500, 513832239, 1792827701, 3373420347, 2993631302, 1161707670, 3306398751, 3518633806, 3406931146, 1586177780, 3853741788, 3317569017, 3825816122, 971813147, 122872927, 217820188, 619949766, 3753209393, 770748358, 4099487641, 765163225, 3630336467, 1742561504, 3479537875, 982983413, 2809321912, 2379266669, 703726762, 681386230, 4110657907, 1457719720, 1217559000, 2474213930, 1195218468, 1089100940, 564098436, 614364633, 3635921600, 2088839752, 3702943196, 1949211426, 2569161192, 374203913, 3982199847, 2083254619, 1513571050, 3647091866, 413299844, 4149753838};
+
+static const int32_t zetas_asm_769[128] = {
+	346278248, 223405321, 966228013, 759578091, -150798592, 318352582, -1736976371, 1697880440, -2105595150, -804259156, 1675539907, -1016494210, 1401868389, -2005062756, 240160720, 474736307, -1200803600, -1435379187, -1156122536, 1334846793, 999738811, 1854264164, -631120032, -787503756, -1580592646, 1681125040, 1150537403, -1474475119, -1223144132, 1809583100, -100532394, -1938041160, 726067293, 2066499219, -1022079344, 1055590142, 525002504, 273671518, -212235055, -39095931, 1871019563, -1563837247, 1826338499, 139628326, 27925665, 1731391238, 513832238, 1792827701, -921546949, -1301335995, 67021596, 1117026605, 536172770, 1161707669, -988568545, -776333490, -888036151, 1290165729, -497076839, -753992958, 1586177779, -441225509, -977398279, -469151174, -1614103444, 1591762912, -94947261, 971813146, 122872927, 217820188, 619949766, -1709050706, 1010909077, -1748146637, -541757903, 770748357, -195479656, 765163224, 1413038655, 1781657435, -1206388733, -664630830, 1742561504, -815429422, 982983412, 357448514, 44681064, -1524741316, -1485645385, -1915700627, 703726761, 681386229, 686971362, 1787242568, -860110486, -184309390, 1457719719, 1217558999, -1820753366, -502661972, -1921285760, 1139367137, 1195218467, 1089100940, 564098435, 614364633, -1100271206, 457980908, -1669954774, -659045697, 2088839751, -592024101, 1949211426, 1368357591, 698141628, 335107981, -1725806105, 374203913, -312767449, 2083254618, -1061175275, -2139105948, 519417371, 1513571050, -647875431, 413299844, -145213459, 0};
+
+// INTT with CT butterfly
+static const int32_t zetas_inv_asm_769[256] = {
+	5585134, 5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 5585134, 1736976371, -966228013, 150798592, -346278248, -318352582, -223405321, -759578091,
+	// removed first "2285" + LAYER 3+2+1 - 1 - butterfly
+	5585134, -346278248, 5585134, -966228013, -346278248, -223405321, 636705165, 446810642, 1519156183, 11170266, -821014555, -1932456027, 301597183, -692556495, -240160720, 1061175275, -1368357591, -519417371, -335107981, 2139105948, -698141628, -625534899, -1267825197, 843355087, 290426917, 128458060, 1295750862, -748407825, -826599688, 1736976371, -240160720, 2005062756, 1061175275, 1100271206, -1368357591, 502661972, 915961816, 1396283256, 452395775, -1038834743, -955057747, -670215963, 2016233022, -16755399, -1675539907, 1614103444, -1290165729, 94947261, 753992958, -1591762912, 497076839, -1954796559, 1943626293, -1122611738, -1239899531, 938302348, -245745853, 882451018, -435640376, -966228013, 1736976371, -318352582, -240160720, -1401868389, 2005062756, 1016494210, 714897027, -1005323944, 876865885, 2122350549, -1373942724, -2094424884, 1468889985, 1558252114, -1401868389, -686971362, -357448514, 860110486, 1524741316, -1787242568, -44681064, 1407453522, -368618780, 1323676527, -653460564, -1362772458, 1379527857, -463566041, 1859849297, 150798592, -1675539907, 804259156, 1614103444, -67021596, -1290165729, -139628326, -2060914086, -994153678, 55851330, 189894523, -1072345541, 1507985917, 832184821, 1111441472, 2105595150, -525002504, -1809583100, 212235055, 1938041160, -273671518, 100532394, -2044158687, -78191862, 1452134586, 642290298, -2111180283, 552928169, 161968858, -1167292802, -346278248, -966228013, -223405321, 1736976371, 150798592, -318352582, -759578091, -1608518311, -2032988421, -899206417, -480321440, 943887481, 1491230518, -83776995, -284841784, 2005062756, 1100271206, 502661972, 1669954774, -1139367137, -457980908, 1921285760, 1128196871, -1318091394, -1904530361, 396544445, -1228729265, 117287794, 2116765416, 1184048201, -318352582, -1401868389, 1016494210, -686971362, -1413038655, -357448514, 1709050706, -731652426, 89362128, 2021818155, 1720220972, -1882189829, -1245484665, -798674023, 720482160, 804259156, -67021596, -139628326, 
-536172770, -1731391238, -1117026605, -27925665, -1843093898, -1971551958, 1027664477, 1776072302, -1692295306, 1977137091, 709311894, 1552666981, -223405321, 150798592, -759578091, -1675539907, 2105595150, 804259156, -1697880440, -675801096, 279256651, 949472614, -1066760408, -1050005009, -134043193, 1262240064, 1714635839, 1016494210, -1413038655, 1709050706, 1206388733, 1748146637, -1781657435, -1010909077, -390959312, -1329261660, -1083515807, -1965966825, -1530326449, 809844289, -1541496715, 1630858843, -759578091, 2105595150, -1697880440, -525002504, 631120032, -1809583100, -474736307, -1575007513, -201064789, 1893360095, 424470110, -1133782004, -418884977, -1424208921, -547343036, -1697880440, 631120032, -474736307, 1580592646, 1435379187, 787503756, 1200803600, 1999477623, -932717215, 1982722224, -1848679031, 586438968, 1993892490, 1625273710, -1346017059, 0};
+
+// Assembly routines for the small modulus Q1 = 769; every array argument holds N int16_t coefficients.
+void small_ntt_asm_769(int16_t a[N], const int32_t *zetas);
+void small_invntt_asm_769(int16_t a[N], const int32_t *zetas);
+void small_pointmul_asm_769(int16_t out[N], const int16_t in[N], const int32_t *zetas);
+void small_asymmetric_mul_asm_769(int16_t c[N], const int16_t a[N], const int16_t b[N], const int16_t b_prime[N]);
+// The wrappers below expand to plain call expressions (no trailing ';'): the caller writes the statement terminator.
+// small NTT for computing cs0 and cs1; uses 769 as the default modulus and the twiddle tables of this header.
+#define small_ntt(a) small_ntt_asm_769(a, zetas_asm_769)
+#define small_invntt_tomont(a) small_invntt_asm_769(a, zetas_inv_asm_769)
+#define small_point_mul(out, in) small_pointmul_asm_769(out, in, zetas_769)
+#define small_asymmetric_mul(c, a, b, b_prime) small_asymmetric_mul_asm_769(c, a, b, b_prime)
+
+#endif
diff --git a/crypto_sign/dilithium3/m4f/smallntt_769.S b/crypto_sign/dilithium3/m4f/smallntt_769.S
new file mode 100644
index 0000000..97c60f0
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/smallntt_769.S
@@ -0,0 +1,681 @@
+/**
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "macros.i"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+#include "macros_smallntt.i"
+// #######
+// #######
+// # NTT #
+// #######
+// #######
+
+.macro _3_layer_double_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp // three butterfly layers on c0..c7; reads 7 twiddle words via twiddle_ptr (post-incremented by 28 bytes)
+	// layer 3: pairs (c0,c4) (c1,c5) (c2,c6) (c3,c7), one shared twiddle
+	ldr.w \twiddle1, [\twiddle_ptr], #4
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	// layer 2: pairs (c0,c2) (c1,c3) use twiddle1, pairs (c4,c6) (c5,c7) use twiddle2
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa
+
+	// layer 1: neighbouring pairs (c0,c1) (c2,c3) (c4,c5) (c6,c7), one twiddle per pair
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.macro _3_layer_double_CT_16_plant_fp c0, c1, c2, c3, c4, c5, c6, c7, xi0, xi1, xi2, xi3, xi4, xi5, xi6, twiddle1, twiddle2, q, qa, tmp // same three layers as _3_layer_double_CT_16_plant, with the 7 twiddles taken from FPU registers xi0..xi6
+	// layer 3: pairs (c0,c4) (c1,c5) (c2,c6) (c3,c7), twiddle xi0
+	vmov \twiddle1, \xi0
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	// layer 2: pairs (c0,c2) (c1,c3) use xi1, pairs (c4,c6) (c5,c7) use xi2
+	vmov \twiddle1, \xi1
+	vmov \twiddle2, \xi2
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle2, \twiddle2, \tmp, \q, \qa
+
+	// layer 1: neighbouring pairs, twiddles xi3..xi6 in order
+	vmov \twiddle1, \xi3
+	vmov \twiddle2, \xi4
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	vmov \twiddle1, \xi5
+	vmov \twiddle2, \xi6
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.global small_ntt_asm_769 // C prototype (smallntt.h): void small_ntt_asm_769(int16_t a[N], const int32_t *zetas)
+.type small_ntt_asm_769, %function
+.align 2
+small_ntt_asm_769: // in-place NTT: r0 = packed 16-bit coefficients, r1 = twiddle table
+	push {r4-r11, r14}
+	vpush.w {s16-s24} // s16-s24 are overwritten below, so they are saved here
+	poly         .req r0
+	twiddle_ptr  .req r1
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	###  qinv        .req r11 ### q^-1 mod 2^2n; n=16
+	q           .req r12 
+	### at the top of r12
+	qa          .req r0 // same register as poly: poly is parked in s23 while qa is live
+	### qa = 2^a * q with a = 5 (24608 = 32 * 769); kept in r0, not in the bottom of r12
+	tmp         .req r14
+
+	// movw qa, #24608
+	// movt is used so that q sits in the top half of r12, where smlabt reads it; qa is loaded into r0 inside the loops.
+	movt q, #769
+
+	### LAYER 7+6+5+4
+	.equ distance, 256
+	.equ offset, 32
+	.equ strincr, 4
+	// pre-load 15 twiddle factors to 15 FPU registers
+	// s0-s7 are used to temporarily store 16 16-bit coefficients (8 packed words).
+	vldm twiddle_ptr!, {s8-s22}
+	// loop end = poly + 8*strincr: the first pass runs 8 times, advancing poly by 4 bytes each time
+	add tmp, poly, #strincr*8
+	// s23: poly addr
+	// s24: tmp  
+	vmov s24, tmp  
+	1:
+		// load a1, a3, ..., a15
+		vmov s23, poly // park poly: r0 is about to be reused as qa
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+		
+		movw qa, #24608 // overwrites r0 (poly); restored from s23 below
+
+		// 8-NTT on a1, a3, ..., a15
+		_3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// s15, s16, s17, s18, s19, s20, s21, s22 left
+		// multiply coeffs by layer 8 twiddles for later use
+		vmov twiddle1, s15 
+		vmov twiddle2, s16 
+		mul_twiddle_plant poly0, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18 
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20 
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22 
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+
+		vmov s0, poly0 // a1
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+
+		vmov poly, s23 // restore the coefficient pointer for the next loads
+	
+		// load a0, a2, ..., a14
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608
+		// 8-NTT on a0, a2, ..., a14
+		_3_layer_double_CT_16_plant_fp poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s8, s9, s10, s11, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23
+		vmov twiddle1, s1 // load a3
+		uadd16 tmp, poly1, twiddle1
+		usub16 poly1, poly1, twiddle1
+		str.w tmp, [poly, #1*distance/4] // sum goes to the even slot, difference to the slot offset bytes later
+		str.w poly1, [poly, #1*distance/4+offset]
+
+		vmov twiddle1, s3 // load a7
+		uadd16 tmp, poly3, twiddle1
+		usub16 poly3, poly3, twiddle1
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle1, s5 // load a11
+		uadd16 tmp, poly5, twiddle1
+		usub16 poly5, poly5, twiddle1
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle1, s7 // load a15
+		uadd16 tmp, poly7, twiddle1
+		usub16 poly7, poly7, twiddle1
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle1, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle1, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle1, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle1, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle1, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle1, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle1, poly0, poly1
+		str.w twiddle1, [poly, #offset]
+		str.w tmp, [poly], #4 // last store also advances poly by 4 bytes (= strincr)
+
+	vmov tmp, s24
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr // rewind poly to the start of the array (strincr is still 4 here)
+
+	### LAYER 3+2+1
+
+	.equ distance, distance/16 // 256/16 = 16
+	.equ strincr, 32
+
+	add.w tmp, poly, #strincr*16 // loop end = poly + 512: 16 iterations of 32 bytes
+	vmov s13, tmp // s13 is free now: its twiddle was only needed in the first pass
+	2:
+		vmov s23, poly
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608
+		_3_layer_double_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+		
+		vmov poly, s23
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #strincr // last store also advances poly to the next 32-byte group
+
+	vmov tmp, s13
+	cmp.w poly, tmp
+	bne.w 2b
+	vpop.w {s16-s24}
+	pop {r4-r11, pc}
+
+.unreq poly
+.unreq twiddle_ptr
+.unreq poly0
+.unreq poly1
+.unreq poly2
+.unreq poly3
+.unreq poly4
+.unreq poly5
+.unreq poly6
+.unreq poly7
+.unreq twiddle1
+.unreq twiddle2
+.unreq q
+.unreq qa
+.unreq tmp
+
+
+// ########
+// ########
+// # INTT #
+// ########
+// ########
+
+// input: 0.5/1q
+.macro _3_layer_double_inv_CT_16_plant_light c0, c1, c2, c3, c4, c5, c6, c7, xi2, xi4, xi5, xi6, twiddle1, tmp2, q, qa, tmp
+	// light variant: only the twiddles xi2, xi4, xi5, xi6 are multiplied in; the remaining butterflies are plain add/sub. Clobbers tmp, tmp2, twiddle1.
+	// layer 1  
+	sadd16.w \tmp, \c0, \c1 // c0, c1
+	ssub16.w \c1, \c0, \c1
+	sadd16.w \tmp2, \c2, \c3 // c2, c3
+	ssub16.w \c3, \c2, \c3
+	// tmp, c1, tmp2, c3: 1q maximum
+	sadd16.w \c0, \c4, \c5 // c4, c5
+	ssub16.w \c5, \c4, \c5
+	sadd16.w \c2, \c6, \c7 // c6, c7
+	ssub16.w \c7, \c6, \c7
+	// c4, c6 are free at this point
+	// c0,c5,c2,c7 1q maximum
+
+	// layer 2
+	sadd16.w \c6, \tmp, \tmp2 // c0, c2
+	ssub16.w \tmp2, \tmp, \tmp2
+	sadd16.w \c4, \c0, \c2 // c4, c6
+	ssub16.w \c2, \c0, \c2
+	// c6, tmp2, c4, c2: 2q maximum
+
+	vmov.w \twiddle1, \xi2
+	doublebutterfly_plant \c1, \c3, \twiddle1, \tmp, \q, \qa
+	doublebutterfly_plant \c5, \c7, \twiddle1, \tmp, \q, \qa 
+	// c1, c3, c7, c5: 1.5q maximum;
+
+	// tmp and c0 are free at this point
+	// layer 3
+	sadd16.w \c0, \c6, \c4 // c0, c4
+	ssub16.w \c4, \c6, \c4
+	// c0, c4: 4q
+	// c6 is free at this point
+	vmov.w \twiddle1, \xi4
+	doublebutterfly_plant \c1, \c5, \twiddle1, \tmp, \q, \qa
+	// c1, c5: 2q maximum
+
+	vmov.w \twiddle1, \xi5
+	// this block is one doublebutterfly, written out because its first operand lives in tmp2 and the results go to c6/c2
+	smulwb \tmp, \twiddle1, \c2  // c2, c6
+	smulwt \c2,  \twiddle1, \c2
+	smlabt \tmp, \tmp, \q, \qa
+	smlabt \c2, \c2, \q, \qa
+	pkhtb \tmp, \c2, \tmp, asr#16
+	ssub16.w \c6, \tmp2, \tmp 
+	sadd16.w \c2, \tmp2, \tmp
+	//c6, c2: 4.5q
+	vmov.w \twiddle1, \xi6
+	doublebutterfly_plant \c3, \c7, \twiddle1, \tmp, \q, \qa
+	//c3, c7: 2.5q maximum
+.endm
+.macro _3_layer_double_inv_CT_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp // three butterfly layers on c0..c7; reads 7 twiddle words via twiddle_ptr (post-incremented by 28 bytes)
+	// layer 3 (label as in the original): neighbouring pairs (c0,c1) (c2,c3) (c4,c5) (c6,c7), one shared twiddle
+	ldr.w \twiddle1, [\twiddle_ptr], #4
+	two_doublebutterfly_plant \c0, \c1, \c2, \c3, \twiddle1, \twiddle1, \tmp, \q, \qa
+	two_doublebutterfly_plant \c4, \c5, \c6, \c7, \twiddle1, \twiddle1, \tmp, \q, \qa
+
+	// layer 2: pairs (c0,c2) (c4,c6) use twiddle1, pairs (c1,c3) (c5,c7) use twiddle2
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c2, \c1, \c3, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	two_doublebutterfly_plant \c4, \c6, \c5, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	// layer 1 (label as in the original): pairs (c0,c4) (c1,c5) (c2,c6) (c3,c7), one twiddle per pair
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c0, \c4, \c1, \c5, \twiddle1, \twiddle2, \tmp, \q, \qa
+
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8
+	two_doublebutterfly_plant \c2, \c6, \c3, \c7, \twiddle1, \twiddle2, \tmp, \q, \qa
+.endm
+
+.macro _3_layer_double_inv_twist_16_plant c0, c1, c2, c3, c4, c5, c6, c7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp // multiply c0..c7 by eight consecutive twiddle words (twiddle_ptr is post-incremented by 32 bytes)
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c0, c1
+	mul_twiddle_plant \c0, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c1, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c2, c3
+	mul_twiddle_plant \c2, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c3, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c4, c5
+	mul_twiddle_plant \c4, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c5, \twiddle2, \tmp, \q, \qa
+	ldrd \twiddle1, \twiddle2, [\twiddle_ptr], #8 // twiddles for c6, c7
+	mul_twiddle_plant \c6, \twiddle1, \tmp, \q, \qa
+	mul_twiddle_plant \c7, \twiddle2, \tmp, \q, \qa
+.endm
+// small_invntt_asm_769(r0 = poly, r1 = twiddle table): transforms poly in place in two passes (layers 7+6+5+4, then 3+2+1 plus twisting); inverse NTT for q = 769 per the name and layer comments. Input coefficients < 0.5q
+.global small_invntt_asm_769
+.type small_invntt_asm_769, %function
+.align 2
+small_invntt_asm_769:
+	push {r4-r11, r14} // save the core registers clobbered below, plus the return address
+	vpush.w {s16-s23} // s16-s22 hold twiddles and s23 parks poly below
+	poly         .req r0 // base of the coefficient array; loaded from and stored back to the same offsets
+	twiddle_ptr  .req r1 // advanced by the vldm instructions here and by ldrd inside the twist macro
+	poly0        .req r2
+	poly1        .req r3
+	poly2        .req r4
+	poly3        .req r5
+	poly4        .req r6
+	poly5        .req r7
+	poly6        .req r8
+	poly7        .req r9
+	twiddle1     .req r10
+	twiddle2     .req r11
+	q            .req r12 
+	// q = 769 sits in the top halfword of r12 (see movt below)
+	qa           .req r0
+	// qa = 2^a * q; the constant loaded is 24608 = 32 * 769, i.e. a = 5 here. qa aliases r0 with poly, so poly is parked in s23 / s6 while qa is live
+	tmp          .req r14
+
+	movt q, #769 // writes only the top halfword; the bottom halfword of r12 is left untouched
+
+	### LAYER 7+6+5+4
+	.equ distance, 16 // byte offsets k*distance/4 = 0, 4, ..., 28: eight consecutive words
+	.equ offset, 32 // byte offset of the second group of eight words
+	.equ strincr, 64 // bytes consumed per pass of loop 1
+
+	// pre-load twiddle factors to FPU registers
+	vldm twiddle_ptr!, {s8-s22} // 15 words; twiddle_ptr advances by 60 bytes
+
+	add.w tmp, poly, #8*strincr // end pointer for loop 1: 8 passes of strincr bytes
+	vmov s8, tmp // s8 now holds the end pointer; the word vldm put there is overwritten
+	1:
+		vmov s23, poly // park poly: r0 is about to be reused as qa
+		// load a1, a3, ..., a15
+		load poly, poly0, poly1, poly2, poly3, #offset, #distance/4+offset, #2*distance/4+offset, #3*distance/4+offset
+		load poly, poly4, poly5, poly6, poly7, #distance+offset, #5*distance/4+offset, #6*distance/4+offset, #7*distance/4+offset
+
+		movw qa, #24608 // clobbers r0 (poly); restored from s23 further down
+
+		// NTT on a1, a3, ..., a15   
+		// twiddle2 is used as tmp2
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+
+		// multiply coeffs by layer 4 twiddles for later use
+		// vmov twiddle1, s15 
+		vmov twiddle2, s16
+		// mul_twiddle_plant poly0, twiddle1, tmp, q, qa // could be omitted but kept for reduction only
+		mul_twiddle_plant poly1, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s17 
+		vmov twiddle2, s18
+		mul_twiddle_plant poly2, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly3, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s19 
+		vmov twiddle2, s20
+		mul_twiddle_plant poly4, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly5, twiddle2, tmp, q, qa
+
+		vmov twiddle1, s21 
+		vmov twiddle2, s22
+		mul_twiddle_plant poly6, twiddle1, tmp, q, qa
+		mul_twiddle_plant poly7, twiddle2, tmp, q, qa
+
+		vmov s0, poly0 // a1
+		vmov s1, poly1 // a3
+		vmov s2, poly2 // a5
+		vmov s3, poly3 // a7
+		vmov s4, poly4 // a9
+		vmov s5, poly5 // a11
+		vmov s6, poly6 // a13
+		vmov s7, poly7 // a15
+		// 0.5q
+		// ----------
+
+		vmov poly, s23 // restore poly for the second group of loads
+		// load a0, a2, ..., a14
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608 // clobbers r0 (poly) again
+		// NTT on a0, a2, ..., a14
+		// twiddle2 is used as tmp2
+		_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s10, s12, s13, s14, twiddle1, twiddle2, q, qa, tmp
+		// 1,3,5,7: <5q; 0,2,4,6:<1q
+		// layer 4 - 1
+		// addsub: (a2, a6, a10, a14), (a3, a7, a11, a15)
+		vmov poly, s23 // restore poly for the stores below
+		vmov twiddle2, s1 // load a3
+		uadd16 tmp, poly1, twiddle2 // lane-wise 16-bit add on both halfwords
+		usub16 poly1, poly1, twiddle2 // lane-wise 16-bit subtract on both halfwords
+		str.w tmp, [poly, #1*distance/4] // sum goes to the first group of eight words
+		str.w poly1, [poly, #1*distance/4+offset] // difference goes to the second group, offset bytes further
+
+		vmov twiddle2, s3 // load a7
+		uadd16 tmp, poly3, twiddle2
+		usub16 poly3, poly3, twiddle2
+		str.w tmp, [poly, #3*distance/4]
+		str.w poly3, [poly, #3*distance/4+offset]
+		
+		vmov twiddle2, s5 // load a11
+		uadd16 tmp, poly5, twiddle2
+		usub16 poly5, poly5, twiddle2
+		str.w tmp, [poly, #5*distance/4]
+		str.w poly5, [poly, #5*distance/4+offset]
+		
+		vmov twiddle2, s7 // load a15
+		uadd16 tmp, poly7, twiddle2
+		usub16 poly7, poly7, twiddle2
+		str.w tmp, [poly, #7*distance/4]
+		str.w poly7, [poly, #7*distance/4+offset]
+		//1,3,5,7: < 5.5q
+
+		// layer 4 - 2    
+		// addsub: (a0, a4, a8, a12), (a1, a5, a9, a13)
+		vmov poly3, s2 // load a5
+		uadd16 tmp, poly2, poly3
+		usub16 twiddle2, poly2, poly3
+		str.w tmp, [poly, #2*distance/4]
+		str.w twiddle2, [poly, #2*distance/4+offset]
+
+		vmov poly5, s4 // load a9
+		uadd16 tmp, poly4, poly5
+		usub16 twiddle2, poly4, poly5
+		str.w tmp, [poly, #4*distance/4]
+		str.w twiddle2, [poly, #4*distance/4+offset]
+
+		vmov poly7, s6 // load a13
+		uadd16 tmp, poly6, poly7
+		usub16 twiddle2, poly6, poly7
+		str.w tmp, [poly, #6*distance/4]
+		str.w twiddle2, [poly, #6*distance/4+offset]
+		
+		vmov poly1, s0 // load a1
+		uadd16 tmp, poly0, poly1
+		usub16 twiddle2, poly0, poly1
+		str.w twiddle2, [poly, #offset]    
+		str.w tmp, [poly], #strincr // increase 2*8*4 = 64 (2 * 8 loads of 4 bytes each)
+		//0,2,4,6: < 1.5q
+	vmov tmp, s8 // fetch the loop 1 end pointer
+	cmp.w poly, tmp
+	bne.w 1b
+
+	sub.w poly, #8*strincr // undo the 8*strincr bytes that loop 1 advanced poly by
+
+	### LAYER 3+2+1
+
+	.equ distance, distance*16 // 256: byte offsets k*distance/4 are now 64 bytes apart
+	.equ strincr, 4 // one word per pass
+
+	// ITER 0
+	vmov s6, poly // park poly: r0 is reused as qa below
+	load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+	load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+
+	vldm twiddle_ptr!, {s0-s5} // 6 words; twiddle_ptr advances by 24 bytes
+	movw qa, #24608 // clobbers r0 (poly); restored from s6 before the stores
+	// twiddle2 is used as tmp2
+	_3_layer_double_inv_CT_16_plant_light poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, s1, s3, s4, s5, twiddle1, twiddle2, q, qa, tmp
+
+	// twisting
+	_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+	
+	vmov poly, s6 // restore poly for the stores
+	store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+	str.w poly1, [poly, #distance/4]
+	str.w poly2, [poly, #2*distance/4]
+	str.w poly3, [poly, #3*distance/4]
+	str.w poly0, [poly], #4 // store at offset 0, then advance poly by one word
+
+	// ITER 1-15
+	add.w tmp, poly, #strincr*3*(5) // end pointer: 60 bytes ahead = 15 passes of 4 bytes
+	vmov s14, tmp // s14 holds the loop 2 end pointer
+	2:
+		vmov s6, poly // park poly: r0 is reused as qa below
+		// polys up to 5.5q
+		load poly, poly0, poly1, poly2, poly3, #0, #distance/4, #2*distance/4, #3*distance/4
+		load poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		
+		movw qa, #24608 // clobbers r0 (poly); restored from s6 before the stores
+		_3_layer_double_inv_CT_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		// twisting
+		_3_layer_double_inv_twist_16_plant poly0, poly1, poly2, poly3, poly4, poly5, poly6, poly7, twiddle1, twiddle2, twiddle_ptr, q, qa, tmp
+
+		vmov poly, s6 // restore poly for the stores
+		store poly, poly4, poly5, poly6, poly7, #distance, #5*distance/4, #6*distance/4, #7*distance/4
+		str.w poly1, [poly, #distance/4]
+		str.w poly2, [poly, #2*distance/4]
+		str.w poly3, [poly, #3*distance/4]
+		str.w poly0, [poly], #4 // store at offset 0, then advance poly by one word
+
+	vmov tmp, s14 // fetch the loop 2 end pointer
+	cmp.w poly, tmp
+	bne.w 2b
+
+	vpop.w {s16-s23} // restore the FPU registers saved on entry
+	pop {r4-r11, pc} // restore core registers and return
+
+.unreq poly
+.unreq twiddle_ptr
+.unreq poly0
+.unreq poly1
+.unreq poly2
+.unreq poly3
+.unreq poly4
+.unreq poly5
+.unreq poly6
+.unreq poly7
+.unreq twiddle1
+.unreq twiddle2
+.unreq q
+.unreq qa
+.unreq tmp
+
+
+###################################
+#### small point-multiplication####
+#### r0: out; r1: in; r2: zetas####
+###################################
+.align 2
+.global small_pointmul_asm_769
+.type small_pointmul_asm_769, %function
+small_pointmul_asm_769:
+    push.w {r4-r11, lr}
+    // The loop stops after 64 zeta words, 2 per pass: 32 passes covering 128 input and 128 output words.
+    movw r14, #24608 // qa = 32 * 769; used as the accumulator operand of smlabt below
+    movt r12, #769  // q in the top halfword of r12; the bottom halfword is not written
+    .equ width, 4 // bytes per 32-bit word; each word is handled as two 16-bit halves
+    // Each pass reads 4 input words and 2 zeta words and writes 4 output words.
+    // In every word only the top halfword is multiplied; the bottom halfword is copied through.
+    add.w r3, r2, #64*width // loop end: 64 zeta words past the initial r2
+    _point_mul_16_loop:
+    // load input words 0..3 into r4, r5, r7, r8 and zeta words 0..1 into r6, r9
+    ldr.w r7, [r1, #2*width]
+    ldr.w r8, [r1, #3*width]
+    ldr.w r9, [r2, #1*width]
+    ldr.w r5, [r1, #1*width]
+    ldr.w r4, [r1], #4*width // post-increment the input pointer by 4 words
+    ldr.w r6, [r2], #2*width // post-increment the zeta pointer by 2 words
+
+    smulwt r10, r6, r4 // r10 = (r6 * top halfword of r4) >> 16
+    smlabt r10, r10, r12, r14 // r10 = bottom halfword of r10 * q + qa (presumably the Plantard reduction step - confirm)
+    pkhbt r4, r4, r10 // r4 = bottom halfword of r4 | top halfword of r10
+
+    neg.w r6, r6 // the second word of the pair uses the negated zeta
+
+	smulwt r10, r6, r5
+    smlabt r10, r10, r12, r14
+    pkhbt r5, r5, r10
+
+    str.w r5, [r0, #1*width]
+    str.w r4, [r0], #2*width // post-increment the output pointer by 2 words
+
+    smulwt r10, r9, r7 // same sequence for words 2 and 3 with the second zeta
+    smlabt r10, r10, r12, r14
+    pkhbt r7, r7, r10
+
+    neg.w r9, r9 // the second word of the pair uses the negated zeta
+
+    smulwt r10, r9, r8
+    smlabt r10, r10, r12, r14
+    pkhbt r8, r8, r10
+
+    str.w r8, [r0, #1*width]
+    str.w r7, [r0], #2*width
+
+    cmp.w r2, r3 // done once the zeta pointer reaches the end
+    bne.w _point_mul_16_loop
+
+    pop.w {r4-r11, pc}
+
+
+#### r0: out; r1: a; r2: b; r3: bprime
+  .align 2
+.global small_asymmetric_mul_asm_769
+.type small_asymmetric_mul_asm_769, %function
+small_asymmetric_mul_asm_769:
+    push.w {r4-r11, lr}
+    // Each pass reads 2 words from each of a, b and bprime and writes 2 output words; the loop ends after 512 output bytes.
+    movw r14, #24608 // qa
+    movt r12, #769  // q
+	movw r11, #64769
+	movt r11, #58632 // qinv
+    .equ width, 4 // bytes per 32-bit word; each word is handled as two 16-bit halves
+    add.w r10, r0, #256*2 // loop end: 512 bytes past the initial output pointer
+    _asymmetric_mul_16_loop:
+    ldr.w r7, [r1, #width]
+    ldr.w r4, [r1], #2*width
+    ldr.w r8, [r2, #width]
+    ldr.w r5, [r2], #2*width
+    ldr.w r9, [r3, #width]
+    ldr.w r6, [r3], #2*width
+    // first word: r4 = a, r5 = b, r6 = bprime
+    smuad r6, r4, r6 // r6 = a.lo * bprime.lo + a.hi * bprime.hi
+    plant_red r12, r14, r11, r6 // macro defined elsewhere; presumably reduces r6 into its top halfword - confirm
+    smuadx r5, r4, r5 // r5 = a.lo * b.hi + a.hi * b.lo
+    plant_red r12, r14, r11, r5
+    // pack: top halfword from r5, bottom halfword from the top halfword of r6
+    pkhtb r5, r5, r6, asr#16
+    str.w r5, [r0], #width
+    // second word: r7 = a, r8 = b, r9 = bprime
+	smuad r6, r7, r9
+    plant_red r12, r14, r11, r6
+    smuadx r8, r7, r8
+    plant_red r12, r14, r11, r8
+
+    pkhtb r8, r8, r6, asr#16
+    str.w r8, [r0], #width
+
+    cmp.w r0, r10 // done once the output pointer reaches the end
+    bne.w _asymmetric_mul_16_loop
+
+    pop.w {r4-r11, pc}
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/smallpoly.c b/crypto_sign/dilithium3/m4f/smallpoly.c
new file mode 120000
index 0000000..b59f668
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/smallpoly.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/smallpoly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/smallpoly.h b/crypto_sign/dilithium3/m4f/smallpoly.h
new file mode 120000
index 0000000..9d46a7a
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/smallpoly.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/smallpoly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/symmetric-shake.c b/crypto_sign/dilithium3/m4f/symmetric-shake.c
new file mode 120000
index 0000000..6ad8054
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/symmetric-shake.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/symmetric-shake.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/symmetric.h b/crypto_sign/dilithium3/m4f/symmetric.h
new file mode 120000
index 0000000..90ad5c0
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/symmetric.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/symmetric.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/vector.h b/crypto_sign/dilithium3/m4f/vector.h
new file mode 120000
index 0000000..6e2280f
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/vector.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/vector.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4f/vector.s b/crypto_sign/dilithium3/m4f/vector.s
new file mode 120000
index 0000000..2d2b4dc
--- /dev/null
+++ b/crypto_sign/dilithium3/m4f/vector.s
@@ -0,0 +1 @@
+../../dilithium2/m4f/vector.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/api.h b/crypto_sign/dilithium3/m4fstack/api.h
new file mode 120000
index 0000000..9d1668d
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/api.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/api.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/config.h b/crypto_sign/dilithium3/m4fstack/config.h
new file mode 120000
index 0000000..f3892d9
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/config.h
@@ -0,0 +1 @@
+../m4f/config.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/macros.i b/crypto_sign/dilithium3/m4fstack/macros.i
new file mode 120000
index 0000000..e3f2469
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/macros.i
@@ -0,0 +1 @@
+../../dilithium2/m4f/macros.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/macros_smallntt.i b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i
new file mode 120000
index 0000000..37838a2
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/macros_smallntt.i
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/macros_smallntt.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/ntt.S b/crypto_sign/dilithium3/m4fstack/ntt.S
new file mode 120000
index 0000000..6fbceff
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/ntt.S
@@ -0,0 +1 @@
+../../dilithium2/m4f/ntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/ntt.h b/crypto_sign/dilithium3/m4fstack/ntt.h
new file mode 120000
index 0000000..43729fe
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/ntt.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/ntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/packing.c b/crypto_sign/dilithium3/m4fstack/packing.c
new file mode 120000
index 0000000..b41782c
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/packing.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/packing.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/packing.h b/crypto_sign/dilithium3/m4fstack/packing.h
new file mode 120000
index 0000000..ba1a6b3
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/packing.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/packing.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/params.h b/crypto_sign/dilithium3/m4fstack/params.h
new file mode 120000
index 0000000..a6a4d8b
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/params.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/params.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/pointwise_mont.h b/crypto_sign/dilithium3/m4fstack/pointwise_mont.h
new file mode 120000
index 0000000..0a6f8b9
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/pointwise_mont.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/pointwise_mont.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/pointwise_mont.s b/crypto_sign/dilithium3/m4fstack/pointwise_mont.s
new file mode 120000
index 0000000..c4ddb96
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/pointwise_mont.s
@@ -0,0 +1 @@
+../../dilithium2/m4f/pointwise_mont.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/poly.c b/crypto_sign/dilithium3/m4fstack/poly.c
new file mode 120000
index 0000000..2544e75
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/poly.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/poly.h b/crypto_sign/dilithium3/m4fstack/poly.h
new file mode 120000
index 0000000..7ef70e5
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/poly.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/polyvec.c b/crypto_sign/dilithium3/m4fstack/polyvec.c
new file mode 120000
index 0000000..a8edd0d
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/polyvec.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/polyvec.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/polyvec.h b/crypto_sign/dilithium3/m4fstack/polyvec.h
new file mode 120000
index 0000000..cabd6a9
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/polyvec.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/polyvec.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/reduce.h b/crypto_sign/dilithium3/m4fstack/reduce.h
new file mode 120000
index 0000000..f1e2b38
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/reduce.h
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/reduce.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/rounding.c b/crypto_sign/dilithium3/m4fstack/rounding.c
new file mode 120000
index 0000000..80b8dce
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/rounding.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/rounding.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/rounding.h b/crypto_sign/dilithium3/m4fstack/rounding.h
new file mode 120000
index 0000000..74c40c5
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/rounding.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/rounding.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/sign.c b/crypto_sign/dilithium3/m4fstack/sign.c
new file mode 120000
index 0000000..39f6ec4
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/sign.c
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/sign.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/sign.h b/crypto_sign/dilithium3/m4fstack/sign.h
new file mode 120000
index 0000000..b7f1e89
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/sign.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/sign.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt.h b/crypto_sign/dilithium3/m4fstack/smallntt.h
new file mode 120000
index 0000000..60f2d18
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallntt.h
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/smallntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/smallntt_769.S b/crypto_sign/dilithium3/m4fstack/smallntt_769.S
new file mode 120000
index 0000000..4ae2f9b
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallntt_769.S
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/smallntt_769.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.c b/crypto_sign/dilithium3/m4fstack/smallpoly.c
new file mode 120000
index 0000000..9c35056
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallpoly.c
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/smallpoly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/smallpoly.h b/crypto_sign/dilithium3/m4fstack/smallpoly.h
new file mode 120000
index 0000000..45701a4
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/smallpoly.h
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/smallpoly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/stack.c b/crypto_sign/dilithium3/m4fstack/stack.c
new file mode 120000
index 0000000..d25ed6f
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/stack.c
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/stack.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/stack.h b/crypto_sign/dilithium3/m4fstack/stack.h
new file mode 120000
index 0000000..beab8ca
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/stack.h
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/stack.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/symmetric-shake.c b/crypto_sign/dilithium3/m4fstack/symmetric-shake.c
new file mode 120000
index 0000000..6ad8054
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/symmetric-shake.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/symmetric-shake.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/symmetric.h b/crypto_sign/dilithium3/m4fstack/symmetric.h
new file mode 120000
index 0000000..90ad5c0
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/symmetric.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/symmetric.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/vector.h b/crypto_sign/dilithium3/m4fstack/vector.h
new file mode 120000
index 0000000..6e2280f
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/vector.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/vector.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium3/m4fstack/vector.s b/crypto_sign/dilithium3/m4fstack/vector.s
new file mode 120000
index 0000000..2d2b4dc
--- /dev/null
+++ b/crypto_sign/dilithium3/m4fstack/vector.s
@@ -0,0 +1 @@
+../../dilithium2/m4f/vector.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/api.h b/crypto_sign/dilithium5/m4f/api.h
new file mode 120000
index 0000000..9d1668d
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/api.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/api.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/basemul_257.S b/crypto_sign/dilithium5/m4f/basemul_257.S
new file mode 120000
index 0000000..800dbb5
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/basemul_257.S
@@ -0,0 +1 @@
+../../dilithium2/m4f/basemul_257.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/config.h b/crypto_sign/dilithium5/m4f/config.h
new file mode 100644
index 0000000..c7aeafd
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/config.h
@@ -0,0 +1,7 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+// Build-time configuration for the dilithium5/m4f directory.
+#define DILITHIUM_MODE 5 // mode selector; the code that consumes it lives outside this file
+// #define SIGN_STACKSTRATEGY 2
+
+#endif // CONFIG_H
diff --git a/crypto_sign/dilithium5/m4f/fnt_257.S b/crypto_sign/dilithium5/m4f/fnt_257.S
new file mode 120000
index 0000000..2a616c6
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/fnt_257.S
@@ -0,0 +1 @@
+../../dilithium2/m4f/fnt_257.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/ifnt_257.S b/crypto_sign/dilithium5/m4f/ifnt_257.S
new file mode 120000
index 0000000..65c99ba
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/ifnt_257.S
@@ -0,0 +1 @@
+../../dilithium2/m4f/ifnt_257.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/macros.i b/crypto_sign/dilithium5/m4f/macros.i
new file mode 120000
index 0000000..e3f2469
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/macros.i
@@ -0,0 +1 @@
+../../dilithium2/m4f/macros.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/macros_fnt.i b/crypto_sign/dilithium5/m4f/macros_fnt.i
new file mode 120000
index 0000000..1abff09
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/macros_fnt.i
@@ -0,0 +1 @@
+../../dilithium2/m4f/macros_fnt.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/ntt.S b/crypto_sign/dilithium5/m4f/ntt.S
new file mode 120000
index 0000000..6fbceff
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/ntt.S
@@ -0,0 +1 @@
+../../dilithium2/m4f/ntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/ntt.h b/crypto_sign/dilithium5/m4f/ntt.h
new file mode 120000
index 0000000..43729fe
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/ntt.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/ntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/packing.c b/crypto_sign/dilithium5/m4f/packing.c
new file mode 120000
index 0000000..b41782c
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/packing.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/packing.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/packing.h b/crypto_sign/dilithium5/m4f/packing.h
new file mode 120000
index 0000000..ba1a6b3
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/packing.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/packing.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/params.h b/crypto_sign/dilithium5/m4f/params.h
new file mode 120000
index 0000000..a6a4d8b
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/params.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/params.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/pointwise_mont.h b/crypto_sign/dilithium5/m4f/pointwise_mont.h
new file mode 120000
index 0000000..0a6f8b9
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/pointwise_mont.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/pointwise_mont.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/pointwise_mont.s b/crypto_sign/dilithium5/m4f/pointwise_mont.s
new file mode 120000
index 0000000..c4ddb96
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/pointwise_mont.s
@@ -0,0 +1 @@
+../../dilithium2/m4f/pointwise_mont.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/poly.c b/crypto_sign/dilithium5/m4f/poly.c
new file mode 120000
index 0000000..2544e75
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/poly.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/poly.h b/crypto_sign/dilithium5/m4f/poly.h
new file mode 120000
index 0000000..7ef70e5
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/poly.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/polyvec.c b/crypto_sign/dilithium5/m4f/polyvec.c
new file mode 120000
index 0000000..a8edd0d
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/polyvec.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/polyvec.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/polyvec.h b/crypto_sign/dilithium5/m4f/polyvec.h
new file mode 120000
index 0000000..cabd6a9
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/polyvec.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/polyvec.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/reduce.h b/crypto_sign/dilithium5/m4f/reduce.h
new file mode 120000
index 0000000..6c13df5
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/reduce.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/reduce.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/rounding.c b/crypto_sign/dilithium5/m4f/rounding.c
new file mode 120000
index 0000000..80b8dce
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/rounding.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/rounding.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/rounding.h b/crypto_sign/dilithium5/m4f/rounding.h
new file mode 120000
index 0000000..74c40c5
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/rounding.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/rounding.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/sign.c b/crypto_sign/dilithium5/m4f/sign.c
new file mode 120000
index 0000000..b7ccdf0
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/sign.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/sign.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/sign.h b/crypto_sign/dilithium5/m4f/sign.h
new file mode 120000
index 0000000..b7f1e89
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/sign.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/sign.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/smallntt.h b/crypto_sign/dilithium5/m4f/smallntt.h
new file mode 120000
index 0000000..9b2baf4
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/smallntt.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/smallntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/smallpoly.c b/crypto_sign/dilithium5/m4f/smallpoly.c
new file mode 120000
index 0000000..b59f668
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/smallpoly.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/smallpoly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/smallpoly.h b/crypto_sign/dilithium5/m4f/smallpoly.h
new file mode 120000
index 0000000..9d46a7a
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/smallpoly.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/smallpoly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/symmetric-shake.c b/crypto_sign/dilithium5/m4f/symmetric-shake.c
new file mode 120000
index 0000000..6ad8054
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/symmetric-shake.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/symmetric-shake.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/symmetric.h b/crypto_sign/dilithium5/m4f/symmetric.h
new file mode 120000
index 0000000..90ad5c0
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/symmetric.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/symmetric.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/vector.h b/crypto_sign/dilithium5/m4f/vector.h
new file mode 120000
index 0000000..6e2280f
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/vector.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/vector.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4f/vector.s b/crypto_sign/dilithium5/m4f/vector.s
new file mode 120000
index 0000000..2d2b4dc
--- /dev/null
+++ b/crypto_sign/dilithium5/m4f/vector.s
@@ -0,0 +1 @@
+../../dilithium2/m4f/vector.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/api.h b/crypto_sign/dilithium5/m4fstack/api.h
new file mode 120000
index 0000000..9d1668d
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/api.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/api.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/config.h b/crypto_sign/dilithium5/m4fstack/config.h
new file mode 120000
index 0000000..f3892d9
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/config.h
@@ -0,0 +1 @@
+../m4f/config.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/macros.i b/crypto_sign/dilithium5/m4fstack/macros.i
new file mode 120000
index 0000000..e3f2469
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/macros.i
@@ -0,0 +1 @@
+../../dilithium2/m4f/macros.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/macros_smallntt.i b/crypto_sign/dilithium5/m4fstack/macros_smallntt.i
new file mode 120000
index 0000000..37838a2
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/macros_smallntt.i
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/macros_smallntt.i
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/ntt.S b/crypto_sign/dilithium5/m4fstack/ntt.S
new file mode 120000
index 0000000..6fbceff
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/ntt.S
@@ -0,0 +1 @@
+../../dilithium2/m4f/ntt.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/ntt.h b/crypto_sign/dilithium5/m4fstack/ntt.h
new file mode 120000
index 0000000..43729fe
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/ntt.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/ntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/packing.c b/crypto_sign/dilithium5/m4fstack/packing.c
new file mode 120000
index 0000000..b41782c
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/packing.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/packing.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/packing.h b/crypto_sign/dilithium5/m4fstack/packing.h
new file mode 120000
index 0000000..ba1a6b3
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/packing.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/packing.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/params.h b/crypto_sign/dilithium5/m4fstack/params.h
new file mode 120000
index 0000000..a6a4d8b
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/params.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/params.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/pointwise_mont.h b/crypto_sign/dilithium5/m4fstack/pointwise_mont.h
new file mode 120000
index 0000000..0a6f8b9
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/pointwise_mont.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/pointwise_mont.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/pointwise_mont.s b/crypto_sign/dilithium5/m4fstack/pointwise_mont.s
new file mode 120000
index 0000000..c4ddb96
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/pointwise_mont.s
@@ -0,0 +1 @@
+../../dilithium2/m4f/pointwise_mont.s
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/poly.c b/crypto_sign/dilithium5/m4fstack/poly.c
new file mode 120000
index 0000000..2544e75
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/poly.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/poly.h b/crypto_sign/dilithium5/m4fstack/poly.h
new file mode 120000
index 0000000..7ef70e5
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/poly.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/poly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/polyvec.c b/crypto_sign/dilithium5/m4fstack/polyvec.c
new file mode 120000
index 0000000..a8edd0d
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/polyvec.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/polyvec.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/polyvec.h b/crypto_sign/dilithium5/m4fstack/polyvec.h
new file mode 120000
index 0000000..cabd6a9
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/polyvec.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/polyvec.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/reduce.h b/crypto_sign/dilithium5/m4fstack/reduce.h
new file mode 120000
index 0000000..f1e2b38
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/reduce.h
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/reduce.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/rounding.c b/crypto_sign/dilithium5/m4fstack/rounding.c
new file mode 120000
index 0000000..80b8dce
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/rounding.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/rounding.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/rounding.h b/crypto_sign/dilithium5/m4fstack/rounding.h
new file mode 120000
index 0000000..74c40c5
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/rounding.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/rounding.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/sign.c b/crypto_sign/dilithium5/m4fstack/sign.c
new file mode 120000
index 0000000..39f6ec4
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/sign.c
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/sign.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/sign.h b/crypto_sign/dilithium5/m4fstack/sign.h
new file mode 120000
index 0000000..b7f1e89
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/sign.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/sign.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallntt.h b/crypto_sign/dilithium5/m4fstack/smallntt.h
new file mode 120000
index 0000000..60f2d18
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallntt.h
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/smallntt.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallntt_769.S b/crypto_sign/dilithium5/m4fstack/smallntt_769.S
new file mode 120000
index 0000000..4ae2f9b
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallntt_769.S
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/smallntt_769.S
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallpoly.c b/crypto_sign/dilithium5/m4fstack/smallpoly.c
new file mode 120000
index 0000000..9c35056
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallpoly.c
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/smallpoly.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/smallpoly.h b/crypto_sign/dilithium5/m4fstack/smallpoly.h
new file mode 120000
index 0000000..45701a4
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/smallpoly.h
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/smallpoly.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/stack.c b/crypto_sign/dilithium5/m4fstack/stack.c
new file mode 120000
index 0000000..d25ed6f
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/stack.c
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/stack.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/stack.h b/crypto_sign/dilithium5/m4fstack/stack.h
new file mode 120000
index 0000000..beab8ca
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/stack.h
@@ -0,0 +1 @@
+../../dilithium2/m4fstack/stack.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/symmetric-shake.c b/crypto_sign/dilithium5/m4fstack/symmetric-shake.c
new file mode 120000
index 0000000..6ad8054
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/symmetric-shake.c
@@ -0,0 +1 @@
+../../dilithium2/m4f/symmetric-shake.c
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/symmetric.h b/crypto_sign/dilithium5/m4fstack/symmetric.h
new file mode 120000
index 0000000..90ad5c0
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/symmetric.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/symmetric.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/vector.h b/crypto_sign/dilithium5/m4fstack/vector.h
new file mode 120000
index 0000000..6e2280f
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/vector.h
@@ -0,0 +1 @@
+../../dilithium2/m4f/vector.h
\ No newline at end of file
diff --git a/crypto_sign/dilithium5/m4fstack/vector.s b/crypto_sign/dilithium5/m4fstack/vector.s
new file mode 120000
index 0000000..2d2b4dc
--- /dev/null
+++ b/crypto_sign/dilithium5/m4fstack/vector.s
@@ -0,0 +1 @@
+../../dilithium2/m4f/vector.s
\ No newline at end of file
diff --git a/hostside/host_unidirectional.py b/hostside/host_unidirectional.py
new file mode 100755
index 0000000..dc51da8
--- /dev/null
+++ b/hostside/host_unidirectional.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+"""Relay everything read from the serial device to standard output."""
+import platform
+import sys
+
+import serial
+
+# macOS uses a different device node for the serial adapter than the default.
+is_macos = platform.system() == "Darwin"
+dev = serial.Serial("/dev/tty.usbserial-0001" if is_macos else "/dev/ttyUSB0", 38400)
+print("> Returned data:", file=sys.stderr)
+
+# Forward each read() result immediately; the loop has no exit condition.
+while True:
+    sys.stdout.buffer.write(dev.read())
+    sys.stdout.flush()
diff --git a/interface.py b/interface.py
new file mode 100644
index 0000000..b767272
--- /dev/null
+++ b/interface.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+import argparse
+
+from mupq import mupq
+from mupq import platforms
+
+
+def parse_arguments():
+    """Return ``(args, rest)``: parsed known options plus unrecognised arguments."""
+    cli = argparse.ArgumentParser(description="PQM4 Specific Settings")
+    add_arg = cli.add_argument
+    add_arg(
+        "-p",
+        "--platform",
+        choices=["nucleo-f767zi"],
+        default="nucleo-f767zi",
+        help="The PQM4 platform",
+    )
+    add_arg(
+        "-o",
+        "--opt",
+        choices=["speed", "size", "debug"],
+        default="speed",
+        help="Optimization flags",
+    )
+    # Both switches are plain flags that default to off.
+    add_arg("-l", "--lto", action="store_true", default=False, help="Enable LTO flags")
+    add_arg(
+        "--no-aio",
+        action="store_true",
+        default=False,
+        help="Disable all-in-one compilation",
+    )
+    # Serial device path and benchmark repetition count.
+    add_arg("-u", "--uart", default="/dev/ttyUSB0", help="Path to UART output")
+    add_arg(
+        "-i",
+        "--iterations",
+        type=int,
+        default=1,
+        help="Number of iterations for benchmarks",
+    )
+    return cli.parse_known_args()
+
+
+def get_platform(args):
+    """Build the ``(platform, settings)`` pair for the board named in *args*."""
+    # Only one board is supported; reject anything else before touching OpenOCD.
+    if args.platform != "nucleo-f767zi":
+        raise NotImplementedError("Unsupported Platform")
+    board = platforms.OpenOCD("board/st_nucleo_f7.cfg", args.uart)
+    use_aio = not args.no_aio
+    # The settings for this board are created with the "hex" binary type.
+    m7_settings = M7Settings(
+        args.platform, args.opt, args.lto, use_aio, args.iterations, "hex"
+    )
+    return board, m7_settings
+
+
+class M7Settings(mupq.PlatformSettings):
+    """Platform settings (skip list and make flags) for the supported boards."""
+
+    #: Specify folders to include
+    scheme_folders = [  # overrides the base-class list instead of extending it
+        ("pqm4", "crypto_kem", ""),
+        ("pqm4", "crypto_sign", ""),
+        ("mupq", "mupq/crypto_kem", ""),
+        ("mupq", "mupq/crypto_sign", ""),
+        ("pqclean", "mupq/pqclean/crypto_kem", "PQCLEAN"),
+        ("pqclean", "mupq/pqclean/crypto_sign", "PQCLEAN"),
+    ]
+
+    #: Memory budget per platform, compared against each skip-list "estmemory".
+    platform_memory = {"nucleo-f767zi": 384 * 1024}
+
+    def __init__(
+        self,
+        platform,
+        opt="speed",
+        lto=False,
+        aio=False,
+        iterations=1,
+        binary_type="bin",
+    ):
+        """Derive the skip list and ``make`` flags for *platform*.
+
+        Raises ``ValueError`` when *opt* is not "speed", "size" or "debug".
+        """
+        import skiplist
+
+        # Skip implementations whose memory estimate exceeds the platform budget.
+        self.skip_list = []
+        for entry in skiplist.skip_list:
+            if entry["estmemory"] > self.platform_memory[platform]:
+                trimmed = {k: v for k, v in entry.items() if k != "estmemory"}
+                self.skip_list.append(trimmed)
+        self.skip_list.append({"implementation": "vec"})
+        self.binary_type = binary_type
+        flags_for_opt = {"speed": [], "size": ["OPT_SIZE=1"], "debug": ["DEBUG=1"]}
+        if opt not in flags_for_opt:
+            raise ValueError(f"Optimization flag should be in {list(flags_for_opt)}")
+        super(M7Settings, self).__init__()
+        self.makeflags = [
+            f"PLATFORM={platform}",
+            f"MUPQ_ITERATIONS={iterations}",
+            *flags_for_opt[opt],
+            "LTO=1" if lto else "LTO=",
+            "AIO=1" if aio else "AIO=",
+        ]
+        self.iterations = iterations
diff --git a/ldscripts/devices.data b/ldscripts/devices.data
new file mode 100644
index 0000000..b051f2f
--- /dev/null
+++ b/ldscripts/devices.data
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+stm32f407vg stm32f4 ROM=1024K RAM=128K
+stm32f4 END ROM_OFF=0x08000000 RAM_OFF=0x20000000 CPU=cortex-m4 FPU=hard-fpv4-sp-d16
+stm32f767zi stm32f7 ROM=2048K RAM=384K
+stm32f7 END ROM_OFF=0x08000000 RAM_OFF=0x20010000 CPU=cortex-m7 FPU=hard-fpv5-sp-d16
diff --git a/libopencm3 b/libopencm3
new file mode 160000
index 0000000..201f5bc
--- /dev/null
+++ b/libopencm3
@@ -0,0 +1 @@
+Subproject commit 201f5bcfb3fa70ee34818152463e7139f24db377
diff --git a/mk/config.mk b/mk/config.mk
new file mode 100644
index 0000000..85fc914
--- /dev/null
+++ b/mk/config.mk
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+CPPFLAGS += \
+	-DPQM4
diff --git a/mk/crypto.mk b/mk/crypto.mk
new file mode 100644
index 0000000..7aae6b1
--- /dev/null
+++ b/mk/crypto.mk
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+SYMCRYPTO_SRC = \
+	mupq/common/fips202.c \
+	mupq/common/sp800-185.c \
+	mupq/common/nistseedexpander.c \
+	common/keccakf1600.S \
+	common/aes.c \
+	common/aes-encrypt.S \
+	common/aes-keyschedule.S \
+	common/aes-publicinputs.c \
+	common/aes-publicinputs.S \
+	mupq/common/sha2.c \
+	common/crypto_hashblocks_sha512_inner32.s \
+	common/crypto_hashblocks_sha512.c
+# Archive built from the objects of SYMCRYPTO_SRC.
+obj/libsymcrypto.a: $(call objs,$(SYMCRYPTO_SRC))
+# Second archive from the hashprofobjs object list, with -DPROFILE_HASHING added.
+obj/libsymcrypto-hashprof.a: CPPFLAGS+=-DPROFILE_HASHING
+obj/libsymcrypto-hashprof.a: $(call hashprofobjs,$(SYMCRYPTO_SRC))
+# AIO=1 puts the sources straight into LIBDEPS; otherwise one of the archives is linked.
+ifeq ($(AIO),1)
+LDLIBS +=
+LIBDEPS += $(SYMCRYPTO_SRC)
+CPPFLAGS+=$(if $(PROFILE_HASHING),-DPROFILE_HASHING)
+else
+LDLIBS += -lsymcrypto$(if $(PROFILE_HASHING),-hashprof)
+LIBDEPS += obj/libsymcrypto$(if $(PROFILE_HASHING),-hashprof).a
+endif
+
diff --git a/mk/nucleo-f767zi.mk b/mk/nucleo-f767zi.mk
new file mode 100644
index 0000000..b5860eb
--- /dev/null
+++ b/mk/nucleo-f767zi.mk
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+DEVICE=stm32f767zi
+OPENCM3_TARGET=lib/stm32/f7
+# Scheme path patterns excluded for this board; presumably matched with filter-out elsewhere (TODO confirm).
+EXCLUDED_SCHEMES = \
+	mupq/pqclean/crypto_kem/mceliece% \
+	mupq/pqclean/crypto_kem/hqc% \
+	mupq/pqclean/crypto_sign/falcon% \
+	mupq/pqclean/crypto_sign/sphincs% \
+	mupq/crypto_kem/bike% \
+	mupq/crypto_sign/aimer% \
+	mupq/crypto_sign/ascon% \
+	mupq/crypto_sign/biscuit% \
+	mupq/crypto_sign/cross% \
+	mupq/crypto_sign/falcon% \
+	mupq/crypto_sign/haetae% \
+	mupq/crypto_sign/hawk% \
+	mupq/crypto_sign/mayo% \
+	mupq/crypto_sign/meds% \
+	mupq/crypto_sign/mirith% \
+	mupq/crypto_sign/mqom% \
+	mupq/crypto_sign/ov-Ip% \
+	mupq/crypto_sign/perk% \
+	mupq/crypto_sign/snova% \
+	mupq/crypto_sign/sphincs% \
+	mupq/crypto_sign/tuov%
+# opencm3.mk uses the DEVICE and OPENCM3_TARGET values set above.
+include mk/opencm3.mk
+# Target-specific preprocessor defines for the two board test binaries.
+elf/boardtest.elf: CPPFLAGS+=-DSRAM_TIMING_TEST -DHAS_SRAM2 -DHAS_CCM
+elf/boardtest-fast.elf: CPPFLAGS+=-DSRAM_TIMING_TEST -DHAS_SRAM2 -DHAS_CCM
diff --git a/mk/opencm3.mk b/mk/opencm3.mk
new file mode 100644
index 0000000..6e07c71
--- /dev/null
+++ b/mk/opencm3.mk
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+LIBHAL_SRC := \
+	common/hal-opencm3.c \
+	common/randombytes.c
+# HAL archive, plus a "-nornd" variant that leaves out randombytes.c.
+obj/libpqm4hal.a: $(call objs,$(LIBHAL_SRC))
+obj/libpqm4hal-nornd.a: $(call objs,$(filter-out common/randombytes.c,$(LIBHAL_SRC)))
+# A non-empty NO_RANDOMBYTES drops randombytes.c in both the AIO and the archive mode.
+ifeq ($(AIO),1)
+LDLIBS +=
+LIBDEPS += $(if $(NO_RANDOMBYTES),$(filter-out common/randombytes.c,$(LIBHAL_SRC)),$(LIBHAL_SRC))
+else
+LDLIBS += -lpqm4hal$(if $(NO_RANDOMBYTES),-nornd)
+LIBDEPS += obj/libpqm4hal$(if $(NO_RANDOMBYTES),-nornd).a
+endif
+
+LDLIBS += -lc -lgcc
+
+export OPENCM3_DIR := $(CURDIR)/libopencm3
+# Evaluated while the makefile is parsed: initialise the libopencm3 submodule.
+_git_submodule_update_opencm3 := $(shell git submodule update --init --recursive $(OPENCM3_DIR))
+
+ifeq ($(DEVICE),)
+$(warning no DEVICE specified for linker script generator)
+endif
+# ?= keeps a DEVICES_DATA value that was set before this file was read.
+DEVICES_DATA ?= $(OPENCM3_DIR)/ld/devices.data
+# Look up the properties of DEVICE with libopencm3's genlink.py.
+genlink_family		:=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) FAMILY)
+genlink_subfamily	:=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) SUBFAMILY)
+genlink_cpu		:=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) CPU)
+genlink_fpu		:=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) FPU)
+genlink_cppflags	:=$(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) CPPFLAGS)
+
+ifeq ($(genlink_family),)
+$(warning $(DEVICE) not found in $(DEVICES_DATA))
+endif
+
+CPPFLAGS	+= $(genlink_cppflags)
+# Derive the architecture flags from genlink's CPU and FPU answers.
+ARCH_FLAGS	:=-mcpu=$(genlink_cpu)
+ifeq ($(genlink_cpu),$(filter $(genlink_cpu),cortex-m0 cortex-m0plus cortex-m3 cortex-m4 cortex-m7))
+ARCH_FLAGS    +=-mthumb
+endif
+
+ifeq ($(genlink_fpu),soft)
+ARCH_FLAGS	+= -msoft-float
+else ifeq ($(genlink_fpu),hard-fpv4-sp-d16)
+ARCH_FLAGS	+= -mfloat-abi=hard -mfpu=fpv4-sp-d16
+else ifeq ($(genlink_fpu),hard-fpv5-sp-d16)
+ARCH_FLAGS      += -mfloat-abi=hard -mfpu=fpv5-sp-d16
+else
+$(warning No match for the FPU flags)
+endif
+
+LIBNAME = opencm3_$(genlink_family)
+# Link against that library and list its archive among the link dependencies.
+LDLIBS += -l$(LIBNAME)
+LIBDEPS += $(OPENCM3_DIR)/lib/lib$(LIBNAME).a
+
+LDFLAGS += -L$(OPENCM3_DIR)/lib
+CPPFLAGS += -I$(OPENCM3_DIR)/include
+# The archive is produced by libopencm3's own makefile for OPENCM3_TARGET.
+$(OPENCM3_DIR)/lib/lib$(LIBNAME).a:
+	$(MAKE) -C $(OPENCM3_DIR) $(OPENCM3_TARGET)
+# The HAL object lists the archive as a prerequisite, so the library is built first.
+obj/common/hal-opencm3.c.o: $(OPENCM3_DIR)/lib/lib$(LIBNAME).a
+# Use ldscripts/$(PLATFORM).ld when it exists; otherwise generate a script from $(DEVICES_DATA).
+ifeq ($(wildcard ldscripts/$(PLATFORM).ld),)
+LDSCRIPT = obj/generated.$(DEVICE).ld
+$(LDSCRIPT): $(OPENCM3_DIR)/ld/linker.ld.S $(DEVICES_DATA) $(CONFIG)
+	@printf "  GENLNK  $(DEVICE)\n"
+	$(Q)mkdir -p $(@D)
+	$(Q)$(CPP) $(ARCH_FLAGS) $(shell $(OPENCM3_DIR)/scripts/genlink.py $(DEVICES_DATA) $(DEVICE) DEFS) -P -E $< -o $@
+else
+LDSCRIPT = ldscripts/$(PLATFORM).ld
+endif
+
+CROSS_PREFIX ?= arm-none-eabi
+CC := $(CROSS_PREFIX)-gcc
+CPP := $(CROSS_PREFIX)-cpp
+AR := $(CROSS_PREFIX)-gcc-ar
+LD := $(CC)
+OBJCOPY := $(CROSS_PREFIX)-objcopy
+SIZE := $(CROSS_PREFIX)-size
+# Linking goes through $(CC); each -Wl,--wrap=SYM resolves references to SYM to __wrap_SYM.
+CFLAGS += \
+	$(ARCH_FLAGS) \
+
+LDFLAGS += \
+	--specs=nosys.specs \
+	-Wl,--wrap=_sbrk \
+	-Wl,--wrap=_close \
+	-Wl,--wrap=_isatty \
+	-Wl,--wrap=_kill \
+	-Wl,--wrap=_lseek \
+	-Wl,--wrap=_read \
+	-Wl,--wrap=_write \
+	-Wl,--wrap=_fstat \
+	-Wl,--wrap=_getpid \
+	-nostartfiles \
+	-ffreestanding \
+	-T$(LDSCRIPT) \
+	$(ARCH_FLAGS)
+
+.PHONY: libclean
+# Clean libopencm3 via $(MAKE) so the sub-make inherits this make's flags and jobserver.
+libclean:
+	$(MAKE) -C $(OPENCM3_DIR) clean
+
+LINKDEPS += $(LDSCRIPT) $(LIBDEPS)
diff --git a/mk/tests.mk b/mk/tests.mk
new file mode 100644
index 0000000..8b6aab8
--- /dev/null
+++ b/mk/tests.mk
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+ifeq ($(AIO),1)
+elf/boardtest.elf: common/test.c $(LINKDEPS) $(CONFIG)
+	$(compiletest)
+
+elf/boardtest-fast.elf: common/testfast.c $(LINKDEPS) $(CONFIG)
+	$(compiletest)
+# AIO: the CLOCK_TEST define is attached to the ELF target itself.
+elf/boardtest-fast.elf: CPPFLAGS += -DCLOCK_TEST=CLOCK_FAST
+
+elf/aestest.elf: common/aestest.c $(LINKDEPS) $(CONFIG)
+	$(compiletest)
+
+elf/keccaktest.elf: common/keccaktest.c $(LINKDEPS) $(CONFIG)
+	$(compiletest)
+else
+elf/boardtest.elf: $(call objs,common/test.c) $(LINKDEPS) $(CONFIG)
+
+elf/boardtest-fast.elf: $(call objs,common/testfast.c) $(LINKDEPS) $(CONFIG)
+# Non-AIO: the define is attached to the testfast.c object instead of the ELF.
+$(call objs,common/testfast.c): CPPFLAGS += -DCLOCK_TEST=CLOCK_FAST
+
+elf/aestest.elf: $(call objs,common/aestest.c) $(LINKDEPS) $(CONFIG)
+
+elf/keccaktest.elf: $(call objs,common/keccaktest.c) $(LINKDEPS) $(CONFIG)
+endif
+# Aggregate targets for the test ELF files and the matching .bin files.
+tests: elf/boardtest.elf elf/aestest.elf elf/keccaktest.elf
+tests-bin: bin/boardtest.bin bin/aestest.bin bin/keccaktest.bin
diff --git a/mupq b/mupq
new file mode 160000
index 0000000..8e62b94
--- /dev/null
+++ b/mupq
@@ -0,0 +1 @@
+Subproject commit 8e62b94bfb8125fc81ac2774f8aa8b44120bc619
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4a96b58
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+pyserial==3.4
+tqdm
diff --git a/skiplist.py b/skiplist.py
new file mode 100644
index 0000000..397c78f
--- /dev/null
+++ b/skiplist.py
@@ -0,0 +1,250 @@
+skip_list = [
+    {'scheme': 'aimer192s', 'implementation': 'opt_mem', 'estmemory': 70656},
+    {'scheme': 'aimer192s', 'implementation': 'ref', 'estmemory': 2036736},
+    {'scheme': 'aimer192f', 'implementation': 'opt_mem', 'estmemory': 46080},
+    {'scheme': 'aimer192f', 'implementation': 'ref', 'estmemory': 287744},
+    {'scheme': 'aimer128s', 'implementation': 'opt_mem', 'estmemory': 39936},
+    {'scheme': 'aimer128s', 'implementation': 'ref', 'estmemory': 924672},
+    {'scheme': 'aimer256f', 'implementation': 'opt_mem', 'estmemory': 105472},
+    {'scheme': 'aimer256f', 'implementation': 'ref', 'estmemory': 600064},
+    {'scheme': 'aimer256s', 'implementation': 'opt_mem', 'estmemory': 135168},
+    {'scheme': 'aimer256s', 'implementation': 'ref', 'estmemory': 4148224},
+    {'scheme': 'aimer128f', 'implementation': 'opt_mem', 'estmemory': 22528},
+    {'scheme': 'aimer128f', 'implementation': 'ref', 'estmemory': 131072},
+    {'scheme': 'ascon-sign-128f-robust', 'implementation': 'ref', 'estmemory': 21504},
+    {'scheme': 'ascon-sign-128f-simple', 'implementation': 'ref', 'estmemory': 21504},
+    {'scheme': 'ascon-sign-128s-robust', 'implementation': 'ref', 'estmemory': 12288},
+    {'scheme': 'ascon-sign-128s-simple', 'implementation': 'ref', 'estmemory': 12288},
+    {'scheme': 'ascon-sign-192f-robust', 'implementation': 'ref', 'estmemory': 43008},
+    {'scheme': 'ascon-sign-192f-simple', 'implementation': 'ref', 'estmemory': 41984},
+    {'scheme': 'ascon-sign-192s-robust', 'implementation': 'ref', 'estmemory': 23552},
+    {'scheme': 'ascon-sign-192s-simple', 'implementation': 'ref', 'estmemory': 22528},
+    {'scheme': 'bikel1', 'implementation': 'm4f', 'estmemory': 103424},
+    {'scheme': 'bikel1', 'implementation': 'opt', 'estmemory': 90112},
+    {'scheme': 'bikel3', 'implementation': 'm4f', 'estmemory': 194560},
+    {'scheme': 'bikel3', 'implementation': 'opt', 'estmemory': 175104},
+    {'scheme': 'biscuit128f', 'implementation': 'ref', 'estmemory': 145408},
+    {'scheme': 'biscuit128s', 'implementation': 'ref', 'estmemory': 1099776},
+    {'scheme': 'biscuit192f', 'implementation': 'ref', 'estmemory': 282624},
+    {'scheme': 'biscuit192s', 'implementation': 'ref', 'estmemory': 2257920},
+    {'scheme': 'biscuit256f', 'implementation': 'ref', 'estmemory': 505856},
+    {'scheme': 'biscuit256s', 'implementation': 'ref', 'estmemory': 4004864},
+    {'scheme': 'cross-sha2-r-sdp-1-fast', 'implementation': 'ref', 'estmemory': 234496},
+    {'scheme': 'cross-sha2-r-sdp-1-small', 'implementation': 'ref', 'estmemory': 721920},
+    {'scheme': 'cross-sha2-r-sdp-3-fast', 'implementation': 'ref', 'estmemory': 365568},
+    {'scheme': 'cross-sha2-r-sdp-3-small', 'implementation': 'ref', 'estmemory': 1295360},
+    {'scheme': 'cross-sha2-r-sdp-5-fast', 'implementation': 'ref', 'estmemory': 914432},
+    {'scheme': 'cross-sha2-r-sdp-5-small', 'implementation': 'ref', 'estmemory': 1748992},
+    {'scheme': 'cross-sha2-r-sdpg-1-fast', 'implementation': 'ref', 'estmemory': 143360},
+    {'scheme': 'cross-sha2-r-sdpg-1-small', 'implementation': 'ref', 'estmemory': 477184},
+    {'scheme': 'cross-sha2-r-sdpg-3-fast', 'implementation': 'ref', 'estmemory': 230400},
+    {'scheme': 'cross-sha2-r-sdpg-3-small', 'implementation': 'ref', 'estmemory': 776192},
+    {'scheme': 'cross-sha2-r-sdpg-5-fast', 'implementation': 'ref', 'estmemory': 440320},
+    {'scheme': 'cross-sha2-r-sdpg-5-small', 'implementation': 'ref', 'estmemory': 1063936},
+    {'scheme': 'cross-sha3-r-sdp-1-fast', 'implementation': 'ref', 'estmemory': 234496},
+    {'scheme': 'cross-sha3-r-sdp-1-small', 'implementation': 'ref', 'estmemory': 721920},
+    {'scheme': 'cross-sha3-r-sdp-3-fast', 'implementation': 'ref', 'estmemory': 365568},
+    {'scheme': 'cross-sha3-r-sdp-3-small', 'implementation': 'ref', 'estmemory': 1295360},
+    {'scheme': 'cross-sha3-r-sdp-5-fast', 'implementation': 'ref', 'estmemory': 914432},
+    {'scheme': 'cross-sha3-r-sdp-5-small', 'implementation': 'ref', 'estmemory': 1748992},
+    {'scheme': 'cross-sha3-r-sdpg-1-fast', 'implementation': 'ref', 'estmemory': 143360},
+    {'scheme': 'cross-sha3-r-sdpg-1-small', 'implementation': 'ref', 'estmemory': 477184},
+    {'scheme': 'cross-sha3-r-sdpg-3-fast', 'implementation': 'ref', 'estmemory': 230400},
+    {'scheme': 'cross-sha3-r-sdpg-3-small', 'implementation': 'ref', 'estmemory': 776192},
+    {'scheme': 'cross-sha3-r-sdpg-5-fast', 'implementation': 'ref', 'estmemory': 440320},
+    {'scheme': 'cross-sha3-r-sdpg-5-small', 'implementation': 'ref', 'estmemory': 1063936},
+    {'scheme': 'dilithium2', 'implementation': 'clean', 'estmemory': 59392},
+    {'scheme': 'dilithium2', 'implementation': 'm4f', 'estmemory': 57344},
+    {'scheme': 'dilithium3', 'implementation': 'clean', 'estmemory': 90112},
+    {'scheme': 'dilithium3', 'implementation': 'm4f', 'estmemory': 79872},
+    {'scheme': 'dilithium5', 'implementation': 'clean', 'estmemory': 136192},
+    {'scheme': 'dilithium5', 'implementation': 'm4f', 'estmemory': 129024},
+    {'scheme': 'falcon-1024', 'implementation': 'clean', 'estmemory': 91136},
+    {'scheme': 'falcon-1024', 'implementation': 'm4-ct', 'estmemory': 89088},
+    {'scheme': 'falcon-1024', 'implementation': 'opt-ct', 'estmemory': 89088},
+    {'scheme': 'falcon-1024', 'implementation': 'opt-leaktime', 'estmemory': 90112},
+    {'scheme': 'falcon-1024-tree', 'implementation': 'opt-ct', 'estmemory': 185344},
+    {'scheme': 'falcon-1024-tree', 'implementation': 'opt-leaktime', 'estmemory': 186368},
+    {'scheme': 'falcon-512', 'implementation': 'clean', 'estmemory': 48128},
+    {'scheme': 'falcon-512', 'implementation': 'm4-ct', 'estmemory': 46080},
+    {'scheme': 'falcon-512', 'implementation': 'opt-ct', 'estmemory': 46080},
+    {'scheme': 'falcon-512', 'implementation': 'opt-leaktime', 'estmemory': 47104},
+    {'scheme': 'falcon-512-tree', 'implementation': 'm4-ct', 'estmemory': 90112},
+    {'scheme': 'falcon-512-tree', 'implementation': 'opt-ct', 'estmemory': 90112},
+    {'scheme': 'falcon-512-tree', 'implementation': 'opt-leaktime', 'estmemory': 91136},
+    {'scheme': 'haetae2', 'implementation': 'm4f', 'estmemory': 60416},
+    {'scheme': 'haetae2', 'implementation': 'ref', 'estmemory': 59392},
+    {'scheme': 'haetae3', 'implementation': 'm4f', 'estmemory': 90112},
+    {'scheme': 'haetae3', 'implementation': 'ref', 'estmemory': 87040},
+    {'scheme': 'haetae5', 'implementation': 'm4f', 'estmemory': 112640},
+    {'scheme': 'haetae5', 'implementation': 'ref', 'estmemory': 109568},
+    {'scheme': 'hawk1024', 'implementation': 'ref', 'estmemory': 32768},
+    {'scheme': 'hawk256', 'implementation': 'ref', 'estmemory': 10240},
+    {'scheme': 'hawk512', 'implementation': 'ref', 'estmemory': 17408},
+    {'scheme': 'hqc-128', 'implementation': 'clean', 'estmemory': 66560},
+    {'scheme': 'hqc-192', 'implementation': 'clean', 'estmemory': 130048},
+    {'scheme': 'hqc-256', 'implementation': 'clean', 'estmemory': 205824},
+    {'scheme': 'ml-kem-1024', 'implementation': 'clean', 'estmemory': 27648},
+    {'scheme': 'ml-kem-1024', 'implementation': 'm7fspeed', 'estmemory': 16384},
+    {'scheme': 'ml-kem-1024', 'implementation': 'm7fstack', 'estmemory': 12288},
+    {'scheme': 'ml-kem-512', 'implementation': 'clean', 'estmemory': 14336},
+    {'scheme': 'ml-kem-512', 'implementation': 'm7fspeed', 'estmemory': 10240},
+    {'scheme': 'ml-kem-512', 'implementation': 'm7fstack', 'estmemory': 7168},
+    {'scheme': 'ml-kem-768', 'implementation': 'clean', 'estmemory': 20480},
+    {'scheme': 'ml-kem-768', 'implementation': 'm7fspeed', 'estmemory': 13312},
+    {'scheme': 'ml-kem-768', 'implementation': 'm7fstack', 'estmemory': 10240},
+    {'scheme': 'mayo1', 'implementation': 'm4f', 'estmemory': 446464},
+    {'scheme': 'mayo1', 'implementation': 'ref', 'estmemory': 404480},
+    {'scheme': 'mayo2', 'implementation': 'm4f', 'estmemory': 287744},
+    {'scheme': 'mayo2', 'implementation': 'ref', 'estmemory': 279552},
+    {'scheme': 'mayo3', 'implementation': 'm4f', 'estmemory': 477184},
+    {'scheme': 'mayo3', 'implementation': 'ref', 'estmemory': 1144832},
+    {'scheme': 'mceliece348864', 'implementation': 'clean', 'estmemory': 693248},
+    {'scheme': 'mceliece348864f', 'implementation': 'clean', 'estmemory': 693248},
+    {'scheme': 'mceliece460896', 'implementation': 'clean', 'estmemory': 1425408},
+    {'scheme': 'mceliece460896f', 'implementation': 'clean', 'estmemory': 1426432},
+    {'scheme': 'mceliece6688128', 'implementation': 'clean', 'estmemory': 2627584},
+    {'scheme': 'mceliece6688128f', 'implementation': 'clean', 'estmemory': 2628608},
+    {'scheme': 'mceliece6960119', 'implementation': 'clean', 'estmemory': 2585600},
+    {'scheme': 'mceliece6960119f', 'implementation': 'clean', 'estmemory': 2586624},
+    {'scheme': 'mceliece8192128', 'implementation': 'clean', 'estmemory': 3259392},
+    {'scheme': 'mceliece8192128f', 'implementation': 'clean', 'estmemory': 3260416},
+    {'scheme': 'meds13220', 'implementation': 'ref', 'estmemory': 209920},
+    {'scheme': 'meds134180', 'implementation': 'ref', 'estmemory': 1152000},
+    {'scheme': 'meds167717', 'implementation': 'ref', 'estmemory': 927744},
+    {'scheme': 'meds41711', 'implementation': 'ref', 'estmemory': 1387520},
+    {'scheme': 'meds55604', 'implementation': 'ref', 'estmemory': 509952},
+    {'scheme': 'meds9923', 'implementation': 'ref', 'estmemory': 1019904},
+    {'scheme': 'mirith_IIIa_fast', 'implementation': 'ref', 'estmemory': 287744},
+    {'scheme': 'mirith_IIIa_short', 'implementation': 'ref', 'estmemory': 2197504},
+    {'scheme': 'mirith_IIIb_fast', 'implementation': 'ref', 'estmemory': 320512},
+    {'scheme': 'mirith_IIIb_short', 'implementation': 'ref', 'estmemory': 2386944},
+    {'scheme': 'mirith_Ia_fast', 'implementation': 'ref', 'estmemory': 134144},
+    {'scheme': 'mirith_Ia_short', 'implementation': 'ref', 'estmemory': 1019904},
+    {'scheme': 'mirith_Ib_fast', 'implementation': 'ref', 'estmemory': 163840},
+    {'scheme': 'mirith_Ib_short', 'implementation': 'ref', 'estmemory': 1195008},
+    {'scheme': 'mirith_Va_fast', 'implementation': 'ref', 'estmemory': 519168},
+    {'scheme': 'mirith_Va_short', 'implementation': 'ref', 'estmemory': 3816448},
+    {'scheme': 'mirith_Vb_fast', 'implementation': 'ref', 'estmemory': 572416},
+    {'scheme': 'mirith_Vb_short', 'implementation': 'ref', 'estmemory': 4117504},
+    {'scheme': 'mirith_hypercube_IIIa_fast', 'implementation': 'ref', 'estmemory': 188416},
+    {'scheme': 'mirith_hypercube_IIIa_short', 'implementation': 'ref', 'estmemory': 502784},
+    {'scheme': 'mirith_hypercube_IIIa_shorter', 'implementation': 'ref', 'estmemory': 3894272},
+    {'scheme': 'mirith_hypercube_IIIb_fast', 'implementation': 'ref', 'estmemory': 211968},
+    {'scheme': 'mirith_hypercube_IIIb_short', 'implementation': 'ref', 'estmemory': 526336},
+    {'scheme': 'mirith_hypercube_IIIb_shorter', 'implementation': 'ref', 'estmemory': 3916800},
+    {'scheme': 'mirith_hypercube_Ia_fast', 'implementation': 'opt', 'estmemory': 88064},
+    {'scheme': 'mirith_hypercube_Ia_fast', 'implementation': 'ref', 'estmemory': 89088},
+    {'scheme': 'mirith_hypercube_Ia_short', 'implementation': 'ref', 'estmemory': 227328},
+    {'scheme': 'mirith_hypercube_Ia_shorter', 'implementation': 'ref', 'estmemory': 1779712},
+    {'scheme': 'mirith_hypercube_Ib_fast', 'implementation': 'opt', 'estmemory': 109568},
+    {'scheme': 'mirith_hypercube_Ib_fast', 'implementation': 'ref', 'estmemory': 109568},
+    {'scheme': 'mirith_hypercube_Ib_short', 'implementation': 'ref', 'estmemory': 247808},
+    {'scheme': 'mirith_hypercube_Ib_shorter', 'implementation': 'ref', 'estmemory': 1800192},
+    {'scheme': 'mirith_hypercube_Va_fast', 'implementation': 'ref', 'estmemory': 344064},
+    {'scheme': 'mirith_hypercube_Va_short', 'implementation': 'ref', 'estmemory': 878592},
+    {'scheme': 'mirith_hypercube_Va_shorter', 'implementation': 'ref', 'estmemory': 4217856},
+    {'scheme': 'mirith_hypercube_Vb_fast', 'implementation': 'ref', 'estmemory': 382976},
+    {'scheme': 'mirith_hypercube_Vb_short', 'implementation': 'ref', 'estmemory': 916480},
+    {'scheme': 'mirith_hypercube_Vb_shorter', 'implementation': 'ref', 'estmemory': 4218880},
+    {'scheme': 'mqom_cat1_gf251_fast', 'implementation': 'ref', 'estmemory': 411648},
+    {'scheme': 'mqom_cat1_gf251_short', 'implementation': 'ref', 'estmemory': 675840},
+    {'scheme': 'mqom_cat1_gf31_fast', 'implementation': 'ref', 'estmemory': 624640},
+    {'scheme': 'mqom_cat1_gf31_short', 'implementation': 'ref', 'estmemory': 878592},
+    {'scheme': 'mqom_cat3_gf251_fast', 'implementation': 'ref', 'estmemory': 1307648},
+    {'scheme': 'mqom_cat3_gf251_short', 'implementation': 'ref', 'estmemory': 1903616},
+    {'scheme': 'mqom_cat3_gf31_fast', 'implementation': 'ref', 'estmemory': 2171904},
+    {'scheme': 'mqom_cat3_gf31_short', 'implementation': 'ref', 'estmemory': 2688000},
+    {'scheme': 'mqom_cat5_gf251_fast', 'implementation': 'ref', 'estmemory': 3260416},
+    {'scheme': 'mqom_cat5_gf251_short', 'implementation': 'ref', 'estmemory': 4146176},
+    {'scheme': 'ov-Ip', 'implementation': 'm4f', 'estmemory': 534528},
+    {'scheme': 'ov-Ip', 'implementation': 'ref', 'estmemory': 534528},
+    {'scheme': 'ov-Ip-pkc', 'implementation': 'm4fspeed', 'estmemory': 565248},
+    {'scheme': 'ov-Ip-pkc', 'implementation': 'm4fstack', 'estmemory': 425984},
+    {'scheme': 'ov-Ip-pkc', 'implementation': 'ref', 'estmemory': 568320},
+    {'scheme': 'ov-Ip-pkc-skc', 'implementation': 'm4fspeed', 'estmemory': 425984},
+    {'scheme': 'ov-Ip-pkc-skc', 'implementation': 'm4fstack', 'estmemory': 425984},
+    {'scheme': 'ov-Ip-pkc-skc', 'implementation': 'ref', 'estmemory': 330752},
+    {'scheme': 'perk-128-fast-3', 'implementation': 'm4', 'estmemory': 33792},
+    {'scheme': 'perk-128-fast-3', 'implementation': 'ref', 'estmemory': 323584},
+    {'scheme': 'perk-128-fast-5', 'implementation': 'm4', 'estmemory': 34816},
+    {'scheme': 'perk-128-fast-5', 'implementation': 'ref', 'estmemory': 315392},
+    {'scheme': 'perk-128-short-3', 'implementation': 'm4', 'estmemory': 37888},
+    {'scheme': 'perk-128-short-3', 'implementation': 'ref', 'estmemory': 1570816},
+    {'scheme': 'perk-128-short-5', 'implementation': 'm4', 'estmemory': 37888},
+    {'scheme': 'perk-128-short-5', 'implementation': 'ref', 'estmemory': 1472512},
+    {'scheme': 'perk-192-fast-3', 'implementation': 'm4', 'estmemory': 68608},
+    {'scheme': 'perk-192-fast-3', 'implementation': 'ref', 'estmemory': 707584},
+    {'scheme': 'perk-192-fast-5', 'implementation': 'm4', 'estmemory': 68608},
+    {'scheme': 'perk-192-fast-5', 'implementation': 'ref', 'estmemory': 681984},
+    {'scheme': 'perk-192-short-3', 'implementation': 'm4', 'estmemory': 69632},
+    {'scheme': 'perk-192-short-3', 'implementation': 'ref', 'estmemory': 3487744},
+    {'scheme': 'perk-192-short-5', 'implementation': 'm4', 'estmemory': 69632},
+    {'scheme': 'perk-192-short-5', 'implementation': 'ref', 'estmemory': 3240960},
+    {'scheme': 'perk-256-fast-3', 'implementation': 'm4', 'estmemory': 115712},
+    {'scheme': 'perk-256-fast-3', 'implementation': 'ref', 'estmemory': 1226752},
+    {'scheme': 'perk-256-fast-5', 'implementation': 'm4', 'estmemory': 114688},
+    {'scheme': 'perk-256-fast-5', 'implementation': 'ref', 'estmemory': 1175552},
+    {'scheme': 'perk-256-short-3', 'implementation': 'm4', 'estmemory': 111616},
+    {'scheme': 'perk-256-short-3', 'implementation': 'ref', 'estmemory': 4222976},
+    {'scheme': 'perk-256-short-5', 'implementation': 'm4', 'estmemory': 109568},
+    {'scheme': 'perk-256-short-5', 'implementation': 'ref', 'estmemory': 4221952},
+    {'scheme': 'snova-24-5-16-4-esk', 'implementation': 'ref', 'estmemory': 205824},
+    {'scheme': 'snova-24-5-16-4-ssk', 'implementation': 'ref', 'estmemory': 172032},
+    {'scheme': 'snova-25-8-16-3-esk', 'implementation': 'ref', 'estmemory': 232448},
+    {'scheme': 'snova-25-8-16-3-ssk', 'implementation': 'ref', 'estmemory': 194560},
+    {'scheme': 'snova-28-17-16-2-esk', 'implementation': 'ref', 'estmemory': 380928},
+    {'scheme': 'snova-28-17-16-2-ssk', 'implementation': 'ref', 'estmemory': 320512},
+    {'scheme': 'snova-37-8-16-4-esk', 'implementation': 'ref', 'estmemory': 775168},
+    {'scheme': 'snova-37-8-16-4-ssk', 'implementation': 'ref', 'estmemory': 646144},
+    {'scheme': 'snova-43-25-16-2-esk', 'implementation': 'ref', 'estmemory': 1274880},
+    {'scheme': 'snova-43-25-16-2-ssk', 'implementation': 'ref', 'estmemory': 1072128},
+    {'scheme': 'snova-49-11-16-3-esk', 'implementation': 'ref', 'estmemory': 1055744},
+    {'scheme': 'snova-49-11-16-3-ssk', 'implementation': 'ref', 'estmemory': 880640},
+    {'scheme': 'snova-60-10-16-4-esk', 'implementation': 'ref', 'estmemory': 2342912},
+    {'scheme': 'snova-60-10-16-4-ssk', 'implementation': 'ref', 'estmemory': 1953792},
+    {'scheme': 'snova-61-33-16-2-esk', 'implementation': 'ref', 'estmemory': 3232768},
+    {'scheme': 'snova-61-33-16-2-ssk', 'implementation': 'ref', 'estmemory': 2717696},
+    {'scheme': 'snova-66-15-16-3-esk', 'implementation': 'ref', 'estmemory': 2617344},
+    {'scheme': 'snova-66-15-16-3-ssk', 'implementation': 'ref', 'estmemory': 2185216},
+    {'scheme': 'sphincs-a-sha2-128f', 'implementation': 'ref', 'estmemory': 301056},
+    {'scheme': 'sphincs-a-sha2-128s', 'implementation': 'ref', 'estmemory': 595968},
+    {'scheme': 'sphincs-a-sha2-192f', 'implementation': 'ref', 'estmemory': 542720},
+    {'scheme': 'sphincs-a-sha2-192s', 'implementation': 'ref', 'estmemory': 1307648},
+    {'scheme': 'sphincs-a-sha2-256f', 'implementation': 'ref', 'estmemory': 1124352},
+    {'scheme': 'sphincs-a-sha2-256s', 'implementation': 'ref', 'estmemory': 2291712},
+    {'scheme': 'sphincs-a-shake-128f', 'implementation': 'ref', 'estmemory': 301056},
+    {'scheme': 'sphincs-a-shake-128s', 'implementation': 'ref', 'estmemory': 595968},
+    {'scheme': 'sphincs-a-shake-192f', 'implementation': 'ref', 'estmemory': 541696},
+    {'scheme': 'sphincs-a-shake-192s', 'implementation': 'ref', 'estmemory': 1306624},
+    {'scheme': 'sphincs-a-shake-256f', 'implementation': 'ref', 'estmemory': 1124352},
+    {'scheme': 'sphincs-a-shake-256s', 'implementation': 'ref', 'estmemory': 2291712},
+    {'scheme': 'sphincs-sha2-128f-simple', 'implementation': 'clean', 'estmemory': 21504},
+    {'scheme': 'sphincs-sha2-128s-simple', 'implementation': 'clean', 'estmemory': 12288},
+    {'scheme': 'sphincs-sha2-192f-simple', 'implementation': 'clean', 'estmemory': 43008},
+    {'scheme': 'sphincs-sha2-192s-simple', 'implementation': 'clean', 'estmemory': 23552},
+    {'scheme': 'sphincs-sha2-256f-simple', 'implementation': 'clean', 'estmemory': 59392},
+    {'scheme': 'sphincs-sha2-256s-simple', 'implementation': 'clean', 'estmemory': 39936},
+    {'scheme': 'sphincs-shake-128f-simple', 'implementation': 'clean', 'estmemory': 21504},
+    {'scheme': 'sphincs-shake-128s-simple', 'implementation': 'clean', 'estmemory': 12288},
+    {'scheme': 'sphincs-shake-192f-simple', 'implementation': 'clean', 'estmemory': 41984},
+    {'scheme': 'sphincs-shake-192s-simple', 'implementation': 'clean', 'estmemory': 22528},
+    {'scheme': 'sphincs-shake-256f-simple', 'implementation': 'clean', 'estmemory': 59392},
+    {'scheme': 'sphincs-shake-256s-simple', 'implementation': 'clean', 'estmemory': 38912},
+    {'scheme': 'tuov_iii', 'implementation': 'ref', 'estmemory': 3281920},
+    {'scheme': 'tuov_iii_pkc', 'implementation': 'ref', 'estmemory': 3468288},
+    {'scheme': 'tuov_iii_pkc_skc', 'implementation': 'ref', 'estmemory': 3790848},
+    {'scheme': 'tuov_ip', 'implementation': 'ref', 'estmemory': 3790848},
+    {'scheme': 'tuov_ip_pkc', 'implementation': 'ref', 'estmemory': 799744},
+    {'scheme': 'tuov_ip_pkc_skc', 'implementation': 'ref', 'estmemory': 865280},
+    {'scheme': 'tuov_is', 'implementation': 'ref', 'estmemory': 1111040},
+    {'scheme': 'tuov_is_pkc', 'implementation': 'ref', 'estmemory': 1176576},
+    {'scheme': 'tuov_is_pkc_skc', 'implementation': 'ref', 'estmemory': 1275904},
+    {'scheme': 'tuov_v_pkc', 'implementation': 'ref', 'estmemory': 7083008},
+    {'scheme': 'tuov_v_pkc_skc', 'implementation': 'ref', 'estmemory': 4639744},
+    {'scheme': 'dilithium2', 'implementation': 'm7fstack', 'estmemory': 12288},
+    {'scheme': 'dilithium5', 'implementation': 'm7fstack', 'estmemory': 21504},
+    {'scheme': 'dilithium3', 'implementation': 'm7fstack', 'estmemory': 17408},
+    {'scheme': 'falcon-padded-1024', 'implementation': 'clean', 'estmemory': 91136},
+    {'scheme': 'falcon-padded-512', 'implementation': 'clean', 'estmemory': 48128},
+]
diff --git a/slothy b/slothy
new file mode 160000
index 0000000..1fd3fdf
--- /dev/null
+++ b/slothy
@@ -0,0 +1 @@
+Subproject commit 1fd3fdf881a269c198c1af7e7ac98240250f2113
diff --git a/test.py b/test.py
new file mode 100755
index 0000000..969282b
--- /dev/null
+++ b/test.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+"""Run mupq.SimpleTest.test_all(); exit with status 1 when it returns a truthy value."""
+import sys
+
+from interface import get_platform, parse_arguments
+from mupq import mupq
+
+if __name__ == "__main__":
+    cli_args, extra_args = parse_arguments()
+    board, build_settings = get_platform(cli_args)
+    with board:
+        if mupq.SimpleTest(build_settings, board).test_all(extra_args):
+            sys.exit(1)
diff --git a/testvectors.py b/testvectors.py
new file mode 100755
index 0000000..a08c151
--- /dev/null
+++ b/testvectors.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0 or CC0-1.0
+"""Run mupq.TestVectors.test_all(); exit with status 1 when it returns a truthy value."""
+import sys
+from interface import get_platform, parse_arguments
+from mupq import mupq
+
+if __name__ == "__main__":
+    cli_args, extra_args = parse_arguments()
+    board, build_settings = get_platform(cli_args)
+    with board:
+        if mupq.TestVectors(build_settings, board).test_all(extra_args):
+            sys.exit(1)