From c20abc9db135343b527ee440267e97c006aeb7c0 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 22 Jan 2024 13:16:10 +0530 Subject: [PATCH] Improvements via `t12n` There's some ongoing activity trying to repurpose the models for transliteration. Since the models are small, the authors can eventually end up using them in their day-to-day life, because transliteration is viable to have around (in comparison to translation). The following improvements are added: 1. Allow a non-shortlist path 2. Add `nano` config to put in t12n models. This is subject to change during the course of experimentation. 3. Test-suite reading traces from `marian-dev` and generating unit-tests using a Python script (rudimentary, can improve over the course of development). 4. The following bugfixes a. LayerNorm fixed by adjusting default EPS `1e-9` to `1e-6` b. Fix `limit_factor()` narrowed into `size_t` due to mistyping; longer sequences should work now. c. Fix initialization for greedy decode (was not offset correctly before). **Known Issues** 1. `HighwayForward` error at 1e-6 (1e-7 EPS fails), not satisfactory. Abstract operation to be unit-testable. 2. There are still mismatches in the forward pass to sort out. 
Pull Request: https://github.com/jerinphilip/slimt/pull/47 --- .vscode/launch.json | 34 ++ CMakeLists.txt | 2 + app/CMakeLists.txt | 4 - app/test.cc | 772 ------------------------------------- bindings/python/slimt.cpp | 1 + bindings/python/utils.py | 10 +- scripts/marian-trace-gen.h | 60 +-- scripts/t12n.py | 39 ++ scripts/trace-xlit.sh | 11 + slimt/Batcher.cc | 2 +- slimt/Frontend.cc | 2 +- slimt/HTML.cc | 16 +- slimt/Input.cc | 2 +- slimt/Input.hh | 4 +- slimt/Macros.hh | 22 +- slimt/Model.cc | 50 ++- slimt/Model.hh | 8 +- slimt/Modules.cc | 45 +-- slimt/Modules.hh | 3 + slimt/Shortlist.cc | 30 +- slimt/Shortlist.hh | 10 +- slimt/Tensor.cc | 53 +-- slimt/TensorOps.cc | 46 ++- slimt/TensorOps.hh | 8 + slimt/TextProcessor.cc | 4 +- slimt/Transformer.cc | 118 +++++- slimt/Transformer.hh | 10 +- slimt/Utils.cc | 33 -- slimt/Utils.hh | 41 +- tests/CMakeLists.txt | 19 + tests/TestSuite.hh | 44 +++ tests/generate-units.py | 183 +++++++++ 32 files changed, 695 insertions(+), 991 deletions(-) delete mode 100644 app/test.cc create mode 100644 scripts/t12n.py create mode 100644 scripts/trace-xlit.sh create mode 100644 tests/CMakeLists.txt create mode 100644 tests/TestSuite.hh create mode 100644 tests/generate-units.py diff --git a/.vscode/launch.json b/.vscode/launch.json index 0264bdee..f67cf39a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,5 +1,39 @@ { "configurations": [ + { + "name": "(gdb) t12n", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/env/bin/python3", + "args": [ + "${workspaceFolder}/scripts/t12n.py", + "${workspaceFolder}/../slimt-t12n/outputs/mal-eng/model.nano.npz.decoder.yml", + "<", + "${workspaceFolder}/data/ml-xlit.txt" + ], + "stopAtEntry": false, + "cwd": "${fileDirname}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": 
"Set Disassembly Flavor to Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + }, + { + "description": "Enable following into child for debugging.", + "text": "set follow-fork-mode child", + "ignoreFailures": true + } + ] + }, { "name": "(gdb) test", "type": "cppdbg", diff --git a/CMakeLists.txt b/CMakeLists.txt index 64cdf8ea..5da0136c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(BUILD_STATIC "Build static libraries" ON) option(SLIMT_PACKAGE "Package for cmake, pkgconfig" OFF) option(SLIMT_PYTHON_LINK_STATIC "link-method to produce python package (static/shared)" ON) +option(SLIMT_GENERATED_UNIT_TESTS "Generate unit tests to run using Python" OFF) include(MacroEnsureOutOfSourceBuild) macro_ensure_out_of_source_build( @@ -208,6 +209,7 @@ endif(UNIX) add_subdirectory(slimt) add_subdirectory(app) +add_subdirectory(tests) if(BUILD_PYTHON) if(USE_PYBIND11_SOURCE) diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 22878169..21009d3a 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -2,10 +2,6 @@ add_executable(slimt_cli main.cc) set_target_properties(slimt_cli PROPERTIES OUTPUT_NAME "slimt-cli") target_link_libraries(slimt_cli PUBLIC slimt) -add_executable(slimt_test test.cc) -set_target_properties(slimt_test PROPERTIES OUTPUT_NAME "slimt-test") -target_link_libraries(slimt_test PUBLIC slimt) - set(SLIMT_BINARIES slimt_cli) if(UNIX) diff --git a/app/test.cc b/app/test.cc deleted file mode 100644 index 73dfee25..00000000 --- a/app/test.cc +++ /dev/null @@ -1,772 +0,0 @@ -// NOLINTBEGIN -#include -#include -#include -#include - -// Public headers. -#include "slimt/slimt.hh" - -// Private headers, required for partial testing. 
-#include "slimt/Macros.hh" -#include "slimt/TensorOps.hh" -#include "slimt/Utils.hh" - -namespace slimt { - -#define SLIMT_CHECK(condition) \ - do { \ - if (!(condition)) { \ - fprintf(stderr, "%s:%d %s failed\n", __FILE__, __LINE__, (#condition)); \ - throw std::runtime_error("Failed test"); \ - } \ - fprintf(stderr, "%s:%d %s success\n", __FILE__, __LINE__, (#condition)); \ - } while (0) - -static const std::string kBlobPath = checked_fpath(); - -namespace { - -std::string prefix(const std::string &fname) { return kBlobPath + "/" + fname; } - -template -Tensor tf(const std::string &path, Args &&...args) { - return tensor_from_file(prefix(path), std::forward(args)...); -} - -template -std::tuple qtf(const std::string &path, Args &&...args) { - return quantized_tensor_from_file(prefix(path), - std::forward(args)...); -} - -} // namespace - // - // - -void load() { - std::string fname = "RowsNodeOp-float32_8x256-rhs1-uint32_8_data_0.bin"; - Tensor x = tf((fname), Shape({8}), "rhs1"); - auto *data = x.data(); - // std::cout << x << "\n"; - - float begin = *data; - float rbegin = *(data + (x.size() - 1)); - SLIMT_CHECK(begin == 39); - SLIMT_CHECK(rbegin == 0); -} - -struct OpArgs { - std::string lhs; - std::vector rhs; -}; - -void ScalarMultNodeOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_unary.h" - // line: 100 - // fn: "marian::ScalarMultNodeOp::forwardOps()::" - // op: { Element(_1 = scalar_ * _2, val_, child(0)->val()) } - // before: var_5 float32 [4x2x256] - // after: var_5 float32 [4x2x256] ScalarMultNodeOp-float32_4x2x256-lhs.bin - // operands: - // - var_3 float32 [4x2x256] ScalarMultNodeOp-float32_4x2x256-rhs0-float32_4x2x256.bin - // clang-format on - - OpArgs args{ - .lhs = "ScalarMultNodeOp-float32_4x2x256-lhs.bin", // - .rhs = {"ScalarMultNodeOp-float32_4x2x256-rhs0-float32_4x2x256.bin"} // - }; - - Shape shape({4, 2, 256}); - Tensor lhs = tf((args.lhs), shape, "lhs"); - 
Tensor rhs = tf((args.rhs[0]), Shape({4, 2, 256}), "rhs"); - - Tensor lhs_computed(lhs.type(), lhs.shape(), "lhs_computed"); - - float embedding_dim_sqrt = std::sqrt(256.0F); - mul_scalar(rhs.data(), embedding_dim_sqrt, rhs.size(), - lhs_computed.data()); - - // std::cout << rhs << "\n"; - // std::cout << lhs << "\n" << lhs_computed << "\n"; - - SLIMT_CHECK(lhs_computed == lhs); -} - -void RowsNodeOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_binary.h" - // line: 672 - // fn: "marian::RowsNodeOp::forwardOps()::" - // op: { CopyRows(val_, child(0)->val(), child(1)->val()) } - // before: var_2 float32 [8x256] - // after: var_2 float32 [8x256] RowsNodeOp-float32_8x256-lhs.bin - // operands: - // - var_0 float32 [32000x256] RowsNodeOp-float32_8x256-rhs0-float32_32000x256.bin - // - var_1 uint32 [8] RowsNodeOp-float32_8x256-rhs1-uint32_8.bin - // clang-format on - - OpArgs args{ - .lhs = "RowsNodeOp-float32_8x256-lhs.bin", // - .rhs = - { - "RowsNodeOp-float32_8x256-rhs0-float32_32000x256_Wemb.bin", // - "RowsNodeOp-float32_8x256-rhs1-uint32_8_data_0.bin" // - } // - }; - - // Shape projected to 1 x 8 to match. 
- Tensor lhs = tf((args.lhs), Shape({1, 8, 256}), "lhs"); - // std::cout << "\n" << lhs << std::endl; - - Tensor rhs0 = tf((args.rhs[0]), Shape({32000, 256}), "rhs0"); - - // std::cout << rhs0 << std::endl; - - Tensor rhs1 = tf((args.rhs[1]), Shape({1, 8}), "rhs1"); - - // std::cout << rhs1 << std::endl; - - Tensor lhs_computed = index_select(rhs0, rhs1); - SLIMT_CHECK(lhs_computed == lhs); -} - -void DotBatchedNodeOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_binary.h" - // line: 424 - // fn: "marian::DotBatchedNodeOp::forwardOps()::" - // op: { ProdBatched(val_, graph()->allocator(), child(0)->val(), child(1)->val(), transA_, transB_, 0.f, scalar_) } - // before: var_44 float32 [2x8x4x4] - // after: var_44 float32 [2x8x4x4] DotBatchedNodeOp-float32_2x8x4x4-lhs.bin - // operands: - // - var_25 float32 [2x8x4x32] DotBatchedNodeOp-float32_2x8x4x4-rhs0-float32_2x8x4x32.bin - // - var_34 float32 [2x8x4x32] DotBatchedNodeOp-float32_2x8x4x4-rhs1-float32_2x8x4x32.bin - // clang-format on - - OpArgs args{ - .lhs = "DotBatchedNodeOp-float32_2x8x4x4-lhs.bin", // - .rhs = - { - "DotBatchedNodeOp-float32_2x8x4x4-rhs0-float32_2x8x4x32.bin", // - "DotBatchedNodeOp-float32_2x8x4x4-rhs1-float32_2x8x4x32.bin" // - } // - }; - - // std::cout << "\n"; - - size_t batch_size = 2; - size_t sequence_length = 4; - size_t num_heads = 8; - size_t dim_head = 32; - - size_t k = 2; - size_t h = num_heads / k; - Shape lhs_shape({k, batch_size * sequence_length, h, h}); - - Shape rhs_shape({k, batch_size * sequence_length, h, dim_head}); - Tensor lhs = tf((args.lhs), lhs_shape, "lhs"); - // std::cout << lhs << std::endl; - - Tensor rhs0 = tf((args.rhs[0]), rhs_shape, "rhs0"); - // std::cout << rhs0 << std::endl; - - Tensor rhs1 = tf((args.rhs[1]), rhs_shape, "rhs1"); - // std::cout << rhs1 << std::endl; - - // clang-format off - // op: { ProdBatched(val_, graph()->allocator(), child(0)->val(), child(1)->val(), 
transA_, transB_, 0.f, scalar_) } - // false true 0.176776692 - // clang-format on - - size_t bsz = batch_size * sequence_length * k; - Tensor lhs_computed(lhs.type(), lhs.shape(), "lhs_computed"); - batch_matrix_multiply( // - rhs0.data(), rhs1.data(), // - bsz, h, dim_head, h, dim_head, // - /*trans_a=*/false, /*trans_b=*/true, // - /*alpha =*/0.176776692, // - lhs_computed.data()); - - // std::cout << lhs << std::endl; - // std::cout << lhs_computed << std::endl; - SLIMT_CHECK(lhs_computed == lhs); -} - -void TransposeNodeOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_unary.h" - // line: 747 - // fn: "marian::TransposeNodeOp::forwardOps()::" - // op: { TransposeND(val_, child(0)->val(), axes_) } - // before: var_10 float32 [1x2x4x256] - // after: var_10 float32 [1x2x4x256] TransposeNodeOp-float32_1x2x4x256-lhs.bin - // operands: - // - var_8 float32 [1x4x2x256] TransposeNodeOp-float32_1x2x4x256-rhs0-float32_1x4x2x256.bin - // clang-format on - - OpArgs args{ - .lhs = "TransposeNodeOp-float32_1x2x4x256-lhs.bin", - .rhs = {"TransposeNodeOp-float32_1x2x4x256-rhs0-float32_1x4x2x256.bin"}}; - - Shape lhs_shape({1, 2, 4, 256}); - Tensor lhs = tf((args.lhs), lhs_shape, "lhs"); - - Shape rhs_shape = lhs_shape.transpose(-1, -2); - Tensor rhs = tf((args.rhs[0]), rhs_shape, "rhs"); - - Tensor lhs_expected(lhs.type(), lhs.shape(), "lhs_expected"); - transpose_3120(rhs.data(), 1, 4, 2, 256, lhs_expected.data()); - - SLIMT_TRACE(lhs); - SLIMT_TRACE(lhs_expected); - SLIMT_CHECK(lhs == lhs_expected); -} - -void LayerNormalizationOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_binary.h" - // line: 1210 - // fn: "marian::LayerNormalizationOp::forwardOps()::" - // op: { LayerNormalization(val_, child(0)->val(), child(1)->val(), (children_.size() == 3) ? 
child(2)->val() : nullptr, eps_) } - // before: var_60 float32 [1x2x4x256] - // after: var_60 float32 [1x2x4x256] LayerNormalizationOp-float32_1x2x4x256-lhs.bin - // operands: - // - var_57 float32 [1x2x4x256] LayerNormalizationOp-float32_1x2x4x256-rhs0-float32_1x2x4x256.bin - // - var_58 float32 [1x256] F0::encoder_l1_self_Wo_ln_scale LayerNormalizationOp-float32_1x2x4x256-rhs1-float32_1x256_encoder_l1_self_Wo_ln_scale.bin - // - var_59 float32 [1x256] F0::encoder_l1_self_Wo_ln_bias LayerNormalizationOp-float32_1x2x4x256-rhs2-float32_1x256_encoder_l1_self_Wo_ln_bias.bin - // clang-format on - OpArgs args{ - .lhs = "LayerNormalizationOp-float32_1x2x4x256-lhs.bin", - // clang-format off - .rhs = { - "LayerNormalizationOp-float32_1x2x4x256-rhs0-float32_1x2x4x256.bin", - "LayerNormalizationOp-float32_1x2x4x256-rhs1-float32_1x256_encoder_l1_self_Wo_ln_scale.bin", - "LayerNormalizationOp-float32_1x2x4x256-rhs2-float32_1x256_encoder_l1_self_Wo_ln_bias.bin" - } - // clang-format on - }; - - Shape lhs_shape({1, 2, 4, 256}); - Tensor lhs = tf((args.lhs), lhs_shape, "lhs"); - - Tensor rhs0 = tf((args.rhs[0]), lhs_shape, "rhs0"); - - Shape ln_shape({1, 256}); - Tensor rhs1 = tf((args.rhs[1]), ln_shape, "rhs1"); - Tensor rhs2 = tf((args.rhs[2]), ln_shape, "rhs2"); - - Tensor lhs_expected(lhs.type(), lhs.shape(), "lhs_expected"); - constexpr float kEps = 1e-9; - size_t rows = 1 * 2 * 4; - size_t cols = 256; - - layer_norm(rhs0.data(), rhs1.data(), rhs2.data(), kEps, - rows, cols, lhs_expected.data()); - - SLIMT_TRACE(lhs); - SLIMT_TRACE(lhs_expected); - SLIMT_CHECK(lhs == lhs_expected); -} -} // namespace slimt - -#ifdef HAS_INTGEMM -#include "3rd-party/intgemm/intgemm/intgemm.h" -namespace slimt { - -void AffineIntgemm() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/integer_common.h" - // line: 55 - // fn: "marian::cpu::integer::fetchAlphaFromModelNodeOp::forwardOps()::" - // op: { fetchAlpha() } - // before: 
var_19 float32 [1] F0::encoder_l1_self_Wq_QuantMultA - // after: var_19 float32 [1] F0::encoder_l1_self_Wq_QuantMultA cpu-float32_1_encoder_l1_self_Wq_QuantMultA-lhs.bin - // operands: - // - var_17 intgemm8 [256x256] F0::encoder_l1_self_Wq cpu-float32_1_encoder_l1_self_Wq_QuantMultA-rhs0-intgemm8_256x256_encoder_l1_self_Wq.bin - // - // - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/intgemm_interface.h" - // line: 60 - // fn: "marian::cpu::integer::PrepareANodeOp::forwardOps()::" - // op: { PrepareA() } - // before: var_20 int8 [1x2x4x256] none_shifted - // after: var_20 int8 [1x2x4x256] none_shifted cpu-int8_1x2x4x256_none_shifted-lhs.bin - // operands: - // - var_10 float32 [1x2x4x256] cpu-int8_1x2x4x256_none_shifted-rhs0-float32_1x2x4x256.bin - // - var_19 float32 [1] F0::encoder_l1_self_Wq_QuantMultA cpu-int8_1x2x4x256_none_shifted-rhs1-float32_1_encoder_l1_self_Wq_QuantMultA.bin - // - // - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/intgemm_interface.h" - // line: 285 - // fn: "marian::cpu::integer::QuantMultNodeOp::forwardOps()::" - // op: { QuantMult() } - // before: var_21 float32 [1] F0::encoder_l1_self_Wq_QuantMultB - // after: var_21 float32 [1] F0::encoder_l1_self_Wq_QuantMultB cpu-float32_1_encoder_l1_self_Wq_QuantMultB-lhs.bin - // operands: - // - var_17 intgemm8 [256x256] F0::encoder_l1_self_Wq cpu-float32_1_encoder_l1_self_Wq_QuantMultB-rhs0-intgemm8_256x256_encoder_l1_self_Wq.bin - // - // - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/intgemm_interface.h" - // line: 359 - // fn: "marian::cpu::integer::PrepareBiasForBNodeOp::forwardOps()::" - // op: { PrepareBias() } - // before: var_22 float32 [1x256] F0::encoder_l1_self_bq_Prepared - // after: var_22 float32 [1x256] F0::encoder_l1_self_bq_Prepared cpu-float32_1x256_encoder_l1_self_bq_Prepared-lhs.bin - // operands: - // - var_18 float32 [1x256] F0::encoder_l1_self_bq 
cpu-float32_1x256_encoder_l1_self_bq_Prepared-rhs0-float32_1x256_encoder_l1_self_bq.bin - // - var_17 intgemm8 [256x256] F0::encoder_l1_self_Wq cpu-float32_1x256_encoder_l1_self_bq_Prepared-rhs1-intgemm8_256x256_encoder_l1_self_Wq.bin - // - var_19 float32 [1] F0::encoder_l1_self_Wq_QuantMultA cpu-float32_1x256_encoder_l1_self_bq_Prepared-rhs2-float32_1_encoder_l1_self_Wq_QuantMultA.bin - // - var_21 float32 [1] F0::encoder_l1_self_Wq_QuantMultB cpu-float32_1x256_encoder_l1_self_bq_Prepared-rhs3-float32_1_encoder_l1_self_Wq_QuantMultB.bin - // - // - // quantmult A, B, scalar 1.007505e+01 1.823447e+02 1.000000e+00 - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/intgemm_interface.h" - // line: 540 - // fn: "marian::cpu::integer::AffineNodeOp::forwardOps()::" - // op: { AffineOp() } - // before: var_23 float32 [1x2x4x256] - // after: var_23 float32 [1x2x4x256] cpu-float32_1x2x4x256-lhs.bin - // operands: - // - var_20 int8 [1x2x4x256] none_shifted cpu-float32_1x2x4x256-rhs0-int8_1x2x4x256_none_shifted.bin - // - var_17 intgemm8 [256x256] F0::encoder_l1_self_Wq cpu-float32_1x2x4x256-rhs1-intgemm8_256x256_encoder_l1_self_Wq.bin - // - var_22 float32 [1x256] F0::encoder_l1_self_bq_Prepared cpu-float32_1x2x4x256-rhs2-float32_1x256_encoder_l1_self_bq_Prepared.bin - // clang-format on - - // Input to the intgemm involved pipeline. Usually these are float - // activations. - // - // Weight and weights quantization multiplier that corresponds to the x above. - // Here it happens to be the encoder1's Q matrix. - // Bias associated with the Q transform. - - // Intgemm code test, step-by-step. B is already prepared offline. - // - // 0. (offline) PrepareB - // 1. PrepareA - // 2. PrepareBias - // 3. Multiply. - - // Aliases in intgemm terminology - // C = AB + bias . 
- // A i8 [ A_rows x width ] - // B i8 [ width x B_cols ] - // bias f32 [ 1 x B_cols ] - // - - // We define the following two structs to hold objects to give a convenient - // syntax to describe for cases ahead. - // - // There are 3 variable sets: - // - // 1. Expected (raw): - // The unprepared values, that are fed in. In our case, it's offline - // prepared weights (B), f32 activations and f32 biases for online - // prepration. - // - // 2. Expected (prepared) - // During the course, we get prepared variations that are intermediate - // variables. These are also saved and requires ground truth to check - // expected. - // - // 3. Computed (prepared) - // The values we compute along the process. - - struct Affine { - Tensor A; - Tensor B; - Tensor bias; - }; - - // Holds a, b scalar (hyper) parameters used to multiply or divide for - // quantization. - struct Quant { - float a; - float b; - }; - - struct ProblemSet { - Affine var; - Affine prepared_expected; - Quant quant; - Tensor y_expected; - }; - - auto problem_256x256 = []() { - // clang-format off - auto A = tf("cpu-int8_1x2x4x256_none_shifted-rhs0-float32_1x2x4x256.bin", Shape({1*2*4, 256}), "A"); // NOLINT - auto [B, qB] = qtf("var_17-ParamNode-intgemm8_256x256_encoder_l1_self_Wq-lhs.bin", Shape({256, 256}), "B"); - auto bias = tf("var_18-ParamNode-float32_1x256_encoder_l1_self_bq-lhs.bin", Shape({1, 256}), "bias"); - auto qa = tf("var_19-cpu-float32_1_encoder_l1_self_Wq_QuantMultA-lhs.bin", Shape({1}), "quant.a"); // DONE - auto qb = tf("cpu-float32_1x1536_encoder_l1_ffn_b1_Prepared-rhs3-float32_1_encoder_l1_ffn_W1_QuantMultB.bin", Shape({1}), "quant.b"); // DONE - auto y_expected = tf("cpu-float32_1x2x4x256-lhs.bin", Shape({1*2*4, 256}), "y_expected"); - - Affine prepared_expected { - .A = tf("var_20-cpu-int8_1x2x4x256_none_shifted-lhs.bin", Shape({1*2*4, 256}), "prepared_expected_A"), - .B = tf("var_17-ParamNode-intgemm8_256x256_encoder_l1_self_Wq-lhs.bin", Shape({256, 256}), "prepared_expected_B"), - 
.bias= tf("var_22-cpu-float32_1x256_encoder_l1_self_bq_Prepared-lhs.bin", Shape({1, 256}), "prepared_expected_bias") - }; - // clang-format on - - ProblemSet pset{ - .var = - Affine{ - .A = std::move(A), // - .B = std::move(B), // - .bias = std::move(bias) // - }, // - .prepared_expected = std::move(prepared_expected), // - .quant = - Quant{ - .a = qa.item(), // - .b = qB // - }, // - .y_expected = std::move(y_expected) // - }; - - // auto qb_loaded = qb.item(); - // float diff = qb_loaded - qB; - // SLIMT_TRACE3(qB, qb_loaded, diff); - // SLIMT_CHECK(std::abs(diff) < 1e-7); - // SLIMT_TRACE2(quant.a, quant.b); - - return pset; - }; - - auto problem_256x1536 = []() { - // clang-format off - auto A = tf("var_64-cpu-int8_1x2x4x256_none_shifted-rhs0-float32_1x2x4x256.bin", Shape({2, 4, 256}), "A"); // NOLINT - auto [B, qB] = qtf("var_61-ParamNode-intgemm8_256x1536_encoder_l1_ffn_W1-lhs.bin", Shape({256, 1536}), "B"); - auto bias = tf("var_62-ParamNode-float32_1x1536_encoder_l1_ffn_b1-lhs.bin", Shape({1, 1536}), "bias"); - auto qa = tf("var_63-cpu-float32_1_encoder_l1_ffn_W1_QuantMultA-lhs.bin", Shape({1}), "quant.a"); - auto qb = tf("var_65-cpu-float32_1_encoder_l1_ffn_W1_QuantMultB-lhs.bin", Shape({1}), "quant.b"); - auto y_expected = tf("var_67-cpu-float32_1x2x4x1536-lhs.bin", Shape({2, 4, 1536}), "y_expected"); - - Affine prepared_expected { - .A = tf("var_64-cpu-int8_1x2x4x256_none_shifted-lhs.bin", Shape({2, 4, 256}), "prepared_expected_A"), - .B = tf("var_66-cpu-float32_1x1536_encoder_l1_ffn_b1_Prepared-rhs1-intgemm8_256x1536_encoder_l1_ffn_W1.bin", Shape({256, 1536}), "prepared_expected_B"), - .bias= tf("var_66-cpu-float32_1x1536_encoder_l1_ffn_b1_Prepared-lhs.bin", Shape({1, 1536}), "prepared_expected_bias") - }; - // clang-format on - - ProblemSet pset{ - .var = - Affine{ - .A = std::move(A), // - .B = std::move(B), // - .bias = std::move(bias) // - }, // - .prepared_expected = std::move(prepared_expected), // - .quant = - Quant{ - .a = qa.item(), // - .b = 
qB // - }, // - .y_expected = std::move(y_expected) // - }; - return pset; - }; - - auto problem_1536x256 = []() { - // clang-format off - auto A = tf("var_72-cpu-int8_1x2x4x1536_none_shifted-rhs0-float32_1x2x4x1536.bin", Shape({2, 4, 1536}), "A"); // NOLINT - auto [B, qB] = qtf("var_69-ParamNode-intgemm8_1536x256_encoder_l1_ffn_W2-lhs.bin", Shape({1536, 256}), "B"); - auto bias = tf("var_70-ParamNode-float32_1x256_encoder_l1_ffn_b2-lhs.bin", Shape({1, 256}), "bias"); - auto qa = tf("var_71-cpu-float32_1_encoder_l1_ffn_W2_QuantMultA-lhs.bin", Shape({1}), "quant.a"); - auto qb = tf("var_73-cpu-float32_1_encoder_l1_ffn_W2_QuantMultB-lhs.bin", Shape({1}), "quant.b"); - auto y_expected = tf("var_75-cpu-float32_1x2x4x256-lhs.bin", Shape({2, 4, 256}), "y_expected"); - - Affine prepared_expected { - .A = tf("var_72-cpu-int8_1x2x4x1536_none_shifted-lhs.bin", Shape({2, 4, 1536}), "prepared_expected_A"), - .B = tf("var_69-ParamNode-intgemm8_1536x256_encoder_l1_ffn_W2-lhs.bin", Shape({1536, 256}), "prepared_expected_B"), - .bias= tf("var_74-cpu-float32_1x256_encoder_l1_ffn_b2_Prepared-lhs.bin", Shape({1, 256}), "prepared_expected_bias") - }; - // clang-format on - - ProblemSet pset{ - .var = - Affine{ - .A = std::move(A), // - .B = std::move(B), // - .bias = std::move(bias) // - }, // - .prepared_expected = std::move(prepared_expected), // - .quant = - Quant{ - .a = qa.item(), // - .b = qB // - }, // - .y_expected = std::move(y_expected) // - }; - return pset; - }; - - auto intgemm_from_params = [](ProblemSet &pset) { - Affine &actual = pset.var; - Affine &prepared_expected = pset.prepared_expected; - Quant &quant = pset.quant; - Tensor &y_expected = pset.y_expected; - - Affine prepared{ - .A = Tensor(Type::i8, actual.A.shape(), "prepared_A"), // - .B = Tensor(Type::i8, actual.B.shape(), "prepared_B"), // - .bias = Tensor(Type::f32, actual.bias.shape(), "prepared_bias") // - }; - - size_t A_cols = actual.A.dim(-1); // NOLINT - size_t B_cols = actual.B.dim(-1); // NOLINT - 
size_t A_rows = actual.A.size() / A_cols; // NOLINT - size_t B_rows = actual.B.size() / B_cols; // NOLINT - - // A is in row-major format. - // B is in column-major, so consider it a transposed form. - SLIMT_TRACE2(A_rows, A_cols); - SLIMT_TRACE2(B_rows, B_cols); - - SLIMT_CHECK(A_cols == B_rows); - size_t width = B_rows; - SLIMT_TRACE(width); - - // Check widths are consistent, making matrix multiplication viable. - // This ensures our saves and loads satisfy one property. - - // Now we proceed to piecewise intgemm operations. - - // 0. PrepareB: B is prepared, but let's check PrepareB. - // - // Turns out, I do not have access from inputs to the raw B. I already - // only have prepared B. - // TODO(jerinphilip): Come back later and fix. - std::copy(actual.B.data(), actual.B.data() + B_cols * width, - prepared.B.data()); - - // Surprisingly, the following does not work. However a plain-copy does - // work. - // @jerinphilip has confirmed this is not a no-op by trying a copy before - // (see above). - // - // const auto *b = B.data(); - // auto *prepared_b = prepared.B.data(); - // intgemm::Int8::PrepareBQuantizedTransposed(b, prepared_b, B_cols, width); - - // SLIMT_TRACE_BLOCK(prepared.B); - // SLIMT_TRACE_BLOCK(prepared_expected.B); - SLIMT_CHECK(prepared_expected.B == actual.B); - SLIMT_CHECK(prepared.B == prepared_expected.B); - - // 1. PrepareA - intgemm::Int8Shift::PrepareA( // - actual.A.data(), prepared.A.data(), // - quant.a, // - A_rows, width // - ); - - // Check that the quantized activations are a match. - // SLIMT_TRACE2(qx, A); - SLIMT_CHECK(prepared.A == prepared_expected.A); - - // 2. 
PrepareBias - Quant alpha{ - .a = 127.0F / quant.a, // - .b = 127.0F / quant.b, // - }; - - float bias_unquant_multiplier = (-1.0F * (alpha.a * alpha.b)) / 127.0F; - SLIMT_TRACE3(alpha.a, alpha.b, bias_unquant_multiplier); - auto prepare_bias_callback = - intgemm::callbacks::UnquantizeAndAddBiasAndWrite( // - bias_unquant_multiplier, actual.bias.data(), // - prepared.bias.data() // - ); - - SLIMT_TRACE2(width, B_cols); - intgemm::Int8Shift::PrepareBias( // - prepared.B.data(), // - width, B_cols, // - prepare_bias_callback // - ); - - SLIMT_TRACE_BLOCK(prepared.bias) - SLIMT_TRACE_BLOCK(prepared_expected.bias); - SLIMT_TRACE(mse(prepared.bias, prepared_expected.bias)); - SLIMT_CHECK(prepared.bias == prepared_expected.bias); - - // 3. Multiply - Shape out_shape = actual.A.shape(); - out_shape.set_dim(-1, B_cols); - - Tensor y_piecewise(Type::f32, out_shape, "y_piecewise"); - - float unquant_multiplier = 1.0F / (quant.a * quant.b); - auto multiply_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( - unquant_multiplier, prepared.bias.data(), - y_piecewise.data()); - - intgemm::Int8Shift::Multiply( // - prepared.A.data(), prepared.B.data(), // - A_rows, width, B_cols, // - multiply_callback // - ); - - SLIMT_TRACE_BLOCK(y_piecewise); - SLIMT_TRACE_BLOCK(y_expected); - SLIMT_TRACE(mse(y_piecewise, y_expected)); - SLIMT_CHECK(y_expected == y_piecewise); - - // Compute from the intgemm_affine function, used in the library. - // This ensures what we checked in there is consistent with what we expect. 
- Tensor y_whole = qmm::affine(actual.A, actual.B, actual.bias, quant.a, - quant.b, "y_whole"); - SLIMT_TRACE(y_whole.shape()); - SLIMT_TRACE(y_expected.shape()); - SLIMT_TRACE(mse(y_whole, y_expected)); - SLIMT_CHECK(y_expected == y_whole); - }; - - auto pset1 = problem_256x256(); - auto pset2 = problem_256x1536(); - auto pset3 = problem_1536x256(); - intgemm_from_params(pset1); - intgemm_from_params(pset2); - intgemm_from_params(pset3); - - // SLIMT_TRACE2(y_whole, y_expected); -} -} // namespace slimt -#endif - -namespace slimt { - -void integration() { - std::string home = std::getenv("HOME"); - std::string browsermt = ".local/share/bergamot/models/browsermt"; - std::string folder = "ende.student.tiny11"; - - auto prefix_browsermt = [&](const std::string &relative_path) { - std::string path = - home + "/" + browsermt + "/" + folder + "/" + relative_path; - // std::cout << path << "\n"; - return path; - }; - - Package path{ - .model = prefix_browsermt("model.intgemm.alphas.bin"), // - .vocabulary = prefix_browsermt("vocab.deen.spm"), // - .shortlist = prefix_browsermt("lex.s2t.bin") // - }; - - Model::Config model_config; - auto model = std::make_shared(model_config, path); - Config service_config; - Blocking service(service_config); - std::string source = "1 2\n1 2 3\n"; - slimt::Options opts; - auto responses = service.translate(model, {std::move(source)}, opts); - fprintf(stdout, "%s\n", responses[0].target.text.c_str()); -} - -void ShortlistGen() { - std::string home = std::getenv("HOME"); - std::string browsermt = ".local/share/bergamot/models/browsermt"; - std::string folder = "ende.student.tiny11"; - - auto prefix_browsermt = [&](const std::string &relative_path) { - std::string path = - home + "/" + browsermt + "/" + folder + "/" + relative_path; - // std::cout << path << "\n"; - return path; - }; - std::string vocab_path = prefix_browsermt("vocab.deen.spm"); - std::string shortlist_path = prefix_browsermt("lex.s2t.bin"); - - Vocabulary 
vocab(vocab_path); - Vocabulary &source = vocab; - Vocabulary &target = vocab; - - // Load ShortlistGenerator - io::MmapFile shortlist_file(shortlist_path); - View view{ - .data = shortlist_file.data(), // - .size = shortlist_file.size() // - }; - ShortlistGenerator shortlist_generator(view, source, target); - - std::string line = "May I try the shortlist on, please?"; - auto [words, views] = vocab.encode(line, /*add_eos=*/true); - Shortlist shortlist = shortlist_generator.generate(words); - - const auto &likely_target_words = shortlist.words(); - std::string decoded; - auto dviews = vocab.decode(likely_target_words, decoded); - for (size_t i = 0; i < likely_target_words.size(); i++) { - std::cout << "[" << dviews[i] << ": " << likely_target_words[i] << "] "; - } - - // std::cout << decoded << "\n"; -} - -} // namespace slimt - -int main(int argc, char **argv) { - if (argc < 2) { - std::cerr << "Usage: " << argv[0] << " \n"; - std::exit(EXIT_FAILURE); - } - -// clang-format off -#define TEST_ENTRY(fn_name) {#fn_name, &slimt::fn_name} - // clang-format on - - using Test = void (*)(); - std::unordered_map tests({ - TEST_ENTRY(load), // - TEST_ENTRY(integration), // - TEST_ENTRY(RowsNodeOp), // - TEST_ENTRY(ScalarMultNodeOp), // - TEST_ENTRY(DotBatchedNodeOp), // - TEST_ENTRY(TransposeNodeOp), // - TEST_ENTRY(LayerNormalizationOp), // -#ifdef SLIMT_HAS_INTGEMM - TEST_ENTRY(AffineIntgemm), // -#endif - TEST_ENTRY(ShortlistGen) // - }); - - // std::cout << "slimt test\n"; - std::string test = argv[1]; - - auto query = tests.find(test); - if (query != tests.end()) { - auto name = query->first; - auto fn = query->second; - try { - std::cout << "Running test [" << name << "] ..."; - fn(); - std::cout << " [success]\n"; - } catch (...) { - std::cout << " [fail]\n"; - throw; - } - } else if (test == "all") { - std::vector failed; - for (auto &named_test : tests) { - auto name = named_test.first; - auto fn = named_test.second; - try { - std::cout << "Running test ... 
"; - fn(); - std::cout << "[success] [" << name << "]\n"; - } catch (const std::exception &exception) { - std::cout << " [fail] [" << name << "]\n"; - throw; - } - } - } else { - std::cerr << "Unknown test " << test << "\n"; - std::exit(EXIT_FAILURE); - } - return 0; -} - -// NOLINTEND diff --git a/bindings/python/slimt.cpp b/bindings/python/slimt.cpp index cc26670b..162d0664 100644 --- a/bindings/python/slimt.cpp +++ b/bindings/python/slimt.cpp @@ -217,4 +217,5 @@ PYBIND11_MODULE(_slimt, m) { auto sm_preset = m.def_submodule("preset"); sm_preset.def("tiny", &slimt::preset::tiny); sm_preset.def("base", &slimt::preset::base); + sm_preset.def("nano", &slimt::preset::nano); } diff --git a/bindings/python/utils.py b/bindings/python/utils.py index f4b901f6..9ea68127 100644 --- a/bindings/python/utils.py +++ b/bindings/python/utils.py @@ -75,11 +75,13 @@ def to_py_native(annotated_text: AnnotatedText) -> t.Dict[t.Any, t.Any]: def package_from_config_path(path): with open(path) as yaml_file: - c = yaml.safe_load(yaml_file) + config = yaml.safe_load(yaml_file) root = os.path.dirname(path) package = Package( - model=os.path.join(root, c["models"][0]), - vocabulary=os.path.join(root, c["vocabs"][0]), - shortlist=os.path.join(root, c["shortlist"][0]), + model=os.path.join(root, config["models"][0]), + vocabulary=os.path.join(root, config["vocabs"][0]), + shortlist=( + os.path.join(root, config["shortlist"][0]) if "shortlist" in config else "" + ), ) return package diff --git a/scripts/marian-trace-gen.h b/scripts/marian-trace-gen.h index 51995eff..c092faff 100644 --- a/scripts/marian-trace-gen.h +++ b/scripts/marian-trace-gen.h @@ -80,48 +80,52 @@ std::string save_to_disk(const std::string &name, Node node) { template inline void var_id(std::ostream &out, NodeType value) { - out << "var_" << value->getId() << " "; + out << "\"var_" << value->getId() << " "; out << value->value_type() << " "; out << "[" << value->shape() << "]"; if (value->name() != "none") { out << " " << 
value->name(); } + out << "\""; } template -inline bool process(const char *pretty_fn, NodeType *value, std::ostream &out) { +inline bool process(const char *pretty_fn, NodeType *value, std::ostream &out, + const std::string &indent) { std::stringstream stream; std::string op_name = extract_op_name(pretty_fn); std::string lhs_tag = var_metadata(value); - std::string prefix = - "var_" + std::to_string(value->getId()) + "-" + op_name + "-" + lhs_tag; - std::string lhs_name = prefix + "-lhs.bin"; - std::string lhs_save = save_to_disk(lhs_name, value); + std::string var_name = "var_" + std::to_string(value->getId()); + std::string save_name = var_name + ".bin"; + std::string lhs_save = save_to_disk(save_name, value); - stream << "after: "; + stream << indent << "lhs: {\"id\": "; var_id(stream, value); if (!lhs_save.empty()) { - stream << " " << lhs_name; + stream << ", \"save\":"; + stream << " " << save_name; } + stream << " }"; auto children = value->children(); if (not children.empty()) { - stream << "\noperands: \n"; + stream << "\n" << indent << "rhs: \n"; } for (size_t i = 0; i < children.size(); i++) { auto rhs = children[i]; - stream << " - "; + stream << indent << " - "; + stream << "{\"id\": "; var_id(stream, rhs); std::string rhs_tag = var_metadata(rhs); // NOLINTBEGIN - std::string rhs_name = - prefix + "-rhs" + std::to_string(i) + "-" + rhs_tag + ".bin"; + std::string rhs_name = var_name + "-rhs" + std::to_string(i) + ".bin"; // NOLINTEND std::string rhs_save = save_to_disk(rhs_name, rhs); if (!rhs_save.empty()) { - stream << " " << rhs_name; + stream << ",\"save\": " << rhs_name; } + stream << " }"; stream << "\n"; } @@ -136,22 +140,20 @@ inline bool process(const char *pretty_fn, NodeType *value, std::ostream &out) { }() // test if THREAD_GUARD is neccessary, remove if no problems occur. 
// #if 1 -#define NodeOp(op) \ - [=]() { \ - std::stringstream stream; \ - stream << "file: \"" << __FILE__ << "\"\n"; \ - stream << "line: " << __LINE__ << "\n"; \ - stream << "fn: \"" << __PRETTY_FUNCTION__ << "\"\n"; \ - stream << "op: { " << #op << " }\n"; \ - stream << "before: "; \ - detail::var_id(stream, this); \ - op; \ - stream << "\n"; \ - bool flag = detail::process(__PRETTY_FUNCTION__, this, stream); \ - stream << "\n\n"; \ - if (flag) { \ - std::cerr << stream.str(); \ - }; \ +#define NodeOp(op) \ + [=]() { \ + std::stringstream stream; \ + std::string indent = " "; \ + stream << "- file: \"" << __FILE__ << "\"\n"; \ + stream << indent << "line: " << __LINE__ << "\n"; \ + stream << indent << "fn: \"" << __PRETTY_FUNCTION__ << "\"\n"; \ + stream << indent << "op: \"{ " << #op << " }\"\n"; \ + op; \ + bool flag = detail::process(__PRETTY_FUNCTION__, this, stream, indent); \ + stream << "\n\n"; \ + if (flag) { \ + std::cerr << stream.str(); \ + }; \ } #else #define NodeOp(op) [=]() { op; } diff --git a/scripts/t12n.py b/scripts/t12n.py new file mode 100644 index 00000000..8dde76a4 --- /dev/null +++ b/scripts/t12n.py @@ -0,0 +1,39 @@ +import os +import sys + +import yaml + +import slimt +from slimt import Config, Model, Package, Service, preset + + +# Load the config file +def load_config(path): + config = None + with open(config_path) as yaml_file: + config = yaml.safe_load(yaml_file) + return config + + +if __name__ == "__main__": + service = Service(workers=1, cache_size=1024) + # Load supplementary files for model execution by passing the config path directory + config_path = sys.argv[1] + root = os.path.dirname(config_path) + config = load_config(config_path) + package = Package( + model=os.path.join(root, config["models"][0]), + vocabulary=os.path.join(root, config["vocabs"][0]), + shortlist=os.path.join(root, config["shortlist"][0]), + # shortlist="", + ) + + # nano model + nano: Config = preset.nano() + model_nano = Model(nano, package) + + data = 
sys.stdin.read() + responses = service.translate(model_nano, [data], html=False) + + for response in responses: + print(response.source.text, "->", response.target.text) diff --git a/scripts/trace-xlit.sh b/scripts/trace-xlit.sh new file mode 100644 index 00000000..fdd4eae0 --- /dev/null +++ b/scripts/trace-xlit.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +export DEBUG_VARIABLES_SAVE_PATH="/home/jerin/code/slimt/blobs/ml-xlit" +mkdir -p $DEBUG_VARIABLES_SAVE_PATH +rm $DEBUG_VARIABLES_SAVE_PATH/* + +/home/jerin/code/bergamot-translator/build/app/bergamot \ + --model-config-paths $HOME/code/slimt-t12n/outputs/mal-eng/model.nano.npz.decoder.yml \ + --log-level off \ + < data/ml-xlit.txt \ + 2> traces/ml-xlit.trace.txt diff --git a/slimt/Batcher.cc b/slimt/Batcher.cc index 633c1faf..970f93c5 100644 --- a/slimt/Batcher.cc +++ b/slimt/Batcher.cc @@ -47,7 +47,7 @@ bool operator<(const SegmentRef& a, const SegmentRef& b) { void Batch::log() { (void)token_count_; - LOG(info, "Batch(tokens={}, max-length={}, segment_refs_={})", token_count_, + LOG(info, "Batch(tokens=%zu max-length=%zu, segment_refs_=%zu)", token_count_, max_length_, segment_refs_.size()); } diff --git a/slimt/Frontend.cc b/slimt/Frontend.cc index 2e68b263..c8103f93 100644 --- a/slimt/Frontend.cc +++ b/slimt/Frontend.cc @@ -27,7 +27,7 @@ namespace slimt { namespace { -Input convert(const Batch &batch, uint32_t pad_id, size_t limit_factor) { +Input convert(const Batch &batch, uint32_t pad_id, float limit_factor) { const auto &segment_refs = batch.segment_refs(); Input input(batch.size(), batch.max_length(), pad_id, limit_factor); for (const auto &segment_ref : segment_refs) { diff --git a/slimt/HTML.cc b/slimt/HTML.cc index 792b55bf..3f1127d2 100644 --- a/slimt/HTML.cc +++ b/slimt/HTML.cc @@ -486,14 +486,17 @@ HTML::HTML(std::string &source, Options &&options) // bit of "", then completely ignore it. 
if (contains(options_.void_tags, tag_name)) break; - SLIMT_ABORT_IF(stack.empty(), - "Encountered more closing tags ({}) than opening tags", - scanner.tag()); + SLIMT_ABORT_IF( + stack.empty(), + detail::format( + "Encountered more closing tags ({}) than opening tags", + scanner.tag())); SLIMT_ABORT_IF( to_lower_case(stack.back()->name) != to_lower_case(scanner.tag()), - "Encountered unexpected closing tag , stack is {}", - scanner.tag(), stack); + detail::format( + "Encountered unexpected closing tag , stack is {}", + scanner.tag(), stack)); // What to do with "" case, where tag is immediately closed // so it never makes it into the taint of any of the spans? This adds @@ -548,7 +551,8 @@ HTML::HTML(std::string &source, Options &&options) } } - SLIMT_ABORT_IF(!stack.empty(), "Not all tags were closed: {}", stack); + SLIMT_ABORT_IF(!stack.empty(), + detail::format("Not all tags were closed: {}", stack)); // Add a trailing span (that's empty) to signify all closed tags. spans_.emplace_back(Span{source.size(), source.size(), stack}); diff --git a/slimt/Input.cc b/slimt/Input.cc index 30a3670e..2723b604 100644 --- a/slimt/Input.cc +++ b/slimt/Input.cc @@ -11,7 +11,7 @@ namespace slimt { Input::Input(size_t batch_size, size_t sequence_length, uint32_t pad_id, - size_t limit_factor) + float limit_factor) : batch_(Type::u32, Shape({batch_size, sequence_length}), "batch"), mask_(Type::f32, Shape({batch_size, sequence_length}), "mask"), pad_id_(pad_id), diff --git a/slimt/Input.hh b/slimt/Input.hh index 859cd1ac..2af374c2 100644 --- a/slimt/Input.hh +++ b/slimt/Input.hh @@ -10,7 +10,7 @@ namespace slimt { class Input { public: Input(size_t batch_size, size_t sequence_length, uint32_t pad_id, - size_t limit_factor); + float limit_factor); void add(const std::vector &words); void finalize(); @@ -31,7 +31,7 @@ class Input { size_t index_ = 0; uint32_t pad_id_ = 0; size_t used_ = 0; - size_t limit_factor_; + float limit_factor_; bool finalized_ = false; }; } // namespace slimt 
diff --git a/slimt/Macros.hh b/slimt/Macros.hh index bf1e7161..9a560169 100644 --- a/slimt/Macros.hh +++ b/slimt/Macros.hh @@ -1,4 +1,5 @@ #pragma once +#include #include #define SLIMT_BREAK std::raise(SIGTRAP) @@ -26,12 +27,12 @@ SLIMT_TRACE2(x, y); \ SLIMT_TRACE(z); -#define SLIMT_ABORT_IF(condition, ...) \ - do { \ - if (condition) { \ - std::cerr << #condition; \ - std::abort(); \ - } \ +#define SLIMT_ABORT_IF(condition, error) \ + do { \ + if (condition) { \ + std::cerr << (error) << '\n'; \ + std::abort(); \ + } \ } while (0) #define SLIMT_ABORT(message) \ @@ -40,4 +41,13 @@ std::abort(); \ } while (0) +#ifdef SLIMT_ENABLE_LOG +#define LOG(level, ...) \ + do { \ + fprintf(stderr, "[%s]", #level); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } while (0) +#else // SLIMT_ENABLE_LOGS #define LOG(...) (void)0 +#endif // SLIMT_ENABLE_LOGS diff --git a/slimt/Model.cc b/slimt/Model.cc index 3f902aa7..6e12396b 100644 --- a/slimt/Model.cc +++ b/slimt/Model.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -56,7 +57,8 @@ Model::Model(const Config &config, const Package &package) processor_(config.split_mode, vocabulary_, Aligned()), transformer_(config.encoder_layers, config.decoder_layers, config.num_heads, config.feed_forward_depth, package.model), - shortlist_generator_(package.shortlist, vocabulary_, vocabulary_) {} + shortlist_generator_(make_shortlist_generator( + package.shortlist, vocabulary_, vocabulary_)) {} Model::Model(const Config &config, const Package &package) : id_(model_id++), @@ -67,7 +69,16 @@ Model::Model(const Config &config, const Package &package) processor_(config.split_mode, vocabulary_, Aligned()), transformer_(config.encoder_layers, config.decoder_layers, config.num_heads, config.feed_forward_depth, view_.model), - shortlist_generator_(view_.shortlist, vocabulary_, vocabulary_) {} + shortlist_generator_(make_shortlist_generator( + view_.shortlist, vocabulary_, vocabulary_)) {} + 
+std::optional Model::make_shortlist_generator( + View view, const Vocabulary &source, const Vocabulary &target) { + if (view.data == nullptr || view.size == 0) { + return std::nullopt; + } + return ShortlistGenerator(view, source, target); +} namespace { void update_alignment(const std::vector &lengths, @@ -102,8 +113,11 @@ Histories Model::decode(const Tensor &encoder_out, const Input &input) const { size_t batch_size = encoder_out.dim(-3); size_t source_sequence_length = encoder_out.dim(-2); - Shortlist shortlist = shortlist_generator_.generate(input.words()); - const Words &indices = shortlist.words(); + std::optional indices = std::nullopt; + if (shortlist_generator_) { + Shortlist shortlist = shortlist_generator_->generate(input.words()); + indices = shortlist.words(); + } // The following can be used to check if shortlist is going wrong. // std::vector indices(vocabulary_.size()); // std::iota(indices.begin(), indices.end(), 0); @@ -132,7 +146,13 @@ Histories Model::decode(const Tensor &encoder_out, const Input &input) const { auto [logits, attn] = decoder.step(encoder_out, input.mask(), states, previous_slice, indices); - previous_slice = greedy_sample(logits, indices, batch_size); + if (indices) { + previous_slice = + greedy_sample_from_words(logits, vocabulary_, *indices, batch_size); + } else { + previous_slice = greedy_sample(logits, vocabulary_, batch_size); + } + update_alignment(input.lengths(), complete, attn, alignments); record(previous_slice, sentences); @@ -141,7 +161,12 @@ Histories Model::decode(const Tensor &encoder_out, const Input &input) const { for (size_t i = 1; i < max_seq_length && remaining > 0; i++) { auto [logits, attn] = decoder.step(encoder_out, input.mask(), states, previous_slice, indices); - previous_slice = greedy_sample(logits, indices, batch_size); + if (indices) { + previous_slice = + greedy_sample_from_words(logits, vocabulary_, *indices, batch_size); + } else { + previous_slice = greedy_sample(logits, vocabulary_, 
batch_size); + } update_alignment(input.lengths(), complete, attn, alignments); remaining = record(previous_slice, sentences); } @@ -204,6 +229,19 @@ Model::Config base() { // NOLINTEND return config; } + +Model::Config nano() { + // NOLINTBEGIN + Model::Config config{ + .encoder_layers = 4, // + .decoder_layers = 2, // + .feed_forward_depth = 2, // + .num_heads = 8, // + .split_mode = "sentence" // + }; + // NOLINTEND + return config; +} } // namespace preset } // namespace slimt diff --git a/slimt/Model.hh b/slimt/Model.hh index e7f04533..4e83fbcd 100644 --- a/slimt/Model.hh +++ b/slimt/Model.hh @@ -60,13 +60,16 @@ class SLIMT_EXPORT Model { const TextProcessor &processor() const { return processor_; } const Transformer &transformer() const { return transformer_; } size_t id() const { return id_; } // NOLINT - const ShortlistGenerator &shortlist_generator() const { + const std::optional &shortlist_generator() const { return shortlist_generator_; } private: Histories decode(const Tensor &encoder_out, const Input &input) const; + static std::optional make_shortlist_generator( + View view, const Vocabulary &source, const Vocabulary &target); + size_t id_; Config config_; using Mmap = Package; @@ -76,12 +79,13 @@ class SLIMT_EXPORT Model { Vocabulary vocabulary_; TextProcessor processor_; Transformer transformer_; - ShortlistGenerator shortlist_generator_; + std::optional shortlist_generator_; }; namespace preset { SLIMT_EXPORT Model::Config tiny(); SLIMT_EXPORT Model::Config base(); +SLIMT_EXPORT Model::Config nano(); } // namespace preset } // namespace slimt diff --git a/slimt/Modules.cc b/slimt/Modules.cc index db224bbd..abd6c6d9 100644 --- a/slimt/Modules.cc +++ b/slimt/Modules.cc @@ -143,7 +143,7 @@ Tensor join_heads(const Tensor &x) { } Tensor affine(const Affine ¶meters, const Tensor &x, - const std::string &name = "") { + const std::string &name /* = ""*/) { Tensor y = qmm::affine( // x, // parameters.W, parameters.b, // @@ -211,33 +211,25 @@ Tensor 
SSRU::forward(Tensor &state, const Tensor &x) const { // Wx(t) is a linear operation (it's a linear transform). // Wfx(t) + bf is an affine transform. - // f(t) = σ(Wt . x(t) + bf ) - Tensor &c = state; // Load context from saved-state. - Tensor f_out = affine(F_, x, "rnn_f"); // Forget gate? - Tensor f = sigmoid(f_out); - - // c(t) = f(t) ⊙ c(t−1) + (1 − ft) ⊙ Wx(t) + // Forward parameter multiplications. + Tensor f = affine(F_, x, "rnn_f"); // Forget gate? NOLINT Tensor Wxt = linear(O_, x, "rnn_o"); // NOLINT - Tensor ones = f.like("ones"); - ones.fill_in_place(1.0F); - - Tensor g = sub(ones, f); - Tensor c_arg1 = mul(f, c); - Tensor c_arg2 = mul(g, Wxt); - Tensor c_next = add(c_arg1, c_arg2); + // https://github.com/browsermt/marian-dev/blob/77e886ae7ae6016981c6307c312650bf74b50487/src/rnn/cells.h#L1058 + // c(t) = f(t) ⊙ c(t−1) + (1 − ft) ⊙ Wx(t) + // Tensor c_t = highway(c, f, Wxt); + Tensor c_t = highway(c, Wxt, f); + // https://github.com/browsermt/marian-dev/blob/77e886ae7ae6016981c6307c312650bf74b50487/src/rnn/cells.h#L1059 // y(t) = ReLU(c(t)); - Tensor y = relu(c_next); + Tensor y = relu(c_t); // h(t) = α LayerNorm(y(t) + x(t)) + β - Tensor o = add(x, y); - - Tensor h = ln_.forward(o); + Tensor h = ln_.forward(x + y); - state = std::move(c_next); + state = std::move(c_t); return h; } @@ -288,25 +280,14 @@ Tensor FFN::forward(const Tensor &x) const { } Tensor LayerNorm::forward(const Tensor &x) const { - Tensor y = x.like("ln_out"); - size_t cols = x.dim(-1); - size_t rows = x.size() / cols; - - // Currently this is hardcoded. - // Not sure how to do it otherwise. - constexpr float kEps = 1e-9; - - layer_norm(x.data(), // - scale_.data(), bias_.data(), // - kEps, rows, cols, y.data()); - + Tensor y = layer_norm(x, scale_, bias_); return y; } std::tuple Attention::forward(const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &mask) const { - // We have a B x T x H sequence comoing in, for q, k and v. 
+ // We have a B x T x H sequence coming in, for q, k and v. Tensor yq = affine(Q_, q, "q"); Tensor yk = affine(K_, k, "k"); Tensor yv = affine(V_, v, "v"); diff --git a/slimt/Modules.hh b/slimt/Modules.hh index 17966701..243fe271 100644 --- a/slimt/Modules.hh +++ b/slimt/Modules.hh @@ -106,4 +106,7 @@ Tensor affine_with_select(const Affine ¶meters, const Tensor &x, const std::vector &indices, const std::string &name = ""); +Tensor affine(const Affine ¶meters, const Tensor &x, + const std::string &name = ""); + } // namespace slimt diff --git a/slimt/Shortlist.cc b/slimt/Shortlist.cc index 17426315..a7a3cc28 100644 --- a/slimt/Shortlist.cc +++ b/slimt/Shortlist.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -19,14 +20,19 @@ bool ShortlistGenerator::content_check() { fail_flag |= word_to_offset_[i] >= shortlist_size_; } + SLIMT_ABORT_IF(fail_flag, "Error: offset table not within shortlist size."); + // The last element of word_to_offset_ must equal shortlist_size_ - fail_flag |= word_to_offset_[word_to_offset_size_ - 1] != - shortlist_size_; // The vocabulary indices have to be within - // the vocabulary size. + fail_flag |= word_to_offset_[word_to_offset_size_ - 1] != shortlist_size_; + + SLIMT_ABORT_IF(fail_flag, "Error: word_to_offset != shortlist_size"); + + // The vocabulary indices have to be within the vocabulary size. 
size_t v_size = target_.size(); for (size_t j = 0; j < shortlist_size_; j++) { fail_flag |= shortlist_[j] >= v_size; } + SLIMT_ABORT_IF(fail_flag, "Error: shortlist indices are out of bounds"); return fail_flag; } @@ -41,7 +47,8 @@ void ShortlistGenerator::load(const void* data, size_t blob_size, */ (void)blob_size; SLIMT_ABORT_IF(blob_size < sizeof(Header), - "Shortlist length {} too short to have a header", blob_size); + "Shortlist length too short to have a header: " + + std::to_string(blob_size)); const char* ptr = static_cast(data); const Header& header = *reinterpret_cast(ptr); @@ -51,10 +58,11 @@ void ShortlistGenerator::load(const void* data, size_t blob_size, uint64_t expected_size = sizeof(Header) + header.word_to_offset_size * sizeof(uint64_t) + header.shortlist_size * sizeof(Word); - SLIMT_ABORT_IF( - expected_size != blob_size, - "Shortlist header claims file size should be {} but file is {}", - expected_size, blob_size); + + SLIMT_ABORT_IF(expected_size != blob_size, + "Shortlist header claims file size should be " + + std::to_string(expected_size) + " but file is " + + std::to_string(blob_size)); if (check) { size_t length = ( // @@ -71,7 +79,7 @@ void ShortlistGenerator::load(const void* data, size_t blob_size, frequent_ = header.frequent; best_ = header.best; - LOG(info, "[data] Lexical short list frequent {} and best {}", frequent_, + LOG(info, "[data] Lexical short list frequent %lu and best %lu", frequent_, best_); word_to_offset_size_ = header.word_to_offset_size; @@ -91,14 +99,14 @@ void ShortlistGenerator::load(const void* data, size_t blob_size, ShortlistGenerator::ShortlistGenerator( // View view, // - Vocabulary& source, Vocabulary& target, // + const Vocabulary& source, const Vocabulary& target, // size_t source_index /*= 0*/, size_t /*target_index = 1*/, // bool shared /*= false*/, bool check /*= true*/) : source_(source), target_(target), source_index_(source_index), shared_(shared) { - LOG(info, "[data] Loading binary shortlist from 
buffer with check={}", check); + LOG(info, "[data] Loading binary shortlist from buffer with check=%d", check); load(view.data, view.size, check); (void)source_index_; diff --git a/slimt/Shortlist.hh b/slimt/Shortlist.hh index 946e5c18..7667e67d 100644 --- a/slimt/Shortlist.hh +++ b/slimt/Shortlist.hh @@ -45,17 +45,17 @@ class ShortlistGenerator { // construct directly from buffer ShortlistGenerator( - View view, // - Vocabulary& source, Vocabulary& target, // + View view, // + const Vocabulary& source, const Vocabulary& target, // size_t source_index = 0, size_t /*target_indx=*/ = 1, bool shared = false, // Kept there for backward compatibility - bool check = true); + bool check = false); Shortlist generate(const Words& words) const; private: - Vocabulary& source_; - Vocabulary& target_; + const Vocabulary& source_; + const Vocabulary& target_; size_t source_index_; bool shared_{false}; diff --git a/slimt/Tensor.cc b/slimt/Tensor.cc index 07470fb2..6d3ec192 100644 --- a/slimt/Tensor.cc +++ b/slimt/Tensor.cc @@ -1,18 +1,15 @@ #include "slimt/Tensor.hh" #include -#include #include #include #include #include -#include #include #include #include #include "slimt/Aligned.hh" -#include "slimt/Macros.hh" #include "slimt/TensorOps.hh" #include "slimt/Types.hh" #include "slimt/Utils.hh" @@ -197,41 +194,32 @@ std::ostream &operator<<(std::ostream &out, const Tensor &tensor) { bool operator==(const Tensor &lhs, const Tensor &rhs) { // Can't always rely on size, because sometimes we do aligned loads. So // something that is 256 bytes could only be 16 bytes w.r.t actual elements. + // This disables the below option. 
// if (lhs.view_.size != rhs.view_.size) return false; + if (lhs.type() != rhs.type()) return false; if (lhs.shape() != rhs.shape()) return false; - const void *lhs_ptr = lhs.data(); - const void *rhs_ptr = rhs.data(); - auto message = [&](size_t position, auto l, auto r, float eps) { - std::cerr << lhs.name() << " and " << rhs.name(); - std::cerr << "(" << to_string(lhs.type()) << ")"; - std::cerr << "\n differs at position " << position << ": "; - std::cerr << "[" << std::scientific << l << "] "; - std::cerr << "[" << std::scientific << r << "] "; - std::cerr << "\n Δ: " << eps << " | \nbit: "; - std::bitset<32> bl(l), br(r); // NOLINT - std::cerr << "\n " << bl << "\n " << br << "\n"; - }; - - // Special cause for float32. - // Can use this when suspect inconsistent values. + // Special case so we can check floating point. const char *env_eps = std::getenv("SLIMT_EPS"); if (env_eps != nullptr and lhs.type() == Type::f32) { // NOLINT + float eps = std::stof(env_eps); + + // Compute MSE and check. + float error = mse(lhs, rhs); + if (error > eps) { + return false; + } + + // Compute individual distances. size_t size = lhs.size(); const auto *l = lhs.data(); const auto *r = rhs.data(); - float eps = std::stof(env_eps); - - SLIMT_TRACE(mse(lhs, rhs)); for (size_t i = 0; i < size; i++) { float diff = std::abs(*l - *r); if (diff > eps) { - SLIMT_TRACE2(diff, eps); - int *il = (int *)l; // NOLINT - int *ir = (int *)r; // NOLINT - message(i, *il, *ir, diff); + // SLIMT_TRACE2(diff, eps); return false; } ++l, ++r; @@ -239,21 +227,12 @@ bool operator==(const Tensor &lhs, const Tensor &rhs) { return true; } + // Byte comparisons. + const void *lhs_ptr = lhs.data(); + const void *rhs_ptr = rhs.data(); size_t size_in_memory = std::min(lhs.view().size, rhs.view().size); int retval = memcmp(lhs_ptr, rhs_ptr, size_in_memory); - // -1, 0 +1 if < = > respectively C-API, so. 
bool eq = (retval == 0); - if (not eq) { - const auto *l = lhs.data(); - const auto *r = rhs.data(); - for (size_t i = 0; i < size_in_memory; i++) { - float nan = std::numeric_limits::quiet_NaN(); - if (*l != *r) { - message(i, int(*l), int(*r), nan); // NOLINT - } - ++l, ++r; - } - } return eq; } diff --git a/slimt/TensorOps.cc b/slimt/TensorOps.cc index 950ed893..1c1029c9 100644 --- a/slimt/TensorOps.cc +++ b/slimt/TensorOps.cc @@ -30,6 +30,11 @@ extern "C" { namespace slimt { +inline float sigmoid(float x) { + return x > 0 ? (1.0F / (1.0F + std::exp(-x))) + : (std::exp(x) / (1.0F + std::exp(x))); +} + Tensor index_select(const Tensor& x, const Tensor& indices, const std::string& name /*= "selected"*/) { uint64_t sequence_length = indices.dim(-1); @@ -215,8 +220,7 @@ void sigmoid(const float* a, size_t size, float* c) { #endif for (size_t i = 0; i < size; i++) { - float x = std::exp(a[i]); - c[i] = x / (1 + x); + c[i] = sigmoid(a[i]); } } @@ -639,4 +643,42 @@ Tensor mul(const Tensor& x, const Tensor& y) { return x_plus_y; } +Tensor layer_norm(const Tensor& x, const Tensor& scale, const Tensor& bias, + float EPS /*= 1e-6F*/) { + Tensor y = x.like("ln_out"); + size_t cols = x.dim(-1); + size_t rows = x.size() / cols; + + layer_norm(x.data(), // + scale.data(), bias.data(), // + EPS, rows, cols, y.data()); + return y; +} + +Tensor operator+(const Tensor& x, const Tensor& y) { return add(x, y); } +Tensor operator-(const Tensor& x, const Tensor& y) { return sub(x, y); } +Tensor operator*(const Tensor& x, const Tensor& y) { return mul(x, y); } + +Tensor highway(const Tensor& x, const Tensor& y, const Tensor& g) { + // f(t) = σ(Wt . 
x(t) + bf ) + Tensor c_t = x.like("highway_out"); + + assert(x.size() == y.size()); + assert(y.size() == g.size()); + const auto* tx = x.data(); + const auto* ty = y.data(); + const auto* tg = g.data(); + auto* out = c_t.data(); + size_t size = x.size(); + + for (size_t i = 0; i < size; i++) { + float sg = sigmoid(tg[i]); + float vx = tx[i]; + float vy = ty[i]; + out[i] = sg * vx + (1.0F - sg) * vy; + } + + return c_t; +} + } // namespace slimt diff --git a/slimt/TensorOps.hh b/slimt/TensorOps.hh index 25c3c48b..80702938 100644 --- a/slimt/TensorOps.hh +++ b/slimt/TensorOps.hh @@ -60,6 +60,14 @@ Tensor add(const Tensor& x, const Tensor& y); Tensor sub(const Tensor& x, const Tensor& y); Tensor mul(const Tensor& x, const Tensor& y); +Tensor operator+(const Tensor& x, const Tensor& y); +Tensor operator-(const Tensor& x, const Tensor& y); +Tensor operator*(const Tensor& x, const Tensor& y); + +Tensor layer_norm(const Tensor& x, const Tensor& scale, const Tensor& bias, + float EPS = 1e-6F); // NOLINT + Tensor fast_select(Tensor& source, const std::vector& indices); +Tensor highway(const Tensor& x, const Tensor& y, const Tensor& g); } // namespace slimt diff --git a/slimt/TextProcessor.cc b/slimt/TextProcessor.cc index 07a0ec2b..23dce347 100644 --- a/slimt/TextProcessor.cc +++ b/slimt/TextProcessor.cc @@ -41,8 +41,8 @@ Splitter load_splitter(const std::string &prefix_path) { // prefix_path Splitter splitter; if (!prefix_path.empty()) { - LOG(info, "Loading protected prefixes for sentence splitting from {}", - prefix_path); + LOG(info, "Loading protected prefixes for sentence splitting from %s", + prefix_path.c_str()); splitter.load(prefix_path); } else { LOG(warn, diff --git a/slimt/Transformer.cc b/slimt/Transformer.cc index 9248bca7..e5e8a7fe 100644 --- a/slimt/Transformer.cc +++ b/slimt/Transformer.cc @@ -3,7 +3,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -14,6 +16,8 @@ #include "slimt/Tensor.hh" #include 
"slimt/TensorOps.hh" #include "slimt/Types.hh" +#include "slimt/Utils.hh" +#include "slimt/Vocabulary.hh" namespace slimt { @@ -101,6 +105,9 @@ void Decoder::register_parameters(const std::string &prefix, ParameterMap ¶meters) { // Somehow we have historically ended up with `none_QuantMultA` being used for // Wemb_QuantMultA. + // https://github.com/browsermt/marian-dev/blob/2be8344fcf2776fb43a7376284067164674cbfaf/scripts/alphas/extract_stats.py#L55 + // - none_QuantMultA is generated when used with shortlist + // - Wemb_QuantMultA is generated when used without shortlist. parameters.emplace("Wemb_intgemm8", &output_.W); parameters.emplace("none_QuantMultA", &output_.quant); parameters.emplace("decoder_ff_logit_out_b", &output_.b); @@ -110,11 +117,9 @@ void Decoder::register_parameters(const std::string &prefix, } } -std::tuple Decoder::step(const Tensor &encoder_out, - const Tensor &mask, - std::vector &states, - const Words &previous_step, - const Words &shortlist) const { +std::tuple Decoder::step( + const Tensor &encoder_out, const Tensor &mask, std::vector &states, + const Words &previous_step, const std::optional &shortlist) const { // Infer batch-size from encoder_out. 
size_t encoder_feature_dim = encoder_out.dim(-1); size_t source_sequence_length = encoder_out.dim(-2); @@ -168,7 +173,12 @@ std::tuple Decoder::step(const Tensor &encoder_out, } } - Tensor logits = affine_with_select(output_, x, shortlist, "logits"); + if (shortlist) { + Tensor logits = affine_with_select(output_, x, *shortlist, "logits"); + return {std::move(logits), std::move(guided_alignment)}; + } + + Tensor logits = affine(output_, x, "logits"); return {std::move(logits), std::move(guided_alignment)}; } @@ -221,15 +231,96 @@ void Transformer::register_parameters(const std::string &prefix, decoder_.register_parameters(prefix, parameters); } -Words greedy_sample(const Tensor &logits, const Words &words, +namespace { + +template +void topk_inspect(size_t batch_id, const Vocabulary &vocabulary, T *begin, + T *end, size_t k) { + const T *data = begin; + size_t size = end - begin; + + std::vector ordering = argsort(begin, end); + fprintf(stderr, "batch %zu | ", batch_id); + Words words(size + 1, vocabulary.eos_id()); + for (size_t i = 0; i < k; i++) { + size_t j = size - i - 1; + words[i] = ordering[j]; + std::string decoded; + vocabulary.decode({words[i], vocabulary.eos_id()}, decoded); + fprintf(stderr, "%s (%zu, %.9g) ", decoded.c_str(), ordering[j], + data[ordering[j]]); + } + fprintf(stderr, "\n"); +} + +template +void topk_inspect_with_words(size_t batch_id, const Vocabulary &vocabulary, + const Words &shortlist, T *begin, T *end, + size_t k) { + const T *data = begin; + size_t size = end - begin; + + std::vector ordering = argsort(begin, end); + fprintf(stderr, "batch %zu | ", batch_id); + Words words(size + 1, vocabulary.eos_id()); + for (size_t i = 0; i < k; i++) { + size_t j = size - i - 1; + words[i] = shortlist[ordering[j]]; + std::string decoded; + vocabulary.decode({words[i], vocabulary.eos_id()}, decoded); + fprintf(stderr, "%s (%zu, %.9g) ", decoded.c_str(), ordering[j], + data[ordering[j]]); + } + fprintf(stderr, "\n"); +} + +} // namespace + +Words 
greedy_sample(const Tensor &logits, const Vocabulary &vocabulary, size_t batch_size) { Words sampled_words; + size_t stride = vocabulary.size(); for (size_t i = 0; i < batch_size; i++) { const auto *data = logits.data(); - size_t max_index = 0; - float max_value = data[0]; - size_t stride = words.size(); - for (size_t cls = 1; cls < stride; cls++) { + + // Initialize: 0 + size_t cls = 0; + size_t max_index = cls; + float max_value = data[i * stride + cls]; + + for (cls = 1; cls < stride; cls++) { + float value = data[i * stride + cls]; + if (value > max_value) { + max_index = cls; + max_value = value; + } + } + + sampled_words.push_back(max_index); + if (false) { // NOLINT + constexpr size_t kValue = 5; + topk_inspect(i, vocabulary, data + i * stride, data + (i + 1) * stride, + kValue); + } + } + return sampled_words; +} + +Words greedy_sample_from_words(const Tensor &logits, + const Vocabulary &vocabulary, const Words &words, + size_t batch_size) { + (void)vocabulary; + size_t stride = words.size(); + Words sampled_words; + for (size_t i = 0; i < batch_size; i++) { + const auto *data = logits.data(); + + // Initialize: 0 + size_t cls = 0; + size_t max_index = cls; + float max_value = data[i * stride + cls]; + + for (cls = 1; cls < stride; cls++) { float value = data[i * stride + cls]; if (value > max_value) { max_index = cls; @@ -238,6 +329,11 @@ Words greedy_sample(const Tensor &logits, const Words &words, } sampled_words.push_back(words[max_index]); + if (false) { // NOLINT + constexpr size_t kValue = 5; + topk_inspect_with_words(i, vocabulary, words, data + i * stride, + data + (i + 1) * stride, kValue); + } } return sampled_words; } diff --git a/slimt/Transformer.hh b/slimt/Transformer.hh index 9797ae9d..4b0c0352 100644 --- a/slimt/Transformer.hh +++ b/slimt/Transformer.hh @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include #include @@ -33,7 +34,7 @@ class Decoder { std::tuple step(const Tensor &encoder_out, const Tensor &mask, std::vector 
&states, const Words &previous_step, - const Words &shortlist) const; + const std::optional &shortlist) const; private: const Tensor &embedding_; @@ -41,8 +42,13 @@ class Decoder { Affine output_; }; -Words greedy_sample(const Tensor &logits, const Words &words, +class Vocabulary; + +Words greedy_sample(const Tensor &logits, const Vocabulary &vocabulary, size_t batch_size); +Words greedy_sample_from_words(const Tensor &logits, + const Vocabulary &vocabulary, const Words &words, + size_t batch_size); void transform_embedding(Tensor &word_embedding, size_t start = 0); diff --git a/slimt/Utils.cc b/slimt/Utils.cc index 94124655..2e99904b 100644 --- a/slimt/Utils.cc +++ b/slimt/Utils.cc @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -96,15 +95,6 @@ SLIMT_PRINT_NDARRAY_EXPLICIT(uint32_t); #undef SLIMT_PRINT_NDARRAY_EXPLICIT -std::string checked_fpath() { - const char *blob_path = std::getenv("SLIMT_BLOB_PATH"); - if (not blob_path) { - std::cerr << "SLIMT_BLOB_PATH not define in environment."; - std::exit(EXIT_FAILURE); - } - return std::string(blob_path); -} - namespace { Tensor dispatch_by_type(Type type, const std::string &fpath, const Shape &shape, const std::string &name) { @@ -123,29 +113,6 @@ Tensor dispatch_by_type(Type type, const std::string &fpath, const Shape &shape, } } // namespace -bool Verifier::verify(Tensor &value, const std::string &name) { - auto query = verified_.find(name); - if (query == verified_.end()) { - std::string fpath = blob_path_ + "/" + name; - Tensor expected = - dispatch_by_type(value.type(), fpath, value.shape(), name); - bool flag = (value == expected); - if (flag) { - verified_.emplace(name); - std::cerr << "[ match ] " << value.name() << " and " << name << "\n"; - } else { - std::cerr << "[ no match] " << value.name() << " and " << name << "\n"; - std::cerr << value << "\n"; - std::cerr << expected << "\n"; - std::string msg = "No match for " + value.name() + " and " + name + "."; - throw 
std::runtime_error(msg); - } - return flag; - } - - return true; -} - template std::tuple quantized_tensor_from_file(const std::string &fpath, const Shape &shape, diff --git a/slimt/Utils.hh b/slimt/Utils.hh index 313f5f1e..f76958d0 100644 --- a/slimt/Utils.hh +++ b/slimt/Utils.hh @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -20,28 +21,6 @@ class Shape; std::string checked_fpath(); -class Verifier { - public: - static Verifier &instance() { - static Verifier verifier; - return verifier; - } - bool verify(Tensor &value, const std ::string &name); - - private: - Verifier() : blob_path_(checked_fpath()) {} - std::unordered_set verified_; - std::string blob_path_; -}; - -#define SLIMT_VERIFY_MATCH(value, name) \ - do { \ - const char *flag = std::getenv("SLIMT_TRACE"); \ - if (flag) { \ - (Verifier::instance()).verify(value, name); \ - } \ - } while (0) - template std::string fmt(Printable &printable) { std::stringstream stream; @@ -132,4 +111,22 @@ class AverageMeter { size_t count_ = 0; }; +template +std::vector argsort(const T *begin, const T *end) { + // initialize original index locations + const T *data = begin; + size_t size = end - begin; + std::vector idx(size); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in vs + // using std::stable_sort instead of std::sort + // to avoid unnecessary index re-orderings + // when vs contains elements of equal values + stable_sort(idx.begin(), idx.end(), + [data](size_t i, size_t j) { return data[i] < data[j]; }); + + return idx; +} + } // namespace slimt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 00000000..01cd027d --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,19 @@ +# Tests require generation Add a custom command to be executed during the build +if(SLIMT_GENERATED_UNIT_TESTS) + set(SLIMT_TEST_UNIT "${CMAKE_CURRENT_BINARY_DIR}/generated-units.cc") + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + 
add_custom_command(
+    OUTPUT ${SLIMT_TEST_UNIT}
+    COMMAND
+      "/usr/bin/python3" "${CMAKE_CURRENT_SOURCE_DIR}/generate-units.py" #
+      "--trace" "${CMAKE_SOURCE_DIR}/traces/ml-xlit.trace.txt" #
+      "--output" "${SLIMT_TEST_UNIT}"
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/generate-units.py"
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
+
+  add_executable(slimt_test_units ${SLIMT_TEST_UNIT})
+  target_link_libraries(slimt_test_units PUBLIC slimt)
+  target_include_directories(slimt_test_units
+                             PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+endif()
diff --git a/tests/TestSuite.hh b/tests/TestSuite.hh
new file mode 100644
index 00000000..f17516f7
--- /dev/null
+++ b/tests/TestSuite.hh
@@ -0,0 +1,44 @@
+#pragma once
+#include <iostream>
+
+#include "slimt/TensorOps.hh"
+#include "slimt/Utils.hh"
+#include "slimt/slimt.hh"
+
+#define CHECK_EQUAL(lhs, rhs, fn)            \
+  do {                                       \
+    bool pass = (lhs) == (rhs);              \
+    if (pass) {                              \
+      std::cout << "[PASS]";                 \
+    } else {                                 \
+      std::cout << "[FAIL]";                 \
+    }                                        \
+    std::cout << " " << (fn) << "\n";        \
+    if (!pass) {                             \
+      if (std::getenv("SLIMT_DEBUG")) {      \
+        diagnose((lhs), (rhs));              \
+      }                                      \
+    }                                        \
+  } while (0)
+
+inline std::string blob_path(const std::string &bin) {
+  const char *blob_path = std::getenv("SLIMT_BLOB_PATH");
+  if (not blob_path) {
+    std::cerr << "SLIMT_BLOB_PATH not defined in environment.";
+    std::exit(EXIT_FAILURE);
+  }
+  return std::string(blob_path) + '/' + bin;
+}
+
+inline void diagnose(const slimt::Tensor &lhs, const slimt::Tensor &rhs) {
+  const auto *l = lhs.data<float>();
+  const auto *r = rhs.data<float>();
+  constexpr float kEps = 1e-9;
+  size_t size = lhs.size();
+  for (size_t i = 0; i < size; i++) {
+    if (std::abs(l[i] - r[i]) > kEps) {
+      fprintf(stdout, "values differ at %zu: %.9g %.9g, diff = %.9f\n", i, l[i],
+              r[i], std::abs(l[i] - r[i]));
+    }
+  }
+}
diff --git a/tests/generate-units.py b/tests/generate-units.py
new file mode 100644
index 00000000..1e360852
--- /dev/null
+++ b/tests/generate-units.py
@@ -0,0 +1,183 @@
+import yaml
+import argparse
+import textwrap
+import 
itertools + + +def prod(xs): + accumulator = 1 + for x in xs: + accumulator = accumulator * x + return accumulator + + +class Tensor: + def __init__(self, name, dtype, shape, save): + self.name = name + self.dtype = dtype + self.shape = list(map(int, shape[1:-1].split("x"))) + self.save = save + + def reshape(self, shape): + assert prod(shape) == prod(self.shape) + self.shape = shape + + def load(self): + dims = list(map(str, self.shape)) + shape = "Shape({{{ls}}})".format(ls=", ".join(dims)) + dmap = {"float32": "float", "int8": "int8_t"} + dtype = dmap[self.dtype] + blob_path = f'blob_path("{self.save}")' + return f'tensor_from_file<{dtype}>({blob_path}, {shape}, "{self.name}")' + + +def NoOp(lhs, rhs): + return "" + + +def test(lhs, rhs, slimt_fn): + block = [] + args = ", ".join([arg.name for arg in rhs]) + info = f"{lhs.name} == {slimt_fn}({args})" + block.append(f'std::string info = "{info}";') + block.append(f"Tensor lhs_expected = {lhs.load()};") + for idx, arg in enumerate(rhs): + block.append(f"Tensor rhs_{idx} = {rhs[idx].load()};") + args = ", ".join([f"rhs_{idx}" for idx in range(len(rhs))]) + block.append(f"Tensor lhs_computed = {slimt_fn}({args});") + block.append(f'CHECK_EQUAL(lhs_computed, lhs_expected, "{info}");') + return "{\n" + "\n".join(block) + "\n}" + + +def guard(block): + catch_block = """catch (const std::exception& e) { + // Catching and handling exceptions + std::cerr << "Exception caught: " << e.what() << std::endl; + } + catch (...) 
{
+    // Catching any other unexpected exceptions
+    std::cerr << "Unknown exception caught" << std::endl;
+    }
+    """
+    return f"""try {{ {block} }} {catch_block}"""
+
+
+def ReLU(lhs, rhs):
+    lhs.reshape([prod(lhs.shape)])
+    for arg in rhs:
+        arg.reshape([prod(arg.shape)])
+    return test(lhs, rhs, "relu")
+
+
+def Plus(lhs, rhs):
+    lhs.reshape([prod(lhs.shape)])
+    for arg in rhs:
+        if prod(arg.shape) != prod(lhs.shape):
+            return ""
+        arg.reshape([prod(arg.shape)])
+    return test(lhs, rhs, "add")
+
+
+def Highway(lhs, rhs):
+    lhs.reshape([prod(lhs.shape)])
+    blocks = []
+    for arg in rhs:
+        if prod(arg.shape) != prod(lhs.shape):
+            return ""
+        arg.reshape([prod(arg.shape)])
+    block = test(lhs, rhs, "highway")
+    blocks.append(block)
+    return "\n".join(blocks)
+
+
+def LayerNormalization(lhs, rhs):
+    return test(lhs, rhs, "layer_norm")
+
+
+def Affine(lhs, rhs):
+    return test(lhs, rhs, "affine")
+
+
+def parse(t):
+    """Parse a tensor"""
+    tensor_id = t["id"]
+    save = t["save"]
+
+    name, dtype, shape, *_ = tensor_id.split()
+    return Tensor(name, dtype, shape, save)
+
+
+def emit(op, lhs_info, rhs_info):
+    lhs = parse(lhs_info)
+    rhs = [parse(arg) for arg in rhs_info]
+    return op(lhs, rhs)
+
+
+def Blocks(mapping, data):
+    ls = []
+    for entry in data:
+        fn = entry["fn"]
+        key = ":".join(filter(lambda x: x, fn.split(":")[2:-3]))
+        lhs = entry["lhs"]
+        rhs = entry.get("rhs", [])
+        if key in mapping:
+            op = mapping[key]
+            codeblock = emit(op, lhs, rhs)
+            if codeblock:
+                ls.append(codeblock)
+
+    return ls
+
+
+def main(blocks):
+    blocks = list(map(guard, blocks))
+    return textwrap.dedent(
+        """
+        #include "TestSuite.hh"
+        using namespace slimt; // NOLINT
+        int main(){{
+        {}
+        return 0;
+        }}""".format(
+            "\n\n".join(blocks)
+        )
+    )
+
+
+# Mappings from marian to slimt
+mapping = {
+    # "AffineNodeOp": Affine,
+    "ColsNodeOp": NoOp,
+    "ConstantNode": NoOp,
+    "cpu:integer:AffineNodeOp": NoOp,
+    "cpu:integer:DotNodeOp": NoOp,
+    "cpu:integer:PrepareANodeOp": NoOp,
+    
"cpu:integer:QuantMultNodeOp": NoOp, + "DotBatchedNodeOp": NoOp, + "GatherNodeOp": NoOp, + "HighwayNodeOp": Highway, + "LayerNormalizationOp": LayerNormalization, + "LogSoftmaxNodeOp": NoOp, + "NegNodeOp": NoOp, + "ParamNode": NoOp, + "PlusNodeOp": Plus, + "ReLUNodeOp": ReLU, + "RowsNodeOp": NoOp, + "ScalarAddNodeOp": NoOp, + "ScalarMultNodeOp": NoOp, + "SoftmaxNodeOp": NoOp, + "TransposeNodeOp": NoOp, +} + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--trace", type=str, required=True) + parser.add_argument("--output", type=str, required=True) + args = parser.parse_args() + data = None + with open(args.trace) as fp: + data = yaml.safe_load(fp) + + blocks = Blocks(mapping, data) + with open(args.output, "w") as output: + print(main(blocks), file=output)