From c20abc9db135343b527ee440267e97c006aeb7c0 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 22 Jan 2024 13:16:10 +0530 Subject: [PATCH] Improvements via `t12n` There's some ongoing activity trying to repurpose the models for transliteration. Since the models are small, the authors can eventually end up using them in their day-to-day life, because transliteration is viable to have around (in comparison to translation). The following improvements are added: 1. Allow a non-shortlist path 2. Add `nano` config to put in t12n models. This is subject to change during the course of experimentation. 3. Test-suite reading traces from `marian-dev` and generating unit-tests using a Python script (rudimentary, can improve over the course of development). 4. The following bugfixes a. LayerNorm fixed by adjusting default EPS `1e-9` to `1e-6` b. Fix `limit_factor()` narrowed into `size_t` due to mistyping; longer sequences should work now. c. Fix initialization for greedy decode (was not offset correctly before). **Known Issues** 1. `HighwayForward` error at 1e-6 (1e-7 EPS fails), not satisfactory. Abstract operation to be unit-testable. 2. There are still mismatches in the forward pass to sort out. 
Pull Request: https://github.com/jerinphilip/slimt/pull/47 --- .vscode/launch.json | 34 ++ CMakeLists.txt | 2 + app/CMakeLists.txt | 4 - app/test.cc | 772 ------------------------------------- bindings/python/slimt.cpp | 1 + bindings/python/utils.py | 10 +- scripts/marian-trace-gen.h | 60 +-- scripts/t12n.py | 39 ++ scripts/trace-xlit.sh | 11 + slimt/Batcher.cc | 2 +- slimt/Frontend.cc | 2 +- slimt/HTML.cc | 16 +- slimt/Input.cc | 2 +- slimt/Input.hh | 4 +- slimt/Macros.hh | 22 +- slimt/Model.cc | 50 ++- slimt/Model.hh | 8 +- slimt/Modules.cc | 45 +-- slimt/Modules.hh | 3 + slimt/Shortlist.cc | 30 +- slimt/Shortlist.hh | 10 +- slimt/Tensor.cc | 53 +-- slimt/TensorOps.cc | 46 ++- slimt/TensorOps.hh | 8 + slimt/TextProcessor.cc | 4 +- slimt/Transformer.cc | 118 +++++- slimt/Transformer.hh | 10 +- slimt/Utils.cc | 33 -- slimt/Utils.hh | 41 +- tests/CMakeLists.txt | 19 + tests/TestSuite.hh | 44 +++ tests/generate-units.py | 183 +++++++++ 32 files changed, 695 insertions(+), 991 deletions(-) delete mode 100644 app/test.cc create mode 100644 scripts/t12n.py create mode 100644 scripts/trace-xlit.sh create mode 100644 tests/CMakeLists.txt create mode 100644 tests/TestSuite.hh create mode 100644 tests/generate-units.py diff --git a/.vscode/launch.json b/.vscode/launch.json index 0264bdee..f67cf39a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,5 +1,39 @@ { "configurations": [ + { + "name": "(gdb) t12n", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/env/bin/python3", + "args": [ + "${workspaceFolder}/scripts/t12n.py", + "${workspaceFolder}/../slimt-t12n/outputs/mal-eng/model.nano.npz.decoder.yml", + "<", + "${workspaceFolder}/data/ml-xlit.txt" + ], + "stopAtEntry": false, + "cwd": "${fileDirname}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": 
"Set Disassembly Flavor to Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + }, + { + "description": "Enable following into child for debugging.", + "text": "set follow-fork-mode child", + "ignoreFailures": true + } + ] + }, { "name": "(gdb) test", "type": "cppdbg", diff --git a/CMakeLists.txt b/CMakeLists.txt index 64cdf8ea..5da0136c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(BUILD_STATIC "Build static libraries" ON) option(SLIMT_PACKAGE "Package for cmake, pkgconfig" OFF) option(SLIMT_PYTHON_LINK_STATIC "link-method to produce python package (static/shared)" ON) +option(SLIMT_GENERATED_UNIT_TESTS "Generate unit tests to run using Python" OFF) include(MacroEnsureOutOfSourceBuild) macro_ensure_out_of_source_build( @@ -208,6 +209,7 @@ endif(UNIX) add_subdirectory(slimt) add_subdirectory(app) +add_subdirectory(tests) if(BUILD_PYTHON) if(USE_PYBIND11_SOURCE) diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 22878169..21009d3a 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -2,10 +2,6 @@ add_executable(slimt_cli main.cc) set_target_properties(slimt_cli PROPERTIES OUTPUT_NAME "slimt-cli") target_link_libraries(slimt_cli PUBLIC slimt) -add_executable(slimt_test test.cc) -set_target_properties(slimt_test PROPERTIES OUTPUT_NAME "slimt-test") -target_link_libraries(slimt_test PUBLIC slimt) - set(SLIMT_BINARIES slimt_cli) if(UNIX) diff --git a/app/test.cc b/app/test.cc deleted file mode 100644 index 73dfee25..00000000 --- a/app/test.cc +++ /dev/null @@ -1,772 +0,0 @@ -// NOLINTBEGIN -#include -#include -#include -#include - -// Public headers. -#include "slimt/slimt.hh" - -// Private headers, required for partial testing. 
-#include "slimt/Macros.hh" -#include "slimt/TensorOps.hh" -#include "slimt/Utils.hh" - -namespace slimt { - -#define SLIMT_CHECK(condition) \ - do { \ - if (!(condition)) { \ - fprintf(stderr, "%s:%d %s failed\n", __FILE__, __LINE__, (#condition)); \ - throw std::runtime_error("Failed test"); \ - } \ - fprintf(stderr, "%s:%d %s success\n", __FILE__, __LINE__, (#condition)); \ - } while (0) - -static const std::string kBlobPath = checked_fpath(); - -namespace { - -std::string prefix(const std::string &fname) { return kBlobPath + "/" + fname; } - -template -Tensor tf(const std::string &path, Args &&...args) { - return tensor_from_file(prefix(path), std::forward(args)...); -} - -template -std::tuple qtf(const std::string &path, Args &&...args) { - return quantized_tensor_from_file(prefix(path), - std::forward(args)...); -} - -} // namespace - // - // - -void load() { - std::string fname = "RowsNodeOp-float32_8x256-rhs1-uint32_8_data_0.bin"; - Tensor x = tf((fname), Shape({8}), "rhs1"); - auto *data = x.data(); - // std::cout << x << "\n"; - - float begin = *data; - float rbegin = *(data + (x.size() - 1)); - SLIMT_CHECK(begin == 39); - SLIMT_CHECK(rbegin == 0); -} - -struct OpArgs { - std::string lhs; - std::vector rhs; -}; - -void ScalarMultNodeOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_unary.h" - // line: 100 - // fn: "marian::ScalarMultNodeOp::forwardOps()::" - // op: { Element(_1 = scalar_ * _2, val_, child(0)->val()) } - // before: var_5 float32 [4x2x256] - // after: var_5 float32 [4x2x256] ScalarMultNodeOp-float32_4x2x256-lhs.bin - // operands: - // - var_3 float32 [4x2x256] ScalarMultNodeOp-float32_4x2x256-rhs0-float32_4x2x256.bin - // clang-format on - - OpArgs args{ - .lhs = "ScalarMultNodeOp-float32_4x2x256-lhs.bin", // - .rhs = {"ScalarMultNodeOp-float32_4x2x256-rhs0-float32_4x2x256.bin"} // - }; - - Shape shape({4, 2, 256}); - Tensor lhs = tf((args.lhs), shape, "lhs"); - 
Tensor rhs = tf((args.rhs[0]), Shape({4, 2, 256}), "rhs"); - - Tensor lhs_computed(lhs.type(), lhs.shape(), "lhs_computed"); - - float embedding_dim_sqrt = std::sqrt(256.0F); - mul_scalar(rhs.data(), embedding_dim_sqrt, rhs.size(), - lhs_computed.data()); - - // std::cout << rhs << "\n"; - // std::cout << lhs << "\n" << lhs_computed << "\n"; - - SLIMT_CHECK(lhs_computed == lhs); -} - -void RowsNodeOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_binary.h" - // line: 672 - // fn: "marian::RowsNodeOp::forwardOps()::" - // op: { CopyRows(val_, child(0)->val(), child(1)->val()) } - // before: var_2 float32 [8x256] - // after: var_2 float32 [8x256] RowsNodeOp-float32_8x256-lhs.bin - // operands: - // - var_0 float32 [32000x256] RowsNodeOp-float32_8x256-rhs0-float32_32000x256.bin - // - var_1 uint32 [8] RowsNodeOp-float32_8x256-rhs1-uint32_8.bin - // clang-format on - - OpArgs args{ - .lhs = "RowsNodeOp-float32_8x256-lhs.bin", // - .rhs = - { - "RowsNodeOp-float32_8x256-rhs0-float32_32000x256_Wemb.bin", // - "RowsNodeOp-float32_8x256-rhs1-uint32_8_data_0.bin" // - } // - }; - - // Shape projected to 1 x 8 to match. 
- Tensor lhs = tf((args.lhs), Shape({1, 8, 256}), "lhs"); - // std::cout << "\n" << lhs << std::endl; - - Tensor rhs0 = tf((args.rhs[0]), Shape({32000, 256}), "rhs0"); - - // std::cout << rhs0 << std::endl; - - Tensor rhs1 = tf((args.rhs[1]), Shape({1, 8}), "rhs1"); - - // std::cout << rhs1 << std::endl; - - Tensor lhs_computed = index_select(rhs0, rhs1); - SLIMT_CHECK(lhs_computed == lhs); -} - -void DotBatchedNodeOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_binary.h" - // line: 424 - // fn: "marian::DotBatchedNodeOp::forwardOps()::" - // op: { ProdBatched(val_, graph()->allocator(), child(0)->val(), child(1)->val(), transA_, transB_, 0.f, scalar_) } - // before: var_44 float32 [2x8x4x4] - // after: var_44 float32 [2x8x4x4] DotBatchedNodeOp-float32_2x8x4x4-lhs.bin - // operands: - // - var_25 float32 [2x8x4x32] DotBatchedNodeOp-float32_2x8x4x4-rhs0-float32_2x8x4x32.bin - // - var_34 float32 [2x8x4x32] DotBatchedNodeOp-float32_2x8x4x4-rhs1-float32_2x8x4x32.bin - // clang-format on - - OpArgs args{ - .lhs = "DotBatchedNodeOp-float32_2x8x4x4-lhs.bin", // - .rhs = - { - "DotBatchedNodeOp-float32_2x8x4x4-rhs0-float32_2x8x4x32.bin", // - "DotBatchedNodeOp-float32_2x8x4x4-rhs1-float32_2x8x4x32.bin" // - } // - }; - - // std::cout << "\n"; - - size_t batch_size = 2; - size_t sequence_length = 4; - size_t num_heads = 8; - size_t dim_head = 32; - - size_t k = 2; - size_t h = num_heads / k; - Shape lhs_shape({k, batch_size * sequence_length, h, h}); - - Shape rhs_shape({k, batch_size * sequence_length, h, dim_head}); - Tensor lhs = tf((args.lhs), lhs_shape, "lhs"); - // std::cout << lhs << std::endl; - - Tensor rhs0 = tf((args.rhs[0]), rhs_shape, "rhs0"); - // std::cout << rhs0 << std::endl; - - Tensor rhs1 = tf((args.rhs[1]), rhs_shape, "rhs1"); - // std::cout << rhs1 << std::endl; - - // clang-format off - // op: { ProdBatched(val_, graph()->allocator(), child(0)->val(), child(1)->val(), 
transA_, transB_, 0.f, scalar_) } - // false true 0.176776692 - // clang-format on - - size_t bsz = batch_size * sequence_length * k; - Tensor lhs_computed(lhs.type(), lhs.shape(), "lhs_computed"); - batch_matrix_multiply( // - rhs0.data(), rhs1.data(), // - bsz, h, dim_head, h, dim_head, // - /*trans_a=*/false, /*trans_b=*/true, // - /*alpha =*/0.176776692, // - lhs_computed.data()); - - // std::cout << lhs << std::endl; - // std::cout << lhs_computed << std::endl; - SLIMT_CHECK(lhs_computed == lhs); -} - -void TransposeNodeOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_unary.h" - // line: 747 - // fn: "marian::TransposeNodeOp::forwardOps()::" - // op: { TransposeND(val_, child(0)->val(), axes_) } - // before: var_10 float32 [1x2x4x256] - // after: var_10 float32 [1x2x4x256] TransposeNodeOp-float32_1x2x4x256-lhs.bin - // operands: - // - var_8 float32 [1x4x2x256] TransposeNodeOp-float32_1x2x4x256-rhs0-float32_1x4x2x256.bin - // clang-format on - - OpArgs args{ - .lhs = "TransposeNodeOp-float32_1x2x4x256-lhs.bin", - .rhs = {"TransposeNodeOp-float32_1x2x4x256-rhs0-float32_1x4x2x256.bin"}}; - - Shape lhs_shape({1, 2, 4, 256}); - Tensor lhs = tf((args.lhs), lhs_shape, "lhs"); - - Shape rhs_shape = lhs_shape.transpose(-1, -2); - Tensor rhs = tf((args.rhs[0]), rhs_shape, "rhs"); - - Tensor lhs_expected(lhs.type(), lhs.shape(), "lhs_expected"); - transpose_3120(rhs.data(), 1, 4, 2, 256, lhs_expected.data()); - - SLIMT_TRACE(lhs); - SLIMT_TRACE(lhs_expected); - SLIMT_CHECK(lhs == lhs_expected); -} - -void LayerNormalizationOp() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/graph/node_operators_binary.h" - // line: 1210 - // fn: "marian::LayerNormalizationOp::forwardOps()::" - // op: { LayerNormalization(val_, child(0)->val(), child(1)->val(), (children_.size() == 3) ? 
child(2)->val() : nullptr, eps_) } - // before: var_60 float32 [1x2x4x256] - // after: var_60 float32 [1x2x4x256] LayerNormalizationOp-float32_1x2x4x256-lhs.bin - // operands: - // - var_57 float32 [1x2x4x256] LayerNormalizationOp-float32_1x2x4x256-rhs0-float32_1x2x4x256.bin - // - var_58 float32 [1x256] F0::encoder_l1_self_Wo_ln_scale LayerNormalizationOp-float32_1x2x4x256-rhs1-float32_1x256_encoder_l1_self_Wo_ln_scale.bin - // - var_59 float32 [1x256] F0::encoder_l1_self_Wo_ln_bias LayerNormalizationOp-float32_1x2x4x256-rhs2-float32_1x256_encoder_l1_self_Wo_ln_bias.bin - // clang-format on - OpArgs args{ - .lhs = "LayerNormalizationOp-float32_1x2x4x256-lhs.bin", - // clang-format off - .rhs = { - "LayerNormalizationOp-float32_1x2x4x256-rhs0-float32_1x2x4x256.bin", - "LayerNormalizationOp-float32_1x2x4x256-rhs1-float32_1x256_encoder_l1_self_Wo_ln_scale.bin", - "LayerNormalizationOp-float32_1x2x4x256-rhs2-float32_1x256_encoder_l1_self_Wo_ln_bias.bin" - } - // clang-format on - }; - - Shape lhs_shape({1, 2, 4, 256}); - Tensor lhs = tf((args.lhs), lhs_shape, "lhs"); - - Tensor rhs0 = tf((args.rhs[0]), lhs_shape, "rhs0"); - - Shape ln_shape({1, 256}); - Tensor rhs1 = tf((args.rhs[1]), ln_shape, "rhs1"); - Tensor rhs2 = tf((args.rhs[2]), ln_shape, "rhs2"); - - Tensor lhs_expected(lhs.type(), lhs.shape(), "lhs_expected"); - constexpr float kEps = 1e-9; - size_t rows = 1 * 2 * 4; - size_t cols = 256; - - layer_norm(rhs0.data(), rhs1.data(), rhs2.data(), kEps, - rows, cols, lhs_expected.data()); - - SLIMT_TRACE(lhs); - SLIMT_TRACE(lhs_expected); - SLIMT_CHECK(lhs == lhs_expected); -} -} // namespace slimt - -#ifdef HAS_INTGEMM -#include "3rd-party/intgemm/intgemm/intgemm.h" -namespace slimt { - -void AffineIntgemm() { - // clang-format off - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/integer_common.h" - // line: 55 - // fn: "marian::cpu::integer::fetchAlphaFromModelNodeOp::forwardOps()::" - // op: { fetchAlpha() } - // before: 
var_19 float32 [1] F0::encoder_l1_self_Wq_QuantMultA - // after: var_19 float32 [1] F0::encoder_l1_self_Wq_QuantMultA cpu-float32_1_encoder_l1_self_Wq_QuantMultA-lhs.bin - // operands: - // - var_17 intgemm8 [256x256] F0::encoder_l1_self_Wq cpu-float32_1_encoder_l1_self_Wq_QuantMultA-rhs0-intgemm8_256x256_encoder_l1_self_Wq.bin - // - // - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/intgemm_interface.h" - // line: 60 - // fn: "marian::cpu::integer::PrepareANodeOp::forwardOps()::" - // op: { PrepareA() } - // before: var_20 int8 [1x2x4x256] none_shifted - // after: var_20 int8 [1x2x4x256] none_shifted cpu-int8_1x2x4x256_none_shifted-lhs.bin - // operands: - // - var_10 float32 [1x2x4x256] cpu-int8_1x2x4x256_none_shifted-rhs0-float32_1x2x4x256.bin - // - var_19 float32 [1] F0::encoder_l1_self_Wq_QuantMultA cpu-int8_1x2x4x256_none_shifted-rhs1-float32_1_encoder_l1_self_Wq_QuantMultA.bin - // - // - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/intgemm_interface.h" - // line: 285 - // fn: "marian::cpu::integer::QuantMultNodeOp::forwardOps()::" - // op: { QuantMult() } - // before: var_21 float32 [1] F0::encoder_l1_self_Wq_QuantMultB - // after: var_21 float32 [1] F0::encoder_l1_self_Wq_QuantMultB cpu-float32_1_encoder_l1_self_Wq_QuantMultB-lhs.bin - // operands: - // - var_17 intgemm8 [256x256] F0::encoder_l1_self_Wq cpu-float32_1_encoder_l1_self_Wq_QuantMultB-rhs0-intgemm8_256x256_encoder_l1_self_Wq.bin - // - // - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/intgemm_interface.h" - // line: 359 - // fn: "marian::cpu::integer::PrepareBiasForBNodeOp::forwardOps()::" - // op: { PrepareBias() } - // before: var_22 float32 [1x256] F0::encoder_l1_self_bq_Prepared - // after: var_22 float32 [1x256] F0::encoder_l1_self_bq_Prepared cpu-float32_1x256_encoder_l1_self_bq_Prepared-lhs.bin - // operands: - // - var_18 float32 [1x256] F0::encoder_l1_self_bq 
cpu-float32_1x256_encoder_l1_self_bq_Prepared-rhs0-float32_1x256_encoder_l1_self_bq.bin - // - var_17 intgemm8 [256x256] F0::encoder_l1_self_Wq cpu-float32_1x256_encoder_l1_self_bq_Prepared-rhs1-intgemm8_256x256_encoder_l1_self_Wq.bin - // - var_19 float32 [1] F0::encoder_l1_self_Wq_QuantMultA cpu-float32_1x256_encoder_l1_self_bq_Prepared-rhs2-float32_1_encoder_l1_self_Wq_QuantMultA.bin - // - var_21 float32 [1] F0::encoder_l1_self_Wq_QuantMultB cpu-float32_1x256_encoder_l1_self_bq_Prepared-rhs3-float32_1_encoder_l1_self_Wq_QuantMultB.bin - // - // - // quantmult A, B, scalar 1.007505e+01 1.823447e+02 1.000000e+00 - // file: "/home/jerin/code/bergamot-translator/3rd_party/marian-dev/src/tensors/cpu/intgemm_interface.h" - // line: 540 - // fn: "marian::cpu::integer::AffineNodeOp::forwardOps()::" - // op: { AffineOp() } - // before: var_23 float32 [1x2x4x256] - // after: var_23 float32 [1x2x4x256] cpu-float32_1x2x4x256-lhs.bin - // operands: - // - var_20 int8 [1x2x4x256] none_shifted cpu-float32_1x2x4x256-rhs0-int8_1x2x4x256_none_shifted.bin - // - var_17 intgemm8 [256x256] F0::encoder_l1_self_Wq cpu-float32_1x2x4x256-rhs1-intgemm8_256x256_encoder_l1_self_Wq.bin - // - var_22 float32 [1x256] F0::encoder_l1_self_bq_Prepared cpu-float32_1x2x4x256-rhs2-float32_1x256_encoder_l1_self_bq_Prepared.bin - // clang-format on - - // Input to the intgemm involved pipeline. Usually these are float - // activations. - // - // Weight and weights quantization multiplier that corresponds to the x above. - // Here it happens to be the encoder1's Q matrix. - // Bias associated with the Q transform. - - // Intgemm code test, step-by-step. B is already prepared offline. - // - // 0. (offline) PrepareB - // 1. PrepareA - // 2. PrepareBias - // 3. Multiply. - - // Aliases in intgemm terminology - // C = AB + bias . 
- // A i8 [ A_rows x width ] - // B i8 [ width x B_cols ] - // bias f32 [ 1 x B_cols ] - // - - // We define the following two structs to hold objects to give a convenient - // syntax to describe for cases ahead. - // - // There are 3 variable sets: - // - // 1. Expected (raw): - // The unprepared values, that are fed in. In our case, it's offline - // prepared weights (B), f32 activations and f32 biases for online - // prepration. - // - // 2. Expected (prepared) - // During the course, we get prepared variations that are intermediate - // variables. These are also saved and requires ground truth to check - // expected. - // - // 3. Computed (prepared) - // The values we compute along the process. - - struct Affine { - Tensor A; - Tensor B; - Tensor bias; - }; - - // Holds a, b scalar (hyper) parameters used to multiply or divide for - // quantization. - struct Quant { - float a; - float b; - }; - - struct ProblemSet { - Affine var; - Affine prepared_expected; - Quant quant; - Tensor y_expected; - }; - - auto problem_256x256 = []() { - // clang-format off - auto A = tf("cpu-int8_1x2x4x256_none_shifted-rhs0-float32_1x2x4x256.bin", Shape({1*2*4, 256}), "A"); // NOLINT - auto [B, qB] = qtf("var_17-ParamNode-intgemm8_256x256_encoder_l1_self_Wq-lhs.bin", Shape({256, 256}), "B"); - auto bias = tf("var_18-ParamNode-float32_1x256_encoder_l1_self_bq-lhs.bin", Shape({1, 256}), "bias"); - auto qa = tf("var_19-cpu-float32_1_encoder_l1_self_Wq_QuantMultA-lhs.bin", Shape({1}), "quant.a"); // DONE - auto qb = tf("cpu-float32_1x1536_encoder_l1_ffn_b1_Prepared-rhs3-float32_1_encoder_l1_ffn_W1_QuantMultB.bin", Shape({1}), "quant.b"); // DONE - auto y_expected = tf("cpu-float32_1x2x4x256-lhs.bin", Shape({1*2*4, 256}), "y_expected"); - - Affine prepared_expected { - .A = tf("var_20-cpu-int8_1x2x4x256_none_shifted-lhs.bin", Shape({1*2*4, 256}), "prepared_expected_A"), - .B = tf("var_17-ParamNode-intgemm8_256x256_encoder_l1_self_Wq-lhs.bin", Shape({256, 256}), "prepared_expected_B"), - 
.bias= tf("var_22-cpu-float32_1x256_encoder_l1_self_bq_Prepared-lhs.bin", Shape({1, 256}), "prepared_expected_bias") - }; - // clang-format on - - ProblemSet pset{ - .var = - Affine{ - .A = std::move(A), // - .B = std::move(B), // - .bias = std::move(bias) // - }, // - .prepared_expected = std::move(prepared_expected), // - .quant = - Quant{ - .a = qa.item(), // - .b = qB // - }, // - .y_expected = std::move(y_expected) // - }; - - // auto qb_loaded = qb.item(); - // float diff = qb_loaded - qB; - // SLIMT_TRACE3(qB, qb_loaded, diff); - // SLIMT_CHECK(std::abs(diff) < 1e-7); - // SLIMT_TRACE2(quant.a, quant.b); - - return pset; - }; - - auto problem_256x1536 = []() { - // clang-format off - auto A = tf("var_64-cpu-int8_1x2x4x256_none_shifted-rhs0-float32_1x2x4x256.bin", Shape({2, 4, 256}), "A"); // NOLINT - auto [B, qB] = qtf("var_61-ParamNode-intgemm8_256x1536_encoder_l1_ffn_W1-lhs.bin", Shape({256, 1536}), "B"); - auto bias = tf("var_62-ParamNode-float32_1x1536_encoder_l1_ffn_b1-lhs.bin", Shape({1, 1536}), "bias"); - auto qa = tf("var_63-cpu-float32_1_encoder_l1_ffn_W1_QuantMultA-lhs.bin", Shape({1}), "quant.a"); - auto qb = tf("var_65-cpu-float32_1_encoder_l1_ffn_W1_QuantMultB-lhs.bin", Shape({1}), "quant.b"); - auto y_expected = tf("var_67-cpu-float32_1x2x4x1536-lhs.bin", Shape({2, 4, 1536}), "y_expected"); - - Affine prepared_expected { - .A = tf("var_64-cpu-int8_1x2x4x256_none_shifted-lhs.bin", Shape({2, 4, 256}), "prepared_expected_A"), - .B = tf("var_66-cpu-float32_1x1536_encoder_l1_ffn_b1_Prepared-rhs1-intgemm8_256x1536_encoder_l1_ffn_W1.bin", Shape({256, 1536}), "prepared_expected_B"), - .bias= tf("var_66-cpu-float32_1x1536_encoder_l1_ffn_b1_Prepared-lhs.bin", Shape({1, 1536}), "prepared_expected_bias") - }; - // clang-format on - - ProblemSet pset{ - .var = - Affine{ - .A = std::move(A), // - .B = std::move(B), // - .bias = std::move(bias) // - }, // - .prepared_expected = std::move(prepared_expected), // - .quant = - Quant{ - .a = qa.item(), // - .b = 
qB // - }, // - .y_expected = std::move(y_expected) // - }; - return pset; - }; - - auto problem_1536x256 = []() { - // clang-format off - auto A = tf("var_72-cpu-int8_1x2x4x1536_none_shifted-rhs0-float32_1x2x4x1536.bin", Shape({2, 4, 1536}), "A"); // NOLINT - auto [B, qB] = qtf("var_69-ParamNode-intgemm8_1536x256_encoder_l1_ffn_W2-lhs.bin", Shape({1536, 256}), "B"); - auto bias = tf("var_70-ParamNode-float32_1x256_encoder_l1_ffn_b2-lhs.bin", Shape({1, 256}), "bias"); - auto qa = tf("var_71-cpu-float32_1_encoder_l1_ffn_W2_QuantMultA-lhs.bin", Shape({1}), "quant.a"); - auto qb = tf("var_73-cpu-float32_1_encoder_l1_ffn_W2_QuantMultB-lhs.bin", Shape({1}), "quant.b"); - auto y_expected = tf("var_75-cpu-float32_1x2x4x256-lhs.bin", Shape({2, 4, 256}), "y_expected"); - - Affine prepared_expected { - .A = tf("var_72-cpu-int8_1x2x4x1536_none_shifted-lhs.bin", Shape({2, 4, 1536}), "prepared_expected_A"), - .B = tf("var_69-ParamNode-intgemm8_1536x256_encoder_l1_ffn_W2-lhs.bin", Shape({1536, 256}), "prepared_expected_B"), - .bias= tf("var_74-cpu-float32_1x256_encoder_l1_ffn_b2_Prepared-lhs.bin", Shape({1, 256}), "prepared_expected_bias") - }; - // clang-format on - - ProblemSet pset{ - .var = - Affine{ - .A = std::move(A), // - .B = std::move(B), // - .bias = std::move(bias) // - }, // - .prepared_expected = std::move(prepared_expected), // - .quant = - Quant{ - .a = qa.item(), // - .b = qB // - }, // - .y_expected = std::move(y_expected) // - }; - return pset; - }; - - auto intgemm_from_params = [](ProblemSet &pset) { - Affine &actual = pset.var; - Affine &prepared_expected = pset.prepared_expected; - Quant &quant = pset.quant; - Tensor &y_expected = pset.y_expected; - - Affine prepared{ - .A = Tensor(Type::i8, actual.A.shape(), "prepared_A"), // - .B = Tensor(Type::i8, actual.B.shape(), "prepared_B"), // - .bias = Tensor(Type::f32, actual.bias.shape(), "prepared_bias") // - }; - - size_t A_cols = actual.A.dim(-1); // NOLINT - size_t B_cols = actual.B.dim(-1); // NOLINT - 
size_t A_rows = actual.A.size() / A_cols; // NOLINT - size_t B_rows = actual.B.size() / B_cols; // NOLINT - - // A is in row-major format. - // B is in column-major, so consider it a transposed form. - SLIMT_TRACE2(A_rows, A_cols); - SLIMT_TRACE2(B_rows, B_cols); - - SLIMT_CHECK(A_cols == B_rows); - size_t width = B_rows; - SLIMT_TRACE(width); - - // Check widths are consistent, making matrix multiplication viable. - // This ensures our saves and loads satisfy one property. - - // Now we proceed to piecewise intgemm operations. - - // 0. PrepareB: B is prepared, but let's check PrepareB. - // - // Turns out, I do not have access from inputs to the raw B. I already - // only have prepared B. - // TODO(jerinphilip): Come back later and fix. - std::copy(actual.B.data(), actual.B.data() + B_cols * width, - prepared.B.data()); - - // Surprisingly, the following does not work. However a plain-copy does - // work. - // @jerinphilip has confirmed this is not a no-op by trying a copy before - // (see above). - // - // const auto *b = B.data(); - // auto *prepared_b = prepared.B.data(); - // intgemm::Int8::PrepareBQuantizedTransposed(b, prepared_b, B_cols, width); - - // SLIMT_TRACE_BLOCK(prepared.B); - // SLIMT_TRACE_BLOCK(prepared_expected.B); - SLIMT_CHECK(prepared_expected.B == actual.B); - SLIMT_CHECK(prepared.B == prepared_expected.B); - - // 1. PrepareA - intgemm::Int8Shift::PrepareA( // - actual.A.data(), prepared.A.data(), // - quant.a, // - A_rows, width // - ); - - // Check that the quantized activations are a match. - // SLIMT_TRACE2(qx, A); - SLIMT_CHECK(prepared.A == prepared_expected.A); - - // 2. 
PrepareBias - Quant alpha{ - .a = 127.0F / quant.a, // - .b = 127.0F / quant.b, // - }; - - float bias_unquant_multiplier = (-1.0F * (alpha.a * alpha.b)) / 127.0F; - SLIMT_TRACE3(alpha.a, alpha.b, bias_unquant_multiplier); - auto prepare_bias_callback = - intgemm::callbacks::UnquantizeAndAddBiasAndWrite( // - bias_unquant_multiplier, actual.bias.data(), // - prepared.bias.data() // - ); - - SLIMT_TRACE2(width, B_cols); - intgemm::Int8Shift::PrepareBias( // - prepared.B.data(), // - width, B_cols, // - prepare_bias_callback // - ); - - SLIMT_TRACE_BLOCK(prepared.bias) - SLIMT_TRACE_BLOCK(prepared_expected.bias); - SLIMT_TRACE(mse(prepared.bias, prepared_expected.bias)); - SLIMT_CHECK(prepared.bias == prepared_expected.bias); - - // 3. Multiply - Shape out_shape = actual.A.shape(); - out_shape.set_dim(-1, B_cols); - - Tensor y_piecewise(Type::f32, out_shape, "y_piecewise"); - - float unquant_multiplier = 1.0F / (quant.a * quant.b); - auto multiply_callback = intgemm::callbacks::UnquantizeAndAddBiasAndWrite( - unquant_multiplier, prepared.bias.data(), - y_piecewise.data()); - - intgemm::Int8Shift::Multiply( // - prepared.A.data(), prepared.B.data(), // - A_rows, width, B_cols, // - multiply_callback // - ); - - SLIMT_TRACE_BLOCK(y_piecewise); - SLIMT_TRACE_BLOCK(y_expected); - SLIMT_TRACE(mse(y_piecewise, y_expected)); - SLIMT_CHECK(y_expected == y_piecewise); - - // Compute from the intgemm_affine function, used in the library. - // This ensures what we checked in there is consistent with what we expect. 
- Tensor y_whole = qmm::affine(actual.A, actual.B, actual.bias, quant.a, - quant.b, "y_whole"); - SLIMT_TRACE(y_whole.shape()); - SLIMT_TRACE(y_expected.shape()); - SLIMT_TRACE(mse(y_whole, y_expected)); - SLIMT_CHECK(y_expected == y_whole); - }; - - auto pset1 = problem_256x256(); - auto pset2 = problem_256x1536(); - auto pset3 = problem_1536x256(); - intgemm_from_params(pset1); - intgemm_from_params(pset2); - intgemm_from_params(pset3); - - // SLIMT_TRACE2(y_whole, y_expected); -} -} // namespace slimt -#endif - -namespace slimt { - -void integration() { - std::string home = std::getenv("HOME"); - std::string browsermt = ".local/share/bergamot/models/browsermt"; - std::string folder = "ende.student.tiny11"; - - auto prefix_browsermt = [&](const std::string &relative_path) { - std::string path = - home + "/" + browsermt + "/" + folder + "/" + relative_path; - // std::cout << path << "\n"; - return path; - }; - - Package path{ - .model = prefix_browsermt("model.intgemm.alphas.bin"), // - .vocabulary = prefix_browsermt("vocab.deen.spm"), // - .shortlist = prefix_browsermt("lex.s2t.bin") // - }; - - Model::Config model_config; - auto model = std::make_shared(model_config, path); - Config service_config; - Blocking service(service_config); - std::string source = "1 2\n1 2 3\n"; - slimt::Options opts; - auto responses = service.translate(model, {std::move(source)}, opts); - fprintf(stdout, "%s\n", responses[0].target.text.c_str()); -} - -void ShortlistGen() { - std::string home = std::getenv("HOME"); - std::string browsermt = ".local/share/bergamot/models/browsermt"; - std::string folder = "ende.student.tiny11"; - - auto prefix_browsermt = [&](const std::string &relative_path) { - std::string path = - home + "/" + browsermt + "/" + folder + "/" + relative_path; - // std::cout << path << "\n"; - return path; - }; - std::string vocab_path = prefix_browsermt("vocab.deen.spm"); - std::string shortlist_path = prefix_browsermt("lex.s2t.bin"); - - Vocabulary 
vocab(vocab_path); - Vocabulary &source = vocab; - Vocabulary &target = vocab; - - // Load ShortlistGenerator - io::MmapFile shortlist_file(shortlist_path); - View view{ - .data = shortlist_file.data(), // - .size = shortlist_file.size() // - }; - ShortlistGenerator shortlist_generator(view, source, target); - - std::string line = "May I try the shortlist on, please?"; - auto [words, views] = vocab.encode(line, /*add_eos=*/true); - Shortlist shortlist = shortlist_generator.generate(words); - - const auto &likely_target_words = shortlist.words(); - std::string decoded; - auto dviews = vocab.decode(likely_target_words, decoded); - for (size_t i = 0; i < likely_target_words.size(); i++) { - std::cout << "[" << dviews[i] << ": " << likely_target_words[i] << "] "; - } - - // std::cout << decoded << "\n"; -} - -} // namespace slimt - -int main(int argc, char **argv) { - if (argc < 2) { - std::cerr << "Usage: " << argv[0] << " \n"; - std::exit(EXIT_FAILURE); - } - -// clang-format off -#define TEST_ENTRY(fn_name) {#fn_name, &slimt::fn_name} - // clang-format on - - using Test = void (*)(); - std::unordered_map tests({ - TEST_ENTRY(load), // - TEST_ENTRY(integration), // - TEST_ENTRY(RowsNodeOp), // - TEST_ENTRY(ScalarMultNodeOp), // - TEST_ENTRY(DotBatchedNodeOp), // - TEST_ENTRY(TransposeNodeOp), // - TEST_ENTRY(LayerNormalizationOp), // -#ifdef SLIMT_HAS_INTGEMM - TEST_ENTRY(AffineIntgemm), // -#endif - TEST_ENTRY(ShortlistGen) // - }); - - // std::cout << "slimt test\n"; - std::string test = argv[1]; - - auto query = tests.find(test); - if (query != tests.end()) { - auto name = query->first; - auto fn = query->second; - try { - std::cout << "Running test [" << name << "] ..."; - fn(); - std::cout << " [success]\n"; - } catch (...) { - std::cout << " [fail]\n"; - throw; - } - } else if (test == "all") { - std::vector failed; - for (auto &named_test : tests) { - auto name = named_test.first; - auto fn = named_test.second; - try { - std::cout << "Running test ... 
"; - fn(); - std::cout << "[success] [" << name << "]\n"; - } catch (const std::exception &exception) { - std::cout << " [fail] [" << name << "]\n"; - throw; - } - } - } else { - std::cerr << "Unknown test " << test << "\n"; - std::exit(EXIT_FAILURE); - } - return 0; -} - -// NOLINTEND diff --git a/bindings/python/slimt.cpp b/bindings/python/slimt.cpp index cc26670b..162d0664 100644 --- a/bindings/python/slimt.cpp +++ b/bindings/python/slimt.cpp @@ -217,4 +217,5 @@ PYBIND11_MODULE(_slimt, m) { auto sm_preset = m.def_submodule("preset"); sm_preset.def("tiny", &slimt::preset::tiny); sm_preset.def("base", &slimt::preset::base); + sm_preset.def("nano", &slimt::preset::nano); } diff --git a/bindings/python/utils.py b/bindings/python/utils.py index f4b901f6..9ea68127 100644 --- a/bindings/python/utils.py +++ b/bindings/python/utils.py @@ -75,11 +75,13 @@ def to_py_native(annotated_text: AnnotatedText) -> t.Dict[t.Any, t.Any]: def package_from_config_path(path): with open(path) as yaml_file: - c = yaml.safe_load(yaml_file) + config = yaml.safe_load(yaml_file) root = os.path.dirname(path) package = Package( - model=os.path.join(root, c["models"][0]), - vocabulary=os.path.join(root, c["vocabs"][0]), - shortlist=os.path.join(root, c["shortlist"][0]), + model=os.path.join(root, config["models"][0]), + vocabulary=os.path.join(root, config["vocabs"][0]), + shortlist=( + os.path.join(root, config["shortlist"][0]) if "shortlist" in config else "" + ), ) return package diff --git a/scripts/marian-trace-gen.h b/scripts/marian-trace-gen.h index 51995eff..c092faff 100644 --- a/scripts/marian-trace-gen.h +++ b/scripts/marian-trace-gen.h @@ -80,48 +80,52 @@ std::string save_to_disk(const std::string &name, Node node) { template inline void var_id(std::ostream &out, NodeType value) { - out << "var_" << value->getId() << " "; + out << "\"var_" << value->getId() << " "; out << value->value_type() << " "; out << "[" << value->shape() << "]"; if (value->name() != "none") { out << " " << 
value->name(); } + out << "\""; } template -inline bool process(const char *pretty_fn, NodeType *value, std::ostream &out) { +inline bool process(const char *pretty_fn, NodeType *value, std::ostream &out, + const std::string &indent) { std::stringstream stream; std::string op_name = extract_op_name(pretty_fn); std::string lhs_tag = var_metadata(value); - std::string prefix = - "var_" + std::to_string(value->getId()) + "-" + op_name + "-" + lhs_tag; - std::string lhs_name = prefix + "-lhs.bin"; - std::string lhs_save = save_to_disk(lhs_name, value); + std::string var_name = "var_" + std::to_string(value->getId()); + std::string save_name = var_name + ".bin"; + std::string lhs_save = save_to_disk(save_name, value); - stream << "after: "; + stream << indent << "lhs: {\"id\": "; var_id(stream, value); if (!lhs_save.empty()) { - stream << " " << lhs_name; + stream << ", \"save\":"; + stream << " " << save_name; } + stream << " }"; auto children = value->children(); if (not children.empty()) { - stream << "\noperands: \n"; + stream << "\n" << indent << "rhs: \n"; } for (size_t i = 0; i < children.size(); i++) { auto rhs = children[i]; - stream << " - "; + stream << indent << " - "; + stream << "{\"id\": "; var_id(stream, rhs); std::string rhs_tag = var_metadata(rhs); // NOLINTBEGIN - std::string rhs_name = - prefix + "-rhs" + std::to_string(i) + "-" + rhs_tag + ".bin"; + std::string rhs_name = var_name + "-rhs" + std::to_string(i) + ".bin"; // NOLINTEND std::string rhs_save = save_to_disk(rhs_name, rhs); if (!rhs_save.empty()) { - stream << " " << rhs_name; + stream << ",\"save\": " << rhs_name; } + stream << " }"; stream << "\n"; } @@ -136,22 +140,20 @@ inline bool process(const char *pretty_fn, NodeType *value, std::ostream &out) { }() // test if THREAD_GUARD is neccessary, remove if no problems occur. 
// #if 1 -#define NodeOp(op) \ - [=]() { \ - std::stringstream stream; \ - stream << "file: \"" << __FILE__ << "\"\n"; \ - stream << "line: " << __LINE__ << "\n"; \ - stream << "fn: \"" << __PRETTY_FUNCTION__ << "\"\n"; \ - stream << "op: { " << #op << " }\n"; \ - stream << "before: "; \ - detail::var_id(stream, this); \ - op; \ - stream << "\n"; \ - bool flag = detail::process(__PRETTY_FUNCTION__, this, stream); \ - stream << "\n\n"; \ - if (flag) { \ - std::cerr << stream.str(); \ - }; \ +#define NodeOp(op) \ + [=]() { \ + std::stringstream stream; \ + std::string indent = " "; \ + stream << "- file: \"" << __FILE__ << "\"\n"; \ + stream << indent << "line: " << __LINE__ << "\n"; \ + stream << indent << "fn: \"" << __PRETTY_FUNCTION__ << "\"\n"; \ + stream << indent << "op: \"{ " << #op << " }\"\n"; \ + op; \ + bool flag = detail::process(__PRETTY_FUNCTION__, this, stream, indent); \ + stream << "\n\n"; \ + if (flag) { \ + std::cerr << stream.str(); \ + }; \ } #else #define NodeOp(op) [=]() { op; } diff --git a/scripts/t12n.py b/scripts/t12n.py new file mode 100644 index 00000000..8dde76a4 --- /dev/null +++ b/scripts/t12n.py @@ -0,0 +1,39 @@ +import os +import sys + +import yaml + +import slimt +from slimt import Config, Model, Package, Service, preset + + +# Load the config file +def load_config(path): + config = None + with open(config_path) as yaml_file: + config = yaml.safe_load(yaml_file) + return config + + +if __name__ == "__main__": + service = Service(workers=1, cache_size=1024) + # Load supplementary files for model execution by passing the config path directory + config_path = sys.argv[1] + root = os.path.dirname(config_path) + config = load_config(config_path) + package = Package( + model=os.path.join(root, config["models"][0]), + vocabulary=os.path.join(root, config["vocabs"][0]), + shortlist=os.path.join(root, config["shortlist"][0]), + # shortlist="", + ) + + # nano model + nano: Config = preset.nano() + model_nano = Model(nano, package) + + data = 
sys.stdin.read() + responses = service.translate(model_nano, [data], html=False) + + for response in responses: + print(response.source.text, "->", response.target.text) diff --git a/scripts/trace-xlit.sh b/scripts/trace-xlit.sh new file mode 100644 index 00000000..fdd4eae0 --- /dev/null +++ b/scripts/trace-xlit.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +export DEBUG_VARIABLES_SAVE_PATH="/home/jerin/code/slimt/blobs/ml-xlit" +mkdir -p $DEBUG_VARIABLES_SAVE_PATH +rm $DEBUG_VARIABLES_SAVE_PATH/* + +/home/jerin/code/bergamot-translator/build/app/bergamot \ + --model-config-paths $HOME/code/slimt-t12n/outputs/mal-eng/model.nano.npz.decoder.yml \ + --log-level off \ + < data/ml-xlit.txt \ + 2> traces/ml-xlit.trace.txt diff --git a/slimt/Batcher.cc b/slimt/Batcher.cc index 633c1faf..970f93c5 100644 --- a/slimt/Batcher.cc +++ b/slimt/Batcher.cc @@ -47,7 +47,7 @@ bool operator<(const SegmentRef& a, const SegmentRef& b) { void Batch::log() { (void)token_count_; - LOG(info, "Batch(tokens={}, max-length={}, segment_refs_={})", token_count_, + LOG(info, "Batch(tokens=%zu max-length=%zu, segment_refs_=%zu)", token_count_, max_length_, segment_refs_.size()); } diff --git a/slimt/Frontend.cc b/slimt/Frontend.cc index 2e68b263..c8103f93 100644 --- a/slimt/Frontend.cc +++ b/slimt/Frontend.cc @@ -27,7 +27,7 @@ namespace slimt { namespace { -Input convert(const Batch &batch, uint32_t pad_id, size_t limit_factor) { +Input convert(const Batch &batch, uint32_t pad_id, float limit_factor) { const auto &segment_refs = batch.segment_refs(); Input input(batch.size(), batch.max_length(), pad_id, limit_factor); for (const auto &segment_ref : segment_refs) { diff --git a/slimt/HTML.cc b/slimt/HTML.cc index 792b55bf..3f1127d2 100644 --- a/slimt/HTML.cc +++ b/slimt/HTML.cc @@ -486,14 +486,17 @@ HTML::HTML(std::string &source, Options &&options) // bit of "", then completely ignore it. 
if (contains(options_.void_tags, tag_name)) break; - SLIMT_ABORT_IF(stack.empty(), - "Encountered more closing tags ({}) than opening tags", - scanner.tag()); + SLIMT_ABORT_IF( + stack.empty(), + detail::format( + "Encountered more closing tags ({}) than opening tags", + scanner.tag())); SLIMT_ABORT_IF( to_lower_case(stack.back()->name) != to_lower_case(scanner.tag()), - "Encountered unexpected closing tag , stack is {}", - scanner.tag(), stack); + detail::format( + "Encountered unexpected closing tag , stack is {}", + scanner.tag(), stack)); // What to do with "" case, where tag is immediately closed // so it never makes it into the taint of any of the spans? This adds @@ -548,7 +551,8 @@ HTML::HTML(std::string &source, Options &&options) } } - SLIMT_ABORT_IF(!stack.empty(), "Not all tags were closed: {}", stack); + SLIMT_ABORT_IF(!stack.empty(), + detail::format("Not all tags were closed: {}", stack)); // Add a trailing span (that's empty) to signify all closed tags. spans_.emplace_back(Span{source.size(), source.size(), stack}); diff --git a/slimt/Input.cc b/slimt/Input.cc index 30a3670e..2723b604 100644 --- a/slimt/Input.cc +++ b/slimt/Input.cc @@ -11,7 +11,7 @@ namespace slimt { Input::Input(size_t batch_size, size_t sequence_length, uint32_t pad_id, - size_t limit_factor) + float limit_factor) : batch_(Type::u32, Shape({batch_size, sequence_length}), "batch"), mask_(Type::f32, Shape({batch_size, sequence_length}), "mask"), pad_id_(pad_id), diff --git a/slimt/Input.hh b/slimt/Input.hh index 859cd1ac..2af374c2 100644 --- a/slimt/Input.hh +++ b/slimt/Input.hh @@ -10,7 +10,7 @@ namespace slimt { class Input { public: Input(size_t batch_size, size_t sequence_length, uint32_t pad_id, - size_t limit_factor); + float limit_factor); void add(const std::vector &words); void finalize(); @@ -31,7 +31,7 @@ class Input { size_t index_ = 0; uint32_t pad_id_ = 0; size_t used_ = 0; - size_t limit_factor_; + float limit_factor_; bool finalized_ = false; }; } // namespace slimt 
diff --git a/slimt/Macros.hh b/slimt/Macros.hh index bf1e7161..9a560169 100644 --- a/slimt/Macros.hh +++ b/slimt/Macros.hh @@ -1,4 +1,5 @@ #pragma once +#include #include #define SLIMT_BREAK std::raise(SIGTRAP) @@ -26,12 +27,12 @@ SLIMT_TRACE2(x, y); \ SLIMT_TRACE(z); -#define SLIMT_ABORT_IF(condition, ...) \ - do { \ - if (condition) { \ - std::cerr << #condition; \ - std::abort(); \ - } \ +#define SLIMT_ABORT_IF(condition, error) \ + do { \ + if (condition) { \ + std::cerr << (error) << '\n'; \ + std::abort(); \ + } \ } while (0) #define SLIMT_ABORT(message) \ @@ -40,4 +41,13 @@ std::abort(); \ } while (0) +#ifdef SLIMT_ENABLE_LOG +#define LOG(level, ...) \ + do { \ + fprintf(stderr, "[%s]", #level); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } while (0) +#else // SLIMT_ENABLE_LOGS #define LOG(...) (void)0 +#endif // SLIMT_ENABLE_LOGS diff --git a/slimt/Model.cc b/slimt/Model.cc index 3f902aa7..6e12396b 100644 --- a/slimt/Model.cc +++ b/slimt/Model.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -56,7 +57,8 @@ Model::Model(const Config &config, const Package &package) processor_(config.split_mode, vocabulary_, Aligned()), transformer_(config.encoder_layers, config.decoder_layers, config.num_heads, config.feed_forward_depth, package.model), - shortlist_generator_(package.shortlist, vocabulary_, vocabulary_) {} + shortlist_generator_(make_shortlist_generator( + package.shortlist, vocabulary_, vocabulary_)) {} Model::Model(const Config &config, const Package &package) : id_(model_id++), @@ -67,7 +69,16 @@ Model::Model(const Config &config, const Package &package) processor_(config.split_mode, vocabulary_, Aligned()), transformer_(config.encoder_layers, config.decoder_layers, config.num_heads, config.feed_forward_depth, view_.model), - shortlist_generator_(view_.shortlist, vocabulary_, vocabulary_) {} + shortlist_generator_(make_shortlist_generator( + view_.shortlist, vocabulary_, vocabulary_)) {} + 
+std::optional Model::make_shortlist_generator( + View view, const Vocabulary &source, const Vocabulary &target) { + if (view.data == nullptr || view.size == 0) { + return std::nullopt; + } + return ShortlistGenerator(view, source, target); +} namespace { void update_alignment(const std::vector &lengths, @@ -102,8 +113,11 @@ Histories Model::decode(const Tensor &encoder_out, const Input &input) const { size_t batch_size = encoder_out.dim(-3); size_t source_sequence_length = encoder_out.dim(-2); - Shortlist shortlist = shortlist_generator_.generate(input.words()); - const Words &indices = shortlist.words(); + std::optional indices = std::nullopt; + if (shortlist_generator_) { + Shortlist shortlist = shortlist_generator_->generate(input.words()); + indices = shortlist.words(); + } // The following can be used to check if shortlist is going wrong. // std::vector indices(vocabulary_.size()); // std::iota(indices.begin(), indices.end(), 0); @@ -132,7 +146,13 @@ Histories Model::decode(const Tensor &encoder_out, const Input &input) const { auto [logits, attn] = decoder.step(encoder_out, input.mask(), states, previous_slice, indices); - previous_slice = greedy_sample(logits, indices, batch_size); + if (indices) { + previous_slice = + greedy_sample_from_words(logits, vocabulary_, *indices, batch_size); + } else { + previous_slice = greedy_sample(logits, vocabulary_, batch_size); + } + update_alignment(input.lengths(), complete, attn, alignments); record(previous_slice, sentences); @@ -141,7 +161,12 @@ Histories Model::decode(const Tensor &encoder_out, const Input &input) const { for (size_t i = 1; i < max_seq_length && remaining > 0; i++) { auto [logits, attn] = decoder.step(encoder_out, input.mask(), states, previous_slice, indices); - previous_slice = greedy_sample(logits, indices, batch_size); + if (indices) { + previous_slice = + greedy_sample_from_words(logits, vocabulary_, *indices, batch_size); + } else { + previous_slice = greedy_sample(logits, vocabulary_, 
batch_size); + } update_alignment(input.lengths(), complete, attn, alignments); remaining = record(previous_slice, sentences); } @@ -204,6 +229,19 @@ Model::Config base() { // NOLINTEND return config; } + +Model::Config nano() { + // NOLINTBEGIN + Model::Config config{ + .encoder_layers = 4, // + .decoder_layers = 2, // + .feed_forward_depth = 2, // + .num_heads = 8, // + .split_mode = "sentence" // + }; + // NOLINTEND + return config; +} } // namespace preset } // namespace slimt diff --git a/slimt/Model.hh b/slimt/Model.hh index e7f04533..4e83fbcd 100644 --- a/slimt/Model.hh +++ b/slimt/Model.hh @@ -60,13 +60,16 @@ class SLIMT_EXPORT Model { const TextProcessor &processor() const { return processor_; } const Transformer &transformer() const { return transformer_; } size_t id() const { return id_; } // NOLINT - const ShortlistGenerator &shortlist_generator() const { + const std::optional &shortlist_generator() const { return shortlist_generator_; } private: Histories decode(const Tensor &encoder_out, const Input &input) const; + static std::optional make_shortlist_generator( + View view, const Vocabulary &source, const Vocabulary &target); + size_t id_; Config config_; using Mmap = Package; @@ -76,12 +79,13 @@ class SLIMT_EXPORT Model { Vocabulary vocabulary_; TextProcessor processor_; Transformer transformer_; - ShortlistGenerator shortlist_generator_; + std::optional shortlist_generator_; }; namespace preset { SLIMT_EXPORT Model::Config tiny(); SLIMT_EXPORT Model::Config base(); +SLIMT_EXPORT Model::Config nano(); } // namespace preset } // namespace slimt diff --git a/slimt/Modules.cc b/slimt/Modules.cc index db224bbd..abd6c6d9 100644 --- a/slimt/Modules.cc +++ b/slimt/Modules.cc @@ -143,7 +143,7 @@ Tensor join_heads(const Tensor &x) { } Tensor affine(const Affine ¶meters, const Tensor &x, - const std::string &name = "") { + const std::string &name /* = ""*/) { Tensor y = qmm::affine( // x, // parameters.W, parameters.b, // @@ -211,33 +211,25 @@ Tensor 
SSRU::forward(Tensor &state, const Tensor &x) const { // Wx(t) is a linear operation (it's a linear transform). // Wfx(t) + bf is an affine transform. - // f(t) = σ(Wt . x(t) + bf ) - Tensor &c = state; // Load context from saved-state. - Tensor f_out = affine(F_, x, "rnn_f"); // Forget gate? - Tensor f = sigmoid(f_out); - - // c(t) = f(t) ⊙ c(t−1) + (1 − ft) ⊙ Wx(t) + // Forward parameter multiplications. + Tensor f = affine(F_, x, "rnn_f"); // Forget gate? NOLINT Tensor Wxt = linear(O_, x, "rnn_o"); // NOLINT - Tensor ones = f.like("ones"); - ones.fill_in_place(1.0F); - - Tensor g = sub(ones, f); - Tensor c_arg1 = mul(f, c); - Tensor c_arg2 = mul(g, Wxt); - Tensor c_next = add(c_arg1, c_arg2); + // https://github.com/browsermt/marian-dev/blob/77e886ae7ae6016981c6307c312650bf74b50487/src/rnn/cells.h#L1058 + // c(t) = f(t) ⊙ c(t−1) + (1 − ft) ⊙ Wx(t) + // Tensor c_t = highway(c, f, Wxt); + Tensor c_t = highway(c, Wxt, f); + // https://github.com/browsermt/marian-dev/blob/77e886ae7ae6016981c6307c312650bf74b50487/src/rnn/cells.h#L1059 // y(t) = ReLU(c(t)); - Tensor y = relu(c_next); + Tensor y = relu(c_t); // h(t) = α LayerNorm(y(t) + x(t)) + β - Tensor o = add(x, y); - - Tensor h = ln_.forward(o); + Tensor h = ln_.forward(x + y); - state = std::move(c_next); + state = std::move(c_t); return h; } @@ -288,25 +280,14 @@ Tensor FFN::forward(const Tensor &x) const { } Tensor LayerNorm::forward(const Tensor &x) const { - Tensor y = x.like("ln_out"); - size_t cols = x.dim(-1); - size_t rows = x.size() / cols; - - // Currently this is hardcoded. - // Not sure how to do it otherwise. - constexpr float kEps = 1e-9; - - layer_norm(x.data(), // - scale_.data(), bias_.data(), // - kEps, rows, cols, y.data()); - + Tensor y = layer_norm(x, scale_, bias_); return y; } std::tuple Attention::forward(const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &mask) const { - // We have a B x T x H sequence comoing in, for q, k and v. 
+ // We have a B x T x H sequence coming in, for q, k and v. Tensor yq = affine(Q_, q, "q"); Tensor yk = affine(K_, k, "k"); Tensor yv = affine(V_, v, "v"); diff --git a/slimt/Modules.hh b/slimt/Modules.hh index 17966701..243fe271 100644 --- a/slimt/Modules.hh +++ b/slimt/Modules.hh @@ -106,4 +106,7 @@ Tensor affine_with_select(const Affine ¶meters, const Tensor &x, const std::vector &indices, const std::string &name = ""); +Tensor affine(const Affine ¶meters, const Tensor &x, + const std::string &name = ""); + } // namespace slimt diff --git a/slimt/Shortlist.cc b/slimt/Shortlist.cc index 17426315..a7a3cc28 100644 --- a/slimt/Shortlist.cc +++ b/slimt/Shortlist.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -19,14 +20,19 @@ bool ShortlistGenerator::content_check() { fail_flag |= word_to_offset_[i] >= shortlist_size_; } + SLIMT_ABORT_IF(fail_flag, "Error: offset table not within shortlist size."); + // The last element of word_to_offset_ must equal shortlist_size_ - fail_flag |= word_to_offset_[word_to_offset_size_ - 1] != - shortlist_size_; // The vocabulary indices have to be within - // the vocabulary size. + fail_flag |= word_to_offset_[word_to_offset_size_ - 1] != shortlist_size_; + + SLIMT_ABORT_IF(fail_flag, "Error: word_to_offset != shortlist_size"); + + // The vocabulary indices have to be within the vocabulary size. 
size_t v_size = target_.size(); for (size_t j = 0; j < shortlist_size_; j++) { fail_flag |= shortlist_[j] >= v_size; } + SLIMT_ABORT_IF(fail_flag, "Error: shortlist indices are out of bounds"); return fail_flag; } @@ -41,7 +47,8 @@ void ShortlistGenerator::load(const void* data, size_t blob_size, */ (void)blob_size; SLIMT_ABORT_IF(blob_size < sizeof(Header), - "Shortlist length {} too short to have a header", blob_size); + "Shortlist length too short to have a header: " + + std::to_string(blob_size)); const char* ptr = static_cast(data); const Header& header = *reinterpret_cast(ptr); @@ -51,10 +58,11 @@ void ShortlistGenerator::load(const void* data, size_t blob_size, uint64_t expected_size = sizeof(Header) + header.word_to_offset_size * sizeof(uint64_t) + header.shortlist_size * sizeof(Word); - SLIMT_ABORT_IF( - expected_size != blob_size, - "Shortlist header claims file size should be {} but file is {}", - expected_size, blob_size); + + SLIMT_ABORT_IF(expected_size != blob_size, + "Shortlist header claims file size should be " + + std::to_string(expected_size) + " but file is " + + std::to_string(blob_size)); if (check) { size_t length = ( // @@ -71,7 +79,7 @@ void ShortlistGenerator::load(const void* data, size_t blob_size, frequent_ = header.frequent; best_ = header.best; - LOG(info, "[data] Lexical short list frequent {} and best {}", frequent_, + LOG(info, "[data] Lexical short list frequent %lu and best %lu", frequent_, best_); word_to_offset_size_ = header.word_to_offset_size; @@ -91,14 +99,14 @@ void ShortlistGenerator::load(const void* data, size_t blob_size, ShortlistGenerator::ShortlistGenerator( // View view, // - Vocabulary& source, Vocabulary& target, // + const Vocabulary& source, const Vocabulary& target, // size_t source_index /*= 0*/, size_t /*target_index = 1*/, // bool shared /*= false*/, bool check /*= true*/) : source_(source), target_(target), source_index_(source_index), shared_(shared) { - LOG(info, "[data] Loading binary shortlist from 
buffer with check={}", check); + LOG(info, "[data] Loading binary shortlist from buffer with check=%d", check); load(view.data, view.size, check); (void)source_index_; diff --git a/slimt/Shortlist.hh b/slimt/Shortlist.hh index 946e5c18..7667e67d 100644 --- a/slimt/Shortlist.hh +++ b/slimt/Shortlist.hh @@ -45,17 +45,17 @@ class ShortlistGenerator { // construct directly from buffer ShortlistGenerator( - View view, // - Vocabulary& source, Vocabulary& target, // + View view, // + const Vocabulary& source, const Vocabulary& target, // size_t source_index = 0, size_t /*target_indx=*/ = 1, bool shared = false, // Kept there for backward compatibility - bool check = true); + bool check = false); Shortlist generate(const Words& words) const; private: - Vocabulary& source_; - Vocabulary& target_; + const Vocabulary& source_; + const Vocabulary& target_; size_t source_index_; bool shared_{false}; diff --git a/slimt/Tensor.cc b/slimt/Tensor.cc index 07470fb2..6d3ec192 100644 --- a/slimt/Tensor.cc +++ b/slimt/Tensor.cc @@ -1,18 +1,15 @@ #include "slimt/Tensor.hh" #include -#include #include #include #include #include -#include #include #include #include #include "slimt/Aligned.hh" -#include "slimt/Macros.hh" #include "slimt/TensorOps.hh" #include "slimt/Types.hh" #include "slimt/Utils.hh" @@ -197,41 +194,32 @@ std::ostream &operator<<(std::ostream &out, const Tensor &tensor) { bool operator==(const Tensor &lhs, const Tensor &rhs) { // Can't always rely on size, because sometimes we do aligned loads. So // something that is 256 bytes could only be 16 bytes w.r.t actual elements. + // This disables the below option. 
// if (lhs.view_.size != rhs.view_.size) return false; + if (lhs.type() != rhs.type()) return false; if (lhs.shape() != rhs.shape()) return false; - const void *lhs_ptr = lhs.data(); - const void *rhs_ptr = rhs.data(); - auto message = [&](size_t position, auto l, auto r, float eps) { - std::cerr << lhs.name() << " and " << rhs.name(); - std::cerr << "(" << to_string(lhs.type()) << ")"; - std::cerr << "\n differs at position " << position << ": "; - std::cerr << "[" << std::scientific << l << "] "; - std::cerr << "[" << std::scientific << r << "] "; - std::cerr << "\n Δ: " << eps << " | \nbit: "; - std::bitset<32> bl(l), br(r); // NOLINT - std::cerr << "\n " << bl << "\n " << br << "\n"; - }; - - // Special cause for float32. - // Can use this when suspect inconsistent values. + // Special case so we can check floating point. const char *env_eps = std::getenv("SLIMT_EPS"); if (env_eps != nullptr and lhs.type() == Type::f32) { // NOLINT + float eps = std::stof(env_eps); + + // Compute MSE and check. + float error = mse(lhs, rhs); + if (error > eps) { + return false; + } + + // Compute individual distances. size_t size = lhs.size(); const auto *l = lhs.data(); const auto *r = rhs.data(); - float eps = std::stof(env_eps); - - SLIMT_TRACE(mse(lhs, rhs)); for (size_t i = 0; i < size; i++) { float diff = std::abs(*l - *r); if (diff > eps) { - SLIMT_TRACE2(diff, eps); - int *il = (int *)l; // NOLINT - int *ir = (int *)r; // NOLINT - message(i, *il, *ir, diff); + // SLIMT_TRACE2(diff, eps); return false; } ++l, ++r; @@ -239,21 +227,12 @@ bool operator==(const Tensor &lhs, const Tensor &rhs) { return true; } + // Byte comparisons. + const void *lhs_ptr = lhs.data(); + const void *rhs_ptr = rhs.data(); size_t size_in_memory = std::min(lhs.view().size, rhs.view().size); int retval = memcmp(lhs_ptr, rhs_ptr, size_in_memory); - // -1, 0 +1 if < = > respectively C-API, so. 
bool eq = (retval == 0); - if (not eq) { - const auto *l = lhs.data(); - const auto *r = rhs.data(); - for (size_t i = 0; i < size_in_memory; i++) { - float nan = std::numeric_limits::quiet_NaN(); - if (*l != *r) { - message(i, int(*l), int(*r), nan); // NOLINT - } - ++l, ++r; - } - } return eq; } diff --git a/slimt/TensorOps.cc b/slimt/TensorOps.cc index 950ed893..1c1029c9 100644 --- a/slimt/TensorOps.cc +++ b/slimt/TensorOps.cc @@ -30,6 +30,11 @@ extern "C" { namespace slimt { +inline float sigmoid(float x) { + return x > 0 ? (1.0F / (1.0F + std::exp(-x))) + : (std::exp(x) / (1.0F + std::exp(x))); +} + Tensor index_select(const Tensor& x, const Tensor& indices, const std::string& name /*= "selected"*/) { uint64_t sequence_length = indices.dim(-1); @@ -215,8 +220,7 @@ void sigmoid(const float* a, size_t size, float* c) { #endif for (size_t i = 0; i < size; i++) { - float x = std::exp(a[i]); - c[i] = x / (1 + x); + c[i] = sigmoid(a[i]); } } @@ -639,4 +643,42 @@ Tensor mul(const Tensor& x, const Tensor& y) { return x_plus_y; } +Tensor layer_norm(const Tensor& x, const Tensor& scale, const Tensor& bias, + float EPS /*= 1e-6F*/) { + Tensor y = x.like("ln_out"); + size_t cols = x.dim(-1); + size_t rows = x.size() / cols; + + layer_norm(x.data(), // + scale.data(), bias.data(), // + EPS, rows, cols, y.data()); + return y; +} + +Tensor operator+(const Tensor& x, const Tensor& y) { return add(x, y); } +Tensor operator-(const Tensor& x, const Tensor& y) { return sub(x, y); } +Tensor operator*(const Tensor& x, const Tensor& y) { return mul(x, y); } + +Tensor highway(const Tensor& x, const Tensor& y, const Tensor& g) { + // f(t) = σ(Wt . 
x(t) + bf ) + Tensor c_t = x.like("highway_out"); + + assert(x.size() == y.size()); + assert(y.size() == g.size()); + const auto* tx = x.data(); + const auto* ty = y.data(); + const auto* tg = g.data(); + auto* out = c_t.data(); + size_t size = x.size(); + + for (size_t i = 0; i < size; i++) { + float sg = sigmoid(tg[i]); + float vx = tx[i]; + float vy = ty[i]; + out[i] = sg * vx + (1.0F - sg) * vy; + } + + return c_t; +} + } // namespace slimt diff --git a/slimt/TensorOps.hh b/slimt/TensorOps.hh index 25c3c48b..80702938 100644 --- a/slimt/TensorOps.hh +++ b/slimt/TensorOps.hh @@ -60,6 +60,14 @@ Tensor add(const Tensor& x, const Tensor& y); Tensor sub(const Tensor& x, const Tensor& y); Tensor mul(const Tensor& x, const Tensor& y); +Tensor operator+(const Tensor& x, const Tensor& y); +Tensor operator-(const Tensor& x, const Tensor& y); +Tensor operator*(const Tensor& x, const Tensor& y); + +Tensor layer_norm(const Tensor& x, const Tensor& scale, const Tensor& bias, + float EPS = 1e-6F); // NOLINT + Tensor fast_select(Tensor& source, const std::vector& indices); +Tensor highway(const Tensor& x, const Tensor& y, const Tensor& g); } // namespace slimt diff --git a/slimt/TextProcessor.cc b/slimt/TextProcessor.cc index 07a0ec2b..23dce347 100644 --- a/slimt/TextProcessor.cc +++ b/slimt/TextProcessor.cc @@ -41,8 +41,8 @@ Splitter load_splitter(const std::string &prefix_path) { // prefix_path Splitter splitter; if (!prefix_path.empty()) { - LOG(info, "Loading protected prefixes for sentence splitting from {}", - prefix_path); + LOG(info, "Loading protected prefixes for sentence splitting from %s", + prefix_path.c_str()); splitter.load(prefix_path); } else { LOG(warn, diff --git a/slimt/Transformer.cc b/slimt/Transformer.cc index 9248bca7..e5e8a7fe 100644 --- a/slimt/Transformer.cc +++ b/slimt/Transformer.cc @@ -3,7 +3,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -14,6 +16,8 @@ #include "slimt/Tensor.hh" #include 
"slimt/TensorOps.hh" #include "slimt/Types.hh" +#include "slimt/Utils.hh" +#include "slimt/Vocabulary.hh" namespace slimt { @@ -101,6 +105,9 @@ void Decoder::register_parameters(const std::string &prefix, ParameterMap ¶meters) { // Somehow we have historically ended up with `none_QuantMultA` being used for // Wemb_QuantMultA. + // https://github.com/browsermt/marian-dev/blob/2be8344fcf2776fb43a7376284067164674cbfaf/scripts/alphas/extract_stats.py#L55 + // - none_QuantMultA is generated when used with shortlist + // - Wemb_QuantMultA is generated when used without shortlist. parameters.emplace("Wemb_intgemm8", &output_.W); parameters.emplace("none_QuantMultA", &output_.quant); parameters.emplace("decoder_ff_logit_out_b", &output_.b); @@ -110,11 +117,9 @@ void Decoder::register_parameters(const std::string &prefix, } } -std::tuple Decoder::step(const Tensor &encoder_out, - const Tensor &mask, - std::vector &states, - const Words &previous_step, - const Words &shortlist) const { +std::tuple Decoder::step( + const Tensor &encoder_out, const Tensor &mask, std::vector &states, + const Words &previous_step, const std::optional &shortlist) const { // Infer batch-size from encoder_out. 
size_t encoder_feature_dim = encoder_out.dim(-1); size_t source_sequence_length = encoder_out.dim(-2); @@ -168,7 +173,12 @@ std::tuple Decoder::step(const Tensor &encoder_out, } } - Tensor logits = affine_with_select(output_, x, shortlist, "logits"); + if (shortlist) { + Tensor logits = affine_with_select(output_, x, *shortlist, "logits"); + return {std::move(logits), std::move(guided_alignment)}; + } + + Tensor logits = affine(output_, x, "logits"); return {std::move(logits), std::move(guided_alignment)}; } @@ -221,15 +231,96 @@ void Transformer::register_parameters(const std::string &prefix, decoder_.register_parameters(prefix, parameters); } -Words greedy_sample(const Tensor &logits, const Words &words, +namespace { + +template +void topk_inspect(size_t batch_id, const Vocabulary &vocabulary, T *begin, + T *end, size_t k) { + const T *data = begin; + size_t size = end - begin; + + std::vector ordering = argsort(begin, end); + fprintf(stderr, "batch %zu | ", batch_id); + Words words(size + 1, vocabulary.eos_id()); + for (size_t i = 0; i < k; i++) { + size_t j = size - i - 1; + words[i] = ordering[j]; + std::string decoded; + vocabulary.decode({words[i], vocabulary.eos_id()}, decoded); + fprintf(stderr, "%s (%zu, %.9g) ", decoded.c_str(), ordering[j], + data[ordering[j]]); + } + fprintf(stderr, "\n"); +} + +template +void topk_inspect_with_words(size_t batch_id, const Vocabulary &vocabulary, + const Words &shortlist, T *begin, T *end, + size_t k) { + const T *data = begin; + size_t size = end - begin; + + std::vector ordering = argsort(begin, end); + fprintf(stderr, "batch %zu | ", batch_id); + Words words(size + 1, vocabulary.eos_id()); + for (size_t i = 0; i < k; i++) { + size_t j = size - i - 1; + words[i] = shortlist[ordering[j]]; + std::string decoded; + vocabulary.decode({words[i], vocabulary.eos_id()}, decoded); + fprintf(stderr, "%s (%zu, %.9g) ", decoded.c_str(), ordering[j], + data[ordering[j]]); + } + fprintf(stderr, "\n"); +} + +} // namespace + +Words 
greedy_sample(const Tensor &logits, const Vocabulary &vocabulary, size_t batch_size) { Words sampled_words; + size_t stride = vocabulary.size(); for (size_t i = 0; i < batch_size; i++) { const auto *data = logits.data(); - size_t max_index = 0; - float max_value = data[0]; - size_t stride = words.size(); - for (size_t cls = 1; cls < stride; cls++) { + + // Initialize: 0 + size_t cls = 0; + size_t max_index = cls; + float max_value = data[i * stride + cls]; + + for (cls = 1; cls < stride; cls++) { + float value = data[i * stride + cls]; + if (value > max_value) { + max_index = cls; + max_value = value; + } + } + + sampled_words.push_back(max_index); + if (false) { // NOLINT + constexpr size_t kValue = 5; + topk_inspect(i, vocabulary, data + i * stride, data + (i + 1) * stride, + kValue); + } + } + return sampled_words; +} + +Words greedy_sample_from_words(const Tensor &logits, + const Vocabulary &vocabulary, const Words &words, + size_t batch_size) { + (void)vocabulary; + size_t stride = words.size(); + Words sampled_words; + for (size_t i = 0; i < batch_size; i++) { + const auto *data = logits.data(); + + // Initialize: 0 + size_t cls = 0; + size_t max_index = cls; + float max_value = data[i * stride + cls]; + + for (cls = 1; cls < stride; cls++) { float value = data[i * stride + cls]; if (value > max_value) { max_index = cls; @@ -238,6 +329,11 @@ Words greedy_sample(const Tensor &logits, const Words &words, } sampled_words.push_back(words[max_index]); + if (false) { // NOLINT + constexpr size_t kValue = 5; + topk_inspect_with_words(i, vocabulary, words, data + i * stride, + data + (i + 1) * stride, kValue); + } } return sampled_words; } diff --git a/slimt/Transformer.hh b/slimt/Transformer.hh index 9797ae9d..4b0c0352 100644 --- a/slimt/Transformer.hh +++ b/slimt/Transformer.hh @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include #include @@ -33,7 +34,7 @@ class Decoder { std::tuple step(const Tensor &encoder_out, const Tensor &mask, std::vector 
&states, const Words &previous_step, - const Words &shortlist) const; + const std::optional &shortlist) const; private: const Tensor &embedding_; @@ -41,8 +42,13 @@ class Decoder { Affine output_; }; -Words greedy_sample(const Tensor &logits, const Words &words, +class Vocabulary; + +Words greedy_sample(const Tensor &logits, const Vocabulary &vocabulary, size_t batch_size); +Words greedy_sample_from_words(const Tensor &logits, + const Vocabulary &vocabulary, const Words &words, + size_t batch_size); void transform_embedding(Tensor &word_embedding, size_t start = 0); diff --git a/slimt/Utils.cc b/slimt/Utils.cc index 94124655..2e99904b 100644 --- a/slimt/Utils.cc +++ b/slimt/Utils.cc @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -96,15 +95,6 @@ SLIMT_PRINT_NDARRAY_EXPLICIT(uint32_t); #undef SLIMT_PRINT_NDARRAY_EXPLICIT -std::string checked_fpath() { - const char *blob_path = std::getenv("SLIMT_BLOB_PATH"); - if (not blob_path) { - std::cerr << "SLIMT_BLOB_PATH not define in environment."; - std::exit(EXIT_FAILURE); - } - return std::string(blob_path); -} - namespace { Tensor dispatch_by_type(Type type, const std::string &fpath, const Shape &shape, const std::string &name) { @@ -123,29 +113,6 @@ Tensor dispatch_by_type(Type type, const std::string &fpath, const Shape &shape, } } // namespace -bool Verifier::verify(Tensor &value, const std::string &name) { - auto query = verified_.find(name); - if (query == verified_.end()) { - std::string fpath = blob_path_ + "/" + name; - Tensor expected = - dispatch_by_type(value.type(), fpath, value.shape(), name); - bool flag = (value == expected); - if (flag) { - verified_.emplace(name); - std::cerr << "[ match ] " << value.name() << " and " << name << "\n"; - } else { - std::cerr << "[ no match] " << value.name() << " and " << name << "\n"; - std::cerr << value << "\n"; - std::cerr << expected << "\n"; - std::string msg = "No match for " + value.name() + " and " + name + "."; - throw 
std::runtime_error(msg); - } - return flag; - } - - return true; -} - template std::tuple quantized_tensor_from_file(const std::string &fpath, const Shape &shape, diff --git a/slimt/Utils.hh b/slimt/Utils.hh index 313f5f1e..f76958d0 100644 --- a/slimt/Utils.hh +++ b/slimt/Utils.hh @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -20,28 +21,6 @@ class Shape; std::string checked_fpath(); -class Verifier { - public: - static Verifier &instance() { - static Verifier verifier; - return verifier; - } - bool verify(Tensor &value, const std ::string &name); - - private: - Verifier() : blob_path_(checked_fpath()) {} - std::unordered_set verified_; - std::string blob_path_; -}; - -#define SLIMT_VERIFY_MATCH(value, name) \ - do { \ - const char *flag = std::getenv("SLIMT_TRACE"); \ - if (flag) { \ - (Verifier::instance()).verify(value, name); \ - } \ - } while (0) - template std::string fmt(Printable &printable) { std::stringstream stream; @@ -132,4 +111,22 @@ class AverageMeter { size_t count_ = 0; }; +template +std::vector argsort(const T *begin, const T *end) { + // initialize original index locations + const T *data = begin; + size_t size = end - begin; + std::vector idx(size); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in vs + // using std::stable_sort instead of std::sort + // to avoid unnecessary index re-orderings + // when vs contains elements of equal values + stable_sort(idx.begin(), idx.end(), + [data](size_t i, size_t j) { return data[i] < data[j]; }); + + return idx; +} + } // namespace slimt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 00000000..01cd027d --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,19 @@ +# Tests require generation Add a custom command to be executed during the build +if(SLIMT_GENERATED_UNIT_TESTS) + set(SLIMT_TEST_UNIT "${CMAKE_CURRENT_BINARY_DIR}/generated-units.cc") + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + 
add_custom_command(
+    OUTPUT ${SLIMT_TEST_UNIT}
+    COMMAND
+      "/usr/bin/python3" "${CMAKE_CURRENT_SOURCE_DIR}/generate-units.py" #
+      "--trace" "${CMAKE_SOURCE_DIR}/traces/ml-xlit.trace.txt" #
+      "--output" "${SLIMT_TEST_UNIT}"
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/generate-units.py"
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
+
+  add_executable(slimt_test_units ${SLIMT_TEST_UNIT})
+  target_link_libraries(slimt_test_units PUBLIC slimt)
+  target_include_directories(slimt_test_units
+                             PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+endif()
diff --git a/tests/TestSuite.hh b/tests/TestSuite.hh
new file mode 100644
index 00000000..f17516f7
--- /dev/null
+++ b/tests/TestSuite.hh
@@ -0,0 +1,44 @@
+#pragma once
+#include <iostream>
+
+#include "slimt/TensorOps.hh"
+#include "slimt/Utils.hh"
+#include "slimt/slimt.hh"
+
+#define CHECK_EQUAL(lhs, rhs, fn)            \
+  do {                                       \
+    bool pass = (lhs) == (rhs);              \
+    if (pass) {                              \
+      std::cout << "[PASS]";                 \
+    } else {                                 \
+      std::cout << "[FAIL]";                 \
+    }                                        \
+    std::cout << " " << (fn) << "\n";        \
+    if (!pass) {                             \
+      if (std::getenv("SLIMT_DEBUG")) {      \
+        diagnose((lhs), (rhs));              \
+      }                                      \
+    }                                        \
+  } while (0)
+
+inline std::string blob_path(const std::string &bin) {
+  const char *blob_path = std::getenv("SLIMT_BLOB_PATH");
+  if (not blob_path) {
+    std::cerr << "SLIMT_BLOB_PATH not defined in environment.";
+    std::exit(EXIT_FAILURE);
+  }
+  return std::string(blob_path) + '/' + bin;
+}
+
+inline void diagnose(const slimt::Tensor &lhs, const slimt::Tensor &rhs) {
+  const auto *l = lhs.data<float>();
+  const auto *r = rhs.data<float>();
+  constexpr float kEps = 1e-9;
+  size_t size = lhs.size();
+  for (size_t i = 0; i < size; i++) {
+    if (std::abs(l[i] - r[i]) > kEps) {
+      fprintf(stdout, "values differ at %zu: %.9g %.9g, diff = %.9f\n", i, l[i],
+              r[i], std::abs(l[i] - r[i]));
+    }
+  }
+}
diff --git a/tests/generate-units.py b/tests/generate-units.py
new file mode 100644
index 00000000..1e360852
--- /dev/null
+++ b/tests/generate-units.py
@@ -0,0 +1,183 @@
+import yaml
+import argparse
+import textwrap
+import 
itertools + + +def prod(xs): + accumulator = 1 + for x in xs: + accumulator = accumulator * x + return accumulator + + +class Tensor: + def __init__(self, name, dtype, shape, save): + self.name = name + self.dtype = dtype + self.shape = list(map(int, shape[1:-1].split("x"))) + self.save = save + + def reshape(self, shape): + assert prod(shape) == prod(self.shape) + self.shape = shape + + def load(self): + dims = list(map(str, self.shape)) + shape = "Shape({{{ls}}})".format(ls=", ".join(dims)) + dmap = {"float32": "float", "int8": "int8_t"} + dtype = dmap[self.dtype] + blob_path = f'blob_path("{self.save}")' + return f'tensor_from_file<{dtype}>({blob_path}, {shape}, "{self.name}")' + + +def NoOp(lhs, rhs): + return "" + + +def test(lhs, rhs, slimt_fn): + block = [] + args = ", ".join([arg.name for arg in rhs]) + info = f"{lhs.name} == {slimt_fn}({args})" + block.append(f'std::string info = "{info}";') + block.append(f"Tensor lhs_expected = {lhs.load()};") + for idx, arg in enumerate(rhs): + block.append(f"Tensor rhs_{idx} = {rhs[idx].load()};") + args = ", ".join([f"rhs_{idx}" for idx in range(len(rhs))]) + block.append(f"Tensor lhs_computed = {slimt_fn}({args});") + block.append(f'CHECK_EQUAL(lhs_computed, lhs_expected, "{info}");') + return "{\n" + "\n".join(block) + "\n}" + + +def guard(block): + catch_block = """catch (const std::exception& e) { + // Catching and handling exceptions + std::cerr << "Exception caught: " << e.what() << std::endl; + } + catch (...) 
{
+    // Catching any other unexpected exceptions
+    std::cerr << "Unknown exception caught" << std::endl;
+    }
+    """
+    return f"""try {{ {block} }} {catch_block}"""
+
+
+def ReLU(lhs, rhs):
+    lhs.reshape([prod(lhs.shape)])
+    for arg in rhs:
+        arg.reshape([prod(arg.shape)])
+    return test(lhs, rhs, "relu")
+
+
+def Plus(lhs, rhs):
+    lhs.reshape([prod(lhs.shape)])
+    for arg in rhs:
+        if prod(arg.shape) != prod(lhs.shape):
+            return ""
+        arg.reshape([prod(arg.shape)])
+    return test(lhs, rhs, "add")
+
+
+def Highway(lhs, rhs):
+    lhs.reshape([prod(lhs.shape)])
+    blocks = []
+    for arg in rhs:
+        if prod(arg.shape) != prod(lhs.shape):
+            return ""
+        arg.reshape([prod(arg.shape)])
+    block = test(lhs, rhs, "highway")
+    blocks.append(block)
+    return "\n".join(blocks)
+
+
+def LayerNormalization(lhs, rhs):
+    return test(lhs, rhs, "layer_norm")
+
+
+def Affine(lhs, rhs):
+    return test(lhs, rhs, "affine")
+
+
+def parse(t):
+    """Parse a tensor"""
+    tensor_id = t["id"]
+    save = t["save"]
+
+    name, dtype, shape, *_ = tensor_id.split()
+    return Tensor(name, dtype, shape, save)
+
+
+def emit(op, lhs_info, rhs_info):
+    lhs = parse(lhs_info)
+    rhs = [parse(arg) for arg in rhs_info]
+    return op(lhs, rhs)
+
+
+def Blocks(mapping, data):
+    ls = []
+    for entry in data:
+        fn = entry["fn"]
+        key = ":".join(filter(lambda x: x, fn.split(":")[2:-3]))
+        lhs = entry["lhs"]
+        rhs = entry.get("rhs", [])
+        if key in mapping:
+            op = mapping[key]
+            codeblock = emit(op, lhs, rhs)
+            if codeblock:
+                ls.append(codeblock)
+
+    return ls
+
+
+def main(blocks):
+    blocks = list(map(guard, blocks))
+    return textwrap.dedent(
+        """
+        #include "TestSuite.hh"
+        using namespace slimt; // NOLINT
+        int main(){{
+        {}
+        return 0;
+        }}""".format(
+            "\n\n".join(blocks)
+        )
+    )
+
+
+# Mappings from marian to slimt
+mapping = {
+    # "AffineNodeOp": Affine,
+    "ColsNodeOp": NoOp,
+    "ConstantNode": NoOp,
+    "cpu:integer:AffineNodeOp": NoOp,
+    "cpu:integer:DotNodeOp": NoOp,
+    "cpu:integer:PrepareANodeOp": NoOp,
+    
"cpu:integer:QuantMultNodeOp": NoOp, + "DotBatchedNodeOp": NoOp, + "GatherNodeOp": NoOp, + "HighwayNodeOp": Highway, + "LayerNormalizationOp": LayerNormalization, + "LogSoftmaxNodeOp": NoOp, + "NegNodeOp": NoOp, + "ParamNode": NoOp, + "PlusNodeOp": Plus, + "ReLUNodeOp": ReLU, + "RowsNodeOp": NoOp, + "ScalarAddNodeOp": NoOp, + "ScalarMultNodeOp": NoOp, + "SoftmaxNodeOp": NoOp, + "TransposeNodeOp": NoOp, +} + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--trace", type=str, required=True) + parser.add_argument("--output", type=str, required=True) + args = parser.parse_args() + data = None + with open(args.trace) as fp: + data = yaml.safe_load(fp) + + blocks = Blocks(mapping, data) + with open(args.output, "w") as output: + print(main(blocks), file=output)