From 7e2b42f756dcb3f85d38e36a1447d1bfe92759b8 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Mon, 29 Jul 2024 06:39:49 +0000
Subject: [PATCH 01/21] Add montgomery to vec_ops and example of that

---
 examples/c++/mont_vec_ops/CMakeLists.txt   |  25 +
 examples/c++/mont_vec_ops/README.md        |  42 ++
 examples/c++/mont_vec_ops/compile.debug.sh |  15 +
 examples/c++/mont_vec_ops/compile.sh       |  15 +
 examples/c++/mont_vec_ops/example.cu       | 607 +++++++++++++++++++++
 examples/c++/mont_vec_ops/run.sh           |   2 +
 icicle/include/vec_ops/vec_ops.cuh         |   2 +
 icicle/src/vec_ops/vec_ops.cu              |  70 ++-
 8 files changed, 766 insertions(+), 12 deletions(-)
 create mode 100644 examples/c++/mont_vec_ops/CMakeLists.txt
 create mode 100644 examples/c++/mont_vec_ops/README.md
 create mode 100755 examples/c++/mont_vec_ops/compile.debug.sh
 create mode 100755 examples/c++/mont_vec_ops/compile.sh
 create mode 100644 examples/c++/mont_vec_ops/example.cu
 create mode 100755 examples/c++/mont_vec_ops/run.sh

diff --git a/examples/c++/mont_vec_ops/CMakeLists.txt b/examples/c++/mont_vec_ops/CMakeLists.txt
new file mode 100644
index 000000000..f7048bb8a
--- /dev/null
+++ b/examples/c++/mont_vec_ops/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(example LANGUAGES CUDA CXX)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+add_executable(
+  example
+  example.cu
+)
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
diff --git a/examples/c++/mont_vec_ops/README.md b/examples/c++/mont_vec_ops/README.md
new file mode 100644
index 000000000..c99466f6d
--- /dev/null
+++ b/examples/c++/mont_vec_ops/README.md
@@ -0,0 +1,42 @@
+# Icicle example: Montgomery vector operations (mul, add, sub) for allpossible options:
+is_a_on_device
+is_b_on_device
+is_result_on_device
+is_in_montgomery_form
+(is_async isn't checked)
+
+## Best-Practices
+
+We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
+
+## Key-Takeaway
+
+`Icicle` accelerates multiplication operation `*` using [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)
+
+## Concise Usage Explanation
+
+Define field to be used, e. g.:
+
+```c++
+#include "api/bn254.h"
+```
+
+```c++
+using namespace bn254;
+typedef scalar_t T;
+```
+
+## Running the example
+
+- `cd` to your example directory
+- compile with `./compile.sh`
+- run with `./run.sh`
+
+## What's in the example
+
+1. Define the parameters for the example such as vector size 
+2. Generate random vectors on-host
+3. Copy them on-device
+4. Execute element-wise vector multiplication on-device
+5. Copy results on-host
+
diff --git a/examples/c++/mont_vec_ops/compile.debug.sh b/examples/c++/mont_vec_ops/compile.debug.sh
new file mode 100755
index 000000000..cebaa9451
--- /dev/null
+++ b/examples/c++/mont_vec_ops/compile.debug.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+mkdir -p build/example
+mkdir -p build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254
+cmake --build build/icicle -j
+
+# Configure and build the example application
+cmake -DCMAKE_BUILD_TYPE=Debug -S . -B build/example
+cmake --build build/example
diff --git a/examples/c++/mont_vec_ops/compile.sh b/examples/c++/mont_vec_ops/compile.sh
new file mode 100755
index 000000000..df73232a4
--- /dev/null
+++ b/examples/c++/mont_vec_ops/compile.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+mkdir -p build/example
+mkdir -p build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle -j
+
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
new file mode 100644
index 000000000..453d3f12f
--- /dev/null
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -0,0 +1,607 @@
+#include <iostream>
+#include <iomanip>
+#include <chrono>
+#include <nvml.h>
+
+#include "api/bn254.h"
+#include "vec_ops/vec_ops.cuh"
+#include <vec_ops/../../include/utils/mont.cuh>
+
+using namespace vec_ops;
+using namespace bn254;
+
+typedef scalar_t T;
+
+enum Op {
+  MUL,
+  ADD,
+  SUB,
+  LAST
+};
+
+// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617
+
+int vector_op(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config, Op op)
+{
+  cudaError_t err;
+  switch (op) {
+    case MUL: err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
+    case ADD: err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
+    case SUB: err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
+  }
+  // cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  return 0;
+}
+int vector_mul(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
+{
+  cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  return 0;
+}
+int vector_add(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
+{
+  cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  return 0;
+}
+int vector_sub(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
+{
+  cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  return 0;
+}
+
+int main(int argc, char** argv)
+{
+  const unsigned vector_size = 1 << 0;
+  const unsigned repetitions = 1 << 0;
+
+  cudaError_t err;
+  nvmlInit();
+  nvmlDevice_t device;
+  nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0
+  std::cout << "Icicle-Examples: vector mul / add / sub operations." << std::endl;
+  char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+  if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) {
+    std::cout << "GPU Model: " << name << std::endl;
+  } else {
+    std::cerr << "Failed to get GPU model name." << std::endl;
+  }
+  unsigned power_limit;
+  nvmlDeviceGetPowerManagementLimit(device, &power_limit);
+
+  std::cout << "Vector size: " << vector_size << std::endl;
+  std::cout << "Repetitions: " << repetitions << std::endl;
+  std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
+
+  unsigned int baseline_power;
+  nvmlDeviceGetPowerUsage(device, &baseline_power);
+  std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl;
+  unsigned baseline_temperature;
+  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) {
+    std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl;
+  } else {
+    std::cerr << "Failed to get GPU temperature." << std::endl;
+  }
+
+  // host data
+  std::cout << "Allocate memory for the input vectors (both normal and Montgomery presentation)" << std::endl;
+  T* host_in1_init = (T*)malloc(vector_size * sizeof(T));
+  T* host_in2_init = (T*)malloc(vector_size * sizeof(T));
+  std::cout << "Initializing vectors with normal presentation random data" << std::endl;
+  T::rand_host_many(host_in1_init, vector_size);
+  T::rand_host_many(host_in2_init, vector_size);
+  std::cout << "Allocate memory for the output vectors" << std::endl;
+  T* host_out = (T*)malloc(vector_size * sizeof(T));        // This memory will be used for the test output.
+  T* host_out_ref_mul = (T*)malloc(vector_size * sizeof(T));    // This memory will be used as a reference result for mul (will be compared to host_out content).
+  T* host_out_ref_add = (T*)malloc(vector_size * sizeof(T));    // This memory will be used as a reference result for add (will be compared to host_out content).
+  T* host_out_ref_sub = (T*)malloc(vector_size * sizeof(T));    // This memory will be used as a reference result for sub (will be compared to host_out content).
+  std::cout << "Initializing output vectors with random data" << std::endl;
+  T::rand_host_many(host_out, vector_size);
+  T::rand_host_many(host_out_ref_mul, vector_size);
+  T::rand_host_many(host_out_ref_add, vector_size);
+  T::rand_host_many(host_out_ref_sub, vector_size);
+  // device data
+  device_context::DeviceContext ctx = device_context::get_default_device_context();
+  T* device_in1;
+  T* device_in2;
+  T* device_out;
+
+  err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+
+  vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
+
+  //****************************************
+  // Test warn-up and reference output config. Reference output to be used to check if test passed or not.
+  //****************************************
+  // copy from host to device
+  err = cudaMemcpy(device_in1, host_in1_init, vector_size * sizeof(T), cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  err = cudaMemcpy(device_in2, host_in2_init, vector_size * sizeof(T), cudaMemcpyHostToDevice);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  std::cout << "Starting warm-up run" << std::endl;
+  // Warm-up loop
+  for ( int op = MUL; op != LAST; op++ ) {
+    for (int i = 0; i < repetitions; i++) {
+      // vector_mul(device_in1, device_in2, device_out, vector_size, ctx, config);
+      vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
+    }
+    switch (op) {
+      case MUL: err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
+      case ADD: err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
+      case SUB: err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
+    }
+  }
+  // copy the result from device to host_out_ref_mul to keep it for later comparisons.
+  // err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess) {
+    std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
+    return 0;
+  }
+  //****************************************
+  // End of test warn-up and reference output config.
+  //****************************************
+
+  std::cout << "Starting benchmarking" << std::endl;
+  unsigned power_before;
+  nvmlDeviceGetPowerUsage(device, &power_before);
+  std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl;
+  std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit
+            << " %" << std::endl;
+  unsigned temperature_before;
+  if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) {
+    std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl;
+  } else {
+    std::cerr << "Failed to get GPU temperature." << std::endl;
+  }
+
+  //*******************************************************
+  // Benchmark test:
+  // Loop for (mul, add, sub):
+  //   Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form):
+  //*******************************************************
+  T* host_in1 = (T*)malloc(vector_size * sizeof(T));  // This buffer is used to load the data from host_in1_init for the benchmark.
+  T* host_in2 = (T*)malloc(vector_size * sizeof(T));  // This buffer is used to load the data from host_in2_init for the benchmark.
+  // Test when the result is not in-place
+  for ( int op = MUL; op != LAST; op++ ) {
+    // for (int config_idx = 0; config_idx < 0; config_idx++) {
+    for (int config_idx = 0; config_idx < 16; config_idx++) {
+      std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
+      for (int i=0; i<vector_size; i++) {
+        host_in1[i] = host_in1_init[i];
+        host_in2[i] = host_in2_init[i];
+      }
+      config.is_a_on_device = (config_idx >> 3) & 0x1;
+      config.is_b_on_device = (config_idx >> 2) & 0x1;
+      config.is_result_on_device = (config_idx >> 1) & 0x1;
+      config.is_in_montgomery_form = (config_idx >> 0) & 0x1;
+
+      // Copy from host to device (copy again in order to be used later in the loop and device_inX was already overwritten by warmup.
+      if (config.is_a_on_device) {
+        if (config.is_in_montgomery_form) {
+          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));   // Convert in-place.
+        } else {    // Normal presentation.
+          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      } else {
+        if (config.is_in_montgomery_form) {   // Copy to device, cnvert to montgomery and copy back to host.
+          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
+          err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      }
+      if (config.is_b_on_device) {
+        if (config.is_in_montgomery_form) {
+          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));   // Convert in-place.
+        } else {
+          // Normal presentation.
+          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      } else {
+        if (config.is_in_montgomery_form) {   // Copy to device, cnvert to montgomery and copy back to host.
+          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
+          err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      }
+      CHK_IF_RETURN(cudaPeekAtLastError());
+
+      auto start_time = std::chrono::high_resolution_clock::now();
+      // Benchmark loop
+      for (int i = 0; i < repetitions; i++) {
+        switch (config_idx >> 1) {   // {is_a_on_device, is_b_on_device, is_result_on_device}
+          case 0b000: vector_op(host_in1,   host_in2,   host_out,   vector_size, ctx, config, (Op)op); break;
+          case 0b001: vector_op(host_in1,   host_in2,   device_out, vector_size, ctx, config, (Op)op); break;
+          case 0b010: vector_op(host_in1,   device_in2, host_out,   vector_size, ctx, config, (Op)op); break;
+          case 0b011: vector_op(host_in1,   device_in2, device_out, vector_size, ctx, config, (Op)op); break;
+          case 0b100: vector_op(device_in1, host_in2,   host_out,   vector_size, ctx, config, (Op)op); break;
+          case 0b101: vector_op(device_in1, host_in2,   device_out, vector_size, ctx, config, (Op)op); break;
+          case 0b110: vector_op(device_in1, device_in2, host_out,   vector_size, ctx, config, (Op)op); break;
+          case 0b111: vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break;
+        }
+        CHK_IF_RETURN(cudaPeekAtLastError());
+      }
+
+      auto end_time = std::chrono::high_resolution_clock::now();
+      auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
+      switch (op) {
+        case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result not in-place" << std::endl; break;
+        case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result not in-place" << std::endl; break;
+        case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result not in-place" << std::endl; break;
+      }
+
+      if (config.is_result_on_device) {   // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
+        if (config.is_in_montgomery_form) {    // Convert to normal from montgomery if needed.
+          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));   // Convert to normal in order to check vs. host_out_ref_mul.
+        }
+        err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+        if (err != cudaSuccess) {
+          std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
+          return 0;
+        }
+      } else {    // Data is not on device but it is in host_out.
+        if (config.is_in_montgomery_form) {    // host_out should be written to device, converted to mmontgomery and written back to host. Then compared vs. host_out_ref_mul.
+          err = cudaMemcpy(device_out, host_out, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));   // Convert to normal in order to check vs. host_out_ref_mul.          
+          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        } else {    // host_out could be compared vs. host_out_ref_mul as is.
+        }
+      }
+      //****************************************
+      // End of benchmark test.
+      //****************************************
+
+      //***********************************************
+      // Test result check
+      // Check is performed by executing the operation in a normal presentation
+      //   (located in in host_out_ref_mul) and comparing it with the
+      //   benchmark test result.
+      //***********************************************
+      int test_failed = 0;
+      // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl;
+      // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl;
+      switch (op) {
+        case MUL: for (int i=0; i<vector_size; i++) {
+                    if (host_out_ref_mul[i] != host_out[i]) { 
+                      std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl;
+                      test_failed = 1;
+                    }
+                  }
+        break;
+        case ADD: for (int i=0; i<vector_size; i++) {
+                    if (host_out_ref_add[i] != host_out[i]) { 
+                      std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl;
+                      test_failed = 1;
+                    }
+                  }
+        break;
+        case SUB: for (int i=0; i<vector_size; i++) {
+                    if (host_out_ref_sub[i] != host_out[i]) { 
+                      std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl;
+                      test_failed = 1;
+                    }
+                  }
+        break;
+      }
+      if (test_failed) {
+        // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl;
+        std::cout << "===>>> result is not in-place: " << std::endl;
+        std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
+        std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
+        std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
+        std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
+        exit(2);
+      }
+
+      unsigned power_after;
+      nvmlDeviceGetPowerUsage(device, &power_after);
+      std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
+      std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
+                << " %" << std::endl;
+      unsigned temperature_after;
+      if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
+        std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
+      } else {
+        std::cerr << "Failed to get GPU temperature." << std::endl;
+      }
+
+      // Report performance in GMPS: Giga Multiplications Per Second
+      double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
+      std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
+    }
+  }
+
+  // Test when the result is in-place
+  for ( int op = MUL; op != LAST; op++ ) {
+    for (int config_idx = 0; config_idx < 16; config_idx++) {
+      for (int i=0; i<vector_size; i++) {
+        host_in1[i] = host_in1_init[i];
+        host_in2[i] = host_in2_init[i];
+      }
+      config.is_a_on_device = (config_idx >> 4) & 0x1;
+      config.is_b_on_device = (config_idx >> 3) & 0x1;
+      config.is_result_on_device = (config_idx >> 2) & 0x1;
+      config.is_in_montgomery_form = (config_idx >> 1) & 0x1;
+      if (config.is_a_on_device ^ config.is_result_on_device == 1) {
+        continue;
+      }
+
+      // Copy from host to device (copy again in order to be used later in the loop and device_inX was already overwritten by warmup.
+      if (config.is_a_on_device) {
+        if (config.is_in_montgomery_form) {
+          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));   // Convert in-place.
+        } else {    // Normal presentation.
+          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      } else {
+        if (config.is_in_montgomery_form) {   // Copy to device, cnvert to montgomery and copy back to host.
+          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));
+          err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      }
+      if (config.is_b_on_device) {
+        if (config.is_in_montgomery_form) {
+          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));   // Convert in-place.
+        } else {
+          // Normal presentation.
+          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      } else {
+        if (config.is_in_montgomery_form) {   // Copy to device, cnvert to montgomery and copy back to host.
+          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));
+          err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      }
+      CHK_IF_RETURN(cudaPeekAtLastError());
+
+      auto start_time = std::chrono::high_resolution_clock::now();
+      // Benchmark loop
+      for (int i = 0; i < repetitions; i++) {
+        switch (config_idx >> 2) {   // {is_a_on_device, is_b_on_device, is_result_on_device}
+          case 0b000: vector_op(host_in1,   host_in2,   host_in1,   vector_size, ctx, config, (Op)op); break;
+          case 0b001: break;
+          case 0b010: vector_op(host_in1,   device_in2, host_in1,   vector_size, ctx, config, (Op)op); break;
+          case 0b011: break;
+          case 0b100: break;
+          case 0b101: vector_op(device_in1, host_in2,   device_in1, vector_size, ctx, config, (Op)op); break;
+          case 0b110: break;
+          case 0b111: vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); break;
+        }
+        CHK_IF_RETURN(cudaPeekAtLastError());
+      }
+
+      auto end_time = std::chrono::high_resolution_clock::now();
+      auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
+      switch (op) {
+        case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result in-place" << std::endl; break;
+        case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result in-place" << std::endl; break;
+        case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result in-place" << std::endl; break;
+      }
+
+      if (config.is_result_on_device) {   // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
+        if (config.is_in_montgomery_form) {    // Convert to normal from montgomery if needed.
+          CHK_IF_RETURN(mont::from_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));   // Convert to normal in order to check vs. host_out_ref_mul.
+        }
+        err = cudaMemcpy(host_out, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+        if (err != cudaSuccess) {
+          std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl;
+          return 0;
+        }
+      } else {    // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check.
+        if (config.is_in_montgomery_form) {    // host_out should be written to device, converted to mmontgomery and written back to host. Then compared vs. host_out_ref_mul.
+          err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));   // Convert to normal in order to check vs. host_out_ref_mul.          
+          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        } else {    // host_out could be compared vs. host_out_ref_mul as is.
+          err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }      
+          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
+        }
+      }
+      //****************************************
+      // End of benchmark test.
+      //****************************************
+
+      //***********************************************
+      // Test result check
+      // Check is performed by executing the operation in a normal presentation
+      //   (located in in host_out_ref_mul) and comparing it with the
+      //   benchmark test result.
+      //***********************************************
+      int test_failed = 0;
+      // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl;
+      // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl;
+      switch (op) {
+        case MUL: for (int i=0; i<vector_size; i++) {
+                    if (host_out_ref_mul[i] != host_out[i]) { 
+                      std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl;
+                      std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl;
+                      test_failed = 1;
+                    }
+                  }
+        break;
+        case ADD: for (int i=0; i<vector_size; i++) {
+                    if (host_out_ref_add[i] != host_out[i]) { 
+                      std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl;
+                      std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl;
+                      test_failed = 1;
+                    }
+                  }
+        break;
+        case SUB: for (int i=0; i<vector_size; i++) {
+                    if (host_out_ref_sub[i] != host_out[i]) { 
+                      std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl;
+                      std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl;
+                      test_failed = 1;
+                    }
+                  }
+        break;
+      }
+      if (test_failed) {
+        // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl;
+        std::cout << "===>>> result is in-place: " << std::endl;
+        std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
+        std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
+        std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
+        std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
+        std::cout << "host_out[0] = " << host_out[0] << std::endl;
+        exit(2);
+      }
+
+      unsigned power_after;
+      nvmlDeviceGetPowerUsage(device, &power_after);
+      std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
+      std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
+                << " %" << std::endl;
+      unsigned temperature_after;
+      if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
+        std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
+      } else {
+        std::cerr << "Failed to get GPU temperature." << std::endl;
+      }
+
+      // Report performance in GMPS: Giga Multiplications Per Second
+      double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
+      std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
+    }
+  }
+
+  // clean up and exit
+  free(host_in1_init);
+  free(host_in2_init);
+  free(host_in1);
+  free(host_in2);
+  free(host_out);
+  free(host_out_ref_mul);
+  cudaFree(device_in1);
+  cudaFree(device_in2);
+  cudaFree(device_out);
+  nvmlShutdown();
+  return 0;
+}
diff --git a/examples/c++/mont_vec_ops/run.sh b/examples/c++/mont_vec_ops/run.sh
new file mode 100755
index 000000000..01eca66ba
--- /dev/null
+++ b/examples/c++/mont_vec_ops/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example/example
diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh
index ee624a07f..89dad5610 100644
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -27,6 +27,7 @@ namespace vec_ops {
                     *   non-blocking and you'd need to synchronize it explicitly by running
                     *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
                     *   function will block the current CPU thread. */
+    bool is_in_montgomery_form;    /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false.  */
   };
 
   /**
@@ -42,6 +43,7 @@ namespace vec_ops {
       false, // is_b_on_device
       false, // is_result_on_device
       false, // is_async
+      false, // is_in_montgomery_form
     };
     return config;
   }
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index 9cfa85e60..7e7cf5e65 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -95,25 +95,62 @@ namespace vec_ops {
     E *d_result, *d_alloc_vec_a, *d_alloc_vec_b;
     E* d_vec_a;
     const E* d_vec_b;
+
+    int is_d_alloc_vec_a_allocated = 0;
     if (!config.is_a_on_device) {
-      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
-      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-      d_vec_a = d_alloc_vec_a;
+      if (config.is_in_montgomery_form) {
+        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
+        CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
+        CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
+        is_d_alloc_vec_a_allocated = 1;        
+        d_vec_a = d_alloc_vec_a;
+      } else {
+        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
+        CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
+        is_d_alloc_vec_a_allocated = 1;        
+        d_vec_a = d_alloc_vec_a;
+      }
     } else {
-      d_vec_a = vec_a;
+      if (config.is_in_montgomery_form) {
+        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));   // Allocate in order not to change the input.
+        CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
+        is_d_alloc_vec_a_allocated = 1;        
+        d_vec_a = d_alloc_vec_a;
+      } else {
+        d_vec_a = vec_a;
+      }
     }
 
+    int is_d_alloc_vec_b_allocated = 0;
     if (!config.is_b_on_device) {
-      CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
-      CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-      d_vec_b = d_alloc_vec_b;
+      if (config.is_in_montgomery_form) {
+        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
+        CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
+        CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
+        is_d_alloc_vec_b_allocated = 1;
+        d_vec_b = d_alloc_vec_b;
+      } else {
+        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
+        CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
+        is_d_alloc_vec_b_allocated = 1;        
+        d_vec_b = d_alloc_vec_b;
+      }
     } else {
-      d_vec_b = vec_b;
+      if (config.is_in_montgomery_form) {
+        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));   // Allocate in order not to change the input.
+        CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
+        is_d_alloc_vec_b_allocated = 1;        
+        d_vec_b = d_alloc_vec_b;
+      } else {
+        d_vec_b = vec_b;
+      }
     }
 
+    int is_d_result_allocated = 0;
     if (!config.is_result_on_device) {
       if (!is_in_place) {
         CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
+        is_d_result_allocated = 1;
       } else {
         d_result = d_vec_a;
       }
@@ -129,12 +166,21 @@ namespace vec_ops {
     Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);
 
     if (!config.is_result_on_device) {
-      CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
-      CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
+      if (config.is_in_montgomery_form) {
+        CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result));   // Convert in-place.
+        CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
+      } else {
+        CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
+      }
+    } else {
+      if (config.is_in_montgomery_form) {
+        CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result));   // Convert in-place.
+      }
     }
 
-    if (!config.is_a_on_device && !is_in_place) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
-    if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
+    if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
+    if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
+    if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); }
 
     if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));
 

From 118c82f82914b4d888ccd41f778ba6875c902834 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 30 Jul 2024 05:38:30 +0000
Subject: [PATCH 02/21] Fixed style by clang-format

---
 examples/c++/mont_vec_ops/CMakeLists.txt | 37 +++++++++---------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/examples/c++/mont_vec_ops/CMakeLists.txt b/examples/c++/mont_vec_ops/CMakeLists.txt
index f7048bb8a..1220da827 100644
--- a/examples/c++/mont_vec_ops/CMakeLists.txt
+++ b/examples/c++/mont_vec_ops/CMakeLists.txt
@@ -1,25 +1,14 @@
-cmake_minimum_required(VERSION 3.18)
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
-if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
-    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
-else()
-    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
-endif ()
-project(example LANGUAGES CUDA CXX)
-
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
-set(CMAKE_CUDA_FLAGS_RELEASE "")
-set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
-add_executable(
-  example
-  example.cu
-)
-target_include_directories(example PRIVATE "../../../icicle/include")
-target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
-find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
-target_link_libraries(example ${NVML_LIBRARY})
-set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+cmake_minimum_required(VERSION 3.18) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17)
+    set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+      set(CMAKE_CXX_STANDARD_REQUIRED TRUE) if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+        set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) else() set(CMAKE_CUDA_ARCHITECTURES native) #on 3.24 +
+  , on earlier it is ignored,
+  and the target is not passed endif() project(example LANGUAGES CUDA CXX)
 
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS_RELEASE "")
+      set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") add_executable(example example.cu)
+        target_include_directories(example PRIVATE "../../../icicle/include")
+          target_link_libraries(example ${CMAKE_SOURCE_DIR} / build / icicle / lib / libingo_field_bn254.a)
+            find_library(NVML_LIBRARY nvidia - ml PATHS / usr / local / cuda / targets / x86_64 - linux / lib / stubs /)
+              target_link_libraries(example ${NVML_LIBRARY})
+                set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

From 132abbb2d458db7068a6de8227b3fcf9aafba4c2 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 30 Jul 2024 05:39:32 +0000
Subject: [PATCH 03/21] Fixed style by clang-format

---
 examples/c++/mont_vec_ops/README.md        |   3 +-
 examples/c++/mont_vec_ops/compile.debug.sh |  27 +-
 examples/c++/mont_vec_ops/compile.sh       |  26 +-
 examples/c++/mont_vec_ops/example.cu       | 419 ++++++++++++++-------
 examples/c++/mont_vec_ops/run.sh           |   4 +-
 icicle/include/vec_ops/vec_ops.cuh         |   3 +-
 icicle/src/vec_ops/vec_ops.cu              |  20 +-
 7 files changed, 320 insertions(+), 182 deletions(-)
 mode change 100755 => 100644 examples/c++/mont_vec_ops/compile.debug.sh
 mode change 100755 => 100644 examples/c++/mont_vec_ops/compile.sh
 mode change 100755 => 100644 examples/c++/mont_vec_ops/run.sh

diff --git a/examples/c++/mont_vec_ops/README.md b/examples/c++/mont_vec_ops/README.md
index c99466f6d..736303872 100644
--- a/examples/c++/mont_vec_ops/README.md
+++ b/examples/c++/mont_vec_ops/README.md
@@ -1,4 +1,4 @@
-# Icicle example: Montgomery vector operations (mul, add, sub) for allpossible options:
+#Icicle example : Montgomery vector operations(mul, add, sub) for allpossible options:
 is_a_on_device
 is_b_on_device
 is_result_on_device
@@ -39,4 +39,3 @@ typedef scalar_t T;
 3. Copy them on-device
 4. Execute element-wise vector multiplication on-device
 5. Copy results on-host
-
diff --git a/examples/c++/mont_vec_ops/compile.debug.sh b/examples/c++/mont_vec_ops/compile.debug.sh
old mode 100755
new mode 100644
index cebaa9451..73d19bbd9
--- a/examples/c++/mont_vec_ops/compile.debug.sh
+++ b/examples/c++/mont_vec_ops/compile.debug.sh
@@ -1,15 +1,20 @@
-#!/bin/bash
+#!/ bin / bash
 
-# Exit immediately on error
-set -e
+#Exit immediately on error
+set -
+  e
 
-mkdir -p build/example
-mkdir -p build/icicle
+    mkdir -
+  p build / example mkdir -
+  p build / icicle
 
-# Configure and build Icicle
-cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254
-cmake --build build/icicle -j
+#Configure and build Icicle
+              cmake -
+  S../../../ icicle / -B build / icicle - DMSM =
+  OFF - DCMAKE_BUILD_TYPE = Debug - DCURVE = bn254 cmake-- build build / icicle -
+                                             j
 
-# Configure and build the example application
-cmake -DCMAKE_BUILD_TYPE=Debug -S . -B build/example
-cmake --build build/example
+#Configure and build the example application
+                                               cmake -
+                                             DCMAKE_BUILD_TYPE =
+                                               Debug - S.- B build / example cmake-- build build / example
diff --git a/examples/c++/mont_vec_ops/compile.sh b/examples/c++/mont_vec_ops/compile.sh
old mode 100755
new mode 100644
index df73232a4..cf145b290
--- a/examples/c++/mont_vec_ops/compile.sh
+++ b/examples/c++/mont_vec_ops/compile.sh
@@ -1,15 +1,19 @@
-#!/bin/bash
+#!/ bin / bash
 
-# Exit immediately on error
-set -e
+#Exit immediately on error
+set -
+  e
 
-mkdir -p build/example
-mkdir -p build/icicle
+    mkdir -
+  p build / example mkdir -
+  p build / icicle
 
-# Configure and build Icicle
-cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
-cmake --build build/icicle -j
+#Configure and build Icicle
+              cmake -
+  S../../../ icicle / -B build / icicle -
+  DMSM = OFF - DCMAKE_BUILD_TYPE = Release - DCURVE = bn254 cmake-- build build / icicle -
+                                                      j
 
-# Configure and build the example application
-cmake -S . -B build/example
-cmake --build build/example
+#Configure and build the example application
+                                                        cmake -
+                                                      S.- B build / example cmake-- build build / example
diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index 453d3f12f..8cf21e1fc 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -12,22 +12,30 @@ using namespace bn254;
 
 typedef scalar_t T;
 
-enum Op {
-  MUL,
-  ADD,
-  SUB,
-  LAST
-};
+enum Op { MUL, ADD, SUB, LAST };
 
 // bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617
 
-int vector_op(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config, Op op)
+int vector_op(
+  T* vec_a,
+  T* vec_b,
+  T* vec_result,
+  size_t n_elements,
+  device_context::DeviceContext ctx,
+  vec_ops::VecOpsConfig config,
+  Op op)
 {
   cudaError_t err;
   switch (op) {
-    case MUL: err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
-    case ADD: err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
-    case SUB: err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); break;
+  case MUL:
+    err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
+    break;
+  case ADD:
+    err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result);
+    break;
+  case SUB:
+    err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
+    break;
   }
   // cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
   if (err != cudaSuccess) {
@@ -36,7 +44,8 @@ int vector_op(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_conte
   }
   return 0;
 }
-int vector_mul(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
+int vector_mul(
+  T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
 {
   cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
   if (err != cudaSuccess) {
@@ -45,7 +54,8 @@ int vector_mul(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_cont
   }
   return 0;
 }
-int vector_add(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
+int vector_add(
+  T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
 {
   cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result);
   if (err != cudaSuccess) {
@@ -54,7 +64,8 @@ int vector_add(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_cont
   }
   return 0;
 }
-int vector_sub(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
+int vector_sub(
+  T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
 {
   cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
   if (err != cudaSuccess) {
@@ -105,10 +116,16 @@ int main(int argc, char** argv)
   T::rand_host_many(host_in1_init, vector_size);
   T::rand_host_many(host_in2_init, vector_size);
   std::cout << "Allocate memory for the output vectors" << std::endl;
-  T* host_out = (T*)malloc(vector_size * sizeof(T));        // This memory will be used for the test output.
-  T* host_out_ref_mul = (T*)malloc(vector_size * sizeof(T));    // This memory will be used as a reference result for mul (will be compared to host_out content).
-  T* host_out_ref_add = (T*)malloc(vector_size * sizeof(T));    // This memory will be used as a reference result for add (will be compared to host_out content).
-  T* host_out_ref_sub = (T*)malloc(vector_size * sizeof(T));    // This memory will be used as a reference result for sub (will be compared to host_out content).
+  T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output.
+  T* host_out_ref_mul = (T*)malloc(
+    vector_size *
+    sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content).
+  T* host_out_ref_add = (T*)malloc(
+    vector_size *
+    sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content).
+  T* host_out_ref_sub = (T*)malloc(
+    vector_size *
+    sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content).
   std::cout << "Initializing output vectors with random data" << std::endl;
   T::rand_host_many(host_out, vector_size);
   T::rand_host_many(host_out_ref_mul, vector_size);
@@ -154,15 +171,21 @@ int main(int argc, char** argv)
   }
   std::cout << "Starting warm-up run" << std::endl;
   // Warm-up loop
-  for ( int op = MUL; op != LAST; op++ ) {
+  for (int op = MUL; op != LAST; op++) {
     for (int i = 0; i < repetitions; i++) {
       // vector_mul(device_in1, device_in2, device_out, vector_size, ctx, config);
       vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
     }
     switch (op) {
-      case MUL: err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
-      case ADD: err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
-      case SUB: err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break;
+    case MUL:
+      err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+      break;
+    case ADD:
+      err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+      break;
+    case SUB:
+      err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+      break;
     }
   }
   // copy the result from device to host_out_ref_mul to keep it for later comparisons.
@@ -193,14 +216,16 @@ int main(int argc, char** argv)
   // Loop for (mul, add, sub):
   //   Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form):
   //*******************************************************
-  T* host_in1 = (T*)malloc(vector_size * sizeof(T));  // This buffer is used to load the data from host_in1_init for the benchmark.
-  T* host_in2 = (T*)malloc(vector_size * sizeof(T));  // This buffer is used to load the data from host_in2_init for the benchmark.
+  T* host_in1 =
+    (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark.
+  T* host_in2 =
+    (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark.
   // Test when the result is not in-place
-  for ( int op = MUL; op != LAST; op++ ) {
+  for (int op = MUL; op != LAST; op++) {
     // for (int config_idx = 0; config_idx < 0; config_idx++) {
     for (int config_idx = 0; config_idx < 16; config_idx++) {
       std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
-      for (int i=0; i<vector_size; i++) {
+      for (int i = 0; i < vector_size; i++) {
         host_in1[i] = host_in1_init[i];
         host_in2[i] = host_in2_init[i];
       }
@@ -209,25 +234,30 @@ int main(int argc, char** argv)
       config.is_result_on_device = (config_idx >> 1) & 0x1;
       config.is_in_montgomery_form = (config_idx >> 0) & 0x1;
 
-      // Copy from host to device (copy again in order to be used later in the loop and device_inX was already overwritten by warmup.
+      // Copy from host to device (copy again in order to be used later in the loop and device_inX was already
+      // overwritten by warmup.
       if (config.is_a_on_device) {
         if (config.is_in_montgomery_form) {
-          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          err =
+            cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));   // Convert in-place.
-        } else {    // Normal presentation.
-          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          CHK_IF_RETURN(
+            mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
+        } else {                                                                          // Normal presentation.
+          err =
+            cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
         }
       } else {
-        if (config.is_in_montgomery_form) {   // Copy to device, cnvert to montgomery and copy back to host.
-          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+        if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
+          err =
+            cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
@@ -242,23 +272,27 @@ int main(int argc, char** argv)
       }
       if (config.is_b_on_device) {
         if (config.is_in_montgomery_form) {
-          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          err =
+            cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));   // Convert in-place.
+          CHK_IF_RETURN(
+            mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
         } else {
           // Normal presentation.
-          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          err =
+            cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
         }
       } else {
-        if (config.is_in_montgomery_form) {   // Copy to device, cnvert to montgomery and copy back to host.
-          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+        if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
+          err =
+            cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
             return 0;
@@ -276,15 +310,31 @@ int main(int argc, char** argv)
       auto start_time = std::chrono::high_resolution_clock::now();
       // Benchmark loop
       for (int i = 0; i < repetitions; i++) {
-        switch (config_idx >> 1) {   // {is_a_on_device, is_b_on_device, is_result_on_device}
-          case 0b000: vector_op(host_in1,   host_in2,   host_out,   vector_size, ctx, config, (Op)op); break;
-          case 0b001: vector_op(host_in1,   host_in2,   device_out, vector_size, ctx, config, (Op)op); break;
-          case 0b010: vector_op(host_in1,   device_in2, host_out,   vector_size, ctx, config, (Op)op); break;
-          case 0b011: vector_op(host_in1,   device_in2, device_out, vector_size, ctx, config, (Op)op); break;
-          case 0b100: vector_op(device_in1, host_in2,   host_out,   vector_size, ctx, config, (Op)op); break;
-          case 0b101: vector_op(device_in1, host_in2,   device_out, vector_size, ctx, config, (Op)op); break;
-          case 0b110: vector_op(device_in1, device_in2, host_out,   vector_size, ctx, config, (Op)op); break;
-          case 0b111: vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break;
+        switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device}
+        case 0b000:
+          vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b001:
+          vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b010:
+          vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b011:
+          vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b100:
+          vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b101:
+          vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b110:
+          vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b111:
+          vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
+          break;
         }
         CHK_IF_RETURN(cudaPeekAtLastError());
       }
@@ -292,34 +342,54 @@ int main(int argc, char** argv)
       auto end_time = std::chrono::high_resolution_clock::now();
       auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
       switch (op) {
-        case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result not in-place" << std::endl; break;
-        case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result not in-place" << std::endl; break;
-        case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result not in-place" << std::endl; break;
+      case MUL:
+        std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx "
+                  << config_idx << " and result not in-place" << std::endl;
+        break;
+      case ADD:
+        std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx "
+                  << config_idx << " and result not in-place" << std::endl;
+        break;
+      case SUB:
+        std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx "
+                  << config_idx << " and result not in-place" << std::endl;
+        break;
       }
 
-      if (config.is_result_on_device) {   // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
-        if (config.is_in_montgomery_form) {    // Convert to normal from montgomery if needed.
-          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));   // Convert to normal in order to check vs. host_out_ref_mul.
+      if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
+        if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed.
+          CHK_IF_RETURN(mont::from_montgomery(
+            device_out, vector_size, config.ctx.stream,
+            device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
         }
-        err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+        err = cudaMemcpy(
+          host_out, device_out, vector_size * sizeof(T),
+          cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
         if (err != cudaSuccess) {
           std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
           return 0;
         }
-      } else {    // Data is not on device but it is in host_out.
-        if (config.is_in_montgomery_form) {    // host_out should be written to device, converted to mmontgomery and written back to host. Then compared vs. host_out_ref_mul.
-          err = cudaMemcpy(device_out, host_out, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy to host_out in order to check vs. host_out_ref_mul.
+      } else {                              // Data is not on device but it is in host_out.
+        if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and
+                                            // written back to host. Then compared vs. host_out_ref_mul.
+          err = cudaMemcpy(
+            device_out, host_out, vector_size * sizeof(T),
+            cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));   // Convert to normal in order to check vs. host_out_ref_mul.          
-          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          CHK_IF_RETURN(mont::from_montgomery(
+            device_out, vector_size, config.ctx.stream,
+            device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
+          err = cudaMemcpy(
+            host_out, device_out, vector_size * sizeof(T),
+            cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-        } else {    // host_out could be compared vs. host_out_ref_mul as is.
+        } else { // host_out could be compared vs. host_out_ref_mul as is.
         }
       }
       //****************************************
@@ -336,30 +406,37 @@ int main(int argc, char** argv)
       // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl;
       // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl;
       switch (op) {
-        case MUL: for (int i=0; i<vector_size; i++) {
-                    if (host_out_ref_mul[i] != host_out[i]) { 
-                      std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl;
-                      test_failed = 1;
-                    }
-                  }
+      case MUL:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_mul[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            test_failed = 1;
+          }
+        }
         break;
-        case ADD: for (int i=0; i<vector_size; i++) {
-                    if (host_out_ref_add[i] != host_out[i]) { 
-                      std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl;
-                      test_failed = 1;
-                    }
-                  }
+      case ADD:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_add[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            test_failed = 1;
+          }
+        }
         break;
-        case SUB: for (int i=0; i<vector_size; i++) {
-                    if (host_out_ref_sub[i] != host_out[i]) { 
-                      std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl;
-                      test_failed = 1;
-                    }
-                  }
+      case SUB:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_sub[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            test_failed = 1;
+          }
+        }
         break;
       }
       if (test_failed) {
-        // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl;
+        // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" <<
+        // std::endl;
         std::cout << "===>>> result is not in-place: " << std::endl;
         std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
         std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
@@ -371,8 +448,8 @@ int main(int argc, char** argv)
       unsigned power_after;
       nvmlDeviceGetPowerUsage(device, &power_after);
       std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
-      std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
-                << " %" << std::endl;
+      std::cout << "Power utilization: " << std::fixed << std::setprecision(1)
+                << (float)100.0 * power_after / power_limit << " %" << std::endl;
       unsigned temperature_after;
       if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
         std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
@@ -387,9 +464,9 @@ int main(int argc, char** argv)
   }
 
   // Test when the result is in-place
-  for ( int op = MUL; op != LAST; op++ ) {
+  for (int op = MUL; op != LAST; op++) {
     for (int config_idx = 0; config_idx < 16; config_idx++) {
-      for (int i=0; i<vector_size; i++) {
+      for (int i = 0; i < vector_size; i++) {
         host_in1[i] = host_in1_init[i];
         host_in2[i] = host_in2_init[i];
       }
@@ -397,29 +474,32 @@ int main(int argc, char** argv)
       config.is_b_on_device = (config_idx >> 3) & 0x1;
       config.is_result_on_device = (config_idx >> 2) & 0x1;
       config.is_in_montgomery_form = (config_idx >> 1) & 0x1;
-      if (config.is_a_on_device ^ config.is_result_on_device == 1) {
-        continue;
-      }
+      if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; }
 
-      // Copy from host to device (copy again in order to be used later in the loop and device_inX was already overwritten by warmup.
+      // Copy from host to device (copy again in order to be used later in the loop and device_inX was already
+      // overwritten by warmup.
       if (config.is_a_on_device) {
         if (config.is_in_montgomery_form) {
-          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          err =
+            cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-          CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));   // Convert in-place.
-        } else {    // Normal presentation.
-          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          CHK_IF_RETURN(
+            mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
+        } else {                                                                          // Normal presentation.
+          err =
+            cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
         }
       } else {
-        if (config.is_in_montgomery_form) {   // Copy to device, cnvert to montgomery and copy back to host.
-          err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+        if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
+          err =
+            cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
@@ -434,23 +514,27 @@ int main(int argc, char** argv)
       }
       if (config.is_b_on_device) {
         if (config.is_in_montgomery_form) {
-          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          err =
+            cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-          CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2));   // Convert in-place.
+          CHK_IF_RETURN(
+            mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
         } else {
           // Normal presentation.
-          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+          err =
+            cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
         }
       } else {
-        if (config.is_in_montgomery_form) {   // Copy to device, cnvert to montgomery and copy back to host.
-          err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy data to device.
+        if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
+          err =
+            cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl;
             return 0;
@@ -468,15 +552,27 @@ int main(int argc, char** argv)
       auto start_time = std::chrono::high_resolution_clock::now();
       // Benchmark loop
       for (int i = 0; i < repetitions; i++) {
-        switch (config_idx >> 2) {   // {is_a_on_device, is_b_on_device, is_result_on_device}
-          case 0b000: vector_op(host_in1,   host_in2,   host_in1,   vector_size, ctx, config, (Op)op); break;
-          case 0b001: break;
-          case 0b010: vector_op(host_in1,   device_in2, host_in1,   vector_size, ctx, config, (Op)op); break;
-          case 0b011: break;
-          case 0b100: break;
-          case 0b101: vector_op(device_in1, host_in2,   device_in1, vector_size, ctx, config, (Op)op); break;
-          case 0b110: break;
-          case 0b111: vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); break;
+        switch (config_idx >> 2) { // {is_a_on_device, is_b_on_device, is_result_on_device}
+        case 0b000:
+          vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b001:
+          break;
+        case 0b010:
+          vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b011:
+          break;
+        case 0b100:
+          break;
+        case 0b101:
+          vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);
+          break;
+        case 0b110:
+          break;
+        case 0b111:
+          vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op);
+          break;
         }
         CHK_IF_RETURN(cudaPeekAtLastError());
       }
@@ -484,40 +580,64 @@ int main(int argc, char** argv)
       auto end_time = std::chrono::high_resolution_clock::now();
       auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
       switch (op) {
-        case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result in-place" << std::endl; break;
-        case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result in-place" << std::endl; break;
-        case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result in-place" << std::endl; break;
+      case MUL:
+        std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx "
+                  << config_idx << " and result in-place" << std::endl;
+        break;
+      case ADD:
+        std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx "
+                  << config_idx << " and result in-place" << std::endl;
+        break;
+      case SUB:
+        std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx "
+                  << config_idx << " and result in-place" << std::endl;
+        break;
       }
 
-      if (config.is_result_on_device) {   // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
-        if (config.is_in_montgomery_form) {    // Convert to normal from montgomery if needed.
-          CHK_IF_RETURN(mont::from_montgomery(device_in1, vector_size, config.ctx.stream, device_in1));   // Convert to normal in order to check vs. host_out_ref_mul.
+      if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
+        if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed.
+          CHK_IF_RETURN(mont::from_montgomery(
+            device_in1, vector_size, config.ctx.stream,
+            device_in1)); // Convert to normal in order to check vs. host_out_ref_mul.
         }
-        err = cudaMemcpy(host_out, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+        err = cudaMemcpy(
+          host_out, device_in1, vector_size * sizeof(T),
+          cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
         if (err != cudaSuccess) {
           std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl;
           return 0;
         }
-      } else {    // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check.
-        if (config.is_in_montgomery_form) {    // host_out should be written to device, converted to mmontgomery and written back to host. Then compared vs. host_out_ref_mul.
-          err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy to host_out in order to check vs. host_out_ref_mul.
+      } else { // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check.
+        if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and
+                                            // written back to host. Then compared vs. host_out_ref_mul.
+          err = cudaMemcpy(
+            device_out, host_in1, vector_size * sizeof(T),
+            cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-          CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out));   // Convert to normal in order to check vs. host_out_ref_mul.          
-          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          CHK_IF_RETURN(mont::from_montgomery(
+            device_out, vector_size, config.ctx.stream,
+            device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
+          err = cudaMemcpy(
+            host_out, device_out, vector_size * sizeof(T),
+            cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-        } else {    // host_out could be compared vs. host_out_ref_mul as is.
-          err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice);    // Copy to host_out in order to check vs. host_out_ref_mul.
+        } else { // host_out could be compared vs. host_out_ref_mul as is.
+          err = cudaMemcpy(
+            device_out, host_in1, vector_size * sizeof(T),
+            cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
-          }      
-          err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);    // Copy to host_out in order to check vs. host_out_ref_mul.
+          }
+          err = cudaMemcpy(
+            host_out, device_out, vector_size * sizeof(T),
+            cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
@@ -538,33 +658,40 @@ int main(int argc, char** argv)
       // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl;
       // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl;
       switch (op) {
-        case MUL: for (int i=0; i<vector_size; i++) {
-                    if (host_out_ref_mul[i] != host_out[i]) { 
-                      std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl;
-                      std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl;
-                      test_failed = 1;
-                    }
-                  }
+      case MUL:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_mul[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl;
+            test_failed = 1;
+          }
+        }
         break;
-        case ADD: for (int i=0; i<vector_size; i++) {
-                    if (host_out_ref_add[i] != host_out[i]) { 
-                      std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl;
-                      std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl;
-                      test_failed = 1;
-                    }
-                  }
+      case ADD:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_add[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl;
+            test_failed = 1;
+          }
+        }
         break;
-        case SUB: for (int i=0; i<vector_size; i++) {
-                    if (host_out_ref_sub[i] != host_out[i]) { 
-                      std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl;
-                      std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl;
-                      test_failed = 1;
-                    }
-                  }
+      case SUB:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_sub[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl;
+            test_failed = 1;
+          }
+        }
         break;
       }
       if (test_failed) {
-        // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl;
+        // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" <<
+        // std::endl;
         std::cout << "===>>> result is in-place: " << std::endl;
         std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
         std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
@@ -577,8 +704,8 @@ int main(int argc, char** argv)
       unsigned power_after;
       nvmlDeviceGetPowerUsage(device, &power_after);
       std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl;
-      std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit
-                << " %" << std::endl;
+      std::cout << "Power utilization: " << std::fixed << std::setprecision(1)
+                << (float)100.0 * power_after / power_limit << " %" << std::endl;
       unsigned temperature_after;
       if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) {
         std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl;
diff --git a/examples/c++/mont_vec_ops/run.sh b/examples/c++/mont_vec_ops/run.sh
old mode 100755
new mode 100644
index 01eca66ba..11d3275cc
--- a/examples/c++/mont_vec_ops/run.sh
+++ b/examples/c++/mont_vec_ops/run.sh
@@ -1,2 +1,2 @@
-#!/bin/bash
-./build/example/example
+#!/ bin / bash
+./ build / example / example
diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh
index 89dad5610..be2ab4ba3 100644
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -27,7 +27,8 @@ namespace vec_ops {
                     *   non-blocking and you'd need to synchronize it explicitly by running
                     *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
                     *   function will block the current CPU thread. */
-    bool is_in_montgomery_form;    /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false.  */
+    bool
+      is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false.  */
   };
 
   /**
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index 7e7cf5e65..9883d393c 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -102,19 +102,20 @@ namespace vec_ops {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
-        is_d_alloc_vec_a_allocated = 1;        
+        is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
       } else {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-        is_d_alloc_vec_a_allocated = 1;        
+        is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
       }
     } else {
       if (config.is_in_montgomery_form) {
-        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));   // Allocate in order not to change the input.
+        CHK_IF_RETURN(cudaMallocAsync(
+          &d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
         CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
-        is_d_alloc_vec_a_allocated = 1;        
+        is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
       } else {
         d_vec_a = vec_a;
@@ -132,14 +133,15 @@ namespace vec_ops {
       } else {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-        is_d_alloc_vec_b_allocated = 1;        
+        is_d_alloc_vec_b_allocated = 1;
         d_vec_b = d_alloc_vec_b;
       }
     } else {
       if (config.is_in_montgomery_form) {
-        CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));   // Allocate in order not to change the input.
+        CHK_IF_RETURN(cudaMallocAsync(
+          &d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
         CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
-        is_d_alloc_vec_b_allocated = 1;        
+        is_d_alloc_vec_b_allocated = 1;
         d_vec_b = d_alloc_vec_b;
       } else {
         d_vec_b = vec_b;
@@ -167,14 +169,14 @@ namespace vec_ops {
 
     if (!config.is_result_on_device) {
       if (config.is_in_montgomery_form) {
-        CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result));   // Convert in-place.
+        CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place.
         CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
       } else {
         CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
       }
     } else {
       if (config.is_in_montgomery_form) {
-        CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result));   // Convert in-place.
+        CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place.
       }
     }
 

From 9e452c47d0551956bc4421d7ec1202791a6a45ac Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Wed, 31 Jul 2024 09:35:53 +0000
Subject: [PATCH 04/21] Add montgomery config in rust and golang files.

---
 examples/c++/mont_vec_ops/CMakeLists.txt     |  36 +-
 examples/c++/mont_vec_ops/compile.debug.sh   |  27 +-
 examples/c++/mont_vec_ops/compile.sh         |  26 +-
 examples/c++/mont_vec_ops/run.sh             |   4 +-
 wrappers/golang/core/vec_ops.go              | 140 +++---
 wrappers/golang/core/vec_ops_test.go         |  32 +-
 wrappers/rust/icicle-core/src/vec_ops/mod.rs | 504 +++++++++----------
 7 files changed, 367 insertions(+), 402 deletions(-)
 mode change 100644 => 100755 examples/c++/mont_vec_ops/compile.debug.sh
 mode change 100644 => 100755 examples/c++/mont_vec_ops/compile.sh
 mode change 100644 => 100755 examples/c++/mont_vec_ops/run.sh

diff --git a/examples/c++/mont_vec_ops/CMakeLists.txt b/examples/c++/mont_vec_ops/CMakeLists.txt
index 1220da827..f3170e206 100644
--- a/examples/c++/mont_vec_ops/CMakeLists.txt
+++ b/examples/c++/mont_vec_ops/CMakeLists.txt
@@ -1,14 +1,24 @@
-cmake_minimum_required(VERSION 3.18) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17)
-    set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
-      set(CMAKE_CXX_STANDARD_REQUIRED TRUE) if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
-        set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) else() set(CMAKE_CUDA_ARCHITECTURES native) #on 3.24 +
-  , on earlier it is ignored,
-  and the target is not passed endif() project(example LANGUAGES CUDA CXX)
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(example LANGUAGES CUDA CXX)
 
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS_RELEASE "")
-      set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") add_executable(example example.cu)
-        target_include_directories(example PRIVATE "../../../icicle/include")
-          target_link_libraries(example ${CMAKE_SOURCE_DIR} / build / icicle / lib / libingo_field_bn254.a)
-            find_library(NVML_LIBRARY nvidia - ml PATHS / usr / local / cuda / targets / x86_64 - linux / lib / stubs /)
-              target_link_libraries(example ${NVML_LIBRARY})
-                set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+add_executable(
+  example
+  example.cu
+)
+target_include_directories(example PRIVATE "../../../icicle/include")
+target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
diff --git a/examples/c++/mont_vec_ops/compile.debug.sh b/examples/c++/mont_vec_ops/compile.debug.sh
old mode 100644
new mode 100755
index 73d19bbd9..c1d6bf932
--- a/examples/c++/mont_vec_ops/compile.debug.sh
+++ b/examples/c++/mont_vec_ops/compile.debug.sh
@@ -1,20 +1,15 @@
-#!/ bin / bash
+#!/bin/bash
 
-#Exit immediately on error
-set -
-  e
+# Exit immediately on error
+set -e
 
-    mkdir -
-  p build / example mkdir -
-  p build / icicle
+mkdir -p build/example
+mkdir -p build/icicle
 
-#Configure and build Icicle
-              cmake -
-  S../../../ icicle / -B build / icicle - DMSM =
-  OFF - DCMAKE_BUILD_TYPE = Debug - DCURVE = bn254 cmake-- build build / icicle -
-                                             j
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254
+cmake --build build/icicle -j
 
-#Configure and build the example application
-                                               cmake -
-                                             DCMAKE_BUILD_TYPE =
-                                               Debug - S.- B build / example cmake-- build build / example
+# Configure and build the example application
+cmake -DCMAKE_BUILD_TYPE=Debug -S. -B build/example
+cmake --build build/example
diff --git a/examples/c++/mont_vec_ops/compile.sh b/examples/c++/mont_vec_ops/compile.sh
old mode 100644
new mode 100755
index cf145b290..de35c62da
--- a/examples/c++/mont_vec_ops/compile.sh
+++ b/examples/c++/mont_vec_ops/compile.sh
@@ -1,19 +1,15 @@
-#!/ bin / bash
+#!/bin/bash
 
-#Exit immediately on error
-set -
-  e
+# Exit immediately on error
+set -e
 
-    mkdir -
-  p build / example mkdir -
-  p build / icicle
+mkdir -p build/example
+mkdir -p build/icicle
 
-#Configure and build Icicle
-              cmake -
-  S../../../ icicle / -B build / icicle -
-  DMSM = OFF - DCMAKE_BUILD_TYPE = Release - DCURVE = bn254 cmake-- build build / icicle -
-                                                      j
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254
+cmake --build build/icicle
 
-#Configure and build the example application
-                                                        cmake -
-                                                      S.- B build / example cmake-- build build / example
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
\ No newline at end of file
diff --git a/examples/c++/mont_vec_ops/run.sh b/examples/c++/mont_vec_ops/run.sh
old mode 100644
new mode 100755
index 11d3275cc..88073af8b
--- a/examples/c++/mont_vec_ops/run.sh
+++ b/examples/c++/mont_vec_ops/run.sh
@@ -1,2 +1,2 @@
-#!/ bin / bash
-./ build / example / example
+#! /bin/bash
+./build/example/example
diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go
index 9ea4681ed..d589056c4 100644
--- a/wrappers/golang/core/vec_ops.go
+++ b/wrappers/golang/core/vec_ops.go
@@ -1,33 +1,29 @@
 package core
 
-import (
-	"fmt"
-	"unsafe"
+import("fmt"
+       "unsafe"
 
-	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
-)
+       cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime")
 
-type VecOps int
+  type VecOps int
 
-const (
-	Sub VecOps = iota
-	Add
-	Mul
-)
+  const(Sub VecOps = iota Add Mul)
 
-type VecOpsConfig struct {
-	/*Details related to the device such as its id and stream. */
-	Ctx cr.DeviceContext
-	/* True if `a` is on device and false if it is not. Default value: false. */
-	isAOnDevice bool
-	/* True if `b` is on device and false if it is not. Default value: false. */
-	isBOnDevice bool
-	/* If true, output is preserved on device, otherwise on host. Default value: false. */
-	isResultOnDevice bool
-	/* Whether to run the vector operations asynchronously. If set to `true`, the function will be
-	*  non-blocking and you'll need to synchronize it explicitly by calling
-	*  `SynchronizeStream`. If set to false, the function will block the current CPU thread. */
-	IsAsync bool
+    type VecOpsConfig struct {
+  /*Details related to the device such as its id and stream. */
+  Ctx cr.DeviceContext
+    /* True if `a` is on device and false if it is not. Default value: false. */
+    isAOnDevice bool
+      /* True if `b` is on device and false if it is not. Default value: false. */
+      isBOnDevice bool
+        /* If true, output is preserved on device, otherwise on host. Default value: false. */
+        isResultOnDevice bool
+          /* Whether to run the vector operations asynchronously. If set to `true`, the function will be
+           *  non-blocking and you'll need to synchronize it explicitly by calling
+           *  `SynchronizeStream`. If set to false, the function will block the current CPU thread. */
+          IsAsync bool
+            /* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */
+            IsInMontgomeryForm bool
 }
 
 /**
@@ -42,69 +38,57 @@ func DefaultVecOpsConfig() VecOpsConfig {
 		false, // isBOnDevice
 		false, // isResultOnDevice
 		false, // IsAsync
+		false, // IsInMontgomeryForm
 	}
 
 	return config
 }
 
-func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, int) {
-	aLen, bLen, outLen := a.Len(), b.Len(), out.Len()
-	if aLen != bLen {
-		errorString := fmt.Sprintf(
-			"a and b vector lengths %d; %d are not equal",
-			aLen,
-			bLen,
-		)
-		panic(errorString)
-	}
-	if aLen != outLen {
-		errorString := fmt.Sprintf(
-			"a and out vector lengths %d; %d are not equal",
-			aLen,
-			outLen,
-		)
-		panic(errorString)
-	}
+func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, int)
+{
+  aLen, bLen, outLen : = a.Len(), b.Len(), out.Len() if aLen != bLen
+  {
+  errorString:
+    = fmt.Sprintf("a and b vector lengths %d; %d are not equal", aLen, bLen, ) panic(errorString)
+  }
+  if aLen
+    != outLen
+    {
+    errorString:
+      = fmt.Sprintf("a and out vector lengths %d; %d are not equal", aLen, outLen, ) panic(errorString)
+    }
 
-	if a.IsOnDevice() {
-		a.(DeviceSlice).CheckDevice()
-	}
-	if b.IsOnDevice() {
-		b.(DeviceSlice).CheckDevice()
-	}
-	if out.IsOnDevice() {
-		out.(DeviceSlice).CheckDevice()
-	}
+  if a
+    .IsOnDevice() { a.(DeviceSlice).CheckDevice() }
+  if b
+    .IsOnDevice() { b.(DeviceSlice).CheckDevice() }
+  if out
+    .IsOnDevice(){out.(DeviceSlice).CheckDevice()}
 
-	cfg.isAOnDevice = a.IsOnDevice()
-	cfg.isBOnDevice = b.IsOnDevice()
-	cfg.isResultOnDevice = out.IsOnDevice()
+    cfg.isAOnDevice = a.IsOnDevice() cfg.isBOnDevice = b.IsOnDevice() cfg.isResultOnDevice =
+      out
+        .IsOnDevice()
 
-	return a.AsUnsafePointer(), b.AsUnsafePointer(), out.AsUnsafePointer(), unsafe.Pointer(cfg), a.Len()
+          return a.AsUnsafePointer(),
+    b.AsUnsafePointer(), out.AsUnsafePointer(), unsafe.Pointer(cfg), a.Len()
 }
 
-func TransposeCheck(in, out HostOrDeviceSlice, onDevice bool) {
-	inLen, outLen := in.Len(), out.Len()
+func TransposeCheck(in, out HostOrDeviceSlice, onDevice bool)
+{
+  inLen, outLen : = in.Len(),
+                out.Len()
 
-	if inLen != outLen {
-		errorString := fmt.Sprintf(
-			"in and out vector lengths %d; %d are not equal",
-			inLen,
-			outLen,
-		)
-		panic(errorString)
-	}
-	if (onDevice != in.IsOnDevice()) || (onDevice != out.IsOnDevice()) {
-		errorString := fmt.Sprintf(
-			"onDevice is set to %t, but in.IsOnDevice():%t and out.IsOnDevice():%t",
-			onDevice,
-			in.IsOnDevice(),
-			out.IsOnDevice(),
-		)
-		panic(errorString)
-	}
-	if onDevice {
-		in.(DeviceSlice).CheckDevice()
-		out.(DeviceSlice).CheckDevice()
-	}
+                    if inLen != outLen
+  {
+  errorString:
+    = fmt.Sprintf("in and out vector lengths %d; %d are not equal", inLen, outLen, ) panic(errorString)
+  }
+  if (onDevice != in.IsOnDevice()) || (onDevice != out.IsOnDevice())
+    {
+    errorString:
+      = fmt.Sprintf(
+        "onDevice is set to %t, but in.IsOnDevice():%t and out.IsOnDevice():%t", onDevice, in.IsOnDevice(),
+        out.IsOnDevice(), ) panic(errorString)
+    }
+  if onDevice { in.(DeviceSlice).CheckDevice() out.(DeviceSlice).CheckDevice() }
 }
diff --git a/wrappers/golang/core/vec_ops_test.go b/wrappers/golang/core/vec_ops_test.go
index 778e6e7a5..989a0a810 100644
--- a/wrappers/golang/core/vec_ops_test.go
+++ b/wrappers/golang/core/vec_ops_test.go
@@ -1,23 +1,23 @@
 package core
 
-import (
-	"testing"
+  import("testing"
 
-	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
-	"github.com/stretchr/testify/assert"
-)
+         cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+         "github.com/stretchr/testify/assert")
 
-func TestVecOpsDefaultConfig(t *testing.T) {
-	ctx, _ := cr.GetDefaultDeviceContext()
-	expected := VecOpsConfig{
-		ctx,   // Ctx
-		false, // isAOnDevice
-		false, // isBOnDevice
-		false, // isResultOnDevice
-		false, // IsAsync
-	}
+    func TestVecOpsDefaultConfig(t* testing.T)
+{
+  ctx, _ : = cr.GetDefaultDeviceContext() expected : =
+                                                       VecOpsConfig{
+                                                         ctx,   // Ctx
+                                                         false, // isAOnDevice
+                                                         false, // isBOnDevice
+                                                         false, // isResultOnDevice
+                                                         false, // IsAsync
+                                                         false, // IsInMontgomeryForm
+                                                       }
 
-	actual := DefaultVecOpsConfig()
+                                                     actual : = DefaultVecOpsConfig()
 
-	assert.Equal(t, expected, actual)
+                                                                  assert.Equal(t, expected, actual)
 }
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index 53e06d11d..efe777160 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -1,7 +1,7 @@
 use icicle_cuda_runtime::device::check_device;
 use icicle_cuda_runtime::{
-    device_context::{DeviceContext, DEFAULT_DEVICE_ID},
-    memory::HostOrDeviceSlice,
+  device_context::{DeviceContext, DEFAULT_DEVICE_ID},
+  memory::HostOrDeviceSlice,
 };
 
 use crate::{error::IcicleResult, traits::FieldImpl};
@@ -20,24 +20,29 @@ pub struct VecOpsConfig<'a> {
     /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
     /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
     pub is_async: bool,
+    /// If true then vec_a, vec_b and result are in montgomery form. Default value: false.
+    pub is_in_montgomery_form: bool,
 }
 
 impl<'a> Default for VecOpsConfig<'a> {
-    fn default() -> Self {
-        Self::default_for_device(DEFAULT_DEVICE_ID)
-    }
+  fn default()->Self { Self::default_for_device(DEFAULT_DEVICE_ID) }
 }
 
-impl<'a> VecOpsConfig<'a> {
-    pub fn default_for_device(device_id: usize) -> Self {
-        VecOpsConfig {
-            ctx: DeviceContext::default_for_device(device_id),
-            is_a_on_device: false,
-            is_b_on_device: false,
-            is_result_on_device: false,
-            is_async: false,
-        }
+impl<'a> VecOpsConfig<' a>
+{
+  pub fn default_for_device(device_id : usize) -> Self
+  {
+    VecOpsConfig
+    {
+    ctx:
+      DeviceContext::default_for_device(device_id),
+        is_a_on_device : false,
+                         is_b_on_device : false,
+                                          is_result_on_device : false,
+                                                                is_async : false,
+                                                                           is_in_montgomery_form : false,
     }
+  }
 }
 
 #[repr(C)]
@@ -55,27 +60,30 @@ pub struct BitReverseConfig<'a> {
     /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
     /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
     pub is_async: bool,
+    /// If true then vec_a, vec_b and result are in montgomery form. Default value: false.
+    pub is_in_montgomery_form: bool,
 }
 
 impl<'a> Default for BitReverseConfig<'a> {
-    fn default() -> Self {
-        Self::default_for_device(DEFAULT_DEVICE_ID)
-    }
+  fn default()->Self { Self::default_for_device(DEFAULT_DEVICE_ID) }
 }
 
-impl<'a> BitReverseConfig<'a> {
-    pub fn default_for_device(device_id: usize) -> Self {
-        BitReverseConfig {
-            ctx: DeviceContext::default_for_device(device_id),
-            is_input_on_device: false,
-            is_output_on_device: false,
-            is_async: false,
-        }
+impl<'a> BitReverseConfig<' a>
+{
+  pub fn default_for_device(device_id : usize) -> Self
+  {
+    BitReverseConfig
+    {
+    ctx:
+      DeviceContext::default_for_device(device_id),
+        is_input_on_device : false, is_output_on_device : false, is_async : false, is_in_montgomery_form : false,
     }
+  }
 }
 
 #[doc(hidden)]
-pub trait VecOps<F> {
+pub trait VecOps<F>
+{
     fn add(
         a: &(impl HostOrDeviceSlice<F> + ?Sized),
         b: &(impl HostOrDeviceSlice<F> + ?Sized),
@@ -111,6 +119,7 @@ pub trait VecOps<F> {
         ctx: &DeviceContext,
         on_device: bool,
         is_async: bool,
+        is_in_montgomery_form: bool,
     ) -> IcicleResult<()>;
 
     fn bit_reverse(
@@ -131,72 +140,61 @@ fn check_vec_ops_args<'a, F>(
     result: &(impl HostOrDeviceSlice<F> + ?Sized),
     cfg: &VecOpsConfig<'a>,
 ) -> VecOpsConfig<'a> {
-    if a.len() != b.len() || a.len() != result.len() {
-        panic!(
-            "left, right and output lengths {}; {}; {} do not match",
-            a.len(),
-            b.len(),
-            result.len()
-        );
-    }
-    let ctx_device_id = cfg
-        .ctx
-        .device_id;
-    if let Some(device_id) = a.device_id() {
-        assert_eq!(device_id, ctx_device_id, "Device ids in a and context are different");
-    }
-    if let Some(device_id) = b.device_id() {
-        assert_eq!(device_id, ctx_device_id, "Device ids in b and context are different");
-    }
-    if let Some(device_id) = result.device_id() {
-        assert_eq!(
-            device_id, ctx_device_id,
-            "Device ids in result and context are different"
-        );
-    }
-    check_device(ctx_device_id);
-
-    let mut res_cfg = cfg.clone();
-    res_cfg.is_a_on_device = a.is_on_device();
-    res_cfg.is_b_on_device = b.is_on_device();
-    res_cfg.is_result_on_device = result.is_on_device();
-    res_cfg
+    if a.len() != b.len() || a.len() != result.len()
+{
+  panic !("left, right and output lengths {}; {}; {} do not match", a.len(), b.len(), result.len());
+}
+let ctx_device_id = cfg.ctx.device_id;
+if let
+  Some(device_id) = a.device_id()
+  {
+    assert_eq !(device_id, ctx_device_id, "Device ids in a and context are different");
+  }
+if let
+  Some(device_id) = b.device_id()
+  {
+    assert_eq !(device_id, ctx_device_id, "Device ids in b and context are different");
+  }
+if let
+  Some(device_id) = result.device_id()
+  {
+    assert_eq !(device_id, ctx_device_id, "Device ids in result and context are different");
+  }
+check_device(ctx_device_id);
+
+let mut res_cfg = cfg.clone();
+res_cfg.is_a_on_device = a.is_on_device();
+res_cfg.is_b_on_device = b.is_on_device();
+res_cfg.is_result_on_device = result.is_on_device();
+res_cfg
 }
 fn check_bit_reverse_args<'a, F>(
     input: &(impl HostOrDeviceSlice<F> + ?Sized),
     cfg: &BitReverseConfig<'a>,
     output: &(impl HostOrDeviceSlice<F> + ?Sized),
 ) -> BitReverseConfig<'a> {
-    if input.len() & (input.len() - 1) != 0 {
-        panic!("input length must be a power of 2, input length: {}", input.len());
-    }
-    if input.len() != output.len() {
-        panic!(
-            "input and output lengths {}; {} do not match",
-            input.len(),
-            output.len()
-        );
-    }
-    let ctx_device_id = cfg
-        .ctx
-        .device_id;
-    if let Some(device_id) = input.device_id() {
-        assert_eq!(
-            device_id, ctx_device_id,
-            "Device ids in input and context are different"
-        );
-    }
-    if let Some(device_id) = output.device_id() {
-        assert_eq!(
-            device_id, ctx_device_id,
-            "Device ids in output and context are different"
-        );
-    }
-    check_device(ctx_device_id);
-    let mut res_cfg = cfg.clone();
-    res_cfg.is_input_on_device = input.is_on_device();
-    res_cfg.is_output_on_device = output.is_on_device();
-    res_cfg
+    if input.len() & (input.len() - 1) != 0
+{
+  panic !("input length must be a power of 2, input length: {}", input.len());
+}
+if input
+  .len() != output.len() { panic !("input and output lengths {}; {} do not match", input.len(), output.len()); }
+let ctx_device_id = cfg.ctx.device_id;
+if let
+  Some(device_id) = input.device_id()
+  {
+    assert_eq !(device_id, ctx_device_id, "Device ids in input and context are different");
+  }
+if let
+  Some(device_id) = output.device_id()
+  {
+    assert_eq !(device_id, ctx_device_id, "Device ids in output and context are different");
+  }
+check_device(ctx_device_id);
+let mut res_cfg = cfg.clone();
+res_cfg.is_input_on_device = input.is_on_device();
+res_cfg.is_output_on_device = output.is_on_device();
+res_cfg
 }
 
 pub fn add_scalars<F>(
@@ -209,8 +207,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    let cfg = check_vec_ops_args(a, b, result, cfg);
-    <<F as FieldImpl>::Config as VecOps<F>>::add(a, b, result, &cfg)
+  let cfg = check_vec_ops_args(a, b, result, cfg);
+  << F as FieldImpl > ::Config as VecOps < F >> ::add(a, b, result, &cfg)
 }
 
 pub fn accumulate_scalars<F>(
@@ -222,8 +220,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    let cfg = check_vec_ops_args(a, b, a, cfg);
-    <<F as FieldImpl>::Config as VecOps<F>>::accumulate(a, b, &cfg)
+  let cfg = check_vec_ops_args(a, b, a, cfg);
+  << F as FieldImpl > ::Config as VecOps < F >> ::accumulate(a, b, &cfg)
 }
 
 pub fn sub_scalars<F>(
@@ -236,8 +234,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    let cfg = check_vec_ops_args(a, b, result, cfg);
-    <<F as FieldImpl>::Config as VecOps<F>>::sub(a, b, result, &cfg)
+  let cfg = check_vec_ops_args(a, b, result, cfg);
+  << F as FieldImpl > ::Config as VecOps < F >> ::sub(a, b, result, &cfg)
 }
 
 pub fn mul_scalars<F>(
@@ -250,8 +248,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    let cfg = check_vec_ops_args(a, b, result, cfg);
-    <<F as FieldImpl>::Config as VecOps<F>>::mul(a, b, result, &cfg)
+  let cfg = check_vec_ops_args(a, b, result, cfg);
+  << F as FieldImpl > ::Config as VecOps < F >> ::mul(a, b, result, &cfg)
 }
 
 pub fn transpose_matrix<F>(
@@ -262,12 +260,13 @@ pub fn transpose_matrix<F>(
     ctx: &DeviceContext,
     on_device: bool,
     is_async: bool,
+    is_in_montgomery_form: bool,
 ) -> IcicleResult<()>
 where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    <<F as FieldImpl>::Config as VecOps<F>>::transpose(input, row_size, column_size, output, ctx, on_device, is_async)
+    <<F as FieldImpl>::Config as VecOps<F>>::transpose(input, row_size, column_size, output, ctx, on_device, is_async, is_in_montgomery_form)
 }
 
 pub fn bit_reverse<F>(
@@ -279,8 +278,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    let cfg = check_bit_reverse_args(input, cfg, output);
-    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse(input, &cfg, output)
+  let cfg = check_bit_reverse_args(input, cfg, output);
+  << F as FieldImpl > ::Config as VecOps < F >> ::bit_reverse(input, &cfg, output)
 }
 
 pub fn bit_reverse_inplace<F>(
@@ -291,113 +290,108 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    let cfg = check_bit_reverse_args(input, cfg, input);
-    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse_inplace(input, &cfg)
+  let cfg = check_bit_reverse_args(input, cfg, input);
+  << F as FieldImpl > ::Config as VecOps < F >> ::bit_reverse_inplace(input, &cfg)
 }
 
 #[macro_export]
-macro_rules! impl_vec_ops_field {
-    (
-        $field_prefix:literal,
-        $field_prefix_ident:ident,
-        $field:ident,
-        $field_config:ident
-    ) => {
-        mod $field_prefix_ident {
-            use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice};
-            use icicle_core::vec_ops::BitReverseConfig;
-            use icicle_core::vec_ops::VecOpsConfig;
-
-            extern "C" {
-                #[link_name = concat!($field_prefix, "_add_cuda")]
-                pub(crate) fn add_scalars_cuda(
-                    a: *const $field,
-                    b: *const $field,
-                    size: u32,
-                    cfg: *const VecOpsConfig,
-                    result: *mut $field,
-                ) -> CudaError;
-
-                #[link_name = concat!($field_prefix, "_accumulate_cuda")]
-                pub(crate) fn accumulate_scalars_cuda(
-                    a: *mut $field,
-                    b: *const $field,
-                    size: u32,
-                    cfg: *const VecOpsConfig,
-                ) -> CudaError;
-
-                #[link_name = concat!($field_prefix, "_sub_cuda")]
-                pub(crate) fn sub_scalars_cuda(
-                    a: *const $field,
-                    b: *const $field,
-                    size: u32,
-                    cfg: *const VecOpsConfig,
-                    result: *mut $field,
-                ) -> CudaError;
-
-                #[link_name = concat!($field_prefix, "_mul_cuda")]
-                pub(crate) fn mul_scalars_cuda(
-                    a: *const $field,
-                    b: *const $field,
-                    size: u32,
-                    cfg: *const VecOpsConfig,
-                    result: *mut $field,
-                ) -> CudaError;
-
-                #[link_name = concat!($field_prefix, "_transpose_matrix_cuda")]
-                pub(crate) fn transpose_cuda(
-                    input: *const $field,
-                    row_size: u32,
-                    column_size: u32,
-                    output: *mut $field,
-                    ctx: *const DeviceContext,
-                    on_device: bool,
-                    is_async: bool,
-                ) -> CudaError;
-
-                #[link_name = concat!($field_prefix, "_bit_reverse_cuda")]
-                pub(crate) fn bit_reverse_cuda(
-                    input: *const $field,
-                    size: u64,
-                    config: *const BitReverseConfig,
-                    output: *mut $field,
-                ) -> CudaError;
-            }
-        }
+macro_rules !impl_vec_ops_field
+{
+  ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) = >
+  {
+    mod $field_prefix_ident
+    {
+      use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice};
+      use icicle_core::vec_ops::BitReverseConfig;
+      use icicle_core::vec_ops::VecOpsConfig;
+
+      extern "C" {
+#[link_name = concat !($field_prefix, "_add_cuda")]
+      pub(crate) fn add_scalars_cuda(a
+                                     : * const $field, b
+                                     : * const $field, size
+                                     : u32, cfg
+                                     : * const VecOpsConfig, result
+                                     : *mut $field, )
+        ->CudaError;
+
+#[link_name = concat !($field_prefix, "_accumulate_cuda")]
+      pub(crate) fn accumulate_scalars_cuda(a
+                                            : *mut $field, b
+                                            : * const $field, size
+                                            : u32, cfg
+                                            : * const VecOpsConfig, )
+        ->CudaError;
+
+#[link_name = concat !($field_prefix, "_sub_cuda")]
+      pub(crate) fn sub_scalars_cuda(a
+                                     : * const $field, b
+                                     : * const $field, size
+                                     : u32, cfg
+                                     : * const VecOpsConfig, result
+                                     : *mut $field, )
+        ->CudaError;
+
+#[link_name = concat !($field_prefix, "_mul_cuda")]
+      pub(crate) fn mul_scalars_cuda(a
+                                     : * const $field, b
+                                     : * const $field, size
+                                     : u32, cfg
+                                     : * const VecOpsConfig, result
+                                     : *mut $field, )
+        ->CudaError;
+
+#[link_name = concat !($field_prefix, "_transpose_matrix_cuda")]
+      pub(crate) fn transpose_cuda(input
+                                   : * const $field, row_size
+                                   : u32, column_size
+                                   : u32, output
+                                   : *mut $field, ctx
+                                   : * const DeviceContext, on_device
+                                   : bool, is_async
+                                   : bool, is_in_montgomery_form
+                                   : bool, )
+        ->CudaError;
+
+#[link_name = concat !($field_prefix, "_bit_reverse_cuda")]
+      pub(crate) fn bit_reverse_cuda(input
+                                     : * const $field, size
+                                     : u64, config
+                                     : * const BitReverseConfig, output
+                                     : *mut $field, )
+        ->CudaError;
+      }
+    }
 
-        impl VecOps<$field> for $field_config {
+        impl VecOps<$field> for $field_config
+        {
             fn add(
                 a: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 result: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &VecOpsConfig,
-            ) -> IcicleResult<()> {
-                unsafe {
-                    $field_prefix_ident::add_scalars_cuda(
-                        a.as_ptr(),
-                        b.as_ptr(),
-                        a.len() as u32,
-                        cfg as *const VecOpsConfig,
-                        result.as_mut_ptr(),
-                    )
-                    .wrap()
-                }
+            ) -> IcicleResult<()>
+            {
+              unsafe
+              {
+                $field_prefix_ident::add_scalars_cuda(
+                  a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), )
+                  .wrap()
+              }
             }
 
             fn accumulate(
                 a: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &VecOpsConfig,
-            ) -> IcicleResult<()> {
-                unsafe {
-                    $field_prefix_ident::accumulate_scalars_cuda(
-                        a.as_mut_ptr(),
-                        b.as_ptr(),
-                        a.len() as u32,
-                        cfg as *const VecOpsConfig,
-                    )
-                    .wrap()
-                }
+            ) -> IcicleResult<()>
+            {
+              unsafe
+              {
+                $field_prefix_ident::accumulate_scalars_cuda(
+                  a.as_mut_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, )
+                  .wrap()
+              }
             }
 
             fn sub(
@@ -405,17 +399,14 @@ macro_rules! impl_vec_ops_field {
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 result: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &VecOpsConfig,
-            ) -> IcicleResult<()> {
-                unsafe {
-                    $field_prefix_ident::sub_scalars_cuda(
-                        a.as_ptr(),
-                        b.as_ptr(),
-                        a.len() as u32,
-                        cfg as *const VecOpsConfig,
-                        result.as_mut_ptr(),
-                    )
-                    .wrap()
-                }
+            ) -> IcicleResult<()>
+            {
+              unsafe
+              {
+                $field_prefix_ident::sub_scalars_cuda(
+                  a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), )
+                  .wrap()
+              }
             }
 
             fn mul(
@@ -423,17 +414,14 @@ macro_rules! impl_vec_ops_field {
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 result: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &VecOpsConfig,
-            ) -> IcicleResult<()> {
-                unsafe {
-                    $field_prefix_ident::mul_scalars_cuda(
-                        a.as_ptr(),
-                        b.as_ptr(),
-                        a.len() as u32,
-                        cfg as *const VecOpsConfig,
-                        result.as_mut_ptr(),
-                    )
-                    .wrap()
-                }
+            ) -> IcicleResult<()>
+            {
+              unsafe
+              {
+                $field_prefix_ident::mul_scalars_cuda(
+                  a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), )
+                  .wrap()
+              }
             }
 
             fn transpose(
@@ -444,72 +432,64 @@ macro_rules! impl_vec_ops_field {
                 ctx: &DeviceContext,
                 on_device: bool,
                 is_async: bool,
-            ) -> IcicleResult<()> {
-                unsafe {
-                    $field_prefix_ident::transpose_cuda(
-                        input.as_ptr(),
-                        row_size,
-                        column_size,
-                        output.as_mut_ptr(),
-                        ctx as *const _ as *const DeviceContext,
-                        on_device,
-                        is_async,
-                    )
-                    .wrap()
-                }
+                is_in_montgomery_form: bool,
+            ) -> IcicleResult<()>
+            {
+              unsafe
+              {
+                $field_prefix_ident::transpose_cuda(
+                  input.as_ptr(), row_size, column_size, output.as_mut_ptr(), ctx as* const _ as* const DeviceContext,
+                  on_device, is_async, is_in_montgomery_form, )
+                  .wrap()
+              }
             }
 
             fn bit_reverse(
                 input: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &BitReverseConfig,
                 output: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
-            ) -> IcicleResult<()> {
-                unsafe {
-                    $field_prefix_ident::bit_reverse_cuda(
-                        input.as_ptr(),
-                        input.len() as u64,
-                        cfg as *const BitReverseConfig,
-                        output.as_mut_ptr(),
-                    )
-                    .wrap()
-                }
+            ) -> IcicleResult<()>
+            {
+              unsafe
+              {
+                $field_prefix_ident::bit_reverse_cuda(
+                  input.as_ptr(), input.len() as u64, cfg as* const BitReverseConfig, output.as_mut_ptr(), )
+                  .wrap()
+              }
             }
 
             fn bit_reverse_inplace(
                 input: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &BitReverseConfig,
-            ) -> IcicleResult<()> {
-                unsafe {
-                    $field_prefix_ident::bit_reverse_cuda(
-                        input.as_ptr(),
-                        input.len() as u64,
-                        cfg as *const BitReverseConfig,
-                        input.as_mut_ptr(),
-                    )
-                    .wrap()
-                }
+            ) -> IcicleResult<()>
+            {
+              unsafe
+              {
+                $field_prefix_ident::bit_reverse_cuda(
+                  input.as_ptr(), input.len() as u64, cfg as* const BitReverseConfig, input.as_mut_ptr(), )
+                  .wrap()
+              }
             }
         }
-    };
+  };
 }
 
 #[macro_export]
-macro_rules! impl_vec_add_tests {
-    (
-      $field:ident
-    ) => {
-        #[test]
-        pub fn test_vec_add_scalars() {
-            check_vec_ops_scalars::<$field>();
-        }
+macro_rules !impl_vec_add_tests
+{
+  ($field
+   : ident) = > {
+#[test]
+                  pub fn test_vec_add_scalars(){check_vec_ops_scalars::<$field>();
+}
 
-        #[test]
-        pub fn test_bit_reverse() {
-            check_bit_reverse::<$field>()
-        }
-        #[test]
-        pub fn test_bit_reverse_inplace() {
-            check_bit_reverse_inplace::<$field>()
-        }
-    };
+#[test]
+pub fn test_bit_reverse(){check_bit_reverse::<$field>()}
+#[test]
+pub fn test_bit_reverse_inplace()
+{
+  check_bit_reverse_inplace::<$field>()
+}
+}
+;
 }

From 993652ae8dfaa28882d748827205cb462b7c87f1 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Wed, 31 Jul 2024 13:47:45 +0000
Subject: [PATCH 05/21] Formatting for rust and golang.

---
 icicle/include/vec_ops/vec_ops.cuh           |   3 +-
 wrappers/golang/core/vec_ops.go              | 141 ++++++------
 wrappers/golang/core/vec_ops_test.go         |  33 +--
 wrappers/rust/icicle-core/src/vec_ops/mod.rs | 212 ++++++++++---------
 4 files changed, 209 insertions(+), 180 deletions(-)

diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh
index be2ab4ba3..6d1ad7987 100644
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -27,8 +27,7 @@ namespace vec_ops {
                     *   non-blocking and you'd need to synchronize it explicitly by running
                     *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
                     *   function will block the current CPU thread. */
-    bool
-      is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false.  */
+    bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false.  */
   };
 
   /**
diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go
index d589056c4..8bef10b63 100644
--- a/wrappers/golang/core/vec_ops.go
+++ b/wrappers/golang/core/vec_ops.go
@@ -1,29 +1,35 @@
 package core
 
-import("fmt"
-       "unsafe"
+import (
+	"fmt"
+	"unsafe"
 
-       cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime")
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+)
 
-  type VecOps int
+type VecOps int
 
-  const(Sub VecOps = iota Add Mul)
+const (
+	Sub VecOps = iota
+	Add
+	Mul
+)
 
-    type VecOpsConfig struct {
-  /*Details related to the device such as its id and stream. */
-  Ctx cr.DeviceContext
-    /* True if `a` is on device and false if it is not. Default value: false. */
-    isAOnDevice bool
-      /* True if `b` is on device and false if it is not. Default value: false. */
-      isBOnDevice bool
-        /* If true, output is preserved on device, otherwise on host. Default value: false. */
-        isResultOnDevice bool
-          /* Whether to run the vector operations asynchronously. If set to `true`, the function will be
-           *  non-blocking and you'll need to synchronize it explicitly by calling
-           *  `SynchronizeStream`. If set to false, the function will block the current CPU thread. */
-          IsAsync bool
-            /* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */
-            IsInMontgomeryForm bool
+type VecOpsConfig struct {
+	/*Details related to the device such as its id and stream. */
+	Ctx cr.DeviceContext
+	/* True if `a` is on device and false if it is not. Default value: false. */
+	isAOnDevice bool
+	/* True if `b` is on device and false if it is not. Default value: false. */
+	isBOnDevice bool
+	/* If true, output is preserved on device, otherwise on host. Default value: false. */
+	isResultOnDevice bool
+	/* Whether to run the vector operations asynchronously. If set to `true`, the function will be
+	*  non-blocking and you'll need to synchronize it explicitly by calling
+	*  `SynchronizeStream`. If set to false, the function will block the current CPU thread. */
+	IsAsync bool
+	/* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */
+	IsInMontgomeryForm bool
 }
 
 /**
@@ -44,51 +50,64 @@ func DefaultVecOpsConfig() VecOpsConfig {
 	return config
 }
 
-func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, int)
-{
-  aLen, bLen, outLen : = a.Len(), b.Len(), out.Len() if aLen != bLen
-  {
-  errorString:
-    = fmt.Sprintf("a and b vector lengths %d; %d are not equal", aLen, bLen, ) panic(errorString)
-  }
-  if aLen
-    != outLen
-    {
-    errorString:
-      = fmt.Sprintf("a and out vector lengths %d; %d are not equal", aLen, outLen, ) panic(errorString)
-    }
+func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, int) {
+	aLen, bLen, outLen := a.Len(), b.Len(), out.Len()
+	if aLen != bLen {
+		errorString := fmt.Sprintf(
+			"a and b vector lengths %d; %d are not equal",
+			aLen,
+			bLen,
+		)
+		panic(errorString)
+	}
+	if aLen != outLen {
+		errorString := fmt.Sprintf(
+			"a and out vector lengths %d; %d are not equal",
+			aLen,
+			outLen,
+		)
+		panic(errorString)
+	}
 
-  if a
-    .IsOnDevice() { a.(DeviceSlice).CheckDevice() }
-  if b
-    .IsOnDevice() { b.(DeviceSlice).CheckDevice() }
-  if out
-    .IsOnDevice(){out.(DeviceSlice).CheckDevice()}
+	if a.IsOnDevice() {
+		a.(DeviceSlice).CheckDevice()
+	}
+	if b.IsOnDevice() {
+		b.(DeviceSlice).CheckDevice()
+	}
+	if out.IsOnDevice() {
+		out.(DeviceSlice).CheckDevice()
+	}
 
-    cfg.isAOnDevice = a.IsOnDevice() cfg.isBOnDevice = b.IsOnDevice() cfg.isResultOnDevice =
-      out
-        .IsOnDevice()
+	cfg.isAOnDevice = a.IsOnDevice()
+	cfg.isBOnDevice = b.IsOnDevice()
+	cfg.isResultOnDevice = out.IsOnDevice()
 
-          return a.AsUnsafePointer(),
-    b.AsUnsafePointer(), out.AsUnsafePointer(), unsafe.Pointer(cfg), a.Len()
+	return a.AsUnsafePointer(), b.AsUnsafePointer(), out.AsUnsafePointer(), unsafe.Pointer(cfg), a.Len()
 }
 
-func TransposeCheck(in, out HostOrDeviceSlice, onDevice bool)
-{
-  inLen, outLen : = in.Len(),
-                out.Len()
+func TransposeCheck(in, out HostOrDeviceSlice, onDevice bool) {
+	inLen, outLen := in.Len(), out.Len()
 
-                    if inLen != outLen
-  {
-  errorString:
-    = fmt.Sprintf("in and out vector lengths %d; %d are not equal", inLen, outLen, ) panic(errorString)
-  }
-  if (onDevice != in.IsOnDevice()) || (onDevice != out.IsOnDevice())
-    {
-    errorString:
-      = fmt.Sprintf(
-        "onDevice is set to %t, but in.IsOnDevice():%t and out.IsOnDevice():%t", onDevice, in.IsOnDevice(),
-        out.IsOnDevice(), ) panic(errorString)
-    }
-  if onDevice { in.(DeviceSlice).CheckDevice() out.(DeviceSlice).CheckDevice() }
+	if inLen != outLen {
+		errorString := fmt.Sprintf(
+			"in and out vector lengths %d; %d are not equal",
+			inLen,
+			outLen,
+		)
+		panic(errorString)
+	}
+	if (onDevice != in.IsOnDevice()) || (onDevice != out.IsOnDevice()) {
+		errorString := fmt.Sprintf(
+			"onDevice is set to %t, but in.IsOnDevice():%t and out.IsOnDevice():%t",
+			onDevice,
+			in.IsOnDevice(),
+			out.IsOnDevice(),
+		)
+		panic(errorString)
+	}
+	if onDevice {
+		in.(DeviceSlice).CheckDevice()
+		out.(DeviceSlice).CheckDevice()
+	}
 }
diff --git a/wrappers/golang/core/vec_ops_test.go b/wrappers/golang/core/vec_ops_test.go
index 989a0a810..9ac5bb480 100644
--- a/wrappers/golang/core/vec_ops_test.go
+++ b/wrappers/golang/core/vec_ops_test.go
@@ -1,23 +1,24 @@
 package core
 
-  import("testing"
+import (
+	"testing"
 
-         cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
-         "github.com/stretchr/testify/assert")
+	cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime"
+	"github.com/stretchr/testify/assert"
+)
 
-    func TestVecOpsDefaultConfig(t* testing.T)
-{
-  ctx, _ : = cr.GetDefaultDeviceContext() expected : =
-                                                       VecOpsConfig{
-                                                         ctx,   // Ctx
-                                                         false, // isAOnDevice
-                                                         false, // isBOnDevice
-                                                         false, // isResultOnDevice
-                                                         false, // IsAsync
-                                                         false, // IsInMontgomeryForm
-                                                       }
+func TestVecOpsDefaultConfig(t *testing.T) {
+	ctx, _ := cr.GetDefaultDeviceContext()
+	expected := VecOpsConfig{
+		ctx,   // Ctx
+		false, // isAOnDevice
+		false, // isBOnDevice
+		false, // isResultOnDevice
+		false, // IsAsync
+		false, // IsInMontgomeryForm
+	}
 
-                                                     actual : = DefaultVecOpsConfig()
+	actual := DefaultVecOpsConfig()
 
-                                                                  assert.Equal(t, expected, actual)
+	assert.Equal(t, expected, actual)
 }
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index efe777160..d16d5c4b9 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -1,7 +1,7 @@
 use icicle_cuda_runtime::device::check_device;
 use icicle_cuda_runtime::{
-  device_context::{DeviceContext, DEFAULT_DEVICE_ID},
-  memory::HostOrDeviceSlice,
+    device_context::{DeviceContext, DEFAULT_DEVICE_ID},
+    memory::HostOrDeviceSlice,
 };
 
 use crate::{error::IcicleResult, traits::FieldImpl};
@@ -25,24 +25,22 @@ pub struct VecOpsConfig<'a> {
 }
 
 impl<'a> Default for VecOpsConfig<'a> {
-  fn default()->Self { Self::default_for_device(DEFAULT_DEVICE_ID) }
+    fn default() -> Self {
+        Self::default_for_device(DEFAULT_DEVICE_ID)
+    }
 }
 
-impl<'a> VecOpsConfig<' a>
-{
-  pub fn default_for_device(device_id : usize) -> Self
-  {
-    VecOpsConfig
-    {
-    ctx:
-      DeviceContext::default_for_device(device_id),
-        is_a_on_device : false,
-                         is_b_on_device : false,
-                                          is_result_on_device : false,
-                                                                is_async : false,
-                                                                           is_in_montgomery_form : false,
+impl<'a> VecOpsConfig<'a> {
+    pub fn default_for_device(device_id: usize) -> Self {
+        VecOpsConfig {
+            ctx: DeviceContext::default_for_device(device_id),
+            is_a_on_device: false,
+            is_b_on_device: false,
+            is_result_on_device: false,
+            is_async: false,
+            is_in_montgomery_form: false,
+        }
     }
-  }
 }
 
 #[repr(C)]
@@ -60,30 +58,27 @@ pub struct BitReverseConfig<'a> {
     /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize
     /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
     pub is_async: bool,
-    /// If true then vec_a, vec_b and result are in montgomery form. Default value: false.
-    pub is_in_montgomery_form: bool,
 }
 
 impl<'a> Default for BitReverseConfig<'a> {
-  fn default()->Self { Self::default_for_device(DEFAULT_DEVICE_ID) }
+    fn default() -> Self {
+        Self::default_for_device(DEFAULT_DEVICE_ID)
+    }
 }
 
-impl<'a> BitReverseConfig<' a>
-{
-  pub fn default_for_device(device_id : usize) -> Self
-  {
-    BitReverseConfig
-    {
-    ctx:
-      DeviceContext::default_for_device(device_id),
-        is_input_on_device : false, is_output_on_device : false, is_async : false, is_in_montgomery_form : false,
+impl<'a> BitReverseConfig<'a> {
+    pub fn default_for_device(device_id: usize) -> Self {
+        BitReverseConfig {
+            ctx: DeviceContext::default_for_device(device_id),
+            is_input_on_device: false,
+            is_output_on_device: false,
+            is_async: false,
+        }
     }
-  }
 }
 
 #[doc(hidden)]
-pub trait VecOps<F>
-{
+pub trait VecOps<F> {
     fn add(
         a: &(impl HostOrDeviceSlice<F> + ?Sized),
         b: &(impl HostOrDeviceSlice<F> + ?Sized),
@@ -119,7 +114,6 @@ pub trait VecOps<F>
         ctx: &DeviceContext,
         on_device: bool,
         is_async: bool,
-        is_in_montgomery_form: bool,
     ) -> IcicleResult<()>;
 
     fn bit_reverse(
@@ -140,61 +134,72 @@ fn check_vec_ops_args<'a, F>(
     result: &(impl HostOrDeviceSlice<F> + ?Sized),
     cfg: &VecOpsConfig<'a>,
 ) -> VecOpsConfig<'a> {
-    if a.len() != b.len() || a.len() != result.len()
-{
-  panic !("left, right and output lengths {}; {}; {} do not match", a.len(), b.len(), result.len());
-}
-let ctx_device_id = cfg.ctx.device_id;
-if let
-  Some(device_id) = a.device_id()
-  {
-    assert_eq !(device_id, ctx_device_id, "Device ids in a and context are different");
-  }
-if let
-  Some(device_id) = b.device_id()
-  {
-    assert_eq !(device_id, ctx_device_id, "Device ids in b and context are different");
-  }
-if let
-  Some(device_id) = result.device_id()
-  {
-    assert_eq !(device_id, ctx_device_id, "Device ids in result and context are different");
-  }
-check_device(ctx_device_id);
-
-let mut res_cfg = cfg.clone();
-res_cfg.is_a_on_device = a.is_on_device();
-res_cfg.is_b_on_device = b.is_on_device();
-res_cfg.is_result_on_device = result.is_on_device();
-res_cfg
+    if a.len() != b.len() || a.len() != result.len() {
+        panic!(
+            "left, right and output lengths {}; {}; {} do not match",
+            a.len(),
+            b.len(),
+            result.len()
+        );
+    }
+    let ctx_device_id = cfg
+        .ctx
+        .device_id;
+    if let Some(device_id) = a.device_id() {
+        assert_eq!(device_id, ctx_device_id, "Device ids in a and context are different");
+    }
+    if let Some(device_id) = b.device_id() {
+        assert_eq!(device_id, ctx_device_id, "Device ids in b and context are different");
+    }
+    if let Some(device_id) = result.device_id() {
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in result and context are different"
+        );
+    }
+    check_device(ctx_device_id);
+
+    let mut res_cfg = cfg.clone();
+    res_cfg.is_a_on_device = a.is_on_device();
+    res_cfg.is_b_on_device = b.is_on_device();
+    res_cfg.is_result_on_device = result.is_on_device();
+    res_cfg
 }
 fn check_bit_reverse_args<'a, F>(
     input: &(impl HostOrDeviceSlice<F> + ?Sized),
     cfg: &BitReverseConfig<'a>,
     output: &(impl HostOrDeviceSlice<F> + ?Sized),
 ) -> BitReverseConfig<'a> {
-    if input.len() & (input.len() - 1) != 0
-{
-  panic !("input length must be a power of 2, input length: {}", input.len());
-}
-if input
-  .len() != output.len() { panic !("input and output lengths {}; {} do not match", input.len(), output.len()); }
-let ctx_device_id = cfg.ctx.device_id;
-if let
-  Some(device_id) = input.device_id()
-  {
-    assert_eq !(device_id, ctx_device_id, "Device ids in input and context are different");
-  }
-if let
-  Some(device_id) = output.device_id()
-  {
-    assert_eq !(device_id, ctx_device_id, "Device ids in output and context are different");
-  }
-check_device(ctx_device_id);
-let mut res_cfg = cfg.clone();
-res_cfg.is_input_on_device = input.is_on_device();
-res_cfg.is_output_on_device = output.is_on_device();
-res_cfg
+    if input.len() & (input.len() - 1) != 0 {
+        panic!("input length must be a power of 2, input length: {}", input.len());
+    }
+    if input.len() != output.len() {
+        panic!(
+            "input and output lengths {}; {} do not match",
+            input.len(),
+            output.len()
+        );
+    }
+    let ctx_device_id = cfg
+        .ctx
+        .device_id;
+    if let Some(device_id) = input.device_id() {
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in input and context are different"
+        );
+    }
+    if let Some(device_id) = output.device_id() {
+        assert_eq!(
+            device_id, ctx_device_id,
+            "Device ids in output and context are different"
+        );
+    }
+    check_device(ctx_device_id);
+    let mut res_cfg = cfg.clone();
+    res_cfg.is_input_on_device = input.is_on_device();
+    res_cfg.is_output_on_device = output.is_on_device();
+    res_cfg
 }
 
 pub fn add_scalars<F>(
@@ -207,8 +212,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-  let cfg = check_vec_ops_args(a, b, result, cfg);
-  << F as FieldImpl > ::Config as VecOps < F >> ::add(a, b, result, &cfg)
+    let cfg = check_vec_ops_args(a, b, result, cfg);
+    <<F as FieldImpl>::Config as VecOps<F>>::add(a, b, result, &cfg)
 }
 
 pub fn accumulate_scalars<F>(
@@ -220,8 +225,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-  let cfg = check_vec_ops_args(a, b, a, cfg);
-  << F as FieldImpl > ::Config as VecOps < F >> ::accumulate(a, b, &cfg)
+    let cfg = check_vec_ops_args(a, b, a, cfg);
+    <<F as FieldImpl>::Config as VecOps<F>>::accumulate(a, b, &cfg)
 }
 
 pub fn sub_scalars<F>(
@@ -234,8 +239,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-  let cfg = check_vec_ops_args(a, b, result, cfg);
-  << F as FieldImpl > ::Config as VecOps < F >> ::sub(a, b, result, &cfg)
+    let cfg = check_vec_ops_args(a, b, result, cfg);
+    <<F as FieldImpl>::Config as VecOps<F>>::sub(a, b, result, &cfg)
 }
 
 pub fn mul_scalars<F>(
@@ -248,8 +253,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-  let cfg = check_vec_ops_args(a, b, result, cfg);
-  << F as FieldImpl > ::Config as VecOps < F >> ::mul(a, b, result, &cfg)
+    let cfg = check_vec_ops_args(a, b, result, cfg);
+    <<F as FieldImpl>::Config as VecOps<F>>::mul(a, b, result, &cfg)
 }
 
 pub fn transpose_matrix<F>(
@@ -260,13 +265,20 @@ pub fn transpose_matrix<F>(
     ctx: &DeviceContext,
     on_device: bool,
     is_async: bool,
-    is_in_montgomery_form: bool,
 ) -> IcicleResult<()>
 where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    <<F as FieldImpl>::Config as VecOps<F>>::transpose(input, row_size, column_size, output, ctx, on_device, is_async, is_in_montgomery_form)
+    <<F as FieldImpl>::Config as VecOps<F>>::transpose(
+        input,
+        row_size,
+        column_size,
+        output,
+        ctx,
+        on_device,
+        is_async,
+    )
 }
 
 pub fn bit_reverse<F>(
@@ -278,8 +290,8 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-  let cfg = check_bit_reverse_args(input, cfg, output);
-  << F as FieldImpl > ::Config as VecOps < F >> ::bit_reverse(input, &cfg, output)
+    let cfg = check_bit_reverse_args(input, cfg, output);
+    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse(input, &cfg, output)
 }
 
 pub fn bit_reverse_inplace<F>(
@@ -290,14 +302,14 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-  let cfg = check_bit_reverse_args(input, cfg, input);
-  << F as FieldImpl > ::Config as VecOps < F >> ::bit_reverse_inplace(input, &cfg)
+    let cfg = check_bit_reverse_args(input, cfg, input);
+    <<F as FieldImpl>::Config as VecOps<F>>::bit_reverse_inplace(input, &cfg)
 }
 
 #[macro_export]
 macro_rules !impl_vec_ops_field
 {
-  ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) = >
+  ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) =>
   {
     mod $field_prefix_ident
     {
@@ -349,7 +361,6 @@ macro_rules !impl_vec_ops_field
                                    : *mut $field, ctx
                                    : * const DeviceContext, on_device
                                    : bool, is_async
-                                   : bool, is_in_montgomery_form
                                    : bool, )
         ->CudaError;
 
@@ -432,14 +443,13 @@ macro_rules !impl_vec_ops_field
                 ctx: &DeviceContext,
                 on_device: bool,
                 is_async: bool,
-                is_in_montgomery_form: bool,
             ) -> IcicleResult<()>
             {
               unsafe
               {
                 $field_prefix_ident::transpose_cuda(
                   input.as_ptr(), row_size, column_size, output.as_mut_ptr(), ctx as* const _ as* const DeviceContext,
-                  on_device, is_async, is_in_montgomery_form, )
+                  on_device, is_async)
                   .wrap()
               }
             }
@@ -478,7 +488,7 @@ macro_rules !impl_vec_ops_field
 macro_rules !impl_vec_add_tests
 {
   ($field
-   : ident) = > {
+   : ident) => {
 #[test]
                   pub fn test_vec_add_scalars(){check_vec_ops_scalars::<$field>();
 }

From 117ee4c4a4707d5f577a26e2e67366a5d3e40c29 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Wed, 31 Jul 2024 14:24:25 +0000
Subject: [PATCH 06/21] Fix formatting.

---
 icicle/include/vec_ops/vec_ops.cuh           |   3 +-
 wrappers/rust/icicle-core/src/vec_ops/mod.rs | 307 ++++++++++---------
 2 files changed, 159 insertions(+), 151 deletions(-)

diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh
index 6d1ad7987..2a239689a 100644
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -27,7 +27,8 @@ namespace vec_ops {
                     *   non-blocking and you'd need to synchronize it explicitly by running
                     *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
                     *   function will block the current CPU thread. */
-    bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false.  */
+    bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form.
+                                 * Default value: false.  */
   };
 
   /**
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index d16d5c4b9..20152beb4 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -270,15 +270,7 @@ where
     F: FieldImpl,
     <F as FieldImpl>::Config: VecOps<F>,
 {
-    <<F as FieldImpl>::Config as VecOps<F>>::transpose(
-        input,
-        row_size,
-        column_size,
-        output,
-        ctx,
-        on_device,
-        is_async,
-    )
+    <<F as FieldImpl>::Config as VecOps<F>>::transpose(input, row_size, column_size, output, ctx, on_device, is_async)
 }
 
 pub fn bit_reverse<F>(
@@ -307,102 +299,103 @@ where
 }
 
 #[macro_export]
-macro_rules !impl_vec_ops_field
-{
-  ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) =>
-  {
-    mod $field_prefix_ident
-    {
-      use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice};
-      use icicle_core::vec_ops::BitReverseConfig;
-      use icicle_core::vec_ops::VecOpsConfig;
-
-      extern "C" {
-#[link_name = concat !($field_prefix, "_add_cuda")]
-      pub(crate) fn add_scalars_cuda(a
-                                     : * const $field, b
-                                     : * const $field, size
-                                     : u32, cfg
-                                     : * const VecOpsConfig, result
-                                     : *mut $field, )
-        ->CudaError;
-
-#[link_name = concat !($field_prefix, "_accumulate_cuda")]
-      pub(crate) fn accumulate_scalars_cuda(a
-                                            : *mut $field, b
-                                            : * const $field, size
-                                            : u32, cfg
-                                            : * const VecOpsConfig, )
-        ->CudaError;
-
-#[link_name = concat !($field_prefix, "_sub_cuda")]
-      pub(crate) fn sub_scalars_cuda(a
-                                     : * const $field, b
-                                     : * const $field, size
-                                     : u32, cfg
-                                     : * const VecOpsConfig, result
-                                     : *mut $field, )
-        ->CudaError;
-
-#[link_name = concat !($field_prefix, "_mul_cuda")]
-      pub(crate) fn mul_scalars_cuda(a
-                                     : * const $field, b
-                                     : * const $field, size
-                                     : u32, cfg
-                                     : * const VecOpsConfig, result
-                                     : *mut $field, )
-        ->CudaError;
-
-#[link_name = concat !($field_prefix, "_transpose_matrix_cuda")]
-      pub(crate) fn transpose_cuda(input
-                                   : * const $field, row_size
-                                   : u32, column_size
-                                   : u32, output
-                                   : *mut $field, ctx
-                                   : * const DeviceContext, on_device
-                                   : bool, is_async
-                                   : bool, )
-        ->CudaError;
-
-#[link_name = concat !($field_prefix, "_bit_reverse_cuda")]
-      pub(crate) fn bit_reverse_cuda(input
-                                     : * const $field, size
-                                     : u64, config
-                                     : * const BitReverseConfig, output
-                                     : *mut $field, )
-        ->CudaError;
-      }
-    }
+macro_rules! impl_vec_ops_field {
+    ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) => {
+        mod $field_prefix_ident {
+            use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice};
+            use icicle_core::vec_ops::BitReverseConfig;
+            use icicle_core::vec_ops::VecOpsConfig;
+
+            extern "C" {
+                #[link_name = concat !($field_prefix, "_add_cuda")]
+                pub(crate) fn add_scalars_cuda(
+                    a: *const $field,
+                    b: *const $field,
+                    size: u32,
+                    cfg: *const VecOpsConfig,
+                    result: *mut $field,
+                ) -> CudaError;
+
+                #[link_name = concat !($field_prefix, "_accumulate_cuda")]
+                pub(crate) fn accumulate_scalars_cuda(
+                    a: *mut $field,
+                    b: *const $field,
+                    size: u32,
+                    cfg: *const VecOpsConfig,
+                ) -> CudaError;
+
+                #[link_name = concat !($field_prefix, "_sub_cuda")]
+                pub(crate) fn sub_scalars_cuda(
+                    a: *const $field,
+                    b: *const $field,
+                    size: u32,
+                    cfg: *const VecOpsConfig,
+                    result: *mut $field,
+                ) -> CudaError;
+
+                #[link_name = concat !($field_prefix, "_mul_cuda")]
+                pub(crate) fn mul_scalars_cuda(
+                    a: *const $field,
+                    b: *const $field,
+                    size: u32,
+                    cfg: *const VecOpsConfig,
+                    result: *mut $field,
+                ) -> CudaError;
+
+                #[link_name = concat !($field_prefix, "_transpose_matrix_cuda")]
+                pub(crate) fn transpose_cuda(
+                    input: *const $field,
+                    row_size: u32,
+                    column_size: u32,
+                    output: *mut $field,
+                    ctx: *const DeviceContext,
+                    on_device: bool,
+                    is_async: bool,
+                ) -> CudaError;
+
+                #[link_name = concat !($field_prefix, "_bit_reverse_cuda")]
+                pub(crate) fn bit_reverse_cuda(
+                    input: *const $field,
+                    size: u64,
+                    config: *const BitReverseConfig,
+                    output: *mut $field,
+                ) -> CudaError;
+            }
+        }
 
-        impl VecOps<$field> for $field_config
-        {
+        impl VecOps<$field> for $field_config {
             fn add(
                 a: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 result: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &VecOpsConfig,
-            ) -> IcicleResult<()>
-            {
-              unsafe
-              {
-                $field_prefix_ident::add_scalars_cuda(
-                  a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), )
-                  .wrap()
-              }
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::add_scalars_cuda(
+                        a.as_ptr(),
+                        b.as_ptr(),
+                        a.len() as u32,
+                        cfg as *const VecOpsConfig,
+                        result.as_mut_ptr(),
+                    )
+                    .wrap()
+                }
             }
 
             fn accumulate(
                 a: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &VecOpsConfig,
-            ) -> IcicleResult<()>
-            {
-              unsafe
-              {
-                $field_prefix_ident::accumulate_scalars_cuda(
-                  a.as_mut_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, )
-                  .wrap()
-              }
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::accumulate_scalars_cuda(
+                        a.as_mut_ptr(),
+                        b.as_ptr(),
+                        a.len() as u32,
+                        cfg as *const VecOpsConfig,
+                    )
+                    .wrap()
+                }
             }
 
             fn sub(
@@ -410,14 +403,17 @@ macro_rules !impl_vec_ops_field
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 result: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &VecOpsConfig,
-            ) -> IcicleResult<()>
-            {
-              unsafe
-              {
-                $field_prefix_ident::sub_scalars_cuda(
-                  a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), )
-                  .wrap()
-              }
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::sub_scalars_cuda(
+                        a.as_ptr(),
+                        b.as_ptr(),
+                        a.len() as u32,
+                        cfg as *const VecOpsConfig,
+                        result.as_mut_ptr(),
+                    )
+                    .wrap()
+                }
             }
 
             fn mul(
@@ -425,14 +421,17 @@ macro_rules !impl_vec_ops_field
                 b: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 result: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &VecOpsConfig,
-            ) -> IcicleResult<()>
-            {
-              unsafe
-              {
-                $field_prefix_ident::mul_scalars_cuda(
-                  a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), )
-                  .wrap()
-              }
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::mul_scalars_cuda(
+                        a.as_ptr(),
+                        b.as_ptr(),
+                        a.len() as u32,
+                        cfg as *const VecOpsConfig,
+                        result.as_mut_ptr(),
+                    )
+                    .wrap()
+                }
             }
 
             fn transpose(
@@ -443,63 +442,71 @@ macro_rules !impl_vec_ops_field
                 ctx: &DeviceContext,
                 on_device: bool,
                 is_async: bool,
-            ) -> IcicleResult<()>
-            {
-              unsafe
-              {
-                $field_prefix_ident::transpose_cuda(
-                  input.as_ptr(), row_size, column_size, output.as_mut_ptr(), ctx as* const _ as* const DeviceContext,
-                  on_device, is_async)
-                  .wrap()
-              }
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::transpose_cuda(
+                        input.as_ptr(),
+                        row_size,
+                        column_size,
+                        output.as_mut_ptr(),
+                        ctx as *const _ as *const DeviceContext,
+                        on_device,
+                        is_async,
+                    )
+                    .wrap()
+                }
             }
 
             fn bit_reverse(
                 input: &(impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &BitReverseConfig,
                 output: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
-            ) -> IcicleResult<()>
-            {
-              unsafe
-              {
-                $field_prefix_ident::bit_reverse_cuda(
-                  input.as_ptr(), input.len() as u64, cfg as* const BitReverseConfig, output.as_mut_ptr(), )
-                  .wrap()
-              }
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::bit_reverse_cuda(
+                        input.as_ptr(),
+                        input.len() as u64,
+                        cfg as *const BitReverseConfig,
+                        output.as_mut_ptr(),
+                    )
+                    .wrap()
+                }
             }
 
             fn bit_reverse_inplace(
                 input: &mut (impl HostOrDeviceSlice<$field> + ?Sized),
                 cfg: &BitReverseConfig,
-            ) -> IcicleResult<()>
-            {
-              unsafe
-              {
-                $field_prefix_ident::bit_reverse_cuda(
-                  input.as_ptr(), input.len() as u64, cfg as* const BitReverseConfig, input.as_mut_ptr(), )
-                  .wrap()
-              }
+            ) -> IcicleResult<()> {
+                unsafe {
+                    $field_prefix_ident::bit_reverse_cuda(
+                        input.as_ptr(),
+                        input.len() as u64,
+                        cfg as *const BitReverseConfig,
+                        input.as_mut_ptr(),
+                    )
+                    .wrap()
+                }
             }
         }
-  };
+    };
 }
 
 #[macro_export]
-macro_rules !impl_vec_add_tests
-{
-  ($field
+macro_rules! impl_vec_add_tests {
+    ($field
    : ident) => {
-#[test]
-                  pub fn test_vec_add_scalars(){check_vec_ops_scalars::<$field>();
-}
+        #[test]
+        pub fn test_vec_add_scalars() {
+            check_vec_ops_scalars::<$field>();
+        }
 
-#[test]
-pub fn test_bit_reverse(){check_bit_reverse::<$field>()}
-#[test]
-pub fn test_bit_reverse_inplace()
-{
-  check_bit_reverse_inplace::<$field>()
-}
-}
-;
+        #[test]
+        pub fn test_bit_reverse() {
+            check_bit_reverse::<$field>()
+        }
+        #[test]
+        pub fn test_bit_reverse_inplace() {
+            check_bit_reverse_inplace::<$field>()
+        }
+    };
 }

From b10ee73b5ebf0e3b206897acc0a222b5177592a4 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Sun, 4 Aug 2024 06:19:10 +0000
Subject: [PATCH 07/21] Remove ## Best-Practices section

---
 examples/c++/mont_vec_ops/README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/c++/mont_vec_ops/README.md b/examples/c++/mont_vec_ops/README.md
index 736303872..9e309dc6c 100644
--- a/examples/c++/mont_vec_ops/README.md
+++ b/examples/c++/mont_vec_ops/README.md
@@ -5,9 +5,6 @@ is_result_on_device
 is_in_montgomery_form
 (is_async isn't checked)
 
-## Best-Practices
-
-We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy.
 
 ## Key-Takeaway
 

From 1fc640c8b342b2ad5b44b08fc85b9ffaf9517e95 Mon Sep 17 00:00:00 2001
From: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
Date: Tue, 6 Aug 2024 13:51:00 +0300
Subject: [PATCH 08/21] Apply suggestions from code review

---
 wrappers/rust/icicle-core/src/vec_ops/mod.rs | 24 ++++++++++++--------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index 20152beb4..b090e4c9a 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -300,14 +300,19 @@ where
 
 #[macro_export]
 macro_rules! impl_vec_ops_field {
-    ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) => {
+    (
+        $field_prefix:literal,
+        $field_prefix_ident:ident,
+        $field:ident,
+        $field_config:ident
+    ) => {
         mod $field_prefix_ident {
             use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice};
             use icicle_core::vec_ops::BitReverseConfig;
             use icicle_core::vec_ops::VecOpsConfig;
 
             extern "C" {
-                #[link_name = concat !($field_prefix, "_add_cuda")]
+                #[link_name = concat!($field_prefix, "_add_cuda")]
                 pub(crate) fn add_scalars_cuda(
                     a: *const $field,
                     b: *const $field,
@@ -316,7 +321,7 @@ macro_rules! impl_vec_ops_field {
                     result: *mut $field,
                 ) -> CudaError;
 
-                #[link_name = concat !($field_prefix, "_accumulate_cuda")]
+                #[link_name = concat!($field_prefix, "_accumulate_cuda")]
                 pub(crate) fn accumulate_scalars_cuda(
                     a: *mut $field,
                     b: *const $field,
@@ -324,7 +329,7 @@ macro_rules! impl_vec_ops_field {
                     cfg: *const VecOpsConfig,
                 ) -> CudaError;
 
-                #[link_name = concat !($field_prefix, "_sub_cuda")]
+                #[link_name = concat!($field_prefix, "_sub_cuda")]
                 pub(crate) fn sub_scalars_cuda(
                     a: *const $field,
                     b: *const $field,
@@ -333,7 +338,7 @@ macro_rules! impl_vec_ops_field {
                     result: *mut $field,
                 ) -> CudaError;
 
-                #[link_name = concat !($field_prefix, "_mul_cuda")]
+                #[link_name = concat!($field_prefix, "_mul_cuda")]
                 pub(crate) fn mul_scalars_cuda(
                     a: *const $field,
                     b: *const $field,
@@ -342,7 +347,7 @@ macro_rules! impl_vec_ops_field {
                     result: *mut $field,
                 ) -> CudaError;
 
-                #[link_name = concat !($field_prefix, "_transpose_matrix_cuda")]
+                #[link_name = concat!($field_prefix, "_transpose_matrix_cuda")]
                 pub(crate) fn transpose_cuda(
                     input: *const $field,
                     row_size: u32,
@@ -353,7 +358,7 @@ macro_rules! impl_vec_ops_field {
                     is_async: bool,
                 ) -> CudaError;
 
-                #[link_name = concat !($field_prefix, "_bit_reverse_cuda")]
+                #[link_name = concat!($field_prefix, "_bit_reverse_cuda")]
                 pub(crate) fn bit_reverse_cuda(
                     input: *const $field,
                     size: u64,
@@ -493,8 +498,9 @@ macro_rules! impl_vec_ops_field {
 
 #[macro_export]
 macro_rules! impl_vec_add_tests {
-    ($field
-   : ident) => {
+    (
+        $field:ident
+    ) => {
         #[test]
         pub fn test_vec_add_scalars() {
             check_vec_ops_scalars::<$field>();

From 080071870fe0f2dac0fa1e2d19d22498047d14da Mon Sep 17 00:00:00 2001
From: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com>
Date: Tue, 6 Aug 2024 13:52:55 +0300
Subject: [PATCH 09/21] Update wrappers/rust/icicle-core/src/vec_ops/mod.rs

---
 wrappers/rust/icicle-core/src/vec_ops/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index b090e4c9a..efb2cab53 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -499,7 +499,7 @@ macro_rules! impl_vec_ops_field {
 #[macro_export]
 macro_rules! impl_vec_add_tests {
     (
-        $field:ident
+      $field:ident
     ) => {
         #[test]
         pub fn test_vec_add_scalars() {

From 58354e43875ab91b73fc28b092473b569c8bb2a2 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Mon, 12 Aug 2024 14:08:13 +0000
Subject: [PATCH 10/21] WIP on illegal memory access bug.

---
 examples/c++/mont_vec_ops/example.cu | 501 ++++++++++++++++-----------
 icicle/include/vec_ops/vec_ops.cuh   |   7 +-
 icicle/src/vec_ops/vec_ops.cu        |  37 +-
 3 files changed, 325 insertions(+), 220 deletions(-)

diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index 8cf21e1fc..a1fe871be 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -44,41 +44,12 @@ int vector_op(
   }
   return 0;
 }
-int vector_mul(
-  T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
-{
-  cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
-  if (err != cudaSuccess) {
-    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
-    return 0;
-  }
-  return 0;
-}
-int vector_add(
-  T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
-{
-  cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result);
-  if (err != cudaSuccess) {
-    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
-    return 0;
-  }
-  return 0;
-}
-int vector_sub(
-  T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config)
-{
-  cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
-  if (err != cudaSuccess) {
-    std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
-    return 0;
-  }
-  return 0;
-}
 
 int main(int argc, char** argv)
 {
-  const unsigned vector_size = 1 << 0;
-  const unsigned repetitions = 1 << 0;
+  const unsigned vector_size              = 1 << 11;
+  const unsigned not_in_place_repetitions = 1 << 0;    // Repetitions are used only for the non in-place tests.
+  const unsigned in_place_repetitions     = 1;    // Repetitions for in-place tests should be 1. Don't check it.
 
   cudaError_t err;
   nvmlInit();
@@ -94,9 +65,10 @@ int main(int argc, char** argv)
   unsigned power_limit;
   nvmlDeviceGetPowerManagementLimit(device, &power_limit);
 
-  std::cout << "Vector size: " << vector_size << std::endl;
-  std::cout << "Repetitions: " << repetitions << std::endl;
-  std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
+  std::cout << "Vector size:              " << vector_size << std::endl;
+  std::cout << "not_in_place_repetitions: " << not_in_place_repetitions << std::endl;
+  std::cout << "in_place_repetitions:     " << in_place_repetitions << std::endl;
+  std::cout << "Power limit:              " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
 
   unsigned int baseline_power;
   nvmlDeviceGetPowerUsage(device, &baseline_power);
@@ -111,21 +83,27 @@ int main(int argc, char** argv)
   // host data
   std::cout << "Allocate memory for the input vectors (both normal and Montgomery presentation)" << std::endl;
   T* host_in1_init = (T*)malloc(vector_size * sizeof(T));
+  std::cout << "example malloc host_in1_init" << std::endl;
   T* host_in2_init = (T*)malloc(vector_size * sizeof(T));
+  std::cout << "example malloc host_in2_init" << std::endl;
   std::cout << "Initializing vectors with normal presentation random data" << std::endl;
   T::rand_host_many(host_in1_init, vector_size);
   T::rand_host_many(host_in2_init, vector_size);
   std::cout << "Allocate memory for the output vectors" << std::endl;
   T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output.
+  std::cout << "example malloc host_out" << std::endl;
   T* host_out_ref_mul = (T*)malloc(
     vector_size *
     sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content).
+  std::cout << "example malloc host_out_ref_mul" << std::endl;
   T* host_out_ref_add = (T*)malloc(
     vector_size *
     sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content).
+  std::cout << "example malloc host_out_ref_add" << std::endl;    
   T* host_out_ref_sub = (T*)malloc(
     vector_size *
     sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content).
+  std::cout << "malloc host_out_ref_sub" << std::endl;
   std::cout << "Initializing output vectors with random data" << std::endl;
   T::rand_host_many(host_out, vector_size);
   T::rand_host_many(host_out_ref_mul, vector_size);
@@ -138,22 +116,27 @@ int main(int argc, char** argv)
   T* device_out;
 
   err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
+  std::cout << "example cudaMalloc device_in1" << std::endl;
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
   err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
+  std::cout << "example cudaMalloc device_in2" << std::endl;
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
   err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
+  std::cout << "example cudaMalloc device_out" << std::endl;
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
 
   vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
+  int nof_of_configs_for_test = 5;
+  int nof_of_storage_configs = 3;   // 2 inputs, 1 result.
 
   //****************************************
   // Test warn-up and reference output config. Reference output to be used to check if test passed or not.
@@ -172,28 +155,25 @@ int main(int argc, char** argv)
   std::cout << "Starting warm-up run" << std::endl;
   // Warm-up loop
   for (int op = MUL; op != LAST; op++) {
-    for (int i = 0; i < repetitions; i++) {
-      // vector_mul(device_in1, device_in2, device_out, vector_size, ctx, config);
+    for (int i = 0; i < not_in_place_repetitions; i++) {
       vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
-    }
-    switch (op) {
-    case MUL:
-      err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-      break;
-    case ADD:
-      err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-      break;
-    case SUB:
-      err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-      break;
+      switch (op) {
+        case MUL:
+          err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+          break;
+        case ADD:
+          err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+          break;
+        case SUB:
+          err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+          break;
+      }
+      if (err != cudaSuccess) {
+        std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
+        return 0;
+      }
     }
   }
-  // copy the result from device to host_out_ref_mul to keep it for later comparisons.
-  // err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-  if (err != cudaSuccess) {
-    std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
-    return 0;
-  }
   //****************************************
   // End of test warn-up and reference output config.
   //****************************************
@@ -214,30 +194,64 @@ int main(int argc, char** argv)
   //*******************************************************
   // Benchmark test:
   // Loop for (mul, add, sub):
-  //   Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form):
+  //   Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_input_in_montgomery_form):
   //*******************************************************
   T* host_in1 =
     (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark.
+  std::cout << "example malloc host_in1" << std::endl;
   T* host_in2 =
     (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark.
+  std::cout << "example malloc host_in1" << std::endl;
   // Test when the result is not in-place
+  std::cout << "*****************************************" << std::endl;
+  std::cout << "*** Start not in-place benchmark loop ***" << std::endl;
+  std::cout << "*****************************************" << std::endl;
   for (int op = MUL; op != LAST; op++) {
-    // for (int config_idx = 0; config_idx < 0; config_idx++) {
-    for (int config_idx = 0; config_idx < 16; config_idx++) {
+    for (int config_idx = 0; config_idx < 32; config_idx++) {
+      // // DEBUG
+      // int num_gpus;
+      // size_t free, total;
+      // cudaGetDeviceCount( &num_gpus );
+      // std::cout << "num_gpus " << num_gpus << std::endl;
+      // for ( int gpu_id = 0; gpu_id < num_gpus; gpu_id++ ) {
+      //     cudaSetDevice( gpu_id );
+      //     int id;
+      //     cudaGetDevice( &id );
+      //     cudaMemGetInfo( &free, &total );
+      //     std::cout << "GPU " << id << " memory: free=" << free << ", total=" << total << std::endl;
+      // }
+      // // DEBUG
+      switch (op) {
+        case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; break;
+        case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " not in-place" << std::endl; break;
+        case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " not in-place" << std::endl; break;
+      }
       std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
+      // Destroy the result of the prev loop.
+      T::rand_host_many(host_out, vector_size);   // Randomize host_out in order to randomize device_out.
+      err = cudaMemcpy(
+        device_out, host_out, vector_size * sizeof(T),
+        cudaMemcpyHostToDevice); // Copy random data to device_out.
+      if (err != cudaSuccess) {
+        std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
+        return 0;
+      }
+      T::rand_host_many(host_out, vector_size);   // Make hist_out != device_out. 
+      // Initializa inputs with the known data
       for (int i = 0; i < vector_size; i++) {
         host_in1[i] = host_in1_init[i];
         host_in2[i] = host_in2_init[i];
       }
-      config.is_a_on_device = (config_idx >> 3) & 0x1;
-      config.is_b_on_device = (config_idx >> 2) & 0x1;
-      config.is_result_on_device = (config_idx >> 1) & 0x1;
-      config.is_in_montgomery_form = (config_idx >> 0) & 0x1;
+      config.is_a_on_device               = (config_idx >> 4) & 0x1;
+      config.is_b_on_device               = (config_idx >> 3) & 0x1;
+      config.is_result_on_device          = (config_idx >> 2) & 0x1;
+      config.is_input_in_montgomery_form  = (config_idx >> 1) & 0x1;
+      config.is_result_in_montgomery_form = (config_idx >> 0) & 0x1;
 
       // Copy from host to device (copy again in order to be used later in the loop and device_inX was already
       // overwritten by warmup.
       if (config.is_a_on_device) {
-        if (config.is_in_montgomery_form) {
+        if (config.is_input_in_montgomery_form) {
           err =
             cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -246,6 +260,10 @@ int main(int argc, char** argv)
           }
           CHK_IF_RETURN(
             mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
+          // Destroy host_in1 value with values of host_in2.
+          for (int i = 0; i < vector_size; i++) {
+            host_in1[i] = host_in2_init[i];
+          }
         } else {                                                                          // Normal presentation.
           err =
             cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
@@ -255,7 +273,7 @@ int main(int argc, char** argv)
           }
         }
       } else {
-        if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
+        if (config.is_input_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
           err =
             cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -268,10 +286,17 @@ int main(int argc, char** argv)
             std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
+          // Destroy device_in1 value with values of host_in2.
+          err =
+            cudaMemcpy(device_in1, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
         }
       }
       if (config.is_b_on_device) {
-        if (config.is_in_montgomery_form) {
+        if (config.is_input_in_montgomery_form) {
           err =
             cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -280,6 +305,10 @@ int main(int argc, char** argv)
           }
           CHK_IF_RETURN(
             mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
+          // Destroy host_in2 value with values of host_in1.
+          for (int i = 0; i < vector_size; i++) {
+            host_in2[i] = host_in1_init[i];
+          }
         } else {
           // Normal presentation.
           err =
@@ -290,7 +319,7 @@ int main(int argc, char** argv)
           }
         }
       } else {
-        if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
+        if (config.is_input_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
           err =
             cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -303,38 +332,45 @@ int main(int argc, char** argv)
             std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
+          // Destroy device_in2 valuewith values of host_in1.
+          err =
+            cudaMemcpy(device_in2, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_in2 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
         }
       }
       CHK_IF_RETURN(cudaPeekAtLastError());
 
       auto start_time = std::chrono::high_resolution_clock::now();
       // Benchmark loop
-      for (int i = 0; i < repetitions; i++) {
-        switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device}
-        case 0b000:
-          vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b001:
-          vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b010:
-          vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b011:
-          vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b100:
-          vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b101:
-          vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b110:
-          vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b111:
-          vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
-          break;
+      for (int i = 0; i < not_in_place_repetitions; i++) {
+        switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
+          case 0b000:
+            vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b001:
+            vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b010:
+            vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b011:
+            vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b100:
+            vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b101:
+            vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b110:
+            vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b111:
+            vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
+            break;
         }
         CHK_IF_RETURN(cudaPeekAtLastError());
       }
@@ -356,40 +392,40 @@ int main(int argc, char** argv)
         break;
       }
 
-      if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
-        if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed.
+      if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_XXX value.
+        if (config.is_result_in_montgomery_form) { // Convert to normal from montgomery if needed.
           CHK_IF_RETURN(mont::from_montgomery(
             device_out, vector_size, config.ctx.stream,
-            device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
+            device_out)); // Convert to normal in order to check vs. host_out_ref_XXX.
         }
         err = cudaMemcpy(
           host_out, device_out, vector_size * sizeof(T),
-          cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
+          cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_XXX.
         if (err != cudaSuccess) {
           std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
           return 0;
         }
       } else {                              // Data is not on device but it is in host_out.
-        if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and
-                                            // written back to host. Then compared vs. host_out_ref_mul.
+        if (config.is_result_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and
+                                                   // written back to host. Then compared vs. host_out_ref_XXX.
           err = cudaMemcpy(
             device_out, host_out, vector_size * sizeof(T),
-            cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
+            cudaMemcpyHostToDevice); // Copy to device_out in order to check later vs. host_out_ref_XXX.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
           CHK_IF_RETURN(mont::from_montgomery(
             device_out, vector_size, config.ctx.stream,
-            device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
+            device_out)); // Convert to normal in order to check vs. host_out_ref_XXX.
           err = cudaMemcpy(
             host_out, device_out, vector_size * sizeof(T),
-            cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
+            cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_XXX.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-        } else { // host_out could be compared vs. host_out_ref_mul as is.
+        } else { // host_out could be compared vs. host_out_ref_XXX as is.   
         }
       }
       //****************************************
@@ -397,14 +433,12 @@ int main(int argc, char** argv)
       //****************************************
 
       //***********************************************
-      // Test result check
+      // Test result check (not in-place)
       // Check is performed by executing the operation in a normal presentation
-      //   (located in in host_out_ref_mul) and comparing it with the
+      //   (located in in host_out_ref_XXX) and comparing it with the
       //   benchmark test result.
       //***********************************************
       int test_failed = 0;
-      // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl;
-      // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl;
       switch (op) {
       case MUL:
         for (int i = 0; i < vector_size; i++) {
@@ -435,13 +469,18 @@ int main(int argc, char** argv)
         break;
       }
       if (test_failed) {
-        // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" <<
-        // std::endl;
-        std::cout << "===>>> result is not in-place: " << std::endl;
-        std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
-        std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
-        std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
-        std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
+        std::cout << "===>>> result is in-place:                " << std::endl;
+        std::cout << "===>>> is_a_on_device:                    " << config.is_a_on_device << std::endl;
+        std::cout << "===>>> is_b_on_device:                    " << config.is_b_on_device << std::endl;
+        std::cout << "===>>> is_result_on_device:               " << config.is_result_on_device << std::endl;
+        std::cout << "===>>> is_input_in_montgomery_form:       " << config.is_input_in_montgomery_form << std::endl;
+        std::cout << "===>>> is_input_resultin_montgomery_form: " << config.is_result_in_montgomery_form << std::endl;
+        std::cout << "===>>> host_in1_init[0]                      = " << host_in1_init[0] << std::endl;
+        std::cout << "===>>> host_in2_init[0]                      = " << host_in2_init[0] << std::endl;
+        std::cout << "===>>> host_out[0]                           = " << host_out[0] << std::endl;
+        std::cout << "===>>> warm-up: normal host_out_ref[0] (MUL) = " << host_out_ref_mul[0] << std::endl;
+        std::cout << "===>>> warm-up: normal host_out_ref[0] (ADD) = " << host_out_ref_add[0] << std::endl;
+        std::cout << "===>>> warm-up: normal host_out_ref[0] (SUB) = " << host_out_ref_sub[0] << std::endl;
         exit(2);
       }
 
@@ -458,28 +497,50 @@ int main(int argc, char** argv)
       }
 
       // Report performance in GMPS: Giga Multiplications Per Second
-      double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
+      double GMPS = 1.0e-9 * not_in_place_repetitions * vector_size / (1.0e-6 * duration.count());
       std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
     }
   }
 
   // Test when the result is in-place
+  std::cout << "*************************************" << std::endl;
+  std::cout << "*** Start in-place benchmark loop ***" << std::endl;
+  std::cout << "*************************************" << std::endl;
   for (int op = MUL; op != LAST; op++) {
-    for (int config_idx = 0; config_idx < 16; config_idx++) {
+    for (int config_idx = 0; config_idx < 32; config_idx++) {
+      switch (op) {
+        case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " in-place" << std::endl; break;
+        case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " in-place" << std::endl; break;
+        case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " in-place" << std::endl; break;
+      }
+      std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
+      // Destroy the result of the prev loop.
+      T::rand_host_many(host_out, vector_size);   // Randomize host_out in order to randomize device_out.
+      err = cudaMemcpy(
+        device_out, host_out, vector_size * sizeof(T),
+        cudaMemcpyHostToDevice); // Copy random data to device_out.
+      if (err != cudaSuccess) {
+        std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
+        return 0;
+      }
+      T::rand_host_many(host_out, vector_size);   // Make hist_out != device_out. 
+      // Initializa inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it should be initialized later in the repetions loop.
       for (int i = 0; i < vector_size; i++) {
         host_in1[i] = host_in1_init[i];
         host_in2[i] = host_in2_init[i];
       }
-      config.is_a_on_device = (config_idx >> 4) & 0x1;
-      config.is_b_on_device = (config_idx >> 3) & 0x1;
-      config.is_result_on_device = (config_idx >> 2) & 0x1;
-      config.is_in_montgomery_form = (config_idx >> 1) & 0x1;
-      if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; }
+      config.is_a_on_device               = (config_idx >> 4) & 0x1;
+      config.is_b_on_device               = (config_idx >> 3) & 0x1;
+      config.is_result_on_device          = (config_idx >> 2) & 0x1;
+      config.is_input_in_montgomery_form  = (config_idx >> 1) & 0x1;
+      config.is_result_in_montgomery_form = (config_idx >> 1) & 0x1;
+      if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; }    // Illegal case for this loop.
+      if (config.is_input_in_montgomery_form ^ config.is_result_in_montgomery_form == 1) { continue; }    // Illegal case for this loop.
 
       // Copy from host to device (copy again in order to be used later in the loop and device_inX was already
       // overwritten by warmup.
       if (config.is_a_on_device) {
-        if (config.is_in_montgomery_form) {
+        if (config.is_input_in_montgomery_form) {
           err =
             cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -488,6 +549,10 @@ int main(int argc, char** argv)
           }
           CHK_IF_RETURN(
             mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place.
+          // Destroy host_in1 value with values of host_in2.
+          for (int i = 0; i < vector_size; i++) {
+            host_in1[i] = host_in2_init[i];
+          }
         } else {                                                                          // Normal presentation.
           err =
             cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
@@ -497,7 +562,7 @@ int main(int argc, char** argv)
           }
         }
       } else {
-        if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
+        if (config.is_input_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
           err =
             cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -510,10 +575,17 @@ int main(int argc, char** argv)
             std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
+          // Destroy device_in1 value with values of host_in2.
+          err =
+            cudaMemcpy(device_in1, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
         }
       }
       if (config.is_b_on_device) {
-        if (config.is_in_montgomery_form) {
+        if (config.is_input_in_montgomery_form) {
           err =
             cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -522,6 +594,10 @@ int main(int argc, char** argv)
           }
           CHK_IF_RETURN(
             mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place.
+          // Destroy host_in2 value with values of host_in1.
+          for (int i = 0; i < vector_size; i++) {
+            host_in2[i] = host_in1_init[i];
+          }
         } else {
           // Normal presentation.
           err =
@@ -532,7 +608,7 @@ int main(int argc, char** argv)
           }
         }
       } else {
-        if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
+        if (config.is_input_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host.
           err =
             cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -545,34 +621,42 @@ int main(int argc, char** argv)
             std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
+          // Destroy device_in2 valuewith values of host_in1.
+          err =
+            cudaMemcpy(device_in2, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
+          if (err != cudaSuccess) {
+            std::cerr << "Failed to copy data from host_in1 to device_in2 - " << cudaGetErrorString(err) << std::endl;
+            return 0;
+          }
         }
       }
       CHK_IF_RETURN(cudaPeekAtLastError());
 
       auto start_time = std::chrono::high_resolution_clock::now();
       // Benchmark loop
-      for (int i = 0; i < repetitions; i++) {
-        switch (config_idx >> 2) { // {is_a_on_device, is_b_on_device, is_result_on_device}
-        case 0b000:
-          vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b001:
-          break;
-        case 0b010:
-          vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b011:
-          break;
-        case 0b100:
-          break;
-        case 0b101:
-          vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);
-          break;
-        case 0b110:
-          break;
-        case 0b111:
-          vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op);
-          break;
+      for (int i = 0; i < in_place_repetitions; i++) {
+        switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
+          case 0b000:
+            vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b001:
+            break;
+          case 0b010:
+            vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op);
+            break;
+          case 0b011:
+            break;
+          case 0b100:
+            break;
+          case 0b101:
+            vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);
+            std::cout << "===>>> COMMAND: vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);" << std::endl;
+            break;
+          case 0b110:
+            break;
+          case 0b111:
+            vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op);
+            break;
         }
         CHK_IF_RETURN(cudaPeekAtLastError());
       }
@@ -594,53 +678,42 @@ int main(int argc, char** argv)
         break;
       }
 
-      if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value.
-        if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed.
+      if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_XXX value.
+        if (config.is_result_in_montgomery_form) { // Convert to normal from montgomery if needed.
           CHK_IF_RETURN(mont::from_montgomery(
             device_in1, vector_size, config.ctx.stream,
-            device_in1)); // Convert to normal in order to check vs. host_out_ref_mul.
+            device_in1)); // Convert to normal in order to check vs. host_out_ref_XXX.
         }
         err = cudaMemcpy(
           host_out, device_in1, vector_size * sizeof(T),
-          cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
+          cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_XXX.
         if (err != cudaSuccess) {
           std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl;
           return 0;
         }
       } else { // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check.
-        if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and
-                                            // written back to host. Then compared vs. host_out_ref_mul.
+        if (config.is_result_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and
+                                                   // written back to host. Then compared vs. host_out_ref_XXX.
           err = cudaMemcpy(
             device_out, host_in1, vector_size * sizeof(T),
-            cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
+            cudaMemcpyHostToDevice); // Copy to device_out in order to check later vs. host_out_ref_XXX.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
           CHK_IF_RETURN(mont::from_montgomery(
             device_out, vector_size, config.ctx.stream,
-            device_out)); // Convert to normal in order to check vs. host_out_ref_mul.
+            device_out)); // Convert to normal in order to check vs. host_out_ref_XXX.
           err = cudaMemcpy(
             host_out, device_out, vector_size * sizeof(T),
-            cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
+            cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_XXX.
           if (err != cudaSuccess) {
             std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-        } else { // host_out could be compared vs. host_out_ref_mul as is.
-          err = cudaMemcpy(
-            device_out, host_in1, vector_size * sizeof(T),
-            cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul.
-          if (err != cudaSuccess) {
-            std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl;
-            return 0;
-          }
-          err = cudaMemcpy(
-            host_out, device_out, vector_size * sizeof(T),
-            cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul.
-          if (err != cudaSuccess) {
-            std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
-            return 0;
+        } else { // host_out could be compared vs. host_out_ref_XXX as is.
+          for (int i = 0; i < vector_size; i++) {   // Copy from host_in1 (result) to host_out to compare later vs. host_out_ref_XXX.
+            host_out[i] = host_in1[i];
           }
         }
       }
@@ -649,55 +722,54 @@ int main(int argc, char** argv)
       //****************************************
 
       //***********************************************
-      // Test result check
+      // Test result check (in-place)
       // Check is performed by executing the operation in a normal presentation
-      //   (located in in host_out_ref_mul) and comparing it with the
+      //   (located in in host_out_ref_XXX) and comparing it with the
       //   benchmark test result.
       //***********************************************
       int test_failed = 0;
-      // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl;
-      // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl;
       switch (op) {
-      case MUL:
-        for (int i = 0; i < vector_size; i++) {
-          if (host_out_ref_mul[i] != host_out[i]) {
-            std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
-                      << ", config is printed below:" << std::endl;
-            std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl;
-            test_failed = 1;
+        case MUL:
+          for (int i = 0; i < vector_size; i++) {
+            if (host_out_ref_mul[i] != host_out[i]) {
+              std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
+                        << ", config is printed below:" << std::endl;
+              test_failed = 1;
+            }
           }
-        }
-        break;
-      case ADD:
-        for (int i = 0; i < vector_size; i++) {
-          if (host_out_ref_add[i] != host_out[i]) {
-            std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
-                      << ", config is printed below:" << std::endl;
-            std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl;
-            test_failed = 1;
+          break;
+        case ADD:
+          for (int i = 0; i < vector_size; i++) {
+            if (host_out_ref_add[i] != host_out[i]) {
+              std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
+                        << ", config is printed below:" << std::endl;
+              test_failed = 1;
+            }
           }
-        }
-        break;
-      case SUB:
-        for (int i = 0; i < vector_size; i++) {
-          if (host_out_ref_sub[i] != host_out[i]) {
-            std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
-                      << ", config is printed below:" << std::endl;
-            std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl;
-            test_failed = 1;
+          break;
+        case SUB:
+          for (int i = 0; i < vector_size; i++) {
+            if (host_out_ref_sub[i] != host_out[i]) {
+              std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
+                        << ", config is printed below:" << std::endl;
+              test_failed = 1;
+            }
           }
-        }
-        break;
+          break;
       }
       if (test_failed) {
-        // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" <<
-        // std::endl;
-        std::cout << "===>>> result is in-place: " << std::endl;
-        std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl;
-        std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl;
-        std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl;
-        std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl;
-        std::cout << "host_out[0] = " << host_out[0] << std::endl;
+        std::cout << "===>>> result is in-place:                " << std::endl;
+        std::cout << "===>>> is_a_on_device:                    " << config.is_a_on_device << std::endl;
+        std::cout << "===>>> is_b_on_device:                    " << config.is_b_on_device << std::endl;
+        std::cout << "===>>> is_result_on_device:               " << config.is_result_on_device << std::endl;
+        std::cout << "===>>> is_input_in_montgomery_form:       " << config.is_input_in_montgomery_form << std::endl;
+        std::cout << "===>>> is_input_resultin_montgomery_form: " << config.is_result_in_montgomery_form << std::endl;
+        std::cout << "===>>> host_in1_init[0]                      = " << host_in1_init[0] << std::endl;
+        std::cout << "===>>> host_in2_init[0]                      = " << host_in2_init[0] << std::endl;
+        std::cout << "===>>> host_out[0]                           = " << host_out[0] << std::endl;
+        std::cout << "===>>> warm-up: normal host_out_ref[0] (MUL) = " << host_out_ref_mul[0] << std::endl;
+        std::cout << "===>>> warm-up: normal host_out_ref[0] (ADD) = " << host_out_ref_add[0] << std::endl;
+        std::cout << "===>>> warm-up: normal host_out_ref[0] (SUB) = " << host_out_ref_sub[0] << std::endl;
         exit(2);
       }
 
@@ -714,21 +786,34 @@ int main(int argc, char** argv)
       }
 
       // Report performance in GMPS: Giga Multiplications Per Second
-      double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count());
+      double GMPS = 1.0e-9 * in_place_repetitions * vector_size / (1.0e-6 * duration.count());
       std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl;
     }
   }
 
   // clean up and exit
   free(host_in1_init);
+  std::cout << "example free host_in1_init" << std::endl;
   free(host_in2_init);
+  std::cout << "example free host_in2_init" << std::endl;
   free(host_in1);
+  std::cout << "example free host_in1" << std::endl;
   free(host_in2);
+  std::cout << "example free host_in2" << std::endl;
   free(host_out);
+  std::cout << "example free host_out" << std::endl;
   free(host_out_ref_mul);
+  std::cout << "example free host_out_ref_mul" << std::endl;
+  free(host_out_ref_add);
+  std::cout << "example free host_out_ref_add" << std::endl;
+  free(host_out_ref_sub);
+  std::cout << "example free host_out_ref_sub" << std::endl;
   cudaFree(device_in1);
+  std::cout << "example cudaFree device_in1" << std::endl;
   cudaFree(device_in2);
+  std::cout << "example cudaFree device_in2" << std::endl;
   cudaFree(device_out);
+  std::cout << "example cudaFree device_out" << std::endl;
   nvmlShutdown();
   return 0;
 }
diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh
index 2a239689a..4de406b34 100644
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -27,7 +27,9 @@ namespace vec_ops {
                     *   non-blocking and you'd need to synchronize it explicitly by running
                     *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
                     *   function will block the current CPU thread. */
-    bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form.
+    bool is_input_in_montgomery_form; /**< If true then vec_a and vec_b are in montgomery form.
+                                 * Default value: false.  */
+    bool is_result_in_montgomery_form; /**< If true then result is in montgomery form.
                                  * Default value: false.  */
   };
 
@@ -44,7 +46,8 @@ namespace vec_ops {
       false, // is_b_on_device
       false, // is_result_on_device
       false, // is_async
-      false, // is_in_montgomery_form
+      false, // is_input_in_montgomery_form
+      false, // is_result_in_montgomery_form
     };
     return config;
   }
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index 9883d393c..07aa0f855 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -98,22 +98,25 @@ namespace vec_ops {
 
     int is_d_alloc_vec_a_allocated = 0;
     if (!config.is_a_on_device) {
-      if (config.is_in_montgomery_form) {
+      if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
+        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl;
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
         is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
       } else {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
+        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl;
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
       }
     } else {
-      if (config.is_in_montgomery_form) {
+      if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(
           &d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
+        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl;
         CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
         is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
@@ -124,22 +127,25 @@ namespace vec_ops {
 
     int is_d_alloc_vec_b_allocated = 0;
     if (!config.is_b_on_device) {
-      if (config.is_in_montgomery_form) {
+      if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
+        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl;
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
         is_d_alloc_vec_b_allocated = 1;
         d_vec_b = d_alloc_vec_b;
       } else {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
+        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl;
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         is_d_alloc_vec_b_allocated = 1;
         d_vec_b = d_alloc_vec_b;
       }
     } else {
-      if (config.is_in_montgomery_form) {
+      if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(
           &d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
+        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl;
         CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
         is_d_alloc_vec_b_allocated = 1;
         d_vec_b = d_alloc_vec_b;
@@ -152,6 +158,7 @@ namespace vec_ops {
     if (!config.is_result_on_device) {
       if (!is_in_place) {
         CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
+        std::cout << "vec_ops cudaMallocAsync d_result" << std::endl;
         is_d_result_allocated = 1;
       } else {
         d_result = d_vec_a;
@@ -160,7 +167,8 @@ namespace vec_ops {
       if (!is_in_place) {
         d_result = result;
       } else {
-        d_result = result = d_vec_a;
+        // d_result = result = d_vec_a;    // DEBUG - looks like a bug for in-place.
+        d_result = result;
       }
     }
 
@@ -168,21 +176,30 @@ namespace vec_ops {
     Kernel<<<num_blocks, num_threads, 0, config.ctx.stream>>>(d_vec_a, d_vec_b, n, d_result);
 
     if (!config.is_result_on_device) {
-      if (config.is_in_montgomery_form) {
+      if (config.is_result_in_montgomery_form) {
         CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place.
         CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
       } else {
         CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
       }
     } else {
-      if (config.is_in_montgomery_form) {
+      if (config.is_result_in_montgomery_form) {     
         CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place.
       }
     }
 
-    if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
-    if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
-    if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); }
+    if (is_d_alloc_vec_a_allocated) {
+      CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream));
+      std::cout << "vec_ops cudaFreeAsync d_alloc_vec_a" << std::endl;
+    }
+    if (is_d_alloc_vec_b_allocated) {
+      CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream));
+      std::cout << "vec_ops cudaFreeAsync d_alloc_vec_b" << std::endl;
+    }
+    if (is_d_result_allocated) {
+      CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
+      std::cout << "vec_ops cudaFreeAsync d_result" << std::endl;
+    }
 
     if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));
 

From 23bc704a3cdc040a328d57010ad9d0738d5e73d8 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 13 Aug 2024 09:41:40 +0000
Subject: [PATCH 11/21] Fix to/from_montgomery function param

---
 examples/c++/mont_vec_ops/example.cu | 151 ++++++++-------------------
 icicle/src/vec_ops/vec_ops.cu        |  22 ++--
 2 files changed, 47 insertions(+), 126 deletions(-)

diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index a1fe871be..f277f2ebc 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -37,7 +37,6 @@ int vector_op(
     err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result);
     break;
   }
-  // cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result);
   if (err != cudaSuccess) {
     std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl;
     return 0;
@@ -47,8 +46,8 @@ int vector_op(
 
 int main(int argc, char** argv)
 {
-  const unsigned vector_size              = 1 << 11;
-  const unsigned not_in_place_repetitions = 1 << 0;    // Repetitions are used only for the non in-place tests.
+  const unsigned vector_size              = 1 << 15;
+  const unsigned not_in_place_repetitions = 1 << 15;    // Repetitions are used only for the non in-place tests.
   const unsigned in_place_repetitions     = 1;    // Repetitions for in-place tests should be 1. Don't check it.
 
   cudaError_t err;
@@ -83,27 +82,21 @@ int main(int argc, char** argv)
   // host data
   std::cout << "Allocate memory for the input vectors (both normal and Montgomery presentation)" << std::endl;
   T* host_in1_init = (T*)malloc(vector_size * sizeof(T));
-  std::cout << "example malloc host_in1_init" << std::endl;
   T* host_in2_init = (T*)malloc(vector_size * sizeof(T));
-  std::cout << "example malloc host_in2_init" << std::endl;
   std::cout << "Initializing vectors with normal presentation random data" << std::endl;
   T::rand_host_many(host_in1_init, vector_size);
   T::rand_host_many(host_in2_init, vector_size);
   std::cout << "Allocate memory for the output vectors" << std::endl;
   T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output.
-  std::cout << "example malloc host_out" << std::endl;
   T* host_out_ref_mul = (T*)malloc(
     vector_size *
     sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content).
-  std::cout << "example malloc host_out_ref_mul" << std::endl;
   T* host_out_ref_add = (T*)malloc(
     vector_size *
     sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content).
-  std::cout << "example malloc host_out_ref_add" << std::endl;    
   T* host_out_ref_sub = (T*)malloc(
     vector_size *
     sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content).
-  std::cout << "malloc host_out_ref_sub" << std::endl;
   std::cout << "Initializing output vectors with random data" << std::endl;
   T::rand_host_many(host_out, vector_size);
   T::rand_host_many(host_out_ref_mul, vector_size);
@@ -116,19 +109,16 @@ int main(int argc, char** argv)
   T* device_out;
 
   err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T));
-  std::cout << "example cudaMalloc device_in1" << std::endl;
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
   err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T));
-  std::cout << "example cudaMalloc device_in2" << std::endl;
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
   }
   err = cudaMalloc((void**)&device_out, vector_size * sizeof(T));
-  std::cout << "example cudaMalloc device_out" << std::endl;
   if (err != cudaSuccess) {
     std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl;
     return 0;
@@ -154,24 +144,27 @@ int main(int argc, char** argv)
   }
   std::cout << "Starting warm-up run" << std::endl;
   // Warm-up loop
+  // for (int i = 0; i < not_in_place_repetitions; i++) {
+  for (int i = 0; i < 100; i++) {   // Nof loops set to 100 because warm-up takes too much time because inputs and outputs are on located on Host.
+    vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, MUL);
+  }
+  // Generate ref results for all ops
   for (int op = MUL; op != LAST; op++) {
-    for (int i = 0; i < not_in_place_repetitions; i++) {
-      vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
-      switch (op) {
-        case MUL:
-          err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-          break;
-        case ADD:
-          err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-          break;
-        case SUB:
-          err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-          break;
-      }
-      if (err != cudaSuccess) {
-        std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
-        return 0;
-      }
+    vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
+    switch (op) {
+      case MUL:
+        err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+        break;
+      case ADD:
+        err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+        break;
+      case SUB:
+        err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+        break;
+    }
+    if (err != cudaSuccess) {
+      std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
+      return 0;
     }
   }
   //****************************************
@@ -198,35 +191,20 @@ int main(int argc, char** argv)
   //*******************************************************
   T* host_in1 =
     (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark.
-  std::cout << "example malloc host_in1" << std::endl;
   T* host_in2 =
     (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark.
-  std::cout << "example malloc host_in1" << std::endl;
   // Test when the result is not in-place
   std::cout << "*****************************************" << std::endl;
   std::cout << "*** Start not in-place benchmark loop ***" << std::endl;
   std::cout << "*****************************************" << std::endl;
   for (int op = MUL; op != LAST; op++) {
-    for (int config_idx = 0; config_idx < 32; config_idx++) {
-      // // DEBUG
-      // int num_gpus;
-      // size_t free, total;
-      // cudaGetDeviceCount( &num_gpus );
-      // std::cout << "num_gpus " << num_gpus << std::endl;
-      // for ( int gpu_id = 0; gpu_id < num_gpus; gpu_id++ ) {
-      //     cudaSetDevice( gpu_id );
-      //     int id;
-      //     cudaGetDevice( &id );
-      //     cudaMemGetInfo( &free, &total );
-      //     std::cout << "GPU " << id << " memory: free=" << free << ", total=" << total << std::endl;
-      // }
-      // // DEBUG
+    for (int config_idx = 28; config_idx < 29; config_idx++) {
+    // for (int config_idx = 0; config_idx < 32; config_idx++) {
       switch (op) {
         case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; break;
         case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " not in-place" << std::endl; break;
         case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " not in-place" << std::endl; break;
       }
-      std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
       // Destroy the result of the prev loop.
       T::rand_host_many(host_out, vector_size);   // Randomize host_out in order to randomize device_out.
       err = cudaMemcpy(
@@ -344,37 +322,16 @@ int main(int argc, char** argv)
       CHK_IF_RETURN(cudaPeekAtLastError());
 
       auto start_time = std::chrono::high_resolution_clock::now();
-      // Benchmark loop
-      for (int i = 0; i < not_in_place_repetitions; i++) {
-        switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
-          case 0b000:
-            vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b001:
-            vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b010:
-            vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b011:
-            vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b100:
-            vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b101:
-            vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b110:
-            vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b111:
-            vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
-            break;
-        }
-        CHK_IF_RETURN(cudaPeekAtLastError());
+      switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
+        case 0b000: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); } break;
+        case 0b001: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); } break;
+        case 0b010: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);  } break;
+        case 0b011: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } break;
+        case 0b100: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); } break;
+        case 0b101: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); } break;
+        case 0b110: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); } break;  
+        case 0b111: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } break;
       }
-
       auto end_time = std::chrono::high_resolution_clock::now();
       auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
       switch (op) {
@@ -513,7 +470,6 @@ int main(int argc, char** argv)
         case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " in-place" << std::endl; break;
         case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " in-place" << std::endl; break;
       }
-      std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl;
       // Destroy the result of the prev loop.
       T::rand_host_many(host_out, vector_size);   // Randomize host_out in order to randomize device_out.
       err = cudaMemcpy(
@@ -636,31 +592,17 @@ int main(int argc, char** argv)
       // Benchmark loop
       for (int i = 0; i < in_place_repetitions; i++) {
         switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
-          case 0b000:
-            vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b001:
-            break;
-          case 0b010:
-            vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op);
-            break;
-          case 0b011:
-            break;
-          case 0b100:
-            break;
-          case 0b101:
-            vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);
-            std::cout << "===>>> COMMAND: vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);" << std::endl;
-            break;
-          case 0b110:
-            break;
-          case 0b111:
-            vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op);
-            break;
+          case 0b000: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); } break;
+          case 0b001: break;
+          case 0b010: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); } break;
+          case 0b011: break;
+          case 0b100: break;
+          case 0b101: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); } break;
+          case 0b110: break;
+          case 0b111: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); } break;
         }
         CHK_IF_RETURN(cudaPeekAtLastError());
       }
-
       auto end_time = std::chrono::high_resolution_clock::now();
       auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
       switch (op) {
@@ -793,27 +735,16 @@ int main(int argc, char** argv)
 
   // clean up and exit
   free(host_in1_init);
-  std::cout << "example free host_in1_init" << std::endl;
   free(host_in2_init);
-  std::cout << "example free host_in2_init" << std::endl;
   free(host_in1);
-  std::cout << "example free host_in1" << std::endl;
   free(host_in2);
-  std::cout << "example free host_in2" << std::endl;
   free(host_out);
-  std::cout << "example free host_out" << std::endl;
   free(host_out_ref_mul);
-  std::cout << "example free host_out_ref_mul" << std::endl;
   free(host_out_ref_add);
-  std::cout << "example free host_out_ref_add" << std::endl;
   free(host_out_ref_sub);
-  std::cout << "example free host_out_ref_sub" << std::endl;
   cudaFree(device_in1);
-  std::cout << "example cudaFree device_in1" << std::endl;
   cudaFree(device_in2);
-  std::cout << "example cudaFree device_in2" << std::endl;
   cudaFree(device_out);
-  std::cout << "example cudaFree device_out" << std::endl;
   nvmlShutdown();
   return 0;
 }
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index 07aa0f855..baac53b43 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -100,14 +100,12 @@ namespace vec_ops {
     if (!config.is_a_on_device) {
       if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
-        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl;
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-        CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
+        CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n, config.ctx.stream, d_alloc_vec_a));
         is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
       } else {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
-        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl;
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
@@ -116,8 +114,7 @@ namespace vec_ops {
       if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(
           &d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
-        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl;
-        CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a));
+        CHK_IF_RETURN(mont::from_montgomery(vec_a, n, config.ctx.stream, d_alloc_vec_a));
         is_d_alloc_vec_a_allocated = 1;
         d_vec_a = d_alloc_vec_a;
       } else {
@@ -129,14 +126,12 @@ namespace vec_ops {
     if (!config.is_b_on_device) {
       if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
-        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl;
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-        CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
+        CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n, config.ctx.stream, d_alloc_vec_b));
         is_d_alloc_vec_b_allocated = 1;
         d_vec_b = d_alloc_vec_b;
       } else {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
-        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl;
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         is_d_alloc_vec_b_allocated = 1;
         d_vec_b = d_alloc_vec_b;
@@ -145,8 +140,7 @@ namespace vec_ops {
       if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(
           &d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
-        std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl;
-        CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b));
+        CHK_IF_RETURN(mont::from_montgomery(vec_b, n, config.ctx.stream, d_alloc_vec_b));
         is_d_alloc_vec_b_allocated = 1;
         d_vec_b = d_alloc_vec_b;
       } else {
@@ -158,7 +152,6 @@ namespace vec_ops {
     if (!config.is_result_on_device) {
       if (!is_in_place) {
         CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
-        std::cout << "vec_ops cudaMallocAsync d_result" << std::endl;
         is_d_result_allocated = 1;
       } else {
         d_result = d_vec_a;
@@ -177,28 +170,25 @@ namespace vec_ops {
 
     if (!config.is_result_on_device) {
       if (config.is_result_in_montgomery_form) {
-        CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place.
+        CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place.
         CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
       } else {
         CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
       }
     } else {
       if (config.is_result_in_montgomery_form) {     
-        CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place.
+        CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place.
       }
     }
 
     if (is_d_alloc_vec_a_allocated) {
       CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream));
-      std::cout << "vec_ops cudaFreeAsync d_alloc_vec_a" << std::endl;
     }
     if (is_d_alloc_vec_b_allocated) {
       CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream));
-      std::cout << "vec_ops cudaFreeAsync d_alloc_vec_b" << std::endl;
     }
     if (is_d_result_allocated) {
       CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
-      std::cout << "vec_ops cudaFreeAsync d_result" << std::endl;
     }
 
     if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));

From b59af917ee1df22ac77b08b6dd1509e9074e2f60 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 13 Aug 2024 09:43:56 +0000
Subject: [PATCH 12/21] Run clang-format

---
 examples/c++/mont_vec_ops/example.cu | 240 +++++++++++++++++----------
 icicle/src/vec_ops/vec_ops.cu        |  14 +-
 2 files changed, 160 insertions(+), 94 deletions(-)

diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index f277f2ebc..46bad5607 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -46,9 +46,9 @@ int vector_op(
 
 int main(int argc, char** argv)
 {
-  const unsigned vector_size              = 1 << 15;
-  const unsigned not_in_place_repetitions = 1 << 15;    // Repetitions are used only for the non in-place tests.
-  const unsigned in_place_repetitions     = 1;    // Repetitions for in-place tests should be 1. Don't check it.
+  const unsigned vector_size = 1 << 15;
+  const unsigned not_in_place_repetitions = 1 << 15; // Repetitions are used only for the non in-place tests.
+  const unsigned in_place_repetitions = 1;           // Repetitions for in-place tests should be 1. Don't check it.
 
   cudaError_t err;
   nvmlInit();
@@ -67,7 +67,8 @@ int main(int argc, char** argv)
   std::cout << "Vector size:              " << vector_size << std::endl;
   std::cout << "not_in_place_repetitions: " << not_in_place_repetitions << std::endl;
   std::cout << "in_place_repetitions:     " << in_place_repetitions << std::endl;
-  std::cout << "Power limit:              " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl;
+  std::cout << "Power limit:              " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W"
+            << std::endl;
 
   unsigned int baseline_power;
   nvmlDeviceGetPowerUsage(device, &baseline_power);
@@ -126,7 +127,7 @@ int main(int argc, char** argv)
 
   vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig();
   int nof_of_configs_for_test = 5;
-  int nof_of_storage_configs = 3;   // 2 inputs, 1 result.
+  int nof_of_storage_configs = 3; // 2 inputs, 1 result.
 
   //****************************************
   // Test warn-up and reference output config. Reference output to be used to check if test passed or not.
@@ -145,22 +146,23 @@ int main(int argc, char** argv)
   std::cout << "Starting warm-up run" << std::endl;
   // Warm-up loop
   // for (int i = 0; i < not_in_place_repetitions; i++) {
-  for (int i = 0; i < 100; i++) {   // Nof loops set to 100 because warm-up takes too much time because inputs and outputs are on located on Host.
+  for (int i = 0; i < 100; i++) { // Nof loops set to 100 because warm-up takes too much time because inputs and outputs
+                                  // are on located on Host.
     vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, MUL);
   }
   // Generate ref results for all ops
   for (int op = MUL; op != LAST; op++) {
     vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
     switch (op) {
-      case MUL:
-        err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-        break;
-      case ADD:
-        err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-        break;
-      case SUB:
-        err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
-        break;
+    case MUL:
+      err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+      break;
+    case ADD:
+      err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+      break;
+    case SUB:
+      err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost);
+      break;
     }
     if (err != cudaSuccess) {
       std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
@@ -199,14 +201,20 @@ int main(int argc, char** argv)
   std::cout << "*****************************************" << std::endl;
   for (int op = MUL; op != LAST; op++) {
     for (int config_idx = 28; config_idx < 29; config_idx++) {
-    // for (int config_idx = 0; config_idx < 32; config_idx++) {
+      // for (int config_idx = 0; config_idx < 32; config_idx++) {
       switch (op) {
-        case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; break;
-        case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " not in-place" << std::endl; break;
-        case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " not in-place" << std::endl; break;
+      case MUL:
+        std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl;
+        break;
+      case ADD:
+        std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " not in-place" << std::endl;
+        break;
+      case SUB:
+        std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " not in-place" << std::endl;
+        break;
       }
       // Destroy the result of the prev loop.
-      T::rand_host_many(host_out, vector_size);   // Randomize host_out in order to randomize device_out.
+      T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out.
       err = cudaMemcpy(
         device_out, host_out, vector_size * sizeof(T),
         cudaMemcpyHostToDevice); // Copy random data to device_out.
@@ -214,16 +222,16 @@ int main(int argc, char** argv)
         std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
         return 0;
       }
-      T::rand_host_many(host_out, vector_size);   // Make hist_out != device_out. 
+      T::rand_host_many(host_out, vector_size); // Make hist_out != device_out.
       // Initializa inputs with the known data
       for (int i = 0; i < vector_size; i++) {
         host_in1[i] = host_in1_init[i];
         host_in2[i] = host_in2_init[i];
       }
-      config.is_a_on_device               = (config_idx >> 4) & 0x1;
-      config.is_b_on_device               = (config_idx >> 3) & 0x1;
-      config.is_result_on_device          = (config_idx >> 2) & 0x1;
-      config.is_input_in_montgomery_form  = (config_idx >> 1) & 0x1;
+      config.is_a_on_device = (config_idx >> 4) & 0x1;
+      config.is_b_on_device = (config_idx >> 3) & 0x1;
+      config.is_result_on_device = (config_idx >> 2) & 0x1;
+      config.is_input_in_montgomery_form = (config_idx >> 1) & 0x1;
       config.is_result_in_montgomery_form = (config_idx >> 0) & 0x1;
 
       // Copy from host to device (copy again in order to be used later in the loop and device_inX was already
@@ -242,7 +250,7 @@ int main(int argc, char** argv)
           for (int i = 0; i < vector_size; i++) {
             host_in1[i] = host_in2_init[i];
           }
-        } else {                                                                          // Normal presentation.
+        } else { // Normal presentation.
           err =
             cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -322,15 +330,48 @@ int main(int argc, char** argv)
       CHK_IF_RETURN(cudaPeekAtLastError());
 
       auto start_time = std::chrono::high_resolution_clock::now();
-      switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
-        case 0b000: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); } break;
-        case 0b001: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); } break;
-        case 0b010: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);  } break;
-        case 0b011: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } break;
-        case 0b100: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); } break;
-        case 0b101: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); } break;
-        case 0b110: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); } break;  
-        case 0b111: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } break;
+      switch (config_idx >> (nof_of_configs_for_test -
+                             nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
+      case 0b000:
+        for (int i = 0; i < not_in_place_repetitions; i++) {
+          vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
+        }
+        break;
+      case 0b001:
+        for (int i = 0; i < not_in_place_repetitions; i++) {
+          vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
+        }
+        break;
+      case 0b010:
+        for (int i = 0; i < not_in_place_repetitions; i++) {
+          vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
+        }
+        break;
+      case 0b011:
+        for (int i = 0; i < not_in_place_repetitions; i++) {
+          vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
+        }
+        break;
+      case 0b100:
+        for (int i = 0; i < not_in_place_repetitions; i++) {
+          vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op);
+        }
+        break;
+      case 0b101:
+        for (int i = 0; i < not_in_place_repetitions; i++) {
+          vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op);
+        }
+        break;
+      case 0b110:
+        for (int i = 0; i < not_in_place_repetitions; i++) {
+          vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op);
+        }
+        break;
+      case 0b111:
+        for (int i = 0; i < not_in_place_repetitions; i++) {
+          vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op);
+        }
+        break;
       }
       auto end_time = std::chrono::high_resolution_clock::now();
       auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
@@ -362,7 +403,7 @@ int main(int argc, char** argv)
           std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl;
           return 0;
         }
-      } else {                              // Data is not on device but it is in host_out.
+      } else {                                     // Data is not on device but it is in host_out.
         if (config.is_result_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and
                                                    // written back to host. Then compared vs. host_out_ref_XXX.
           err = cudaMemcpy(
@@ -382,7 +423,7 @@ int main(int argc, char** argv)
             std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl;
             return 0;
           }
-        } else { // host_out could be compared vs. host_out_ref_XXX as is.   
+        } else { // host_out could be compared vs. host_out_ref_XXX as is.
         }
       }
       //****************************************
@@ -466,12 +507,18 @@ int main(int argc, char** argv)
   for (int op = MUL; op != LAST; op++) {
     for (int config_idx = 0; config_idx < 32; config_idx++) {
       switch (op) {
-        case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " in-place" << std::endl; break;
-        case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " in-place" << std::endl; break;
-        case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " in-place" << std::endl; break;
+      case MUL:
+        std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " in-place" << std::endl;
+        break;
+      case ADD:
+        std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " in-place" << std::endl;
+        break;
+      case SUB:
+        std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " in-place" << std::endl;
+        break;
       }
       // Destroy the result of the prev loop.
-      T::rand_host_many(host_out, vector_size);   // Randomize host_out in order to randomize device_out.
+      T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out.
       err = cudaMemcpy(
         device_out, host_out, vector_size * sizeof(T),
         cudaMemcpyHostToDevice); // Copy random data to device_out.
@@ -479,19 +526,22 @@ int main(int argc, char** argv)
         std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl;
         return 0;
       }
-      T::rand_host_many(host_out, vector_size);   // Make hist_out != device_out. 
-      // Initializa inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it should be initialized later in the repetions loop.
+      T::rand_host_many(host_out, vector_size); // Make hist_out != device_out.
+      // Initializa inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it
+      // should be initialized later in the repetions loop.
       for (int i = 0; i < vector_size; i++) {
         host_in1[i] = host_in1_init[i];
         host_in2[i] = host_in2_init[i];
       }
-      config.is_a_on_device               = (config_idx >> 4) & 0x1;
-      config.is_b_on_device               = (config_idx >> 3) & 0x1;
-      config.is_result_on_device          = (config_idx >> 2) & 0x1;
-      config.is_input_in_montgomery_form  = (config_idx >> 1) & 0x1;
+      config.is_a_on_device = (config_idx >> 4) & 0x1;
+      config.is_b_on_device = (config_idx >> 3) & 0x1;
+      config.is_result_on_device = (config_idx >> 2) & 0x1;
+      config.is_input_in_montgomery_form = (config_idx >> 1) & 0x1;
       config.is_result_in_montgomery_form = (config_idx >> 1) & 0x1;
-      if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; }    // Illegal case for this loop.
-      if (config.is_input_in_montgomery_form ^ config.is_result_in_montgomery_form == 1) { continue; }    // Illegal case for this loop.
+      if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; } // Illegal case for this loop.
+      if (config.is_input_in_montgomery_form ^ config.is_result_in_montgomery_form == 1) {
+        continue;
+      } // Illegal case for this loop.
 
       // Copy from host to device (copy again in order to be used later in the loop and device_inX was already
       // overwritten by warmup.
@@ -509,7 +559,7 @@ int main(int argc, char** argv)
           for (int i = 0; i < vector_size; i++) {
             host_in1[i] = host_in2_init[i];
           }
-        } else {                                                                          // Normal presentation.
+        } else { // Normal presentation.
           err =
             cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device.
           if (err != cudaSuccess) {
@@ -591,15 +641,36 @@ int main(int argc, char** argv)
       auto start_time = std::chrono::high_resolution_clock::now();
       // Benchmark loop
       for (int i = 0; i < in_place_repetitions; i++) {
-        switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
-          case 0b000: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); } break;
-          case 0b001: break;
-          case 0b010: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); } break;
-          case 0b011: break;
-          case 0b100: break;
-          case 0b101: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); } break;
-          case 0b110: break;
-          case 0b111: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); } break;
+        switch (config_idx >> (nof_of_configs_for_test -
+                               nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
+        case 0b000:
+          for (int i = 0; i < not_in_place_repetitions; i++) {
+            vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op);
+          }
+          break;
+        case 0b001:
+          break;
+        case 0b010:
+          for (int i = 0; i < not_in_place_repetitions; i++) {
+            vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op);
+          }
+          break;
+        case 0b011:
+          break;
+        case 0b100:
+          break;
+        case 0b101:
+          for (int i = 0; i < not_in_place_repetitions; i++) {
+            vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);
+          }
+          break;
+        case 0b110:
+          break;
+        case 0b111:
+          for (int i = 0; i < not_in_place_repetitions; i++) {
+            vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op);
+          }
+          break;
         }
         CHK_IF_RETURN(cudaPeekAtLastError());
       }
@@ -654,7 +725,8 @@ int main(int argc, char** argv)
             return 0;
           }
         } else { // host_out could be compared vs. host_out_ref_XXX as is.
-          for (int i = 0; i < vector_size; i++) {   // Copy from host_in1 (result) to host_out to compare later vs. host_out_ref_XXX.
+          for (int i = 0; i < vector_size;
+               i++) { // Copy from host_in1 (result) to host_out to compare later vs. host_out_ref_XXX.
             host_out[i] = host_in1[i];
           }
         }
@@ -671,33 +743,33 @@ int main(int argc, char** argv)
       //***********************************************
       int test_failed = 0;
       switch (op) {
-        case MUL:
-          for (int i = 0; i < vector_size; i++) {
-            if (host_out_ref_mul[i] != host_out[i]) {
-              std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
-                        << ", config is printed below:" << std::endl;
-              test_failed = 1;
-            }
+      case MUL:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_mul[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            test_failed = 1;
           }
-          break;
-        case ADD:
-          for (int i = 0; i < vector_size; i++) {
-            if (host_out_ref_add[i] != host_out[i]) {
-              std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
-                        << ", config is printed below:" << std::endl;
-              test_failed = 1;
-            }
+        }
+        break;
+      case ADD:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_add[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            test_failed = 1;
           }
-          break;
-        case SUB:
-          for (int i = 0; i < vector_size; i++) {
-            if (host_out_ref_sub[i] != host_out[i]) {
-              std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
-                        << ", config is printed below:" << std::endl;
-              test_failed = 1;
-            }
+        }
+        break;
+      case SUB:
+        for (int i = 0; i < vector_size; i++) {
+          if (host_out_ref_sub[i] != host_out[i]) {
+            std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
+                      << ", config is printed below:" << std::endl;
+            test_failed = 1;
           }
-          break;
+        }
+        break;
       }
       if (test_failed) {
         std::cout << "===>>> result is in-place:                " << std::endl;
diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index baac53b43..c95e343e6 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -176,20 +176,14 @@ namespace vec_ops {
         CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream));
       }
     } else {
-      if (config.is_result_in_montgomery_form) {     
+      if (config.is_result_in_montgomery_form) {
         CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place.
       }
     }
 
-    if (is_d_alloc_vec_a_allocated) {
-      CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream));
-    }
-    if (is_d_alloc_vec_b_allocated) {
-      CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream));
-    }
-    if (is_d_result_allocated) {
-      CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream));
-    }
+    if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); }
+    if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); }
+    if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); }
 
     if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream));
 

From 028f512d7d1bf730212b42e3a159f1bbf3318b1d Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 13 Aug 2024 10:00:21 +0000
Subject: [PATCH 13/21] Fix typo.

---
 examples/c++/mont_vec_ops/example.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index 46bad5607..d668c3e37 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -200,8 +200,8 @@ int main(int argc, char** argv)
   std::cout << "*** Start not in-place benchmark loop ***" << std::endl;
   std::cout << "*****************************************" << std::endl;
   for (int op = MUL; op != LAST; op++) {
-    for (int config_idx = 28; config_idx < 29; config_idx++) {
-      // for (int config_idx = 0; config_idx < 32; config_idx++) {
+      // for (int config_idx = 28; config_idx < 29; config_idx++) {
+      for (int config_idx = 0; config_idx < 32; config_idx++) {
       switch (op) {
       case MUL:
         std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl;

From e76053af4db9799a5b57bf8a874d00844c9d671f Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 13 Aug 2024 10:14:45 +0000
Subject: [PATCH 14/21] Added space in order o rerun the icicle testing

---
 examples/c++/mont_vec_ops/example.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index d668c3e37..6befbec06 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -201,7 +201,7 @@ int main(int argc, char** argv)
   std::cout << "*****************************************" << std::endl;
   for (int op = MUL; op != LAST; op++) {
       // for (int config_idx = 28; config_idx < 29; config_idx++) {
-      for (int config_idx = 0; config_idx < 32; config_idx++) {
+      for (int config_idx = 0; config_idx < 32; config_idx++) { 
       switch (op) {
       case MUL:
         std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl;

From afd3e54d545f632d1ef0ee517a2998a52ea84448 Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 13 Aug 2024 10:42:57 +0000
Subject: [PATCH 15/21] Fix spelling

---
 examples/c++/mont_vec_ops/example.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index 6befbec06..e2ef3a42a 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -223,7 +223,7 @@ int main(int argc, char** argv)
         return 0;
       }
       T::rand_host_many(host_out, vector_size); // Make hist_out != device_out.
-      // Initializa inputs with the known data
+      // Initialize inputs with the known data
       for (int i = 0; i < vector_size; i++) {
         host_in1[i] = host_in1_init[i];
         host_in2[i] = host_in2_init[i];
@@ -527,8 +527,8 @@ int main(int argc, char** argv)
         return 0;
       }
       T::rand_host_many(host_out, vector_size); // Make hist_out != device_out.
-      // Initializa inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it
-      // should be initialized later in the repetions loop.
+      // Initialize inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it
+      // should be initialized later in the repetitions loop.
       for (int i = 0; i < vector_size; i++) {
         host_in1[i] = host_in1_init[i];
         host_in2[i] = host_in2_init[i];

From c4ebc269375648658d8f9e5a11821a6063c4d2df Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 13 Aug 2024 10:57:34 +0000
Subject: [PATCH 16/21] To start github action.

---
 examples/c++/mont_vec_ops/example.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index e2ef3a42a..4795325d3 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -14,7 +14,7 @@ typedef scalar_t T;
 
 enum Op { MUL, ADD, SUB, LAST };
 
-// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617
+// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 
 
 int vector_op(
   T* vec_a,

From 85e1cc75a1b010307098f2b17e1838c94b6c4e8b Mon Sep 17 00:00:00 2001
From: danny-shterman <danny.shterman@gmail.com>
Date: Tue, 13 Aug 2024 11:00:22 +0000
Subject: [PATCH 17/21] clang-format

---
 icicle/include/vec_ops/vec_ops.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh
index 4de406b34..144bf08bf 100644
--- a/icicle/include/vec_ops/vec_ops.cuh
+++ b/icicle/include/vec_ops/vec_ops.cuh
@@ -27,10 +27,10 @@ namespace vec_ops {
                     *   non-blocking and you'd need to synchronize it explicitly by running
                     *   `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the
                     *   function will block the current CPU thread. */
-    bool is_input_in_montgomery_form; /**< If true then vec_a and vec_b are in montgomery form.
-                                 * Default value: false.  */
+    bool is_input_in_montgomery_form;  /**< If true then vec_a and vec_b are in montgomery form.
+                                        * Default value: false.  */
     bool is_result_in_montgomery_form; /**< If true then result is in montgomery form.
-                                 * Default value: false.  */
+                                        * Default value: false.  */
   };
 
   /**

From 6236b849da45ddf040361117b0eba2a9e0c2765b Mon Sep 17 00:00:00 2001
From: LeonHibnik <leon@ingonyama.com>
Date: Thu, 15 Aug 2024 13:12:59 +0300
Subject: [PATCH 18/21] fix configs

---
 wrappers/golang/core/vec_ops.go              | 9 ++++++---
 wrappers/golang/core/vec_ops_test.go         | 3 ++-
 wrappers/rust/icicle-core/src/vec_ops/mod.rs | 6 ++++--
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go
index 8bef10b63..066936628 100644
--- a/wrappers/golang/core/vec_ops.go
+++ b/wrappers/golang/core/vec_ops.go
@@ -28,8 +28,10 @@ type VecOpsConfig struct {
 	*  non-blocking and you'll need to synchronize it explicitly by calling
 	*  `SynchronizeStream`. If set to false, the function will block the current CPU thread. */
 	IsAsync bool
-	/* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */
-	IsInMontgomeryForm bool
+	/* If true then vec_a, vec_b are in montgomery form. Default value: false. */
+	IsInputInMontgomeryForm bool
+	/* If true then result is in montgomery form. Default value: false. */
+	IsResultInMontgomeryForm bool
 }
 
 /**
@@ -44,7 +46,8 @@ func DefaultVecOpsConfig() VecOpsConfig {
 		false, // isBOnDevice
 		false, // isResultOnDevice
 		false, // IsAsync
-		false, // IsInMontgomeryForm
+		false, // IsInputInMontgomeryForm
+		false, // IsResultInMontgomeryForm
 	}
 
 	return config
diff --git a/wrappers/golang/core/vec_ops_test.go b/wrappers/golang/core/vec_ops_test.go
index 9ac5bb480..0552b5562 100644
--- a/wrappers/golang/core/vec_ops_test.go
+++ b/wrappers/golang/core/vec_ops_test.go
@@ -15,7 +15,8 @@ func TestVecOpsDefaultConfig(t *testing.T) {
 		false, // isBOnDevice
 		false, // isResultOnDevice
 		false, // IsAsync
-		false, // IsInMontgomeryForm
+		false, // IsInputInMontgomeryForm
+		false, // IsResultInMontgomeryForm
 	}
 
 	actual := DefaultVecOpsConfig()
diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index efb2cab53..7e2fcbd60 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -21,7 +21,8 @@ pub struct VecOpsConfig<'a> {
     /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread.
     pub is_async: bool,
     /// If true then vec_a, vec_b and result are in montgomery form. Default value: false.
-    pub is_in_montgomery_form: bool,
+    pub is_input_in_montgomery_form: bool,
+    pub is_result_in_montgomery_form: bool
 }
 
 impl<'a> Default for VecOpsConfig<'a> {
@@ -38,7 +39,8 @@ impl<'a> VecOpsConfig<'a> {
             is_b_on_device: false,
             is_result_on_device: false,
             is_async: false,
-            is_in_montgomery_form: false,
+            is_input_in_montgomery_form: false,
+            is_result_in_montgomery_form: false
         }
     }
 }

From d17a05c7e4aa6a2095d3fda9a8066b41b6f392a8 Mon Sep 17 00:00:00 2001
From: LeonHibnik <leon@ingonyama.com>
Date: Thu, 15 Aug 2024 13:16:55 +0300
Subject: [PATCH 19/21] fmt

---
 wrappers/rust/icicle-core/src/vec_ops/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
index 7e2fcbd60..8581ad0c5 100644
--- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs
+++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs
@@ -22,7 +22,7 @@ pub struct VecOpsConfig<'a> {
     pub is_async: bool,
     /// If true then vec_a, vec_b and result are in montgomery form. Default value: false.
     pub is_input_in_montgomery_form: bool,
-    pub is_result_in_montgomery_form: bool
+    pub is_result_in_montgomery_form: bool,
 }
 
 impl<'a> Default for VecOpsConfig<'a> {
@@ -40,7 +40,7 @@ impl<'a> VecOpsConfig<'a> {
             is_result_on_device: false,
             is_async: false,
             is_input_in_montgomery_form: false,
-            is_result_in_montgomery_form: false
+            is_result_in_montgomery_form: false,
         }
     }
 }

From 21db8c06f6765316b29aa1cf00c688533ead0625 Mon Sep 17 00:00:00 2001
From: LeonHibnik <leon@ingonyama.com>
Date: Thu, 15 Aug 2024 19:34:04 +0300
Subject: [PATCH 20/21] fix example

---
 examples/c++/mont_vec_ops/example.cu | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu
index 4795325d3..68bbbcbd1 100644
--- a/examples/c++/mont_vec_ops/example.cu
+++ b/examples/c++/mont_vec_ops/example.cu
@@ -14,8 +14,6 @@ typedef scalar_t T;
 
 enum Op { MUL, ADD, SUB, LAST };
 
-// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 
-
 int vector_op(
   T* vec_a,
   T* vec_b,
@@ -47,7 +45,7 @@ int vector_op(
 int main(int argc, char** argv)
 {
   const unsigned vector_size = 1 << 15;
-  const unsigned not_in_place_repetitions = 1 << 15; // Repetitions are used only for the non in-place tests.
+  const unsigned not_in_place_repetitions = 1 << 10; // Repetitions are used only for the non in-place tests.
   const unsigned in_place_repetitions = 1;           // Repetitions for in-place tests should be 1. Don't check it.
 
   cudaError_t err;
@@ -200,8 +198,8 @@ int main(int argc, char** argv)
   std::cout << "*** Start not in-place benchmark loop ***" << std::endl;
   std::cout << "*****************************************" << std::endl;
   for (int op = MUL; op != LAST; op++) {
-      // for (int config_idx = 28; config_idx < 29; config_idx++) {
-      for (int config_idx = 0; config_idx < 32; config_idx++) { 
+    // for (int config_idx = 28; config_idx < 29; config_idx++) {
+    for (int config_idx = 0; config_idx < 32; config_idx++) {
       switch (op) {
       case MUL:
         std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl;
@@ -644,32 +642,24 @@ int main(int argc, char** argv)
         switch (config_idx >> (nof_of_configs_for_test -
                                nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device}
         case 0b000:
-          for (int i = 0; i < not_in_place_repetitions; i++) {
-            vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op);
-          }
+          vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op);
           break;
         case 0b001:
           break;
         case 0b010:
-          for (int i = 0; i < not_in_place_repetitions; i++) {
-            vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op);
-          }
+          vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op);
           break;
         case 0b011:
           break;
         case 0b100:
           break;
         case 0b101:
-          for (int i = 0; i < not_in_place_repetitions; i++) {
-            vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);
-          }
+          vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);
           break;
         case 0b110:
           break;
         case 0b111:
-          for (int i = 0; i < not_in_place_repetitions; i++) {
-            vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op);
-          }
+          vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op);
           break;
         }
         CHK_IF_RETURN(cudaPeekAtLastError());
@@ -749,6 +739,7 @@ int main(int argc, char** argv)
             std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i
                       << ", config is printed below:" << std::endl;
             test_failed = 1;
+            break;
           }
         }
         break;
@@ -758,6 +749,7 @@ int main(int argc, char** argv)
             std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i
                       << ", config is printed below:" << std::endl;
             test_failed = 1;
+            break;
           }
         }
         break;
@@ -767,6 +759,7 @@ int main(int argc, char** argv)
             std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i
                       << ", config is printed below:" << std::endl;
             test_failed = 1;
+            break;
           }
         }
         break;

From a7974b614de7b7c4e0092f1a3d40111ca08774e4 Mon Sep 17 00:00:00 2001
From: LeonHibnik <leon@ingonyama.com>
Date: Mon, 19 Aug 2024 10:51:02 +0300
Subject: [PATCH 21/21] change int to booleans

---
 icicle/src/vec_ops/vec_ops.cu | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu
index c95e343e6..d67f8d6af 100644
--- a/icicle/src/vec_ops/vec_ops.cu
+++ b/icicle/src/vec_ops/vec_ops.cu
@@ -96,18 +96,18 @@ namespace vec_ops {
     E* d_vec_a;
     const E* d_vec_b;
 
-    int is_d_alloc_vec_a_allocated = 0;
+    bool is_d_alloc_vec_a_allocated = false;
     if (!config.is_a_on_device) {
       if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n, config.ctx.stream, d_alloc_vec_a));
-        is_d_alloc_vec_a_allocated = 1;
+        is_d_alloc_vec_a_allocated = true;
         d_vec_a = d_alloc_vec_a;
       } else {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream));
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-        is_d_alloc_vec_a_allocated = 1;
+        is_d_alloc_vec_a_allocated = true;
         d_vec_a = d_alloc_vec_a;
       }
     } else {
@@ -115,25 +115,25 @@ namespace vec_ops {
         CHK_IF_RETURN(cudaMallocAsync(
           &d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
         CHK_IF_RETURN(mont::from_montgomery(vec_a, n, config.ctx.stream, d_alloc_vec_a));
-        is_d_alloc_vec_a_allocated = 1;
+        is_d_alloc_vec_a_allocated = true;
         d_vec_a = d_alloc_vec_a;
       } else {
         d_vec_a = vec_a;
       }
     }
 
-    int is_d_alloc_vec_b_allocated = 0;
+    bool is_d_alloc_vec_b_allocated = false;
     if (!config.is_b_on_device) {
       if (config.is_input_in_montgomery_form) {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
         CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n, config.ctx.stream, d_alloc_vec_b));
-        is_d_alloc_vec_b_allocated = 1;
+        is_d_alloc_vec_b_allocated = true;
         d_vec_b = d_alloc_vec_b;
       } else {
         CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream));
         CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream));
-        is_d_alloc_vec_b_allocated = 1;
+        is_d_alloc_vec_b_allocated = true;
         d_vec_b = d_alloc_vec_b;
       }
     } else {
@@ -141,28 +141,23 @@ namespace vec_ops {
         CHK_IF_RETURN(cudaMallocAsync(
           &d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input.
         CHK_IF_RETURN(mont::from_montgomery(vec_b, n, config.ctx.stream, d_alloc_vec_b));
-        is_d_alloc_vec_b_allocated = 1;
+        is_d_alloc_vec_b_allocated = true;
         d_vec_b = d_alloc_vec_b;
       } else {
         d_vec_b = vec_b;
       }
     }
 
-    int is_d_result_allocated = 0;
+    bool is_d_result_allocated = false;
     if (!config.is_result_on_device) {
       if (!is_in_place) {
         CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream));
-        is_d_result_allocated = 1;
+        is_d_result_allocated = true;
       } else {
         d_result = d_vec_a;
       }
     } else {
-      if (!is_in_place) {
-        d_result = result;
-      } else {
-        // d_result = result = d_vec_a;    // DEBUG - looks like a bug for in-place.
-        d_result = result;
-      }
+      d_result = result;
     }
 
     // Call the kernel to perform element-wise operation