From 7e2b42f756dcb3f85d38e36a1447d1bfe92759b8 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Mon, 29 Jul 2024 06:39:49 +0000 Subject: [PATCH 01/21] Add montgomery to vec_ops and example of that --- examples/c++/mont_vec_ops/CMakeLists.txt | 25 + examples/c++/mont_vec_ops/README.md | 42 ++ examples/c++/mont_vec_ops/compile.debug.sh | 15 + examples/c++/mont_vec_ops/compile.sh | 15 + examples/c++/mont_vec_ops/example.cu | 607 +++++++++++++++++++++ examples/c++/mont_vec_ops/run.sh | 2 + icicle/include/vec_ops/vec_ops.cuh | 2 + icicle/src/vec_ops/vec_ops.cu | 70 ++- 8 files changed, 766 insertions(+), 12 deletions(-) create mode 100644 examples/c++/mont_vec_ops/CMakeLists.txt create mode 100644 examples/c++/mont_vec_ops/README.md create mode 100755 examples/c++/mont_vec_ops/compile.debug.sh create mode 100755 examples/c++/mont_vec_ops/compile.sh create mode 100644 examples/c++/mont_vec_ops/example.cu create mode 100755 examples/c++/mont_vec_ops/run.sh diff --git a/examples/c++/mont_vec_ops/CMakeLists.txt b/examples/c++/mont_vec_ops/CMakeLists.txt new file mode 100644 index 000000000..f7048bb8a --- /dev/null +++ b/examples/c++/mont_vec_ops/CMakeLists.txt @@ -0,0 +1,25 @@ +cmake_minimum_required(VERSION 3.18) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +if (${CMAKE_VERSION} VERSION_LESS "3.24.0") + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) +else() + set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed +endif () +project(example LANGUAGES CUDA CXX) + +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +set(CMAKE_CUDA_FLAGS_RELEASE "") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") +add_executable( + example + example.cu +) +target_include_directories(example PRIVATE "../../../icicle/include") +target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a) +find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ ) +target_link_libraries(example ${NVML_LIBRARY}) +set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + diff --git a/examples/c++/mont_vec_ops/README.md b/examples/c++/mont_vec_ops/README.md new file mode 100644 index 000000000..c99466f6d --- /dev/null +++ b/examples/c++/mont_vec_ops/README.md @@ -0,0 +1,42 @@ +# Icicle example: Montgomery vector operations (mul, add, sub) for allpossible options: +is_a_on_device +is_b_on_device +is_result_on_device +is_in_montgomery_form +(is_async isn't checked) + +## Best-Practices + +We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy. + +## Key-Takeaway + +`Icicle` accelerates multiplication operation `*` using [Karatsuba algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm) + +## Concise Usage Explanation + +Define field to be used, e. g.: + +```c++ +#include "api/bn254.h" +``` + +```c++ +using namespace bn254; +typedef scalar_t T; +``` + +## Running the example + +- `cd` to your example directory +- compile with `./compile.sh` +- run with `./run.sh` + +## What's in the example + +1. Define the parameters for the example such as vector size +2. Generate random vectors on-host +3. Copy them on-device +4. Execute element-wise vector multiplication on-device +5. Copy results on-host + diff --git a/examples/c++/mont_vec_ops/compile.debug.sh b/examples/c++/mont_vec_ops/compile.debug.sh new file mode 100755 index 000000000..cebaa9451 --- /dev/null +++ b/examples/c++/mont_vec_ops/compile.debug.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Exit immediately on error +set -e + +mkdir -p build/example +mkdir -p build/icicle + +# Configure and build Icicle +cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254 +cmake --build build/icicle -j + +# Configure and build the example application +cmake -DCMAKE_BUILD_TYPE=Debug -S . -B build/example +cmake --build build/example diff --git a/examples/c++/mont_vec_ops/compile.sh b/examples/c++/mont_vec_ops/compile.sh new file mode 100755 index 000000000..df73232a4 --- /dev/null +++ b/examples/c++/mont_vec_ops/compile.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Exit immediately on error +set -e + +mkdir -p build/example +mkdir -p build/icicle + +# Configure and build Icicle +cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 +cmake --build build/icicle -j + +# Configure and build the example application +cmake -S . -B build/example +cmake --build build/example diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu new file mode 100644 index 000000000..453d3f12f --- /dev/null +++ b/examples/c++/mont_vec_ops/example.cu @@ -0,0 +1,607 @@ +#include +#include +#include +#include + +#include "api/bn254.h" +#include "vec_ops/vec_ops.cuh" +#include + +using namespace vec_ops; +using namespace bn254; + +typedef scalar_t T; + +enum Op { + MUL, + ADD, + SUB, + LAST +}; + +// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 + +int vector_op(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config, Op op) +{ + cudaError_t err; + switch (op) { + case MUL: err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); break; + case ADD: err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); break; + case SUB: err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); break; + } + // cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); + if (err != cudaSuccess) { + std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; + return 0; + } + return 0; +} +int vector_mul(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) +{ + cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); + if (err != cudaSuccess) { + std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; + return 0; + } + return 0; +} +int vector_add(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) +{ + cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); + if (err != cudaSuccess) { + std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; + return 0; + } + return 0; +} +int vector_sub(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) +{ + cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); + if (err != cudaSuccess) { + std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; + return 0; + } + return 0; +} + +int main(int argc, char** argv) +{ + const unsigned vector_size = 1 << 0; + const unsigned repetitions = 1 << 0; + + cudaError_t err; + nvmlInit(); + nvmlDevice_t device; + nvmlDeviceGetHandleByIndex(0, &device); // for GPU 0 + std::cout << "Icicle-Examples: vector mul / add / sub operations." << std::endl; + char name[NVML_DEVICE_NAME_BUFFER_SIZE]; + if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS) { + std::cout << "GPU Model: " << name << std::endl; + } else { + std::cerr << "Failed to get GPU model name." << std::endl; + } + unsigned power_limit; + nvmlDeviceGetPowerManagementLimit(device, &power_limit); + + std::cout << "Vector size: " << vector_size << std::endl; + std::cout << "Repetitions: " << repetitions << std::endl; + std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl; + + unsigned int baseline_power; + nvmlDeviceGetPowerUsage(device, &baseline_power); + std::cout << "Baseline power: " << std::fixed << std::setprecision(3) << 1.0e-3 * baseline_power << " W" << std::endl; + unsigned baseline_temperature; + if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &baseline_temperature) == NVML_SUCCESS) { + std::cout << "Baseline GPU Temperature: " << baseline_temperature << " C" << std::endl; + } else { + std::cerr << "Failed to get GPU temperature." << std::endl; + } + + // host data + std::cout << "Allocate memory for the input vectors (both normal and Montgomery presentation)" << std::endl; + T* host_in1_init = (T*)malloc(vector_size * sizeof(T)); + T* host_in2_init = (T*)malloc(vector_size * sizeof(T)); + std::cout << "Initializing vectors with normal presentation random data" << std::endl; + T::rand_host_many(host_in1_init, vector_size); + T::rand_host_many(host_in2_init, vector_size); + std::cout << "Allocate memory for the output vectors" << std::endl; + T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output. + T* host_out_ref_mul = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content). + T* host_out_ref_add = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content). + T* host_out_ref_sub = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content). + std::cout << "Initializing output vectors with random data" << std::endl; + T::rand_host_many(host_out, vector_size); + T::rand_host_many(host_out_ref_mul, vector_size); + T::rand_host_many(host_out_ref_add, vector_size); + T::rand_host_many(host_out_ref_sub, vector_size); + // device data + device_context::DeviceContext ctx = device_context::get_default_device_context(); + T* device_in1; + T* device_in2; + T* device_out; + + err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T)); + if (err != cudaSuccess) { + std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; + return 0; + } + err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T)); + if (err != cudaSuccess) { + std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; + return 0; + } + err = cudaMalloc((void**)&device_out, vector_size * sizeof(T)); + if (err != cudaSuccess) { + std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; + return 0; + } + + vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig(); + + //**************************************** + // Test warn-up and reference output config. Reference output to be used to check if test passed or not. + //**************************************** + // copy from host to device + err = cudaMemcpy(device_in1, host_in1_init, vector_size * sizeof(T), cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl; + return 0; + } + err = cudaMemcpy(device_in2, host_in2_init, vector_size * sizeof(T), cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host to device - " << cudaGetErrorString(err) << std::endl; + return 0; + } + std::cout << "Starting warm-up run" << std::endl; + // Warm-up loop + for ( int op = MUL; op != LAST; op++ ) { + for (int i = 0; i < repetitions; i++) { + // vector_mul(device_in1, device_in2, device_out, vector_size, ctx, config); + vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); + } + switch (op) { + case MUL: err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break; + case ADD: err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break; + case SUB: err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break; + } + } + // copy the result from device to host_out_ref_mul to keep it for later comparisons. + // err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; + return 0; + } + //**************************************** + // End of test warn-up and reference output config. + //**************************************** + + std::cout << "Starting benchmarking" << std::endl; + unsigned power_before; + nvmlDeviceGetPowerUsage(device, &power_before); + std::cout << "Power before: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_before << " W" << std::endl; + std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_before / power_limit + << " %" << std::endl; + unsigned temperature_before; + if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_before) == NVML_SUCCESS) { + std::cout << "GPU Temperature before: " << temperature_before << " C" << std::endl; + } else { + std::cerr << "Failed to get GPU temperature." << std::endl; + } + + //******************************************************* + // Benchmark test: + // Loop for (mul, add, sub): + // Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form): + //******************************************************* + T* host_in1 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark. + T* host_in2 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark. + // Test when the result is not in-place + for ( int op = MUL; op != LAST; op++ ) { + // for (int config_idx = 0; config_idx < 0; config_idx++) { + for (int config_idx = 0; config_idx < 16; config_idx++) { + std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl; + for (int i=0; i> 3) & 0x1; + config.is_b_on_device = (config_idx >> 2) & 0x1; + config.is_result_on_device = (config_idx >> 1) & 0x1; + config.is_in_montgomery_form = (config_idx >> 0) & 0x1; + + // Copy from host to device (copy again in order to be used later in the loop and device_inX was already overwritten by warmup. + if (config.is_a_on_device) { + if (config.is_in_montgomery_form) { + err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place. + } else { // Normal presentation. + err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } else { + if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); + err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } + if (config.is_b_on_device) { + if (config.is_in_montgomery_form) { + err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place. + } else { + // Normal presentation. + err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } else { + if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); + err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } + CHK_IF_RETURN(cudaPeekAtLastError()); + + auto start_time = std::chrono::high_resolution_clock::now(); + // Benchmark loop + for (int i = 0; i < repetitions; i++) { + switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); break; + case 0b001: vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); break; + case 0b010: vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); break; + case 0b011: vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break; + case 0b100: vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); break; + case 0b101: vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); break; + case 0b110: vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); break; + case 0b111: vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break; + } + CHK_IF_RETURN(cudaPeekAtLastError()); + } + + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time); + switch (op) { + case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result not in-place" << std::endl; break; + case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result not in-place" << std::endl; break; + case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result not in-place" << std::endl; break; + } + + if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value. + if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed. + CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + } + err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } else { // Data is not on device but it is in host_out. + if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and written back to host. Then compared vs. host_out_ref_mul. + err = cudaMemcpy(device_out, host_out, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } else { // host_out could be compared vs. host_out_ref_mul as is. + } + } + //**************************************** + // End of benchmark test. + //**************************************** + + //*********************************************** + // Test result check + // Check is performed by executing the operation in a normal presentation + // (located in in host_out_ref_mul) and comparing it with the + // benchmark test result. + //*********************************************** + int test_failed = 0; + // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl; + // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl; + switch (op) { + case MUL: for (int i=0; i>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl; + test_failed = 1; + } + } + break; + case ADD: for (int i=0; i>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl; + test_failed = 1; + } + } + break; + case SUB: for (int i=0; i>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl; + test_failed = 1; + } + } + break; + } + if (test_failed) { + // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl; + std::cout << "===>>> result is not in-place: " << std::endl; + std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl; + std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl; + std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl; + std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl; + exit(2); + } + + unsigned power_after; + nvmlDeviceGetPowerUsage(device, &power_after); + std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl; + std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit + << " %" << std::endl; + unsigned temperature_after; + if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) { + std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl; + } else { + std::cerr << "Failed to get GPU temperature." << std::endl; + } + + // Report performance in GMPS: Giga Multiplications Per Second + double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count()); + std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl; + } + } + + // Test when the result is in-place + for ( int op = MUL; op != LAST; op++ ) { + for (int config_idx = 0; config_idx < 16; config_idx++) { + for (int i=0; i> 4) & 0x1; + config.is_b_on_device = (config_idx >> 3) & 0x1; + config.is_result_on_device = (config_idx >> 2) & 0x1; + config.is_in_montgomery_form = (config_idx >> 1) & 0x1; + if (config.is_a_on_device ^ config.is_result_on_device == 1) { + continue; + } + + // Copy from host to device (copy again in order to be used later in the loop and device_inX was already overwritten by warmup. + if (config.is_a_on_device) { + if (config.is_in_montgomery_form) { + err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place. + } else { // Normal presentation. + err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } else { + if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); + err = cudaMemcpy(host_in1, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } + if (config.is_b_on_device) { + if (config.is_in_montgomery_form) { + err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place. + } else { + // Normal presentation. + err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } else { + if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); + err = cudaMemcpy(host_in2, device_in2, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } + CHK_IF_RETURN(cudaPeekAtLastError()); + + auto start_time = std::chrono::high_resolution_clock::now(); + // Benchmark loop + for (int i = 0; i < repetitions; i++) { + switch (config_idx >> 2) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); break; + case 0b001: break; + case 0b010: vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); break; + case 0b011: break; + case 0b100: break; + case 0b101: vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); break; + case 0b110: break; + case 0b111: vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); break; + } + CHK_IF_RETURN(cudaPeekAtLastError()); + } + + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time); + switch (op) { + case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result in-place" << std::endl; break; + case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result in-place" << std::endl; break; + case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result in-place" << std::endl; break; + } + + if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value. + if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed. + CHK_IF_RETURN(mont::from_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert to normal in order to check vs. host_out_ref_mul. + } + err = cudaMemcpy(host_out, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } else { // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check. + if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and written back to host. Then compared vs. host_out_ref_mul. + err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } else { // host_out could be compared vs. host_out_ref_mul as is. + err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + } + } + //**************************************** + // End of benchmark test. + //**************************************** + + //*********************************************** + // Test result check + // Check is performed by executing the operation in a normal presentation + // (located in in host_out_ref_mul) and comparing it with the + // benchmark test result. + //*********************************************** + int test_failed = 0; + // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl; + // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl; + switch (op) { + case MUL: for (int i=0; i>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl; + std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl; + test_failed = 1; + } + } + break; + case ADD: for (int i=0; i>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl; + std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl; + test_failed = 1; + } + } + break; + case SUB: for (int i=0; i>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl; + std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl; + test_failed = 1; + } + } + break; + } + if (test_failed) { + // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl; + std::cout << "===>>> result is in-place: " << std::endl; + std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl; + std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl; + std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl; + std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl; + std::cout << "host_out[0] = " << host_out[0] << std::endl; + exit(2); + } + + unsigned power_after; + nvmlDeviceGetPowerUsage(device, &power_after); + std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl; + std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit + << " %" << std::endl; + unsigned temperature_after; + if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) { + std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl; + } else { + std::cerr << "Failed to get GPU temperature." << std::endl; + } + + // Report performance in GMPS: Giga Multiplications Per Second + double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count()); + std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl; + } + } + + // clean up and exit + free(host_in1_init); + free(host_in2_init); + free(host_in1); + free(host_in2); + free(host_out); + free(host_out_ref_mul); + cudaFree(device_in1); + cudaFree(device_in2); + cudaFree(device_out); + nvmlShutdown(); + return 0; +} diff --git a/examples/c++/mont_vec_ops/run.sh b/examples/c++/mont_vec_ops/run.sh new file mode 100755 index 000000000..01eca66ba --- /dev/null +++ b/examples/c++/mont_vec_ops/run.sh @@ -0,0 +1,2 @@ +#!/bin/bash +./build/example/example diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh index ee624a07f..89dad5610 100644 --- a/icicle/include/vec_ops/vec_ops.cuh +++ b/icicle/include/vec_ops/vec_ops.cuh @@ -27,6 +27,7 @@ namespace vec_ops { * non-blocking and you'd need to synchronize it explicitly by running * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the * function will block the current CPU thread. */ + bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ }; /** @@ -42,6 +43,7 @@ namespace vec_ops { false, // is_b_on_device false, // is_result_on_device false, // is_async + false, // is_in_montgomery_form }; return config; } diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index 9cfa85e60..7e7cf5e65 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -95,25 +95,62 @@ namespace vec_ops { E *d_result, *d_alloc_vec_a, *d_alloc_vec_b; E* d_vec_a; const E* d_vec_b; + + int is_d_alloc_vec_a_allocated = 0; if (!config.is_a_on_device) { - CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); - CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); - d_vec_a = d_alloc_vec_a; + if (config.is_in_montgomery_form) { + CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); + CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); + CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a)); + is_d_alloc_vec_a_allocated = 1; + d_vec_a = d_alloc_vec_a; + } else { + CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); + CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); + is_d_alloc_vec_a_allocated = 1; + d_vec_a = d_alloc_vec_a; + } } else { - d_vec_a = vec_a; + if (config.is_in_montgomery_form) { + CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. + CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a)); + is_d_alloc_vec_a_allocated = 1; + d_vec_a = d_alloc_vec_a; + } else { + d_vec_a = vec_a; + } } + int is_d_alloc_vec_b_allocated = 0; if (!config.is_b_on_device) { - CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); - CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); - d_vec_b = d_alloc_vec_b; + if (config.is_in_montgomery_form) { + CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); + CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); + CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b)); + is_d_alloc_vec_b_allocated = 1; + d_vec_b = d_alloc_vec_b; + } else { + CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); + CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); + is_d_alloc_vec_b_allocated = 1; + d_vec_b = d_alloc_vec_b; + } } else { - d_vec_b = vec_b; + if (config.is_in_montgomery_form) { + CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. + CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b)); + is_d_alloc_vec_b_allocated = 1; + d_vec_b = d_alloc_vec_b; + } else { + d_vec_b = vec_b; + } } + int is_d_result_allocated = 0; if (!config.is_result_on_device) { if (!is_in_place) { CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream)); + is_d_result_allocated = 1; } else { d_result = d_vec_a; } @@ -129,12 +166,21 @@ namespace vec_ops { Kernel<<>>(d_vec_a, d_vec_b, n, d_result); if (!config.is_result_on_device) { - CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); - CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); + if (config.is_in_montgomery_form) { + CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. + CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); + } else { + CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); + } + } else { + if (config.is_in_montgomery_form) { + CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. + } } - if (!config.is_a_on_device && !is_in_place) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); } - if (!config.is_b_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); } + if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); } + if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); } + if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); } if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream)); From 118c82f82914b4d888ccd41f778ba6875c902834 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 30 Jul 2024 05:38:30 +0000 Subject: [PATCH 02/21] Fixed style by clang-format --- examples/c++/mont_vec_ops/CMakeLists.txt | 37 +++++++++--------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/examples/c++/mont_vec_ops/CMakeLists.txt b/examples/c++/mont_vec_ops/CMakeLists.txt index f7048bb8a..1220da827 100644 --- a/examples/c++/mont_vec_ops/CMakeLists.txt +++ b/examples/c++/mont_vec_ops/CMakeLists.txt @@ -1,25 +1,14 @@ -cmake_minimum_required(VERSION 3.18) -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) -set(CMAKE_CXX_STANDARD_REQUIRED TRUE) -if (${CMAKE_VERSION} VERSION_LESS "3.24.0") - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) -else() - set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed -endif () -project(example LANGUAGES CUDA CXX) - -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") -set(CMAKE_CUDA_FLAGS_RELEASE "") -set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") -add_executable( - example - example.cu -) -target_include_directories(example PRIVATE "../../../icicle/include") -target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a) -find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ ) -target_link_libraries(example ${NVML_LIBRARY}) -set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +cmake_minimum_required(VERSION 3.18) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + set(CMAKE_CXX_STANDARD_REQUIRED TRUE) if (${CMAKE_VERSION} VERSION_LESS "3.24.0") + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) else() set(CMAKE_CUDA_ARCHITECTURES native) #on 3.24 + + , on earlier it is ignored, + and the target is not passed endif() project(example LANGUAGES CUDA CXX) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS_RELEASE "") + set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") add_executable(example example.cu) + target_include_directories(example PRIVATE "../../../icicle/include") + target_link_libraries(example ${CMAKE_SOURCE_DIR} / build / icicle / lib / libingo_field_bn254.a) + find_library(NVML_LIBRARY nvidia - ml PATHS / usr / local / cuda / targets / x86_64 - linux / lib / stubs /) + target_link_libraries(example ${NVML_LIBRARY}) + set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON) From 132abbb2d458db7068a6de8227b3fcf9aafba4c2 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 30 Jul 2024 05:39:32 +0000 Subject: [PATCH 03/21] Fixed style by clang-format --- examples/c++/mont_vec_ops/README.md | 3 +- examples/c++/mont_vec_ops/compile.debug.sh | 27 +- examples/c++/mont_vec_ops/compile.sh | 26 +- examples/c++/mont_vec_ops/example.cu | 419 ++++++++++++++------- examples/c++/mont_vec_ops/run.sh | 4 +- icicle/include/vec_ops/vec_ops.cuh | 3 +- icicle/src/vec_ops/vec_ops.cu | 20 +- 7 files changed, 320 insertions(+), 182 deletions(-) mode change 100755 => 100644 examples/c++/mont_vec_ops/compile.debug.sh mode change 100755 => 100644 examples/c++/mont_vec_ops/compile.sh mode change 100755 => 100644 examples/c++/mont_vec_ops/run.sh diff --git a/examples/c++/mont_vec_ops/README.md b/examples/c++/mont_vec_ops/README.md index c99466f6d..736303872 100644 --- a/examples/c++/mont_vec_ops/README.md +++ b/examples/c++/mont_vec_ops/README.md @@ -1,4 +1,4 @@ -# Icicle example: Montgomery vector operations (mul, add, sub) for allpossible options: +#Icicle example : Montgomery vector operations(mul, add, sub) for allpossible options: is_a_on_device is_b_on_device is_result_on_device @@ -39,4 +39,3 @@ typedef scalar_t T; 3. Copy them on-device 4. Execute element-wise vector multiplication on-device 5. Copy results on-host - diff --git a/examples/c++/mont_vec_ops/compile.debug.sh b/examples/c++/mont_vec_ops/compile.debug.sh old mode 100755 new mode 100644 index cebaa9451..73d19bbd9 --- a/examples/c++/mont_vec_ops/compile.debug.sh +++ b/examples/c++/mont_vec_ops/compile.debug.sh @@ -1,15 +1,20 @@ -#!/bin/bash +#!/ bin / bash -# Exit immediately on error -set -e +#Exit immediately on error +set - + e -mkdir -p build/example -mkdir -p build/icicle + mkdir - + p build / example mkdir - + p build / icicle -# Configure and build Icicle -cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254 -cmake --build build/icicle -j +#Configure and build Icicle + cmake - + S../../../ icicle / -B build / icicle - DMSM = + OFF - DCMAKE_BUILD_TYPE = Debug - DCURVE = bn254 cmake-- build build / icicle - + j -# Configure and build the example application -cmake -DCMAKE_BUILD_TYPE=Debug -S . -B build/example -cmake --build build/example +#Configure and build the example application + cmake - + DCMAKE_BUILD_TYPE = + Debug - S.- B build / example cmake-- build build / example diff --git a/examples/c++/mont_vec_ops/compile.sh b/examples/c++/mont_vec_ops/compile.sh old mode 100755 new mode 100644 index df73232a4..cf145b290 --- a/examples/c++/mont_vec_ops/compile.sh +++ b/examples/c++/mont_vec_ops/compile.sh @@ -1,15 +1,19 @@ -#!/bin/bash +#!/ bin / bash -# Exit immediately on error -set -e +#Exit immediately on error +set - + e -mkdir -p build/example -mkdir -p build/icicle + mkdir - + p build / example mkdir - + p build / icicle -# Configure and build Icicle -cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 -cmake --build build/icicle -j +#Configure and build Icicle + cmake - + S../../../ icicle / -B build / icicle - + DMSM = OFF - DCMAKE_BUILD_TYPE = Release - DCURVE = bn254 cmake-- build build / icicle - + j -# Configure and build the example application -cmake -S . -B build/example -cmake --build build/example +#Configure and build the example application + cmake - + S.- B build / example cmake-- build build / example diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index 453d3f12f..8cf21e1fc 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -12,22 +12,30 @@ using namespace bn254; typedef scalar_t T; -enum Op { - MUL, - ADD, - SUB, - LAST -}; +enum Op { MUL, ADD, SUB, LAST }; // bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 -int vector_op(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config, Op op) +int vector_op( + T* vec_a, + T* vec_b, + T* vec_result, + size_t n_elements, + device_context::DeviceContext ctx, + vec_ops::VecOpsConfig config, + Op op) { cudaError_t err; switch (op) { - case MUL: err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); break; - case ADD: err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); break; - case SUB: err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); break; + case MUL: + err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); + break; + case ADD: + err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); + break; + case SUB: + err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); + break; } // cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); if (err != cudaSuccess) { @@ -36,7 +44,8 @@ int vector_op(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_conte } return 0; } -int vector_mul(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) +int vector_mul( + T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) { cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); if (err != cudaSuccess) { @@ -45,7 +54,8 @@ int vector_mul(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_cont } return 0; } -int vector_add(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) +int vector_add( + T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) { cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); if (err != cudaSuccess) { @@ -54,7 +64,8 @@ int vector_add(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_cont } return 0; } -int vector_sub(T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) +int vector_sub( + T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) { cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); if (err != cudaSuccess) { @@ -105,10 +116,16 @@ int main(int argc, char** argv) T::rand_host_many(host_in1_init, vector_size); T::rand_host_many(host_in2_init, vector_size); std::cout << "Allocate memory for the output vectors" << std::endl; - T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output. - T* host_out_ref_mul = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content). - T* host_out_ref_add = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content). - T* host_out_ref_sub = (T*)malloc(vector_size * sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content). + T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output. + T* host_out_ref_mul = (T*)malloc( + vector_size * + sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content). + T* host_out_ref_add = (T*)malloc( + vector_size * + sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content). + T* host_out_ref_sub = (T*)malloc( + vector_size * + sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content). std::cout << "Initializing output vectors with random data" << std::endl; T::rand_host_many(host_out, vector_size); T::rand_host_many(host_out_ref_mul, vector_size); @@ -154,15 +171,21 @@ int main(int argc, char** argv) } std::cout << "Starting warm-up run" << std::endl; // Warm-up loop - for ( int op = MUL; op != LAST; op++ ) { + for (int op = MUL; op != LAST; op++) { for (int i = 0; i < repetitions; i++) { // vector_mul(device_in1, device_in2, device_out, vector_size, ctx, config); vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } switch (op) { - case MUL: err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break; - case ADD: err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break; - case SUB: err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); break; + case MUL: + err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + case ADD: + err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + case SUB: + err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; } } // copy the result from device to host_out_ref_mul to keep it for later comparisons. @@ -193,14 +216,16 @@ int main(int argc, char** argv) // Loop for (mul, add, sub): // Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form): //******************************************************* - T* host_in1 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark. - T* host_in2 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark. + T* host_in1 = + (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark. + T* host_in2 = + (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark. // Test when the result is not in-place - for ( int op = MUL; op != LAST; op++ ) { + for (int op = MUL; op != LAST; op++) { // for (int config_idx = 0; config_idx < 0; config_idx++) { for (int config_idx = 0; config_idx < 16; config_idx++) { std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl; - for (int i=0; i> 1) & 0x1; config.is_in_montgomery_form = (config_idx >> 0) & 0x1; - // Copy from host to device (copy again in order to be used later in the loop and device_inX was already overwritten by warmup. + // Copy from host to device (copy again in order to be used later in the loop and device_inX was already + // overwritten by warmup. if (config.is_a_on_device) { if (config.is_in_montgomery_form) { - err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + err = + cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; return 0; } - CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place. - } else { // Normal presentation. - err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + CHK_IF_RETURN( + mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place. + } else { // Normal presentation. + err = + cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; return 0; } } } else { - if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. - err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + err = + cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; return 0; @@ -242,23 +272,27 @@ int main(int argc, char** argv) } if (config.is_b_on_device) { if (config.is_in_montgomery_form) { - err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + err = + cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl; return 0; } - CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place. + CHK_IF_RETURN( + mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place. } else { // Normal presentation. - err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + err = + cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl; return 0; } } } else { - if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. - err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + err = + cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl; return 0; @@ -276,15 +310,31 @@ int main(int argc, char** argv) auto start_time = std::chrono::high_resolution_clock::now(); // Benchmark loop for (int i = 0; i < repetitions; i++) { - switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device} - case 0b000: vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); break; - case 0b001: vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); break; - case 0b010: vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); break; - case 0b011: vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break; - case 0b100: vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); break; - case 0b101: vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); break; - case 0b110: vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); break; - case 0b111: vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); break; + switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: + vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); + break; + case 0b001: + vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); + break; + case 0b010: + vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); + break; + case 0b011: + vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); + break; + case 0b100: + vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); + break; + case 0b101: + vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); + break; + case 0b110: + vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); + break; + case 0b111: + vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); + break; } CHK_IF_RETURN(cudaPeekAtLastError()); } @@ -292,34 +342,54 @@ int main(int argc, char** argv) auto end_time = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end_time - start_time); switch (op) { - case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result not in-place" << std::endl; break; - case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result not in-place" << std::endl; break; - case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result not in-place" << std::endl; break; + case MUL: + std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " + << config_idx << " and result not in-place" << std::endl; + break; + case ADD: + std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " + << config_idx << " and result not in-place" << std::endl; + break; + case SUB: + std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " + << config_idx << " and result not in-place" << std::endl; + break; } - if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value. - if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed. - CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value. + if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed. + CHK_IF_RETURN(mont::from_montgomery( + device_out, vector_size, config.ctx.stream, + device_out)); // Convert to normal in order to check vs. host_out_ref_mul. } - err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + err = cudaMemcpy( + host_out, device_out, vector_size * sizeof(T), + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; return 0; } - } else { // Data is not on device but it is in host_out. - if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and written back to host. Then compared vs. host_out_ref_mul. - err = cudaMemcpy(device_out, host_out, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. + } else { // Data is not on device but it is in host_out. + if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and + // written back to host. Then compared vs. host_out_ref_mul. + err = cudaMemcpy( + device_out, host_out, vector_size * sizeof(T), + cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl; return 0; } - CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal in order to check vs. host_out_ref_mul. - err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + CHK_IF_RETURN(mont::from_montgomery( + device_out, vector_size, config.ctx.stream, + device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + err = cudaMemcpy( + host_out, device_out, vector_size * sizeof(T), + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; return 0; } - } else { // host_out could be compared vs. host_out_ref_mul as is. + } else { // host_out could be compared vs. host_out_ref_mul as is. } } //**************************************** @@ -336,30 +406,37 @@ int main(int argc, char** argv) // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl; // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl; switch (op) { - case MUL: for (int i=0; i>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl; - test_failed = 1; - } - } + case MUL: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_mul[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; + } + } break; - case ADD: for (int i=0; i>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl; - test_failed = 1; - } - } + case ADD: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_add[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; + } + } break; - case SUB: for (int i=0; i>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl; - test_failed = 1; - } - } + case SUB: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_sub[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; + } + } break; } if (test_failed) { - // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl; + // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << + // std::endl; std::cout << "===>>> result is not in-place: " << std::endl; std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl; std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl; @@ -371,8 +448,8 @@ int main(int argc, char** argv) unsigned power_after; nvmlDeviceGetPowerUsage(device, &power_after); std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl; - std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit - << " %" << std::endl; + std::cout << "Power utilization: " << std::fixed << std::setprecision(1) + << (float)100.0 * power_after / power_limit << " %" << std::endl; unsigned temperature_after; if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) { std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl; @@ -387,9 +464,9 @@ int main(int argc, char** argv) } // Test when the result is in-place - for ( int op = MUL; op != LAST; op++ ) { + for (int op = MUL; op != LAST; op++) { for (int config_idx = 0; config_idx < 16; config_idx++) { - for (int i=0; i> 3) & 0x1; config.is_result_on_device = (config_idx >> 2) & 0x1; config.is_in_montgomery_form = (config_idx >> 1) & 0x1; - if (config.is_a_on_device ^ config.is_result_on_device == 1) { - continue; - } + if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; } - // Copy from host to device (copy again in order to be used later in the loop and device_inX was already overwritten by warmup. + // Copy from host to device (copy again in order to be used later in the loop and device_inX was already + // overwritten by warmup. if (config.is_a_on_device) { if (config.is_in_montgomery_form) { - err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + err = + cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; return 0; } - CHK_IF_RETURN(mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place. - } else { // Normal presentation. - err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + CHK_IF_RETURN( + mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place. + } else { // Normal presentation. + err = + cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; return 0; } } } else { - if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. - err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + err = + cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_in1 - " << cudaGetErrorString(err) << std::endl; return 0; @@ -434,23 +514,27 @@ int main(int argc, char** argv) } if (config.is_b_on_device) { if (config.is_in_montgomery_form) { - err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + err = + cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl; return 0; } - CHK_IF_RETURN(mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place. + CHK_IF_RETURN( + mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place. } else { // Normal presentation. - err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + err = + cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl; return 0; } } } else { - if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. - err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + err = + cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in2 to device_in2 - " << cudaGetErrorString(err) << std::endl; return 0; @@ -468,15 +552,27 @@ int main(int argc, char** argv) auto start_time = std::chrono::high_resolution_clock::now(); // Benchmark loop for (int i = 0; i < repetitions; i++) { - switch (config_idx >> 2) { // {is_a_on_device, is_b_on_device, is_result_on_device} - case 0b000: vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); break; - case 0b001: break; - case 0b010: vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); break; - case 0b011: break; - case 0b100: break; - case 0b101: vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); break; - case 0b110: break; - case 0b111: vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); break; + switch (config_idx >> 2) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: + vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); + break; + case 0b001: + break; + case 0b010: + vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); + break; + case 0b011: + break; + case 0b100: + break; + case 0b101: + vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); + break; + case 0b110: + break; + case 0b111: + vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); + break; } CHK_IF_RETURN(cudaPeekAtLastError()); } @@ -484,40 +580,64 @@ int main(int argc, char** argv) auto end_time = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end_time - start_time); switch (op) { - case MUL: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " << config_idx << " and result in-place" << std::endl; break; - case ADD: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " << config_idx << " and result in-place" << std::endl; break; - case SUB: std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " << config_idx << " and result in-place" << std::endl; break; + case MUL: + std::cout << "Elapsed time: " << duration.count() << " microseconds, operation MUL for config_idx " + << config_idx << " and result in-place" << std::endl; + break; + case ADD: + std::cout << "Elapsed time: " << duration.count() << " microseconds, operation ADD for config_idx " + << config_idx << " and result in-place" << std::endl; + break; + case SUB: + std::cout << "Elapsed time: " << duration.count() << " microseconds, operation SUB for config_idx " + << config_idx << " and result in-place" << std::endl; + break; } - if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value. - if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed. - CHK_IF_RETURN(mont::from_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert to normal in order to check vs. host_out_ref_mul. + if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value. + if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed. + CHK_IF_RETURN(mont::from_montgomery( + device_in1, vector_size, config.ctx.stream, + device_in1)); // Convert to normal in order to check vs. host_out_ref_mul. } - err = cudaMemcpy(host_out, device_in1, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + err = cudaMemcpy( + host_out, device_in1, vector_size * sizeof(T), + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl; return 0; } - } else { // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check. - if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and written back to host. Then compared vs. host_out_ref_mul. - err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. + } else { // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check. + if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and + // written back to host. Then compared vs. host_out_ref_mul. + err = cudaMemcpy( + device_out, host_in1, vector_size * sizeof(T), + cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl; return 0; } - CHK_IF_RETURN(mont::from_montgomery(device_out, vector_size, config.ctx.stream, device_out)); // Convert to normal in order to check vs. host_out_ref_mul. - err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + CHK_IF_RETURN(mont::from_montgomery( + device_out, vector_size, config.ctx.stream, + device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + err = cudaMemcpy( + host_out, device_out, vector_size * sizeof(T), + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; return 0; } - } else { // host_out could be compared vs. host_out_ref_mul as is. - err = cudaMemcpy(device_out, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. + } else { // host_out could be compared vs. host_out_ref_mul as is. + err = cudaMemcpy( + device_out, host_in1, vector_size * sizeof(T), + cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl; return 0; - } - err = cudaMemcpy(host_out, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + } + err = cudaMemcpy( + host_out, device_out, vector_size * sizeof(T), + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; return 0; @@ -538,33 +658,40 @@ int main(int argc, char** argv) // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl; // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl; switch (op) { - case MUL: for (int i=0; i>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl; - std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl; - test_failed = 1; - } - } + case MUL: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_mul[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl; + test_failed = 1; + } + } break; - case ADD: for (int i=0; i>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl; - std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl; - test_failed = 1; - } - } + case ADD: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_add[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl; + test_failed = 1; + } + } break; - case SUB: for (int i=0; i>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl; - std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl; - test_failed = 1; - } - } + case SUB: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_sub[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl; + test_failed = 1; + } + } break; } if (test_failed) { - // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << std::endl; + // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << + // std::endl; std::cout << "===>>> result is in-place: " << std::endl; std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl; std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl; @@ -577,8 +704,8 @@ int main(int argc, char** argv) unsigned power_after; nvmlDeviceGetPowerUsage(device, &power_after); std::cout << "Power after: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_after << " W" << std::endl; - std::cout << "Power utilization: " << std::fixed << std::setprecision(1) << (float)100.0 * power_after / power_limit - << " %" << std::endl; + std::cout << "Power utilization: " << std::fixed << std::setprecision(1) + << (float)100.0 * power_after / power_limit << " %" << std::endl; unsigned temperature_after; if (nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature_after) == NVML_SUCCESS) { std::cout << "GPU Temperature after: " << temperature_after << " C" << std::endl; diff --git a/examples/c++/mont_vec_ops/run.sh b/examples/c++/mont_vec_ops/run.sh old mode 100755 new mode 100644 index 01eca66ba..11d3275cc --- a/examples/c++/mont_vec_ops/run.sh +++ b/examples/c++/mont_vec_ops/run.sh @@ -1,2 +1,2 @@ -#!/bin/bash -./build/example/example +#!/ bin / bash +./ build / example / example diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh index 89dad5610..be2ab4ba3 100644 --- a/icicle/include/vec_ops/vec_ops.cuh +++ b/icicle/include/vec_ops/vec_ops.cuh @@ -27,7 +27,8 @@ namespace vec_ops { * non-blocking and you'd need to synchronize it explicitly by running * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the * function will block the current CPU thread. */ - bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ + bool + is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ }; /** diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index 7e7cf5e65..9883d393c 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -102,19 +102,20 @@ namespace vec_ops { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a)); - is_d_alloc_vec_a_allocated = 1; + is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; } else { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); - is_d_alloc_vec_a_allocated = 1; + is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; } } else { if (config.is_in_montgomery_form) { - CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. + CHK_IF_RETURN(cudaMallocAsync( + &d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a)); - is_d_alloc_vec_a_allocated = 1; + is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; } else { d_vec_a = vec_a; @@ -132,14 +133,15 @@ namespace vec_ops { } else { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); - is_d_alloc_vec_b_allocated = 1; + is_d_alloc_vec_b_allocated = 1; d_vec_b = d_alloc_vec_b; } } else { if (config.is_in_montgomery_form) { - CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. + CHK_IF_RETURN(cudaMallocAsync( + &d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b)); - is_d_alloc_vec_b_allocated = 1; + is_d_alloc_vec_b_allocated = 1; d_vec_b = d_alloc_vec_b; } else { d_vec_b = vec_b; @@ -167,14 +169,14 @@ namespace vec_ops { if (!config.is_result_on_device) { if (config.is_in_montgomery_form) { - CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. + CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); } else { CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); } } else { if (config.is_in_montgomery_form) { - CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. + CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. } } From 9e452c47d0551956bc4421d7ec1202791a6a45ac Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Wed, 31 Jul 2024 09:35:53 +0000 Subject: [PATCH 04/21] Add montgomery config in rust and golang files. --- examples/c++/mont_vec_ops/CMakeLists.txt | 36 +- examples/c++/mont_vec_ops/compile.debug.sh | 27 +- examples/c++/mont_vec_ops/compile.sh | 26 +- examples/c++/mont_vec_ops/run.sh | 4 +- wrappers/golang/core/vec_ops.go | 140 +++--- wrappers/golang/core/vec_ops_test.go | 32 +- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 504 +++++++++---------- 7 files changed, 367 insertions(+), 402 deletions(-) mode change 100644 => 100755 examples/c++/mont_vec_ops/compile.debug.sh mode change 100644 => 100755 examples/c++/mont_vec_ops/compile.sh mode change 100644 => 100755 examples/c++/mont_vec_ops/run.sh diff --git a/examples/c++/mont_vec_ops/CMakeLists.txt b/examples/c++/mont_vec_ops/CMakeLists.txt index 1220da827..f3170e206 100644 --- a/examples/c++/mont_vec_ops/CMakeLists.txt +++ b/examples/c++/mont_vec_ops/CMakeLists.txt @@ -1,14 +1,24 @@ -cmake_minimum_required(VERSION 3.18) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) - set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) - set(CMAKE_CXX_STANDARD_REQUIRED TRUE) if (${CMAKE_VERSION} VERSION_LESS "3.24.0") - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) else() set(CMAKE_CUDA_ARCHITECTURES native) #on 3.24 + - , on earlier it is ignored, - and the target is not passed endif() project(example LANGUAGES CUDA CXX) +cmake_minimum_required(VERSION 3.18) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +if (${CMAKE_VERSION} VERSION_LESS "3.24.0") + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) +else() + set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed +endif () +project(example LANGUAGES CUDA CXX) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS_RELEASE "") - set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") add_executable(example example.cu) - target_include_directories(example PRIVATE "../../../icicle/include") - target_link_libraries(example ${CMAKE_SOURCE_DIR} / build / icicle / lib / libingo_field_bn254.a) - find_library(NVML_LIBRARY nvidia - ml PATHS / usr / local / cuda / targets / x86_64 - linux / lib / stubs /) - target_link_libraries(example ${NVML_LIBRARY}) - set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +set(CMAKE_CUDA_FLAGS_RELEASE "") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") +add_executable( + example + example.cu +) +target_include_directories(example PRIVATE "../../../icicle/include") +target_link_libraries(example ${CMAKE_SOURCE_DIR}/build/icicle/lib/libingo_field_bn254.a) +find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ ) +target_link_libraries(example ${NVML_LIBRARY}) +set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON) diff --git a/examples/c++/mont_vec_ops/compile.debug.sh b/examples/c++/mont_vec_ops/compile.debug.sh old mode 100644 new mode 100755 index 73d19bbd9..c1d6bf932 --- a/examples/c++/mont_vec_ops/compile.debug.sh +++ b/examples/c++/mont_vec_ops/compile.debug.sh @@ -1,20 +1,15 @@ -#!/ bin / bash +#!/bin/bash -#Exit immediately on error -set - - e +# Exit immediately on error +set -e - mkdir - - p build / example mkdir - - p build / icicle +mkdir -p build/example +mkdir -p build/icicle -#Configure and build Icicle - cmake - - S../../../ icicle / -B build / icicle - DMSM = - OFF - DCMAKE_BUILD_TYPE = Debug - DCURVE = bn254 cmake-- build build / icicle - - j +# Configure and build Icicle +cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Debug -DCURVE=bn254 +cmake --build build/icicle -j -#Configure and build the example application - cmake - - DCMAKE_BUILD_TYPE = - Debug - S.- B build / example cmake-- build build / example +# Configure and build the example application +cmake -DCMAKE_BUILD_TYPE=Debug -S. -B build/example +cmake --build build/example diff --git a/examples/c++/mont_vec_ops/compile.sh b/examples/c++/mont_vec_ops/compile.sh old mode 100644 new mode 100755 index cf145b290..de35c62da --- a/examples/c++/mont_vec_ops/compile.sh +++ b/examples/c++/mont_vec_ops/compile.sh @@ -1,19 +1,15 @@ -#!/ bin / bash +#!/bin/bash -#Exit immediately on error -set - - e +# Exit immediately on error +set -e - mkdir - - p build / example mkdir - - p build / icicle +mkdir -p build/example +mkdir -p build/icicle -#Configure and build Icicle - cmake - - S../../../ icicle / -B build / icicle - - DMSM = OFF - DCMAKE_BUILD_TYPE = Release - DCURVE = bn254 cmake-- build build / icicle - - j +# Configure and build Icicle +cmake -S ../../../icicle/ -B build/icicle -DMSM=OFF -DCMAKE_BUILD_TYPE=Release -DCURVE=bn254 +cmake --build build/icicle -#Configure and build the example application - cmake - - S.- B build / example cmake-- build build / example +# Configure and build the example application +cmake -S . -B build/example +cmake --build build/example \ No newline at end of file diff --git a/examples/c++/mont_vec_ops/run.sh b/examples/c++/mont_vec_ops/run.sh old mode 100644 new mode 100755 index 11d3275cc..88073af8b --- a/examples/c++/mont_vec_ops/run.sh +++ b/examples/c++/mont_vec_ops/run.sh @@ -1,2 +1,2 @@ -#!/ bin / bash -./ build / example / example +#! /bin/bash +./build/example/example diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go index 9ea4681ed..d589056c4 100644 --- a/wrappers/golang/core/vec_ops.go +++ b/wrappers/golang/core/vec_ops.go @@ -1,33 +1,29 @@ package core -import ( - "fmt" - "unsafe" +import("fmt" + "unsafe" - cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime" -) + cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime") -type VecOps int + type VecOps int -const ( - Sub VecOps = iota - Add - Mul -) + const(Sub VecOps = iota Add Mul) -type VecOpsConfig struct { - /*Details related to the device such as its id and stream. */ - Ctx cr.DeviceContext - /* True if `a` is on device and false if it is not. Default value: false. */ - isAOnDevice bool - /* True if `b` is on device and false if it is not. Default value: false. */ - isBOnDevice bool - /* If true, output is preserved on device, otherwise on host. Default value: false. */ - isResultOnDevice bool - /* Whether to run the vector operations asynchronously. If set to `true`, the function will be - * non-blocking and you'll need to synchronize it explicitly by calling - * `SynchronizeStream`. If set to false, the function will block the current CPU thread. */ - IsAsync bool + type VecOpsConfig struct { + /*Details related to the device such as its id and stream. */ + Ctx cr.DeviceContext + /* True if `a` is on device and false if it is not. Default value: false. */ + isAOnDevice bool + /* True if `b` is on device and false if it is not. Default value: false. */ + isBOnDevice bool + /* If true, output is preserved on device, otherwise on host. Default value: false. */ + isResultOnDevice bool + /* Whether to run the vector operations asynchronously. If set to `true`, the function will be + * non-blocking and you'll need to synchronize it explicitly by calling + * `SynchronizeStream`. If set to false, the function will block the current CPU thread. */ + IsAsync bool + /* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ + IsInMontgomeryForm bool } /** @@ -42,69 +38,57 @@ func DefaultVecOpsConfig() VecOpsConfig { false, // isBOnDevice false, // isResultOnDevice false, // IsAsync + false, // IsInMontgomeryForm } return config } -func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, int) { - aLen, bLen, outLen := a.Len(), b.Len(), out.Len() - if aLen != bLen { - errorString := fmt.Sprintf( - "a and b vector lengths %d; %d are not equal", - aLen, - bLen, - ) - panic(errorString) - } - if aLen != outLen { - errorString := fmt.Sprintf( - "a and out vector lengths %d; %d are not equal", - aLen, - outLen, - ) - panic(errorString) - } +func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, int) +{ + aLen, bLen, outLen : = a.Len(), b.Len(), out.Len() if aLen != bLen + { + errorString: + = fmt.Sprintf("a and b vector lengths %d; %d are not equal", aLen, bLen, ) panic(errorString) + } + if aLen + != outLen + { + errorString: + = fmt.Sprintf("a and out vector lengths %d; %d are not equal", aLen, outLen, ) panic(errorString) + } - if a.IsOnDevice() { - a.(DeviceSlice).CheckDevice() - } - if b.IsOnDevice() { - b.(DeviceSlice).CheckDevice() - } - if out.IsOnDevice() { - out.(DeviceSlice).CheckDevice() - } + if a + .IsOnDevice() { a.(DeviceSlice).CheckDevice() } + if b + .IsOnDevice() { b.(DeviceSlice).CheckDevice() } + if out + .IsOnDevice(){out.(DeviceSlice).CheckDevice()} - cfg.isAOnDevice = a.IsOnDevice() - cfg.isBOnDevice = b.IsOnDevice() - cfg.isResultOnDevice = out.IsOnDevice() + cfg.isAOnDevice = a.IsOnDevice() cfg.isBOnDevice = b.IsOnDevice() cfg.isResultOnDevice = + out + .IsOnDevice() - return a.AsUnsafePointer(), b.AsUnsafePointer(), out.AsUnsafePointer(), unsafe.Pointer(cfg), a.Len() + return a.AsUnsafePointer(), + b.AsUnsafePointer(), out.AsUnsafePointer(), unsafe.Pointer(cfg), a.Len() } -func TransposeCheck(in, out HostOrDeviceSlice, onDevice bool) { - inLen, outLen := in.Len(), out.Len() +func TransposeCheck(in, out HostOrDeviceSlice, onDevice bool) +{ + inLen, outLen : = in.Len(), + out.Len() - if inLen != outLen { - errorString := fmt.Sprintf( - "in and out vector lengths %d; %d are not equal", - inLen, - outLen, - ) - panic(errorString) - } - if (onDevice != in.IsOnDevice()) || (onDevice != out.IsOnDevice()) { - errorString := fmt.Sprintf( - "onDevice is set to %t, but in.IsOnDevice():%t and out.IsOnDevice():%t", - onDevice, - in.IsOnDevice(), - out.IsOnDevice(), - ) - panic(errorString) - } - if onDevice { - in.(DeviceSlice).CheckDevice() - out.(DeviceSlice).CheckDevice() - } + if inLen != outLen + { + errorString: + = fmt.Sprintf("in and out vector lengths %d; %d are not equal", inLen, outLen, ) panic(errorString) + } + if (onDevice != in.IsOnDevice()) || (onDevice != out.IsOnDevice()) + { + errorString: + = fmt.Sprintf( + "onDevice is set to %t, but in.IsOnDevice():%t and out.IsOnDevice():%t", onDevice, in.IsOnDevice(), + out.IsOnDevice(), ) panic(errorString) + } + if onDevice { in.(DeviceSlice).CheckDevice() out.(DeviceSlice).CheckDevice() } } diff --git a/wrappers/golang/core/vec_ops_test.go b/wrappers/golang/core/vec_ops_test.go index 778e6e7a5..989a0a810 100644 --- a/wrappers/golang/core/vec_ops_test.go +++ b/wrappers/golang/core/vec_ops_test.go @@ -1,23 +1,23 @@ package core -import ( - "testing" + import("testing" - cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime" - "github.com/stretchr/testify/assert" -) + cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime" + "github.com/stretchr/testify/assert") -func TestVecOpsDefaultConfig(t *testing.T) { - ctx, _ := cr.GetDefaultDeviceContext() - expected := VecOpsConfig{ - ctx, // Ctx - false, // isAOnDevice - false, // isBOnDevice - false, // isResultOnDevice - false, // IsAsync - } + func TestVecOpsDefaultConfig(t* testing.T) +{ + ctx, _ : = cr.GetDefaultDeviceContext() expected : = + VecOpsConfig{ + ctx, // Ctx + false, // isAOnDevice + false, // isBOnDevice + false, // isResultOnDevice + false, // IsAsync + false, // IsInMontgomeryForm + } - actual := DefaultVecOpsConfig() + actual : = DefaultVecOpsConfig() - assert.Equal(t, expected, actual) + assert.Equal(t, expected, actual) } diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 53e06d11d..efe777160 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -1,7 +1,7 @@ use icicle_cuda_runtime::device::check_device; use icicle_cuda_runtime::{ - device_context::{DeviceContext, DEFAULT_DEVICE_ID}, - memory::HostOrDeviceSlice, + device_context::{DeviceContext, DEFAULT_DEVICE_ID}, + memory::HostOrDeviceSlice, }; use crate::{error::IcicleResult, traits::FieldImpl}; @@ -20,24 +20,29 @@ pub struct VecOpsConfig<'a> { /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread. pub is_async: bool, + /// If true then vec_a, vec_b and result are in montgomery form. Default value: false. + pub is_in_montgomery_form: bool, } impl<'a> Default for VecOpsConfig<'a> { - fn default() -> Self { - Self::default_for_device(DEFAULT_DEVICE_ID) - } + fn default()->Self { Self::default_for_device(DEFAULT_DEVICE_ID) } } -impl<'a> VecOpsConfig<'a> { - pub fn default_for_device(device_id: usize) -> Self { - VecOpsConfig { - ctx: DeviceContext::default_for_device(device_id), - is_a_on_device: false, - is_b_on_device: false, - is_result_on_device: false, - is_async: false, - } +impl<'a> VecOpsConfig<' a> +{ + pub fn default_for_device(device_id : usize) -> Self + { + VecOpsConfig + { + ctx: + DeviceContext::default_for_device(device_id), + is_a_on_device : false, + is_b_on_device : false, + is_result_on_device : false, + is_async : false, + is_in_montgomery_form : false, } + } } #[repr(C)] @@ -55,27 +60,30 @@ pub struct BitReverseConfig<'a> { /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread. pub is_async: bool, + /// If true then vec_a, vec_b and result are in montgomery form. Default value: false. + pub is_in_montgomery_form: bool, } impl<'a> Default for BitReverseConfig<'a> { - fn default() -> Self { - Self::default_for_device(DEFAULT_DEVICE_ID) - } + fn default()->Self { Self::default_for_device(DEFAULT_DEVICE_ID) } } -impl<'a> BitReverseConfig<'a> { - pub fn default_for_device(device_id: usize) -> Self { - BitReverseConfig { - ctx: DeviceContext::default_for_device(device_id), - is_input_on_device: false, - is_output_on_device: false, - is_async: false, - } +impl<'a> BitReverseConfig<' a> +{ + pub fn default_for_device(device_id : usize) -> Self + { + BitReverseConfig + { + ctx: + DeviceContext::default_for_device(device_id), + is_input_on_device : false, is_output_on_device : false, is_async : false, is_in_montgomery_form : false, } + } } #[doc(hidden)] -pub trait VecOps { +pub trait VecOps +{ fn add( a: &(impl HostOrDeviceSlice + ?Sized), b: &(impl HostOrDeviceSlice + ?Sized), @@ -111,6 +119,7 @@ pub trait VecOps { ctx: &DeviceContext, on_device: bool, is_async: bool, + is_in_montgomery_form: bool, ) -> IcicleResult<()>; fn bit_reverse( @@ -131,72 +140,61 @@ fn check_vec_ops_args<'a, F>( result: &(impl HostOrDeviceSlice + ?Sized), cfg: &VecOpsConfig<'a>, ) -> VecOpsConfig<'a> { - if a.len() != b.len() || a.len() != result.len() { - panic!( - "left, right and output lengths {}; {}; {} do not match", - a.len(), - b.len(), - result.len() - ); - } - let ctx_device_id = cfg - .ctx - .device_id; - if let Some(device_id) = a.device_id() { - assert_eq!(device_id, ctx_device_id, "Device ids in a and context are different"); - } - if let Some(device_id) = b.device_id() { - assert_eq!(device_id, ctx_device_id, "Device ids in b and context are different"); - } - if let Some(device_id) = result.device_id() { - assert_eq!( - device_id, ctx_device_id, - "Device ids in result and context are different" - ); - } - check_device(ctx_device_id); - - let mut res_cfg = cfg.clone(); - res_cfg.is_a_on_device = a.is_on_device(); - res_cfg.is_b_on_device = b.is_on_device(); - res_cfg.is_result_on_device = result.is_on_device(); - res_cfg + if a.len() != b.len() || a.len() != result.len() +{ + panic !("left, right and output lengths {}; {}; {} do not match", a.len(), b.len(), result.len()); +} +let ctx_device_id = cfg.ctx.device_id; +if let + Some(device_id) = a.device_id() + { + assert_eq !(device_id, ctx_device_id, "Device ids in a and context are different"); + } +if let + Some(device_id) = b.device_id() + { + assert_eq !(device_id, ctx_device_id, "Device ids in b and context are different"); + } +if let + Some(device_id) = result.device_id() + { + assert_eq !(device_id, ctx_device_id, "Device ids in result and context are different"); + } +check_device(ctx_device_id); + +let mut res_cfg = cfg.clone(); +res_cfg.is_a_on_device = a.is_on_device(); +res_cfg.is_b_on_device = b.is_on_device(); +res_cfg.is_result_on_device = result.is_on_device(); +res_cfg } fn check_bit_reverse_args<'a, F>( input: &(impl HostOrDeviceSlice + ?Sized), cfg: &BitReverseConfig<'a>, output: &(impl HostOrDeviceSlice + ?Sized), ) -> BitReverseConfig<'a> { - if input.len() & (input.len() - 1) != 0 { - panic!("input length must be a power of 2, input length: {}", input.len()); - } - if input.len() != output.len() { - panic!( - "input and output lengths {}; {} do not match", - input.len(), - output.len() - ); - } - let ctx_device_id = cfg - .ctx - .device_id; - if let Some(device_id) = input.device_id() { - assert_eq!( - device_id, ctx_device_id, - "Device ids in input and context are different" - ); - } - if let Some(device_id) = output.device_id() { - assert_eq!( - device_id, ctx_device_id, - "Device ids in output and context are different" - ); - } - check_device(ctx_device_id); - let mut res_cfg = cfg.clone(); - res_cfg.is_input_on_device = input.is_on_device(); - res_cfg.is_output_on_device = output.is_on_device(); - res_cfg + if input.len() & (input.len() - 1) != 0 +{ + panic !("input length must be a power of 2, input length: {}", input.len()); +} +if input + .len() != output.len() { panic !("input and output lengths {}; {} do not match", input.len(), output.len()); } +let ctx_device_id = cfg.ctx.device_id; +if let + Some(device_id) = input.device_id() + { + assert_eq !(device_id, ctx_device_id, "Device ids in input and context are different"); + } +if let + Some(device_id) = output.device_id() + { + assert_eq !(device_id, ctx_device_id, "Device ids in output and context are different"); + } +check_device(ctx_device_id); +let mut res_cfg = cfg.clone(); +res_cfg.is_input_on_device = input.is_on_device(); +res_cfg.is_output_on_device = output.is_on_device(); +res_cfg } pub fn add_scalars( @@ -209,8 +207,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_vec_ops_args(a, b, result, cfg); - <::Config as VecOps>::add(a, b, result, &cfg) + let cfg = check_vec_ops_args(a, b, result, cfg); + << F as FieldImpl > ::Config as VecOps < F >> ::add(a, b, result, &cfg) } pub fn accumulate_scalars( @@ -222,8 +220,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_vec_ops_args(a, b, a, cfg); - <::Config as VecOps>::accumulate(a, b, &cfg) + let cfg = check_vec_ops_args(a, b, a, cfg); + << F as FieldImpl > ::Config as VecOps < F >> ::accumulate(a, b, &cfg) } pub fn sub_scalars( @@ -236,8 +234,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_vec_ops_args(a, b, result, cfg); - <::Config as VecOps>::sub(a, b, result, &cfg) + let cfg = check_vec_ops_args(a, b, result, cfg); + << F as FieldImpl > ::Config as VecOps < F >> ::sub(a, b, result, &cfg) } pub fn mul_scalars( @@ -250,8 +248,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_vec_ops_args(a, b, result, cfg); - <::Config as VecOps>::mul(a, b, result, &cfg) + let cfg = check_vec_ops_args(a, b, result, cfg); + << F as FieldImpl > ::Config as VecOps < F >> ::mul(a, b, result, &cfg) } pub fn transpose_matrix( @@ -262,12 +260,13 @@ pub fn transpose_matrix( ctx: &DeviceContext, on_device: bool, is_async: bool, + is_in_montgomery_form: bool, ) -> IcicleResult<()> where F: FieldImpl, ::Config: VecOps, { - <::Config as VecOps>::transpose(input, row_size, column_size, output, ctx, on_device, is_async) + <::Config as VecOps>::transpose(input, row_size, column_size, output, ctx, on_device, is_async, is_in_montgomery_form) } pub fn bit_reverse( @@ -279,8 +278,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_bit_reverse_args(input, cfg, output); - <::Config as VecOps>::bit_reverse(input, &cfg, output) + let cfg = check_bit_reverse_args(input, cfg, output); + << F as FieldImpl > ::Config as VecOps < F >> ::bit_reverse(input, &cfg, output) } pub fn bit_reverse_inplace( @@ -291,113 +290,108 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_bit_reverse_args(input, cfg, input); - <::Config as VecOps>::bit_reverse_inplace(input, &cfg) + let cfg = check_bit_reverse_args(input, cfg, input); + << F as FieldImpl > ::Config as VecOps < F >> ::bit_reverse_inplace(input, &cfg) } #[macro_export] -macro_rules! impl_vec_ops_field { - ( - $field_prefix:literal, - $field_prefix_ident:ident, - $field:ident, - $field_config:ident - ) => { - mod $field_prefix_ident { - use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice}; - use icicle_core::vec_ops::BitReverseConfig; - use icicle_core::vec_ops::VecOpsConfig; - - extern "C" { - #[link_name = concat!($field_prefix, "_add_cuda")] - pub(crate) fn add_scalars_cuda( - a: *const $field, - b: *const $field, - size: u32, - cfg: *const VecOpsConfig, - result: *mut $field, - ) -> CudaError; - - #[link_name = concat!($field_prefix, "_accumulate_cuda")] - pub(crate) fn accumulate_scalars_cuda( - a: *mut $field, - b: *const $field, - size: u32, - cfg: *const VecOpsConfig, - ) -> CudaError; - - #[link_name = concat!($field_prefix, "_sub_cuda")] - pub(crate) fn sub_scalars_cuda( - a: *const $field, - b: *const $field, - size: u32, - cfg: *const VecOpsConfig, - result: *mut $field, - ) -> CudaError; - - #[link_name = concat!($field_prefix, "_mul_cuda")] - pub(crate) fn mul_scalars_cuda( - a: *const $field, - b: *const $field, - size: u32, - cfg: *const VecOpsConfig, - result: *mut $field, - ) -> CudaError; - - #[link_name = concat!($field_prefix, "_transpose_matrix_cuda")] - pub(crate) fn transpose_cuda( - input: *const $field, - row_size: u32, - column_size: u32, - output: *mut $field, - ctx: *const DeviceContext, - on_device: bool, - is_async: bool, - ) -> CudaError; - - #[link_name = concat!($field_prefix, "_bit_reverse_cuda")] - pub(crate) fn bit_reverse_cuda( - input: *const $field, - size: u64, - config: *const BitReverseConfig, - output: *mut $field, - ) -> CudaError; - } - } +macro_rules !impl_vec_ops_field +{ + ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) = > + { + mod $field_prefix_ident + { + use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice}; + use icicle_core::vec_ops::BitReverseConfig; + use icicle_core::vec_ops::VecOpsConfig; + + extern "C" { +#[link_name = concat !($field_prefix, "_add_cuda")] + pub(crate) fn add_scalars_cuda(a + : * const $field, b + : * const $field, size + : u32, cfg + : * const VecOpsConfig, result + : *mut $field, ) + ->CudaError; + +#[link_name = concat !($field_prefix, "_accumulate_cuda")] + pub(crate) fn accumulate_scalars_cuda(a + : *mut $field, b + : * const $field, size + : u32, cfg + : * const VecOpsConfig, ) + ->CudaError; + +#[link_name = concat !($field_prefix, "_sub_cuda")] + pub(crate) fn sub_scalars_cuda(a + : * const $field, b + : * const $field, size + : u32, cfg + : * const VecOpsConfig, result + : *mut $field, ) + ->CudaError; + +#[link_name = concat !($field_prefix, "_mul_cuda")] + pub(crate) fn mul_scalars_cuda(a + : * const $field, b + : * const $field, size + : u32, cfg + : * const VecOpsConfig, result + : *mut $field, ) + ->CudaError; + +#[link_name = concat !($field_prefix, "_transpose_matrix_cuda")] + pub(crate) fn transpose_cuda(input + : * const $field, row_size + : u32, column_size + : u32, output + : *mut $field, ctx + : * const DeviceContext, on_device + : bool, is_async + : bool, is_in_montgomery_form + : bool, ) + ->CudaError; + +#[link_name = concat !($field_prefix, "_bit_reverse_cuda")] + pub(crate) fn bit_reverse_cuda(input + : * const $field, size + : u64, config + : * const BitReverseConfig, output + : *mut $field, ) + ->CudaError; + } + } - impl VecOps<$field> for $field_config { + impl VecOps<$field> for $field_config + { fn add( a: &(impl HostOrDeviceSlice<$field> + ?Sized), b: &(impl HostOrDeviceSlice<$field> + ?Sized), result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), cfg: &VecOpsConfig, - ) -> IcicleResult<()> { - unsafe { - $field_prefix_ident::add_scalars_cuda( - a.as_ptr(), - b.as_ptr(), - a.len() as u32, - cfg as *const VecOpsConfig, - result.as_mut_ptr(), - ) - .wrap() - } + ) -> IcicleResult<()> + { + unsafe + { + $field_prefix_ident::add_scalars_cuda( + a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), ) + .wrap() + } } fn accumulate( a: &mut (impl HostOrDeviceSlice<$field> + ?Sized), b: &(impl HostOrDeviceSlice<$field> + ?Sized), cfg: &VecOpsConfig, - ) -> IcicleResult<()> { - unsafe { - $field_prefix_ident::accumulate_scalars_cuda( - a.as_mut_ptr(), - b.as_ptr(), - a.len() as u32, - cfg as *const VecOpsConfig, - ) - .wrap() - } + ) -> IcicleResult<()> + { + unsafe + { + $field_prefix_ident::accumulate_scalars_cuda( + a.as_mut_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, ) + .wrap() + } } fn sub( @@ -405,17 +399,14 @@ macro_rules! impl_vec_ops_field { b: &(impl HostOrDeviceSlice<$field> + ?Sized), result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), cfg: &VecOpsConfig, - ) -> IcicleResult<()> { - unsafe { - $field_prefix_ident::sub_scalars_cuda( - a.as_ptr(), - b.as_ptr(), - a.len() as u32, - cfg as *const VecOpsConfig, - result.as_mut_ptr(), - ) - .wrap() - } + ) -> IcicleResult<()> + { + unsafe + { + $field_prefix_ident::sub_scalars_cuda( + a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), ) + .wrap() + } } fn mul( @@ -423,17 +414,14 @@ macro_rules! impl_vec_ops_field { b: &(impl HostOrDeviceSlice<$field> + ?Sized), result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), cfg: &VecOpsConfig, - ) -> IcicleResult<()> { - unsafe { - $field_prefix_ident::mul_scalars_cuda( - a.as_ptr(), - b.as_ptr(), - a.len() as u32, - cfg as *const VecOpsConfig, - result.as_mut_ptr(), - ) - .wrap() - } + ) -> IcicleResult<()> + { + unsafe + { + $field_prefix_ident::mul_scalars_cuda( + a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), ) + .wrap() + } } fn transpose( @@ -444,72 +432,64 @@ macro_rules! impl_vec_ops_field { ctx: &DeviceContext, on_device: bool, is_async: bool, - ) -> IcicleResult<()> { - unsafe { - $field_prefix_ident::transpose_cuda( - input.as_ptr(), - row_size, - column_size, - output.as_mut_ptr(), - ctx as *const _ as *const DeviceContext, - on_device, - is_async, - ) - .wrap() - } + is_in_montgomery_form: bool, + ) -> IcicleResult<()> + { + unsafe + { + $field_prefix_ident::transpose_cuda( + input.as_ptr(), row_size, column_size, output.as_mut_ptr(), ctx as* const _ as* const DeviceContext, + on_device, is_async, is_in_montgomery_form, ) + .wrap() + } } fn bit_reverse( input: &(impl HostOrDeviceSlice<$field> + ?Sized), cfg: &BitReverseConfig, output: &mut (impl HostOrDeviceSlice<$field> + ?Sized), - ) -> IcicleResult<()> { - unsafe { - $field_prefix_ident::bit_reverse_cuda( - input.as_ptr(), - input.len() as u64, - cfg as *const BitReverseConfig, - output.as_mut_ptr(), - ) - .wrap() - } + ) -> IcicleResult<()> + { + unsafe + { + $field_prefix_ident::bit_reverse_cuda( + input.as_ptr(), input.len() as u64, cfg as* const BitReverseConfig, output.as_mut_ptr(), ) + .wrap() + } } fn bit_reverse_inplace( input: &mut (impl HostOrDeviceSlice<$field> + ?Sized), cfg: &BitReverseConfig, - ) -> IcicleResult<()> { - unsafe { - $field_prefix_ident::bit_reverse_cuda( - input.as_ptr(), - input.len() as u64, - cfg as *const BitReverseConfig, - input.as_mut_ptr(), - ) - .wrap() - } + ) -> IcicleResult<()> + { + unsafe + { + $field_prefix_ident::bit_reverse_cuda( + input.as_ptr(), input.len() as u64, cfg as* const BitReverseConfig, input.as_mut_ptr(), ) + .wrap() + } } } - }; + }; } #[macro_export] -macro_rules! impl_vec_add_tests { - ( - $field:ident - ) => { - #[test] - pub fn test_vec_add_scalars() { - check_vec_ops_scalars::<$field>(); - } +macro_rules !impl_vec_add_tests +{ + ($field + : ident) = > { +#[test] + pub fn test_vec_add_scalars(){check_vec_ops_scalars::<$field>(); +} - #[test] - pub fn test_bit_reverse() { - check_bit_reverse::<$field>() - } - #[test] - pub fn test_bit_reverse_inplace() { - check_bit_reverse_inplace::<$field>() - } - }; +#[test] +pub fn test_bit_reverse(){check_bit_reverse::<$field>()} +#[test] +pub fn test_bit_reverse_inplace() +{ + check_bit_reverse_inplace::<$field>() +} +} +; } From 993652ae8dfaa28882d748827205cb462b7c87f1 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Wed, 31 Jul 2024 13:47:45 +0000 Subject: [PATCH 05/21] Formatting for rust and golang. --- icicle/include/vec_ops/vec_ops.cuh | 3 +- wrappers/golang/core/vec_ops.go | 141 ++++++------ wrappers/golang/core/vec_ops_test.go | 33 +-- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 212 ++++++++++--------- 4 files changed, 209 insertions(+), 180 deletions(-) diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh index be2ab4ba3..6d1ad7987 100644 --- a/icicle/include/vec_ops/vec_ops.cuh +++ b/icicle/include/vec_ops/vec_ops.cuh @@ -27,8 +27,7 @@ namespace vec_ops { * non-blocking and you'd need to synchronize it explicitly by running * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the * function will block the current CPU thread. */ - bool - is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ + bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ }; /** diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go index d589056c4..8bef10b63 100644 --- a/wrappers/golang/core/vec_ops.go +++ b/wrappers/golang/core/vec_ops.go @@ -1,29 +1,35 @@ package core -import("fmt" - "unsafe" +import ( + "fmt" + "unsafe" - cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime") + cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime" +) - type VecOps int +type VecOps int - const(Sub VecOps = iota Add Mul) +const ( + Sub VecOps = iota + Add + Mul +) - type VecOpsConfig struct { - /*Details related to the device such as its id and stream. */ - Ctx cr.DeviceContext - /* True if `a` is on device and false if it is not. Default value: false. */ - isAOnDevice bool - /* True if `b` is on device and false if it is not. Default value: false. */ - isBOnDevice bool - /* If true, output is preserved on device, otherwise on host. Default value: false. */ - isResultOnDevice bool - /* Whether to run the vector operations asynchronously. If set to `true`, the function will be - * non-blocking and you'll need to synchronize it explicitly by calling - * `SynchronizeStream`. If set to false, the function will block the current CPU thread. */ - IsAsync bool - /* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ - IsInMontgomeryForm bool +type VecOpsConfig struct { + /*Details related to the device such as its id and stream. */ + Ctx cr.DeviceContext + /* True if `a` is on device and false if it is not. Default value: false. */ + isAOnDevice bool + /* True if `b` is on device and false if it is not. Default value: false. */ + isBOnDevice bool + /* If true, output is preserved on device, otherwise on host. Default value: false. */ + isResultOnDevice bool + /* Whether to run the vector operations asynchronously. If set to `true`, the function will be + * non-blocking and you'll need to synchronize it explicitly by calling + * `SynchronizeStream`. If set to false, the function will block the current CPU thread. */ + IsAsync bool + /* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ + IsInMontgomeryForm bool } /** @@ -44,51 +50,64 @@ func DefaultVecOpsConfig() VecOpsConfig { return config } -func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, int) -{ - aLen, bLen, outLen : = a.Len(), b.Len(), out.Len() if aLen != bLen - { - errorString: - = fmt.Sprintf("a and b vector lengths %d; %d are not equal", aLen, bLen, ) panic(errorString) - } - if aLen - != outLen - { - errorString: - = fmt.Sprintf("a and out vector lengths %d; %d are not equal", aLen, outLen, ) panic(errorString) - } +func VecOpCheck(a, b, out HostOrDeviceSlice, cfg *VecOpsConfig) (unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, unsafe.Pointer, int) { + aLen, bLen, outLen := a.Len(), b.Len(), out.Len() + if aLen != bLen { + errorString := fmt.Sprintf( + "a and b vector lengths %d; %d are not equal", + aLen, + bLen, + ) + panic(errorString) + } + if aLen != outLen { + errorString := fmt.Sprintf( + "a and out vector lengths %d; %d are not equal", + aLen, + outLen, + ) + panic(errorString) + } - if a - .IsOnDevice() { a.(DeviceSlice).CheckDevice() } - if b - .IsOnDevice() { b.(DeviceSlice).CheckDevice() } - if out - .IsOnDevice(){out.(DeviceSlice).CheckDevice()} + if a.IsOnDevice() { + a.(DeviceSlice).CheckDevice() + } + if b.IsOnDevice() { + b.(DeviceSlice).CheckDevice() + } + if out.IsOnDevice() { + out.(DeviceSlice).CheckDevice() + } - cfg.isAOnDevice = a.IsOnDevice() cfg.isBOnDevice = b.IsOnDevice() cfg.isResultOnDevice = - out - .IsOnDevice() + cfg.isAOnDevice = a.IsOnDevice() + cfg.isBOnDevice = b.IsOnDevice() + cfg.isResultOnDevice = out.IsOnDevice() - return a.AsUnsafePointer(), - b.AsUnsafePointer(), out.AsUnsafePointer(), unsafe.Pointer(cfg), a.Len() + return a.AsUnsafePointer(), b.AsUnsafePointer(), out.AsUnsafePointer(), unsafe.Pointer(cfg), a.Len() } -func TransposeCheck(in, out HostOrDeviceSlice, onDevice bool) -{ - inLen, outLen : = in.Len(), - out.Len() +func TransposeCheck(in, out HostOrDeviceSlice, onDevice bool) { + inLen, outLen := in.Len(), out.Len() - if inLen != outLen - { - errorString: - = fmt.Sprintf("in and out vector lengths %d; %d are not equal", inLen, outLen, ) panic(errorString) - } - if (onDevice != in.IsOnDevice()) || (onDevice != out.IsOnDevice()) - { - errorString: - = fmt.Sprintf( - "onDevice is set to %t, but in.IsOnDevice():%t and out.IsOnDevice():%t", onDevice, in.IsOnDevice(), - out.IsOnDevice(), ) panic(errorString) - } - if onDevice { in.(DeviceSlice).CheckDevice() out.(DeviceSlice).CheckDevice() } + if inLen != outLen { + errorString := fmt.Sprintf( + "in and out vector lengths %d; %d are not equal", + inLen, + outLen, + ) + panic(errorString) + } + if (onDevice != in.IsOnDevice()) || (onDevice != out.IsOnDevice()) { + errorString := fmt.Sprintf( + "onDevice is set to %t, but in.IsOnDevice():%t and out.IsOnDevice():%t", + onDevice, + in.IsOnDevice(), + out.IsOnDevice(), + ) + panic(errorString) + } + if onDevice { + in.(DeviceSlice).CheckDevice() + out.(DeviceSlice).CheckDevice() + } } diff --git a/wrappers/golang/core/vec_ops_test.go b/wrappers/golang/core/vec_ops_test.go index 989a0a810..9ac5bb480 100644 --- a/wrappers/golang/core/vec_ops_test.go +++ b/wrappers/golang/core/vec_ops_test.go @@ -1,23 +1,24 @@ package core - import("testing" +import ( + "testing" - cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime" - "github.com/stretchr/testify/assert") + cr "github.com/ingonyama-zk/icicle/v2/wrappers/golang/cuda_runtime" + "github.com/stretchr/testify/assert" +) - func TestVecOpsDefaultConfig(t* testing.T) -{ - ctx, _ : = cr.GetDefaultDeviceContext() expected : = - VecOpsConfig{ - ctx, // Ctx - false, // isAOnDevice - false, // isBOnDevice - false, // isResultOnDevice - false, // IsAsync - false, // IsInMontgomeryForm - } +func TestVecOpsDefaultConfig(t *testing.T) { + ctx, _ := cr.GetDefaultDeviceContext() + expected := VecOpsConfig{ + ctx, // Ctx + false, // isAOnDevice + false, // isBOnDevice + false, // isResultOnDevice + false, // IsAsync + false, // IsInMontgomeryForm + } - actual : = DefaultVecOpsConfig() + actual := DefaultVecOpsConfig() - assert.Equal(t, expected, actual) + assert.Equal(t, expected, actual) } diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index efe777160..d16d5c4b9 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -1,7 +1,7 @@ use icicle_cuda_runtime::device::check_device; use icicle_cuda_runtime::{ - device_context::{DeviceContext, DEFAULT_DEVICE_ID}, - memory::HostOrDeviceSlice, + device_context::{DeviceContext, DEFAULT_DEVICE_ID}, + memory::HostOrDeviceSlice, }; use crate::{error::IcicleResult, traits::FieldImpl}; @@ -25,24 +25,22 @@ pub struct VecOpsConfig<'a> { } impl<'a> Default for VecOpsConfig<'a> { - fn default()->Self { Self::default_for_device(DEFAULT_DEVICE_ID) } + fn default() -> Self { + Self::default_for_device(DEFAULT_DEVICE_ID) + } } -impl<'a> VecOpsConfig<' a> -{ - pub fn default_for_device(device_id : usize) -> Self - { - VecOpsConfig - { - ctx: - DeviceContext::default_for_device(device_id), - is_a_on_device : false, - is_b_on_device : false, - is_result_on_device : false, - is_async : false, - is_in_montgomery_form : false, +impl<'a> VecOpsConfig<'a> { + pub fn default_for_device(device_id: usize) -> Self { + VecOpsConfig { + ctx: DeviceContext::default_for_device(device_id), + is_a_on_device: false, + is_b_on_device: false, + is_result_on_device: false, + is_async: false, + is_in_montgomery_form: false, + } } - } } #[repr(C)] @@ -60,30 +58,27 @@ pub struct BitReverseConfig<'a> { /// Whether to run the vector operations asynchronously. If set to `true`, the functions will be non-blocking and you'd need to synchronize /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread. pub is_async: bool, - /// If true then vec_a, vec_b and result are in montgomery form. Default value: false. - pub is_in_montgomery_form: bool, } impl<'a> Default for BitReverseConfig<'a> { - fn default()->Self { Self::default_for_device(DEFAULT_DEVICE_ID) } + fn default() -> Self { + Self::default_for_device(DEFAULT_DEVICE_ID) + } } -impl<'a> BitReverseConfig<' a> -{ - pub fn default_for_device(device_id : usize) -> Self - { - BitReverseConfig - { - ctx: - DeviceContext::default_for_device(device_id), - is_input_on_device : false, is_output_on_device : false, is_async : false, is_in_montgomery_form : false, +impl<'a> BitReverseConfig<'a> { + pub fn default_for_device(device_id: usize) -> Self { + BitReverseConfig { + ctx: DeviceContext::default_for_device(device_id), + is_input_on_device: false, + is_output_on_device: false, + is_async: false, + } } - } } #[doc(hidden)] -pub trait VecOps -{ +pub trait VecOps { fn add( a: &(impl HostOrDeviceSlice + ?Sized), b: &(impl HostOrDeviceSlice + ?Sized), @@ -119,7 +114,6 @@ pub trait VecOps ctx: &DeviceContext, on_device: bool, is_async: bool, - is_in_montgomery_form: bool, ) -> IcicleResult<()>; fn bit_reverse( @@ -140,61 +134,72 @@ fn check_vec_ops_args<'a, F>( result: &(impl HostOrDeviceSlice + ?Sized), cfg: &VecOpsConfig<'a>, ) -> VecOpsConfig<'a> { - if a.len() != b.len() || a.len() != result.len() -{ - panic !("left, right and output lengths {}; {}; {} do not match", a.len(), b.len(), result.len()); -} -let ctx_device_id = cfg.ctx.device_id; -if let - Some(device_id) = a.device_id() - { - assert_eq !(device_id, ctx_device_id, "Device ids in a and context are different"); - } -if let - Some(device_id) = b.device_id() - { - assert_eq !(device_id, ctx_device_id, "Device ids in b and context are different"); - } -if let - Some(device_id) = result.device_id() - { - assert_eq !(device_id, ctx_device_id, "Device ids in result and context are different"); - } -check_device(ctx_device_id); - -let mut res_cfg = cfg.clone(); -res_cfg.is_a_on_device = a.is_on_device(); -res_cfg.is_b_on_device = b.is_on_device(); -res_cfg.is_result_on_device = result.is_on_device(); -res_cfg + if a.len() != b.len() || a.len() != result.len() { + panic!( + "left, right and output lengths {}; {}; {} do not match", + a.len(), + b.len(), + result.len() + ); + } + let ctx_device_id = cfg + .ctx + .device_id; + if let Some(device_id) = a.device_id() { + assert_eq!(device_id, ctx_device_id, "Device ids in a and context are different"); + } + if let Some(device_id) = b.device_id() { + assert_eq!(device_id, ctx_device_id, "Device ids in b and context are different"); + } + if let Some(device_id) = result.device_id() { + assert_eq!( + device_id, ctx_device_id, + "Device ids in result and context are different" + ); + } + check_device(ctx_device_id); + + let mut res_cfg = cfg.clone(); + res_cfg.is_a_on_device = a.is_on_device(); + res_cfg.is_b_on_device = b.is_on_device(); + res_cfg.is_result_on_device = result.is_on_device(); + res_cfg } fn check_bit_reverse_args<'a, F>( input: &(impl HostOrDeviceSlice + ?Sized), cfg: &BitReverseConfig<'a>, output: &(impl HostOrDeviceSlice + ?Sized), ) -> BitReverseConfig<'a> { - if input.len() & (input.len() - 1) != 0 -{ - panic !("input length must be a power of 2, input length: {}", input.len()); -} -if input - .len() != output.len() { panic !("input and output lengths {}; {} do not match", input.len(), output.len()); } -let ctx_device_id = cfg.ctx.device_id; -if let - Some(device_id) = input.device_id() - { - assert_eq !(device_id, ctx_device_id, "Device ids in input and context are different"); - } -if let - Some(device_id) = output.device_id() - { - assert_eq !(device_id, ctx_device_id, "Device ids in output and context are different"); - } -check_device(ctx_device_id); -let mut res_cfg = cfg.clone(); -res_cfg.is_input_on_device = input.is_on_device(); -res_cfg.is_output_on_device = output.is_on_device(); -res_cfg + if input.len() & (input.len() - 1) != 0 { + panic!("input length must be a power of 2, input length: {}", input.len()); + } + if input.len() != output.len() { + panic!( + "input and output lengths {}; {} do not match", + input.len(), + output.len() + ); + } + let ctx_device_id = cfg + .ctx + .device_id; + if let Some(device_id) = input.device_id() { + assert_eq!( + device_id, ctx_device_id, + "Device ids in input and context are different" + ); + } + if let Some(device_id) = output.device_id() { + assert_eq!( + device_id, ctx_device_id, + "Device ids in output and context are different" + ); + } + check_device(ctx_device_id); + let mut res_cfg = cfg.clone(); + res_cfg.is_input_on_device = input.is_on_device(); + res_cfg.is_output_on_device = output.is_on_device(); + res_cfg } pub fn add_scalars( @@ -207,8 +212,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_vec_ops_args(a, b, result, cfg); - << F as FieldImpl > ::Config as VecOps < F >> ::add(a, b, result, &cfg) + let cfg = check_vec_ops_args(a, b, result, cfg); + <::Config as VecOps>::add(a, b, result, &cfg) } pub fn accumulate_scalars( @@ -220,8 +225,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_vec_ops_args(a, b, a, cfg); - << F as FieldImpl > ::Config as VecOps < F >> ::accumulate(a, b, &cfg) + let cfg = check_vec_ops_args(a, b, a, cfg); + <::Config as VecOps>::accumulate(a, b, &cfg) } pub fn sub_scalars( @@ -234,8 +239,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_vec_ops_args(a, b, result, cfg); - << F as FieldImpl > ::Config as VecOps < F >> ::sub(a, b, result, &cfg) + let cfg = check_vec_ops_args(a, b, result, cfg); + <::Config as VecOps>::sub(a, b, result, &cfg) } pub fn mul_scalars( @@ -248,8 +253,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_vec_ops_args(a, b, result, cfg); - << F as FieldImpl > ::Config as VecOps < F >> ::mul(a, b, result, &cfg) + let cfg = check_vec_ops_args(a, b, result, cfg); + <::Config as VecOps>::mul(a, b, result, &cfg) } pub fn transpose_matrix( @@ -260,13 +265,20 @@ pub fn transpose_matrix( ctx: &DeviceContext, on_device: bool, is_async: bool, - is_in_montgomery_form: bool, ) -> IcicleResult<()> where F: FieldImpl, ::Config: VecOps, { - <::Config as VecOps>::transpose(input, row_size, column_size, output, ctx, on_device, is_async, is_in_montgomery_form) + <::Config as VecOps>::transpose( + input, + row_size, + column_size, + output, + ctx, + on_device, + is_async, + ) } pub fn bit_reverse( @@ -278,8 +290,8 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_bit_reverse_args(input, cfg, output); - << F as FieldImpl > ::Config as VecOps < F >> ::bit_reverse(input, &cfg, output) + let cfg = check_bit_reverse_args(input, cfg, output); + <::Config as VecOps>::bit_reverse(input, &cfg, output) } pub fn bit_reverse_inplace( @@ -290,14 +302,14 @@ where F: FieldImpl, ::Config: VecOps, { - let cfg = check_bit_reverse_args(input, cfg, input); - << F as FieldImpl > ::Config as VecOps < F >> ::bit_reverse_inplace(input, &cfg) + let cfg = check_bit_reverse_args(input, cfg, input); + <::Config as VecOps>::bit_reverse_inplace(input, &cfg) } #[macro_export] macro_rules !impl_vec_ops_field { - ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) = > + ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) => { mod $field_prefix_ident { @@ -349,7 +361,6 @@ macro_rules !impl_vec_ops_field : *mut $field, ctx : * const DeviceContext, on_device : bool, is_async - : bool, is_in_montgomery_form : bool, ) ->CudaError; @@ -432,14 +443,13 @@ macro_rules !impl_vec_ops_field ctx: &DeviceContext, on_device: bool, is_async: bool, - is_in_montgomery_form: bool, ) -> IcicleResult<()> { unsafe { $field_prefix_ident::transpose_cuda( input.as_ptr(), row_size, column_size, output.as_mut_ptr(), ctx as* const _ as* const DeviceContext, - on_device, is_async, is_in_montgomery_form, ) + on_device, is_async) .wrap() } } @@ -478,7 +488,7 @@ macro_rules !impl_vec_ops_field macro_rules !impl_vec_add_tests { ($field - : ident) = > { + : ident) => { #[test] pub fn test_vec_add_scalars(){check_vec_ops_scalars::<$field>(); } From 117ee4c4a4707d5f577a26e2e67366a5d3e40c29 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Wed, 31 Jul 2024 14:24:25 +0000 Subject: [PATCH 06/21] Fix formatting. --- icicle/include/vec_ops/vec_ops.cuh | 3 +- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 307 ++++++++++--------- 2 files changed, 159 insertions(+), 151 deletions(-) diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh index 6d1ad7987..2a239689a 100644 --- a/icicle/include/vec_ops/vec_ops.cuh +++ b/icicle/include/vec_ops/vec_ops.cuh @@ -27,7 +27,8 @@ namespace vec_ops { * non-blocking and you'd need to synchronize it explicitly by running * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the * function will block the current CPU thread. */ - bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ + bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. + * Default value: false. */ }; /** diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index d16d5c4b9..20152beb4 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -270,15 +270,7 @@ where F: FieldImpl, ::Config: VecOps, { - <::Config as VecOps>::transpose( - input, - row_size, - column_size, - output, - ctx, - on_device, - is_async, - ) + <::Config as VecOps>::transpose(input, row_size, column_size, output, ctx, on_device, is_async) } pub fn bit_reverse( @@ -307,102 +299,103 @@ where } #[macro_export] -macro_rules !impl_vec_ops_field -{ - ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) => - { - mod $field_prefix_ident - { - use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice}; - use icicle_core::vec_ops::BitReverseConfig; - use icicle_core::vec_ops::VecOpsConfig; - - extern "C" { -#[link_name = concat !($field_prefix, "_add_cuda")] - pub(crate) fn add_scalars_cuda(a - : * const $field, b - : * const $field, size - : u32, cfg - : * const VecOpsConfig, result - : *mut $field, ) - ->CudaError; - -#[link_name = concat !($field_prefix, "_accumulate_cuda")] - pub(crate) fn accumulate_scalars_cuda(a - : *mut $field, b - : * const $field, size - : u32, cfg - : * const VecOpsConfig, ) - ->CudaError; - -#[link_name = concat !($field_prefix, "_sub_cuda")] - pub(crate) fn sub_scalars_cuda(a - : * const $field, b - : * const $field, size - : u32, cfg - : * const VecOpsConfig, result - : *mut $field, ) - ->CudaError; - -#[link_name = concat !($field_prefix, "_mul_cuda")] - pub(crate) fn mul_scalars_cuda(a - : * const $field, b - : * const $field, size - : u32, cfg - : * const VecOpsConfig, result - : *mut $field, ) - ->CudaError; - -#[link_name = concat !($field_prefix, "_transpose_matrix_cuda")] - pub(crate) fn transpose_cuda(input - : * const $field, row_size - : u32, column_size - : u32, output - : *mut $field, ctx - : * const DeviceContext, on_device - : bool, is_async - : bool, ) - ->CudaError; - -#[link_name = concat !($field_prefix, "_bit_reverse_cuda")] - pub(crate) fn bit_reverse_cuda(input - : * const $field, size - : u64, config - : * const BitReverseConfig, output - : *mut $field, ) - ->CudaError; - } - } +macro_rules! impl_vec_ops_field { + ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) => { + mod $field_prefix_ident { + use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice}; + use icicle_core::vec_ops::BitReverseConfig; + use icicle_core::vec_ops::VecOpsConfig; + + extern "C" { + #[link_name = concat !($field_prefix, "_add_cuda")] + pub(crate) fn add_scalars_cuda( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> CudaError; + + #[link_name = concat !($field_prefix, "_accumulate_cuda")] + pub(crate) fn accumulate_scalars_cuda( + a: *mut $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + ) -> CudaError; + + #[link_name = concat !($field_prefix, "_sub_cuda")] + pub(crate) fn sub_scalars_cuda( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> CudaError; + + #[link_name = concat !($field_prefix, "_mul_cuda")] + pub(crate) fn mul_scalars_cuda( + a: *const $field, + b: *const $field, + size: u32, + cfg: *const VecOpsConfig, + result: *mut $field, + ) -> CudaError; + + #[link_name = concat !($field_prefix, "_transpose_matrix_cuda")] + pub(crate) fn transpose_cuda( + input: *const $field, + row_size: u32, + column_size: u32, + output: *mut $field, + ctx: *const DeviceContext, + on_device: bool, + is_async: bool, + ) -> CudaError; + + #[link_name = concat !($field_prefix, "_bit_reverse_cuda")] + pub(crate) fn bit_reverse_cuda( + input: *const $field, + size: u64, + config: *const BitReverseConfig, + output: *mut $field, + ) -> CudaError; + } + } - impl VecOps<$field> for $field_config - { + impl VecOps<$field> for $field_config { fn add( a: &(impl HostOrDeviceSlice<$field> + ?Sized), b: &(impl HostOrDeviceSlice<$field> + ?Sized), result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), cfg: &VecOpsConfig, - ) -> IcicleResult<()> - { - unsafe - { - $field_prefix_ident::add_scalars_cuda( - a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), ) - .wrap() - } + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::add_scalars_cuda( + a.as_ptr(), + b.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } } fn accumulate( a: &mut (impl HostOrDeviceSlice<$field> + ?Sized), b: &(impl HostOrDeviceSlice<$field> + ?Sized), cfg: &VecOpsConfig, - ) -> IcicleResult<()> - { - unsafe - { - $field_prefix_ident::accumulate_scalars_cuda( - a.as_mut_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, ) - .wrap() - } + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::accumulate_scalars_cuda( + a.as_mut_ptr(), + b.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + ) + .wrap() + } } fn sub( @@ -410,14 +403,17 @@ macro_rules !impl_vec_ops_field b: &(impl HostOrDeviceSlice<$field> + ?Sized), result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), cfg: &VecOpsConfig, - ) -> IcicleResult<()> - { - unsafe - { - $field_prefix_ident::sub_scalars_cuda( - a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), ) - .wrap() - } + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::sub_scalars_cuda( + a.as_ptr(), + b.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } } fn mul( @@ -425,14 +421,17 @@ macro_rules !impl_vec_ops_field b: &(impl HostOrDeviceSlice<$field> + ?Sized), result: &mut (impl HostOrDeviceSlice<$field> + ?Sized), cfg: &VecOpsConfig, - ) -> IcicleResult<()> - { - unsafe - { - $field_prefix_ident::mul_scalars_cuda( - a.as_ptr(), b.as_ptr(), a.len() as u32, cfg as* const VecOpsConfig, result.as_mut_ptr(), ) - .wrap() - } + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::mul_scalars_cuda( + a.as_ptr(), + b.as_ptr(), + a.len() as u32, + cfg as *const VecOpsConfig, + result.as_mut_ptr(), + ) + .wrap() + } } fn transpose( @@ -443,63 +442,71 @@ macro_rules !impl_vec_ops_field ctx: &DeviceContext, on_device: bool, is_async: bool, - ) -> IcicleResult<()> - { - unsafe - { - $field_prefix_ident::transpose_cuda( - input.as_ptr(), row_size, column_size, output.as_mut_ptr(), ctx as* const _ as* const DeviceContext, - on_device, is_async) - .wrap() - } + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::transpose_cuda( + input.as_ptr(), + row_size, + column_size, + output.as_mut_ptr(), + ctx as *const _ as *const DeviceContext, + on_device, + is_async, + ) + .wrap() + } } fn bit_reverse( input: &(impl HostOrDeviceSlice<$field> + ?Sized), cfg: &BitReverseConfig, output: &mut (impl HostOrDeviceSlice<$field> + ?Sized), - ) -> IcicleResult<()> - { - unsafe - { - $field_prefix_ident::bit_reverse_cuda( - input.as_ptr(), input.len() as u64, cfg as* const BitReverseConfig, output.as_mut_ptr(), ) - .wrap() - } + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::bit_reverse_cuda( + input.as_ptr(), + input.len() as u64, + cfg as *const BitReverseConfig, + output.as_mut_ptr(), + ) + .wrap() + } } fn bit_reverse_inplace( input: &mut (impl HostOrDeviceSlice<$field> + ?Sized), cfg: &BitReverseConfig, - ) -> IcicleResult<()> - { - unsafe - { - $field_prefix_ident::bit_reverse_cuda( - input.as_ptr(), input.len() as u64, cfg as* const BitReverseConfig, input.as_mut_ptr(), ) - .wrap() - } + ) -> IcicleResult<()> { + unsafe { + $field_prefix_ident::bit_reverse_cuda( + input.as_ptr(), + input.len() as u64, + cfg as *const BitReverseConfig, + input.as_mut_ptr(), + ) + .wrap() + } } } - }; + }; } #[macro_export] -macro_rules !impl_vec_add_tests -{ - ($field +macro_rules! impl_vec_add_tests { + ($field : ident) => { -#[test] - pub fn test_vec_add_scalars(){check_vec_ops_scalars::<$field>(); -} + #[test] + pub fn test_vec_add_scalars() { + check_vec_ops_scalars::<$field>(); + } -#[test] -pub fn test_bit_reverse(){check_bit_reverse::<$field>()} -#[test] -pub fn test_bit_reverse_inplace() -{ - check_bit_reverse_inplace::<$field>() -} -} -; + #[test] + pub fn test_bit_reverse() { + check_bit_reverse::<$field>() + } + #[test] + pub fn test_bit_reverse_inplace() { + check_bit_reverse_inplace::<$field>() + } + }; } From b10ee73b5ebf0e3b206897acc0a222b5177592a4 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Sun, 4 Aug 2024 06:19:10 +0000 Subject: [PATCH 07/21] Remove ## Best-Practices section --- examples/c++/mont_vec_ops/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/c++/mont_vec_ops/README.md b/examples/c++/mont_vec_ops/README.md index 736303872..9e309dc6c 100644 --- a/examples/c++/mont_vec_ops/README.md +++ b/examples/c++/mont_vec_ops/README.md @@ -5,9 +5,6 @@ is_result_on_device is_in_montgomery_form (is_async isn't checked) -## Best-Practices - -We recommend to run our examples in [ZK-containers](../../ZK-containers.md) to save your time and mental energy. ## Key-Takeaway From 1fc640c8b342b2ad5b44b08fc85b9ffaf9517e95 Mon Sep 17 00:00:00 2001 From: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:51:00 +0300 Subject: [PATCH 08/21] Apply suggestions from code review --- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 24 ++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 20152beb4..b090e4c9a 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -300,14 +300,19 @@ where #[macro_export] macro_rules! impl_vec_ops_field { - ($field_prefix : literal, $field_prefix_ident : ident, $field : ident, $field_config : ident) => { + ( + $field_prefix:literal, + $field_prefix_ident:ident, + $field:ident, + $field_config:ident + ) => { mod $field_prefix_ident { use crate::vec_ops::{$field, CudaError, DeviceContext, HostOrDeviceSlice}; use icicle_core::vec_ops::BitReverseConfig; use icicle_core::vec_ops::VecOpsConfig; extern "C" { - #[link_name = concat !($field_prefix, "_add_cuda")] + #[link_name = concat!($field_prefix, "_add_cuda")] pub(crate) fn add_scalars_cuda( a: *const $field, b: *const $field, @@ -316,7 +321,7 @@ macro_rules! impl_vec_ops_field { result: *mut $field, ) -> CudaError; - #[link_name = concat !($field_prefix, "_accumulate_cuda")] + #[link_name = concat!($field_prefix, "_accumulate_cuda")] pub(crate) fn accumulate_scalars_cuda( a: *mut $field, b: *const $field, @@ -324,7 +329,7 @@ macro_rules! impl_vec_ops_field { cfg: *const VecOpsConfig, ) -> CudaError; - #[link_name = concat !($field_prefix, "_sub_cuda")] + #[link_name = concat!($field_prefix, "_sub_cuda")] pub(crate) fn sub_scalars_cuda( a: *const $field, b: *const $field, @@ -333,7 +338,7 @@ macro_rules! impl_vec_ops_field { result: *mut $field, ) -> CudaError; - #[link_name = concat !($field_prefix, "_mul_cuda")] + #[link_name = concat!($field_prefix, "_mul_cuda")] pub(crate) fn mul_scalars_cuda( a: *const $field, b: *const $field, @@ -342,7 +347,7 @@ macro_rules! impl_vec_ops_field { result: *mut $field, ) -> CudaError; - #[link_name = concat !($field_prefix, "_transpose_matrix_cuda")] + #[link_name = concat!($field_prefix, "_transpose_matrix_cuda")] pub(crate) fn transpose_cuda( input: *const $field, row_size: u32, @@ -353,7 +358,7 @@ macro_rules! impl_vec_ops_field { is_async: bool, ) -> CudaError; - #[link_name = concat !($field_prefix, "_bit_reverse_cuda")] + #[link_name = concat!($field_prefix, "_bit_reverse_cuda")] pub(crate) fn bit_reverse_cuda( input: *const $field, size: u64, @@ -493,8 +498,9 @@ macro_rules! impl_vec_ops_field { #[macro_export] macro_rules! impl_vec_add_tests { - ($field - : ident) => { + ( + $field:ident + ) => { #[test] pub fn test_vec_add_scalars() { check_vec_ops_scalars::<$field>(); From 080071870fe0f2dac0fa1e2d19d22498047d14da Mon Sep 17 00:00:00 2001 From: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:52:55 +0300 Subject: [PATCH 09/21] Update wrappers/rust/icicle-core/src/vec_ops/mod.rs --- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index b090e4c9a..efb2cab53 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -499,7 +499,7 @@ macro_rules! impl_vec_ops_field { #[macro_export] macro_rules! impl_vec_add_tests { ( - $field:ident + $field:ident ) => { #[test] pub fn test_vec_add_scalars() { From 58354e43875ab91b73fc28b092473b569c8bb2a2 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Mon, 12 Aug 2024 14:08:13 +0000 Subject: [PATCH 10/21] WIP on illegal memory access bug. --- examples/c++/mont_vec_ops/example.cu | 501 ++++++++++++++++----------- icicle/include/vec_ops/vec_ops.cuh | 7 +- icicle/src/vec_ops/vec_ops.cu | 37 +- 3 files changed, 325 insertions(+), 220 deletions(-) diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index 8cf21e1fc..a1fe871be 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -44,41 +44,12 @@ int vector_op( } return 0; } -int vector_mul( - T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) -{ - cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); - if (err != cudaSuccess) { - std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; - return 0; - } - return 0; -} -int vector_add( - T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) -{ - cudaError_t err = bn254_add_cuda(vec_a, vec_b, n_elements, config, vec_result); - if (err != cudaSuccess) { - std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; - return 0; - } - return 0; -} -int vector_sub( - T* vec_a, T* vec_b, T* vec_result, size_t n_elements, device_context::DeviceContext ctx, vec_ops::VecOpsConfig config) -{ - cudaError_t err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); - if (err != cudaSuccess) { - std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; - return 0; - } - return 0; -} int main(int argc, char** argv) { - const unsigned vector_size = 1 << 0; - const unsigned repetitions = 1 << 0; + const unsigned vector_size = 1 << 11; + const unsigned not_in_place_repetitions = 1 << 0; // Repetitions are used only for the non in-place tests. + const unsigned in_place_repetitions = 1; // Repetitions for in-place tests should be 1. Don't check it. cudaError_t err; nvmlInit(); @@ -94,9 +65,10 @@ int main(int argc, char** argv) unsigned power_limit; nvmlDeviceGetPowerManagementLimit(device, &power_limit); - std::cout << "Vector size: " << vector_size << std::endl; - std::cout << "Repetitions: " << repetitions << std::endl; - std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl; + std::cout << "Vector size: " << vector_size << std::endl; + std::cout << "not_in_place_repetitions: " << not_in_place_repetitions << std::endl; + std::cout << "in_place_repetitions: " << in_place_repetitions << std::endl; + std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl; unsigned int baseline_power; nvmlDeviceGetPowerUsage(device, &baseline_power); @@ -111,21 +83,27 @@ int main(int argc, char** argv) // host data std::cout << "Allocate memory for the input vectors (both normal and Montgomery presentation)" << std::endl; T* host_in1_init = (T*)malloc(vector_size * sizeof(T)); + std::cout << "example malloc host_in1_init" << std::endl; T* host_in2_init = (T*)malloc(vector_size * sizeof(T)); + std::cout << "example malloc host_in2_init" << std::endl; std::cout << "Initializing vectors with normal presentation random data" << std::endl; T::rand_host_many(host_in1_init, vector_size); T::rand_host_many(host_in2_init, vector_size); std::cout << "Allocate memory for the output vectors" << std::endl; T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output. + std::cout << "example malloc host_out" << std::endl; T* host_out_ref_mul = (T*)malloc( vector_size * sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content). + std::cout << "example malloc host_out_ref_mul" << std::endl; T* host_out_ref_add = (T*)malloc( vector_size * sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content). + std::cout << "example malloc host_out_ref_add" << std::endl; T* host_out_ref_sub = (T*)malloc( vector_size * sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content). + std::cout << "malloc host_out_ref_sub" << std::endl; std::cout << "Initializing output vectors with random data" << std::endl; T::rand_host_many(host_out, vector_size); T::rand_host_many(host_out_ref_mul, vector_size); @@ -138,22 +116,27 @@ int main(int argc, char** argv) T* device_out; err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T)); + std::cout << "example cudaMalloc device_in1" << std::endl; if (err != cudaSuccess) { std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; return 0; } err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T)); + std::cout << "example cudaMalloc device_in2" << std::endl; if (err != cudaSuccess) { std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; return 0; } err = cudaMalloc((void**)&device_out, vector_size * sizeof(T)); + std::cout << "example cudaMalloc device_out" << std::endl; if (err != cudaSuccess) { std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; return 0; } vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig(); + int nof_of_configs_for_test = 5; + int nof_of_storage_configs = 3; // 2 inputs, 1 result. //**************************************** // Test warn-up and reference output config. Reference output to be used to check if test passed or not. @@ -172,28 +155,25 @@ int main(int argc, char** argv) std::cout << "Starting warm-up run" << std::endl; // Warm-up loop for (int op = MUL; op != LAST; op++) { - for (int i = 0; i < repetitions; i++) { - // vector_mul(device_in1, device_in2, device_out, vector_size, ctx, config); + for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); - } - switch (op) { - case MUL: - err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; - case ADD: - err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; - case SUB: - err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; + switch (op) { + case MUL: + err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + case ADD: + err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + case SUB: + err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + } + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; + return 0; + } } } - // copy the result from device to host_out_ref_mul to keep it for later comparisons. - // err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; - return 0; - } //**************************************** // End of test warn-up and reference output config. //**************************************** @@ -214,30 +194,64 @@ int main(int argc, char** argv) //******************************************************* // Benchmark test: // Loop for (mul, add, sub): - // Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_in_montgomery_form): + // Loop (is_a_on_device, is_b_on_device, is_result_on_device, is_input_in_montgomery_form): //******************************************************* T* host_in1 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark. + std::cout << "example malloc host_in1" << std::endl; T* host_in2 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark. + std::cout << "example malloc host_in1" << std::endl; // Test when the result is not in-place + std::cout << "*****************************************" << std::endl; + std::cout << "*** Start not in-place benchmark loop ***" << std::endl; + std::cout << "*****************************************" << std::endl; for (int op = MUL; op != LAST; op++) { - // for (int config_idx = 0; config_idx < 0; config_idx++) { - for (int config_idx = 0; config_idx < 16; config_idx++) { + for (int config_idx = 0; config_idx < 32; config_idx++) { + // // DEBUG + // int num_gpus; + // size_t free, total; + // cudaGetDeviceCount( &num_gpus ); + // std::cout << "num_gpus " << num_gpus << std::endl; + // for ( int gpu_id = 0; gpu_id < num_gpus; gpu_id++ ) { + // cudaSetDevice( gpu_id ); + // int id; + // cudaGetDevice( &id ); + // cudaMemGetInfo( &free, &total ); + // std::cout << "GPU " << id << " memory: free=" << free << ", total=" << total << std::endl; + // } + // // DEBUG + switch (op) { + case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; break; + case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " not in-place" << std::endl; break; + case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " not in-place" << std::endl; break; + } std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl; + // Destroy the result of the prev loop. + T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out. + err = cudaMemcpy( + device_out, host_out, vector_size * sizeof(T), + cudaMemcpyHostToDevice); // Copy random data to device_out. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + T::rand_host_many(host_out, vector_size); // Make hist_out != device_out. + // Initializa inputs with the known data for (int i = 0; i < vector_size; i++) { host_in1[i] = host_in1_init[i]; host_in2[i] = host_in2_init[i]; } - config.is_a_on_device = (config_idx >> 3) & 0x1; - config.is_b_on_device = (config_idx >> 2) & 0x1; - config.is_result_on_device = (config_idx >> 1) & 0x1; - config.is_in_montgomery_form = (config_idx >> 0) & 0x1; + config.is_a_on_device = (config_idx >> 4) & 0x1; + config.is_b_on_device = (config_idx >> 3) & 0x1; + config.is_result_on_device = (config_idx >> 2) & 0x1; + config.is_input_in_montgomery_form = (config_idx >> 1) & 0x1; + config.is_result_in_montgomery_form = (config_idx >> 0) & 0x1; // Copy from host to device (copy again in order to be used later in the loop and device_inX was already // overwritten by warmup. if (config.is_a_on_device) { - if (config.is_in_montgomery_form) { + if (config.is_input_in_montgomery_form) { err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -246,6 +260,10 @@ int main(int argc, char** argv) } CHK_IF_RETURN( mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place. + // Destroy host_in1 value with values of host_in2. + for (int i = 0; i < vector_size; i++) { + host_in1[i] = host_in2_init[i]; + } } else { // Normal presentation. err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. @@ -255,7 +273,7 @@ int main(int argc, char** argv) } } } else { - if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + if (config.is_input_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -268,10 +286,17 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl; return 0; } + // Destroy device_in1 value with values of host_in2. + err = + cudaMemcpy(device_in1, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } } } if (config.is_b_on_device) { - if (config.is_in_montgomery_form) { + if (config.is_input_in_montgomery_form) { err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -280,6 +305,10 @@ int main(int argc, char** argv) } CHK_IF_RETURN( mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place. + // Destroy host_in2 value with values of host_in1. + for (int i = 0; i < vector_size; i++) { + host_in2[i] = host_in1_init[i]; + } } else { // Normal presentation. err = @@ -290,7 +319,7 @@ int main(int argc, char** argv) } } } else { - if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + if (config.is_input_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -303,38 +332,45 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl; return 0; } + // Destroy device_in2 valuewith values of host_in1. + err = + cudaMemcpy(device_in2, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_in2 - " << cudaGetErrorString(err) << std::endl; + return 0; + } } } CHK_IF_RETURN(cudaPeekAtLastError()); auto start_time = std::chrono::high_resolution_clock::now(); // Benchmark loop - for (int i = 0; i < repetitions; i++) { - switch (config_idx >> 1) { // {is_a_on_device, is_b_on_device, is_result_on_device} - case 0b000: - vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); - break; - case 0b001: - vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); - break; - case 0b010: - vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); - break; - case 0b011: - vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); - break; - case 0b100: - vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); - break; - case 0b101: - vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); - break; - case 0b110: - vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); - break; - case 0b111: - vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); - break; + for (int i = 0; i < not_in_place_repetitions; i++) { + switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: + vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); + break; + case 0b001: + vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); + break; + case 0b010: + vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); + break; + case 0b011: + vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); + break; + case 0b100: + vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); + break; + case 0b101: + vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); + break; + case 0b110: + vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); + break; + case 0b111: + vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); + break; } CHK_IF_RETURN(cudaPeekAtLastError()); } @@ -356,40 +392,40 @@ int main(int argc, char** argv) break; } - if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value. - if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed. + if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_XXX value. + if (config.is_result_in_montgomery_form) { // Convert to normal from montgomery if needed. CHK_IF_RETURN(mont::from_montgomery( device_out, vector_size, config.ctx.stream, - device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + device_out)); // Convert to normal in order to check vs. host_out_ref_XXX. } err = cudaMemcpy( host_out, device_out, vector_size * sizeof(T), - cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_XXX. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; return 0; } } else { // Data is not on device but it is in host_out. - if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and - // written back to host. Then compared vs. host_out_ref_mul. + if (config.is_result_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and + // written back to host. Then compared vs. host_out_ref_XXX. err = cudaMemcpy( device_out, host_out, vector_size * sizeof(T), - cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. + cudaMemcpyHostToDevice); // Copy to device_out in order to check later vs. host_out_ref_XXX. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl; return 0; } CHK_IF_RETURN(mont::from_montgomery( device_out, vector_size, config.ctx.stream, - device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + device_out)); // Convert to normal in order to check vs. host_out_ref_XXX. err = cudaMemcpy( host_out, device_out, vector_size * sizeof(T), - cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_XXX. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; return 0; } - } else { // host_out could be compared vs. host_out_ref_mul as is. + } else { // host_out could be compared vs. host_out_ref_XXX as is. } } //**************************************** @@ -397,14 +433,12 @@ int main(int argc, char** argv) //**************************************** //*********************************************** - // Test result check + // Test result check (not in-place) // Check is performed by executing the operation in a normal presentation - // (located in in host_out_ref_mul) and comparing it with the + // (located in in host_out_ref_XXX) and comparing it with the // benchmark test result. //*********************************************** int test_failed = 0; - // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl; - // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl; switch (op) { case MUL: for (int i = 0; i < vector_size; i++) { @@ -435,13 +469,18 @@ int main(int argc, char** argv) break; } if (test_failed) { - // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << - // std::endl; - std::cout << "===>>> result is not in-place: " << std::endl; - std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl; - std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl; - std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl; - std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl; + std::cout << "===>>> result is in-place: " << std::endl; + std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl; + std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl; + std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl; + std::cout << "===>>> is_input_in_montgomery_form: " << config.is_input_in_montgomery_form << std::endl; + std::cout << "===>>> is_input_resultin_montgomery_form: " << config.is_result_in_montgomery_form << std::endl; + std::cout << "===>>> host_in1_init[0] = " << host_in1_init[0] << std::endl; + std::cout << "===>>> host_in2_init[0] = " << host_in2_init[0] << std::endl; + std::cout << "===>>> host_out[0] = " << host_out[0] << std::endl; + std::cout << "===>>> warm-up: normal host_out_ref[0] (MUL) = " << host_out_ref_mul[0] << std::endl; + std::cout << "===>>> warm-up: normal host_out_ref[0] (ADD) = " << host_out_ref_add[0] << std::endl; + std::cout << "===>>> warm-up: normal host_out_ref[0] (SUB) = " << host_out_ref_sub[0] << std::endl; exit(2); } @@ -458,28 +497,50 @@ int main(int argc, char** argv) } // Report performance in GMPS: Giga Multiplications Per Second - double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count()); + double GMPS = 1.0e-9 * not_in_place_repetitions * vector_size / (1.0e-6 * duration.count()); std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl; } } // Test when the result is in-place + std::cout << "*************************************" << std::endl; + std::cout << "*** Start in-place benchmark loop ***" << std::endl; + std::cout << "*************************************" << std::endl; for (int op = MUL; op != LAST; op++) { - for (int config_idx = 0; config_idx < 16; config_idx++) { + for (int config_idx = 0; config_idx < 32; config_idx++) { + switch (op) { + case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " in-place" << std::endl; break; + case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " in-place" << std::endl; break; + case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " in-place" << std::endl; break; + } + std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl; + // Destroy the result of the prev loop. + T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out. + err = cudaMemcpy( + device_out, host_out, vector_size * sizeof(T), + cudaMemcpyHostToDevice); // Copy random data to device_out. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl; + return 0; + } + T::rand_host_many(host_out, vector_size); // Make hist_out != device_out. + // Initializa inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it should be initialized later in the repetions loop. for (int i = 0; i < vector_size; i++) { host_in1[i] = host_in1_init[i]; host_in2[i] = host_in2_init[i]; } - config.is_a_on_device = (config_idx >> 4) & 0x1; - config.is_b_on_device = (config_idx >> 3) & 0x1; - config.is_result_on_device = (config_idx >> 2) & 0x1; - config.is_in_montgomery_form = (config_idx >> 1) & 0x1; - if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; } + config.is_a_on_device = (config_idx >> 4) & 0x1; + config.is_b_on_device = (config_idx >> 3) & 0x1; + config.is_result_on_device = (config_idx >> 2) & 0x1; + config.is_input_in_montgomery_form = (config_idx >> 1) & 0x1; + config.is_result_in_montgomery_form = (config_idx >> 1) & 0x1; + if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; } // Illegal case for this loop. + if (config.is_input_in_montgomery_form ^ config.is_result_in_montgomery_form == 1) { continue; } // Illegal case for this loop. // Copy from host to device (copy again in order to be used later in the loop and device_inX was already // overwritten by warmup. if (config.is_a_on_device) { - if (config.is_in_montgomery_form) { + if (config.is_input_in_montgomery_form) { err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -488,6 +549,10 @@ int main(int argc, char** argv) } CHK_IF_RETURN( mont::to_montgomery(device_in1, vector_size, config.ctx.stream, device_in1)); // Convert in-place. + // Destroy host_in1 value with values of host_in2. + for (int i = 0; i < vector_size; i++) { + host_in1[i] = host_in2_init[i]; + } } else { // Normal presentation. err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. @@ -497,7 +562,7 @@ int main(int argc, char** argv) } } } else { - if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + if (config.is_input_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -510,10 +575,17 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from device_in1 to host_in1 - " << cudaGetErrorString(err) << std::endl; return 0; } + // Destroy device_in1 value with values of host_in2. + err = + cudaMemcpy(device_in1, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in2 to device_in1 - " << cudaGetErrorString(err) << std::endl; + return 0; + } } } if (config.is_b_on_device) { - if (config.is_in_montgomery_form) { + if (config.is_input_in_montgomery_form) { err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -522,6 +594,10 @@ int main(int argc, char** argv) } CHK_IF_RETURN( mont::to_montgomery(device_in2, vector_size, config.ctx.stream, device_in2)); // Convert in-place. + // Destroy host_in2 value with values of host_in1. + for (int i = 0; i < vector_size; i++) { + host_in2[i] = host_in1_init[i]; + } } else { // Normal presentation. err = @@ -532,7 +608,7 @@ int main(int argc, char** argv) } } } else { - if (config.is_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. + if (config.is_input_in_montgomery_form) { // Copy to device, cnvert to montgomery and copy back to host. err = cudaMemcpy(device_in2, host_in2, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -545,34 +621,42 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from device_in2 to host_in2 - " << cudaGetErrorString(err) << std::endl; return 0; } + // Destroy device_in2 valuewith values of host_in1. + err = + cudaMemcpy(device_in2, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from host_in1 to device_in2 - " << cudaGetErrorString(err) << std::endl; + return 0; + } } } CHK_IF_RETURN(cudaPeekAtLastError()); auto start_time = std::chrono::high_resolution_clock::now(); // Benchmark loop - for (int i = 0; i < repetitions; i++) { - switch (config_idx >> 2) { // {is_a_on_device, is_b_on_device, is_result_on_device} - case 0b000: - vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); - break; - case 0b001: - break; - case 0b010: - vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); - break; - case 0b011: - break; - case 0b100: - break; - case 0b101: - vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); - break; - case 0b110: - break; - case 0b111: - vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); - break; + for (int i = 0; i < in_place_repetitions; i++) { + switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: + vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); + break; + case 0b001: + break; + case 0b010: + vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); + break; + case 0b011: + break; + case 0b100: + break; + case 0b101: + vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); + std::cout << "===>>> COMMAND: vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);" << std::endl; + break; + case 0b110: + break; + case 0b111: + vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); + break; } CHK_IF_RETURN(cudaPeekAtLastError()); } @@ -594,53 +678,42 @@ int main(int argc, char** argv) break; } - if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_mul value. - if (config.is_in_montgomery_form) { // Convert to normal from montgomery if needed. + if (config.is_result_on_device) { // Copy the data to host_out in order to compare it vs. host_out_ref_XXX value. + if (config.is_result_in_montgomery_form) { // Convert to normal from montgomery if needed. CHK_IF_RETURN(mont::from_montgomery( device_in1, vector_size, config.ctx.stream, - device_in1)); // Convert to normal in order to check vs. host_out_ref_mul. + device_in1)); // Convert to normal in order to check vs. host_out_ref_XXX. } err = cudaMemcpy( host_out, device_in1, vector_size * sizeof(T), - cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_XXX. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_in1 to host_out - " << cudaGetErrorString(err) << std::endl; return 0; } } else { // Data is not on device but it is in host_in1. It should be moved to host_out for test pass/fail check. - if (config.is_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and - // written back to host. Then compared vs. host_out_ref_mul. + if (config.is_result_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and + // written back to host. Then compared vs. host_out_ref_XXX. err = cudaMemcpy( device_out, host_in1, vector_size * sizeof(T), - cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. + cudaMemcpyHostToDevice); // Copy to device_out in order to check later vs. host_out_ref_XXX. if (err != cudaSuccess) { std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl; return 0; } CHK_IF_RETURN(mont::from_montgomery( device_out, vector_size, config.ctx.stream, - device_out)); // Convert to normal in order to check vs. host_out_ref_mul. + device_out)); // Convert to normal in order to check vs. host_out_ref_XXX. err = cudaMemcpy( host_out, device_out, vector_size * sizeof(T), - cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. + cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_XXX. if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; return 0; } - } else { // host_out could be compared vs. host_out_ref_mul as is. - err = cudaMemcpy( - device_out, host_in1, vector_size * sizeof(T), - cudaMemcpyHostToDevice); // Copy to host_out in order to check vs. host_out_ref_mul. - if (err != cudaSuccess) { - std::cerr << "Failed to copy data from host_in1 to device_out - " << cudaGetErrorString(err) << std::endl; - return 0; - } - err = cudaMemcpy( - host_out, device_out, vector_size * sizeof(T), - cudaMemcpyDeviceToHost); // Copy to host_out in order to check vs. host_out_ref_mul. - if (err != cudaSuccess) { - std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; - return 0; + } else { // host_out could be compared vs. host_out_ref_XXX as is. + for (int i = 0; i < vector_size; i++) { // Copy from host_in1 (result) to host_out to compare later vs. host_out_ref_XXX. + host_out[i] = host_in1[i]; } } } @@ -649,55 +722,54 @@ int main(int argc, char** argv) //**************************************** //*********************************************** - // Test result check + // Test result check (in-place) // Check is performed by executing the operation in a normal presentation - // (located in in host_out_ref_mul) and comparing it with the + // (located in in host_out_ref_XXX) and comparing it with the // benchmark test result. //*********************************************** int test_failed = 0; - // std::cout << "===>>> host_out_ref_mul[" << i << "]: " << host_out_ref_mul[i] << std::endl; - // std::cout << "===>>> host_out[" << i << "] after test run: " << host_out[i] << std::endl; switch (op) { - case MUL: - for (int i = 0; i < vector_size; i++) { - if (host_out_ref_mul[i] != host_out[i]) { - std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i - << ", config is printed below:" << std::endl; - std::cout << "host_out_ref_mul[0] = " << host_out_ref_mul[0] << std::endl; - test_failed = 1; + case MUL: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_mul[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; + } } - } - break; - case ADD: - for (int i = 0; i < vector_size; i++) { - if (host_out_ref_add[i] != host_out[i]) { - std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i - << ", config is printed below:" << std::endl; - std::cout << "host_out_ref_add[0] = " << host_out_ref_add[0] << std::endl; - test_failed = 1; + break; + case ADD: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_add[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; + } } - } - break; - case SUB: - for (int i = 0; i < vector_size; i++) { - if (host_out_ref_sub[i] != host_out[i]) { - std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i - << ", config is printed below:" << std::endl; - std::cout << "host_out_ref_sub[0] = " << host_out_ref_sub[0] << std::endl; - test_failed = 1; + break; + case SUB: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_sub[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; + } } - } - break; + break; } if (test_failed) { - // std::cout << "===>>> ERROR!!! Test failed for vector index " << i << ", config is printed below:" << - // std::endl; - std::cout << "===>>> result is in-place: " << std::endl; - std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl; - std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl; - std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl; - std::cout << "===>>> is_in_montgomery_form: " << config.is_in_montgomery_form << std::endl; - std::cout << "host_out[0] = " << host_out[0] << std::endl; + std::cout << "===>>> result is in-place: " << std::endl; + std::cout << "===>>> is_a_on_device: " << config.is_a_on_device << std::endl; + std::cout << "===>>> is_b_on_device: " << config.is_b_on_device << std::endl; + std::cout << "===>>> is_result_on_device: " << config.is_result_on_device << std::endl; + std::cout << "===>>> is_input_in_montgomery_form: " << config.is_input_in_montgomery_form << std::endl; + std::cout << "===>>> is_input_resultin_montgomery_form: " << config.is_result_in_montgomery_form << std::endl; + std::cout << "===>>> host_in1_init[0] = " << host_in1_init[0] << std::endl; + std::cout << "===>>> host_in2_init[0] = " << host_in2_init[0] << std::endl; + std::cout << "===>>> host_out[0] = " << host_out[0] << std::endl; + std::cout << "===>>> warm-up: normal host_out_ref[0] (MUL) = " << host_out_ref_mul[0] << std::endl; + std::cout << "===>>> warm-up: normal host_out_ref[0] (ADD) = " << host_out_ref_add[0] << std::endl; + std::cout << "===>>> warm-up: normal host_out_ref[0] (SUB) = " << host_out_ref_sub[0] << std::endl; exit(2); } @@ -714,21 +786,34 @@ int main(int argc, char** argv) } // Report performance in GMPS: Giga Multiplications Per Second - double GMPS = 1.0e-9 * repetitions * vector_size / (1.0e-6 * duration.count()); + double GMPS = 1.0e-9 * in_place_repetitions * vector_size / (1.0e-6 * duration.count()); std::cout << "Performance: " << GMPS << " Giga Multiplications Per Second" << std::endl; } } // clean up and exit free(host_in1_init); + std::cout << "example free host_in1_init" << std::endl; free(host_in2_init); + std::cout << "example free host_in2_init" << std::endl; free(host_in1); + std::cout << "example free host_in1" << std::endl; free(host_in2); + std::cout << "example free host_in2" << std::endl; free(host_out); + std::cout << "example free host_out" << std::endl; free(host_out_ref_mul); + std::cout << "example free host_out_ref_mul" << std::endl; + free(host_out_ref_add); + std::cout << "example free host_out_ref_add" << std::endl; + free(host_out_ref_sub); + std::cout << "example free host_out_ref_sub" << std::endl; cudaFree(device_in1); + std::cout << "example cudaFree device_in1" << std::endl; cudaFree(device_in2); + std::cout << "example cudaFree device_in2" << std::endl; cudaFree(device_out); + std::cout << "example cudaFree device_out" << std::endl; nvmlShutdown(); return 0; } diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh index 2a239689a..4de406b34 100644 --- a/icicle/include/vec_ops/vec_ops.cuh +++ b/icicle/include/vec_ops/vec_ops.cuh @@ -27,7 +27,9 @@ namespace vec_ops { * non-blocking and you'd need to synchronize it explicitly by running * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the * function will block the current CPU thread. */ - bool is_in_montgomery_form; /**< If true then vec_a, vec_b and result are in montgomery form. + bool is_input_in_montgomery_form; /**< If true then vec_a and vec_b are in montgomery form. + * Default value: false. */ + bool is_result_in_montgomery_form; /**< If true then result is in montgomery form. * Default value: false. */ }; @@ -44,7 +46,8 @@ namespace vec_ops { false, // is_b_on_device false, // is_result_on_device false, // is_async - false, // is_in_montgomery_form + false, // is_input_in_montgomery_form + false, // is_result_in_montgomery_form }; return config; } diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index 9883d393c..07aa0f855 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -98,22 +98,25 @@ namespace vec_ops { int is_d_alloc_vec_a_allocated = 0; if (!config.is_a_on_device) { - if (config.is_in_montgomery_form) { + if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); + std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl; CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a)); is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; } else { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); + std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl; CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; } } else { - if (config.is_in_montgomery_form) { + if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync( &d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. + std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl; CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a)); is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; @@ -124,22 +127,25 @@ namespace vec_ops { int is_d_alloc_vec_b_allocated = 0; if (!config.is_b_on_device) { - if (config.is_in_montgomery_form) { + if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); + std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl; CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b)); is_d_alloc_vec_b_allocated = 1; d_vec_b = d_alloc_vec_b; } else { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); + std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl; CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); is_d_alloc_vec_b_allocated = 1; d_vec_b = d_alloc_vec_b; } } else { - if (config.is_in_montgomery_form) { + if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync( &d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. + std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl; CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b)); is_d_alloc_vec_b_allocated = 1; d_vec_b = d_alloc_vec_b; @@ -152,6 +158,7 @@ namespace vec_ops { if (!config.is_result_on_device) { if (!is_in_place) { CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream)); + std::cout << "vec_ops cudaMallocAsync d_result" << std::endl; is_d_result_allocated = 1; } else { d_result = d_vec_a; @@ -160,7 +167,8 @@ namespace vec_ops { if (!is_in_place) { d_result = result; } else { - d_result = result = d_vec_a; + // d_result = result = d_vec_a; // DEBUG - looks like a bug for in-place. + d_result = result; } } @@ -168,21 +176,30 @@ namespace vec_ops { Kernel<<>>(d_vec_a, d_vec_b, n, d_result); if (!config.is_result_on_device) { - if (config.is_in_montgomery_form) { + if (config.is_result_in_montgomery_form) { CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); } else { CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); } } else { - if (config.is_in_montgomery_form) { + if (config.is_result_in_montgomery_form) { CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. } } - if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); } - if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); } - if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); } + if (is_d_alloc_vec_a_allocated) { + CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); + std::cout << "vec_ops cudaFreeAsync d_alloc_vec_a" << std::endl; + } + if (is_d_alloc_vec_b_allocated) { + CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); + std::cout << "vec_ops cudaFreeAsync d_alloc_vec_b" << std::endl; + } + if (is_d_result_allocated) { + CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); + std::cout << "vec_ops cudaFreeAsync d_result" << std::endl; + } if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream)); From 23bc704a3cdc040a328d57010ad9d0738d5e73d8 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 13 Aug 2024 09:41:40 +0000 Subject: [PATCH 11/21] Fix to/from_montgomery function param --- examples/c++/mont_vec_ops/example.cu | 151 ++++++++------------------- icicle/src/vec_ops/vec_ops.cu | 22 ++-- 2 files changed, 47 insertions(+), 126 deletions(-) diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index a1fe871be..f277f2ebc 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -37,7 +37,6 @@ int vector_op( err = bn254_sub_cuda(vec_a, vec_b, n_elements, config, vec_result); break; } - // cudaError_t err = bn254_mul_cuda(vec_a, vec_b, n_elements, config, vec_result); if (err != cudaSuccess) { std::cerr << "Failed to multiply vectors - " << cudaGetErrorString(err) << std::endl; return 0; @@ -47,8 +46,8 @@ int vector_op( int main(int argc, char** argv) { - const unsigned vector_size = 1 << 11; - const unsigned not_in_place_repetitions = 1 << 0; // Repetitions are used only for the non in-place tests. + const unsigned vector_size = 1 << 15; + const unsigned not_in_place_repetitions = 1 << 15; // Repetitions are used only for the non in-place tests. const unsigned in_place_repetitions = 1; // Repetitions for in-place tests should be 1. Don't check it. cudaError_t err; @@ -83,27 +82,21 @@ int main(int argc, char** argv) // host data std::cout << "Allocate memory for the input vectors (both normal and Montgomery presentation)" << std::endl; T* host_in1_init = (T*)malloc(vector_size * sizeof(T)); - std::cout << "example malloc host_in1_init" << std::endl; T* host_in2_init = (T*)malloc(vector_size * sizeof(T)); - std::cout << "example malloc host_in2_init" << std::endl; std::cout << "Initializing vectors with normal presentation random data" << std::endl; T::rand_host_many(host_in1_init, vector_size); T::rand_host_many(host_in2_init, vector_size); std::cout << "Allocate memory for the output vectors" << std::endl; T* host_out = (T*)malloc(vector_size * sizeof(T)); // This memory will be used for the test output. - std::cout << "example malloc host_out" << std::endl; T* host_out_ref_mul = (T*)malloc( vector_size * sizeof(T)); // This memory will be used as a reference result for mul (will be compared to host_out content). - std::cout << "example malloc host_out_ref_mul" << std::endl; T* host_out_ref_add = (T*)malloc( vector_size * sizeof(T)); // This memory will be used as a reference result for add (will be compared to host_out content). - std::cout << "example malloc host_out_ref_add" << std::endl; T* host_out_ref_sub = (T*)malloc( vector_size * sizeof(T)); // This memory will be used as a reference result for sub (will be compared to host_out content). - std::cout << "malloc host_out_ref_sub" << std::endl; std::cout << "Initializing output vectors with random data" << std::endl; T::rand_host_many(host_out, vector_size); T::rand_host_many(host_out_ref_mul, vector_size); @@ -116,19 +109,16 @@ int main(int argc, char** argv) T* device_out; err = cudaMalloc((void**)&device_in1, vector_size * sizeof(T)); - std::cout << "example cudaMalloc device_in1" << std::endl; if (err != cudaSuccess) { std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; return 0; } err = cudaMalloc((void**)&device_in2, vector_size * sizeof(T)); - std::cout << "example cudaMalloc device_in2" << std::endl; if (err != cudaSuccess) { std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; return 0; } err = cudaMalloc((void**)&device_out, vector_size * sizeof(T)); - std::cout << "example cudaMalloc device_out" << std::endl; if (err != cudaSuccess) { std::cerr << "Failed to allocate device memory - " << cudaGetErrorString(err) << std::endl; return 0; @@ -154,24 +144,27 @@ int main(int argc, char** argv) } std::cout << "Starting warm-up run" << std::endl; // Warm-up loop + // for (int i = 0; i < not_in_place_repetitions; i++) { + for (int i = 0; i < 100; i++) { // Nof loops set to 100 because warm-up takes too much time because inputs and outputs are on located on Host. + vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, MUL); + } + // Generate ref results for all ops for (int op = MUL; op != LAST; op++) { - for (int i = 0; i < not_in_place_repetitions; i++) { - vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); - switch (op) { - case MUL: - err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; - case ADD: - err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; - case SUB: - err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; - } - if (err != cudaSuccess) { - std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; - return 0; - } + vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); + switch (op) { + case MUL: + err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + case ADD: + err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + case SUB: + err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + } + if (err != cudaSuccess) { + std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; + return 0; } } //**************************************** @@ -198,35 +191,20 @@ int main(int argc, char** argv) //******************************************************* T* host_in1 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in1_init for the benchmark. - std::cout << "example malloc host_in1" << std::endl; T* host_in2 = (T*)malloc(vector_size * sizeof(T)); // This buffer is used to load the data from host_in2_init for the benchmark. - std::cout << "example malloc host_in1" << std::endl; // Test when the result is not in-place std::cout << "*****************************************" << std::endl; std::cout << "*** Start not in-place benchmark loop ***" << std::endl; std::cout << "*****************************************" << std::endl; for (int op = MUL; op != LAST; op++) { - for (int config_idx = 0; config_idx < 32; config_idx++) { - // // DEBUG - // int num_gpus; - // size_t free, total; - // cudaGetDeviceCount( &num_gpus ); - // std::cout << "num_gpus " << num_gpus << std::endl; - // for ( int gpu_id = 0; gpu_id < num_gpus; gpu_id++ ) { - // cudaSetDevice( gpu_id ); - // int id; - // cudaGetDevice( &id ); - // cudaMemGetInfo( &free, &total ); - // std::cout << "GPU " << id << " memory: free=" << free << ", total=" << total << std::endl; - // } - // // DEBUG + for (int config_idx = 28; config_idx < 29; config_idx++) { + // for (int config_idx = 0; config_idx < 32; config_idx++) { switch (op) { case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; break; case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " not in-place" << std::endl; break; case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " not in-place" << std::endl; break; } - std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl; // Destroy the result of the prev loop. T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out. err = cudaMemcpy( @@ -344,37 +322,16 @@ int main(int argc, char** argv) CHK_IF_RETURN(cudaPeekAtLastError()); auto start_time = std::chrono::high_resolution_clock::now(); - // Benchmark loop - for (int i = 0; i < not_in_place_repetitions; i++) { - switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} - case 0b000: - vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); - break; - case 0b001: - vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); - break; - case 0b010: - vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); - break; - case 0b011: - vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); - break; - case 0b100: - vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); - break; - case 0b101: - vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); - break; - case 0b110: - vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); - break; - case 0b111: - vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); - break; - } - CHK_IF_RETURN(cudaPeekAtLastError()); + switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); } break; + case 0b001: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); } break; + case 0b010: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); } break; + case 0b011: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } break; + case 0b100: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); } break; + case 0b101: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); } break; + case 0b110: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); } break; + case 0b111: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } break; } - auto end_time = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end_time - start_time); switch (op) { @@ -513,7 +470,6 @@ int main(int argc, char** argv) case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " in-place" << std::endl; break; case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " in-place" << std::endl; break; } - std::cout << "Start benchmark loop for config_idx " << config_idx << std::endl; // Destroy the result of the prev loop. T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out. err = cudaMemcpy( @@ -636,31 +592,17 @@ int main(int argc, char** argv) // Benchmark loop for (int i = 0; i < in_place_repetitions; i++) { switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} - case 0b000: - vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); - break; - case 0b001: - break; - case 0b010: - vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); - break; - case 0b011: - break; - case 0b100: - break; - case 0b101: - vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); - std::cout << "===>>> COMMAND: vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op);" << std::endl; - break; - case 0b110: - break; - case 0b111: - vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); - break; + case 0b000: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); } break; + case 0b001: break; + case 0b010: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); } break; + case 0b011: break; + case 0b100: break; + case 0b101: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); } break; + case 0b110: break; + case 0b111: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); } break; } CHK_IF_RETURN(cudaPeekAtLastError()); } - auto end_time = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end_time - start_time); switch (op) { @@ -793,27 +735,16 @@ int main(int argc, char** argv) // clean up and exit free(host_in1_init); - std::cout << "example free host_in1_init" << std::endl; free(host_in2_init); - std::cout << "example free host_in2_init" << std::endl; free(host_in1); - std::cout << "example free host_in1" << std::endl; free(host_in2); - std::cout << "example free host_in2" << std::endl; free(host_out); - std::cout << "example free host_out" << std::endl; free(host_out_ref_mul); - std::cout << "example free host_out_ref_mul" << std::endl; free(host_out_ref_add); - std::cout << "example free host_out_ref_add" << std::endl; free(host_out_ref_sub); - std::cout << "example free host_out_ref_sub" << std::endl; cudaFree(device_in1); - std::cout << "example cudaFree device_in1" << std::endl; cudaFree(device_in2); - std::cout << "example cudaFree device_in2" << std::endl; cudaFree(device_out); - std::cout << "example cudaFree device_out" << std::endl; nvmlShutdown(); return 0; } diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index 07aa0f855..baac53b43 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -100,14 +100,12 @@ namespace vec_ops { if (!config.is_a_on_device) { if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); - std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl; CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); - CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a)); + CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n, config.ctx.stream, d_alloc_vec_a)); is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; } else { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); - std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl; CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; @@ -116,8 +114,7 @@ namespace vec_ops { if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync( &d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. - std::cout << "vec_ops cudaMallocAsync d_alloc_vec_a" << std::endl; - CHK_IF_RETURN(mont::from_montgomery(vec_a, n * sizeof(E), config.ctx.stream, d_alloc_vec_a)); + CHK_IF_RETURN(mont::from_montgomery(vec_a, n, config.ctx.stream, d_alloc_vec_a)); is_d_alloc_vec_a_allocated = 1; d_vec_a = d_alloc_vec_a; } else { @@ -129,14 +126,12 @@ namespace vec_ops { if (!config.is_b_on_device) { if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); - std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl; CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); - CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b)); + CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n, config.ctx.stream, d_alloc_vec_b)); is_d_alloc_vec_b_allocated = 1; d_vec_b = d_alloc_vec_b; } else { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); - std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl; CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); is_d_alloc_vec_b_allocated = 1; d_vec_b = d_alloc_vec_b; @@ -145,8 +140,7 @@ namespace vec_ops { if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync( &d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. - std::cout << "vec_ops cudaMallocAsync d_alloc_vec_b" << std::endl; - CHK_IF_RETURN(mont::from_montgomery(vec_b, n * sizeof(E), config.ctx.stream, d_alloc_vec_b)); + CHK_IF_RETURN(mont::from_montgomery(vec_b, n, config.ctx.stream, d_alloc_vec_b)); is_d_alloc_vec_b_allocated = 1; d_vec_b = d_alloc_vec_b; } else { @@ -158,7 +152,6 @@ namespace vec_ops { if (!config.is_result_on_device) { if (!is_in_place) { CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream)); - std::cout << "vec_ops cudaMallocAsync d_result" << std::endl; is_d_result_allocated = 1; } else { d_result = d_vec_a; @@ -177,28 +170,25 @@ namespace vec_ops { if (!config.is_result_on_device) { if (config.is_result_in_montgomery_form) { - CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. + CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place. CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); } else { CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); } } else { if (config.is_result_in_montgomery_form) { - CHK_IF_RETURN(mont::to_montgomery(d_result, n * sizeof(E), config.ctx.stream, d_result)); // Convert in-place. + CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place. } } if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); - std::cout << "vec_ops cudaFreeAsync d_alloc_vec_a" << std::endl; } if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); - std::cout << "vec_ops cudaFreeAsync d_alloc_vec_b" << std::endl; } if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); - std::cout << "vec_ops cudaFreeAsync d_result" << std::endl; } if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream)); From b59af917ee1df22ac77b08b6dd1509e9074e2f60 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 13 Aug 2024 09:43:56 +0000 Subject: [PATCH 12/21] Run clang-format --- examples/c++/mont_vec_ops/example.cu | 240 +++++++++++++++++---------- icicle/src/vec_ops/vec_ops.cu | 14 +- 2 files changed, 160 insertions(+), 94 deletions(-) diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index f277f2ebc..46bad5607 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -46,9 +46,9 @@ int vector_op( int main(int argc, char** argv) { - const unsigned vector_size = 1 << 15; - const unsigned not_in_place_repetitions = 1 << 15; // Repetitions are used only for the non in-place tests. - const unsigned in_place_repetitions = 1; // Repetitions for in-place tests should be 1. Don't check it. + const unsigned vector_size = 1 << 15; + const unsigned not_in_place_repetitions = 1 << 15; // Repetitions are used only for the non in-place tests. + const unsigned in_place_repetitions = 1; // Repetitions for in-place tests should be 1. Don't check it. cudaError_t err; nvmlInit(); @@ -67,7 +67,8 @@ int main(int argc, char** argv) std::cout << "Vector size: " << vector_size << std::endl; std::cout << "not_in_place_repetitions: " << not_in_place_repetitions << std::endl; std::cout << "in_place_repetitions: " << in_place_repetitions << std::endl; - std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" << std::endl; + std::cout << "Power limit: " << std::fixed << std::setprecision(3) << 1.0e-3 * power_limit << " W" + << std::endl; unsigned int baseline_power; nvmlDeviceGetPowerUsage(device, &baseline_power); @@ -126,7 +127,7 @@ int main(int argc, char** argv) vec_ops::VecOpsConfig config = vec_ops::DefaultVecOpsConfig(); int nof_of_configs_for_test = 5; - int nof_of_storage_configs = 3; // 2 inputs, 1 result. + int nof_of_storage_configs = 3; // 2 inputs, 1 result. //**************************************** // Test warn-up and reference output config. Reference output to be used to check if test passed or not. @@ -145,22 +146,23 @@ int main(int argc, char** argv) std::cout << "Starting warm-up run" << std::endl; // Warm-up loop // for (int i = 0; i < not_in_place_repetitions; i++) { - for (int i = 0; i < 100; i++) { // Nof loops set to 100 because warm-up takes too much time because inputs and outputs are on located on Host. + for (int i = 0; i < 100; i++) { // Nof loops set to 100 because warm-up takes too much time because inputs and outputs + // are on located on Host. vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, MUL); } // Generate ref results for all ops for (int op = MUL; op != LAST; op++) { vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); switch (op) { - case MUL: - err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; - case ADD: - err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; - case SUB: - err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); - break; + case MUL: + err = cudaMemcpy(host_out_ref_mul, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + case ADD: + err = cudaMemcpy(host_out_ref_add, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; + case SUB: + err = cudaMemcpy(host_out_ref_sub, device_out, vector_size * sizeof(T), cudaMemcpyDeviceToHost); + break; } if (err != cudaSuccess) { std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; @@ -199,14 +201,20 @@ int main(int argc, char** argv) std::cout << "*****************************************" << std::endl; for (int op = MUL; op != LAST; op++) { for (int config_idx = 28; config_idx < 29; config_idx++) { - // for (int config_idx = 0; config_idx < 32; config_idx++) { + // for (int config_idx = 0; config_idx < 32; config_idx++) { switch (op) { - case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; break; - case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " not in-place" << std::endl; break; - case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " not in-place" << std::endl; break; + case MUL: + std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; + break; + case ADD: + std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " not in-place" << std::endl; + break; + case SUB: + std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " not in-place" << std::endl; + break; } // Destroy the result of the prev loop. - T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out. + T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out. err = cudaMemcpy( device_out, host_out, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy random data to device_out. @@ -214,16 +222,16 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl; return 0; } - T::rand_host_many(host_out, vector_size); // Make hist_out != device_out. + T::rand_host_many(host_out, vector_size); // Make hist_out != device_out. // Initializa inputs with the known data for (int i = 0; i < vector_size; i++) { host_in1[i] = host_in1_init[i]; host_in2[i] = host_in2_init[i]; } - config.is_a_on_device = (config_idx >> 4) & 0x1; - config.is_b_on_device = (config_idx >> 3) & 0x1; - config.is_result_on_device = (config_idx >> 2) & 0x1; - config.is_input_in_montgomery_form = (config_idx >> 1) & 0x1; + config.is_a_on_device = (config_idx >> 4) & 0x1; + config.is_b_on_device = (config_idx >> 3) & 0x1; + config.is_result_on_device = (config_idx >> 2) & 0x1; + config.is_input_in_montgomery_form = (config_idx >> 1) & 0x1; config.is_result_in_montgomery_form = (config_idx >> 0) & 0x1; // Copy from host to device (copy again in order to be used later in the loop and device_inX was already @@ -242,7 +250,7 @@ int main(int argc, char** argv) for (int i = 0; i < vector_size; i++) { host_in1[i] = host_in2_init[i]; } - } else { // Normal presentation. + } else { // Normal presentation. err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -322,15 +330,48 @@ int main(int argc, char** argv) CHK_IF_RETURN(cudaPeekAtLastError()); auto start_time = std::chrono::high_resolution_clock::now(); - switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} - case 0b000: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); } break; - case 0b001: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); } break; - case 0b010: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); } break; - case 0b011: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } break; - case 0b100: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); } break; - case 0b101: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); } break; - case 0b110: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); } break; - case 0b111: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); } break; + switch (config_idx >> (nof_of_configs_for_test - + nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(host_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); + } + break; + case 0b001: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(host_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); + } + break; + case 0b010: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(host_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); + } + break; + case 0b011: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(host_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); + } + break; + case 0b100: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(device_in1, host_in2, host_out, vector_size, ctx, config, (Op)op); + } + break; + case 0b101: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(device_in1, host_in2, device_out, vector_size, ctx, config, (Op)op); + } + break; + case 0b110: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(device_in1, device_in2, host_out, vector_size, ctx, config, (Op)op); + } + break; + case 0b111: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(device_in1, device_in2, device_out, vector_size, ctx, config, (Op)op); + } + break; } auto end_time = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end_time - start_time); @@ -362,7 +403,7 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from device_out to host - " << cudaGetErrorString(err) << std::endl; return 0; } - } else { // Data is not on device but it is in host_out. + } else { // Data is not on device but it is in host_out. if (config.is_result_in_montgomery_form) { // host_out should be written to device, converted to mmontgomery and // written back to host. Then compared vs. host_out_ref_XXX. err = cudaMemcpy( @@ -382,7 +423,7 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from device_out to host_out - " << cudaGetErrorString(err) << std::endl; return 0; } - } else { // host_out could be compared vs. host_out_ref_XXX as is. + } else { // host_out could be compared vs. host_out_ref_XXX as is. } } //**************************************** @@ -466,12 +507,18 @@ int main(int argc, char** argv) for (int op = MUL; op != LAST; op++) { for (int config_idx = 0; config_idx < 32; config_idx++) { switch (op) { - case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " in-place" << std::endl; break; - case ADD: std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " in-place" << std::endl; break; - case SUB: std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " in-place" << std::endl; break; + case MUL: + std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " in-place" << std::endl; + break; + case ADD: + std::cout << "Start benchmark loop for op ADD config_idx " << config_idx << " in-place" << std::endl; + break; + case SUB: + std::cout << "Start benchmark loop for op SUB config_idx " << config_idx << " in-place" << std::endl; + break; } // Destroy the result of the prev loop. - T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out. + T::rand_host_many(host_out, vector_size); // Randomize host_out in order to randomize device_out. err = cudaMemcpy( device_out, host_out, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy random data to device_out. @@ -479,19 +526,22 @@ int main(int argc, char** argv) std::cerr << "Failed to copy data from host_out to device_out - " << cudaGetErrorString(err) << std::endl; return 0; } - T::rand_host_many(host_out, vector_size); // Make hist_out != device_out. - // Initializa inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it should be initialized later in the repetions loop. + T::rand_host_many(host_out, vector_size); // Make hist_out != device_out. + // Initializa inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it + // should be initialized later in the repetions loop. for (int i = 0; i < vector_size; i++) { host_in1[i] = host_in1_init[i]; host_in2[i] = host_in2_init[i]; } - config.is_a_on_device = (config_idx >> 4) & 0x1; - config.is_b_on_device = (config_idx >> 3) & 0x1; - config.is_result_on_device = (config_idx >> 2) & 0x1; - config.is_input_in_montgomery_form = (config_idx >> 1) & 0x1; + config.is_a_on_device = (config_idx >> 4) & 0x1; + config.is_b_on_device = (config_idx >> 3) & 0x1; + config.is_result_on_device = (config_idx >> 2) & 0x1; + config.is_input_in_montgomery_form = (config_idx >> 1) & 0x1; config.is_result_in_montgomery_form = (config_idx >> 1) & 0x1; - if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; } // Illegal case for this loop. - if (config.is_input_in_montgomery_form ^ config.is_result_in_montgomery_form == 1) { continue; } // Illegal case for this loop. + if (config.is_a_on_device ^ config.is_result_on_device == 1) { continue; } // Illegal case for this loop. + if (config.is_input_in_montgomery_form ^ config.is_result_in_montgomery_form == 1) { + continue; + } // Illegal case for this loop. // Copy from host to device (copy again in order to be used later in the loop and device_inX was already // overwritten by warmup. @@ -509,7 +559,7 @@ int main(int argc, char** argv) for (int i = 0; i < vector_size; i++) { host_in1[i] = host_in2_init[i]; } - } else { // Normal presentation. + } else { // Normal presentation. err = cudaMemcpy(device_in1, host_in1, vector_size * sizeof(T), cudaMemcpyHostToDevice); // Copy data to device. if (err != cudaSuccess) { @@ -591,15 +641,36 @@ int main(int argc, char** argv) auto start_time = std::chrono::high_resolution_clock::now(); // Benchmark loop for (int i = 0; i < in_place_repetitions; i++) { - switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} - case 0b000: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); } break; - case 0b001: break; - case 0b010: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); } break; - case 0b011: break; - case 0b100: break; - case 0b101: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); } break; - case 0b110: break; - case 0b111: for (int i = 0; i < not_in_place_repetitions; i++) { vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); } break; + switch (config_idx >> (nof_of_configs_for_test - + nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} + case 0b000: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); + } + break; + case 0b001: + break; + case 0b010: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); + } + break; + case 0b011: + break; + case 0b100: + break; + case 0b101: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); + } + break; + case 0b110: + break; + case 0b111: + for (int i = 0; i < not_in_place_repetitions; i++) { + vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); + } + break; } CHK_IF_RETURN(cudaPeekAtLastError()); } @@ -654,7 +725,8 @@ int main(int argc, char** argv) return 0; } } else { // host_out could be compared vs. host_out_ref_XXX as is. - for (int i = 0; i < vector_size; i++) { // Copy from host_in1 (result) to host_out to compare later vs. host_out_ref_XXX. + for (int i = 0; i < vector_size; + i++) { // Copy from host_in1 (result) to host_out to compare later vs. host_out_ref_XXX. host_out[i] = host_in1[i]; } } @@ -671,33 +743,33 @@ int main(int argc, char** argv) //*********************************************** int test_failed = 0; switch (op) { - case MUL: - for (int i = 0; i < vector_size; i++) { - if (host_out_ref_mul[i] != host_out[i]) { - std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i - << ", config is printed below:" << std::endl; - test_failed = 1; - } + case MUL: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_mul[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; } - break; - case ADD: - for (int i = 0; i < vector_size; i++) { - if (host_out_ref_add[i] != host_out[i]) { - std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i - << ", config is printed below:" << std::endl; - test_failed = 1; - } + } + break; + case ADD: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_add[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; } - break; - case SUB: - for (int i = 0; i < vector_size; i++) { - if (host_out_ref_sub[i] != host_out[i]) { - std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i - << ", config is printed below:" << std::endl; - test_failed = 1; - } + } + break; + case SUB: + for (int i = 0; i < vector_size; i++) { + if (host_out_ref_sub[i] != host_out[i]) { + std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i + << ", config is printed below:" << std::endl; + test_failed = 1; } - break; + } + break; } if (test_failed) { std::cout << "===>>> result is in-place: " << std::endl; diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index baac53b43..c95e343e6 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -176,20 +176,14 @@ namespace vec_ops { CHK_IF_RETURN(cudaMemcpyAsync(result, d_result, n * sizeof(E), cudaMemcpyDeviceToHost, config.ctx.stream)); } } else { - if (config.is_result_in_montgomery_form) { + if (config.is_result_in_montgomery_form) { CHK_IF_RETURN(mont::to_montgomery(d_result, n, config.ctx.stream, d_result)); // Convert in-place. } } - if (is_d_alloc_vec_a_allocated) { - CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); - } - if (is_d_alloc_vec_b_allocated) { - CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); - } - if (is_d_result_allocated) { - CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); - } + if (is_d_alloc_vec_a_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_a, config.ctx.stream)); } + if (is_d_alloc_vec_b_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_alloc_vec_b, config.ctx.stream)); } + if (is_d_result_allocated) { CHK_IF_RETURN(cudaFreeAsync(d_result, config.ctx.stream)); } if (!config.is_async) return CHK_STICKY(cudaStreamSynchronize(config.ctx.stream)); From 028f512d7d1bf730212b42e3a159f1bbf3318b1d Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 13 Aug 2024 10:00:21 +0000 Subject: [PATCH 13/21] Fix typo. --- examples/c++/mont_vec_ops/example.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index 46bad5607..d668c3e37 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -200,8 +200,8 @@ int main(int argc, char** argv) std::cout << "*** Start not in-place benchmark loop ***" << std::endl; std::cout << "*****************************************" << std::endl; for (int op = MUL; op != LAST; op++) { - for (int config_idx = 28; config_idx < 29; config_idx++) { - // for (int config_idx = 0; config_idx < 32; config_idx++) { + // for (int config_idx = 28; config_idx < 29; config_idx++) { + for (int config_idx = 0; config_idx < 32; config_idx++) { switch (op) { case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; From e76053af4db9799a5b57bf8a874d00844c9d671f Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 13 Aug 2024 10:14:45 +0000 Subject: [PATCH 14/21] Added space in order o rerun the icicle testing --- examples/c++/mont_vec_ops/example.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index d668c3e37..6befbec06 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -201,7 +201,7 @@ int main(int argc, char** argv) std::cout << "*****************************************" << std::endl; for (int op = MUL; op != LAST; op++) { // for (int config_idx = 28; config_idx < 29; config_idx++) { - for (int config_idx = 0; config_idx < 32; config_idx++) { + for (int config_idx = 0; config_idx < 32; config_idx++) { switch (op) { case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; From afd3e54d545f632d1ef0ee517a2998a52ea84448 Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 13 Aug 2024 10:42:57 +0000 Subject: [PATCH 15/21] Fix spelling --- examples/c++/mont_vec_ops/example.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index 6befbec06..e2ef3a42a 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -223,7 +223,7 @@ int main(int argc, char** argv) return 0; } T::rand_host_many(host_out, vector_size); // Make hist_out != device_out. - // Initializa inputs with the known data + // Initialize inputs with the known data for (int i = 0; i < vector_size; i++) { host_in1[i] = host_in1_init[i]; host_in2[i] = host_in2_init[i]; @@ -527,8 +527,8 @@ int main(int argc, char** argv) return 0; } T::rand_host_many(host_out, vector_size); // Make hist_out != device_out. - // Initializa inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it - // should be initialized later in the repetions loop. + // Initialize inputs with the known data. For in-place tests host_in1 is going to be used as a result. So, it + // should be initialized later in the repetitions loop. for (int i = 0; i < vector_size; i++) { host_in1[i] = host_in1_init[i]; host_in2[i] = host_in2_init[i]; From c4ebc269375648658d8f9e5a11821a6063c4d2df Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 13 Aug 2024 10:57:34 +0000 Subject: [PATCH 16/21] To start github action. --- examples/c++/mont_vec_ops/example.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index e2ef3a42a..4795325d3 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -14,7 +14,7 @@ typedef scalar_t T; enum Op { MUL, ADD, SUB, LAST }; -// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 +// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 int vector_op( T* vec_a, From 85e1cc75a1b010307098f2b17e1838c94b6c4e8b Mon Sep 17 00:00:00 2001 From: danny-shterman Date: Tue, 13 Aug 2024 11:00:22 +0000 Subject: [PATCH 17/21] clang-format --- icicle/include/vec_ops/vec_ops.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/icicle/include/vec_ops/vec_ops.cuh b/icicle/include/vec_ops/vec_ops.cuh index 4de406b34..144bf08bf 100644 --- a/icicle/include/vec_ops/vec_ops.cuh +++ b/icicle/include/vec_ops/vec_ops.cuh @@ -27,10 +27,10 @@ namespace vec_ops { * non-blocking and you'd need to synchronize it explicitly by running * `cudaStreamSynchronize` or `cudaDeviceSynchronize`. If set to false, the * function will block the current CPU thread. */ - bool is_input_in_montgomery_form; /**< If true then vec_a and vec_b are in montgomery form. - * Default value: false. */ + bool is_input_in_montgomery_form; /**< If true then vec_a and vec_b are in montgomery form. + * Default value: false. */ bool is_result_in_montgomery_form; /**< If true then result is in montgomery form. - * Default value: false. */ + * Default value: false. */ }; /** From 6236b849da45ddf040361117b0eba2a9e0c2765b Mon Sep 17 00:00:00 2001 From: LeonHibnik Date: Thu, 15 Aug 2024 13:12:59 +0300 Subject: [PATCH 18/21] fix configs --- wrappers/golang/core/vec_ops.go | 9 ++++++--- wrappers/golang/core/vec_ops_test.go | 3 ++- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/wrappers/golang/core/vec_ops.go b/wrappers/golang/core/vec_ops.go index 8bef10b63..066936628 100644 --- a/wrappers/golang/core/vec_ops.go +++ b/wrappers/golang/core/vec_ops.go @@ -28,8 +28,10 @@ type VecOpsConfig struct { * non-blocking and you'll need to synchronize it explicitly by calling * `SynchronizeStream`. If set to false, the function will block the current CPU thread. */ IsAsync bool - /* If true then vec_a, vec_b and result are in montgomery form. Default value: false. */ - IsInMontgomeryForm bool + /* If true then vec_a, vec_b are in montgomery form. Default value: false. */ + IsInputInMontgomeryForm bool + /* If true then result is in montgomery form. Default value: false. */ + IsResultInMontgomeryForm bool } /** @@ -44,7 +46,8 @@ func DefaultVecOpsConfig() VecOpsConfig { false, // isBOnDevice false, // isResultOnDevice false, // IsAsync - false, // IsInMontgomeryForm + false, // IsInputInMontgomeryForm + false, // IsResultInMontgomeryForm } return config diff --git a/wrappers/golang/core/vec_ops_test.go b/wrappers/golang/core/vec_ops_test.go index 9ac5bb480..0552b5562 100644 --- a/wrappers/golang/core/vec_ops_test.go +++ b/wrappers/golang/core/vec_ops_test.go @@ -15,7 +15,8 @@ func TestVecOpsDefaultConfig(t *testing.T) { false, // isBOnDevice false, // isResultOnDevice false, // IsAsync - false, // IsInMontgomeryForm + false, // IsInputInMontgomeryForm + false, // IsResultInMontgomeryForm } actual := DefaultVecOpsConfig() diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index efb2cab53..7e2fcbd60 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -21,7 +21,8 @@ pub struct VecOpsConfig<'a> { /// it explicitly by running `stream.synchronize()`. If set to false, the functions will block the current CPU thread. pub is_async: bool, /// If true then vec_a, vec_b and result are in montgomery form. Default value: false. - pub is_in_montgomery_form: bool, + pub is_input_in_montgomery_form: bool, + pub is_result_in_montgomery_form: bool } impl<'a> Default for VecOpsConfig<'a> { @@ -38,7 +39,8 @@ impl<'a> VecOpsConfig<'a> { is_b_on_device: false, is_result_on_device: false, is_async: false, - is_in_montgomery_form: false, + is_input_in_montgomery_form: false, + is_result_in_montgomery_form: false } } } From d17a05c7e4aa6a2095d3fda9a8066b41b6f392a8 Mon Sep 17 00:00:00 2001 From: LeonHibnik Date: Thu, 15 Aug 2024 13:16:55 +0300 Subject: [PATCH 19/21] fmt --- wrappers/rust/icicle-core/src/vec_ops/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wrappers/rust/icicle-core/src/vec_ops/mod.rs b/wrappers/rust/icicle-core/src/vec_ops/mod.rs index 7e2fcbd60..8581ad0c5 100644 --- a/wrappers/rust/icicle-core/src/vec_ops/mod.rs +++ b/wrappers/rust/icicle-core/src/vec_ops/mod.rs @@ -22,7 +22,7 @@ pub struct VecOpsConfig<'a> { pub is_async: bool, /// If true then vec_a, vec_b and result are in montgomery form. Default value: false. pub is_input_in_montgomery_form: bool, - pub is_result_in_montgomery_form: bool + pub is_result_in_montgomery_form: bool, } impl<'a> Default for VecOpsConfig<'a> { @@ -40,7 +40,7 @@ impl<'a> VecOpsConfig<'a> { is_result_on_device: false, is_async: false, is_input_in_montgomery_form: false, - is_result_in_montgomery_form: false + is_result_in_montgomery_form: false, } } } From 21db8c06f6765316b29aa1cf00c688533ead0625 Mon Sep 17 00:00:00 2001 From: LeonHibnik Date: Thu, 15 Aug 2024 19:34:04 +0300 Subject: [PATCH 20/21] fix example --- examples/c++/mont_vec_ops/example.cu | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/examples/c++/mont_vec_ops/example.cu b/examples/c++/mont_vec_ops/example.cu index 4795325d3..68bbbcbd1 100644 --- a/examples/c++/mont_vec_ops/example.cu +++ b/examples/c++/mont_vec_ops/example.cu @@ -14,8 +14,6 @@ typedef scalar_t T; enum Op { MUL, ADD, SUB, LAST }; -// bn254 p = 21888242871839275222246405745257275088548364400416034343698204186575808495617 - int vector_op( T* vec_a, T* vec_b, @@ -47,7 +45,7 @@ int vector_op( int main(int argc, char** argv) { const unsigned vector_size = 1 << 15; - const unsigned not_in_place_repetitions = 1 << 15; // Repetitions are used only for the non in-place tests. + const unsigned not_in_place_repetitions = 1 << 10; // Repetitions are used only for the non in-place tests. const unsigned in_place_repetitions = 1; // Repetitions for in-place tests should be 1. Don't check it. cudaError_t err; @@ -200,8 +198,8 @@ int main(int argc, char** argv) std::cout << "*** Start not in-place benchmark loop ***" << std::endl; std::cout << "*****************************************" << std::endl; for (int op = MUL; op != LAST; op++) { - // for (int config_idx = 28; config_idx < 29; config_idx++) { - for (int config_idx = 0; config_idx < 32; config_idx++) { + // for (int config_idx = 28; config_idx < 29; config_idx++) { + for (int config_idx = 0; config_idx < 32; config_idx++) { switch (op) { case MUL: std::cout << "Start benchmark loop for op MUL config_idx " << config_idx << " not in-place" << std::endl; @@ -644,32 +642,24 @@ int main(int argc, char** argv) switch (config_idx >> (nof_of_configs_for_test - nof_of_storage_configs)) { // {is_a_on_device, is_b_on_device, is_result_on_device} case 0b000: - for (int i = 0; i < not_in_place_repetitions; i++) { - vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); - } + vector_op(host_in1, host_in2, host_in1, vector_size, ctx, config, (Op)op); break; case 0b001: break; case 0b010: - for (int i = 0; i < not_in_place_repetitions; i++) { - vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); - } + vector_op(host_in1, device_in2, host_in1, vector_size, ctx, config, (Op)op); break; case 0b011: break; case 0b100: break; case 0b101: - for (int i = 0; i < not_in_place_repetitions; i++) { - vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); - } + vector_op(device_in1, host_in2, device_in1, vector_size, ctx, config, (Op)op); break; case 0b110: break; case 0b111: - for (int i = 0; i < not_in_place_repetitions; i++) { - vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); - } + vector_op(device_in1, device_in2, device_in1, vector_size, ctx, config, (Op)op); break; } CHK_IF_RETURN(cudaPeekAtLastError()); @@ -749,6 +739,7 @@ int main(int argc, char** argv) std::cout << "===>>> ERROR!!! MUL: Test failed for vector index " << i << ", config is printed below:" << std::endl; test_failed = 1; + break; } } break; @@ -758,6 +749,7 @@ int main(int argc, char** argv) std::cout << "===>>> ERROR!!! ADD: Test failed for vector index " << i << ", config is printed below:" << std::endl; test_failed = 1; + break; } } break; @@ -767,6 +759,7 @@ int main(int argc, char** argv) std::cout << "===>>> ERROR!!! SUB: Test failed for vector index " << i << ", config is printed below:" << std::endl; test_failed = 1; + break; } } break; From a7974b614de7b7c4e0092f1a3d40111ca08774e4 Mon Sep 17 00:00:00 2001 From: LeonHibnik Date: Mon, 19 Aug 2024 10:51:02 +0300 Subject: [PATCH 21/21] change int to booleans --- icicle/src/vec_ops/vec_ops.cu | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/icicle/src/vec_ops/vec_ops.cu b/icicle/src/vec_ops/vec_ops.cu index c95e343e6..d67f8d6af 100644 --- a/icicle/src/vec_ops/vec_ops.cu +++ b/icicle/src/vec_ops/vec_ops.cu @@ -96,18 +96,18 @@ namespace vec_ops { E* d_vec_a; const E* d_vec_b; - int is_d_alloc_vec_a_allocated = 0; + bool is_d_alloc_vec_a_allocated = false; if (!config.is_a_on_device) { if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_a, n, config.ctx.stream, d_alloc_vec_a)); - is_d_alloc_vec_a_allocated = 1; + is_d_alloc_vec_a_allocated = true; d_vec_a = d_alloc_vec_a; } else { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_a, vec_a, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); - is_d_alloc_vec_a_allocated = 1; + is_d_alloc_vec_a_allocated = true; d_vec_a = d_alloc_vec_a; } } else { @@ -115,25 +115,25 @@ namespace vec_ops { CHK_IF_RETURN(cudaMallocAsync( &d_alloc_vec_a, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. CHK_IF_RETURN(mont::from_montgomery(vec_a, n, config.ctx.stream, d_alloc_vec_a)); - is_d_alloc_vec_a_allocated = 1; + is_d_alloc_vec_a_allocated = true; d_vec_a = d_alloc_vec_a; } else { d_vec_a = vec_a; } } - int is_d_alloc_vec_b_allocated = 0; + bool is_d_alloc_vec_b_allocated = false; if (!config.is_b_on_device) { if (config.is_input_in_montgomery_form) { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); CHK_IF_RETURN(mont::from_montgomery(d_alloc_vec_b, n, config.ctx.stream, d_alloc_vec_b)); - is_d_alloc_vec_b_allocated = 1; + is_d_alloc_vec_b_allocated = true; d_vec_b = d_alloc_vec_b; } else { CHK_IF_RETURN(cudaMallocAsync(&d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); CHK_IF_RETURN(cudaMemcpyAsync(d_alloc_vec_b, vec_b, n * sizeof(E), cudaMemcpyHostToDevice, config.ctx.stream)); - is_d_alloc_vec_b_allocated = 1; + is_d_alloc_vec_b_allocated = true; d_vec_b = d_alloc_vec_b; } } else { @@ -141,28 +141,23 @@ namespace vec_ops { CHK_IF_RETURN(cudaMallocAsync( &d_alloc_vec_b, n * sizeof(E), config.ctx.stream)); // Allocate in order not to change the input. CHK_IF_RETURN(mont::from_montgomery(vec_b, n, config.ctx.stream, d_alloc_vec_b)); - is_d_alloc_vec_b_allocated = 1; + is_d_alloc_vec_b_allocated = true; d_vec_b = d_alloc_vec_b; } else { d_vec_b = vec_b; } } - int is_d_result_allocated = 0; + bool is_d_result_allocated = false; if (!config.is_result_on_device) { if (!is_in_place) { CHK_IF_RETURN(cudaMallocAsync(&d_result, n * sizeof(E), config.ctx.stream)); - is_d_result_allocated = 1; + is_d_result_allocated = true; } else { d_result = d_vec_a; } } else { - if (!is_in_place) { - d_result = result; - } else { - // d_result = result = d_vec_a; // DEBUG - looks like a bug for in-place. - d_result = result; - } + d_result = result; } // Call the kernel to perform element-wise operation