diff --git a/icicle/include/api/hash.h b/icicle/include/api/hash.h index ffa40d812..7f41a1a00 100644 --- a/icicle/include/api/hash.h +++ b/icicle/include/api/hash.h @@ -6,8 +6,11 @@ #include #include "gpu-utils/device_context.cuh" #include "hash/keccak/keccak.cuh" +#include "hash/blake2s/blake2s.cuh" #include "merkle-tree/merkle.cuh" +/* KECCAK */ + extern "C" cudaError_t keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::HashConfig& config); @@ -27,4 +30,16 @@ extern "C" cudaError_t build_keccak512_merkle_tree_cuda( unsigned int height, unsigned int input_block_len, const merkle_tree::TreeBuilderConfig& tree_config); + +/* BLAKE2S */ + +extern "C" cudaError_t blake2s_cuda( + BYTE* input, BYTE* output, WORD number_of_blocks, WORD input_block_size, WORD output_block_size, blake2s::HashConfig& config) + +extern "C" cudaError_t build_blake2s_merkle_tree_cuda( + const uint8_t* leaves, + uint64_t* digests, + unsigned int height, + unsigned int input_block_len, + const merkle_tree::TreeBuilderConfig& tree_config); #endif \ No newline at end of file diff --git a/icicle/include/hash/blake2s/blake2s.cuh b/icicle/include/hash/blake2s/blake2s.cuh new file mode 100644 index 000000000..5988f38f5 --- /dev/null +++ b/icicle/include/hash/blake2s/blake2s.cuh @@ -0,0 +1,53 @@ +/* + * blake2b.cuh CUDA Implementation of BLAKE2B Hashing + * + * Date: 12 June 2019 + * Revision: 1 + * + * This file is released into the Public Domain. + */ + +#pragma once + +#include +#include +#include +#include +#include "gpu-utils/device_context.cuh" +#include "gpu-utils/error_handler.cuh" + +#include "hash/hash.cuh" +using namespace hash; + +namespace blake2s { + + typedef unsigned char BYTE; + typedef unsigned int WORD; + typedef unsigned long long LONG; + +#define BLAKE2S_ROUNDS 10 +#define BLAKE2S_BLOCK_LENGTH 64 +#define BLAKE2S_CHAIN_SIZE 8 +#define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t)) +#define BLAKE2S_STATE_SIZE 16 +#define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t)) + + class Blake2s : public Hasher + { + public: + cudaError_t run_hash_many_kernel( + const BYTE* input, + BYTE* output, + WORD number_of_states, + WORD input_len, + WORD output_len, + const device_context::DeviceContext& ctx) const override; + + Blake2s() : Hasher(BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, 0) {} + }; + + extern "C" { + cudaError_t + cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch); + } +} // namespace blake2s \ No newline at end of file diff --git a/icicle/src/hash/CMakeLists.txt b/icicle/src/hash/CMakeLists.txt index 3f0d532c7..275afb4b4 100644 --- a/icicle/src/hash/CMakeLists.txt +++ b/icicle/src/hash/CMakeLists.txt @@ -1,5 +1,10 @@ set(TARGET icicle_hash) -add_library(${TARGET} STATIC keccak/extern.cu) +set(SRC ${CMAKE_SOURCE_DIR}/src/hash) + +set(HASH_SOURCE ${SRC}/keccak/extern.cu) +list(APPEND HASH_SOURCE ${SRC}/blake2s/extern.cu) + +add_library(${TARGET} STATIC ${HASH_SOURCE}) target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash") \ No newline at end of file diff --git a/icicle/src/hash/blake2s/Makefile b/icicle/src/hash/blake2s/Makefile new file mode 100644 index 000000000..4d13a3733 --- /dev/null +++ b/icicle/src/hash/blake2s/Makefile @@ -0,0 +1,26 @@ +test_blake2s: test_blake2s.cu blake2s.cu + nvcc -o test_blake2s -I. -I../../../include test_blake2s.cu blake2s.cu -g + ./test_blake2s + +test_blake2s_batched: test_blake2s_batched.cu blake2s.cu + nvcc -o test_blake2s_batched -I. -I../../../include test_blake2s_batched.cu -g + ./test_blake2s_batched ./batched_test_vectors.csv + +test_blake2s_integ: test_blake2s_integ.cu blake2s.cu + nvcc -o test_blake2s_integ -I. -I../../../include test_blake2s_integ.cu -g + ./test_blake2s_integ + +test_blake2s_seq: test_blake2s_seq.cu blake2s.cu + nvcc -o test_blake2s_seq -I. -I../../../include test_blake2s_seq.cu -g + ./test_blake2s_seq + +test_blake2s_seq_sa: test_blake2s_seq_sa.cu blake2s.cu + nvcc -o test_blake2s_seq_sa -I. -I../../../include test_blake2s_seq_sa.cu -g + ./test_blake2s_seq_sa + +test_blake2s_tree: test_tree.cu blake2s.cu ../../merkle-tree/merkle.cu + nvcc -DMERKLE_DEBUG -o test_blake2s_tree -I../../../include test_tree.cu + ./test_blake2s_tree + +clear: + rm test_blake2s test_blake2s_tree test_blake2s_integ test_blake2s_seq test_blake2s_seq_sa test_blake2s_batched \ No newline at end of file diff --git a/icicle/src/hash/blake2s/batched_test_vectors.csv b/icicle/src/hash/blake2s/batched_test_vectors.csv new file mode 100644 index 000000000..ace3a954a --- /dev/null +++ b/icicle/src/hash/blake2s/batched_test_vectors.csv @@ -0,0 +1,10 @@ +9301876542, 1e95b837356f67e9b456d636dd3d3f55bfff01eb78b375f613db5819f27e5972 +5279608431, 000386abd221e7049e78091c4b964719dd45fda6a70ad88194c8ecc7fb5ec4bc +1842976503, b35ce7baa5c5c620be9bf0d03a7a5b43fd18e786e1678ad797a30b50dc48ccc7 +6498302715, 489412fcb1a74c14dbe06aaad61cf2d3ed0eaa6a6154afc9f8b58fe92ffcebf1 +7023598146, 6b9a45147bd1c61f8d1d3d110cb705ae85ddc31ac7cb18e47306bc51d4d807ba +3150729846, 5d9d597b956a26fd79cd8bdf38e306db068b6089b305268b90fd1a304a5b2224 +9583402167, 584cd56b727e14ccc7fcaf406982faab08529b6789748c9ffc74748b033cf44f +8760134295, 2cba2adb552cc89312c614c3d720edaa5cf03bc5fc2a012511cfee013636ee40 +5402987631, d6d4920f85f5e286f3add452fab5b19f31e66293ec612b29389643f78b4ace2a +2739810546, 69aee09804f37a477b34f9a4447b39e9caaae49bbc5056b8018dd513c8cec263 \ No newline at end of file diff --git a/icicle/src/hash/blake2s/blake2s.cu b/icicle/src/hash/blake2s/blake2s.cu new file mode 100644 index 000000000..f14fec520 --- /dev/null +++ b/icicle/src/hash/blake2s/blake2s.cu @@ -0,0 +1,269 @@ +#include +#include "gpu-utils/device_context.cuh" +#include "gpu-utils/error_handler.cuh" +#include "gpu-utils/modifiers.cuh" +#include "hash/hash.cuh" + +#include "hash/blake2s/blake2s.cuh" + +using namespace hash; + +namespace blake2s { + + typedef struct { + WORD digestlen; + BYTE key[32]; + WORD keylen; + BYTE buff[BLAKE2S_BLOCK_LENGTH]; + uint32_t chain[BLAKE2S_CHAIN_SIZE]; + uint32_t state[BLAKE2S_STATE_SIZE]; + WORD pos; + uint32_t t0; + uint32_t t1; + uint32_t f0; + } cuda_blake2s_ctx_t; + + typedef cuda_blake2s_ctx_t CUDA_BLAKE2S_CTX; + + __constant__ CUDA_BLAKE2S_CTX c_CTX; + + __constant__ uint32_t BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; + + const uint32_t CPU_BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; + + void cpu_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen) + { + memset(ctx, 0, sizeof(cuda_blake2s_ctx_t)); + if (keylen > 0) { + memcpy(ctx->buff, key, keylen); + memcpy(ctx->key, key, keylen); + } + ctx->keylen = keylen; + ctx->digestlen = digestbitlen >> 3; + ctx->pos = 0; + ctx->t0 = 0; + ctx->t1 = 0; + ctx->f0 = 0; + ctx->chain[0] = CPU_BLAKE2S_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000); + ctx->chain[1] = CPU_BLAKE2S_IVS[1]; + ctx->chain[2] = CPU_BLAKE2S_IVS[2]; + ctx->chain[3] = CPU_BLAKE2S_IVS[3]; + ctx->chain[4] = CPU_BLAKE2S_IVS[4]; + ctx->chain[5] = CPU_BLAKE2S_IVS[5]; + ctx->chain[6] = CPU_BLAKE2S_IVS[6]; + ctx->chain[7] = CPU_BLAKE2S_IVS[7]; + + ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0; + } + + __constant__ uint8_t BLAKE2S_SIGMA[10][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}}; + + __device__ uint32_t cuda_blake2s_leuint32(const BYTE* in) + { + uint32_t a; + memcpy(&a, in, 4); + return a; + } + + __inline__ __device__ uint32_t cuda_blake2s_ROTR32(uint32_t a, uint8_t b) { return (a >> b) | (a << (32 - b)); } + + __device__ void + cuda_blake2s_G(cuda_blake2s_ctx_t* ctx, uint32_t m1, uint32_t m2, int32_t a, int32_t b, int32_t c, int32_t d) + { + ctx->state[a] = ctx->state[a] + ctx->state[b] + m1; + ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 16); + ctx->state[c] = ctx->state[c] + ctx->state[d]; + ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 12); + ctx->state[a] = ctx->state[a] + ctx->state[b] + m2; + ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 8); + ctx->state[c] = ctx->state[c] + ctx->state[d]; + ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 7); + } + + __device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t* ctx) + { + memcpy(ctx->state, ctx->chain, BLAKE2S_CHAIN_LENGTH); + ctx->state[8] = BLAKE2S_IVS[0]; + ctx->state[9] = BLAKE2S_IVS[1]; + ctx->state[10] = BLAKE2S_IVS[2]; + ctx->state[11] = BLAKE2S_IVS[3]; + ctx->state[12] = ctx->t0 ^ BLAKE2S_IVS[4]; + ctx->state[13] = ctx->t1 ^ BLAKE2S_IVS[5]; + ctx->state[14] = ctx->f0 ^ BLAKE2S_IVS[6]; + ctx->state[15] = BLAKE2S_IVS[7]; + } + + __device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t* ctx, const BYTE* in, WORD inoffset) + { + cuda_blake2s_init_state(ctx); + uint32_t m[16] = {0}; + for (int j = 0; j < 16; j++) + m[j] = cuda_blake2s_leuint32(in + inoffset + (j << 2)); + + for (int round = 0; round < BLAKE2S_ROUNDS; round++) { + cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][0]], m[BLAKE2S_SIGMA[round][1]], 0, 4, 8, 12); + cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][2]], m[BLAKE2S_SIGMA[round][3]], 1, 5, 9, 13); + cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][4]], m[BLAKE2S_SIGMA[round][5]], 2, 6, 10, 14); + cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][6]], m[BLAKE2S_SIGMA[round][7]], 3, 7, 11, 15); + cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][8]], m[BLAKE2S_SIGMA[round][9]], 0, 5, 10, 15); + cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][10]], m[BLAKE2S_SIGMA[round][11]], 1, 6, 11, 12); + cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][12]], m[BLAKE2S_SIGMA[round][13]], 2, 7, 8, 13); + cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][14]], m[BLAKE2S_SIGMA[round][15]], 3, 4, 9, 14); + } + + for (int offset = 0; offset < BLAKE2S_CHAIN_SIZE; offset++) + ctx->chain[offset] = ctx->chain[offset] ^ ctx->state[offset] ^ ctx->state[offset + 8]; + } + + __device__ void cuda_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen) + { + memset(ctx, 0, sizeof(cuda_blake2s_ctx_t)); + ctx->keylen = keylen; + ctx->digestlen = digestbitlen >> 3; + ctx->pos = 0; + ctx->t0 = 0; + ctx->t1 = 0; + ctx->f0 = 0; + ctx->chain[0] = BLAKE2S_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000); + ctx->chain[1] = BLAKE2S_IVS[1]; + ctx->chain[2] = BLAKE2S_IVS[2]; + ctx->chain[3] = BLAKE2S_IVS[3]; + ctx->chain[4] = BLAKE2S_IVS[4]; + ctx->chain[5] = BLAKE2S_IVS[5]; + ctx->chain[6] = BLAKE2S_IVS[6]; + ctx->chain[7] = BLAKE2S_IVS[7]; + + if (keylen > 0) { + memcpy(ctx->buff, key, keylen); + memcpy(ctx->key, key, keylen); + } + ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0; + } + + __device__ void cuda_blake2s_update(cuda_blake2s_ctx_t* ctx, const BYTE* in, LONG inlen) + { + if (inlen == 0) return; + + WORD start = 0; + int64_t in_index = 0, block_index = 0; + + if (ctx->pos) { + start = BLAKE2S_BLOCK_LENGTH - ctx->pos; + if (start < inlen) { + memcpy(ctx->buff + ctx->pos, in, start); + ctx->t0 += BLAKE2S_BLOCK_LENGTH; + + if (ctx->t0 == 0) ctx->t1++; + + cuda_blake2s_compress(ctx, ctx->buff, 0); + ctx->pos = 0; + memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH); + } else { + memcpy(ctx->buff + ctx->pos, in, inlen); + ctx->pos += inlen; + return; + } + } + + block_index = inlen - BLAKE2S_BLOCK_LENGTH; + for (in_index = start; in_index < block_index; in_index += BLAKE2S_BLOCK_LENGTH) { + ctx->t0 += BLAKE2S_BLOCK_LENGTH; + if (ctx->t0 == 0) ctx->t1++; + + cuda_blake2s_compress(ctx, in, in_index); + } + + memcpy(ctx->buff, in + in_index, inlen - in_index); + ctx->pos += inlen - in_index; + } + + __device__ void cuda_blake2s_final(cuda_blake2s_ctx_t* ctx, BYTE* out) + { + ctx->f0 = 0xFFFFFFFFUL; + ctx->t0 += ctx->pos; + if (ctx->pos > 0 && ctx->t0 == 0) ctx->t1++; + + cuda_blake2s_compress(ctx, ctx->buff, 0); + memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH); + memset(ctx->state, 0, BLAKE2S_STATE_LENGTH); + + int i4 = 0; + for (int i = 0; i < BLAKE2S_CHAIN_SIZE && ((i4 = i * 4) < ctx->digestlen); i++) { + BYTE* BYTEs = (BYTE*)(&ctx->chain[i]); + if (i4 < ctx->digestlen - 4) + memcpy(out + i4, BYTEs, 4); + else + memcpy(out + i4, BYTEs, ctx->digestlen - i4); + } + } + + __global__ void + kernel_blake2s_hash(const BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE) + { + WORD thread = blockIdx.x * blockDim.x + threadIdx.x; + if (thread >= n_batch) { return; } + BYTE key[32] = ""; // Null key + WORD keylen = 0; + CUDA_BLAKE2S_CTX blake_ctx; + const BYTE* in = indata + thread * inlen; + BYTE* out = outdata + thread * BLAKE2S_BLOCK_SIZE; + + cuda_blake2s_init(&blake_ctx, key, keylen, (BLAKE2S_BLOCK_SIZE << 3)); + cuda_blake2s_update(&blake_ctx, in, inlen); + cuda_blake2s_final(&blake_ctx, out); + } + + extern "C" { + cudaError_t + cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch) + { + BYTE* cuda_indata; + BYTE* cuda_outdata; + const WORD BLAKE2S_BLOCK_SIZE = output_len; + cudaMalloc(&cuda_indata, inlen * n_batch); + cudaMalloc(&cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch); + assert(keylen <= 32); + + cudaMemcpy(cuda_indata, in, inlen * n_batch, cudaMemcpyHostToDevice); + + WORD thread = 256; + WORD block = (n_batch + thread - 1) / thread; + kernel_blake2s_hash<<>>(cuda_indata, inlen, cuda_outdata, n_batch, BLAKE2S_BLOCK_SIZE); + cudaMemcpy(out, cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + // cudaError_t error = cudaGetLastError(); + // if (error != cudaSuccess) { printf("Error cuda blake2s hash: %s \n", cudaGetErrorString(error)); } + cudaFree(cuda_indata); + cudaFree(cuda_outdata); + CHK_IF_RETURN(cudaPeekAtLastError()); + return CHK_LAST(); + } + } + + cudaError_t Blake2s::run_hash_many_kernel( + const BYTE* input, + BYTE* output, + WORD number_of_states, + WORD input_len, + WORD output_len, + const device_context::DeviceContext& ctx) const + { + const WORD BLAKE2S_BLOCK_SIZE = output_len; + WORD thread = 256; + WORD block = (number_of_states + thread - 1) / thread; + + kernel_blake2s_hash<<>>( + input, input_len, output, number_of_states, BLAKE2S_BLOCK_SIZE); + + CHK_IF_RETURN(cudaPeekAtLastError()); + return CHK_LAST(); + } + +} // namespace blake2s \ No newline at end of file diff --git a/icicle/src/hash/blake2s/expected_hashes.csv b/icicle/src/hash/blake2s/expected_hashes.csv new file mode 100644 index 000000000..afea56903 --- /dev/null +++ b/icicle/src/hash/blake2s/expected_hashes.csv @@ -0,0 +1,10 @@ +652e530edee5893b576f72b875ea1c918e85e29d859e7e3fa78b623d8abca3de +57e36dae300302953c953b59a1b263cb314326db44e919ca4acd57e1da8f0543 +9fbfc63ab34b8c35c58e9178c60e2bb165dde7340cb063e9567e4f6bef5eacbe +f26bbce62eaed81606f72000d95f0eea0eac23893f9db0c1a65af173c5095cb7 +9a21619cac392bd9c80725b2161b033efc0dad3b57d8a9b4c5103ed1cd065a38 +b2e524f32a0b8eeb72737da8ea0075c4e7e11936289954d571622f23e3df9076 +d78d4fe12e21ce58226c2707b86b167e237b24f1f84a5e39b073460998c5359d +0d74da2a1062445822cbc8ec7bf424714e09923b4c1eba0ca2170504f56c4331 +f5205d77b033111f1e15f585a86b7a4c292c0ec39addb3b2fcb0de4a0bf61003 +410381eb72313f23f9f62478d62ec7635f4166ab5e53a20af5c9e8f7ee445de8 \ No newline at end of file diff --git a/icicle/src/hash/blake2s/extern.cu b/icicle/src/hash/blake2s/extern.cu new file mode 100644 index 000000000..eb9120f78 --- /dev/null +++ b/icicle/src/hash/blake2s/extern.cu @@ -0,0 +1,30 @@ +#include "utils/utils.h" +#include "gpu-utils/error_handler.cuh" + +#include "hash/hash.cuh" + +#include "hash/blake2s/blake2s.cuh" +#include "blake2s.cu" +#include "../../merkle-tree/merkle.cu" +#include "merkle-tree/merkle.cuh" + +namespace blake2s { + extern "C" cudaError_t blake2s_cuda( + BYTE* input, BYTE* output, WORD number_of_blocks, WORD input_block_size, WORD output_block_size, HashConfig& config) + { + return Blake2s().hash_many(input, output, number_of_blocks, input_block_size, output_block_size, config); + } + + extern "C" cudaError_t build_blake2s_merkle_tree_cuda( + const BYTE* leaves, + BYTE* digests, + unsigned int height, + WORD input_block_len, + const merkle_tree::TreeBuilderConfig& tree_config) + { + Blake2s blake2s; + return merkle_tree::build_merkle_tree( + leaves, digests, height, input_block_len, blake2s, blake2s, tree_config); + } + +} // namespace blake2s \ No newline at end of file diff --git a/icicle/src/hash/blake2s/test_blake2s.cu b/icicle/src/hash/blake2s/test_blake2s.cu new file mode 100644 index 000000000..63e4e45ae --- /dev/null +++ b/icicle/src/hash/blake2s/test_blake2s.cu @@ -0,0 +1,111 @@ +#include +#include "gpu-utils/device_context.cuh" + +#include +#include +#include +#include +#include + +#include "hash/blake2s/blake2s.cuh" + +using namespace blake2s; + +#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); +#define END_TIMER(timer, msg) \ + printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + +// extern "C" { +// void cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD outlen, WORD n_batch); +// } + +void print_hash(BYTE* hash, WORD len) +{ + printf("%d \n", len); + for (WORD i = 0; i < len; i++) { + printf("%02x", hash[i]); + } + printf("\n"); +} + +BYTE* read_file(const char* filename, size_t* filesize) +{ + FILE* file = fopen(filename, "rb"); + if (!file) { + perror("Failed to open file"); + exit(EXIT_FAILURE); + } + + fseek(file, 0, SEEK_END); + *filesize = ftell(file); + fseek(file, 0, SEEK_SET); + + BYTE* buffer = (BYTE*)malloc(*filesize); + if (!buffer) { + perror("Failed to allocate memory"); + fclose(file); + exit(EXIT_FAILURE); + } + + size_t bytesRead = fread(buffer, 1, *filesize, file); + if (bytesRead != *filesize) { + perror("Failed to read file"); + free(buffer); + fclose(file); + exit(EXIT_FAILURE); + } + + fclose(file); + return buffer; +} + +int main(int argc, char** argv) +{ + using FpMilliseconds = std::chrono::duration; + using FpMicroseconds = std::chrono::duration; + + BYTE* input; + size_t inlen; + const char* input_filename; + const char* default_input = "aaaaaaaaaaa"; + + if (argc < 2) { + // Use default input if no file is provided + input = (BYTE*)default_input; + inlen = strlen(default_input); + } else { + input_filename = argv[1]; + input = read_file(input_filename, &inlen); + } + + // Test parameters + BYTE key[32] = ""; // Example key + WORD keylen = strlen((char*)key); + WORD n_outbit = 256; // Output length in bits + WORD n_batch = 1; // Number of hashes to compute in parallel + + // Allocate memory for the output + WORD outlen = n_outbit / 8; + BYTE* output = (BYTE*)malloc(outlen * n_batch); + if (!output) { + perror("Failed to allocate memory for output"); + if (argc >= 2) free(input); // Free file buffer if it was allocated + return EXIT_FAILURE; + } + + printf("Key len: %d \n", keylen); + + // Perform the hashing + START_TIMER(blake_timer) + cuda_blake2s_hash_batch(key, keylen, input, inlen, output, outlen, n_batch); + END_TIMER(blake_timer, "Blake Timer") + + // Print the result + printf("BLAKE2S hash:\n"); + print_hash(output, outlen); + + // Clean up + free(output); + if (argc >= 2) free(input); // Free file buffer if it was allocated + return 0; +} diff --git a/icicle/src/hash/blake2s/test_blake2s_batched.cu b/icicle/src/hash/blake2s/test_blake2s_batched.cu new file mode 100644 index 000000000..2be197075 --- /dev/null +++ b/icicle/src/hash/blake2s/test_blake2s_batched.cu @@ -0,0 +1,155 @@ +#include +#include +#include "gpu-utils/device_context.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include "extern.cu" + +using namespace blake2s; + +#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); +#define END_TIMER(timer, msg) \ + printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + +void print_hash(BYTE* hash, WORD len) +{ + for (WORD i = 0; i < len; i++) { + printf("%02x", hash[i]); + } + printf("\n"); +} + +// Function to trim leading and trailing whitespace from a string +std::string trim(const std::string& str) +{ + size_t first = str.find_first_not_of(' '); + size_t last = str.find_last_not_of(' '); + return str.substr(first, (last - first + 1)); +} + +std::unordered_map load_strings_and_hashes_from_csv(const char* filename) +{ + std::unordered_map string_hash_map; + std::ifstream file(filename); + if (!file.is_open()) { + perror("Failed to open CSV file"); + exit(EXIT_FAILURE); + } + + std::string line; + while (std::getline(file, line)) { + std::stringstream ss(line); + std::string input_string, hash_value; + if (std::getline(ss, input_string, ',') && std::getline(ss, hash_value, ',')) { + // Trim any whitespace around the strings + input_string = trim(input_string); + hash_value = trim(hash_value); + string_hash_map[input_string] = hash_value; + } + } + + file.close(); + return string_hash_map; +} + +int main(int argc, char** argv) +{ + using FpMilliseconds = std::chrono::duration; + using FpMicroseconds = std::chrono::duration; + + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + const char* csv_filename = argv[1]; + auto string_hash_map = load_strings_and_hashes_from_csv(csv_filename); + + if (string_hash_map.size() != 10) { + fprintf(stderr, "CSV file must contain exactly 10 strings and hashes.\n"); + return EXIT_FAILURE; + } + + // Prepare the test strings and expected hashes from the map + std::vector test_strings; + std::vector expected_hashes; + for (const auto& pair : string_hash_map) { + test_strings.push_back(pair.first); + expected_hashes.push_back(pair.second); + } + + // Test parameters + BYTE key[32] = ""; // Example key + WORD keylen = strlen((char*)key); + WORD n_outbit = 256; // Output length in bits + WORD n_batch = 10; // Number of different inputs to hash in parallel + size_t max_len = 10; // Max length of the test strings + + // Calculate total input length and allocate memory for the batched input + size_t total_len = 0; + for (const auto& str : test_strings) { + total_len += str.size(); + } + BYTE* batched_input = (BYTE*)malloc(total_len); + WORD* in_lengths = (WORD*)malloc(n_batch * sizeof(WORD)); + + // Copy test strings to batched input and store their lengths + BYTE* current_position = batched_input; + for (int i = 0; i < n_batch; ++i) { + memcpy(current_position, test_strings[i].c_str(), test_strings[i].size()); + current_position += test_strings[i].size(); + } + + // Allocate memory for the output + WORD outlen = n_outbit / 8; + BYTE* output = (BYTE*)malloc(outlen * n_batch); + if (!output) { + perror("Failed to allocate memory for output"); + free(batched_input); + free(in_lengths); + return EXIT_FAILURE; + } + + printf("Key len: %d \n", keylen); + HashConfig config = default_hash_config(); + + // Perform the hashing + START_TIMER(blake_timer) + // cuda_blake2s_hash_batch(key, keylen, batched_input, max_len, output, outlen, n_batch); + blake2s_cuda(batched_input, output, n_batch, max_len, outlen, config); + END_TIMER(blake_timer, "Blake Timer") + + // Print and compare the results + printf("BLAKE2S hash (batch size = %d):\n", n_batch); + for (WORD i = 0; i < n_batch; i++) { + printf("String: %s\n", test_strings[i].c_str()); + printf("Computed Hash %d: ", i + 1); + print_hash(output + i * outlen, outlen); + std::cout << "Expected Hash " << i + 1 << ": " << expected_hashes[i] << std::endl; + + std::string computed_hash; + for (WORD j = 0; j < outlen; ++j) { + char buffer[3]; + snprintf(buffer, sizeof(buffer), "%02x", output[i * outlen + j]); + computed_hash += buffer; + } + + if (computed_hash == expected_hashes[i]) { + printf(" (Match)\n"); + } else { + printf(" (Mismatch)\n"); + } + } + + // Clean up + free(output); + free(batched_input); + free(in_lengths); + return 0; +} diff --git a/icicle/src/hash/blake2s/test_blake2s_integ.cu b/icicle/src/hash/blake2s/test_blake2s_integ.cu new file mode 100644 index 000000000..4959adc94 --- /dev/null +++ b/icicle/src/hash/blake2s/test_blake2s_integ.cu @@ -0,0 +1,111 @@ +#include +#include "gpu-utils/device_context.cuh" + +#include +#include +#include +#include +#include +#include "extern.cu" + +using namespace blake2s; + +#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); +#define END_TIMER(timer, msg) \ + printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + +// extern "C" { +// void cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD n_outbit, WORD n_batch); +// } + +void print_hash(BYTE* hash, WORD len) +{ + printf("Hash Len: %d \n", len); + printf("BLAKE2S hash:\n"); + for (WORD i = 0; i < len; i++) { + printf("%02x", hash[i]); + } + printf("\n"); +} + +BYTE* read_file(const char* filename, size_t* filesize) +{ + FILE* file = fopen(filename, "rb"); + if (!file) { + perror("Failed to open file"); + exit(EXIT_FAILURE); + } + + fseek(file, 0, SEEK_END); + *filesize = ftell(file); + fseek(file, 0, SEEK_SET); + + BYTE* buffer = (BYTE*)malloc(*filesize); + if (!buffer) { + perror("Failed to allocate memory"); + fclose(file); + exit(EXIT_FAILURE); + } + + size_t bytesRead = fread(buffer, 1, *filesize, file); + if (bytesRead != *filesize) { + perror("Failed to read file"); + free(buffer); + fclose(file); + exit(EXIT_FAILURE); + } + + fclose(file); + return buffer; +} + +int main(int argc, char** argv) +{ + using FpMilliseconds = std::chrono::duration; + using FpMicroseconds = std::chrono::duration; + + BYTE* input; + size_t inlen; + const char* input_filename; + const char* default_input = "aaaaaaaaaaa"; + + if (argc < 2) { + // Use default input if no file is provided + input = (BYTE*)default_input; + inlen = strlen(default_input); + } else { + input_filename = argv[1]; + input = read_file(input_filename, &inlen); + } + + // Test parameters + BYTE key[32] = ""; // Example key + WORD keylen = strlen((char*)key); + WORD n_outbit = 256; // Output length in bits + WORD n_batch = 1; // Number of hashes to compute in parallel + // Allocate memory for the output + WORD outlen = n_outbit / 8; + BYTE* output = (BYTE*)malloc(outlen * n_batch); + if (!output) { + perror("Failed to allocate memory for output"); + if (argc >= 2) free(input); // Free file buffer if it was allocated + return EXIT_FAILURE; + } + + printf("Key len: %d \n", keylen); + + // Perform the hashing + START_TIMER(blake_timer) + HashConfig config = default_hash_config(); + + blake2s_cuda(input, output, n_batch, inlen, outlen, config); + END_TIMER(blake_timer, "Blake Timer") + + // Print the result + print_hash(output, outlen); + + // Clean up + free(output); + if (argc >= 2) free(input); // Free file buffer if it was allocated + return 0; +} diff --git a/icicle/src/hash/blake2s/test_blake2s_seq.cu b/icicle/src/hash/blake2s/test_blake2s_seq.cu new file mode 100644 index 000000000..1578d8d59 --- /dev/null +++ b/icicle/src/hash/blake2s/test_blake2s_seq.cu @@ -0,0 +1,104 @@ +#include +#include "gpu-utils/device_context.cuh" + +#include +#include +#include +#include +#include +#include "extern.cu" + +using namespace blake2s; + +#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); +#define END_TIMER(timer, msg) \ + printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + +void print_hash(BYTE* hash, WORD len) +{ + printf("Hash Len: %d \n", len); + printf("BLAKE2S hash:\n"); + for (WORD i = 0; i < len; i++) { + printf("%02x", hash[i]); + } + printf("\n"); +} + +std::string byte_to_hex(BYTE* data, WORD len) +{ + std::stringstream ss; + for (WORD i = 0; i < len; i++) { + ss << std::hex << std::setw(2) << std::setfill('0') << (int)data[i]; + } + return ss.str(); +} + +std::vector load_csv(const char* filename) +{ + std::vector hashes; + std::ifstream file(filename); + std::string line; + while (std::getline(file, line)) { + // Directly add the line as a hash, assuming one hash per line + hashes.push_back(line); + } + return hashes; +} + +int main(int argc, char** argv) +{ + using FpMilliseconds = std::chrono::duration; + using FpMicroseconds = std::chrono::duration; + + std::vector test_strings = {"0", "01", "012", "0123", "01234", + "012345", "0123456", "01234567", "012345678", "0123456789"}; + + const char* csv_filename = "expected_hashes.csv"; // Replace with your actual CSV file name + std::vector expected_hashes = load_csv(csv_filename); + assert(expected_hashes.size() == test_strings.size() && "Number of hashes in CSV must match number of test strings."); + std::cout << "Loaded hashes from CSV:" << std::endl; + // for (size_t i = 0; i < expected_hashes.size(); ++i) { + // std::cout << "Expected hash " << i << ": " << expected_hashes[i] << std::endl; + // } + + // Test parameters + WORD n_outbit = 256; // Output length in bits + WORD n_batch = 1; // Number of hashes to compute in parallel + + // Allocate memory for the output + WORD outlen = n_outbit / 8; + + // Perform the hashing + HashConfig config = default_hash_config(); + + for (size_t i = 0; i < test_strings.size(); i++) { + BYTE* output = (BYTE*)malloc(outlen * n_batch); + if (!output) { + perror("Failed to allocate memory for output"); + return EXIT_FAILURE; + } + + const std::string& input_str = test_strings[i]; + BYTE* input = (BYTE*)input_str.c_str(); + size_t inlen = input_str.size(); + + // Perform the hashing + START_TIMER(blake_timer) + blake2s_cuda(input, output, n_batch, inlen, outlen, config); + END_TIMER(blake_timer, "Blake Timer") + // Convert the output to hex string + std::string computed_hash = byte_to_hex(output, outlen); + // Compare with the expected hash + + if (computed_hash == expected_hashes[i]) { + std::cout << "Test " << i << " passed." << std::endl; + } else { + std::cout << "Test " << i << " failed." << std::endl; + std::cout << "Expected: " << expected_hashes[i] << std::endl; + std::cout << "Got: " << computed_hash << std::endl; + } + free(output); + } + + return 0; +} diff --git a/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu b/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu new file mode 100644 index 000000000..7cfdf2537 --- /dev/null +++ b/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu @@ -0,0 +1,113 @@ +#include +#include "gpu-utils/device_context.cuh" + +#include +#include +#include +#include +#include +#include "extern.cu" +#include "hash/blake2s/blake2s.cuh" + +using namespace blake2s; + +#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); +#define END_TIMER(timer, msg) \ + printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + +// extern "C" { +// void cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD n_outbit, WORD n_batch); +// } + +void print_hash(BYTE* hash, WORD len) +{ + printf("Hash Len: %d \n", len); + printf("BLAKE2S hash:\n"); + for (WORD i = 0; i < len; i++) { + printf("%02x", hash[i]); + } + printf("\n"); +} + +std::string byte_to_hex(BYTE* data, WORD len) +{ + std::stringstream ss; + for (WORD i = 0; i < len; i++) { + ss << std::hex << std::setw(2) << std::setfill('0') << (int)data[i]; + } + return ss.str(); +} + +std::vector load_csv(const char* filename) +{ + std::vector hashes; + std::ifstream file(filename); + std::string line; + while (std::getline(file, line)) { + // Directly add the line as a hash, assuming one hash per line + hashes.push_back(line); + } + return hashes; +} + +int main(int argc, char** argv) +{ + using FpMilliseconds = std::chrono::duration; + using FpMicroseconds = std::chrono::duration; + + std::vector test_strings = {"0", "01", "012", "0123", "01234", + "012345", "0123456", "01234567", "012345678", "0123456789"}; + + const char* csv_filename = "expected_hashes.csv"; // Replace with your actual CSV file name + std::vector expected_hashes = load_csv(csv_filename); + assert(expected_hashes.size() == test_strings.size() && "Number of hashes in CSV must match number of test strings."); + std::cout << "Loaded hashes from CSV:" << std::endl; + // for (size_t i = 0; i < expected_hashes.size(); ++i) { + // std::cout << "Expected hash " << i << ": " << expected_hashes[i] << std::endl; + // } + + // Test parameters + WORD n_outbit = 256; // Output length in bits + WORD n_batch = 1; // Number of hashes to compute in parallel + + // Test parameters + BYTE key[32] = ""; // Example key + WORD keylen = strlen((char*)key); + + // Allocate memory for the output + WORD outlen = n_outbit / 8; + + // Perform the hashing + HashConfig config = default_hash_config(); + + for (size_t i = 0; i < test_strings.size(); i++) { + BYTE* output = (BYTE*)malloc(outlen * n_batch); + if (!output) { + perror("Failed to allocate memory for output"); + return EXIT_FAILURE; + } + + const std::string& input_str = test_strings[i]; + BYTE* input = (BYTE*)input_str.c_str(); + size_t inlen = input_str.size(); + + // Perform the hashing + START_TIMER(blake_timer) + cuda_blake2s_hash_batch(key, keylen, input, inlen, output, outlen, n_batch); + END_TIMER(blake_timer, "Blake Timer") + // Convert the output to hex string + std::string computed_hash = byte_to_hex(output, outlen); + // Compare with the expected hash + + if (computed_hash == expected_hashes[i]) { + std::cout << "Test " << i << " passed." << std::endl; + } else { + std::cout << "Test " << i << " failed." << std::endl; + std::cout << "Expected: " << expected_hashes[i] << std::endl; + std::cout << "Got: " << computed_hash << std::endl; + } + free(output); + } + + return 0; +} diff --git a/icicle/src/hash/blake2s/test_tree.cu b/icicle/src/hash/blake2s/test_tree.cu new file mode 100644 index 000000000..85145ccf0 --- /dev/null +++ b/icicle/src/hash/blake2s/test_tree.cu @@ -0,0 +1,95 @@ +#include "gpu-utils/device_context.cuh" +#include "merkle-tree/merkle.cuh" +#include "extern.cu" + +#ifndef __CUDA_ARCH__ +#include +#include +#include +#include +#include + +using namespace blake2s; + +#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); +#define END_TIMER(timer, msg) \ + printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + +void uint8_to_hex_string(const uint8_t* values, int size) +{ + std::stringstream ss; + + for (int i = 0; i < size; ++i) { + ss << std::hex << std::setw(2) << std::setfill('0') << (int)values[i]; + } + + std::string hexString = ss.str(); + std::cout << hexString << std::endl; +} + +#define A 2 + +int main(int argc, char* argv[]) +{ + using FpMilliseconds = std::chrono::duration; + using FpMicroseconds = std::chrono::duration; + + /// Tree of height N and arity A contains \sum{A^i} for i in 0..N-1 elements + uint32_t input_block_len = 64; + uint32_t tree_height = argc > 1 ? atoi(argv[1]) : 1; + uint32_t number_of_leaves = pow(A, tree_height); + uint32_t total_number_of_leaves = number_of_leaves * input_block_len; + + /// Use keep_rows to specify how many rows do you want to store + int keep_rows = argc > 2 ? atoi(argv[2]) : 2; + size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, A, 1); + + /// Fill leaves with scalars [0, 1, ... 2^tree_height - 1] + START_TIMER(timer_allocation); + uint8_t input = 0; + uint8_t* leaves = static_cast(malloc(total_number_of_leaves)); + for (uint64_t i = 0; i < total_number_of_leaves; i++) { + leaves[i] = (uint8_t)i; + } + END_TIMER(timer_allocation, "Allocated memory for leaves: "); + + /// Allocate memory for digests of {keep_rows} rows of a tree + START_TIMER(timer_digests); + size_t digests_mem = digests_len * sizeof(BYTE) * 64; + BYTE* digests = static_cast(malloc(digests_mem)); + END_TIMER(timer_digests, "Allocated memory for digests"); + + std::cout << "Memory for leaves = " << total_number_of_leaves / 1024 / 1024 << " MB; " + << total_number_of_leaves / 1024 / 1024 / 1024 << " GB" << std::endl; + std::cout << "Number of leaves = " << number_of_leaves << std::endl; + std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl; + std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024 + << " GB" << std::endl; + std::cout << "Number of digest elements = " << digests_len << std::endl; + + std::cout << "Total RAM consumption = " << (digests_mem + total_number_of_leaves) / 1024 / 1024 << " MB; " + << (digests_mem + total_number_of_leaves) / 1024 / 1024 / 1024 << " GB" << std::endl; + + merkle_tree::TreeBuilderConfig config = merkle_tree::default_merkle_config(); + config.arity = A; + config.keep_rows = keep_rows; + config.digest_elements = 32; + START_TIMER(blake2s_timer); + build_blake2s_merkle_tree_cuda(leaves, digests, tree_height, input_block_len, config); + END_TIMER(blake2s_timer, "blake2s") + + for (int i = 0; i < digests_len * 32; i++) { + WORD root = digests[i]; + + // Print the current element in hexadecimal format + printf("%02x", root); + + // After every 32 elements, print a newline to start a new row + if ((i + 1) % 32 == 0) { printf("\n"); } + } + + free(digests); + free(leaves); +} + +#endif \ No newline at end of file diff --git a/icicle/src/merkle-tree/merkle.cu b/icicle/src/merkle-tree/merkle.cu index 2fe171634..ce96daf09 100644 --- a/icicle/src/merkle-tree/merkle.cu +++ b/icicle/src/merkle-tree/merkle.cu @@ -163,11 +163,6 @@ namespace merkle_tree { CHK_INIT_IF_RETURN(); cudaStream_t& stream = tree_config.ctx.stream; - if (compression.preimage_max_length < tree_config.arity * tree_config.digest_elements) - THROW_ICICLE_ERR( - IcicleError_t::InvalidArgument, - "Hash max preimage length does not match merkle tree arity multiplied by digest elements"); - uint64_t number_of_bottom_layer_states = pow(tree_config.arity, height); // This will determine how much splitting do we need to do diff --git a/wrappers/rust/icicle-hash/src/blake2s/mod.rs b/wrappers/rust/icicle-hash/src/blake2s/mod.rs new file mode 100644 index 000000000..6d5de4dc1 --- /dev/null +++ b/wrappers/rust/icicle-hash/src/blake2s/mod.rs @@ -0,0 +1,71 @@ +use icicle_core::hash::HashConfig; +use icicle_core::tree::TreeBuilderConfig; +use icicle_cuda_runtime::error::CudaError; +use icicle_cuda_runtime::memory::HostOrDeviceSlice; + +use icicle_core::error::IcicleResult; +use icicle_core::traits::IcicleResultWrap; + +pub mod tests; + +extern "C" { + pub(crate) fn blake2s_cuda( + input: *const u8, + output: *mut u8, + number_of_blocks: u32, + input_block_size: u32, + output_block_size: u32, + config: &HashConfig, + ) -> CudaError; + + pub(crate) fn build_blake2s_merkle_tree_cuda( + leaves: *const u8, + digests: *mut u64, + height: u32, + input_block_len: u32, + tree_config: &TreeBuilderConfig, + ) -> CudaError; +} + +pub fn blake2s( + input: &(impl HostOrDeviceSlice + ?Sized), + input_block_size: u32, + number_of_blocks: u32, + output_block_size: u32, + output: &mut (impl HostOrDeviceSlice + ?Sized), + config: &HashConfig, +) -> IcicleResult<()> { + let mut local_cfg = config.clone(); + local_cfg.are_inputs_on_device = input.is_on_device(); + local_cfg.are_outputs_on_device = output.is_on_device(); + unsafe { + blake2s_cuda( + input.as_ptr(), + output.as_mut_ptr(), + number_of_blocks, + input_block_size, + output_block_size, + &local_cfg, + ) + .wrap() + } +} + +pub fn build_blake2s_merkle_tree( + leaves: &(impl HostOrDeviceSlice + ?Sized), + digests: &mut (impl HostOrDeviceSlice + ?Sized), + height: usize, + input_block_len: usize, + config: &TreeBuilderConfig, +) -> IcicleResult<()> { + unsafe { + build_blake2s_merkle_tree_cuda( + leaves.as_ptr(), + digests.as_mut_ptr(), + height as u32, + input_block_len as u32, + config, + ) + .wrap() + } +} diff --git a/wrappers/rust/icicle-hash/src/blake2s/tests.rs b/wrappers/rust/icicle-hash/src/blake2s/tests.rs new file mode 100644 index 000000000..2eb929f44 --- /dev/null +++ b/wrappers/rust/icicle-hash/src/blake2s/tests.rs @@ -0,0 +1,54 @@ +#[cfg(test)] +pub(crate) mod tests { + use icicle_core::{ + hash::HashConfig, + tree::{merkle_tree_digests_len, TreeBuilderConfig}, + }; + use icicle_cuda_runtime::memory::HostSlice; + + use crate::blake2s::{blake2s, build_blake2s_merkle_tree}; + + #[test] + fn single_hash_test() { + let config = HashConfig::default(); + + let preimages = b"a"; + let mut digests = vec![0u8; 1 * 32]; + + let preimages_slice = HostSlice::from_slice(preimages); + let digests_slice = HostSlice::from_mut_slice(&mut digests); + + blake2s( + preimages_slice, + 1 as u32, + 1 as u32, + 32 as u32, + digests_slice, + &config, + ) + .unwrap(); + + let hex_string: String = digests_slice.iter().map(|byte| format!("{:02x}", byte)).collect(); + + assert_eq!( + hex_string, + "4a0d129873403037c2cd9b9048203687f6233fb6738956e0349bd4320fec3e90" + ); + } + + #[test] + fn blake2s_merkle_tree_test() { + let mut config = TreeBuilderConfig::default(); + config.arity = 2; + let height = 22; + let input_block_len = 136; + let leaves = vec![1u8; (1 << height) * input_block_len]; + let mut digests = vec![0u64; merkle_tree_digests_len((height + 1) as u32, 2, 1)]; + + let leaves_slice = HostSlice::from_slice(&leaves); + let digests_slice = HostSlice::from_mut_slice(&mut digests); + + build_blake2s_merkle_tree(leaves_slice, digests_slice, height, input_block_len, &config).unwrap(); + println!("Root: {:?}", digests_slice[0]); + } +} diff --git a/wrappers/rust/icicle-hash/src/lib.rs b/wrappers/rust/icicle-hash/src/lib.rs index ebcb6d4dd..cf61bbd4b 100644 --- a/wrappers/rust/icicle-hash/src/lib.rs +++ b/wrappers/rust/icicle-hash/src/lib.rs @@ -1 +1,2 @@ +pub mod blake2s; pub mod keccak;