diff --git a/icicle/include/api/hash.h b/icicle/include/api/hash.h
index ffa40d812..7f41a1a00 100644
--- a/icicle/include/api/hash.h
+++ b/icicle/include/api/hash.h
@@ -6,8 +6,11 @@
 #include <cuda_runtime.h>
 #include "gpu-utils/device_context.cuh"
 #include "hash/keccak/keccak.cuh"
+#include "hash/blake2s/blake2s.cuh"
 #include "merkle-tree/merkle.cuh"
 
+/* KECCAK */
+
 extern "C" cudaError_t
   keccak256_cuda(uint8_t* input, int input_block_size, int number_of_blocks, uint8_t* output, keccak::HashConfig& config);
 
@@ -27,4 +30,16 @@ extern "C" cudaError_t build_keccak512_merkle_tree_cuda(
   unsigned int height,
   unsigned int input_block_len,
   const merkle_tree::TreeBuilderConfig& tree_config);
+
+/* BLAKE2S */
+
+extern "C" cudaError_t blake2s_cuda(
+  BYTE* input, BYTE* output, WORD number_of_blocks, WORD input_block_size, WORD output_block_size, blake2s::HashConfig& config)
+
+extern "C" cudaError_t build_blake2s_merkle_tree_cuda(
+  const uint8_t* leaves,
+  uint64_t* digests,
+  unsigned int height,
+  unsigned int input_block_len,
+  const merkle_tree::TreeBuilderConfig& tree_config);
 #endif
\ No newline at end of file
diff --git a/icicle/include/hash/blake2s/blake2s.cuh b/icicle/include/hash/blake2s/blake2s.cuh
new file mode 100644
index 000000000..5988f38f5
--- /dev/null
+++ b/icicle/include/hash/blake2s/blake2s.cuh
@@ -0,0 +1,53 @@
+/*
+ * blake2b.cuh CUDA Implementation of BLAKE2B Hashing
+ *
+ * Date: 12 June 2019
+ * Revision: 1
+ *
+ * This file is released into the Public Domain.
+ */
+
+#pragma once
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+
+#include "hash/hash.cuh"
+using namespace hash;
+
+namespace blake2s {
+
+  typedef unsigned char BYTE;
+  typedef unsigned int WORD;
+  typedef unsigned long long LONG;
+
+#define BLAKE2S_ROUNDS       10
+#define BLAKE2S_BLOCK_LENGTH 64
+#define BLAKE2S_CHAIN_SIZE   8
+#define BLAKE2S_CHAIN_LENGTH (BLAKE2S_CHAIN_SIZE * sizeof(uint32_t))
+#define BLAKE2S_STATE_SIZE   16
+#define BLAKE2S_STATE_LENGTH (BLAKE2S_STATE_SIZE * sizeof(uint32_t))
+
+  class Blake2s : public Hasher<BYTE, BYTE>
+  {
+  public:
+    cudaError_t run_hash_many_kernel(
+      const BYTE* input,
+      BYTE* output,
+      WORD number_of_states,
+      WORD input_len,
+      WORD output_len,
+      const device_context::DeviceContext& ctx) const override;
+
+    Blake2s() : Hasher<BYTE, BYTE>(BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, BLAKE2S_STATE_SIZE, 0) {}
+  };
+
+  extern "C" {
+  cudaError_t
+  cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch);
+  }
+} // namespace blake2s
\ No newline at end of file
diff --git a/icicle/src/hash/CMakeLists.txt b/icicle/src/hash/CMakeLists.txt
index 3f0d532c7..275afb4b4 100644
--- a/icicle/src/hash/CMakeLists.txt
+++ b/icicle/src/hash/CMakeLists.txt
@@ -1,5 +1,10 @@
 set(TARGET icicle_hash)
 
-add_library(${TARGET} STATIC keccak/extern.cu)
+set(SRC ${CMAKE_SOURCE_DIR}/src/hash)
+
+set(HASH_SOURCE ${SRC}/keccak/extern.cu)
+list(APPEND HASH_SOURCE ${SRC}/blake2s/extern.cu)
+
+add_library(${TARGET} STATIC ${HASH_SOURCE})
 target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_hash")
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/Makefile b/icicle/src/hash/blake2s/Makefile
new file mode 100644
index 000000000..4d13a3733
--- /dev/null
+++ b/icicle/src/hash/blake2s/Makefile
@@ -0,0 +1,26 @@
+test_blake2s: test_blake2s.cu blake2s.cu 
+	nvcc -o test_blake2s -I. -I../../../include test_blake2s.cu blake2s.cu -g
+	./test_blake2s
+
+test_blake2s_batched: test_blake2s_batched.cu blake2s.cu 
+	nvcc -o test_blake2s_batched -I. -I../../../include test_blake2s_batched.cu -g
+	./test_blake2s_batched ./batched_test_vectors.csv
+	
+test_blake2s_integ: test_blake2s_integ.cu blake2s.cu 
+	nvcc -o test_blake2s_integ -I. -I../../../include test_blake2s_integ.cu -g
+	./test_blake2s_integ
+
+test_blake2s_seq: test_blake2s_seq.cu blake2s.cu 
+	nvcc -o test_blake2s_seq -I. -I../../../include test_blake2s_seq.cu -g
+	./test_blake2s_seq
+
+test_blake2s_seq_sa: test_blake2s_seq_sa.cu blake2s.cu 
+	nvcc -o test_blake2s_seq_sa -I. -I../../../include test_blake2s_seq_sa.cu -g
+	./test_blake2s_seq_sa
+
+test_blake2s_tree: test_tree.cu blake2s.cu ../../merkle-tree/merkle.cu
+	nvcc -DMERKLE_DEBUG -o test_blake2s_tree -I../../../include test_tree.cu
+	./test_blake2s_tree
+
+clear:
+	rm test_blake2s test_blake2s_tree test_blake2s_integ test_blake2s_seq test_blake2s_seq_sa test_blake2s_batched
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/batched_test_vectors.csv b/icicle/src/hash/blake2s/batched_test_vectors.csv
new file mode 100644
index 000000000..ace3a954a
--- /dev/null
+++ b/icicle/src/hash/blake2s/batched_test_vectors.csv
@@ -0,0 +1,10 @@
+9301876542, 1e95b837356f67e9b456d636dd3d3f55bfff01eb78b375f613db5819f27e5972
+5279608431, 000386abd221e7049e78091c4b964719dd45fda6a70ad88194c8ecc7fb5ec4bc
+1842976503, b35ce7baa5c5c620be9bf0d03a7a5b43fd18e786e1678ad797a30b50dc48ccc7
+6498302715, 489412fcb1a74c14dbe06aaad61cf2d3ed0eaa6a6154afc9f8b58fe92ffcebf1
+7023598146, 6b9a45147bd1c61f8d1d3d110cb705ae85ddc31ac7cb18e47306bc51d4d807ba
+3150729846, 5d9d597b956a26fd79cd8bdf38e306db068b6089b305268b90fd1a304a5b2224
+9583402167, 584cd56b727e14ccc7fcaf406982faab08529b6789748c9ffc74748b033cf44f
+8760134295, 2cba2adb552cc89312c614c3d720edaa5cf03bc5fc2a012511cfee013636ee40
+5402987631, d6d4920f85f5e286f3add452fab5b19f31e66293ec612b29389643f78b4ace2a
+2739810546, 69aee09804f37a477b34f9a4447b39e9caaae49bbc5056b8018dd513c8cec263
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/blake2s.cu b/icicle/src/hash/blake2s/blake2s.cu
new file mode 100644
index 000000000..f14fec520
--- /dev/null
+++ b/icicle/src/hash/blake2s/blake2s.cu
@@ -0,0 +1,269 @@
+#include <assert.h>
+#include "gpu-utils/device_context.cuh"
+#include "gpu-utils/error_handler.cuh"
+#include "gpu-utils/modifiers.cuh"
+#include "hash/hash.cuh"
+
+#include "hash/blake2s/blake2s.cuh"
+
+using namespace hash;
+
+namespace blake2s {
+
+  typedef struct {
+    WORD digestlen;
+    BYTE key[32];
+    WORD keylen;
+    BYTE buff[BLAKE2S_BLOCK_LENGTH];
+    uint32_t chain[BLAKE2S_CHAIN_SIZE];
+    uint32_t state[BLAKE2S_STATE_SIZE];
+    WORD pos;
+    uint32_t t0;
+    uint32_t t1;
+    uint32_t f0;
+  } cuda_blake2s_ctx_t;
+
+  typedef cuda_blake2s_ctx_t CUDA_BLAKE2S_CTX;
+
+  __constant__ CUDA_BLAKE2S_CTX c_CTX;
+
+  __constant__ uint32_t BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+                                          0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
+
+  const uint32_t CPU_BLAKE2S_IVS[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+                                       0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
+
+  void cpu_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen)
+  {
+    memset(ctx, 0, sizeof(cuda_blake2s_ctx_t));
+    if (keylen > 0) {
+      memcpy(ctx->buff, key, keylen);
+      memcpy(ctx->key, key, keylen);
+    }
+    ctx->keylen = keylen;
+    ctx->digestlen = digestbitlen >> 3;
+    ctx->pos = 0;
+    ctx->t0 = 0;
+    ctx->t1 = 0;
+    ctx->f0 = 0;
+    ctx->chain[0] = CPU_BLAKE2S_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
+    ctx->chain[1] = CPU_BLAKE2S_IVS[1];
+    ctx->chain[2] = CPU_BLAKE2S_IVS[2];
+    ctx->chain[3] = CPU_BLAKE2S_IVS[3];
+    ctx->chain[4] = CPU_BLAKE2S_IVS[4];
+    ctx->chain[5] = CPU_BLAKE2S_IVS[5];
+    ctx->chain[6] = CPU_BLAKE2S_IVS[6];
+    ctx->chain[7] = CPU_BLAKE2S_IVS[7];
+
+    ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
+  }
+
+  __constant__ uint8_t BLAKE2S_SIGMA[10][16] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+    {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+    {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+    {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+    {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}};
+
+  __device__ uint32_t cuda_blake2s_leuint32(const BYTE* in)
+  {
+    uint32_t a;
+    memcpy(&a, in, 4);
+    return a;
+  }
+
+  __inline__ __device__ uint32_t cuda_blake2s_ROTR32(uint32_t a, uint8_t b) { return (a >> b) | (a << (32 - b)); }
+
+  __device__ void
+  cuda_blake2s_G(cuda_blake2s_ctx_t* ctx, uint32_t m1, uint32_t m2, int32_t a, int32_t b, int32_t c, int32_t d)
+  {
+    ctx->state[a] = ctx->state[a] + ctx->state[b] + m1;
+    ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 16);
+    ctx->state[c] = ctx->state[c] + ctx->state[d];
+    ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 12);
+    ctx->state[a] = ctx->state[a] + ctx->state[b] + m2;
+    ctx->state[d] = cuda_blake2s_ROTR32(ctx->state[d] ^ ctx->state[a], 8);
+    ctx->state[c] = ctx->state[c] + ctx->state[d];
+    ctx->state[b] = cuda_blake2s_ROTR32(ctx->state[b] ^ ctx->state[c], 7);
+  }
+
+  __device__ __forceinline__ void cuda_blake2s_init_state(cuda_blake2s_ctx_t* ctx)
+  {
+    memcpy(ctx->state, ctx->chain, BLAKE2S_CHAIN_LENGTH);
+    ctx->state[8] = BLAKE2S_IVS[0];
+    ctx->state[9] = BLAKE2S_IVS[1];
+    ctx->state[10] = BLAKE2S_IVS[2];
+    ctx->state[11] = BLAKE2S_IVS[3];
+    ctx->state[12] = ctx->t0 ^ BLAKE2S_IVS[4];
+    ctx->state[13] = ctx->t1 ^ BLAKE2S_IVS[5];
+    ctx->state[14] = ctx->f0 ^ BLAKE2S_IVS[6];
+    ctx->state[15] = BLAKE2S_IVS[7];
+  }
+
+  __device__ __forceinline__ void cuda_blake2s_compress(cuda_blake2s_ctx_t* ctx, const BYTE* in, WORD inoffset)
+  {
+    cuda_blake2s_init_state(ctx);
+    uint32_t m[16] = {0};
+    for (int j = 0; j < 16; j++)
+      m[j] = cuda_blake2s_leuint32(in + inoffset + (j << 2));
+
+    for (int round = 0; round < BLAKE2S_ROUNDS; round++) {
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][0]], m[BLAKE2S_SIGMA[round][1]], 0, 4, 8, 12);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][2]], m[BLAKE2S_SIGMA[round][3]], 1, 5, 9, 13);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][4]], m[BLAKE2S_SIGMA[round][5]], 2, 6, 10, 14);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][6]], m[BLAKE2S_SIGMA[round][7]], 3, 7, 11, 15);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][8]], m[BLAKE2S_SIGMA[round][9]], 0, 5, 10, 15);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][10]], m[BLAKE2S_SIGMA[round][11]], 1, 6, 11, 12);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][12]], m[BLAKE2S_SIGMA[round][13]], 2, 7, 8, 13);
+      cuda_blake2s_G(ctx, m[BLAKE2S_SIGMA[round][14]], m[BLAKE2S_SIGMA[round][15]], 3, 4, 9, 14);
+    }
+
+    for (int offset = 0; offset < BLAKE2S_CHAIN_SIZE; offset++)
+      ctx->chain[offset] = ctx->chain[offset] ^ ctx->state[offset] ^ ctx->state[offset + 8];
+  }
+
+  __device__ void cuda_blake2s_init(cuda_blake2s_ctx_t* ctx, BYTE* key, WORD keylen, WORD digestbitlen)
+  {
+    memset(ctx, 0, sizeof(cuda_blake2s_ctx_t));
+    ctx->keylen = keylen;
+    ctx->digestlen = digestbitlen >> 3;
+    ctx->pos = 0;
+    ctx->t0 = 0;
+    ctx->t1 = 0;
+    ctx->f0 = 0;
+    ctx->chain[0] = BLAKE2S_IVS[0] ^ (ctx->digestlen | (ctx->keylen << 8) | 0x1010000);
+    ctx->chain[1] = BLAKE2S_IVS[1];
+    ctx->chain[2] = BLAKE2S_IVS[2];
+    ctx->chain[3] = BLAKE2S_IVS[3];
+    ctx->chain[4] = BLAKE2S_IVS[4];
+    ctx->chain[5] = BLAKE2S_IVS[5];
+    ctx->chain[6] = BLAKE2S_IVS[6];
+    ctx->chain[7] = BLAKE2S_IVS[7];
+
+    if (keylen > 0) {
+      memcpy(ctx->buff, key, keylen);
+      memcpy(ctx->key, key, keylen);
+    }
+    ctx->pos = (keylen > 0) ? BLAKE2S_BLOCK_LENGTH : 0;
+  }
+
+  __device__ void cuda_blake2s_update(cuda_blake2s_ctx_t* ctx, const BYTE* in, LONG inlen)
+  {
+    if (inlen == 0) return;
+
+    WORD start = 0;
+    int64_t in_index = 0, block_index = 0;
+
+    if (ctx->pos) {
+      start = BLAKE2S_BLOCK_LENGTH - ctx->pos;
+      if (start < inlen) {
+        memcpy(ctx->buff + ctx->pos, in, start);
+        ctx->t0 += BLAKE2S_BLOCK_LENGTH;
+
+        if (ctx->t0 == 0) ctx->t1++;
+
+        cuda_blake2s_compress(ctx, ctx->buff, 0);
+        ctx->pos = 0;
+        memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
+      } else {
+        memcpy(ctx->buff + ctx->pos, in, inlen);
+        ctx->pos += inlen;
+        return;
+      }
+    }
+
+    block_index = inlen - BLAKE2S_BLOCK_LENGTH;
+    for (in_index = start; in_index < block_index; in_index += BLAKE2S_BLOCK_LENGTH) {
+      ctx->t0 += BLAKE2S_BLOCK_LENGTH;
+      if (ctx->t0 == 0) ctx->t1++;
+
+      cuda_blake2s_compress(ctx, in, in_index);
+    }
+
+    memcpy(ctx->buff, in + in_index, inlen - in_index);
+    ctx->pos += inlen - in_index;
+  }
+
+  __device__ void cuda_blake2s_final(cuda_blake2s_ctx_t* ctx, BYTE* out)
+  {
+    ctx->f0 = 0xFFFFFFFFUL;
+    ctx->t0 += ctx->pos;
+    if (ctx->pos > 0 && ctx->t0 == 0) ctx->t1++;
+
+    cuda_blake2s_compress(ctx, ctx->buff, 0);
+    memset(ctx->buff, 0, BLAKE2S_BLOCK_LENGTH);
+    memset(ctx->state, 0, BLAKE2S_STATE_LENGTH);
+
+    int i4 = 0;
+    for (int i = 0; i < BLAKE2S_CHAIN_SIZE && ((i4 = i * 4) < ctx->digestlen); i++) {
+      BYTE* BYTEs = (BYTE*)(&ctx->chain[i]);
+      if (i4 < ctx->digestlen - 4)
+        memcpy(out + i4, BYTEs, 4);
+      else
+        memcpy(out + i4, BYTEs, ctx->digestlen - i4);
+    }
+  }
+
+  __global__ void
+  kernel_blake2s_hash(const BYTE* indata, WORD inlen, BYTE* outdata, WORD n_batch, WORD BLAKE2S_BLOCK_SIZE)
+  {
+    WORD thread = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread >= n_batch) { return; }
+    BYTE key[32] = ""; // Null key
+    WORD keylen = 0;
+    CUDA_BLAKE2S_CTX blake_ctx;
+    const BYTE* in = indata + thread * inlen;
+    BYTE* out = outdata + thread * BLAKE2S_BLOCK_SIZE;
+
+    cuda_blake2s_init(&blake_ctx, key, keylen, (BLAKE2S_BLOCK_SIZE << 3));
+    cuda_blake2s_update(&blake_ctx, in, inlen);
+    cuda_blake2s_final(&blake_ctx, out);
+  }
+
+  extern "C" {
+  cudaError_t
+  cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD output_len, WORD n_batch)
+  {
+    BYTE* cuda_indata;
+    BYTE* cuda_outdata;
+    const WORD BLAKE2S_BLOCK_SIZE = output_len;
+    cudaMalloc(&cuda_indata, inlen * n_batch);
+    cudaMalloc(&cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch);
+    assert(keylen <= 32);
+
+    cudaMemcpy(cuda_indata, in, inlen * n_batch, cudaMemcpyHostToDevice);
+
+    WORD thread = 256;
+    WORD block = (n_batch + thread - 1) / thread;
+    kernel_blake2s_hash<<<block, thread>>>(cuda_indata, inlen, cuda_outdata, n_batch, BLAKE2S_BLOCK_SIZE);
+    cudaMemcpy(out, cuda_outdata, BLAKE2S_BLOCK_SIZE * n_batch, cudaMemcpyDeviceToHost);
+    cudaDeviceSynchronize();
+    // cudaError_t error = cudaGetLastError();
+    // if (error != cudaSuccess) { printf("Error cuda blake2s hash: %s \n", cudaGetErrorString(error)); }
+    cudaFree(cuda_indata);
+    cudaFree(cuda_outdata);
+    CHK_IF_RETURN(cudaPeekAtLastError());
+    return CHK_LAST();
+  }
+  }
+
+  cudaError_t Blake2s::run_hash_many_kernel(
+    const BYTE* input,
+    BYTE* output,
+    WORD number_of_states,
+    WORD input_len,
+    WORD output_len,
+    const device_context::DeviceContext& ctx) const
+  {
+    const WORD BLAKE2S_BLOCK_SIZE = output_len;
+    WORD thread = 256;
+    WORD block = (number_of_states + thread - 1) / thread;
+
+    kernel_blake2s_hash<<<block, thread, 0, ctx.stream>>>(
+      input, input_len, output, number_of_states, BLAKE2S_BLOCK_SIZE);
+
+    CHK_IF_RETURN(cudaPeekAtLastError());
+    return CHK_LAST();
+  }
+
+} // namespace blake2s
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/expected_hashes.csv b/icicle/src/hash/blake2s/expected_hashes.csv
new file mode 100644
index 000000000..afea56903
--- /dev/null
+++ b/icicle/src/hash/blake2s/expected_hashes.csv
@@ -0,0 +1,10 @@
+652e530edee5893b576f72b875ea1c918e85e29d859e7e3fa78b623d8abca3de
+57e36dae300302953c953b59a1b263cb314326db44e919ca4acd57e1da8f0543
+9fbfc63ab34b8c35c58e9178c60e2bb165dde7340cb063e9567e4f6bef5eacbe
+f26bbce62eaed81606f72000d95f0eea0eac23893f9db0c1a65af173c5095cb7
+9a21619cac392bd9c80725b2161b033efc0dad3b57d8a9b4c5103ed1cd065a38
+b2e524f32a0b8eeb72737da8ea0075c4e7e11936289954d571622f23e3df9076
+d78d4fe12e21ce58226c2707b86b167e237b24f1f84a5e39b073460998c5359d
+0d74da2a1062445822cbc8ec7bf424714e09923b4c1eba0ca2170504f56c4331
+f5205d77b033111f1e15f585a86b7a4c292c0ec39addb3b2fcb0de4a0bf61003
+410381eb72313f23f9f62478d62ec7635f4166ab5e53a20af5c9e8f7ee445de8
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/extern.cu b/icicle/src/hash/blake2s/extern.cu
new file mode 100644
index 000000000..eb9120f78
--- /dev/null
+++ b/icicle/src/hash/blake2s/extern.cu
@@ -0,0 +1,30 @@
+#include "utils/utils.h"
+#include "gpu-utils/error_handler.cuh"
+
+#include "hash/hash.cuh"
+
+#include "hash/blake2s/blake2s.cuh"
+#include "blake2s.cu"
+#include "../../merkle-tree/merkle.cu"
+#include "merkle-tree/merkle.cuh"
+
+namespace blake2s {
+  extern "C" cudaError_t blake2s_cuda(
+    BYTE* input, BYTE* output, WORD number_of_blocks, WORD input_block_size, WORD output_block_size, HashConfig& config)
+  {
+    return Blake2s().hash_many(input, output, number_of_blocks, input_block_size, output_block_size, config);
+  }
+
+  extern "C" cudaError_t build_blake2s_merkle_tree_cuda(
+    const BYTE* leaves,
+    BYTE* digests,
+    unsigned int height,
+    WORD input_block_len,
+    const merkle_tree::TreeBuilderConfig& tree_config)
+  {
+    Blake2s blake2s;
+    return merkle_tree::build_merkle_tree<BYTE, BYTE>(
+      leaves, digests, height, input_block_len, blake2s, blake2s, tree_config);
+  }
+
+} // namespace blake2s
\ No newline at end of file
diff --git a/icicle/src/hash/blake2s/test_blake2s.cu b/icicle/src/hash/blake2s/test_blake2s.cu
new file mode 100644
index 000000000..63e4e45ae
--- /dev/null
+++ b/icicle/src/hash/blake2s/test_blake2s.cu
@@ -0,0 +1,111 @@
+#include <chrono>
+#include "gpu-utils/device_context.cuh"
+
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+
+#include "hash/blake2s/blake2s.cuh"
+
+using namespace blake2s;
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+// extern "C" {
+// void cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD outlen, WORD n_batch);
+// }
+
+void print_hash(BYTE* hash, WORD len)
+{
+  printf("%d \n", len);
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
+}
+
+BYTE* read_file(const char* filename, size_t* filesize)
+{
+  FILE* file = fopen(filename, "rb");
+  if (!file) {
+    perror("Failed to open file");
+    exit(EXIT_FAILURE);
+  }
+
+  fseek(file, 0, SEEK_END);
+  *filesize = ftell(file);
+  fseek(file, 0, SEEK_SET);
+
+  BYTE* buffer = (BYTE*)malloc(*filesize);
+  if (!buffer) {
+    perror("Failed to allocate memory");
+    fclose(file);
+    exit(EXIT_FAILURE);
+  }
+
+  size_t bytesRead = fread(buffer, 1, *filesize, file);
+  if (bytesRead != *filesize) {
+    perror("Failed to read file");
+    free(buffer);
+    fclose(file);
+    exit(EXIT_FAILURE);
+  }
+
+  fclose(file);
+  return buffer;
+}
+
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  BYTE* input;
+  size_t inlen;
+  const char* input_filename;
+  const char* default_input = "aaaaaaaaaaa";
+
+  if (argc < 2) {
+    // Use default input if no file is provided
+    input = (BYTE*)default_input;
+    inlen = strlen(default_input);
+  } else {
+    input_filename = argv[1];
+    input = read_file(input_filename, &inlen);
+  }
+
+  // Test parameters
+  BYTE key[32] = ""; // Example key
+  WORD keylen = strlen((char*)key);
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 1;    // Number of hashes to compute in parallel
+
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+  BYTE* output = (BYTE*)malloc(outlen * n_batch);
+  if (!output) {
+    perror("Failed to allocate memory for output");
+    if (argc >= 2) free(input); // Free file buffer if it was allocated
+    return EXIT_FAILURE;
+  }
+
+  printf("Key len: %d \n", keylen);
+
+  // Perform the hashing
+  START_TIMER(blake_timer)
+  cuda_blake2s_hash_batch(key, keylen, input, inlen, output, outlen, n_batch);
+  END_TIMER(blake_timer, "Blake Timer")
+
+  // Print the result
+  printf("BLAKE2S hash:\n");
+  print_hash(output, outlen);
+
+  // Clean up
+  free(output);
+  if (argc >= 2) free(input); // Free file buffer if it was allocated
+  return 0;
+}
diff --git a/icicle/src/hash/blake2s/test_blake2s_batched.cu b/icicle/src/hash/blake2s/test_blake2s_batched.cu
new file mode 100644
index 000000000..2be197075
--- /dev/null
+++ b/icicle/src/hash/blake2s/test_blake2s_batched.cu
@@ -0,0 +1,155 @@
+#include <algorithm>
+#include <chrono>
+#include "gpu-utils/device_context.cuh"
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <sstream>
+#include <unordered_map>
+#include "extern.cu"
+
+using namespace blake2s;
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+void print_hash(BYTE* hash, WORD len)
+{
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
+}
+
+// Function to trim leading and trailing whitespace from a string
+std::string trim(const std::string& str)
+{
+  size_t first = str.find_first_not_of(' ');
+  size_t last = str.find_last_not_of(' ');
+  return str.substr(first, (last - first + 1));
+}
+
+std::unordered_map<std::string, std::string> load_strings_and_hashes_from_csv(const char* filename)
+{
+  std::unordered_map<std::string, std::string> string_hash_map;
+  std::ifstream file(filename);
+  if (!file.is_open()) {
+    perror("Failed to open CSV file");
+    exit(EXIT_FAILURE);
+  }
+
+  std::string line;
+  while (std::getline(file, line)) {
+    std::stringstream ss(line);
+    std::string input_string, hash_value;
+    if (std::getline(ss, input_string, ',') && std::getline(ss, hash_value, ',')) {
+      // Trim any whitespace around the strings
+      input_string = trim(input_string);
+      hash_value = trim(hash_value);
+      string_hash_map[input_string] = hash_value;
+    }
+  }
+
+  file.close();
+  return string_hash_map;
+}
+
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  if (argc < 2) {
+    fprintf(stderr, "Usage: %s <strings and hashes CSV file>\n", argv[0]);
+    return EXIT_FAILURE;
+  }
+
+  const char* csv_filename = argv[1];
+  auto string_hash_map = load_strings_and_hashes_from_csv(csv_filename);
+
+  if (string_hash_map.size() != 10) {
+    fprintf(stderr, "CSV file must contain exactly 10 strings and hashes.\n");
+    return EXIT_FAILURE;
+  }
+
+  // Prepare the test strings and expected hashes from the map
+  std::vector<std::string> test_strings;
+  std::vector<std::string> expected_hashes;
+  for (const auto& pair : string_hash_map) {
+    test_strings.push_back(pair.first);
+    expected_hashes.push_back(pair.second);
+  }
+
+  // Test parameters
+  BYTE key[32] = ""; // Example key
+  WORD keylen = strlen((char*)key);
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 10;   // Number of different inputs to hash in parallel
+  size_t max_len = 10; // Max length of the test strings
+
+  // Calculate total input length and allocate memory for the batched input
+  size_t total_len = 0;
+  for (const auto& str : test_strings) {
+    total_len += str.size();
+  }
+  BYTE* batched_input = (BYTE*)malloc(total_len);
+  WORD* in_lengths = (WORD*)malloc(n_batch * sizeof(WORD));
+
+  // Copy test strings to batched input and store their lengths
+  BYTE* current_position = batched_input;
+  for (int i = 0; i < n_batch; ++i) {
+    memcpy(current_position, test_strings[i].c_str(), test_strings[i].size());
+    current_position += test_strings[i].size();
+  }
+
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+  BYTE* output = (BYTE*)malloc(outlen * n_batch);
+  if (!output) {
+    perror("Failed to allocate memory for output");
+    free(batched_input);
+    free(in_lengths);
+    return EXIT_FAILURE;
+  }
+
+  printf("Key len: %d \n", keylen);
+  HashConfig config = default_hash_config();
+
+  // Perform the hashing
+  START_TIMER(blake_timer)
+  // cuda_blake2s_hash_batch(key, keylen, batched_input, max_len, output, outlen, n_batch);
+  blake2s_cuda(batched_input, output, n_batch, max_len, outlen, config);
+  END_TIMER(blake_timer, "Blake Timer")
+
+  // Print and compare the results
+  printf("BLAKE2S hash (batch size = %d):\n", n_batch);
+  for (WORD i = 0; i < n_batch; i++) {
+    printf("String: %s\n", test_strings[i].c_str());
+    printf("Computed Hash %d: ", i + 1);
+    print_hash(output + i * outlen, outlen);
+    std::cout << "Expected Hash " << i + 1 << ": " << expected_hashes[i] << std::endl;
+
+    std::string computed_hash;
+    for (WORD j = 0; j < outlen; ++j) {
+      char buffer[3];
+      snprintf(buffer, sizeof(buffer), "%02x", output[i * outlen + j]);
+      computed_hash += buffer;
+    }
+
+    if (computed_hash == expected_hashes[i]) {
+      printf(" (Match)\n");
+    } else {
+      printf(" (Mismatch)\n");
+    }
+  }
+
+  // Clean up
+  free(output);
+  free(batched_input);
+  free(in_lengths);
+  return 0;
+}
diff --git a/icicle/src/hash/blake2s/test_blake2s_integ.cu b/icicle/src/hash/blake2s/test_blake2s_integ.cu
new file mode 100644
index 000000000..4959adc94
--- /dev/null
+++ b/icicle/src/hash/blake2s/test_blake2s_integ.cu
@@ -0,0 +1,111 @@
+#include <chrono>
+#include "gpu-utils/device_context.cuh"
+
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+#include "extern.cu"
+
+using namespace blake2s;
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+// extern "C" {
+// void cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD n_outbit, WORD n_batch);
+// }
+
+void print_hash(BYTE* hash, WORD len)
+{
+  printf("Hash Len: %d \n", len);
+  printf("BLAKE2S hash:\n");
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
+}
+
+BYTE* read_file(const char* filename, size_t* filesize)
+{
+  FILE* file = fopen(filename, "rb");
+  if (!file) {
+    perror("Failed to open file");
+    exit(EXIT_FAILURE);
+  }
+
+  fseek(file, 0, SEEK_END);
+  *filesize = ftell(file);
+  fseek(file, 0, SEEK_SET);
+
+  BYTE* buffer = (BYTE*)malloc(*filesize);
+  if (!buffer) {
+    perror("Failed to allocate memory");
+    fclose(file);
+    exit(EXIT_FAILURE);
+  }
+
+  size_t bytesRead = fread(buffer, 1, *filesize, file);
+  if (bytesRead != *filesize) {
+    perror("Failed to read file");
+    free(buffer);
+    fclose(file);
+    exit(EXIT_FAILURE);
+  }
+
+  fclose(file);
+  return buffer;
+}
+
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  BYTE* input;
+  size_t inlen;
+  const char* input_filename;
+  const char* default_input = "aaaaaaaaaaa";
+
+  if (argc < 2) {
+    // Use default input if no file is provided
+    input = (BYTE*)default_input;
+    inlen = strlen(default_input);
+  } else {
+    input_filename = argv[1];
+    input = read_file(input_filename, &inlen);
+  }
+
+  // Test parameters
+  BYTE key[32] = ""; // Example key
+  WORD keylen = strlen((char*)key);
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 1;    // Number of hashes to compute in parallel
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+  BYTE* output = (BYTE*)malloc(outlen * n_batch);
+  if (!output) {
+    perror("Failed to allocate memory for output");
+    if (argc >= 2) free(input); // Free file buffer if it was allocated
+    return EXIT_FAILURE;
+  }
+
+  printf("Key len: %d \n", keylen);
+
+  // Perform the hashing
+  START_TIMER(blake_timer)
+  HashConfig config = default_hash_config();
+
+  blake2s_cuda(input, output, n_batch, inlen, outlen, config);
+  END_TIMER(blake_timer, "Blake Timer")
+
+  // Print the result
+  print_hash(output, outlen);
+
+  // Clean up
+  free(output);
+  if (argc >= 2) free(input); // Free file buffer if it was allocated
+  return 0;
+}
diff --git a/icicle/src/hash/blake2s/test_blake2s_seq.cu b/icicle/src/hash/blake2s/test_blake2s_seq.cu
new file mode 100644
index 000000000..1578d8d59
--- /dev/null
+++ b/icicle/src/hash/blake2s/test_blake2s_seq.cu
@@ -0,0 +1,104 @@
+#include <chrono>
+#include "gpu-utils/device_context.cuh"
+
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+#include "extern.cu"
+
+using namespace blake2s;
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+void print_hash(BYTE* hash, WORD len)
+{
+  printf("Hash Len: %d \n", len);
+  printf("BLAKE2S hash:\n");
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
+}
+
+std::string byte_to_hex(BYTE* data, WORD len)
+{
+  std::stringstream ss;
+  for (WORD i = 0; i < len; i++) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << (int)data[i];
+  }
+  return ss.str();
+}
+
+std::vector<std::string> load_csv(const char* filename)
+{
+  std::vector<std::string> hashes;
+  std::ifstream file(filename);
+  std::string line;
+  while (std::getline(file, line)) {
+    // Directly add the line as a hash, assuming one hash per line
+    hashes.push_back(line);
+  }
+  return hashes;
+}
+
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  std::vector<std::string> test_strings = {"0",      "01",      "012",      "0123",      "01234",
+                                           "012345", "0123456", "01234567", "012345678", "0123456789"};
+
+  const char* csv_filename = "expected_hashes.csv"; // Replace with your actual CSV file name
+  std::vector<std::string> expected_hashes = load_csv(csv_filename);
+  assert(expected_hashes.size() == test_strings.size() && "Number of hashes in CSV must match number of test strings.");
+  std::cout << "Loaded hashes from CSV:" << std::endl;
+  // for (size_t i = 0; i < expected_hashes.size(); ++i) {
+  //     std::cout << "Expected hash " << i  << ": " << expected_hashes[i] << std::endl;
+  // }
+
+  // Test parameters
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 1;    // Number of hashes to compute in parallel
+
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+
+  // Perform the hashing
+  HashConfig config = default_hash_config();
+
+  for (size_t i = 0; i < test_strings.size(); i++) {
+    BYTE* output = (BYTE*)malloc(outlen * n_batch);
+    if (!output) {
+      perror("Failed to allocate memory for output");
+      return EXIT_FAILURE;
+    }
+
+    const std::string& input_str = test_strings[i];
+    BYTE* input = (BYTE*)input_str.c_str();
+    size_t inlen = input_str.size();
+
+    // Perform the hashing
+    START_TIMER(blake_timer)
+    blake2s_cuda(input, output, n_batch, inlen, outlen, config);
+    END_TIMER(blake_timer, "Blake Timer")
+    // Convert the output to hex string
+    std::string computed_hash = byte_to_hex(output, outlen);
+    // Compare with the expected hash
+
+    if (computed_hash == expected_hashes[i]) {
+      std::cout << "Test " << i << " passed." << std::endl;
+    } else {
+      std::cout << "Test " << i << " failed." << std::endl;
+      std::cout << "Expected: " << expected_hashes[i] << std::endl;
+      std::cout << "Got:      " << computed_hash << std::endl;
+    }
+    free(output);
+  }
+
+  return 0;
+}
diff --git a/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu b/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu
new file mode 100644
index 000000000..7cfdf2537
--- /dev/null
+++ b/icicle/src/hash/blake2s/test_blake2s_seq_sa.cu
@@ -0,0 +1,113 @@
+#include <chrono>
+#include "gpu-utils/device_context.cuh"
+
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+#include "extern.cu"
+#include "hash/blake2s/blake2s.cuh"
+
+using namespace blake2s;
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f us\n", msg, FpMicroseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+// extern "C" {
+// void cuda_blake2s_hash_batch(BYTE* key, WORD keylen, BYTE* in, WORD inlen, BYTE* out, WORD n_outbit, WORD n_batch);
+// }
+
+void print_hash(BYTE* hash, WORD len)
+{
+  printf("Hash Len: %d \n", len);
+  printf("BLAKE2S hash:\n");
+  for (WORD i = 0; i < len; i++) {
+    printf("%02x", hash[i]);
+  }
+  printf("\n");
+}
+
+std::string byte_to_hex(BYTE* data, WORD len)
+{
+  std::stringstream ss;
+  for (WORD i = 0; i < len; i++) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << (int)data[i];
+  }
+  return ss.str();
+}
+
+std::vector<std::string> load_csv(const char* filename)
+{
+  std::vector<std::string> hashes;
+  std::ifstream file(filename);
+  std::string line;
+  while (std::getline(file, line)) {
+    // Directly add the line as a hash, assuming one hash per line
+    hashes.push_back(line);
+  }
+  return hashes;
+}
+
+int main(int argc, char** argv)
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  std::vector<std::string> test_strings = {"0",      "01",      "012",      "0123",      "01234",
+                                           "012345", "0123456", "01234567", "012345678", "0123456789"};
+
+  const char* csv_filename = "expected_hashes.csv"; // Replace with your actual CSV file name
+  std::vector<std::string> expected_hashes = load_csv(csv_filename);
+  assert(expected_hashes.size() == test_strings.size() && "Number of hashes in CSV must match number of test strings.");
+  std::cout << "Loaded hashes from CSV:" << std::endl;
+  // for (size_t i = 0; i < expected_hashes.size(); ++i) {
+  //     std::cout << "Expected hash " << i  << ": " << expected_hashes[i] << std::endl;
+  // }
+
+  // Test parameters
+  WORD n_outbit = 256; // Output length in bits
+  WORD n_batch = 1;    // Number of hashes to compute in parallel
+
+  // Test parameters
+  BYTE key[32] = ""; // Example key
+  WORD keylen = strlen((char*)key);
+
+  // Allocate memory for the output
+  WORD outlen = n_outbit / 8;
+
+  // Perform the hashing
+  HashConfig config = default_hash_config();
+
+  for (size_t i = 0; i < test_strings.size(); i++) {
+    BYTE* output = (BYTE*)malloc(outlen * n_batch);
+    if (!output) {
+      perror("Failed to allocate memory for output");
+      return EXIT_FAILURE;
+    }
+
+    const std::string& input_str = test_strings[i];
+    BYTE* input = (BYTE*)input_str.c_str();
+    size_t inlen = input_str.size();
+
+    // Perform the hashing
+    START_TIMER(blake_timer)
+    cuda_blake2s_hash_batch(key, keylen, input, inlen, output, outlen, n_batch);
+    END_TIMER(blake_timer, "Blake Timer")
+    // Convert the output to hex string
+    std::string computed_hash = byte_to_hex(output, outlen);
+    // Compare with the expected hash
+
+    if (computed_hash == expected_hashes[i]) {
+      std::cout << "Test " << i << " passed." << std::endl;
+    } else {
+      std::cout << "Test " << i << " failed." << std::endl;
+      std::cout << "Expected: " << expected_hashes[i] << std::endl;
+      std::cout << "Got:      " << computed_hash << std::endl;
+    }
+    free(output);
+  }
+
+  return 0;
+}
diff --git a/icicle/src/hash/blake2s/test_tree.cu b/icicle/src/hash/blake2s/test_tree.cu
new file mode 100644
index 000000000..85145ccf0
--- /dev/null
+++ b/icicle/src/hash/blake2s/test_tree.cu
@@ -0,0 +1,95 @@
+#include "gpu-utils/device_context.cuh"
+#include "merkle-tree/merkle.cuh"
+#include "extern.cu"
+
+#ifndef __CUDA_ARCH__
+#include <cassert>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include <iomanip>
+
+using namespace blake2s;
+
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg)                                                                                          \
+  printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+void uint8_to_hex_string(const uint8_t* values, int size)
+{
+  std::stringstream ss;
+
+  for (int i = 0; i < size; ++i) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << (int)values[i];
+  }
+
+  std::string hexString = ss.str();
+  std::cout << hexString << std::endl;
+}
+
+#define A 2
+
+int main(int argc, char* argv[])
+{
+  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+  using FpMicroseconds = std::chrono::duration<float, std::chrono::microseconds::period>;
+
+  /// Tree of height N and arity A contains \sum{A^i} for i in 0..N-1 elements
+  uint32_t input_block_len = 64;
+  uint32_t tree_height = argc > 1 ? atoi(argv[1]) : 1;
+  uint32_t number_of_leaves = pow(A, tree_height);
+  uint32_t total_number_of_leaves = number_of_leaves * input_block_len;
+
+  /// Use keep_rows to specify how many rows do you want to store
+  int keep_rows = argc > 2 ? atoi(argv[2]) : 2;
+  size_t digests_len = merkle_tree::get_digests_len(keep_rows - 1, A, 1);
+
+  /// Fill leaves with scalars [0, 1, ... 2^tree_height - 1]
+  START_TIMER(timer_allocation);
+  uint8_t input = 0;
+  uint8_t* leaves = static_cast<uint8_t*>(malloc(total_number_of_leaves));
+  for (uint64_t i = 0; i < total_number_of_leaves; i++) {
+    leaves[i] = (uint8_t)i;
+  }
+  END_TIMER(timer_allocation, "Allocated memory for leaves: ");
+
+  /// Allocate memory for digests of {keep_rows} rows of a tree
+  START_TIMER(timer_digests);
+  size_t digests_mem = digests_len * sizeof(BYTE) * 64;
+  BYTE* digests = static_cast<BYTE*>(malloc(digests_mem));
+  END_TIMER(timer_digests, "Allocated memory for digests");
+
+  std::cout << "Memory for leaves = " << total_number_of_leaves / 1024 / 1024 << " MB; "
+            << total_number_of_leaves / 1024 / 1024 / 1024 << " GB" << std::endl;
+  std::cout << "Number of leaves = " << number_of_leaves << std::endl;
+  std::cout << "Total Number of leaves = " << total_number_of_leaves << std::endl;
+  std::cout << "Memory for digests = " << digests_mem / 1024 / 1024 << " MB; " << digests_mem / 1024 / 1024 / 1024
+            << " GB" << std::endl;
+  std::cout << "Number of digest elements = " << digests_len << std::endl;
+
+  std::cout << "Total RAM consumption = " << (digests_mem + total_number_of_leaves) / 1024 / 1024 << " MB; "
+            << (digests_mem + total_number_of_leaves) / 1024 / 1024 / 1024 << " GB" << std::endl;
+
+  merkle_tree::TreeBuilderConfig config = merkle_tree::default_merkle_config();
+  config.arity = A;
+  config.keep_rows = keep_rows;
+  config.digest_elements = 32;
+  START_TIMER(blake2s_timer);
+  build_blake2s_merkle_tree_cuda(leaves, digests, tree_height, input_block_len, config);
+  END_TIMER(blake2s_timer, "blake2s")
+
+  for (int i = 0; i < digests_len * 32; i++) {
+    WORD root = digests[i];
+
+    // Print the current element in hexadecimal format
+    printf("%02x", root);
+
+    // After every 32 elements, print a newline to start a new row
+    if ((i + 1) % 32 == 0) { printf("\n"); }
+  }
+
+  free(digests);
+  free(leaves);
+}
+
+#endif
\ No newline at end of file
diff --git a/icicle/src/merkle-tree/merkle.cu b/icicle/src/merkle-tree/merkle.cu
index 2fe171634..ce96daf09 100644
--- a/icicle/src/merkle-tree/merkle.cu
+++ b/icicle/src/merkle-tree/merkle.cu
@@ -163,11 +163,6 @@ namespace merkle_tree {
     CHK_INIT_IF_RETURN();
     cudaStream_t& stream = tree_config.ctx.stream;
 
-    if (compression.preimage_max_length < tree_config.arity * tree_config.digest_elements)
-      THROW_ICICLE_ERR(
-        IcicleError_t::InvalidArgument,
-        "Hash max preimage length does not match merkle tree arity multiplied by digest elements");
-
     uint64_t number_of_bottom_layer_states = pow(tree_config.arity, height);
 
     // This will determine how much splitting do we need to do
diff --git a/wrappers/rust/icicle-hash/src/blake2s/mod.rs b/wrappers/rust/icicle-hash/src/blake2s/mod.rs
new file mode 100644
index 000000000..6d5de4dc1
--- /dev/null
+++ b/wrappers/rust/icicle-hash/src/blake2s/mod.rs
@@ -0,0 +1,71 @@
+use icicle_core::hash::HashConfig;
+use icicle_core::tree::TreeBuilderConfig;
+use icicle_cuda_runtime::error::CudaError;
+use icicle_cuda_runtime::memory::HostOrDeviceSlice;
+
+use icicle_core::error::IcicleResult;
+use icicle_core::traits::IcicleResultWrap;
+
+pub mod tests;
+
+extern "C" {
+    pub(crate) fn blake2s_cuda(
+        input: *const u8,
+        output: *mut u8,
+        number_of_blocks: u32,
+        input_block_size: u32,
+        output_block_size: u32,
+        config: &HashConfig,
+    ) -> CudaError;
+
+    pub(crate) fn build_blake2s_merkle_tree_cuda(
+        leaves: *const u8,
+        digests: *mut u64,
+        height: u32,
+        input_block_len: u32,
+        tree_config: &TreeBuilderConfig,
+    ) -> CudaError;
+}
+
+pub fn blake2s(
+    input: &(impl HostOrDeviceSlice<u8> + ?Sized),
+    input_block_size: u32,
+    number_of_blocks: u32,
+    output_block_size: u32,
+    output: &mut (impl HostOrDeviceSlice<u8> + ?Sized),
+    config: &HashConfig,
+) -> IcicleResult<()> {
+    let mut local_cfg = config.clone();
+    local_cfg.are_inputs_on_device = input.is_on_device();
+    local_cfg.are_outputs_on_device = output.is_on_device();
+    unsafe {
+        blake2s_cuda(
+            input.as_ptr(),
+            output.as_mut_ptr(),
+            number_of_blocks,
+            input_block_size,
+            output_block_size,
+            &local_cfg,
+        )
+        .wrap()
+    }
+}
+
+pub fn build_blake2s_merkle_tree(
+    leaves: &(impl HostOrDeviceSlice<u8> + ?Sized),
+    digests: &mut (impl HostOrDeviceSlice<u64> + ?Sized),
+    height: usize,
+    input_block_len: usize,
+    config: &TreeBuilderConfig,
+) -> IcicleResult<()> {
+    unsafe {
+        build_blake2s_merkle_tree_cuda(
+            leaves.as_ptr(),
+            digests.as_mut_ptr(),
+            height as u32,
+            input_block_len as u32,
+            config,
+        )
+        .wrap()
+    }
+}
diff --git a/wrappers/rust/icicle-hash/src/blake2s/tests.rs b/wrappers/rust/icicle-hash/src/blake2s/tests.rs
new file mode 100644
index 000000000..2eb929f44
--- /dev/null
+++ b/wrappers/rust/icicle-hash/src/blake2s/tests.rs
@@ -0,0 +1,54 @@
+#[cfg(test)]
+pub(crate) mod tests {
+    use icicle_core::{
+        hash::HashConfig,
+        tree::{merkle_tree_digests_len, TreeBuilderConfig},
+    };
+    use icicle_cuda_runtime::memory::HostSlice;
+
+    use crate::blake2s::{blake2s, build_blake2s_merkle_tree};
+
+    #[test]
+    fn single_hash_test() {
+        let config = HashConfig::default();
+
+        let preimages = b"a";
+        let mut digests = vec![0u8; 1 * 32];
+
+        let preimages_slice = HostSlice::from_slice(preimages);
+        let digests_slice = HostSlice::from_mut_slice(&mut digests);
+
+        blake2s(
+            preimages_slice,
+            1 as u32,
+            1 as u32,
+            32 as u32,
+            digests_slice,
+            &config,
+        )
+        .unwrap();
+
+        let hex_string: String = digests_slice.iter().map(|byte| format!("{:02x}", byte)).collect();
+
+        assert_eq!(
+            hex_string,
+            "4a0d129873403037c2cd9b9048203687f6233fb6738956e0349bd4320fec3e90"
+        );
+    }
+
+    #[test]
+    fn blake2s_merkle_tree_test() {
+        let mut config = TreeBuilderConfig::default();
+        config.arity = 2;
+        let height = 22;
+        let input_block_len = 136;
+        let leaves = vec![1u8; (1 << height) * input_block_len];
+        let mut digests = vec![0u64; merkle_tree_digests_len((height + 1) as u32, 2, 1)];
+
+        let leaves_slice = HostSlice::from_slice(&leaves);
+        let digests_slice = HostSlice::from_mut_slice(&mut digests);
+
+        build_blake2s_merkle_tree(leaves_slice, digests_slice, height, input_block_len, &config).unwrap();
+        println!("Root: {:?}", digests_slice[0]);
+    }
+}
diff --git a/wrappers/rust/icicle-hash/src/lib.rs b/wrappers/rust/icicle-hash/src/lib.rs
index ebcb6d4dd..cf61bbd4b 100644
--- a/wrappers/rust/icicle-hash/src/lib.rs
+++ b/wrappers/rust/icicle-hash/src/lib.rs
@@ -1 +1,2 @@
+pub mod blake2s;
 pub mod keccak;