diff --git a/benchmark/cublas/CMakeLists.txt b/benchmark/cublas/CMakeLists.txt
index 250278fac..ad3b4ed05 100644
--- a/benchmark/cublas/CMakeLists.txt
+++ b/benchmark/cublas/CMakeLists.txt
@@ -74,12 +74,20 @@ set(sources
   extension/omatadd.cpp
 ) 
 
+# Operators that support benchmarking with COMPLEX data types
+set(CPLX_OPS "gemm" "gemm_batched" "gemm_batched_strided")
+
 # Add individual benchmarks for each method
 foreach(cublas_bench ${sources})
   get_filename_component(bench_cublas_exec ${cublas_bench} NAME_WE)
   add_executable(bench_cublas_${bench_cublas_exec} ${cublas_bench} main.cpp)
   target_link_libraries(bench_cublas_${bench_cublas_exec} PRIVATE benchmark CUDA::toolkit CUDA::cublas CUDA::cudart portblas Clara::Clara bench_info)
   target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE})
+  if(${BLAS_ENABLE_COMPLEX})
+    if("${bench_cublas_exec}" IN_LIST CPLX_OPS)
+      target_compile_definitions(bench_cublas_${bench_cublas_exec} PRIVATE BLAS_ENABLE_COMPLEX=1)
+    endif()
+  endif()
   add_sycl_to_target(
     TARGET bench_cublas_${bench_cublas_exec}
     SOURCES ${cublas_bench}
diff --git a/benchmark/cublas/blas3/gemm.cpp b/benchmark/cublas/blas3/gemm.cpp
index 5a103d032..c74c9e98e 100644
--- a/benchmark/cublas/blas3/gemm.cpp
+++ b/benchmark/cublas/blas3/gemm.cpp
@@ -38,6 +38,18 @@ static inline void cublas_routine(args_t&&... args) {
   return;
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
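+// Dispatches to cublasCgemm (float) or cublasZgemm (double).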
+template <typename scalar_t, typename... args_t>
+static inline void cublas_cplx_routine(args_t&&... args) {
+  if constexpr (std::is_same_v<scalar_t, float>) {
+    CUBLAS_CHECK(cublasCgemm(std::forward<args_t>(args)...));
+  } else if constexpr (std::is_same_v<scalar_t, double>) {
+    CUBLAS_CHECK(cublasZgemm(std::forward<args_t>(args)...));
+  }
+  return;
+}
+#endif
+
 template <typename scalar_t>
 void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
          int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta,
@@ -168,6 +180,162 @@ void register_benchmark(blas_benchmark::Args& args,
   }
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
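+// Maps the real scalar type to the matching cuBLAS complex type
+// (float -> cuComplex, double -> cuDoubleComplex).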
+template <typename scalar_t>
+using cudaComplex = typename std::conditional<sizeof(scalar_t) == 8,
+                                              cuDoubleComplex, cuComplex>::type;
+
+template <typename scalar_t>
+void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
+         int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
+         std::complex<scalar_t> beta, bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);
+
+  // Standard test setup.
+  std::string t1s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t1));
+  std::string t2s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t2));
+  const char* t_a = t1s.c_str();
+  const char* t_b = t2s.c_str();
+
+  index_t lda = t_a[0] == 'n' ? m : k;
+  index_t ldb = t_b[0] == 'n' ? k : n;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm, scalar_t>(state, beta, m, n, k,
+                                                       static_cast<index_t>(1));
+
+  cublasHandle_t& cuda_handle = *cuda_handle_ptr;
+
+  // Matrices
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(m * k);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(k * n);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(m * n, 0);
+
+  blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> a_gpu(
+      m * k, reinterpret_cast<cudaComplex<scalar_t>*>(a.data()));
+  blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> b_gpu(
+      k * n, reinterpret_cast<cudaComplex<scalar_t>*>(b.data()));
+  blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> c_gpu(
+      n * m, reinterpret_cast<cudaComplex<scalar_t>*>(c.data()));
+
+  cublasOperation_t c_t_a = (*t_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t c_t_b = (*t_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  cudaComplex<scalar_t> cuBeta{beta.real(), beta.imag()};
+  cudaComplex<scalar_t> cuAlpha{alpha.real(), alpha.imag()};
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  // Run a first time with a verification of the results
+  std::vector<std::complex<scalar_t>> c_ref = c;
+
+  reference_blas::cgemm<scalar_t>(t_a, t_b, m, n, k,
+                                  reinterpret_cast<const void*>(&alpha),
+                                  reinterpret_cast<const void*>(a.data()), lda,
+                                  reinterpret_cast<const void*>(b.data()), ldb,
+                                  reinterpret_cast<const void*>(&beta),
+                                  reinterpret_cast<void*>(c_ref.data()), ldc);
+  std::vector<std::complex<scalar_t>> c_temp = c;
+  {
+    blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>, true> c_temp_gpu(
+        m * n, reinterpret_cast<cudaComplex<scalar_t>*>(c_temp.data()));
+    cublas_cplx_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
+                                  a_gpu, lda, b_gpu, ldb, &cuBeta, c_temp_gpu,
+                                  ldc);
+  }
+
+  std::ostringstream err_stream;
+  if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) {
+    const std::string& err_str = err_stream.str();
+    state.SkipWithError(err_str.c_str());
+    *success = false;
+  };
+#endif
+  auto blas_warmup = [&]() -> void {
+    cublas_cplx_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
+                                  a_gpu, lda, b_gpu, ldb, &cuBeta, c_gpu, ldc);
+    return;
+  };
+
+  cudaEvent_t start;
+  cudaEvent_t stop;
+  CUDA_CHECK(cudaEventCreate(&start));
+  CUDA_CHECK(cudaEventCreate(&stop));
+
+  auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
+    CUDA_CHECK(cudaEventRecord(start));
+    cublas_cplx_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
+                                  a_gpu, lda, b_gpu, ldb, &cuBeta, c_gpu, ldc);
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    return std::vector{start, stop};
+  };
+
+  // Warmup
+  blas_benchmark::utils::warmup(blas_warmup);
+  CUDA_CHECK(cudaStreamSynchronize(NULL));
+
+  blas_benchmark::utils::init_counters(state);
+
+  // Measure
+  for (auto _ : state) {
+    // Run
+    std::tuple<double, double> times =
+        blas_benchmark::utils::timef_cuda(blas_method_def);
+
+    // Report
+    blas_benchmark::utils::update_counters(state, times);
+  }
+
+  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+  state.SetBytesProcessed(state.iterations() *
+                          state.counters["bytes_processed"]);
+
+  blas_benchmark::utils::calc_avg_counters(state);
+
+  CUDA_CHECK(cudaEventDestroy(start));
+  CUDA_CHECK(cudaEventDestroy(stop));
+};
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             cublasHandle_t* cuda_handle_ptr, bool* success) {
+  auto gemm_params =
+      blas_benchmark::utils::get_blas3_cplx_params<scalar_t>(args);
+  for (auto p : gemm_params) {
+    std::string t1s, t2s;
+    index_t m, n, k;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+
+    std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p;
+    int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
+    int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+
+    auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr,
+                         int t1, int t2, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, bool* success) {
+      run<scalar_t>(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t1s, t2s, m, k, n, blas_benchmark::utils::MEM_TYPE_USM)
+            .c_str(),
+        BM_lambda, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, success)
+        ->UseRealTime();
+  }
+}
+
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args,
                       cublasHandle_t* cuda_handle_ptr, bool* success) {
diff --git a/benchmark/cublas/blas3/gemm_batched.cpp b/benchmark/cublas/blas3/gemm_batched.cpp
index 4cce28ff5..c0c50631f 100644
--- a/benchmark/cublas/blas3/gemm_batched.cpp
+++ b/benchmark/cublas/blas3/gemm_batched.cpp
@@ -38,6 +38,18 @@ static inline void cublas_routine(args_t&&... args) {
   return;
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
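+// Dispatches to cublasCgemmBatched (float) or cublasZgemmBatched (double).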
+template <typename scalar_t, typename... args_t>
+static inline void cublas_cplx_routine(args_t&&... args) {
+  if constexpr (std::is_same_v<scalar_t, float>) {
+    CUBLAS_CHECK(cublasCgemmBatched(std::forward<args_t>(args)...));
+  } else if constexpr (std::is_same_v<scalar_t, double>) {
+    CUBLAS_CHECK(cublasZgemmBatched(std::forward<args_t>(args)...));
+  }
+  return;
+}
+#endif
+
 template <typename scalar_t>
 void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
          index_t t2, index_t m, index_t k, index_t n, scalar_t alpha,
@@ -164,7 +176,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
                           state.counters["bytes_processed"]);
 
   blas_benchmark::utils::calc_avg_counters(state);
-  
+
   CUDA_CHECK(cudaEventDestroy(start));
   CUDA_CHECK(cudaEventDestroy(stop));
 };
@@ -209,6 +221,197 @@ void register_benchmark(blas_benchmark::Args& args,
   }
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t>
+using cudaComplex = typename std::conditional<sizeof(scalar_t) == 8,
+                                              cuDoubleComplex, cuComplex>::type;
+template <typename scalar_t>
+void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, index_t t1,
+         index_t t2, index_t m, index_t k, index_t n,
+         std::complex<scalar_t> alpha, std::complex<scalar_t> beta,
+         index_t batch_count, int batch_type_i, bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);
+
+  // Standard setup
+  std::string t1s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t1));
+  std::string t2s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t2));
+  const char* t_a = t1s.c_str();
+  const char* t_b = t2s.c_str();
+  auto batch_type = static_cast<blas::gemm_batch_type_t>(batch_type_i);
+
+  index_t lda = t_a[0] == 'n' ? m : k;
+  index_t ldb = t_b[0] == 'n' ? k : n;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm_batched, scalar_t>(
+      state, beta, m, n, k, batch_count);
+
+  cublasHandle_t& cuda_handle = *cuda_handle_ptr;
+
+  const index_t size_a = m * k;
+  const index_t size_b = k * n;
+  const index_t size_c = m * n;
+
+  // Matrices
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(size_a * batch_count);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(size_b * batch_count);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(size_c * batch_count, 0);
+
+  blas_benchmark::utils::CUDAVectorBatched<cudaComplex<scalar_t>> d_A_array(
+      size_a, batch_count, reinterpret_cast<cudaComplex<scalar_t>*>(a.data()));
+  blas_benchmark::utils::CUDAVectorBatched<cudaComplex<scalar_t>> d_B_array(
+      size_b, batch_count, reinterpret_cast<cudaComplex<scalar_t>*>(b.data()));
+  blas_benchmark::utils::CUDAVectorBatched<cudaComplex<scalar_t>> d_C_array(
+      size_c, batch_count);
+
+  cublasOperation_t c_t_a = (*t_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t c_t_b = (*t_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  cudaComplex<scalar_t> cuBeta{beta.real(), beta.imag()};
+  cudaComplex<scalar_t> cuAlpha{alpha.real(), alpha.imag()};
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  // Run a first time with a verification of the results
+  {
+    std::vector<std::complex<scalar_t>> c_ref = c;
+    auto _base = [=](index_t dim0, index_t dim1, index_t idx) {
+      return dim0 * dim1 * idx;
+    };
+    for (int batch_idx = 0; batch_idx < batch_count; batch_idx++) {
+      reference_blas::cgemm<scalar_t>(
+          t_a, t_b, m, n, k, reinterpret_cast<const void*>(&alpha),
+          reinterpret_cast<const void*>(a.data() + _base(m, k, batch_idx)), lda,
+          reinterpret_cast<const void*>(b.data() + _base(k, n, batch_idx)), ldb,
+          reinterpret_cast<const void*>(&beta),
+          reinterpret_cast<void*>(c_ref.data() + _base(m, n, batch_idx)), ldc);
+    }
+
+    std::vector<std::complex<scalar_t>> c_temp(size_c * batch_count);
+
+    {
+      blas_benchmark::utils::CUDAVectorBatched<cudaComplex<scalar_t>, true>
+          c_temp_gpu(n * m, batch_count,
+                     reinterpret_cast<cudaComplex<scalar_t>*>(c_temp.data()));
+      cublas_cplx_routine<scalar_t>(
+          cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
+          d_A_array.get_batch_array(), lda, d_B_array.get_batch_array(), ldb,
+          &cuBeta, c_temp_gpu.get_batch_array(), ldc, batch_count);
+    }
+
+    std::ostringstream err_stream;
+    if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) {
+      const std::string& err_str = err_stream.str();
+      state.SkipWithError(err_str.c_str());
+      *success = false;
+    }
+
+  }  // close scope for verify benchmark
+#endif
+
+  auto blas_warmup = [&]() -> void {
+    cublas_cplx_routine<scalar_t>(
+        cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
+        d_A_array.get_batch_array(), lda, d_B_array.get_batch_array(), ldb,
+        &cuBeta, d_C_array.get_batch_array(), ldc, batch_count);
+    return;
+  };
+
+  cudaEvent_t start, stop;
+  CUDA_CHECK(cudaEventCreate(&start));
+  CUDA_CHECK(cudaEventCreate(&stop));
+
+  auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
+    CUDA_CHECK(cudaEventRecord(start));
+    cublas_cplx_routine<scalar_t>(
+        cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
+        d_A_array.get_batch_array(), lda, d_B_array.get_batch_array(), ldb,
+        &cuBeta, d_C_array.get_batch_array(), ldc, batch_count);
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    return std::vector{start, stop};
+  };
+
+  // Warmup
+  blas_benchmark::utils::warmup(blas_warmup);
+  CUDA_CHECK(cudaStreamSynchronize(NULL));
+
+  blas_benchmark::utils::init_counters(state);
+
+  // Measure
+  for (auto _ : state) {
+    // Run
+    std::tuple<double, double> times =
+        blas_benchmark::utils::timef_cuda(blas_method_def);
+
+    // Report
+    blas_benchmark::utils::update_counters(state, times);
+  }
+
+  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+  state.SetBytesProcessed(state.iterations() *
+                          state.counters["bytes_processed"]);
+
+  blas_benchmark::utils::calc_avg_counters(state);
+
+  CUDA_CHECK(cudaEventDestroy(start));
+  CUDA_CHECK(cudaEventDestroy(stop));
+};
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             cublasHandle_t* cuda_handle_ptr, bool* success) {
+  auto gemm_batched_params =
+      blas_benchmark::utils::get_gemm_cplx_batched_params<scalar_t>(args);
+
+  for (auto p : gemm_batched_params) {
+    std::string t1s, t2s;
+    index_t m, n, k, batch_count;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+    int batch_type;
+
+    std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_count,
+             batch_type) = p;
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+
+    if (batch_type == 1) {
+      std::cerr << "Interleaved memory for the gemm_batched operator is not "
+                   "supported by cuBLAS\n";
+      continue;
+    }
+
+    int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
+    int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
+
+    auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr,
+                         int t1, int t2, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, index_t batch_count,
+                         int batch_type, bool* success) {
+      run<scalar_t>(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta,
+                    batch_count, batch_type, success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t1s, t2s, m, k, n, batch_count, batch_type,
+            blas_benchmark::utils::MEM_TYPE_USM)
+            .c_str(),
+        BM_lambda, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_count,
+        batch_type, success)
+        ->UseRealTime();
+  }
+}
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args,
                       cublasHandle_t* cuda_handle_ptr, bool* success) {
diff --git a/benchmark/cublas/blas3/gemm_batched_strided.cpp b/benchmark/cublas/blas3/gemm_batched_strided.cpp
index d96b7adfe..beb81fb4c 100644
--- a/benchmark/cublas/blas3/gemm_batched_strided.cpp
+++ b/benchmark/cublas/blas3/gemm_batched_strided.cpp
@@ -38,6 +38,18 @@ static inline void cublas_routine(args_t&&... args) {
   return;
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
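+// Dispatches to cublasCgemmStridedBatched (float) or
+// cublasZgemmStridedBatched (double).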
+template <typename scalar_t, typename... args_t>
+static inline void cublas_cplx_routine(args_t&&... args) {
+  if constexpr (std::is_same_v<scalar_t, float>) {
+    CUBLAS_CHECK(cublasCgemmStridedBatched(std::forward<args_t>(args)...));
+  } else if constexpr (std::is_same_v<scalar_t, double>) {
+    CUBLAS_CHECK(cublasZgemmStridedBatched(std::forward<args_t>(args)...));
+  }
+  return;
+}
+#endif
+
 template <typename scalar_t>
 void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
          int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta,
@@ -208,6 +220,194 @@ void register_benchmark(blas_benchmark::Args& args,
   }
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t>
+using cudaComplex = typename std::conditional<sizeof(scalar_t) == 8,
+                                              cuDoubleComplex, cuComplex>::type;
+
+template <typename scalar_t>
+void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int t1,
+         int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
+         std::complex<scalar_t> beta, index_t batch_size, index_t stride_a_mul,
+         index_t stride_b_mul, index_t stride_c_mul, bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);
+
+  // Standard test setup.
+  std::string t1s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t1));
+  std::string t2s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t2));
+  const char* t_a = t1s.c_str();
+  const char* t_b = t2s.c_str();
+
+  const bool trA = t_a[0] != 'n';
+  const bool trB = t_b[0] != 'n';
+
+  index_t lda = trA ? k : m;
+  index_t ldb = trB ? n : k;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm_batched_strided, scalar_t>(
+      state, beta, m, n, k, batch_size, stride_a_mul, stride_b_mul,
+      stride_c_mul);
+
+  cublasHandle_t& cuda_handle = *cuda_handle_ptr;
+
+  // Data sizes
+  // Elementary matrices
+  const index_t a_size = m * k;
+  const index_t b_size = k * n;
+  const index_t c_size = m * n;
+  // Strides
+  const index_t stride_a = stride_a_mul * a_size;
+  const index_t stride_b = stride_b_mul * b_size;
+  const index_t stride_c = stride_c_mul * c_size;
+  // Batched matrices
+  const int size_a_batch = a_size + (batch_size - 1) * stride_a;
+  const int size_b_batch = b_size + (batch_size - 1) * stride_b;
+  const int size_c_batch = c_size + (batch_size - 1) * stride_c;
+
+  // Matrices (total sizes account for the stride multipliers above:
+  // <matrix size> + (batch_size - 1) * stride for each operand)
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(size_a_batch);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(size_b_batch);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(size_c_batch, 0);
+
+  blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> a_gpu(
+      size_a_batch, reinterpret_cast<cudaComplex<scalar_t>*>(a.data()));
+  blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> b_gpu(
+      size_b_batch, reinterpret_cast<cudaComplex<scalar_t>*>(b.data()));
+  blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>> c_gpu(
+      size_c_batch, reinterpret_cast<cudaComplex<scalar_t>*>(c.data()));
+
+  cublasOperation_t c_t_a = trA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t c_t_b = trB ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+  cudaComplex<scalar_t> cuBeta{beta.real(), beta.imag()};
+  cudaComplex<scalar_t> cuAlpha{alpha.real(), alpha.imag()};
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  // Run a first time with a verification of the results
+  std::vector<std::complex<scalar_t>> c_ref = c;
+  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
+    reference_blas::cgemm<scalar_t>(
+        t_a, t_b, m, n, k, reinterpret_cast<const void*>(&alpha),
+        reinterpret_cast<const void*>(a.data() + batch_idx * stride_a), lda,
+        reinterpret_cast<const void*>(b.data() + batch_idx * stride_b), ldb,
+        reinterpret_cast<const void*>(&beta),
+        reinterpret_cast<void*>(c_ref.data() + batch_idx * stride_c), ldc);
+  }
+
+  std::vector<std::complex<scalar_t>> c_temp = c;
+  {
+    blas_benchmark::utils::CUDAVector<cudaComplex<scalar_t>, true> c_temp_gpu(
+        size_c_batch, reinterpret_cast<cudaComplex<scalar_t>*>(c_temp.data()));
+    cublas_cplx_routine<scalar_t>(
+        cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha, a_gpu, lda, stride_a,
+        b_gpu, ldb, stride_b, &cuBeta, c_temp_gpu, ldc, stride_c, batch_size);
+  }
+
+  std::ostringstream err_stream;
+  if (!utils::compare_vectors_strided(c_temp, c_ref, stride_c, c_size,
+                                      err_stream, "")) {
+    const std::string& err_str = err_stream.str();
+    state.SkipWithError(err_str.c_str());
+    *success = false;
+  };
+#endif
+
+  auto blas_warmup = [&]() -> void {
+    cublas_cplx_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
+                                  a_gpu, lda, stride_a, b_gpu, ldb, stride_b,
+                                  &cuBeta, c_gpu, ldc, stride_c, batch_size);
+    return;
+  };
+
+  cudaEvent_t start, stop;
+  CUDA_CHECK(cudaEventCreate(&start));
+  CUDA_CHECK(cudaEventCreate(&stop));
+
+  auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
+    CUDA_CHECK(cudaEventRecord(start));
+    cublas_cplx_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, k, &cuAlpha,
+                                  a_gpu, lda, stride_a, b_gpu, ldb, stride_b,
+                                  &cuBeta, c_gpu, ldc, stride_c, batch_size);
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    return std::vector{start, stop};
+  };
+
+  // Warmup
+  blas_benchmark::utils::warmup(blas_warmup);
+  CUDA_CHECK(cudaStreamSynchronize(NULL));
+
+  blas_benchmark::utils::init_counters(state);
+
+  // Measure
+  for (auto _ : state) {
+    // Run
+    std::tuple<double, double> times =
+        blas_benchmark::utils::timef_cuda(blas_method_def);
+
+    // Report
+    blas_benchmark::utils::update_counters(state, times);
+  }
+
+  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+  state.SetBytesProcessed(state.iterations() *
+                          state.counters["bytes_processed"]);
+
+  blas_benchmark::utils::calc_avg_counters(state);
+
+  CUDA_CHECK(cudaEventDestroy(start));
+  CUDA_CHECK(cudaEventDestroy(stop));
+};
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             cublasHandle_t* cuda_handle_ptr, bool* success) {
+  auto gemm_batched_strided_params =
+      blas_benchmark::utils::get_gemm_batched_strided_cplx_params<scalar_t>(
+          args);
+
+  for (auto p : gemm_batched_strided_params) {
+    std::string t1s, t2s;
+    index_t m, n, k, batch_size, stride_a_mul, stride_b_mul, stride_c_mul;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+    std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_size,
+             stride_a_mul, stride_b_mul, stride_c_mul) = p;
+    int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
+    int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+
+    auto BM_lambda = [&](benchmark::State& st, cublasHandle_t* cuda_handle_ptr,
+                         int t1, int t2, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, index_t batch_size,
+                         index_t strd_a_mul, index_t strd_b_mul,
+                         index_t strd_c_mul, bool* success) {
+      run<scalar_t>(st, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta,
+                    batch_size, strd_a_mul, strd_b_mul, strd_c_mul, success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t1s, t2s, m, k, n, batch_size, stride_a_mul, stride_b_mul,
+            stride_c_mul, blas_benchmark::utils::MEM_TYPE_USM)
+            .c_str(),
+        BM_lambda, cuda_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size,
+        stride_a_mul, stride_b_mul, stride_c_mul, success)
+        ->UseRealTime();
+  }
+}
+
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args,
                       cublasHandle_t* cuda_handle_ptr, bool* success) {
diff --git a/benchmark/cublas/utils.hpp b/benchmark/cublas/utils.hpp
index eeaee7371..362fdce51 100644
--- a/benchmark/cublas/utils.hpp
+++ b/benchmark/cublas/utils.hpp
@@ -33,6 +33,7 @@
 #include "portblas.h"
 #include <common/common_utils.hpp>
 
+#include <cuComplex.h>
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -179,6 +180,15 @@ class CUDAVectorBatched : private CUDADeviceMemory<T> {
     }
   }
 
+  CUDAVectorBatched(size_t matrix_size, size_t batch_count, T* h_v)
+      : CUDAVectorBatched<T, CopyToHost>(matrix_size, batch_count) {
+    if constexpr (CopyToHost) h_data = h_v;
+    for (int i = 0; i < batch_count; ++i) {
+      CUDA_CHECK(cudaMemcpy(d_data[i], &h_v[matrix_size * i],
+                            sizeof(T) * c_matrix_size, cudaMemcpyHostToDevice));
+    }
+  }
+
   ~CUDAVectorBatched() {
     if constexpr (CopyToHost) {
       for (int i = 0; i < c_batch_count; ++i) {
diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt
index 785996422..4ac3fdeaa 100644
--- a/benchmark/portblas/CMakeLists.txt
+++ b/benchmark/portblas/CMakeLists.txt
@@ -75,12 +75,20 @@ if(${BLAS_ENABLE_EXTENSIONS})
   list(APPEND sources extension/reduction.cpp)
 endif()
 
+# Operators that support benchmarking with COMPLEX data types
+set(CPLX_OPS "gemm" "gemm_batched" "gemm_batched_strided")
+
 # Add individual benchmarks for each method
 foreach(portblas_bench ${sources})
   get_filename_component(bench_exec ${portblas_bench} NAME_WE)
   add_executable(bench_${bench_exec} ${portblas_bench} main.cpp)
   target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara portblas bench_info)
   target_compile_definitions(bench_${bench_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE})
+  if(${BLAS_ENABLE_COMPLEX})
+    if("${bench_exec}" IN_LIST CPLX_OPS)
+      target_compile_definitions(bench_${bench_exec} PRIVATE BLAS_ENABLE_COMPLEX=1)
+    endif()
+  endif()
   add_sycl_to_target(
     TARGET bench_${bench_exec}
     SOURCES ${portblas_bench}
diff --git a/benchmark/portblas/blas3/gemm.cpp b/benchmark/portblas/blas3/gemm.cpp
index 51d4869a8..27bb90650 100644
--- a/benchmark/portblas/blas3/gemm.cpp
+++ b/benchmark/portblas/blas3/gemm.cpp
@@ -177,6 +177,191 @@ void register_benchmark(blas_benchmark::Args& args,
 #endif
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
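+// Complex-data run: host buffers hold std::complex<scalar_t>, while the
+// device kernels operate on blas::complex_sycl<scalar_t>.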
+template <typename scalar_t, blas::helper::AllocType mem_alloc>
+void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,
+         int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
+         std::complex<scalar_t> beta, bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(
+      state, sb_handle_ptr->get_queue());
+
+  // Standard test setup.
+  std::string t1s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t1));
+  std::string t2s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t2));
+  const char* t_a = t1s.c_str();
+  const char* t_b = t2s.c_str();
+
+  index_t lda = t_a[0] == 'n' ? m : k;
+  index_t ldb = t_b[0] == 'n' ? k : n;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm, scalar_t>(state, beta, m, n, k,
+                                                       static_cast<index_t>(1));
+
+  blas::SB_Handle& sb_handle = *sb_handle_ptr;
+  auto q = sb_handle.get_queue();
+
+  // Matrices
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(m * k);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(k * n);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(m * n, 0);
+
+  auto a_gpu =
+      blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(m * k, q);
+  auto b_gpu =
+      blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(k * n, q);
+  auto c_gpu =
+      blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(m * n, q);
+
+  auto copy_a = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(a.data()), a_gpu,
+      m * k);
+  auto copy_b = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(b.data()), b_gpu,
+      n * k);
+  auto copy_c = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(c.data()), c_gpu,
+      m * n);
+
+  sb_handle.wait({copy_a, copy_b, copy_c});
+
+  // Kernel expects sycl::complex and not std::complex data
+  blas::complex_sycl<scalar_t> alpha_sycl(alpha);
+  blas::complex_sycl<scalar_t> beta_sycl(beta);
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  // Run a first time with a verification of the results
+  std::vector<std::complex<scalar_t>> c_ref = c;
+  reference_blas::cgemm<scalar_t>(t_a, t_b, m, n, k,
+                                  reinterpret_cast<const void*>(&alpha),
+                                  reinterpret_cast<const void*>(a.data()), lda,
+                                  reinterpret_cast<const void*>(b.data()), ldb,
+                                  reinterpret_cast<const void*>(&beta),
+                                  reinterpret_cast<void*>(c_ref.data()), ldc);
+
+  std::vector<std::complex<scalar_t>> c_temp = c;
+
+  {
+    auto c_temp_gpu =
+        blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(m * n,
+                                                                        q);
+    auto copy_temp = blas::helper::copy_to_device(
+        q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(c_temp.data()),
+        c_temp_gpu, m * n);
+    sb_handle.wait(copy_temp);
+    auto gemm_event = _gemm(sb_handle, *t_a, *t_b, m, n, k, alpha_sycl, a_gpu,
+                            lda, b_gpu, ldb, beta_sycl, c_temp_gpu, ldc);
+    sb_handle.wait(gemm_event);
+    auto copy_out = blas::helper::copy_to_host(
+        q, c_temp_gpu,
+        reinterpret_cast<blas::complex_sycl<scalar_t>*>(c_temp.data()), m * n);
+    sb_handle.wait(copy_out);
+
+    blas::helper::deallocate<mem_alloc>(c_temp_gpu, q);
+  }
+
+  std::ostringstream err_stream;
+  if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) {
+    const std::string& err_str = err_stream.str();
+    state.SkipWithError(err_str.c_str());
+    *success = false;
+  };
+#endif
+
+  auto blas_method_def = [&]() -> std::vector<cl::sycl::event> {
+    auto event = _gemm(sb_handle, *t_a, *t_b, m, n, k, alpha_sycl, a_gpu, lda,
+                       b_gpu, ldb, beta_sycl, c_gpu, ldc);
+    sb_handle.wait(event);
+    return event;
+  };
+
+  // Warmup
+  blas_benchmark::utils::warmup(blas_method_def);
+  sb_handle.wait();
+
+  blas_benchmark::utils::init_counters(state);
+
+  // Measure
+  for (auto _ : state) {
+    // Run
+    std::tuple<double, double> times =
+        blas_benchmark::utils::timef(blas_method_def);
+
+    // Report
+    blas_benchmark::utils::update_counters(state, times);
+  }
+
+  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+  state.SetBytesProcessed(state.iterations() *
+                          state.counters["bytes_processed"]);
+
+  blas_benchmark::utils::calc_avg_counters(state);
+
+  blas::helper::deallocate<mem_alloc>(a_gpu, q);
+  blas::helper::deallocate<mem_alloc>(b_gpu, q);
+  blas::helper::deallocate<mem_alloc>(c_gpu, q);
+};
+
+/*! @brief Register & run the gemm benchmark for complex data types.
+ * This function is similar to register_benchmark.
+ *
+ * @tparam scalar_t element data type of underlying complex (float or double)
+ * @tparam mem_alloc USM or Buffer memory allocation
+ */
+template <typename scalar_t, blas::helper::AllocType mem_alloc>
+void register_cplx_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
+                             std::string mem_type,
+                             std::vector<blas3_cplx_param_t<scalar_t>> params) {
+  for (auto p : params) {
+    std::string t1s, t2s;
+    index_t m, n, k;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+
+    std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p;
+    int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
+    int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+
+    auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
+                         int t1, int t2, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, bool* success) {
+      run<scalar_t, mem_alloc>(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta,
+                               success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t1s, t2s, m, k, n, mem_type)
+            .c_str(),
+        BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, success)
+        ->UseRealTime();
+  }
+}
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             blas::SB_Handle* sb_handle_ptr, bool* success) {
+  auto gemm_params =
+      blas_benchmark::utils::get_blas3_cplx_params<scalar_t>(args);
+  register_cplx_benchmark<scalar_t, blas::helper::AllocType::buffer>(
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
+      gemm_params);
+#ifdef SB_ENABLE_USM
+  register_cplx_benchmark<scalar_t, blas::helper::AllocType::usm>(
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, gemm_params);
+#endif
+}
+
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args,
                       blas::SB_Handle* sb_handle_ptr, bool* success) {
diff --git a/benchmark/portblas/blas3/gemm_batched.cpp b/benchmark/portblas/blas3/gemm_batched.cpp
index 959f9eae7..aabd9449a 100644
--- a/benchmark/portblas/blas3/gemm_batched.cpp
+++ b/benchmark/portblas/blas3/gemm_batched.cpp
@@ -225,8 +225,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
     };
     benchmark::RegisterBenchmark(
         blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
-            t1s, t2s, m, k, n, batch_size, batch_type,
-            mem_type).c_str(),
+            t1s, t2s, m, k, n, batch_size, batch_type, mem_type)
+            .c_str(),
         BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size,
         batch_type, success)
         ->UseRealTime();
@@ -239,13 +239,222 @@ void register_benchmark(blas_benchmark::Args& args,
   auto gemm_batched_params =
       blas_benchmark::utils::get_gemm_batched_params<scalar_t>(args);
   register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
-      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, gemm_batched_params);
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
+      gemm_batched_params);
 #ifdef SB_ENABLE_USM
   register_benchmark<scalar_t, blas::helper::AllocType::usm>(
-      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, gemm_batched_params);
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM,
+      gemm_batched_params);
 #endif
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t, blas::helper::AllocType mem_alloc>
+void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,
+         int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
+         std::complex<scalar_t> beta, index_t batch_size, int batch_type_i,
+         bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(
+      state, sb_handle_ptr->get_queue());
+
+  // Standard test setup.
+  std::string t1s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t1));
+  std::string t2s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t2));
+  const char* t_a = t1s.c_str();
+  const char* t_b = t2s.c_str();
+  auto batch_type = static_cast<blas::gemm_batch_type_t>(batch_type_i);
+
+  index_t lda = t_a[0] == 'n' ? m : k;
+  index_t ldb = t_b[0] == 'n' ? k : n;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm_batched, scalar_t>(
+      state, beta, m, n, k, batch_size);
+
+  blas::SB_Handle& sb_handle = *sb_handle_ptr;
+  auto q = sb_handle.get_queue();
+
+  // Matrices
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(m * k * batch_size);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(k * n * batch_size);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(m * n * batch_size,
+                                                       scalar_t(0));
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  // Run a first time with a verification of the results
+  std::vector<std::complex<scalar_t>> c_ref = c;
+  auto _base = [=](index_t dim0, index_t dim1, index_t idx) {
+    return dim0 * dim1 * idx;
+  };
+  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
+    reference_blas::cgemm<scalar_t>(
+        t_a, t_b, m, n, k, reinterpret_cast<const void*>(&alpha),
+        reinterpret_cast<const void*>(a.data() + _base(m, k, batch_idx)), lda,
+        reinterpret_cast<const void*>(b.data() + _base(k, n, batch_idx)), ldb,
+        reinterpret_cast<const void*>(&beta),
+        reinterpret_cast<void*>(c_ref.data() + _base(m, n, batch_idx)), ldc);
+  }
+
+#endif  // BLAS_VERIFY_BENCHMARK
+
+  auto a_gpu = blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
+      m * k * batch_size, q);
+  auto b_gpu = blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
+      k * n * batch_size, q);
+  auto c_gpu = blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
+      m * n * batch_size, q);
+
+  auto copy_a = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(a.data()), a_gpu,
+      m * k * batch_size);
+  auto copy_b = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(b.data()), b_gpu,
+      n * k * batch_size);
+  auto copy_c = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(c.data()), c_gpu,
+      m * n * batch_size);
+
+  sb_handle.wait({copy_a, copy_b, copy_c});
+
+  // Kernel expects sycl::complex and not std::complex data
+  blas::complex_sycl<scalar_t> alpha_sycl(alpha);
+  blas::complex_sycl<scalar_t> beta_sycl(beta);
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  std::vector<std::complex<scalar_t>> c_temp = c;
+  {
+    auto c_temp_gpu =
+        blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
+            m * n * batch_size, q);
+    auto copy_temp = blas::helper::copy_to_device(
+        q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(c_temp.data()),
+        c_temp_gpu, m * n * batch_size);
+    sb_handle.wait(copy_temp);
+    auto gemm_batched_event = _gemm_batched(
+        sb_handle, *t_a, *t_b, m, n, k, alpha_sycl, a_gpu, lda, b_gpu, ldb,
+        beta_sycl, c_temp_gpu, ldc, batch_size, batch_type);
+    sb_handle.wait(gemm_batched_event);
+    auto copy_out = blas::helper::copy_to_host(
+        q, c_temp_gpu,
+        reinterpret_cast<blas::complex_sycl<scalar_t>*>(c_temp.data()),
+        m * n * batch_size);
+    sb_handle.wait(copy_out);
+
+    blas::helper::deallocate<mem_alloc>(c_temp_gpu, q);
+  }
+
+  std::ostringstream err_stream;
+  if (!utils::compare_vectors<scalar_t>(c_temp, c_ref, err_stream, "")) {
+    const std::string& err_str = err_stream.str();
+    state.SkipWithError(err_str.c_str());
+    *success = false;
+  };
+#endif  // BLAS_VERIFY_BENCHMARK
+
+  auto blas_method_def = [&]() -> std::vector<cl::sycl::event> {
+    auto event = _gemm_batched(sb_handle, *t_a, *t_b, m, n, k, alpha_sycl,
+                               a_gpu, lda, b_gpu, ldb, beta_sycl, c_gpu, ldc,
+                               batch_size, batch_type);
+    sb_handle.wait(event);
+    return event;
+  };
+
+  // Warmup
+  blas_benchmark::utils::warmup(blas_method_def);
+  sb_handle.wait();
+
+  blas_benchmark::utils::init_counters(state);
+
+  // Measure
+  for (auto _ : state) {
+    // Run
+    std::tuple<double, double> times =
+        blas_benchmark::utils::timef(blas_method_def);
+
+    // Report
+    blas_benchmark::utils::update_counters(state, times);
+  }
+
+  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+  state.SetBytesProcessed(state.iterations() *
+                          state.counters["bytes_processed"]);
+
+  blas_benchmark::utils::calc_avg_counters(state);
+
+  blas::helper::deallocate<mem_alloc>(a_gpu, q);
+  blas::helper::deallocate<mem_alloc>(b_gpu, q);
+  blas::helper::deallocate<mem_alloc>(c_gpu, q);
+};
+
+/*! @brief Register & run the gemm_batched benchmark for complex data types.
+ * This function is similar to register_benchmark.
+ *
+ * @tparam scalar_t element data type of underlying complex (float or double)
+ * @tparam mem_alloc USM or Buffer memory allocation
+ */
+template <typename scalar_t, blas::helper::AllocType mem_alloc>
+void register_cplx_benchmark(
+    blas::SB_Handle* sb_handle_ptr, bool* success, std::string mem_type,
+    std::vector<gemm_batched_cplx_param_t<scalar_t>> params) {
+  for (auto p : params) {
+    std::string t1s, t2s;
+    index_t m, n, k, batch_size;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+    int batch_type;
+    std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_size,
+             batch_type) = p;
+    // Only batch_type == strided is supported with complex data
+    if (batch_type == 1) {
+      std::cerr << "Interleaved memory for the gemm_batched operator is not "
+                   "supported with complex data types\n";
+      continue;
+    }
+    int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
+    int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+
+    auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
+                         int t1, int t2, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, index_t batch_size,
+                         int batch_type, bool* success) {
+      run<scalar_t, mem_alloc>(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta,
+                               batch_size, batch_type, success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t1s, t2s, m, k, n, batch_size, batch_type, mem_type)
+            .c_str(),
+        BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size,
+        batch_type, success)
+        ->UseRealTime();
+  }
+}
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             blas::SB_Handle* sb_handle_ptr, bool* success) {
+  auto gemm_batched_params =
+      blas_benchmark::utils::get_gemm_cplx_batched_params<scalar_t>(args);
+  register_cplx_benchmark<scalar_t, blas::helper::AllocType::buffer>(
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
+      gemm_batched_params);
+#ifdef SB_ENABLE_USM
+  register_cplx_benchmark<scalar_t, blas::helper::AllocType::usm>(
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM,
+      gemm_batched_params);
+#endif
+}
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args,
                       blas::SB_Handle* sb_handle_ptr, bool* success) {
diff --git a/benchmark/portblas/blas3/gemm_batched_strided.cpp b/benchmark/portblas/blas3/gemm_batched_strided.cpp
index 0fdb29db9..a24a2a188 100644
--- a/benchmark/portblas/blas3/gemm_batched_strided.cpp
+++ b/benchmark/portblas/blas3/gemm_batched_strided.cpp
@@ -195,7 +195,8 @@ void register_benchmark(
     benchmark::RegisterBenchmark(
         blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
             t1s, t2s, m, k, n, batch_size, stride_a_mul, stride_b_mul,
-            stride_c_mul, mem_type).c_str(),
+            stride_c_mul, mem_type)
+            .c_str(),
         BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size,
         stride_a_mul, stride_b_mul, stride_c_mul, success)
         ->UseRealTime();
@@ -208,13 +209,236 @@ void register_benchmark(blas_benchmark::Args& args,
   auto gemm_batched_strided_params =
       blas_benchmark::utils::get_gemm_batched_strided_params<scalar_t>(args);
   register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
-      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, gemm_batched_strided_params);
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
+      gemm_batched_strided_params);
 #ifdef SB_ENABLE_USM
   register_benchmark<scalar_t, blas::helper::AllocType::usm>(
-      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, gemm_batched_strided_params);
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM,
+      gemm_batched_strided_params);
 #endif
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t, blas::helper::AllocType mem_alloc>
+void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1,
+         int t2, index_t m, index_t k, index_t n, std::complex<scalar_t> alpha,
+         std::complex<scalar_t> beta, index_t batch_size, index_t stride_a_mul,
+         index_t stride_b_mul, index_t stride_c_mul, bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(
+      state, sb_handle_ptr->get_queue());
+
+  // Standard test setup.
+  std::string t1s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t1));
+  std::string t2s = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t2));
+  const char* t_a = t1s.c_str();
+  const char* t_b = t2s.c_str();
+
+  const bool trA = t_a[0] != 'n';
+  const bool trB = t_b[0] != 'n';
+
+  index_t lda = trA ? k : m;
+  index_t ldb = trB ? n : k;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm_batched_strided, scalar_t>(
+      state, beta, m, n, k, batch_size, stride_a_mul, stride_b_mul,
+      stride_c_mul);
+
+  blas::SB_Handle& sb_handle = *sb_handle_ptr;
+  auto q = sb_handle.get_queue();
+
+  // Data sizes
+  // Elementary matrices
+  const index_t a_size = m * k;
+  const index_t b_size = k * n;
+  const index_t c_size = m * n;
+  // Strides
+  const index_t stride_a = stride_a_mul * a_size;
+  const index_t stride_b = stride_b_mul * b_size;
+  const index_t stride_c = stride_c_mul * c_size;
+  // Batched matrices
+  const int size_a_batch = a_size + (batch_size - 1) * stride_a;
+  const int size_b_batch = b_size + (batch_size - 1) * stride_b;
+  const int size_c_batch = c_size + (batch_size - 1) * stride_c;
+
+  // Matrices
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(size_a_batch);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(size_b_batch);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(size_c_batch,
+                                                       scalar_t(0));
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  // Run a first time with a verification of the results
+  std::vector<std::complex<scalar_t>> c_ref = c;
+  for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
+    reference_blas::cgemm<scalar_t>(
+        t_a, t_b, m, n, k, reinterpret_cast<const void*>(&alpha),
+        reinterpret_cast<const void*>(a.data() + batch_idx * stride_a), lda,
+        reinterpret_cast<const void*>(b.data() + batch_idx * stride_b), ldb,
+        reinterpret_cast<const void*>(&beta),
+        reinterpret_cast<void*>(c_ref.data() + batch_idx * stride_c), ldc);
+  }
+
+#endif
+
+  auto a_gpu = blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
+      size_a_batch, q);
+  auto b_gpu = blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
+      size_b_batch, q);
+  auto c_gpu = blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
+      size_c_batch, q);
+
+  auto copy_a = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(a.data()), a_gpu,
+      size_a_batch);
+  auto copy_b = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(b.data()), b_gpu,
+      size_b_batch);
+  auto copy_c = blas::helper::copy_to_device(
+      q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(c.data()), c_gpu,
+      size_c_batch);
+
+  sb_handle.wait({copy_a, copy_b, copy_c});
+
+  // Kernel expects sycl::complex and not std::complex data
+  blas::complex_sycl<scalar_t> alpha_sycl(alpha);
+  blas::complex_sycl<scalar_t> beta_sycl(beta);
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  std::vector<std::complex<scalar_t>> c_temp = c;
+  {
+    auto c_temp_gpu =
+        blas::helper::allocate<mem_alloc, blas::complex_sycl<scalar_t>>(
+            size_c_batch, q);
+    auto copy_temp = blas::helper::copy_to_device(
+        q, reinterpret_cast<blas::complex_sycl<scalar_t>*>(c_temp.data()),
+        c_temp_gpu, size_c_batch);
+    sb_handle.wait(copy_temp);
+    auto gemm_batched_strided_event = _gemm_strided_batched(
+        sb_handle, *t_a, *t_b, m, n, k, alpha_sycl, a_gpu, lda, stride_a, b_gpu,
+        ldb, stride_b, beta_sycl, c_temp_gpu, ldc, stride_c, batch_size);
+    sb_handle.wait(gemm_batched_strided_event);
+
+    auto copy_out = blas::helper::copy_to_host(
+        q, c_temp_gpu,
+        reinterpret_cast<blas::complex_sycl<scalar_t>*>(c_temp.data()),
+        size_c_batch);
+    sb_handle.wait(copy_out);
+
+    blas::helper::deallocate<mem_alloc>(c_temp_gpu, q);
+  }
+
+  std::ostringstream err_stream;
+  if (!::utils::compare_vectors_strided<scalar_t>(c_temp, c_ref, stride_c,
+                                                  c_size, err_stream, "")) {
+    const std::string& err_str = err_stream.str();
+    state.SkipWithError(err_str.c_str());
+    *success = false;
+  };
+#endif
+
+  auto blas_method_def = [&]() -> std::vector<cl::sycl::event> {
+    auto event = _gemm_strided_batched(
+        sb_handle, *t_a, *t_b, m, n, k, alpha_sycl, a_gpu, lda, stride_a, b_gpu,
+        ldb, stride_b, beta_sycl, c_gpu, ldc, stride_c, batch_size);
+    sb_handle.wait(event);
+    return event;
+  };
+
+  // Warmup
+  blas_benchmark::utils::warmup(blas_method_def);
+  sb_handle.wait();
+
+  blas_benchmark::utils::init_counters(state);
+
+  // Measure
+  for (auto _ : state) {
+    // Run
+    std::tuple<double, double> times =
+        blas_benchmark::utils::timef(blas_method_def);
+
+    // Report
+    blas_benchmark::utils::update_counters(state, times);
+  }
+
+  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+  state.SetBytesProcessed(state.iterations() *
+                          state.counters["bytes_processed"]);
+
+  blas_benchmark::utils::calc_avg_counters(state);
+
+  blas::helper::deallocate<mem_alloc>(a_gpu, q);
+  blas::helper::deallocate<mem_alloc>(b_gpu, q);
+  blas::helper::deallocate<mem_alloc>(c_gpu, q);
+};
+
+/*! @brief Register & run the gemm_batched_strided benchmark for complex data
+ * types. This function is similar to register_benchmark.
+ *
+ * @tparam scalar_t element data type of underlying complex (float or double)
+ * @tparam mem_alloc USM or Buffer memory allocation
+ */
+template <typename scalar_t, blas::helper::AllocType mem_alloc>
+void register_cplx_benchmark(
+    blas::SB_Handle* sb_handle_ptr, bool* success, std::string mem_type,
+    std::vector<gemm_batched_strided_cplx_param_t<scalar_t>> params) {
+  for (auto p : params) {
+    std::string t1s, t2s;
+    index_t m, n, k, batch_size, stride_a_mul, stride_b_mul, stride_c_mul;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+
+    std::tie(t1s, t2s, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_size,
+             stride_a_mul, stride_b_mul, stride_c_mul) = p;
+    int t1 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t1s));
+    int t2 = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t2s));
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+
+    auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
+                         int t1, int t2, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, index_t batch_size,
+                         index_t stride_a_mul, index_t stride_b_mul,
+                         index_t stride_c_mul, bool* success) {
+      run<scalar_t, mem_alloc>(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta,
+                               batch_size, stride_a_mul, stride_b_mul,
+                               stride_c_mul, success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t1s, t2s, m, k, n, batch_size, stride_a_mul, stride_b_mul,
+            stride_c_mul, mem_type)
+            .c_str(),
+        BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size,
+        stride_a_mul, stride_b_mul, stride_c_mul, success)
+        ->UseRealTime();
+  }
+}
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             blas::SB_Handle* sb_handle_ptr, bool* success) {
+  auto gemm_batched_strided_params =
+      blas_benchmark::utils::get_gemm_batched_strided_cplx_params<scalar_t>(
+          args);
+  register_cplx_benchmark<scalar_t, blas::helper::AllocType::buffer>(
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
+      gemm_batched_strided_params);
+#ifdef SB_ENABLE_USM
+  register_cplx_benchmark<scalar_t, blas::helper::AllocType::usm>(
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM,
+      gemm_batched_strided_params);
+#endif
+}
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args,
                       blas::SB_Handle* sb_handle_ptr, bool* success) {
diff --git a/benchmark/rocblas/CMakeLists.txt b/benchmark/rocblas/CMakeLists.txt
index caa884725..64a559931 100644
--- a/benchmark/rocblas/CMakeLists.txt
+++ b/benchmark/rocblas/CMakeLists.txt
@@ -77,6 +77,9 @@ set(sources
 
 )
 
+# Operators supporting COMPLEX types benchmarking
+set(CPLX_OPS "gemm" "gemm_batched" "gemm_batched_strided")
+
 # Add individual benchmarks for each method
 foreach(rocblas_benchmark ${sources})
   get_filename_component(rocblas_bench_exec ${rocblas_benchmark} NAME_WE)
@@ -84,7 +87,11 @@ foreach(rocblas_benchmark ${sources})
   target_link_libraries(bench_rocblas_${rocblas_bench_exec} PRIVATE benchmark Clara::Clara roc::rocblas bench_info) 
   target_compile_definitions(bench_rocblas_${rocblas_bench_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE})
   target_include_directories(bench_rocblas_${rocblas_bench_exec} PRIVATE  ${PORTBLAS_INCLUDE} ${rocblas_INCLUDE_DIRS} ${CBLAS_INCLUDE} ${BLAS_BENCH} ${PORTBLAS_COMMON_INCLUDE_DIR})
-
+  if(${BLAS_ENABLE_COMPLEX})
+    if("${rocblas_bench_exec}" IN_LIST CPLX_OPS)
+      target_compile_definitions(bench_rocblas_${rocblas_bench_exec} PRIVATE BLAS_ENABLE_COMPLEX=1)
+    endif()
+  endif()
   # Even though rocblas does not use sycl, the common tools indirectly include sycl headers.
   add_sycl_to_target(
       TARGET bench_rocblas_${rocblas_bench_exec}
diff --git a/benchmark/rocblas/blas3/gemm.cpp b/benchmark/rocblas/blas3/gemm.cpp
index b403bafec..ca07ba2ba 100644
--- a/benchmark/rocblas/blas3/gemm.cpp
+++ b/benchmark/rocblas/blas3/gemm.cpp
@@ -38,6 +38,18 @@ static inline void rocblas_gemm_f(args_t&&... args) {
   return;
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t, typename... args_t>
+static inline void rocblas_cplx_gemm_f(args_t&&... args) {
+  if constexpr (std::is_same_v<scalar_t, float>) {
+    CHECK_ROCBLAS_STATUS(rocblas_cgemm(std::forward<args_t>(args)...));
+  } else if constexpr (std::is_same_v<scalar_t, double>) {
+    CHECK_ROCBLAS_STATUS(rocblas_zgemm(std::forward<args_t>(args)...));
+  }
+  return;
+}
+#endif
+
 template <typename scalar_t>
 void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
          int t_b_i, index_t m, index_t k, index_t n, scalar_t alpha,
@@ -183,6 +195,177 @@ void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
   }
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
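+// rocComplex<scalar_t> maps the underlying real scalar type to the matching
+// rocBLAS complex type: float -> rocblas_float_complex,
+// double -> rocblas_double_complex.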
+template <typename scalar_t>
+using rocComplex =
+    typename std::conditional<sizeof(scalar_t) == 8, rocblas_double_complex,
+                              rocblas_float_complex>::type;
+
+template <typename scalar_t>
+void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
+         int t_b_i, index_t m, index_t k, index_t n,
+         std::complex<scalar_t> alpha, std::complex<scalar_t> beta,
+         bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);
+
+  // Standard test setup.
+  std::string t_a = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t_a_i));
+  std::string t_b = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t_b_i));
+  const char* t_a_str = t_a.c_str();
+  const char* t_b_str = t_b.c_str();
+
+  index_t lda = t_a_str[0] == 'n' ? m : k;
+  index_t ldb = t_b_str[0] == 'n' ? k : n;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm, scalar_t>(state, beta, m, n, k);
+
+  // Matrix options (rocBLAS)
+  const rocblas_operation trans_a_rb =
+      t_a_str[0] == 'n' ? rocblas_operation_none : rocblas_operation_transpose;
+  const rocblas_operation trans_b_rb =
+      t_b_str[0] == 'n' ? rocblas_operation_none : rocblas_operation_transpose;
+
+  // rocBLAS complex alpha & beta
+  rocComplex<scalar_t> rocBeta{beta.real(), beta.imag()};
+  rocComplex<scalar_t> rocAlpha{alpha.real(), alpha.imag()};
+
+  // Data sizes
+  const int a_size = m * k;
+  const int b_size = k * n;
+  const int c_size = m * n;
+
+  // Matrices
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(a_size);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(b_size);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(c_size, 0);
+
+  {
+    // Device memory allocation & H2D copy
+    blas_benchmark::utils::HIPVector<rocComplex<scalar_t>> a_gpu(
+        a_size, reinterpret_cast<rocComplex<scalar_t>*>(a.data()));
+    blas_benchmark::utils::HIPVector<rocComplex<scalar_t>> b_gpu(
+        b_size, reinterpret_cast<rocComplex<scalar_t>*>(b.data()));
+    blas_benchmark::utils::HIPVector<rocComplex<scalar_t>> c_gpu(
+        c_size, reinterpret_cast<rocComplex<scalar_t>*>(c.data()));
+
+#ifdef BLAS_VERIFY_BENCHMARK
+    // Reference gemm
+    std::vector<std::complex<scalar_t>> c_ref = c;
+    reference_blas::cgemm<scalar_t>(
+        t_a_str, t_b_str, m, n, k, reinterpret_cast<const void*>(&alpha),
+        reinterpret_cast<const void*>(a.data()), lda,
+        reinterpret_cast<const void*>(b.data()), ldb,
+        reinterpret_cast<const void*>(&beta),
+        reinterpret_cast<void*>(c_ref.data()), ldc);
+
+    // Rocblas verification gemm
+    std::vector<std::complex<scalar_t>> c_temp = c;
+    {
+      blas_benchmark::utils::HIPVector<rocComplex<scalar_t>, true> c_temp_gpu(
+          c_size, reinterpret_cast<rocComplex<scalar_t>*>(c_temp.data()));
+      rocblas_cplx_gemm_f<scalar_t>(rb_handle, trans_a_rb, trans_b_rb, m, n, k,
+                                    &rocAlpha, a_gpu, lda, b_gpu, ldb, &rocBeta,
+                                    c_temp_gpu, ldc);
+    }
+
+    std::ostringstream err_stream;
+    if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) {
+      const std::string& err_str = err_stream.str();
+      state.SkipWithError(err_str.c_str());
+      *success = false;
+    };
+#endif
+
+    auto blas_warmup = [&]() -> void {
+      rocblas_cplx_gemm_f<scalar_t>(rb_handle, trans_a_rb, trans_b_rb, m, n, k,
+                                    &rocAlpha, a_gpu, lda, b_gpu, ldb, &rocBeta,
+                                    c_gpu, ldc);
+      return;
+    };
+
+    hipEvent_t start, stop;
+    CHECK_HIP_ERROR(hipEventCreate(&start));
+    CHECK_HIP_ERROR(hipEventCreate(&stop));
+
+    auto blas_method_def = [&]() -> std::vector<hipEvent_t> {
+      CHECK_HIP_ERROR(hipEventRecord(start, NULL));
+      rocblas_cplx_gemm_f<scalar_t>(rb_handle, trans_a_rb, trans_b_rb, m, n, k,
+                                    &rocAlpha, a_gpu, lda, b_gpu, ldb, &rocBeta,
+                                    c_gpu, ldc);
+      CHECK_HIP_ERROR(hipEventRecord(stop, NULL));
+      CHECK_HIP_ERROR(hipEventSynchronize(stop));
+      return std::vector{start, stop};
+    };
+
+    // Warmup
+    blas_benchmark::utils::warmup(blas_warmup);
+    CHECK_HIP_ERROR(hipStreamSynchronize(NULL));
+
+    blas_benchmark::utils::init_counters(state);
+
+    // Measure
+    for (auto _ : state) {
+      // Run
+      std::tuple<double, double> times =
+          blas_benchmark::utils::timef_hip(blas_method_def);
+
+      // Report
+      blas_benchmark::utils::update_counters(state, times);
+    }
+
+    state.SetBytesProcessed(state.iterations() *
+                            state.counters["bytes_processed"]);
+    state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+
+    blas_benchmark::utils::calc_avg_counters(state);
+
+    CHECK_HIP_ERROR(hipEventDestroy(start));
+    CHECK_HIP_ERROR(hipEventDestroy(stop));
+  }  // release device memory via utils::HIPVector destructors
+};
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             rocblas_handle& rb_handle, bool* success) {
+  auto gemm_params =
+      blas_benchmark::utils::get_blas3_cplx_params<scalar_t>(args);
+
+  for (auto p : gemm_params) {
+    std::string t_a, t_b;
+    index_t m, n, k;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+
+    std::tie(t_a, t_b, m, k, n, alpha_r, alpha_i, beta_r, beta_i) = p;
+    int t_a_i = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t_a));
+    int t_b_i = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t_b));
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+
+    auto BM_lambda = [&](benchmark::State& st, rocblas_handle rb_handle,
+                         int t1i, int t2i, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, bool* success) {
+      run<scalar_t>(st, rb_handle, t1i, t2i, m, k, n, alpha, beta, success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t_a, t_b, m, k, n, blas_benchmark::utils::MEM_TYPE_USM)
+            .c_str(),
+        BM_lambda, rb_handle, t_a_i, t_b_i, m, k, n, alpha, beta, success)
+        ->UseRealTime();
+  }
+}
+
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
                       bool* success) {
diff --git a/benchmark/rocblas/blas3/gemm_batched.cpp b/benchmark/rocblas/blas3/gemm_batched.cpp
index 4cfb1418d..40147d5ff 100644
--- a/benchmark/rocblas/blas3/gemm_batched.cpp
+++ b/benchmark/rocblas/blas3/gemm_batched.cpp
@@ -38,6 +38,18 @@ static inline void rocblas_gemm_batched_f(args_t&&... args) {
   return;
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t, typename... args_t>
+static inline void rocblas_cplx_gemm_batched_f(args_t&&... args) {
+  if constexpr (std::is_same_v<scalar_t, float>) {
+    CHECK_ROCBLAS_STATUS(rocblas_cgemm_batched(std::forward<args_t>(args)...));
+  } else if constexpr (std::is_same_v<scalar_t, double>) {
+    CHECK_ROCBLAS_STATUS(rocblas_zgemm_batched(std::forward<args_t>(args)...));
+  }
+  return;
+}
+#endif
+
 template <typename scalar_t>
 void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i,
          index_t t_b_i, index_t m, index_t k, index_t n, scalar_t alpha,
@@ -209,6 +221,194 @@ void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
   }
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t>
+using rocComplex =
+    typename std::conditional<sizeof(scalar_t) == 8, rocblas_double_complex,
+                              rocblas_float_complex>::type;
+template <typename scalar_t>
+void run(benchmark::State& state, rocblas_handle& rb_handle, index_t t_a_i,
+         index_t t_b_i, index_t m, index_t k, index_t n,
+         std::complex<scalar_t> alpha, std::complex<scalar_t> beta,
+         index_t batch_size, int batch_type_i, bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);
+
+  // Standard setup
+  std::string t_a = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t_a_i));
+  std::string t_b = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t_b_i));
+  const char* t_a_str = t_a.c_str();
+  const char* t_b_str = t_b.c_str();
+  auto batch_type = static_cast<blas::gemm_batch_type_t>(batch_type_i);
+
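+  // trA/trB are true when the corresponding operand is not transposed ('n')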
+  const bool trA = (t_a_str[0] == 'n');
+  const bool trB = (t_b_str[0] == 'n');
+
+  index_t lda = trA ? m : k;
+  index_t ldb = trB ? k : n;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm_batched, scalar_t>(
+      state, beta, m, n, k, batch_size);
+
+  // Matrix options (rocBLAS)
+  const rocblas_operation trans_a_rb =
+      trA ? rocblas_operation_none : rocblas_operation_transpose;
+  const rocblas_operation trans_b_rb =
+      trB ? rocblas_operation_none : rocblas_operation_transpose;
+
+  // rocBLAS complex alpha & beta
+  rocComplex<scalar_t> rocBeta{beta.real(), beta.imag()};
+  rocComplex<scalar_t> rocAlpha{alpha.real(), alpha.imag()};
+
+  // Data sizes
+  const int a_size = m * k;
+  const int b_size = k * n;
+  const int c_size = m * n;
+
+  // Matrices
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(a_size * batch_size);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(b_size * batch_size);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(c_size * batch_size, 0);
+
+  {
+    // Device memory allocation & H2D copy
+    blas_benchmark::utils::HIPVectorBatched<rocComplex<scalar_t>> a_batched_gpu(
+        a_size, batch_size, reinterpret_cast<rocComplex<scalar_t>*>(a.data()));
+    blas_benchmark::utils::HIPVectorBatched<rocComplex<scalar_t>> b_batched_gpu(
+        b_size, batch_size, reinterpret_cast<rocComplex<scalar_t>*>(b.data()));
+    blas_benchmark::utils::HIPVectorBatched<rocComplex<scalar_t>> c_batched_gpu(
+        c_size, batch_size);
+
+#ifdef BLAS_VERIFY_BENCHMARK
+    // Reference batched gemm
+    std::vector<std::complex<scalar_t>> c_ref = c;
+    for (int batch = 0; batch < batch_size; batch++) {
+      reference_blas::cgemm<scalar_t>(
+          t_a_str, t_b_str, m, n, k, reinterpret_cast<const void*>(&alpha),
+          reinterpret_cast<const void*>(a.data() + batch * a_size), lda,
+          reinterpret_cast<const void*>(b.data() + batch * b_size), ldb,
+          reinterpret_cast<const void*>(&beta),
+          reinterpret_cast<void*>(c_ref.data() + batch * c_size), ldc);
+    }
+
+    // Rocblas verification gemm_batched
+    std::vector<std::complex<scalar_t>> c_temp = c;
+    {
+      blas_benchmark::utils::HIPVectorBatched<rocComplex<scalar_t>, true>
+          c_temp_gpu(c_size, batch_size,
+                     reinterpret_cast<rocComplex<scalar_t>*>(c_temp.data()));
+      rocblas_cplx_gemm_batched_f<scalar_t>(
+          rb_handle, trans_a_rb, trans_b_rb, m, n, k, &rocAlpha, a_batched_gpu,
+          lda, b_batched_gpu, ldb, &rocBeta, c_temp_gpu, ldc, batch_size);
+    }
+
+    std::ostringstream err_stream;
+    if (!utils::compare_vectors(c_temp, c_ref, err_stream, "")) {
+      const std::string& err_str = err_stream.str();
+      state.SkipWithError(err_str.c_str());
+      *success = false;
+    };
+#endif
+
+    auto blas_warmup = [&]() -> void {
+      rocblas_cplx_gemm_batched_f<scalar_t>(
+          rb_handle, trans_a_rb, trans_b_rb, m, n, k, &rocAlpha, a_batched_gpu,
+          lda, b_batched_gpu, ldb, &rocBeta, c_batched_gpu, ldc, batch_size);
+      return;
+    };
+
+    hipEvent_t start, stop;
+    CHECK_HIP_ERROR(hipEventCreate(&start));
+    CHECK_HIP_ERROR(hipEventCreate(&stop));
+
+    auto blas_method_def = [&]() -> std::vector<hipEvent_t> {
+      CHECK_HIP_ERROR(hipEventRecord(start, NULL));
+      rocblas_cplx_gemm_batched_f<scalar_t>(
+          rb_handle, trans_a_rb, trans_b_rb, m, n, k, &rocAlpha, a_batched_gpu,
+          lda, b_batched_gpu, ldb, &rocBeta, c_batched_gpu, ldc, batch_size);
+      CHECK_HIP_ERROR(hipEventRecord(stop, NULL));
+      CHECK_HIP_ERROR(hipEventSynchronize(stop));
+      return std::vector{start, stop};
+    };
+
+    // Warmup
+    blas_benchmark::utils::warmup(blas_warmup);
+    CHECK_HIP_ERROR(hipStreamSynchronize(NULL));
+
+    blas_benchmark::utils::init_counters(state);
+
+    // Measure
+    for (auto _ : state) {
+      // Run
+      std::tuple<double, double> times =
+          blas_benchmark::utils::timef_hip(blas_method_def);
+
+      // Report
+      blas_benchmark::utils::update_counters(state, times);
+    }
+
+    state.SetBytesProcessed(state.iterations() *
+                            state.counters["bytes_processed"]);
+    state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+
+    blas_benchmark::utils::calc_avg_counters(state);
+
+    CHECK_HIP_ERROR(hipEventDestroy(start));
+    CHECK_HIP_ERROR(hipEventDestroy(stop));
+  }  // release device memory via utils::HIPVector destructors
+};
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             rocblas_handle& rb_handle, bool* success) {
+  auto gemm_batched_params =
+      blas_benchmark::utils::get_gemm_cplx_batched_params<scalar_t>(args);
+
+  for (auto p : gemm_batched_params) {
+    std::string t_a, t_b;
+    index_t m, n, k, batch_size;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+    int batch_type;
+    std::tie(t_a, t_b, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_size,
+             batch_type) = p;
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+    if (batch_type == 1) {
+      std::cerr << "interleaved memory for gemm_batched operator is not "
+                   "supported by rocBLAS\n";
+      continue;
+    }
+
+    int t_a_i = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t_a));
+    int t_b_i = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t_b));
+
+    auto BM_lambda = [&](benchmark::State& st, rocblas_handle rb_handle,
+                         int t_a_i, int t_b_i, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, index_t batch_size,
+                         int batch_type, bool* success) {
+      run<scalar_t>(st, rb_handle, t_a_i, t_b_i, m, k, n, alpha, beta,
+                    batch_size, batch_type, success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t_a, t_b, m, k, n, batch_size, batch_type,
+            blas_benchmark::utils::MEM_TYPE_USM)
+            .c_str(),
+        BM_lambda, rb_handle, t_a_i, t_b_i, m, k, n, alpha, beta, batch_size,
+        batch_type, success)
+        ->UseRealTime();
+  }
+}
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
                       bool* success) {
diff --git a/benchmark/rocblas/blas3/gemm_batched_strided.cpp b/benchmark/rocblas/blas3/gemm_batched_strided.cpp
index 15dac9896..3ecbff82c 100644
--- a/benchmark/rocblas/blas3/gemm_batched_strided.cpp
+++ b/benchmark/rocblas/blas3/gemm_batched_strided.cpp
@@ -40,6 +40,20 @@ static inline void rocblas_gemm_strided_batched(args_t&&... args) {
   return;
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t, typename... args_t>
+static inline void rocblas_cplx_gemm_strided_batched(args_t&&... args) {
+  if constexpr (std::is_same_v<scalar_t, float>) {
+    CHECK_ROCBLAS_STATUS(
+        rocblas_cgemm_strided_batched(std::forward<args_t>(args)...));
+  } else if constexpr (std::is_same_v<scalar_t, double>) {
+    CHECK_ROCBLAS_STATUS(
+        rocblas_zgemm_strided_batched(std::forward<args_t>(args)...));
+  }
+  return;
+}
+#endif
+
 template <typename scalar_t>
 void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
          int t_b_i, index_t m, index_t k, index_t n, scalar_t alpha,
@@ -219,6 +233,209 @@ void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
   }
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t>
+using rocComplex =
+    typename std::conditional<sizeof(scalar_t) == 8, rocblas_double_complex,
+                              rocblas_float_complex>::type;
+
+template <typename scalar_t>
+void run(benchmark::State& state, rocblas_handle& rb_handle, int t_a_i,
+         int t_b_i, index_t m, index_t k, index_t n,
+         std::complex<scalar_t> alpha, std::complex<scalar_t> beta,
+         index_t batch_size, index_t stride_a_mul, index_t stride_b_mul,
+         index_t stride_c_mul, bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<std::complex<scalar_t>>(state);
+
+  // Standard test setup.
+  std::string t_a = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t_a_i));
+  std::string t_b = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(t_b_i));
+  const char* t_a_str = t_a.c_str();
+  const char* t_b_str = t_b.c_str();
+
+  const bool trA = (t_a_str[0] == 'n');
+  const bool trB = (t_b_str[0] == 'n');
+
+  index_t lda = trA ? m : k;
+  index_t ldb = trB ? k : n;
+  index_t ldc = m;
+
+  blas_benchmark::utils::init_level_3_cplx_counters<
+      blas_benchmark::utils::Level3Op::gemm_batched_strided, scalar_t>(
+      state, beta, m, n, k, batch_size, stride_a_mul, stride_b_mul,
+      stride_c_mul);
+
+  // Matrix options (rocBLAS)
+  const rocblas_operation trans_a_rb =
+      trA ? rocblas_operation_none : rocblas_operation_transpose;
+  const rocblas_operation trans_b_rb =
+      trB ? rocblas_operation_none : rocblas_operation_transpose;
+
+  // rocBLAS complex alpha & beta
+  rocComplex<scalar_t> rocBeta{beta.real(), beta.imag()};
+  rocComplex<scalar_t> rocAlpha{alpha.real(), alpha.imag()};
+
+  // Data sizes
+  // Elementary matrices
+  const index_t a_size = m * k;
+  const index_t b_size = k * n;
+  const index_t c_size = m * n;
+  // Strides
+  const index_t stride_a = stride_a_mul * a_size;
+  const index_t stride_b = stride_b_mul * b_size;
+  const index_t stride_c = stride_c_mul * c_size;
+  // Batched matrices
+  const int size_a_batch = a_size + (batch_size - 1) * stride_a;
+  const int size_b_batch = b_size + (batch_size - 1) * stride_b;
+  const int size_c_batch = c_size + (batch_size - 1) * stride_c;
+
+  // Matrices
+  std::vector<std::complex<scalar_t>> a =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(size_a_batch);
+  std::vector<std::complex<scalar_t>> b =
+      blas_benchmark::utils::random_cplx_data<scalar_t>(size_b_batch);
+  std::vector<std::complex<scalar_t>> c =
+      blas_benchmark::utils::const_cplx_data<scalar_t>(size_c_batch, 0);
+
+  {
+    // Device memory allocation & H2D copy
+    blas_benchmark::utils::HIPVectorBatchedStrided<rocComplex<scalar_t>>
+        a_batched_gpu(a_size, batch_size, stride_a,
+                      reinterpret_cast<rocComplex<scalar_t>*>(a.data()));
+    blas_benchmark::utils::HIPVectorBatchedStrided<rocComplex<scalar_t>>
+        b_batched_gpu(b_size, batch_size, stride_b,
+                      reinterpret_cast<rocComplex<scalar_t>*>(b.data()));
+    blas_benchmark::utils::HIPVectorBatchedStrided<rocComplex<scalar_t>>
+        c_batched_gpu(c_size, batch_size, stride_c,
+                      reinterpret_cast<rocComplex<scalar_t>*>(c.data()));
+
+#ifdef BLAS_VERIFY_BENCHMARK
+    // Reference gemm batched strided (strided loop of gemm)
+    std::vector<std::complex<scalar_t>> c_ref = c;
+    for (int batch = 0; batch < batch_size; batch++) {
+      reference_blas::cgemm<scalar_t>(
+          t_a_str, t_b_str, m, n, k, reinterpret_cast<const void*>(&alpha),
+          reinterpret_cast<const void*>(a.data() + batch * stride_a), lda,
+          reinterpret_cast<const void*>(b.data() + batch * stride_b), ldb,
+          reinterpret_cast<const void*>(&beta),
+          reinterpret_cast<void*>(c_ref.data() + batch * stride_c), ldc);
+    }
+
+    // Rocblas verification gemm_batched_strided
+    std::vector<std::complex<scalar_t>> c_temp = c;
+    {
+      blas_benchmark::utils::HIPVectorBatchedStrided<rocComplex<scalar_t>, true>
+          c_temp_gpu(c_size, batch_size, stride_c,
+                     reinterpret_cast<rocComplex<scalar_t>*>(c_temp.data()));
+      rocblas_cplx_gemm_strided_batched<scalar_t>(
+          rb_handle, trans_a_rb, trans_b_rb, m, n, k, &rocAlpha, a_batched_gpu,
+          lda, stride_a, b_batched_gpu, ldb, stride_b, &rocBeta, c_temp_gpu,
+          ldc, stride_c, batch_size);
+    }
+
+    std::ostringstream err_stream;
+    if (!utils::compare_vectors_strided(c_temp, c_ref, stride_c, c_size,
+                                        err_stream, "")) {
+      const std::string& err_str = err_stream.str();
+      state.SkipWithError(err_str.c_str());
+      *success = false;
+    };
+#endif
+
+    auto blas_warmup = [&]() -> void {
+      rocblas_cplx_gemm_strided_batched<scalar_t>(
+          rb_handle, trans_a_rb, trans_b_rb, m, n, k, &rocAlpha, a_batched_gpu,
+          lda, stride_a, b_batched_gpu, ldb, stride_b, &rocBeta, c_batched_gpu,
+          ldc, stride_c, batch_size);
+      return;
+    };
+
+    hipEvent_t start, stop;
+    CHECK_HIP_ERROR(hipEventCreate(&start));
+    CHECK_HIP_ERROR(hipEventCreate(&stop));
+
+    auto blas_method_def = [&]() -> std::vector<hipEvent_t> {
+      CHECK_HIP_ERROR(hipEventRecord(start, NULL));
+      rocblas_cplx_gemm_strided_batched<scalar_t>(
+          rb_handle, trans_a_rb, trans_b_rb, m, n, k, &rocAlpha, a_batched_gpu,
+          lda, stride_a, b_batched_gpu, ldb, stride_b, &rocBeta, c_batched_gpu,
+          ldc, stride_c, batch_size);
+      CHECK_HIP_ERROR(hipEventRecord(stop, NULL));
+      CHECK_HIP_ERROR(hipEventSynchronize(stop));
+      return std::vector{start, stop};
+    };
+
+    // Warmup
+    blas_benchmark::utils::warmup(blas_warmup);
+    CHECK_HIP_ERROR(hipStreamSynchronize(NULL));
+
+    blas_benchmark::utils::init_counters(state);
+
+    // Measure
+    for (auto _ : state) {
+      // Run
+      std::tuple<double, double> times =
+          blas_benchmark::utils::timef_hip(blas_method_def);
+
+      // Report
+      blas_benchmark::utils::update_counters(state, times);
+    }
+
+    state.SetBytesProcessed(state.iterations() *
+                            state.counters["bytes_processed"]);
+    state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+
+    blas_benchmark::utils::calc_avg_counters(state);
+
+    CHECK_HIP_ERROR(hipEventDestroy(start));
+    CHECK_HIP_ERROR(hipEventDestroy(stop));
+  }  // release device memory via utils::HIPVector destructors
+};
+
+template <typename scalar_t>
+void register_cplx_benchmark(blas_benchmark::Args& args,
+                             rocblas_handle& rb_handle, bool* success) {
+  auto gemm_batched_strided_params =
+      blas_benchmark::utils::get_gemm_batched_strided_cplx_params<scalar_t>(
+          args);
+
+  for (auto p : gemm_batched_strided_params) {
+    std::string t_a, t_b;
+    index_t m, n, k, batch_size, stride_a_mul, stride_b_mul, stride_c_mul;
+    scalar_t alpha_r, alpha_i, beta_r, beta_i;
+
+    std::tie(t_a, t_b, m, k, n, alpha_r, alpha_i, beta_r, beta_i, batch_size,
+             stride_a_mul, stride_b_mul, stride_c_mul) = p;
+    int t_a_i = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t_a));
+    int t_b_i = static_cast<int>(blas_benchmark::utils::to_transpose_enum(t_b));
+    std::complex<scalar_t> alpha{alpha_r, alpha_i};
+    std::complex<scalar_t> beta{beta_r, beta_i};
+
+    auto BM_lambda = [&](benchmark::State& st, rocblas_handle rb_handle,
+                         int t1i, int t2i, index_t m, index_t k, index_t n,
+                         std::complex<scalar_t> alpha,
+                         std::complex<scalar_t> beta, index_t batch_size,
+                         index_t strd_a_mul, index_t strd_b_mul,
+                         index_t strd_c_mul, bool* success) {
+      run<scalar_t>(st, rb_handle, t1i, t2i, m, k, n, alpha, beta, batch_size,
+                    strd_a_mul, strd_b_mul, strd_c_mul, success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<benchmark_op, std::complex<scalar_t>>(
+            t_a, t_b, m, k, n, batch_size, stride_a_mul, stride_b_mul,
+            stride_c_mul, blas_benchmark::utils::MEM_TYPE_USM)
+            .c_str(),
+        BM_lambda, rb_handle, t_a_i, t_b_i, m, k, n, alpha, beta, batch_size,
+        stride_a_mul, stride_b_mul, stride_c_mul, success)
+        ->UseRealTime();
+  }
+}
+
+#endif
+
 namespace blas_benchmark {
 void create_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
                       bool* success) {
diff --git a/common/include/common/blas3_state_counters.hpp b/common/include/common/blas3_state_counters.hpp
index c7515eb07..68e332773 100644
--- a/common/include/common/blas3_state_counters.hpp
+++ b/common/include/common/blas3_state_counters.hpp
@@ -76,6 +76,66 @@ init_level_3_counters(benchmark::State& state, scalar_t beta = 0, index_t m = 0,
   return;
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <Level3Op op, typename scalar_t, typename index_t>
+inline typename std::enable_if<op == Level3Op::gemm_batched_strided ||
+                               op == Level3Op::gemm_batched ||
+                               op == Level3Op::gemm>::type
+init_level_3_cplx_counters(
+    benchmark::State& state,
+    std::complex<scalar_t> beta = std::complex<scalar_t>(0, 0), index_t m = 0,
+    index_t n = 0, index_t k = 0, index_t batch_size = 1,
+    index_t stride_a_mul = 1, index_t stride_b_mul = 1,
+    index_t stride_c_mul = 1) {
+  // Google-benchmark counters are double.
+  double beta_real_d = static_cast<double>(beta.real());
+  double beta_imag_d = static_cast<double>(beta.imag());
+  double m_d = static_cast<double>(m);
+  double n_d = static_cast<double>(n);
+  double k_d = static_cast<double>(k);
+  double batch_size_d = static_cast<double>(batch_size);
+  state.counters["beta_real"] = beta_real_d;
+  state.counters["beta_imag"] = beta_real_d;
+  state.counters["m"] = m_d;
+  state.counters["n"] = n_d;
+  state.counters["k"] = k_d;
+  state.counters["batch_size"] = batch_size_d;
+  if constexpr (op == Level3Op::gemm_batched_strided) {
+    double stride_a_mul_d = static_cast<double>(stride_a_mul);
+    double stride_b_mul_d = static_cast<double>(stride_b_mul);
+    double stride_c_mul_d = static_cast<double>(stride_c_mul);
+
+    state.counters["stride_a_mul"] = stride_a_mul_d;
+    state.counters["stride_b_mul"] = stride_b_mul_d;
+    state.counters["stride_c_mul"] = stride_c_mul_d;
+  }
+
+  // These counters should be reviewed: purely real or purely imaginary inputs
+  // result in fewer flops than the general complex case counted here.
+
+  bool beta_zero = (beta.real() == scalar_t{0}) && (beta.imag() == scalar_t{0});
+
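+  // Flop model assumed here: each complex multiply costs 6 real flops
+  // (4 multiplies + 2 adds) and each complex add costs 2 real flops, hence
+  // the factors of 6 and 2 below.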
+  const double nflops_AtimesB =
+      k_d * m_d * n_d * 6 + k_d * m_d * n_d * 2;  // MulFlops + AddFlops
+  double nflops_timesAlpha = m_d * n_d * 6;
+  const double nflops_addBetaC =
+      beta_zero ? 0 : 6 * m_d * n_d + 2 * m_d * n_d;  // MulFlops + AddFlops
+  const double nflops_tot =
+      (nflops_AtimesB + nflops_timesAlpha + nflops_addBetaC) * batch_size_d;
+  state.counters["n_fl_ops"] = nflops_tot;
+
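+  // Data movement is counted in complex elements; the sizeof(scalar_t) * 2
+  // factor below accounts for the two real scalars stored per complex element.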
+  const double mem_readA = m_d * k_d;
+  const double mem_readB = k_d * n_d;
+  const double mem_writeC = m_d * n_d;
+  const double mem_readC = beta_zero ? 0 : m_d * n_d;
+  const double total_mem = (mem_readA + mem_readB + mem_readC + mem_writeC) *
+                           batch_size_d * sizeof(scalar_t) * 2;
+  state.counters["bytes_processed"] = total_mem;
+  return;
+}
+
+#endif
+
 template <Level3Op op, typename scalar_t>
 inline typename std::enable_if<op == Level3Op::symm>::type
 init_level_3_counters(benchmark::State& state, scalar_t beta = 0, index_t m = 0,
diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp
index a569ed2ff..251ee9b7f 100644
--- a/common/include/common/common_utils.hpp
+++ b/common/include/common/common_utils.hpp
@@ -53,6 +53,24 @@ using gemm_batched_strided_param_t =
     std::tuple<std::string, std::string, index_t, index_t, index_t, scalar_t,
                scalar_t, index_t, index_t, index_t, index_t>;
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <typename scalar_t>
+using blas3_cplx_param_t =
+    std::tuple<std::string, std::string, index_t, index_t, index_t, scalar_t,
+               scalar_t, scalar_t, scalar_t>;
+
+template <typename scalar_t>
+using gemm_batched_strided_cplx_param_t =
+    std::tuple<std::string, std::string, index_t, index_t, index_t, scalar_t,
+               scalar_t, scalar_t, scalar_t, index_t, index_t, index_t,
+               index_t>;
+
+template <typename scalar_t>
+using gemm_batched_cplx_param_t =
+    std::tuple<std::string, std::string, index_t, index_t, index_t, scalar_t,
+               scalar_t, scalar_t, scalar_t, index_t, int>;
+#endif
+
 using reduction_param_t = std::tuple<index_t, index_t>;
 
 template <typename scalar_t>
@@ -485,6 +503,157 @@ static inline std::vector<blas3_param_t<scalar_t>> get_blas3_params(
   }
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+/**
+ * @fn get_blas3_cplx_params for complex data type
+ * @brief Returns a vector containing the blas 3 benchmark cplx parameters,
+ * either read from a file according to the command-line args, or the default
+ * ones. So far only used/supported for GEMM & its batched extensions.
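+ *
+ * When read from CSV, each row is expected to hold 9 values in the order
+ * t_a,t_b,m,k,n,alpha_real,alpha_imag,beta_real,beta_imag
+ * (e.g. an illustrative row: n,t,64,128,256,1,1,0,0).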
+ */
+template <typename scalar_t>
+static inline std::vector<blas3_cplx_param_t<scalar_t>> get_blas3_cplx_params(
+    Args& args) {
+  if (args.csv_param.empty()) {
+    warning_no_csv();
+    std::vector<blas3_cplx_param_t<scalar_t>> blas3_default;
+    constexpr index_t dmin = 32, dmax = 8192;
+    std::vector<std::string> dtranspose = {"n", "t"};
+    std::complex<scalar_t> alpha{1, 1};
+    std::complex<scalar_t> beta{1, 1};
+    for (std::string& t1 : dtranspose) {
+      for (std::string& t2 : dtranspose) {
+        for (index_t m = dmin; m <= dmax; m *= 8) {
+          for (index_t k = dmin; k <= dmax; k *= 8) {
+            for (index_t n = dmin; n <= dmax; n *= 8) {
+              blas3_default.push_back(
+                  std::make_tuple(t1, t2, m, k, n, alpha.real(), alpha.imag(),
+                                  beta.real(), beta.imag()));
+            }
+          }
+        }
+      }
+    }
+    return blas3_default;
+  } else {
+    return parse_csv_file<blas3_cplx_param_t<scalar_t>>(
+        args.csv_param, [&](std::vector<std::string>& v) {
+          if (v.size() != 9) {
+            throw std::runtime_error(
+                "invalid number of parameters (9 expected)");
+          }
+          try {
+            return std::make_tuple(
+                v[0].c_str(), v[1].c_str(), str_to_int<index_t>(v[2]),
+                str_to_int<index_t>(v[3]), str_to_int<index_t>(v[4]),
+                str_to_scalar<scalar_t>(v[5]), str_to_scalar<scalar_t>(v[6]),
+                str_to_scalar<scalar_t>(v[7]), str_to_scalar<scalar_t>(v[8]));
+          } catch (...) {
+            throw std::runtime_error("invalid parameter");
+          }
+        });
+  }
+}
+
+/**
+ * @fn get_gemm_batched_strided_cplx_params for complex data type
+ * @brief Returns a vector containing the gemm_batched_strided cplx benchmark
+ * parameters, either read from a file according to the command-line args, or
+ * the default ones.
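+ *
+ * When read from CSV, each row is expected to hold 13 values in the order
+ * t_a,t_b,m,k,n,alpha_real,alpha_imag,beta_real,beta_imag,batch_size,
+ * stride_a_mul,stride_b_mul,stride_c_mul
+ * (e.g. an illustrative row: n,n,128,128,128,1,1,0,0,8,2,2,2).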
+ */
+template <typename scalar_t>
+inline std::vector<gemm_batched_strided_cplx_param_t<scalar_t>>
+get_gemm_batched_strided_cplx_params(Args& args) {
+  if (args.csv_param.empty()) {
+    warning_no_csv();
+    std::vector<gemm_batched_strided_cplx_param_t<scalar_t>>
+        gemm_batched_strided_default;
+    constexpr index_t dmin = 128, dmax = 8192;
+    std::vector<std::string> dtranspose = {"n", "t"};
+    std::complex<scalar_t> alpha{1, 1};
+    std::complex<scalar_t> beta{1, 1};
+    index_t batch_size = 8;
+    for (std::string& t1 : dtranspose) {
+      for (std::string& t2 : dtranspose) {
+        for (index_t m = dmin; m <= dmax; m *= 8) {
+          gemm_batched_strided_default.push_back(
+              std::make_tuple(t1, t2, m, m, m, alpha.real(), alpha.imag(),
+                              beta.real(), beta.imag(), batch_size, 2, 2, 2));
+        }
+      }
+    }
+    return gemm_batched_strided_default;
+  } else {
+    return parse_csv_file<gemm_batched_strided_cplx_param_t<scalar_t>>(
+        args.csv_param, [&](std::vector<std::string>& v) {
+          if (v.size() != 13) {
+            throw std::runtime_error(
+                "invalid number of parameters (13 expected)");
+          }
+          try {
+            return std::make_tuple(
+                v[0].c_str(), v[1].c_str(), str_to_int<index_t>(v[2]),
+                str_to_int<index_t>(v[3]), str_to_int<index_t>(v[4]),
+                str_to_scalar<scalar_t>(v[5]), str_to_scalar<scalar_t>(v[6]),
+                str_to_scalar<scalar_t>(v[7]), str_to_scalar<scalar_t>(v[8]),
+                str_to_int<index_t>(v[9]), str_to_int<index_t>(v[10]),
+                str_to_int<index_t>(v[11]), str_to_int<index_t>(v[12]));
+          } catch (...) {
+            std::throw_with_nested(std::runtime_error("invalid parameter"));
+          }
+        });
+  }
+}
+
+/**
+ * @fn get_gemm_cplx_batched_params
+ * @brief Returns a vector containing the gemm_batched cplx benchmark
+ * parameters, either read from a file according to the command-line args, or
+ * the default ones.
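+ *
+ * When read from CSV, each row is expected to hold 11 values in the order
+ * t_a,t_b,m,k,n,alpha_real,alpha_imag,beta_real,beta_imag,batch_size,
+ * batch_type, with the last field parsed via str_to_batch_type.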
+ */
+template <typename scalar_t>
+inline std::vector<gemm_batched_cplx_param_t<scalar_t>>
+get_gemm_cplx_batched_params(Args& args) {
+  if (args.csv_param.empty()) {
+    warning_no_csv();
+    std::vector<gemm_batched_cplx_param_t<scalar_t>> gemm_batched_default;
+    constexpr index_t dmin = 128, dmax = 8192;
+    std::vector<std::string> dtranspose = {"n", "t"};
+    std::complex<scalar_t> alpha{1, 1};
+    std::complex<scalar_t> beta{1, 1};
+    index_t batch_size = 8;
+    int batch_type = 0;
+    for (std::string& t1 : dtranspose) {
+      for (std::string& t2 : dtranspose) {
+        for (index_t n = dmin; n <= dmax; n *= 8) {
+          gemm_batched_default.push_back(std::make_tuple(
+              t1, t2, n, n, n, alpha.real(), alpha.imag(), beta.real(),
+              beta.imag(), batch_size, batch_type));
+        }
+      }
+    }
+    return gemm_batched_default;
+  } else {
+    return parse_csv_file<gemm_batched_cplx_param_t<scalar_t>>(
+        args.csv_param, [&](std::vector<std::string>& v) {
+          if (v.size() != 11) {
+            throw std::runtime_error(
+                "invalid number of parameters (11 expected)");
+          }
+          try {
+            return std::make_tuple(
+                v[0].c_str(), v[1].c_str(), str_to_int<index_t>(v[2]),
+                str_to_int<index_t>(v[3]), str_to_int<index_t>(v[4]),
+                str_to_scalar<scalar_t>(v[5]), str_to_scalar<scalar_t>(v[6]),
+                str_to_scalar<scalar_t>(v[7]), str_to_scalar<scalar_t>(v[8]),
+                str_to_int<index_t>(v[9]), str_to_batch_type(v[10]));
+          } catch (...) {
+            std::throw_with_nested(std::runtime_error("invalid parameter"));
+          }
+        });
+  }
+}
+#endif
+
 /**
  * @fn get_gemm_batched_params
  * @brief Returns a vector containing the gemm_batched benchmark parameters,
@@ -1334,6 +1503,17 @@ inline std::string get_type_name<double>() {
   return "double";
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <>
+inline std::string get_type_name<std::complex<float>>() {
+  return "complex<float>";
+}
+template <>
+inline std::string get_type_name<std::complex<double>>() {
+  return "complex<double>";
+}
+#endif
+
 /**
  * @fn random_scalar
  * @brief Generates a random scalar value, using an arbitrary low quality
@@ -1372,6 +1552,67 @@ static inline std::vector<scalar_t> random_data(size_t size) {
   return v;
 }
 
+#ifdef BLAS_ENABLE_COMPLEX
+/**
+ * @fn random_cplx_scalar
+ * @brief Generates a random complex value, using an arbitrary low quality
+ * algorithm.
+ */
+template <typename scalar_t>
+static inline std::complex<scalar_t> random_cplx_scalar() {
+  scalar_t rl = 1e-3 * ((rand() % 2000) - 1000);
+  scalar_t im = 1e-3 * ((rand() % 2000) - 1000);
+  return std::complex<scalar_t>(rl, im);
+}
+
+/**
+ * @brief Generates a random complex value whose underlying data elements
+ * (real & imag) lie in the specified range
+ * @param rangeMin range minimum
+ * @param rangeMax range maximum
+ */
+template <typename scalar_t>
+static inline std::complex<scalar_t> random_cplx_scalar(scalar_t rangeMin,
+                                                        scalar_t rangeMax) {
+  static std::random_device rd;
+  static std::default_random_engine gen(rd());
+  std::uniform_real_distribution<scalar_t> disRl(rangeMin, rangeMax);
+  std::uniform_real_distribution<scalar_t> disIm(rangeMin, rangeMax);
+
+  return std::complex<scalar_t>(disRl(gen), disIm(gen));
+}
+
+/**
+ * @fn random_cplx_data
+ * @brief Generates a random vector of complex values, using a uniform
+ * distribution of the underlying data elements (real & imag).
+ */
+template <typename scalar_t>
+static inline std::vector<std::complex<scalar_t>> random_cplx_data(
+    size_t size) {
+  std::vector<std::complex<scalar_t>> v(size);
+
+  for (std::complex<scalar_t>& e : v) {
+    e = random_cplx_scalar<scalar_t>(scalar_t{-2}, scalar_t{5});
+  }
+  return v;
+}
+
+/**
+ * @fn const_cplx_data
+ * @brief Generates a vector of constant complex values, of a given length.
+ */
+template <typename scalar_t>
+static inline std::vector<std::complex<scalar_t>> const_cplx_data(
+    size_t size, scalar_t const_value = 0) {
+  std::vector<std::complex<scalar_t>> v(size);
+  std::complex<scalar_t> const_cplx_value{const_value, const_value};
+  std::fill(v.begin(), v.end(), const_cplx_value);
+  return v;
+}
+
+#endif  // BLAS_ENABLE_COMPLEX
+
 /**
  * @breif Fills a lower or upper triangular matrix suitable for TRSM testing
  * @param A The matrix to fill. Size must be at least m * lda
@@ -1575,17 +1816,39 @@ static inline void calc_avg_counters(benchmark::State& state) {
 #define BLAS_REGISTER_BENCHMARK_HALF(args, sb_handle_ptr, success)
 #endif  // BLAS_DATA_TYPE_HALF
 
+#ifdef BLAS_ENABLE_COMPLEX
+/** Registers benchmark for the float complex data type
+ * @see BLAS_REGISTER_BENCHMARK
+ */
+#define BLAS_REGISTER_BENCHMARK_CPLX_FLOAT(args, sb_handle_ptr, success) \
+  register_cplx_benchmark<float>(args, sb_handle_ptr, success)
+#else
+#define BLAS_REGISTER_BENCHMARK_CPLX_FLOAT(args, sb_handle_ptr, success)
+#endif
+
+#if defined(BLAS_ENABLE_COMPLEX) && defined(BLAS_DATA_TYPE_DOUBLE)
+/** Registers benchmark for the double complex data type
+ * @see BLAS_REGISTER_BENCHMARK
+ */
+#define BLAS_REGISTER_BENCHMARK_CPLX_DOUBLE(args, sb_handle_ptr, success) \
+  register_cplx_benchmark<double>(args, sb_handle_ptr, success)
+#else
+#define BLAS_REGISTER_BENCHMARK_CPLX_DOUBLE(args, sb_handle_ptr, success)
+#endif
+
 /** Registers benchmark for all supported data types.
  *  Expects register_benchmark<scalar_t> to exist.
  * @param args Reference to blas_benchmark::Args
  * @param sb_handle_ptr Pointer to blas::SB_Handle
  * @param[out] success Pointer to boolean indicating success
  */
-#define BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success)     \
-  do {                                                            \
-    BLAS_REGISTER_BENCHMARK_FLOAT(args, sb_handle_ptr, success);  \
-    BLAS_REGISTER_BENCHMARK_DOUBLE(args, sb_handle_ptr, success); \
-    BLAS_REGISTER_BENCHMARK_HALF(args, sb_handle_ptr, success);   \
+#define BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success)          \
+  do {                                                                 \
+    BLAS_REGISTER_BENCHMARK_FLOAT(args, sb_handle_ptr, success);       \
+    BLAS_REGISTER_BENCHMARK_DOUBLE(args, sb_handle_ptr, success);      \
+    BLAS_REGISTER_BENCHMARK_HALF(args, sb_handle_ptr, success);        \
+    BLAS_REGISTER_BENCHMARK_CPLX_FLOAT(args, sb_handle_ptr, success);  \
+    BLAS_REGISTER_BENCHMARK_CPLX_DOUBLE(args, sb_handle_ptr, success); \
   } while (false)
 
 #endif
diff --git a/common/include/common/set_benchmark_label.hpp b/common/include/common/set_benchmark_label.hpp
index b1d4c3ca7..9495a3195 100644
--- a/common/include/common/set_benchmark_label.hpp
+++ b/common/include/common/set_benchmark_label.hpp
@@ -28,6 +28,10 @@
 
 #include <map>
 #include <string>
+#ifdef BLAS_ENABLE_COMPLEX
+#define SYCL_EXT_ONEAPI_COMPLEX
+#include <complex>
+#endif
 
 #ifdef BUILD_CUBLAS_BENCHMARKS
 #include <cuda.h>
@@ -178,6 +182,20 @@ inline void add_datatype_info<cl::sycl::half>(
 }
 #endif  // BLAS_DATA_TYPE_HALF
 
+#ifdef BLAS_ENABLE_COMPLEX
+template <>
+inline void add_datatype_info<std::complex<float>>(
+    std::map<std::string, std::string>& key_value_map) {
+  key_value_map["@datatype"] = "complex<float>";
+}
+
+template <>
+inline void add_datatype_info<std::complex<double>>(
+    std::map<std::string, std::string>& key_value_map) {
+  key_value_map["@datatype"] = "complex<double>";
+}
+#endif  // BLAS_ENABLE_COMPLEX
+
 }  // namespace datatype_info
 
 inline void set_label(benchmark::State& state,