-
Notifications
You must be signed in to change notification settings - Fork 359
/
Copy pathsingle_gemm_performance.cu
68 lines (56 loc) · 2.59 KB
/
single_gemm_performance.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#include "single_gemm_performance.hpp"
// This is an example of testing the performance of a cuBLASDx device function executing a general matrix multiply (GEMM)
//
// C = alpha * A * B + beta * C
//
// A, B, and C are matrices. Mixed precisions are supported.
// Note that alpha and beta are expected to have the same precision and type (real or complex value)
// as the matrix C elements.
//
// Input data is generated on the host using random number generators and then copied to
// global memory. Next, a kernel with the GEMM is executed, and then matrix C (the result)
// is copied back to host memory and verified against cuBLAS.
//
// The measured operation runs multiple times and the average speed is reported.
// Builds one cuBLASDx GEMM description (64 x 64 x 64, __half inputs and output,
// real type, Block() execution operator) and passes it to
// benchmark_mixed_precision_gemm on a freshly created CUDA stream.
//
// Template parameters:
//   Arch - forwarded to cublasdx::SM<> and to the benchmark helper.
//
// Returns the int produced by benchmark_mixed_precision_gemm. Its meaning is
// defined by that helper (NOTE(review): presumably 0 on success - confirm).
// Stream creation/destruction failures are handled by CUDA_CHECK_AND_EXIT.
template<unsigned int Arch>
int single_gemm_performance() {
    using namespace cublasdx;

    // Parameters m, n, k define the dimensions of matrices A, B, and C.
    constexpr unsigned int m = 64;
    constexpr unsigned int n = 64;
    constexpr unsigned int k = 64;

    // Choose block size, or set to 0 to use library-suggested value.
    constexpr unsigned int BlockSize = 128;

    // Flag to use library-suggested leading dimension (potential performance improvement).
    constexpr bool UseSuggestedLD = false;

    // Element precisions of A, B and C, and the element type (real or complex).
    using PA = __half;
    using PB = __half;
    using PC = __half;
    constexpr auto type = cublasdx::type::real;

    // Choose arrangement for A, B, C: row-major or column-major.
    constexpr auto a_arrangement = cublasdx::row_major;
    constexpr auto b_arrangement = cublasdx::col_major;
    constexpr auto c_arrangement = cublasdx::col_major;

    // Define the matrix multiplication operation.
    using GEMM = decltype(cublasdx::Size<m, n, k>() +
                          cublasdx::Precision<PA, PB, PC>() +
                          cublasdx::Type<type>() +
                          cublasdx::Function<cublasdx::function::MM>() +
                          cublasdx::Arrangement<a_arrangement, b_arrangement, c_arrangement>() +
                          cublasdx::MaxAlignment() +
                          cublasdx::Block() +
                          cublasdx::SM<Arch>());

    bool verbose = true;

    cudaStream_t stream;
    // Every CUDA_CHECK_AND_EXIT invocation is terminated with ';' so the code
    // does not rely on the macro expanding to a self-terminated statement.
    CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream));

    int status = benchmark_mixed_precision_gemm<GEMM, Arch, BlockSize, UseSuggestedLD>(stream, verbose);

    CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream));
    return status;
}
// Callable adapter around single_gemm_performance<Arch>: invoking an instance
// runs the example for the given Arch and yields its int result. main() passes
// this template to example::sm_runner.
template<unsigned int Arch>
struct single_gemm_performance_functor {
    int operator()() {
        const int result = single_gemm_performance<Arch>();
        return result;
    }
};
// Program entry point. Command-line arguments are ignored; the process exit
// code is whatever example::sm_runner returns for the functor template.
int main(int /*argc*/, char** /*argv*/) {
    const int exit_code = example::sm_runner<single_gemm_performance_functor>();
    return exit_code;
}