-
Notifications
You must be signed in to change notification settings - Fork 0
/
dummy_w_ptx_inc.cu
66 lines (58 loc) · 2.4 KB
/
dummy_w_ptx_inc.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900
static_assert(false, "Device code is being compiled with older architectures that are incompatible with TMA.");
#endif // __CUDA_MINIMUM_ARCH__
#include <cuda/barrier>
#include <cuda/ptx>
using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace ptx = cuda::ptx;
static constexpr size_t buf_len = 1024;
__global__ void add_one_kernel(int* data, size_t offset)
{
// Shared memory buffer. The destination shared memory buffer of
// a bulk operations should be 16 byte aligned.
__shared__ alignas(16) int smem_data[buf_len];
// 1. a) Initialize shared memory barrier with the number of threads participating in the barrier.
// b) Make initialized barrier visible in async proxy.
#pragma nv_diag_suppress static_var_with_dynamic_init
__shared__ barrier bar;
if (threadIdx.x == 0) {
init(&bar, blockDim.x); // a)
ptx::fence_proxy_async(ptx::space_shared); // b)
}
__syncthreads();
// 2. Initiate TMA transfer to copy global to shared memory.
if (threadIdx.x == 0) {
// 3a. cuda::memcpy_async arrives on the barrier and communicates
// how many bytes are expected to come in (the transaction count)
cuda::memcpy_async(
smem_data,
data + offset,
cuda::aligned_size_t<16>(sizeof(smem_data)),
bar
);
}
// 3b. All threads arrive on the barrier
barrier::arrival_token token = bar.arrive();
// 3c. Wait for the data to have arrived.
bar.wait(std::move(token));
// 4. Compute saxpy and write back to shared memory
for (int i = threadIdx.x; i < buf_len; i += blockDim.x) {
smem_data[i] += 1;
}
// 5. Wait for shared memory writes to be visible to TMA engine.
ptx::fence_proxy_async(ptx::space_shared); // b)
__syncthreads();
// After syncthreads, writes by all threads are visible to TMA engine.
// 6. Initiate TMA transfer to copy shared memory to global memory
if (threadIdx.x == 0) {
ptx::cp_async_bulk(
ptx::space_global,
ptx::space_shared,
data + offset, smem_data, sizeof(smem_data));
// 7. Wait for TMA transfer to have finished reading shared memory.
// Create a "bulk async-group" out of the previous bulk copy operation.
ptx::cp_async_bulk_commit_group();
// Wait for the group to have completed reading from shared memory.
ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>());
}
}