forked from gpu-mode/lectures
-
Notifications
You must be signed in to change notification settings - Fork 0
/
privatization.cu
76 lines (61 loc) · 2.29 KB
/
privatization.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#include <stdio.h>
#include <cuda_runtime.h>
// CUDA kernel for vector addition without privatization
__global__ void vectorAdd(const float *a, const float *b, float *result, int n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < n) {
result[index] = a[index] + b[index];
}
}
// CUDA kernel for vector addition with privatization
__global__ void vectorAddPrivatized(const float *a, const float *b, float *result, int n) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < n) {
float a_private = a[index]; // Load into private memory
float b_private = b[index]; // Load into private memory
result[index] = a_private + b_private;
}
}
// Function to initialize the vectors with dummy data
void initData(float *data, int size) {
for (int i = 0; i < size; ++i) {
data[i] = i;
}
}
int main() {
int n = 1<<20; // Size of the vectors
float *a, *b, *result, *d_a, *d_b, *d_result;
// Allocate memory on the host
a = (float*)malloc(n * sizeof(float));
b = (float*)malloc(n * sizeof(float));
result = (float*)malloc(n * sizeof(float));
// Initialize vectors
initData(a, n);
initData(b, n);
// Allocate memory on the device
cudaMalloc(&d_a, n * sizeof(float));
cudaMalloc(&d_b, n * sizeof(float));
cudaMalloc(&d_result, n * sizeof(float));
// Copy vectors from host to device
cudaMemcpy(d_a, a, n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, n * sizeof(float), cudaMemcpyHostToDevice);
// Define number of blocks and threads
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
// Launch the vector addition kernel without privatization
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_result, n);
// Copy result back to host
cudaMemcpy(result, d_result, n * sizeof(float), cudaMemcpyDeviceToHost);
// Launch the vector addition kernel with privatization
vectorAddPrivatized<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_result, n);
// Copy result back to host
cudaMemcpy(result, d_result, n * sizeof(float), cudaMemcpyDeviceToHost);
// Cleanup
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_result);
free(a);
free(b);
free(result);
return 0;
}