
new vector kernel
eddy16112 committed Feb 26, 2016
1 parent 0c680c2 commit b6d56eb
Showing 6 changed files with 186 additions and 77 deletions.
213 changes: 163 additions & 50 deletions opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu
@@ -148,62 +148,175 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops,

#else

#define SEG_ADD(s) \
l += s; \
while (l >= lines) { \
l -= lines; \
c += width; \
}

__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines,
size_t nb_size,
OPAL_PTRDIFF_TYPE nb_extent,
unsigned char * b_source,
unsigned char * b_destination )
__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops,
size_t size,
OPAL_PTRDIFF_TYPE extent,
unsigned char* source,
unsigned char* destination )
{
uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
uint32_t num_threads = gridDim.x * blockDim.x;

//size_t lines = (size_t)lines;
size_t size = nb_size / 8;
size_t extent = nb_extent / 8;
uint64_t * source = (uint64_t *) b_source;
uint64_t *destination = (uint64_t *) b_destination;
uint64_t val[KERNEL_UNROLL];
uint32_t i, u, tid, num_threads, warp_id, tid_per_warp, nb_warps, nb_warps_x, nb_warps_y, pos_x, pos_y, size_last_y, size_last_x;
uint32_t size_nb, extent_nb;
uint64_t *_source_tmp, *_destination_tmp, *source_64, *destination_64, *_source_left_tmp, *_destination_left_tmp;
uint64_t val[UNROLL_16];

int col = 0;
for (int width = 32; width > 0 && col < size; width >>= 1) {
while (size-col >= width) {
const int warp_id = tid / width;
const int warp_tid = tid & (width-1);
const int warp_nb = num_threads / width;
const int c = col + warp_tid;
int l = warp_id * KERNEL_UNROLL;
uint64_t *src = source + c;
uint64_t *dst = destination + c;
for (int b=0; b<lines/(KERNEL_UNROLL*warp_nb); b++) {
#pragma unroll
for (int u=0; u<KERNEL_UNROLL; u++) {
val[u] = *(src+(l+u)*extent);
}
#pragma unroll
for (int u=0; u<KERNEL_UNROLL; u++) {
dst[(l+u)*size] = val[u];
}
l += warp_nb * KERNEL_UNROLL;
}
/* Finish non-unrollable case */
for (int u=0; u<KERNEL_UNROLL && l<lines; u++) {
dst[l*size] = *(src+l*extent);
l++;
}
col += width;
}
tid = threadIdx.x + blockIdx.x * blockDim.x;
num_threads = gridDim.x * blockDim.x;
warp_id = tid / CUDA_WARP_SIZE;
tid_per_warp = threadIdx.x & (CUDA_WARP_SIZE-1);
nb_warps = num_threads / CUDA_WARP_SIZE;

extent_nb = extent / 8;
size_nb = size / 8;
source_64 = (uint64_t*)source;
destination_64 = (uint64_t*)destination;

nb_warps_x = size_nb / CUDA_WARP_SIZE;
size_last_x = size_nb & (CUDA_WARP_SIZE-1);
if ( size_last_x != 0) {
nb_warps_x ++;
} else {
size_last_x = CUDA_WARP_SIZE;
}
nb_warps_y = copy_loops / UNROLL_16;
size_last_y = copy_loops & (UNROLL_16-1);
if ( size_last_y != 0) {
nb_warps_y ++;
} else {
size_last_y = UNROLL_16;
}
// if (threadIdx.x == 0) {
// printf("warp_id %u, nb_warps_x %u, nb_warps_y %u, tid_per_warps %u, nb_warps %u\n", warp_id, nb_warps_x, nb_warps_y, tid_per_warp, nb_warps);
// }

const uint32_t extent_nb_times_UNROLL_16 = extent_nb * UNROLL_16;
const uint32_t size_nb_times_UNROLL_16 = size_nb * UNROLL_16;
source_64 += tid_per_warp;
destination_64 += tid_per_warp;

for (i = warp_id; i < (nb_warps_x-1) * (nb_warps_y-1); i += nb_warps) {
pos_x = i / (nb_warps_y-1);
pos_y = i % (nb_warps_y-1);
_source_tmp = source_64 + pos_y * extent_nb_times_UNROLL_16 + pos_x * CUDA_WARP_SIZE;
_destination_tmp = destination_64 + pos_y * size_nb_times_UNROLL_16 + pos_x * CUDA_WARP_SIZE;
#pragma unroll
for (u = 0; u < UNROLL_16; u++) {
val[u] = *(_source_tmp + u * extent_nb);
}
#pragma unroll
for (uint32_t u = 0; u < UNROLL_16; u++) {
*(_destination_tmp + u * size_nb) = val[u];
}
}
if (tid_per_warp < size_last_x) {
pos_x = nb_warps_x - 1;
_source_left_tmp = source_64 + pos_x * CUDA_WARP_SIZE;
_destination_left_tmp = destination_64 + pos_x * CUDA_WARP_SIZE;
for (i = warp_id; i < nb_warps_y-1; i += nb_warps) {
_source_tmp = _source_left_tmp + i * extent_nb_times_UNROLL_16;
_destination_tmp = _destination_left_tmp + i * size_nb_times_UNROLL_16;
#pragma unroll
for (u = 0; u < UNROLL_16; u++) {
val[u] = *(_source_tmp + u * extent_nb);
}
#pragma unroll
for (uint32_t u = 0; u < UNROLL_16; u++) {
*(_destination_tmp + u * size_nb) = val[u];
}
}
}


pos_y = nb_warps_y - 1;
_source_left_tmp = source_64 + pos_y * extent_nb_times_UNROLL_16;
_destination_left_tmp = destination_64 + pos_y * size_nb_times_UNROLL_16;
if (size_last_y == UNROLL_16) {
for (i = warp_id; i < nb_warps_x-1; i += nb_warps) {
_source_tmp = _source_left_tmp + i * CUDA_WARP_SIZE;
_destination_tmp = _destination_left_tmp + i * CUDA_WARP_SIZE;
#pragma unroll
for (u = 0; u < UNROLL_16; u++) {
val[u] = *(_source_tmp + u * extent_nb);
}
#pragma unroll
for (uint32_t u = 0; u < UNROLL_16; u++) {
*(_destination_tmp + u * size_nb) = val[u];
}
}
} else {
for (i = warp_id; i < nb_warps_x-1; i += nb_warps) {
_source_tmp = _source_left_tmp + i * CUDA_WARP_SIZE;
_destination_tmp = _destination_left_tmp + i * CUDA_WARP_SIZE;
for (u = 0; u < size_last_y; u++) {
*(_destination_tmp + u * size_nb) = *(_source_tmp + u * extent_nb);
}
}
}

if (warp_id == 0 && tid_per_warp < size_last_x) {
_source_tmp = source_64 + (nb_warps_y-1) * extent_nb_times_UNROLL_16 + (nb_warps_x-1) * CUDA_WARP_SIZE;
_destination_tmp = destination_64 + (nb_warps_y-1) * size_nb_times_UNROLL_16 + (nb_warps_x-1) * CUDA_WARP_SIZE;
for (u = 0; u < size_last_y; u++) {
*(_destination_tmp + u * size_nb) = *(_source_tmp + u * extent_nb);
}
}
}
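
The remainder logic above splits the copy into full warp-wide tiles plus a right edge (size_last_x), a bottom edge (size_last_y), and a corner that warp 0 finishes. A worked example of the decomposition, assuming CUDA_WARP_SIZE is 32 and UNROLL_16 is 16 (assumptions, since neither definition is part of this diff):

/*
 * Illustrative decomposition (not part of the commit):
 *   size_nb    = 100 words -> 100/32 = 3, 100 & 31 = 4
 *                          -> nb_warps_x = 4, size_last_x = 4
 *   copy_loops = 50 rows   -> 50/16 = 3, 50 & 15 = 2
 *                          -> nb_warps_y = 4, size_last_y = 2
 *
 * The grid-stride loop over warps then covers
 * (nb_warps_x-1)*(nb_warps_y-1) = 9 full 32x16 tiles; lanes with
 * tid_per_warp < 4 copy the right edge, the bottom-edge loop copies
 * 2 rows per column tile, and warp 0 finishes the 4x2 corner.
 */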


// #define SEG_ADD(s) \
// l += s; \
// while (l >= lines) { \
// l -= lines; \
// c += width; \
// }
//
// __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines,
// size_t nb_size,
// OPAL_PTRDIFF_TYPE nb_extent,
// unsigned char * b_source,
// unsigned char * b_destination )
// {
// uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
// uint32_t num_threads = gridDim.x * blockDim.x;
//
// //size_t lines = (size_t)lines;
// size_t size = nb_size / 8;
// size_t extent = nb_extent / 8;
// uint64_t * source = (uint64_t *) b_source;
// uint64_t *destination = (uint64_t *) b_destination;
// uint64_t val[KERNEL_UNROLL];
//
// int col = 0;
// for (int width = 32; width > 0 && col < size; width >>= 1) {
// while (size-col >= width) {
// const int warp_id = tid / width;
// const int warp_tid = tid & (width-1);
// const int warp_nb = num_threads / width;
// const int c = col + warp_tid;
// int l = warp_id * KERNEL_UNROLL;
// uint64_t *src = source + c;
// uint64_t *dst = destination + c;
// for (int b=0; b<lines/(KERNEL_UNROLL*warp_nb); b++) {
// #pragma unroll
// for (int u=0; u<KERNEL_UNROLL; u++) {
// val[u] = *(src+(l+u)*extent);
// }
// #pragma unroll
// for (int u=0; u<KERNEL_UNROLL; u++) {
// dst[(l+u)*size] = val[u];
// }
// l += warp_nb * KERNEL_UNROLL;
// }
// /* Finish non-unrollable case */
// for (int u=0; u<KERNEL_UNROLL && l<lines; u++) {
// dst[l*size] = *(src+l*extent);
// l++;
// }
// col += width;
// }
// }
//
//
// }

/*
#define COLOFF_INC(jump, width, ext) \
col += jump; \
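For reference, a minimal self-contained sketch of the warp-tiled strided copy the new kernel implements (edge handling omitted; WARP, UNROLL, and every size below are illustrative assumptions, with dimensions chosen to divide the tile shape evenly):

#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

#define WARP   32   /* assumed warp size */
#define UNROLL 16   /* assumed unroll factor, standing in for UNROLL_16 */

/* Each warp owns a WARP-wide, UNROLL-tall tile: lanes read consecutive
   64-bit words of a row (coalesced), the unrolled loop walks UNROLL rows. */
__global__ void strided_pack_sketch( uint32_t rows, size_t row_words,
                                     size_t stride_words,
                                     const uint64_t *src, uint64_t *dst )
{
    uint32_t tid     = threadIdx.x + blockIdx.x * blockDim.x;
    uint32_t lane    = threadIdx.x & (WARP - 1);
    uint32_t warp    = tid / WARP;
    uint32_t nwarps  = (gridDim.x * blockDim.x) / WARP;
    uint32_t tiles_x = row_words / WARP;
    uint32_t tiles_y = rows / UNROLL;
    uint64_t val[UNROLL];

    for (uint32_t t = warp; t < tiles_x * tiles_y; t += nwarps) {
        uint32_t x = t / tiles_y, y = t % tiles_y;
        const uint64_t *s = src + y * UNROLL * stride_words + x * WARP + lane;
        uint64_t       *d = dst + y * UNROLL * row_words    + x * WARP + lane;
#pragma unroll
        for (int u = 0; u < UNROLL; u++) val[u] = s[u * stride_words];
#pragma unroll
        for (int u = 0; u < UNROLL; u++) d[u * row_words] = val[u];
    }
}

int main( void )
{
    const uint32_t rows = 256;                        /* 16 row tiles    */
    const size_t row_words = 128, stride_words = 160; /* 4 column tiles  */
    uint64_t *src, *dst;
    cudaMalloc((void **)&src, rows * stride_words * sizeof(uint64_t));
    cudaMalloc((void **)&dst, rows * row_words * sizeof(uint64_t));
    strided_pack_sketch<<<16, 256>>>(rows, row_words, stride_words, src, dst);
    cudaDeviceSynchronize();
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}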
20 changes: 9 additions & 11 deletions opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu
@@ -192,9 +192,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pCon
iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */
total_packed += iov[iov_count].iov_len;
// printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local);
for (i = 0; i < NB_STREAMS; i++) {
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]);
}
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
#if defined(OPAL_DATATYPE_CUDA_TIMING)
GET_TIME(start);
#endif
@@ -461,9 +459,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
// num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block;
// printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops);
#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
#else
pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
pack_contiguous_loop_cuda_kernel_global<<<16, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */

#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -473,7 +471,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
*(COUNT) -= _copy_loops;
#endif

cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);

#if defined(OPAL_DATATYPE_CUDA_TIMING)
GET_TIME( end );
@@ -584,7 +582,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM,
GET_TIME(start);
#endif

cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[0]);
cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);

#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
*(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp;
@@ -593,7 +591,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM,
*(COUNT) -= _copy_loops;
#endif

cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);

#if defined(OPAL_DATATYPE_CUDA_TIMING)
GET_TIME( end );
@@ -638,9 +636,9 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
printf("can not get dev mem, %s\n", cuda_err);
}
#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
#else
pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev);
pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev);
#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */

#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -650,7 +648,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
*(COUNT) -= _copy_loops;
#endif

cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);

#if defined(OPAL_DATATYPE_CUDA_TIMING)
GET_TIME( end );
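The wrapper changes above replace a loop that synchronized all NB_STREAMS streams with a single synchronize on the stream the work was queued to. A hedged sketch of the bookkeeping this relies on (the real ddt_cuda_stream structure is defined elsewhere in the tree; the NB_STREAMS value and the helper below are assumptions):

#include <cuda_runtime.h>

#define NB_STREAMS 4  /* assumed; the real value is defined elsewhere */

typedef struct {
    cudaStream_t ddt_cuda_stream[NB_STREAMS];
    int          current_stream_id;
} ddt_cuda_stream_sketch_t;

/* Hypothetical round-robin helper: each pack/unpack call queues all of its
   work on one stream, so synchronizing that single stream completes the
   call while the other streams stay available for concurrent work. */
static cudaStream_t next_stream( ddt_cuda_stream_sketch_t *s )
{
    s->current_stream_id = (s->current_stream_id + 1) % NB_STREAMS;
    return s->ddt_cuda_stream[s->current_stream_id];
}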
20 changes: 9 additions & 11 deletions opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu
@@ -178,9 +178,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* p
total_unpacked += iov[iov_count].iov_len;
}
complete_conversion:
for (i = 0; i < NB_STREAMS; i++) {
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]);
}
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
*max_data = total_unpacked;
pConvertor->bConverted += total_unpacked; /* update the already converted bytes */
*out_size = iov_count;
@@ -955,9 +953,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD;
// num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block;
#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
#else
unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */

#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -967,7 +965,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
*(COUNT) -= _copy_loops;
#endif

cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
#if defined(OPAL_DATATYPE_CUDA_TIMING)
GET_TIME( end );
total_time = ELAPSED_TIME( start, end );
@@ -1002,7 +1000,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM,
#if defined(OPAL_DATATYPE_CUDA_TIMING)
GET_TIME(start);
#endif
cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[0]);
cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);

#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
*(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp;
@@ -1011,7 +1009,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM,
*(COUNT) -= _copy_loops;
#endif

cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);

#if defined(OPAL_DATATYPE_CUDA_TIMING)
GET_TIME( end );
@@ -1057,9 +1055,9 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
printf("can not get dev mem, %s\n", cuda_err);
}
#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
#else
unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination);
unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination);
#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */

#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -1069,7 +1067,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
*(COUNT) -= _copy_loops;
#endif

cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
// cudaHostUnregister(_source);
#if defined(OPAL_DATATYPE_CUDA_TIMING)
GET_TIME( end );
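Both wrappers express the strided layout through cudaMemcpy2DAsync's pitch parameters. A minimal sketch of the mapping, with illustrative sizes (the names and values below are assumptions, not taken from the commit):

#include <cuda_runtime.h>

/* Pack gathers copy_loops rows of `size` bytes from a source strided by
   `extent`; unpack scatters a contiguous buffer back into the strided layout. */
static void pack_unpack_2d_sketch( cudaStream_t stream )
{
    const size_t size = 512;       /* contiguous bytes per loop iteration */
    const size_t extent = 2048;    /* stride between iterations, in bytes */
    const size_t copy_loops = 64;  /* number of iterations                */
    unsigned char *packed, *strided;
    cudaMalloc((void **)&packed,  size   * copy_loops);
    cudaMalloc((void **)&strided, extent * copy_loops);

    /* pack: strided source (spitch = extent) -> contiguous destination */
    cudaMemcpy2DAsync(packed, size, strided, extent,
                      size, copy_loops, cudaMemcpyDeviceToDevice, stream);
    /* unpack: contiguous source -> strided destination (dpitch = extent) */
    cudaMemcpy2DAsync(strided, extent, packed, size,
                      size, copy_loops, cudaMemcpyDeviceToDevice, stream);

    cudaStreamSynchronize(stream);
    cudaFree(packed);
    cudaFree(strided);
}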
2 changes: 1 addition & 1 deletion opal/datatype/opal_datatype_pack.c
@@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor,
pos_desc = pStack->index;
pElem = &(description[pos_desc]);

return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data);
//return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data);
if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data);
} else {
2 changes: 1 addition & 1 deletion opal/datatype/opal_datatype_unpack.c
@@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor,
pos_desc = pStack->index;
pElem = &(description[pos_desc]);

return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data);
//return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data);
if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data);
} else {
(diff of the sixth changed file not shown)
