diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu
index 929d1f7de88..0f887753bf5 100644
--- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu
+++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu
@@ -148,62 +148,175 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops,
 #else
-#define SEG_ADD(s) \
-    l += s; \
-    while (l >= lines) { \
-        l -= lines; \
-        c += width; \
-    }
-
-__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines,
-                                                         size_t nb_size,
-                                                         OPAL_PTRDIFF_TYPE nb_extent,
-                                                         unsigned char * b_source,
-                                                         unsigned char * b_destination )
+__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops,
+                                                         size_t size,
+                                                         OPAL_PTRDIFF_TYPE extent,
+                                                         unsigned char* source,
+                                                         unsigned char* destination )
 {
-    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-    uint32_t num_threads = gridDim.x * blockDim.x;
-
-    //size_t lines = (size_t)lines;
-    size_t size = nb_size / 8;
-    size_t extent = nb_extent / 8;
-    uint64_t * source = (uint64_t *) b_source;
-    uint64_t *destination = (uint64_t *) b_destination;
-    uint64_t val[KERNEL_UNROLL];
+    uint32_t i, u, tid, num_threads, warp_id, tid_per_warp, nb_warps, nb_warps_x, nb_warps_y, pos_x, pos_y, size_last_y, size_last_x;
+    uint32_t size_nb, extent_nb;
+    uint64_t *_source_tmp, *_destination_tmp, *source_64, *destination_64, *_source_left_tmp, *_destination_left_tmp;
+    uint64_t val[UNROLL_16];
 
-    int col = 0;
-    for (int width = 32; width > 0 && col < size; width >>= 1) {
-        while (size-col >= width) {
-            const int warp_id = tid / width;
-            const int warp_tid = tid & (width-1);
-            const int warp_nb = num_threads / width;
-            const int c = col + warp_tid;
-            int l = warp_id * KERNEL_UNROLL;
-            uint64_t *src = source + c;
-            uint64_t *dst = destination + c;
-            for (int b=0; b<KERNEL_UNROLL; b++) {
[...]
+// #define SEG_ADD(s) \
+//     l += s; \
+//     while (l >= lines) { \
+//         l -= lines; \
+//         c += width; \
+//     }
+//
+// __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines,
+//                                                          size_t nb_size,
+//                                                          OPAL_PTRDIFF_TYPE nb_extent,
+//                                                          unsigned char * b_source,
+//                                                          unsigned char * b_destination )
+// {
+//     uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+//     uint32_t num_threads = gridDim.x * blockDim.x;
+//
+//     //size_t lines = (size_t)lines;
+//     size_t size = nb_size / 8;
+//     size_t extent = nb_extent / 8;
+//     uint64_t * source = (uint64_t *) b_source;
+//     uint64_t *destination = (uint64_t *) b_destination;
+//     uint64_t val[KERNEL_UNROLL];
+//
+//     int col = 0;
+//     for (int width = 32; width > 0 && col < size; width >>= 1) {
+//         while (size-col >= width) {
+//             const int warp_id = tid / width;
+//             const int warp_tid = tid & (width-1);
+//             const int warp_nb = num_threads / width;
+//             const int c = col + warp_tid;
+//             int l = warp_id * KERNEL_UNROLL;
+//             uint64_t *src = source + c;
+//             uint64_t *dst = destination + c;
+//             for (int b=0; b<KERNEL_UNROLL; b++) {
[...]
 #endif

diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu
index 05ff906b9a4..e17ad4a386f 100644
--- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu
+++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu
@@ -297,9 +297,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector( opal_convertor_t* pC
         total_packed += iov[iov_count].iov_len;
     }
 complete_conversion:
-    for (i = 0; i < NB_STREAMS; i++) {
-        cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]);
-    }
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME(start);
 #endif
@@ -461,9 +459,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
 //    num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block;
 //    printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops);
 #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
-    cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #else
-    pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
+    pack_contiguous_loop_cuda_kernel_global<<<16, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
 #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -473,7 +471,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
@@ -584,7 +582,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM,
     GET_TIME(start);
 #endif
 
-    cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
     *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp;
@@ -593,7 +591,7 @@
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
@@ -638,9 +636,9 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
         printf("can not get dev mem, %s\n", cuda_err);
     }
 #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
-    cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #else
-    pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev);
+    pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev);
 #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -650,7 +648,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu
index 703e52280b5..9be53d2d5a7 100644
--- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu
+++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu
@@ -178,9 +178,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* p
         total_unpacked += iov[iov_count].iov_len;
     }
 complete_conversion:
-    for (i = 0; i < NB_STREAMS; i++) {
-        cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]);
-    }
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
     *max_data = total_unpacked;
     pConvertor->bConverted += total_unpacked;  /* update the already converted bytes */
     *out_size = iov_count;
@@ -955,9 +953,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
 //    tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD;
 //    num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block;
 #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
-    cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #else
-    unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
+    unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination);
 #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -967,7 +965,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
     total_time = ELAPSED_TIME( start, end );
@@ -1002,7 +1000,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM,
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME(start);
 #endif
-    cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
     *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp;
@@ -1011,7 +1009,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
@@ -1057,9 +1055,9 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
         printf("can not get dev mem, %s\n", cuda_err);
     }
 #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL
-    cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]);
+    cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 #else
-    unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination);
+    unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination);
 #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */
 
 #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN)
@@ -1069,7 +1067,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM,
     *(COUNT) -= _copy_loops;
 #endif
 
-    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]);
+    cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]);
 //    cudaHostUnregister(_source);
 #if defined(OPAL_DATATYPE_CUDA_TIMING)
     GET_TIME( end );
diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c
index c8985db7913..1ae08565b73 100644
--- a/opal/datatype/opal_datatype_pack.c
+++ b/opal/datatype/opal_datatype_pack.c
@@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor,
     pos_desc = pStack->index;
     pElem = &(description[pos_desc]);
 
-    return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data);
+    //return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data);
     if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
         return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data);
     } else {
diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c
index 5f51b3f828b..815f7b1e4bf 100644
--- a/opal/datatype/opal_datatype_unpack.c
+++ b/opal/datatype/opal_datatype_unpack.c
@@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor,
     pos_desc = pStack->index;
     pElem = &(description[pos_desc]);
 
-    return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data);
+    //return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data);
     if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) {
         return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data);
     } else {
diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c
index afc33e1075e..de3f43a8759 100644
--- a/test/datatype/ddt_benchmark.c
+++ b/test/datatype/ddt_benchmark.c
@@ -1276,7 +1276,7 @@ int main( int argc, char* argv[] )
         printf("----matrix size %d-----\n", mat_size);
         if( outputFlags & CHECK_PACK_UNPACK ) {
             for (i = 1; i <= 5; i++) {
-                local_copy_with_convertor(pdt, 1, 200000000, mat_size);
+            //    local_copy_with_convertor(pdt, 1, 200000000, mat_size);
             }
         }
         OBJ_RELEASE( pdt ); assert( pdt == NULL );
@@ -1339,13 +1339,13 @@ int main( int argc, char* argv[] )
     }
 
-    for (blk_len = 4000; blk_len <= 4000; blk_len += 2000) {
+    for (blk_len = 1000; blk_len <= 4000; blk_len += 2000) {
         printf( ">>--------------------------------------------<<\n" );
         printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len );
         pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2);
         if( outputFlags & CHECK_PACK_UNPACK ) {
             for (i = 0; i < 4; i++) {
-                // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2);
+                vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2);
 //                vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128);
             }
         }
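
Note on the recurring change: every launch/copy and its matching `cudaStreamSynchronize` moves from the hard-coded `ddt_cuda_stream[0]` to `ddt_cuda_stream[cuda_streams->current_stream_id]`, so successive pack/unpack calls are no longer all pinned to stream 0, and the end-of-conversion sync waits only on the stream that was actually used. On the `OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL` path, each call moves `_copy_loops` rows of `_end_loop->size` bytes spaced `_loop->extent` bytes apart: pack gives the destination a tight pitch so the rows are gathered back to back, and unpack swaps the pitches to scatter. The following is a minimal standalone sketch of that argument pattern, using plain CUDA with hypothetical sizes; it is an illustration, not code from this patch:

```c
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical vector layout: rows of `size` bytes, `extent` bytes apart. */
    const size_t size       = 4 * sizeof(double);  /* contiguous bytes per row  */
    const size_t extent     = 8 * sizeof(double);  /* stride between row starts */
    const size_t copy_loops = 1024;                /* number of rows            */

    unsigned char *src, *dst;
    cudaStream_t stream;   /* stands in for ddt_cuda_stream[current_stream_id] */
    cudaMalloc((void**)&src, extent * copy_loops);
    cudaMalloc((void**)&dst, size * copy_loops);
    cudaStreamCreate(&stream);

    /* Pack: a tight destination pitch (dpitch == width) gathers the strided
     * source rows back to back, matching pack_contiguous_loop_cuda() above. */
    cudaMemcpy2DAsync(dst, size,         /* contiguous destination        */
                      src, extent,       /* strided source                */
                      size, copy_loops,  /* width (bytes) x height (rows) */
                      cudaMemcpyDeviceToDevice, stream);

    /* Unpack is the mirror image: strided destination, tight source. */
    cudaMemcpy2DAsync(src, extent, dst, size, size, copy_loops,
                      cudaMemcpyDeviceToDevice, stream);

    /* As in the patch, synchronize only the stream the work ran on. */
    cudaStreamSynchronize(stream);
    printf("packed/unpacked %zu bytes\n", size * copy_loops);

    cudaFree(src); cudaFree(dst); cudaStreamDestroy(stream);
    return 0;
}
```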