From 6c8178bad752ea8a52f735ce7118cd9032d2b9f7 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 22 Nov 2023 14:38:30 +0800 Subject: [PATCH] [BugFix]Fix cub reduce bug in cuda12 (#58179) * open shared phi * fix inference bugs * fix py3 bugs * hot fix * fix * close phi shared * open cub --- cmake/external/cccl.cmake | 7 ++++--- cmake/generic.cmake | 3 --- cmake/third_party.cmake | 2 +- paddle/phi/common/complex.h | 8 ++++++++ paddle/phi/kernels/funcs/reduce_function.h | 10 ---------- .../fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu | 1 + 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/cmake/external/cccl.cmake b/cmake/external/cccl.cmake index c4185bd41a2da..db09c01f92e74 100755 --- a/cmake/external/cccl.cmake +++ b/cmake/external/cccl.cmake @@ -26,6 +26,7 @@ ExternalProject_Add( INSTALL_COMMAND "" TEST_COMMAND "") -add_library(cccl INTERFACE) - -add_dependencies(cccl extern_cccl) +# update include dir and set cccl first for using +include_directories(BEFORE "${CCCL_SOURCE_DIR}/cub") +include_directories(BEFORE "${CCCL_SOURCE_DIR}/libcudacxx/include") +include_directories(BEFORE "${CCCL_SOURCE_DIR}/thrust") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 45f005ad9e03b..baa0340eeb992 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -1348,9 +1348,6 @@ function(math_library TARGET) if(WITH_GPU) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) list(APPEND math_common_deps cub) - elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0 - OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0) - list(APPEND math_common_deps cccl) else() list(APPEND math_common_deps) endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 97ff527d9dc73..2676320179f66 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -397,7 +397,7 @@ if(WITH_GPU) elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0 OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0) include(external/cccl) - list(APPEND third_party_deps extern_cccl) + 
add_definitions(-DPADDLE_WITH_CCCL) endif() set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index ceb46874238f3..5de6290fb7705 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -74,6 +74,14 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { imag = c.imag(); } +#if defined(PADDLE_WITH_CCCL) + template <typename T1> + HOSTDEVICE inline explicit complex(const cuda::std::complex<T1>& c) { + real = c.real(); + imag = c.imag(); + } +#endif + template <typename T1> HOSTDEVICE inline explicit operator thrust::complex<T1>() const { return thrust::complex<T1>(real, imag); diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 281c4347071e9..1bbdd019a7c4b 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -1096,16 +1096,6 @@ void ReduceKernel(const KPDevice& dev_ctx, constexpr bool kIsTxBF16 = std::is_same<Tx, phi::dtype::bfloat16>::value; bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16; - // NOTE(YuanRisheng): hot fix - // cuda 12.0 + cub got wrong result in some shapes when build phi with shared - // library. For example, paddle.sum(paddle.ones([1024,100], - // dtype=paddle.float32)) is expected to 102400, but got 0. -#ifdef PHI_SHARED -#if CUDA_VERSION >= 12000 - use_cub_reduce = false; -#endif -#endif - #ifndef PADDLE_WITH_XPU_KP if (use_cub_reduce) { CubTensorReduce<Tx, Ty, ReduceOp, TransformOp>::apply( diff --git a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu index ff2e85ed16ee8..c0d35cbf718ab 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include <array> #include <memory> #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"