Skip to content

Commit

Permalink
[BugFix]Fix cub reduce bug in cuda12 (#58179)
Browse files Browse the repository at this point in the history
* open shared phi

* fix inference bugs

* fix py3 bugs

* hot fix

* fix

* close phi shared

* open cub
  • Loading branch information
YuanRisheng authored Nov 22, 2023
1 parent bc57bd8 commit 6c8178b
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 17 deletions.
7 changes: 4 additions & 3 deletions cmake/external/cccl.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ ExternalProject_Add(
INSTALL_COMMAND ""
TEST_COMMAND "")

# INTERFACE library: it has no sources of its own and only serves as a named
# target that other targets can depend on.
add_library(cccl INTERFACE)

# Make cccl depend on extern_cccl (presumably the ExternalProject target
# declared above -- confirm) so that extern_cccl is built first.
add_dependencies(cccl extern_cccl)
# Prepend the CCCL include directories (cub, libcudacxx, thrust) so that they
# are searched before any include directories that were added earlier.
include_directories(BEFORE "${CCCL_SOURCE_DIR}/cub")
include_directories(BEFORE "${CCCL_SOURCE_DIR}/libcudacxx/include")
include_directories(BEFORE "${CCCL_SOURCE_DIR}/thrust")
3 changes: 0 additions & 3 deletions cmake/generic.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1348,9 +1348,6 @@ function(math_library TARGET)
if(WITH_GPU)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
list(APPEND math_common_deps cub)
elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0
OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0)
list(APPEND math_common_deps cccl)
else()
list(APPEND math_common_deps)
endif()
Expand Down
2 changes: 1 addition & 1 deletion cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ if(WITH_GPU)
elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0
OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0)
include(external/cccl)
list(APPEND third_party_deps extern_cccl)
add_definitions(-DPADDLE_WITH_CCCL)
endif()
set(URL
"https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz"
Expand Down
8 changes: 8 additions & 0 deletions paddle/phi/common/complex.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex {
imag = c.imag();
}

#if defined(PADDLE_WITH_CCCL)
// Explicit converting constructor from cuda::std::complex<T1>: copies the
// real and imaginary parts of `c` into this object's `real` and `imag`.
// Only compiled when PADDLE_WITH_CCCL is defined.
template <typename T1>
HOSTDEVICE inline explicit complex(const cuda::std::complex<T1>& c) {
real = c.real();
imag = c.imag();
}
#endif

template <typename T1>
HOSTDEVICE inline explicit operator thrust::complex<T1>() const {
return thrust::complex<T1>(real, imag);
Expand Down
10 changes: 0 additions & 10 deletions paddle/phi/kernels/funcs/reduce_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -1096,16 +1096,6 @@ void ReduceKernel(const KPDevice& dev_ctx,
constexpr bool kIsTxBF16 = std::is_same<Tx, phi::dtype::bfloat16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16;

// NOTE(YuanRisheng): hot fix
// With CUDA 12.0, cub produces wrong results for some shapes when phi is built
// as a shared library. For example, paddle.sum(paddle.ones([1024,100],
// dtype=paddle.float32)) is expected to be 102400, but returns 0.
#ifdef PHI_SHARED
#if CUDA_VERSION >= 12000
use_cub_reduce = false;
#endif
#endif

#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) {
CubTensorReduce<Tx, Ty, ReduceOp, TransformOp, IsMean>::apply(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <float.h>
#include <array>

#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
Expand Down

0 comments on commit 6c8178b

Please sign in to comment.